Пример #1
0
def get_eidars_response_text(routing_service_url):
    """Returns the text of the EIDA routing service response (routes for the 'dataselect'
    service, requested in 'post' format). If the request fails, or the response is empty,
    the routes are read from the local file via `_get_local_routing_service` and the
    failure is logged (info + warning)

    :param routing_service_url: string, the url of the EIDA routing service
    :return: the response text, either fetched from the routing service or read from
        the local file
    """
    # IMPORTANT NOTE:
    # We issue a "basic" query to the EIDA rs, with no params other than 'service' and 'format'.
    # The reason is that as of Jan 2019 the
    # service is buggy if supplying some arguments
    # (e.g., with long list of channels)
    # Also, this way we can save a local file (independent from the custom query)
    # and read from that file in case of request failure.
    # The drawback is that we might ask later some data centers for data they do not have:
    # This is an information that the routing service would provide us
    # if queried with all parameters (net, sta, start, etcetera) ... too bad
    query_args = {'service': 'dataselect', 'format': 'post'}
    url = urljoin(routing_service_url, **query_args)

    try:
        # `urlread` returns three items; only the first one (the response text) is needed here
        responsetext, _, _ = urlread(url,
                                     decode='utf8',
                                     raise_http_err=True)
        if not responsetext:
            # treat an empty response as a request failure, handled in the except clause:
            raise URLException(Exception("Empty data response"))
    except URLException as urlexc:
        # fall back to the locally stored routes:
        responsetext, last_mod_time_str = _get_local_routing_service()
        msg = ("Eida routing service error, reading routes from file "
               "(last updated: %s)" % last_mod_time_str)
        logger.info(formatmsg(msg, "eida routing service error"))
        logger.warning(formatmsg("Eida routing service error", urlexc.exc,
                                 url))

    return responsetext
Пример #2
0
    def __init__(self, datacenters_df, authorizer, show_progress=False):
        '''Initializes a new DcDataselectManager

        :param datacenters_df: pandas DataFrame of data centers. It must have the columns
            denoted by `DataCenter.id.key` and `DataCenter.dataselect_url.key`, and it is
            assumed to be well formed (each url must be convertible to a `Fdsnws` object)
        :param authorizer: object with the attributes `token` and `userpass`. If `token`
            is truthy, it is used to acquire the data. Otherwise, if `userpass` is truthy,
            it is unpacked as (user, password). If both are falsy, no authorization is used
        :param show_progress: forwarded to `self._get_data_from_token`
        '''
        id_col = DataCenter.id.key
        url_col = DataCenter.dataselect_url.key

        # Map each data center id to the Fdsnws object built from its dataselect url.
        # We iterate over the rows (instead of building the dict from the two columns
        # directly) because each url has to be converted. Note that Fdsnws might raise
        dcid2fdsn = {}
        for _, dc_row in datacenters_df.iterrows():
            dc_id = int(dc_row[id_col])
            dcid2fdsn[dc_id] = Fdsnws(dc_row[url_col])

        # `errors` will be a dict of data center ids mapped to their exception
        if authorizer.token:
            self._data, errors = self._get_data_from_token(dcid2fdsn,
                                                           authorizer.token,
                                                           show_progress)
            # restricted data is enabled only where no error occurred:
            self._restricted_id = [
                dc_id for dc_id in self._data if dc_id not in errors
            ]
        elif authorizer.userpass:
            user, password = authorizer.userpass
            self._data, errors = self._get_data_from_userpass(dcid2fdsn,
                                                              user, password)
            self._restricted_id = list(dcid2fdsn.keys())
        else:
            # no authorization required
            self._data, errors = self._get_data_open(dcid2fdsn)
            self._restricted_id = []

        if not errors:
            return

        # log the errors, identifying each data center by its site url (not by its id):
        site_errors = {
            dcid2fdsn[dc_id].site: error for dc_id, error in errors.items()
        }
        logger.info(
            formatmsg(
                'Downloading open data only from: %s' % ", ".join(site_errors),
                'Unable to acquire credentials for restricted data'))
        for site, error in site_errors.items():
            logger.warning(
                formatmsg(
                    "Downloading open data only, "
                    "Unable to acquire credentials for restricted data",
                    str(error), site))
Пример #3
0
def test_formatmsg():
    """Tests the output of `formatmsg` when its third argument is a Request with POST data"""
    url = 'http://mysite/query'
    prefix = "action (errmsg). url: http://mysite/query, POST data:\n"

    # long string data: only the first 200 characters are reported
    long_msg = formatmsg("action", "errmsg", Request(url, data='a'*1000))
    assert long_msg == \
        prefix + ('a' * 200) + "\n...(showing first 200 characters only)"

    # short multi-line string data: reported entirely, with no trailing newline
    str_msg = formatmsg("action", "errmsg", Request(url, data='a\n'*5))
    assert str_msg == (prefix + ('a\n' * 5)).strip()

    # bytes data: reported with its bytes representation
    bytes_msg = formatmsg("action", "errmsg", Request(url, data=b'a\n'*5))
    assert bytes_msg == (prefix + "b'a\\na\\na\\na\\na\\n'").strip()

    
Пример #4
0
    def warn(self, request, exc):
        '''Logs a warning for the given error, unless an error of the same type was
        already reported for the same host. Reported errors are stored in this object
        as tuples (host, error string)

        :param request: the Request object
        :param exc: the reported Exception or string message
        '''
        host = get_host(request)
        # err2str gives the string uniquely identifying `exc`:
        key = (host, err2str(exc))
        if key in self:
            return  # already reported

        if not self:
            # first error ever reported: log a header line first
            logger.warning(
                'Detailed inventory download errors '
                '(showing only first of each type per data center):')
        self.add(key)
        logger.warning(
            formatmsg("Inventory download error", exc, url2str(request)))
Пример #5
0
    def warn(self, request, url, code, exc):
        '''Logs a warning for the given error, unless an error with the same url, code
        and exception class name was already reported. Reported errors are stored in
        this object as tuples (url, code, exception class name)

        :param request: the Request object
        :param url: string, usually the request's url host, to identify same data centers
        :param code: the error code
        :param exc: the reported Exception
        '''
        key = (url, code, str(exc.__class__.__name__))
        if key in self:
            return  # already reported

        if not self:
            # first error ever reported: log a header line first
            logger.warning(
                'Detailed segment download errors '
                '(showing only first of each type per data center):')
        self.add(key)
        action = "Segment download error, code %s" % str(code)
        logger.warning(formatmsg(action, exc, url2str(request)))
Пример #6
0
def check_suspiciously_duplicated_segment(segments_df):
    '''Logs a message if `segments_df` has suspiciously duplicated segments, i.e. rows
    sharing the same (channel_id, request_start, request_end). These segments stem from
    distinct events with very close spatio-temporal coordinates.
    This function does NOT modify segments_df

    :param segments_df: pandas DataFrame of segments, with (at least) the columns
        SEG.CHAID, SEG.REQSTIME, SEG.REQETIME and SEG.EVID
    '''
    key_cols = [SEG.CHAID, SEG.REQSTIME, SEG.REQETIME]
    # keep=False: flag all rows of each group of duplicates, not only the repetitions
    dupes_mask = segments_df.duplicated(subset=key_cols, keep=False)
    if not dupes_mask.any():
        return

    dupes_df = segments_df[dupes_mask]
    message = formatmsg(
        "%d suspiciously duplicated segments found: this is most likely\n"
        "due to events with different ids\n"
        "but same (or very close) latitude, longitude, depth and time."
    )
    # the number of duplicates is passed as (lazy) logging argument:
    logger.info(message, len(dupes_df))
    logwarn_dataframe(
        dupes_df.sort_values(by=key_cols),
        "Suspicious duplicated segments",
        key_cols + [SEG.EVID],
        max_row_count=100)
Пример #7
0
def get_datacenters_df(session,
                       service,
                       routing_service_url,
                       network,
                       station,
                       location,
                       channel,
                       starttime=None,
                       endtime=None,
                       db_bufsize=None):
    """Returns a 2 elements tuple: the dataframe of the datacenter(s) matching `service`
    (after synchronizing it with the database), and an EidaValidator, built on the eida
    routing service response, for checking stations/channels duplicates after querying
    the datacenter(s) for stations / channels. The second element is None
    if service != 'eida'

    WARNING: Due to bugs in the eida routing service the parameters
    network, station, location, channel, starttime, endtime
    are NOT used and are here for legacy code and potential future development once
    the eida routing service will be fixed. In any case, they would be used only
    if service = 'eida'

    :param session: the database session, passed to `dbsyncdf`
    :param service: the string denoting the dataselect *or* station url in fdsn format, or
        'eida', or 'iris' (case insensitive). In case of 'eida', `routing_service_url` must
        denote an url for the eida routing service. If falsy (e.g., empty string or None),
        `service` defaults to 'eida'
    :param routing_service_url: the url of the eida routing service
    :param db_bufsize: the buffer size passed to `dbsyncdf`. If None, it defaults to
        the number of rows of the datacenters dataframe

    :raise: FailedDownload if `service` is a custom url which is not a valid fdsn url
    """
    # the names of the dataframe columns that we need:
    station_col = DataCenter.station_url.key
    dataselect_col = DataCenter.dataselect_url.key
    org_col = DataCenter.organization_name.key

    eidars_response_text = None
    service_name = (service or 'eida').lower()

    if service_name == 'eida':
        eidars_response_text = get_eidars_response_text(routing_service_url)
        dc_df = get_eida_datacenters_df(eidars_response_text)
    elif service_name == 'iris':
        iris_netloc = 'https://service.iris.edu'
        iris_row = {
            dataselect_col: '%s/fdsnws/dataselect/1/query' % iris_netloc,
            station_col: '%s/fdsnws/station/1/query' % iris_netloc,
            org_col: 'iris'
        }
        dc_df = pd.DataFrame(data=iris_row, index=[0])
    else:
        # custom url: build both the dataselect and the station url from it
        try:
            fdsn = Fdsnws(service)
            custom_row = {
                dataselect_col: fdsn.url(Fdsnws.DATASEL),
                station_col: fdsn.url(Fdsnws.STATION),
                org_col: None
            }
            dc_df = pd.DataFrame(data=custom_row, index=[0])
        except ValueError:
            raise FailedDownload(
                formatmsg("Unable to use datacenter",
                          "Url does not seem to be a valid fdsn url", service))

    # synchronize with the database (data centers are matched by their station url):
    buf_size = len(dc_df) if db_bufsize is None else db_bufsize
    dc_df = dbsyncdf(dc_df,
                     session, [DataCenter.station_url],
                     DataCenter.id,
                     buf_size=buf_size,
                     keep_duplicates='first')

    # the validator can be built only from the eida routing service response:
    eidavalidator = None
    if eidars_response_text is not None:
        eidavalidator = EidaValidator(dc_df, eidars_response_text)
    return dc_df, eidavalidator
Пример #8
0
def merge_events_stations(events_df, channels_df, search_radius,
                          tttable, show_progress=False):
    """Merges `events_df` and `channels_df` by returning a new dataframe representing all
    channels within a specific search radius. *Each row of the returned data frame is
    basically a segment to be potentially downloaded*.
    For each event, the stations selected are those within the event search radius which
    are also operating at the event time (station start time not after the event time,
    and station end time either missing or at least one day after the event time).
    The returned dataframe has one row per (channel, event) pair and the columns:
    `Segment.channel_id`, `Segment.event_id`, `Segment.datacenter_id`,
    `Segment.event_distance_deg` and `Segment.arrival_time` (event time + travel time).
    Rows whose arrival time could not be computed (NaT) are discarded

    :param events_df: pandas DataFrame resulting from `get_events_df`
    :param channels_df: pandas DataFrame resulting from `get_channels_df`
    :param search_radius: passed, together with the events magnitudes, to
        `get_serarch_radia` in order to get the minimum and maximum radius of each event
    :param tttable: callable computing the travel times. It is called with the arguments
        (source depths, 0, distances), where source depths and distances are arrays with
        one element per returned segment. NOTE(review): the returned travel times are
        presumably in seconds (they are multiplied by 1000000 and converted to
        microseconds) - confirm
    :param show_progress: passed to `get_progressbar`

    :raise: FailedDownload if no station is found within the search radius of any event,
        or if all travel times are NaN
    """
    # For convenience and readability, define once the mapped column names representing the
    # dataframe columns that we need:
    EVT_ID = Event.id.key  # pylint: disable=invalid-name
    EVT_MAG = Event.magnitude.key  # pylint: disable=invalid-name
    EVT_LAT = Event.latitude.key  # pylint: disable=invalid-name
    EVT_LON = Event.longitude.key  # pylint: disable=invalid-name
    EVT_TIME = Event.time.key  # pylint: disable=invalid-name
    EVT_DEPTH = Event.depth_km.key  # pylint: disable=invalid-name
    STA_LAT = Station.latitude.key  # pylint: disable=invalid-name
    STA_LON = Station.longitude.key  # pylint: disable=invalid-name
    STA_STIME = Station.start_time.key  # pylint: disable=invalid-name
    STA_ETIME = Station.end_time.key  # pylint: disable=invalid-name
    CHA_ID = Channel.id.key  # pylint: disable=invalid-name
    CHA_STAID = Channel.station_id.key  # pylint: disable=invalid-name
    SEG_EVID = Segment.event_id.key  # pylint: disable=invalid-name
    SEG_EVDIST = Segment.event_distance_deg.key  # pylint: disable=invalid-name
    SEG_ATIME = Segment.arrival_time.key  # pylint: disable=invalid-name
    SEG_DCID = Segment.datacenter_id.key  # pylint: disable=invalid-name
    SEG_CHAID = Segment.channel_id.key  # pylint: disable=invalid-name

    # rename Channel.id into Segment.channel_id now so we do not bother later:
    channels_df = channels_df.rename(columns={CHA_ID: SEG_CHAID})
    # get unique stations (one row per station id):
    stations_df = channels_df.drop_duplicates(subset=[CHA_STAID]).copy()

    # list of dataframes (one per event with at least one station found):
    ret = []

    # event depths and times, one element per row of the dataframes in `ret`:
    sourcedepths, eventtimes = [], []

    with get_progressbar(show_progress, length=len(events_df)) as pbar:
        min_radia, max_radia = get_serarch_radia(search_radius, events_df[EVT_MAG].values)
        for min_radius, max_radius, evt_dic in \
                zip(min_radia, max_radia, dfrowiter(events_df, [EVT_ID, EVT_LAT, EVT_LON,
                                                                EVT_TIME, EVT_DEPTH])):
            # distances (in degrees) of all stations from the current event:
            l2d = locations2degrees(stations_df[STA_LAT], stations_df[STA_LON],
                                    evt_dic[EVT_LAT], evt_dic[EVT_LON])
            # select stations started before (or at) the event time, and ending (if the end
            # time is given) at least one day after the event time:
            condition = (stations_df[STA_STIME] <= evt_dic[EVT_TIME]) & \
                        (pd.isnull(stations_df[STA_ETIME]) |
                         (stations_df[STA_ETIME] >= evt_dic[EVT_TIME] + timedelta(days=1)))
            # l2d is a distance, thus non negative: the min radius condition is therefore
            # needed only if min_radius is not zero. The check below also skips the
            # condition in case min_radius is None (legacy code):
            if min_radius:
                condition &= (l2d >= min_radius)
            # for max_radius, None means: skip
            if max_radius is not None:
                condition &= (l2d <= max_radius)

            # the progress bar counts the processed events, regardless of the stations found:
            pbar.update(1)
            if not np.any(condition):
                continue

            # Set (or re-set from second iteration on) as NaN SEG_EVDIST columns. This is important
            # cause from second loop on we might have some elements not-NaN which should be NaN now
            channels_df[SEG_EVDIST] = np.nan
            # set the distances in degrees on the stations dataframe
            stations_df[SEG_EVDIST] = l2d
            # Copy distances calculated on stations to their channels
            # (match along column CHA_STAID shared between the relative dataframes). Set values
            # only for channels whose stations are within radius (stations_df[condition]):
            cha_df = mergeupdate(channels_df, stations_df[condition], [CHA_STAID], [SEG_EVDIST],
                                 drop_other_df_duplicates=False)  # dupes already dropped
            # drop channels which are not related to station within radius:
            cha_df = cha_df.dropna(subset=[SEG_EVDIST], inplace=False).copy()
            cha_df[SEG_EVID] = evt_dic[EVT_ID]  # ...and add "safely" SEG_EVID values
            # append to arrays (calculate arrival times in one shot at the end, it's faster):
            sourcedepths += [evt_dic[EVT_DEPTH]] * len(cha_df)
            eventtimes += [np.datetime64(evt_dic[EVT_TIME])] * len(cha_df)
            # Append only relevant columns:
            ret.append(cha_df[[SEG_CHAID, SEG_EVID, SEG_DCID, SEG_EVDIST]])

    # create total segments dataframe:
    # first check we have data:
    if not ret:
        raise FailedDownload(formatmsg("No segments to process",
                                       "No station within search radia"))
    # now concat:
    ret = pd.concat(ret, axis=0, ignore_index=True, copy=True)
    # compute travel times. Doing it on a single array is much faster
    sourcedepths = np.array(sourcedepths)
    distances = ret[SEG_EVDIST].values
    traveltimes = tttable(sourcedepths, 0, distances)
    # assign to column:
    eventtimes = np.array(eventtimes)  # should be of type  '<M8[us]' or whatever datetime dtype
    # now to compute arrival times: eventtimes + traveltimes does not work (we cannot
    # sum np.datetime64 and np.float). Convert traveltimes to np.timedelta: we first multiply by
    # 1000000 to preserve the sub-second resolution and then we write
    # traveltimes.astype("m8[us]") which means: 8bytes timedelta with microsecond
    # resolution (10^-6)
    # Side note: all numpy timedelta constructors (as well as "astype") round their
    # argument to int, at least in numpy13.
    ret[SEG_ATIME] = eventtimes + (traveltimes*1000000).astype("m8[us]")
    # drop NaT values (segments whose arrival time could not be computed)
    oldlen = len(ret)
    ret.dropna(subset=[SEG_ATIME], inplace=True)
    if oldlen > len(ret):
        # the counts are passed as (lazy) logging arguments:
        logger.info(formatmsg("%d of %d segments discarded", "Travel times NaN"),
                    oldlen-len(ret), oldlen)
        if ret.empty:
            raise FailedDownload(formatmsg("No segments to process", "All travel times NaN"))
    return ret
Пример #9
0
def get_channels_df(
        session,
        datacenters_df,
        eidavalidator,  # <- can be none
        net,
        sta,
        loc,
        cha,
        starttime,
        endtime,
        min_sample_rate,
        update,
        max_thread_workers,
        timeout,
        blocksize,
        db_bufsize,
        show_progress=False):
    """Returns the dataframe of the channels obtained by querying the station url of each
    data center in `datacenters_df` (text format, channel level). The channels fetched
    are filtered and saved to the database. For the data centers whose request failed,
    the channels are fetched from the database instead. The returned
    dataframe will have as columns the `key` attribute of the following db columns:
    ```
    [Channel.id, Channel.station_id, Station.latitude, Station.longitude,
     Station.datacenter_id, Station.start_time, Station.end_time, Station.network,
     Station.station, Channel.location, Channel.channel]
    ```
    :param session: the database session
    :param datacenters_df: the first item resulting from `get_datacenters_df` (pandas DataFrame)
    :param eidavalidator: the second item resulting from `get_datacenters_df` (can be None),
        forwarded to `save_stations_and_channels`
    :param net, sta, loc, cha, starttime, endtime: the parameters used to build the request
        POST data (see `get_post_data`) and to select the channels
    :param min_sample_rate: minimum sampling rate, forwarded to `filter_channels_df`
        and `get_channels_df_from_db`
    :param update: forwarded to `save_stations_and_channels`
    :param max_thread_workers, timeout, blocksize: forwarded to `read_async`
    :param db_bufsize: forwarded to `save_stations_and_channels`
    :param show_progress: forwarded to `get_progressbar`

    :raise: FailedDownload if no channel could be fetched, neither from the web
        nor from the database
    """
    postdata = get_post_data(net, sta, loc, cha, starttime, endtime)

    # lazily build one tuple (datacenter id, Request) for each data center:
    url_id_pairs = zip(datacenters_df[DataCenter.station_url.key],
                       datacenters_df[DataCenter.id.key])
    requests_iter = (
        (dc_id,
         Request(sta_url,
                 data=('format=text\nlevel=channel\n' + postdata).encode('utf8')))
        for sta_url, dc_id in url_id_pairs
    )

    fetched_dfs = []  # the dataframes successfully fetched, one per data center
    failed_dc_ids = []  # ids of the data centers whose request failed

    with get_progressbar(show_progress, length=len(datacenters_df)) as pbar:
        for obj, result, exc, url in read_async(requests_iter,
                                                urlkey=lambda obj: obj[-1],
                                                blocksize=blocksize,
                                                max_workers=max_thread_workers,
                                                decode='utf8',
                                                timeout=timeout):
            pbar.update(1)
            dc_id = obj[0]
            if exc:
                failed_dc_ids.append(dc_id)
                logger.warning(formatmsg("Unable to fetch stations", exc, url))
                continue
            try:
                dc_cha_df = response2normalizeddf(url, result[0], "channel")
                if not dc_cha_df.empty:
                    dc_cha_df[Station.datacenter_id.key] = dc_id
                    fetched_dfs.append(dc_cha_df)
            except ValueError as verr:
                logger.warning(
                    formatmsg("Discarding response data", verr, url))

    # build two dataframes (database and web) which we will concatenate afterwards.
    # First, the channels from the database, for the data centers which failed (if any):
    db_cha_df = pd.DataFrame()
    if failed_dc_ids:
        failed_mask = datacenters_df[DataCenter.id.key].isin(failed_dc_ids)
        dc_df_fromdb = datacenters_df.loc[failed_mask]
        logger.info(
            formatmsg(
                "Fetching stations from database for %d (of %d) data-center(s)",
                "download errors occurred"), len(dc_df_fromdb),
            len(datacenters_df))
        logger.info(
            dc_df_fromdb[DataCenter.dataselect_url.key].to_string(index=False))
        db_cha_df = get_channels_df_from_db(session, dc_df_fromdb, net, sta,
                                            loc, cha, starttime, endtime,
                                            min_sample_rate)

    # Then, the channels from the web:
    web_cha_df = pd.DataFrame()
    if fetched_dfs:  # pd.concat complains for empty list
        try:
            all_fetched_df = pd.concat(fetched_dfs, axis=0, ignore_index=True,
                                       copy=False)
            web_cha_df = filter_channels_df(all_fetched_df, net, sta, loc,
                                            cha, min_sample_rate)
            # this raises FailedDownload if we cannot save any element:
            web_cha_df = save_stations_and_channels(session, web_cha_df,
                                                    eidavalidator, update,
                                                    db_bufsize)
        except FailedDownload as qexc:
            # NOTE(review): if the exception was raised while saving, `web_cha_df` is here
            # the filtered (not saved) dataframe - confirm that this is intended
            if db_cha_df.empty:
                raise
            logger.warning(qexc)

    if db_cha_df.empty and web_cha_df.empty:
        # nothing fetched from the web and nothing from the database:
        raise FailedDownload(
            formatmsg("No station found",
                      ("Unable to fetch stations from all data-centers, "
                       "no data to fetch from the database. "
                       "Check config and log for details")))

    if db_cha_df.empty:
        channels_df = web_cha_df
    elif web_cha_df.empty:
        channels_df = db_cha_df
    else:
        channels_df = pd.concat((web_cha_df, db_cha_df),
                                axis=0,
                                ignore_index=True,
                                sort=False)

    # the columns for the channels dataframe that will be returned
    returned_columns = [
        col.key for col in (Channel.id, Channel.station_id, Station.latitude,
                            Station.longitude, Station.datacenter_id,
                            Station.start_time, Station.end_time,
                            Station.network, Station.station,
                            Channel.location, Channel.channel)
    ]
    return channels_df[returned_columns].copy()
Пример #10
0
def prepare_for_download(session,
                         segments_df,
                         dc_dataselect_manager,
                         timespan,
                         retry_seg_not_found,
                         retry_url_err,
                         retry_mseed_err,
                         retry_client_err,
                         retry_server_err,
                         retry_timespan_err,
                         retry_timespan_warn=False):
    """Prepares the segments for download: merges `segments_df` with the segments already
    saved on the database, sets the time bounds to request and drops the segments
    already downloaded which do not have to be retried according to the given flags.
    Note that new columns are added inplace to the passed `segments_df`

    :param session: the sql-alchemy session bound to an existing database
    :param segments_df: pandas DataFrame resulting from `get_arrivaltimes`
    :param dc_dataselect_manager: object whose attribute `opendataonly` tells if only
        open data will be downloaded
    :param timespan: forwarded to `set_requested_timebounds`
    :param retry_seg_not_found, retry_url_err, retry_mseed_err, retry_client_err,
        retry_server_err, retry_timespan_err, retry_timespan_warn: the flags forwarded to
        `set_segments_to_retry`, denoting which already downloaded segments to retry

    :return: the tuple (segments dataframe, value returned by `set_requested_timebounds`)

    :raise: NothingToDownload if there are no segments to download
    """
    opendataonly = dc_dataselect_manager.opendataonly
    # Fetch the already downloaded segments. If not open data only, the dataframe has
    # also the boolean column SEG.RETRY, which is True for suspiciously
    # restricted (SR) segments, i.e. segments whose download code MIGHT denote that they
    # are restricted (see `s2scodes.restricted_data`):
    db_seg_df = fetch_already_downloaded_segments_df(session, segments_df,
                                                     opendataonly)
    # Store the ids of the SR segments now, before SEG.RETRY is overwritten below. If open
    # data, `db_seg_df` does not have the column SEG.RETRY, so set the ids to an (empty)
    # DataFrame for consistency:
    if opendataonly:
        force_retry_ids = pd.DataFrame()
    else:
        force_retry_ids = db_seg_df[SEG.ID][db_seg_df[SEG.RETRY]]
    # Update the SEG.RETRY column (or create it) according to the flags set:
    set_segments_to_retry(db_seg_df, opendataonly, retry_seg_not_found,
                          retry_url_err, retry_mseed_err, retry_client_err,
                          retry_server_err, retry_timespan_err,
                          retry_timespan_warn)

    # Merge/update `segments_df` with the db values (`db_seg_df`), in two steps.
    # Step 1: set on `segments_df` the columns to merge, with their defaults (np.nan for
    # int types). If we have something to retry, we add also the column SEG.DSCODE with
    # None/nan as default: checking if that column exists will be the way later to know
    # if we need to update rows or only insert new rows.
    col_defaults = [(SEG.ID, np.nan), (SEG.RETRY, True),
                    (SEG.REQSTIME, pd.NaT), (SEG.REQETIME, pd.NaT)]
    if db_seg_df[SEG.RETRY].any():
        col_defaults.append((SEG.DSCODE, np.nan))
    for col, default in col_defaults:
        segments_df[col] = default
    # Step 2: override the values of those columns with the values of `db_seg_df`,
    # matching rows via the columns [SEG.CHAID, SEG.EVID]:
    merged_cols = [col for col, _ in col_defaults]
    segments_df = mergeupdate(segments_df, db_seg_df, [SEG.CHAID, SEG.EVID],
                              merged_cols)

    request_timebounds_need_update = set_requested_timebounds(
        segments_df, timespan)

    # Keep only the segments to retry. Do a copy to avoid SettingWithCopyWarning. Moreover,
    # copy should re-allocate contiguous arrays which might be faster (and less memory
    # consuming after unused memory is released)
    total = len(segments_df)
    segments_df = segments_df[segments_df[SEG.RETRY]].copy()
    discarded = total - len(segments_df)
    if discarded != 0:
        logger.info(
            formatmsg("%d segments discarded", "already downloaded, no retry"),
            discarded)

    if segments_df.empty:
        raise NothingToDownload(
            "Nothing to download: all segments already downloaded "
            "according to the current configuration")

    check_suspiciously_duplicated_segment(segments_df)

    # Last step: the policy later will be to UPDATE (=overwrite existing segments on the
    # database) only segments whose download code changed, because it might
    # save a lot of time. E.g., suppose retry_server_error=true and a segment
    # on the db with download code=500 => update it only if the server returns some
    # code != 500. However, if we are downloading with credentials, we need to force
    # updating the SR segments which were downloaded with no credentials. Thus, if we
    # have those segments and we are performing a download on an already existing database
    # (the column SEG.DSCODE exists), we set their SEG.DSCODE to None/nan:
    # as we will never get any response code = None from the server, those SR segments
    # will always be updated
    if SEG.DSCODE in segments_df.columns and not force_retry_ids.empty:
        is_forced = segments_df[SEG.ID].isin(force_retry_ids)
        segments_df.loc[is_forced, SEG.DSCODE] = np.nan

    # the column SEG.RETRY is not needed anymore:
    segments_df.drop([SEG.RETRY], axis=1, inplace=True)

    return segments_df, request_timebounds_need_update
Пример #11
0
def download_save_segments(session,
                           segments_df,
                           dc_dataselect_manager,
                           chaid2mseedid,
                           download_id,
                           update_datacenters,
                           update_request_timebounds,
                           max_thread_workers,
                           timeout,
                           download_blocksize,
                           db_bufsize,
                           show_progress=False):
    """Downloads and saves the segments. segments_df MUST not be empty (this is not checked for)

    The download is performed in (at most) two passes: in the first pass the segments are
    requested in groups sharing the same data center, start and end time. The groups of
    more than one segment which return the code 413 are not saved, and are requested
    again in the second pass segment by segment (i.e., grouped also by channel id)

    :param session: the database session, passed to `get_dbmanager`
    :param segments_df: the dataframe resulting from `prepare_for_download`. The Dataframe
        might or might not have the column 'download_code'. If it has, it will skip
        writing to db segments whose code did not change: in this case, nans stored under
        'download_code' in segments_df indicate new segments, or segments for which the update
        has to be forced, whatever code is obtained (e.g., queryauth when previously a simple
        query was used). Note that the column SEG.QAUTH is set inplace on this dataframe
    :param dc_dataselect_manager: object passed to `get_responses`, whose attribute
        `restricted_enabled_ids` is used to set the column SEG.QAUTH
    :param chaid2mseedid: dict of channel ids (int) mapped to mseed ids
        (strings in "Network.station.location.channel" format)
    :param download_id: the value set on the column SEG.DOWNLID of all saved segments
    :param update_datacenters, update_request_timebounds, db_bufsize: passed to
        `get_dbmanager`
    :param max_thread_workers, timeout, download_blocksize: passed to `get_responses`
    :param show_progress: passed to `get_progressbar`

    :return: the DownloadStats object with the counts of the segments per url and
        download code
    """
    # set queryauth column here, outside the loop:
    restricted_enable_dcids = dc_dataselect_manager.restricted_enabled_ids
    if restricted_enable_dcids:
        segments_df[SEG.QAUTH] = \
            segments_df[SEG.DCID].isin(dc_dataselect_manager.restricted_enabled_ids)
    else:
        segments_df[SEG.QAUTH] = False

    segmanager = get_dbmanager(session, update_datacenters,
                               update_request_timebounds, db_bufsize)
    stats = DownloadStats()

    # define the groupsby columns
    # remember that segments_df has columns:
    # we should group by (net, sta, loc, stime, etime), meaning that two rows with those values
    # equal will be given in the same sub-dataframe, and if 413 is found, take 413s errors
    # creating a new dataframe, and then group segment by segment, i.e.
    # (net, sta, loc, cha, stime, etime).
    # Unfortunately, for perf reasons we do not have
    # the first 4 columns, but we do have channel_id which basically comprises (net, sta, loc, cha)
    # NOTE: SEG_START and SEG_END MUST BE ALWAYS PRESENT IN THE SECOND AND THIRD POSITION!!!!!
    groupsby = [[SEG.DCID, SEG.START, SEG.END],
                [SEG.DCID, SEG.START, SEG.END, SEG.CHAID]]

    # these are the column names to be set on a dataframe from a received response,
    # mapped to their default value
    # Set nan to let pandas understand it's numeric. None I don't know how it is converted
    # (should be checked) but it's for string types
    # for numpy types, see
    # https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html#specifying-and-constructing-data-types
    defaultvalues = {
        SEG.DATA: None,
        SEG.SRATE: np.nan,
        SEG.MGAP: np.nan,
        SEG.DATAID: None,
        SEG.DSCODE: np.nan,
        SEG.STIME: pd.NaT,
        SEG.ETIME: pd.NaT
    }
    defaultvalues[SEG.DOWNLID] = download_id
    # defaults for responses with errors or no data: the download code and the data
    # will be updated inside the loop below for each response
    defaultvalues_nodata = dict(defaultvalues)  # copy
    col_dscode, col_data = SEG.DSCODE, SEG.DATA
    # if the download code column exists, there are segments to update (not only to insert):
    toupdate = SEG.DSCODE in segments_df.columns
    code_not_found = s2scodes.seg_not_found
    # counter of the segments not updated because their download code did not change:
    skipped_same_code = 0
    seg_logger = SegmentLogger(
    )  # report seg. errors only once per error type and data center
    with get_progressbar(show_progress, length=len(segments_df)) as pbar:

        skipped_dataframes = [
        ]  # store dataframes with a 413 error and retry later
        for group_ in groupsby:

            if segments_df.empty:  # for safety (if this is the second loop or greater)
                break

            is_last_iteration = group_ == groupsby[-1]
            seg_groups = segments_df.groupby(group_, sort=False)
            for data, exc, code, request, dframe in \
                    get_responses(seg_groups, dc_dataselect_manager, chaid2mseedid,
                                  max_thread_workers, timeout, download_blocksize):

                num_segments = len(dframe)
                # code 413 on a group of several segments: do not save it and do not
                # update the progress bar, it will be retried in the next iteration
                if code == 413 and not is_last_iteration and num_segments > 1:
                    skipped_dataframes.append(dframe)
                    continue

                pbar.update(num_segments)
                url = get_host(request)
                url_stats = stats[url]

                if exc is None and data != b'':
                    # set default values on the dataframe:
                    dframe = dframe.assign(
                        **defaultvalues)  # assign returns a copy
                    populate_dataframe(data, code, dframe, chaid2mseedid)
                    # group by download code, count them, and add the counts to stats:
                    for kode, kount in get_counts(dframe, col_dscode,
                                                  code_not_found):
                        url_stats[kode] += kount
                else:
                    # here we are if: exc is not None OR data = b''
                    url_stats[code] += num_segments
                    if toupdate and code is not None and (dframe[col_dscode]
                                                          == code).sum():
                        # if there are rows to update, then discard those for which
                        # the code is the same in the database. If we requested a different
                        # time window, we should update the time windows but there is no point in
                        # this overhead. The case `code is None`
                        # should never happen but for safety we check it, because we have set the
                        # download code column of `dframe` to None/nan to mark segments to update
                        # nevertheless, on the assumption that we never get response code = None
                        # (see the docstring of this function, parameter `segments_df`).
                        # Thus, if for some weird reason the response code is None, then update the
                        # segment anyway (as we wanted to)
                        dframe = dframe[dframe[col_dscode] != code]
                        skipped_same_code += num_segments - len(dframe)
                        if dframe.empty:  # nothing to update on the db
                            continue
                    # update dict of default values, and set it to the dataframe:
                    defaultvalues_nodata.update({
                        col_dscode: code,
                        col_data: data
                    })
                    dframe = dframe.assign(
                        **defaultvalues_nodata)  # assign returns a copy

                    if exc is not None:
                        # log segment errors only once per error type and data center,
                        # otherwise the log is hundreds of Mb and it's unreadable:
                        seg_logger.warn(request, url, code, exc)

                segmanager.add(dframe)

            segmanager.flush(
            )  # flush remaining stuff to insert / update, if any

            if skipped_dataframes:
                # the segments to retry in the next iteration are those skipped in this one:
                segments_df = pd.concat(skipped_dataframes,
                                        axis=0,
                                        ignore_index=True,
                                        copy=True,
                                        verify_integrity=False)
                skipped_dataframes = []
            else:
                # break the next loop, if any
                segments_df = pd.DataFrame()

    segmanager.close()  # flush remaining stuff to insert / update

    if skipped_same_code:
        logger.warning(
            formatmsg(
                ("%d already saved segment(s) with no waveform data skipped "
                 "with no messages, only their count is reported "
                 "in statistics") % skipped_same_code,
                "Still receiving the same download code"))
    return stats