Exemplo n.º 1
0
def get_eidars_response_text(routing_service_url):
    """Returns the response text of the EIDA routing service (decoded as utf8).

    If the request raises an `URLException`, or the response is empty, the text is
    obtained from `_get_local_routing_service` instead, and the failure is logged
    (info + warning).

    :param routing_service_url: the URL of the EIDA routing service, joined with the
        query parameters 'service' and 'format' only
    :return: the routing service response text
    """
    # IMPORTANT NOTE:
    # The query is deliberately "basic", with no params other than 'service' and 'format':
    # as of Jan 2019 the service is buggy when supplying some arguments (e.g., a long
    # list of channels). This also lets us save a local file (independent from the
    # custom query) and read from it in case of request failure.
    # The drawback is that we might later ask some data centers for data they do not
    # have: the routing service would tell us, if queried with all parameters
    # (net, sta, start, etcetera) ... too bad
    url = urljoin(routing_service_url, service='dataselect', format='post')

    try:
        # only the text is needed, status and message are discarded:
        responsetext, _status, _msg = urlread(url, decode='utf8', raise_http_err=True)
        if not responsetext:
            # an empty response is handled as a failed request (see except clause):
            raise URLException(Exception("Empty data response"))
    except URLException as urlexc:
        responsetext, last_mod_time_str = _get_local_routing_service()
        fallback_msg = ("Eida routing service error, reading routes from file "
                        "(last updated: %s)") % last_mod_time_str
        logger.info(formatmsg(fallback_msg, "eida routing service error"))
        logger.warning(
            formatmsg("Eida routing service error", urlexc.exc, url))

    return responsetext
Exemplo n.º 2
0
def save_stations_and_channels(session, channels_df, eidavalidator, update,
                               db_bufsize):
    """Writes stations first, then their channels, to the database, and returns the
    dataframe of the channels saved.

    In the returned data frame, the column 'id' coming from the stations (`Station.id`)
    is renamed to 'station_id' (`Channel.station_id`), and a new 'id' column refers to
    the Channel id (`Channel.id`)

    :param session: the database session, forwarded to `dbsyncdf`
    :param channels_df: pandas DataFrame resulting from `get_channels_df`
    :param eidavalidator: forwarded to `drop_station_duplicates` (can be None)
    :param update: if falsy, it is forwarded as it is to `dbsyncdf`. If truthy, stations
        are synced with an explicit list of columns excluding the inventory column,
        whereas channels are synced with `update` as it is
    :param db_bufsize: forwarded to `dbsyncdf` as `buf_size`
    """
    station_keys = (ST.NET, ST.STA, ST.STIME, ST.DCID)

    # one row per station (channels of the same station collapse into one row):
    stations = channels_df.drop_duplicates(subset=list(station_keys)).copy()
    stations = drop_station_duplicates(session, stations, eidavalidator)

    # remember: dbsyncdf raises a FailedDownload, so there is no need to check for an
    # empty dataframe. Also, when updating, station inventories must NOT be updated
    # HERE (handled later): build the list of columns to update without that column
    if update:
        stations_update = [
            colname
            for colname in shared_colnames(Station, stations, pkey=False)
            if colname != Station.inventory_xml.key
        ]
    else:
        stations_update = update

    stations = dbsyncdf(stations,
                        session,
                        [Station.network, Station.station, Station.start_time],
                        Station.id,
                        stations_update,
                        buf_size=db_bufsize,
                        keep_duplicates=True,
                        cols_to_print_on_err=ST.ERRCOLS)

    # the station ids are now in `stations` only: copy them to the channels ...
    channels_df = mergeupdate(channels_df, stations, list(station_keys),
                              [ST.ID])
    # ... and rename 'id' to 'station_id' before writing the channels to the db:
    channels_df.rename(columns={ST.ID: CH.STAID}, inplace=True)

    # channels with no station id belong to discarded (duplicated) stations: warn and drop
    orphans = channels_df[channels_df[CH.STAID].isnull()]
    num_orphans = len(orphans)
    if num_orphans:
        logger.info(("Found %d duplicated channel(s) to be discarded "
                     "as a result of duplicated stations") % num_orphans)
        # The discarded channels are not printed as it would be redundant info
        # (duplicated stations are already logged in `drop_station_duplicates`)
        channels_df.dropna(axis=0, subset=[CH.STAID], inplace=True)

    # finally, write the channels to the db:
    return dbsyncdf(
        channels_df,
        session,
        [Channel.station_id, Channel.location, Channel.channel],
        Channel.id,
        update,
        buf_size=db_bufsize,
        keep_duplicates=True,
        cols_to_print_on_err=CH.ERRCOLS)
Exemplo n.º 3
0
 def dolog(ok, notok, okstr, nookstr):
     """Logs (info level) the outcome of a db write operation. Does nothing if both
     `ok` and `notok` are falsy.

     :param ok: number of rows successfully written
     :param notok: number of rows discarded
     :param okstr: format string for `ok`, with two placeholders (count, "row"/"rows")
     :param nookstr: format string for `notok`, with one placeholder (count)
     """
     if not (ok or notok):
         return
     text = okstr % (ok, "row" if ok == 1 else "rows")
     if notok:
         text += nookstr % notok
         outcome = "sql errors"
     else:
         outcome = "no sql error"
     # `_header` is not defined here: it is read from the enclosing scope
     logger.info(formatmsg("%s: %s" % (_header, text), outcome))
Exemplo n.º 4
0
def drop_station_duplicates(session, sta_df, eidavalidator):
    '''Removes from the Station Data frame `sta_df` the duplicated stations, i.e.
    stations with the same network, station and start time (but different data center).
    For those rows, a reference data center id is fetched from `eidavalidator` or,
    if the latter is None, from the stations saved on the database accessible via
    `session`: rows whose data center id differs from the reference one are logged
    and discarded.
    If no duplicates are found, returns `sta_df`
    '''
    # keep=False => mark ALL duplicates as True
    dupes_mask = sta_df.duplicated(subset=[ST.NET, ST.STA, ST.STIME],
                                   keep=False)
    if not dupes_mask.any():
        return sta_df

    # move the current data center id to a secondary column, and reset the primary
    # column to NaN: the latter will be filled with the reference data center id
    dupes = sta_df[dupes_mask].copy()
    dupes.rename(columns={ST.DCID: ST.DCID2}, inplace=True)
    dupes[ST.DCID] = np.nan

    if eidavalidator is None:
        # take the reference data center from the stations already on the database:
        sta_db = dbquery2df(
            session.query(Station.network, Station.station,
                          Station.start_time, Station.datacenter_id))
        mergeupdate(dupes, sta_db, [ST.NET, ST.STA, ST.STIME], [ST.DCID])
    else:
        # ask the validator, row by row, for the reference data center:
        rows = zip(dupes.index, dupes[ST.NET], dupes[ST.STA],
                   dupes[CH.LOC], dupes[CH.CHA],
                   dupes[ST.STIME], dupes[ST.ETIME])
        for idx, net, sta, loc, cha, stime, etime in rows:
            dupes.at[idx, ST.DCID] = eidavalidator.get_dc_id(
                net, sta, loc, cha,
                None if pd.isnull(stime) else stime,
                None if pd.isnull(etime) else etime)

    # the rows to discard are those not matching the reference data center:
    dupes = dupes[dupes[ST.DCID] != dupes[ST.DCID2]]
    if dupes.empty:
        return sta_df

    checked_against = ("already saved stations" if eidavalidator is None
                       else "eida routing service")
    logger.info(
        "Found %d duplicated station(s) to be discarded (checked against %s)" %
        (len(dupes), checked_against))
    # print the discarded rows to log.warning, showing the columns
    # [network, station, start time, original data center id] only:
    DbExcLogger([ST.NET, ST.STA, ST.STIME, ST.DCID2]).failed_insert(
        dupes.sort_values(by=[ST.NET, ST.STA, ST.STIME]),
        '',
    )
    # keep only the rows whose index is not among the discarded ones:
    return sta_df.loc[~sta_df.index.isin(dupes.index)]
Exemplo n.º 5
0
    def __init__(self, datacenters_df, authorizer, show_progress=False):
        '''Initializes a new DcDataselectManager

        :param datacenters_df: pandas DataFrame of data centers. The columns
            `DataCenter.id` and `DataCenter.dataselect_url` are read, the data frame
            is assumed to be well formed (`Fdsnws` might raise otherwise)
        :param authorizer: object whose attributes `token` and `userpass` are checked
            (in this order) to choose how to acquire the data. If both are falsy,
            no authorization is required
        :param show_progress: forwarded to the token-based data acquisition only
        '''
        id_col = DataCenter.id.key
        url_col = DataCenter.dataselect_url.key

        # map each data center id to the Fdsnws object built from its dataselect url:
        dcid2fdsn = {}
        for _, row in datacenters_df.iterrows():
            dc_id = int(row[id_col])
            dcid2fdsn[dc_id] = Fdsnws(row[url_col])

        # `errors` is a dict of data center ids mapped to their exception
        if authorizer.token:
            self._data, errors = self._get_data_from_token(
                dcid2fdsn, authorizer.token, show_progress)
            self._restricted_id = [
                did for did in self._data if did not in errors
            ]
        elif authorizer.userpass:
            user, password = authorizer.userpass
            self._data, errors = self._get_data_from_userpass(
                dcid2fdsn, user, password)
            self._restricted_id = list(dcid2fdsn.keys())
        else:  # no authorization required
            self._data, errors = self._get_data_open(dcid2fdsn)
            self._restricted_id = []

        if not errors:
            return

        # log the errors by url site, not by data center id:
        site_errors = {
            dcid2fdsn[dcid].site: err
            for dcid, err in errors.items()
        }
        logger.info(
            formatmsg(
                'Downloading open data only from: %s' % ", ".join(site_errors),
                'Unable to acquire credentials for restricted data'))
        for site, exc in site_errors.items():
            logger.warning(
                formatmsg(
                    "Downloading open data only, "
                    "Unable to acquire credentials for restricted data",
                    str(exc), site))
Exemplo n.º 6
0
def check_suspiciously_duplicated_segment(segments_df):
    '''Logs the suspiciously duplicated segments of `segments_df`, if any, i.e. the
    rows sharing the same (channel_id, request_start, request_end). These segments
    stem from distinct events with very close spatio-temporal coordinates.
    This function logs only (an info message plus the duplicated rows via
    `logwarn_dataframe`): it does NOT modify `segments_df`
    '''
    key_cols = [SEG.CHAID, SEG.REQSTIME, SEG.REQETIME]
    # keep=False => mark all duplicates as True
    is_dupe = segments_df.duplicated(subset=key_cols, keep=False)
    if not is_dupe.any():
        return

    dupes = segments_df[is_dupe]
    info_msg = formatmsg(
        "%d suspiciously duplicated segments found: this is most likely\n"
        "due to events with different ids\n"
        "but same (or very close) latitude, longitude, depth and time."
    )
    logger.info(info_msg, len(dupes))
    logwarn_dataframe(
        dupes.sort_values(by=key_cols),
        "Suspicious duplicated segments",
        key_cols + [SEG.EVID],
        max_row_count=100)
Exemplo n.º 7
0
def dblog(table, inserted, not_inserted, updated=0, not_updated=0):
    """Prints to log (info level) the result of a database write operation.
    Use this function to harmonize the message format and make it more readable in log
    or terminal

    :param table: object with the attribute `__tablename__`, used in the log message
    :param inserted: number of rows inserted
    :param not_inserted: number of rows discarded while inserting
    :param updated: number of rows updated
    :param not_updated: number of rows discarded while updating
    """
    _header = "Db table '%s'" % table.__tablename__
    if not any((inserted, not_inserted, updated, not_updated)):
        logger.info("%s: no new row to insert, no row to update", _header)
        return

    # one log message for the inserted rows, one for the updated rows. Each tuple is:
    # (rows written, rows discarded, format of rows written, format of rows discarded)
    outcomes = (
        (inserted, not_inserted, "%d new %s inserted", ", %d discarded"),
        (updated, not_updated, "%d %s updated", ", %d discarded"),
    )
    for done, failed, done_fmt, failed_fmt in outcomes:
        if not (done or failed):
            continue  # nothing to report for this operation
        text = done_fmt % (done, "row" if done == 1 else "rows")
        if failed:
            text += failed_fmt % failed
            detail = "sql errors"
        else:
            detail = "no sql error"
        logger.info(formatmsg("%s: %s" % (_header, text), detail))
Exemplo n.º 8
0
def merge_events_stations(events_df, channels_df, search_radius,
                          tttable, show_progress=False):
    """Merges `events_df` and `channels_df` by returning a new dataframe representing all
    channels within a specific search radius of each event. *Each row of the returned data
    frame is basically a segment to be potentially downloaded*.

    A channel is associated to an event if its station is within the event search radius
    and if the station was operating at the event time, i.e. the station start time is
    not after the event time, and the station end time is either null or not before the
    event time plus one day. The same channel might thus be repeated in several rows
    (it might be in the search radius of several events).

    The returned dataframe has the columns `Segment.channel_id` (the `Channel.id` of
    `channels_df`, renamed), `Segment.event_id`, `Segment.datacenter_id`,
    `Segment.event_distance_deg` and `Segment.arrival_time`. The latter is the event
    time plus the travel time computed via `tttable`: rows with no arrival time
    (NaN travel time) are discarded.

    :param events_df: pandas DataFrame resulting from `get_events_df`
    :param channels_df: pandas DataFrame resulting from `get_channels_df`
    :param search_radius: forwarded, together with the events magnitudes, to
        `get_serarch_radia`, which returns the minimum and maximum radius of each event
    :param tttable: callable computing the travel times, called as
        `tttable(source_depths, 0, distances)`. The result is multiplied by 1000000 and
        cast to microseconds, i.e. it is handled as seconds
    :param show_progress: forwarded to `get_progressbar`
    :raise FailedDownload: if no station is within the search radius of any event, or
        if all travel times are NaN
    """
    # For convenience and readability, define once the mapped column names representing the
    # dataframe columns that we need:
    EVT_ID = Event.id.key  # pylint: disable=invalid-name
    EVT_MAG = Event.magnitude.key  # pylint: disable=invalid-name
    EVT_LAT = Event.latitude.key  # pylint: disable=invalid-name
    EVT_LON = Event.longitude.key  # pylint: disable=invalid-name
    EVT_TIME = Event.time.key  # pylint: disable=invalid-name
    EVT_DEPTH = Event.depth_km.key  # pylint: disable=invalid-name
    STA_LAT = Station.latitude.key  # pylint: disable=invalid-name
    STA_LON = Station.longitude.key  # pylint: disable=invalid-name
    STA_STIME = Station.start_time.key  # pylint: disable=invalid-name
    STA_ETIME = Station.end_time.key  # pylint: disable=invalid-name
    CHA_ID = Channel.id.key  # pylint: disable=invalid-name
    CHA_STAID = Channel.station_id.key  # pylint: disable=invalid-name
    SEG_EVID = Segment.event_id.key  # pylint: disable=invalid-name
    SEG_EVDIST = Segment.event_distance_deg.key  # pylint: disable=invalid-name
    SEG_ATIME = Segment.arrival_time.key  # pylint: disable=invalid-name
    SEG_DCID = Segment.datacenter_id.key  # pylint: disable=invalid-name
    SEG_CHAID = Segment.channel_id.key  # pylint: disable=invalid-name

    # rename Channel.id into Segment.channel_id now so we do not bother later. Note that
    # `rename` returns a new data frame: the `channels_df` passed as argument is not modified
    channels_df = channels_df.rename(columns={CHA_ID: SEG_CHAID})
    # get unique stations (one row per station id):
    stations_df = channels_df.drop_duplicates(subset=[CHA_STAID]).copy()

    # the data frames to be concatenated (one data frame per event with matching stations):
    ret = []

    # event depth and event time of each row of the data frames in `ret`:
    sourcedepths, eventtimes = [], []

    with get_progressbar(show_progress, length=len(events_df)) as pbar:
        min_radia, max_radia = get_serarch_radia(search_radius, events_df[EVT_MAG].values)
        for min_radius, max_radius, evt_dic in \
                zip(min_radia, max_radia, dfrowiter(events_df, [EVT_ID, EVT_LAT, EVT_LON,
                                                                EVT_TIME, EVT_DEPTH])):
            # distances (in degrees) between the current event and all stations:
            l2d = locations2degrees(stations_df[STA_LAT], stations_df[STA_LON],
                                    evt_dic[EVT_LAT], evt_dic[EVT_LON])
            # stations operating at the event time (and up to one day after it):
            condition = (stations_df[STA_STIME] <= evt_dic[EVT_TIME]) & \
                        (pd.isnull(stations_df[STA_ETIME]) |
                         (stations_df[STA_ETIME] >= evt_dic[EVT_TIME] + timedelta(days=1)))
            # l2d is a distance, thus non negative: the min radius condition is useless if
            # min_radius is 0, and it is thus added only if min_radius is truthy. This
            # skips the condition also in case min_radius is None (legacy code):
            if min_radius:
                condition &= (l2d >= min_radius)
            # for max_radius, None means: skip
            if max_radius is not None:
                condition &= (l2d <= max_radius)

            pbar.update(1)
            if not np.any(condition):
                continue

            # Set (or re-set from second iteration on) as NaN SEG_EVDIST columns. This is important
            # cause from second loop on we might have some elements not-NaN which should be NaN now
            channels_df[SEG_EVDIST] = np.nan
            # set the distances on the stations:
            stations_df[SEG_EVDIST] = l2d
            # Copy distances calculated on stations to their channels
            # (match along column CHA_STAID shared between the relative dataframes). Set values
            # only for channels whose stations are within radius (stations_df[condition]):
            cha_df = mergeupdate(channels_df, stations_df[condition], [CHA_STAID], [SEG_EVDIST],
                                 drop_other_df_duplicates=False)  # dupes already dropped
            # drop channels which are not related to station within radius:
            cha_df = cha_df.dropna(subset=[SEG_EVDIST], inplace=False).copy()
            cha_df[SEG_EVID] = evt_dic[EVT_ID]  # ...and add "safely" SEG_EVID values
            # append to arrays (calculate arrival times in one shot at the end, it's faster):
            sourcedepths += [evt_dic[EVT_DEPTH]] * len(cha_df)
            eventtimes += [np.datetime64(evt_dic[EVT_TIME])] * len(cha_df)
            # Append only relevant columns:
            ret.append(cha_df[[SEG_CHAID, SEG_EVID, SEG_DCID, SEG_EVDIST]])

    # create total segments dataframe:
    # first check we have data:
    if not ret:
        raise FailedDownload(formatmsg("No segments to process",
                                       "No station within search radia"))
    # now concat:
    ret = pd.concat(ret, axis=0, ignore_index=True, copy=True)
    # compute travel times. Doing it on a single array is much faster
    sourcedepths = np.array(sourcedepths)
    distances = ret[SEG_EVDIST].values
    traveltimes = tttable(sourcedepths, 0, distances)
    # assign to column:
    eventtimes = np.array(eventtimes)  # should be of type  '<M8[us]' or whatever datetime dtype
    # now to compute arrival times: eventtimes + traveltimes does not work (we cannot
    # sum np.datetime64 and np.float). Convert traveltimes to np.timedelta: we first multiply by
    # 1000000 to preserve the sub-second resolution and then we write
    # traveltimes.astype("m8[us]") which means: 8bytes timedelta with microsecond
    # resolution (10^-6)
    # Side note: all numpy timedelta constructors (as well as "astype") round their argument
    # to int, at least in numpy13.
    ret[SEG_ATIME] = eventtimes + (traveltimes*1000000).astype("m8[us]")
    # drop rows with no arrival time (NaT values, resulting from NaN travel times):
    oldlen = len(ret)
    ret.dropna(subset=[SEG_ATIME], inplace=True)
    if oldlen > len(ret):
        logger.info(formatmsg("%d of %d segments discarded", "Travel times NaN"),
                    oldlen-len(ret), oldlen)
        if ret.empty:
            raise FailedDownload(formatmsg("No segments to process", "All travel times NaN"))
    return ret
Exemplo n.º 9
0
def get_channels_df(
        session,
        datacenters_df,
        eidavalidator,  # <- can be None
        net,
        sta,
        loc,
        cha,
        starttime,
        endtime,
        min_sample_rate,
        update,
        max_thread_workers,
        timeout,
        blocksize,
        db_bufsize,
        show_progress=False):
    """Returns a dataframe of the channels fetched from the station web service of each
    data center in `datacenters_df`. The channels successfully fetched are filtered
    (`filter_channels_df`) and saved to the database (`save_stations_and_channels`),
    whereas for the data centers whose request failed, the channels are fetched from
    the database (`get_channels_df_from_db`). The returned dataframe will have as
    columns the `key` attribute of the following db columns:
    ```
    [Channel.id, Channel.station_id, Station.latitude, Station.longitude,
     Station.datacenter_id, Station.start_time, Station.end_time, Station.network,
     Station.station, Channel.location, Channel.channel]
    ```
    :param session: the database session
    :param datacenters_df: pandas DataFrame of the data centers. The columns
        `DataCenter.id`, `DataCenter.station_url` and `DataCenter.dataselect_url`
        are read
    :param eidavalidator: forwarded to `save_stations_and_channels` (can be None)
    :param net: forwarded, together with `sta`, `loc`, `cha`, `starttime` and `endtime`,
        to `get_post_data` in order to build the POST data of each request. The first
        four arguments are also used to filter the channels fetched, all six arguments
        are also used to fetch the channels from the database
    :param min_sample_rate: minimum sampling rate, forwarded to `filter_channels_df` and
        `get_channels_df_from_db` (NOTE(review): formerly documented as "set to negative
        value for no-filtering", to be confirmed in those functions)
    :param update: forwarded to `save_stations_and_channels`
    :param max_thread_workers: forwarded to `read_async` as `max_workers`
    :param timeout: forwarded to `read_async`
    :param blocksize: forwarded to `read_async`
    :param db_bufsize: forwarded to `save_stations_and_channels`
    :param show_progress: forwarded to `get_progressbar`
    :raise FailedDownload: if no channel could be fetched, neither from the web services
        nor from the database
    """
    postdata = get_post_data(net, sta, loc, cha, starttime, endtime)

    # the data frames of the channels successfully fetched (one per data center):
    ret = []
    # the ids of the data centers whose request failed:
    url_failed_dc_ids = []
    # generator of the tuples (data center id, Request). All requests share the same
    # POST data, prefixed with the format and level parameters:
    iterable = ((id_,
                 Request(url,
                         data=('format=text\nlevel=channel\n' +
                               post_data_str).encode('utf8')))
                for url, id_, post_data_str in zip(
                    datacenters_df[DataCenter.station_url.key], datacenters_df[
                        DataCenter.id.key], cycle([postdata])))

    with get_progressbar(show_progress, length=len(datacenters_df)) as pbar:
        for obj, result, exc, url in read_async(iterable,
                                                urlkey=lambda obj: obj[-1],
                                                blocksize=blocksize,
                                                max_workers=max_thread_workers,
                                                decode='utf8',
                                                timeout=timeout):
            pbar.update(1)
            dcen_id = obj[0]
            if exc:
                url_failed_dc_ids.append(dcen_id)
                logger.warning(formatmsg("Unable to fetch stations", exc, url))
            else:
                try:
                    dframe = response2normalizeddf(url, result[0], "channel")
                    if not dframe.empty:
                        dframe[Station.datacenter_id.key] = dcen_id
                        ret.append(dframe)
                except ValueError as verr:
                    # malformed response: log and skip it (the data center is NOT
                    # added to those to be fetched from the database)
                    logger.warning(
                        formatmsg("Discarding response data", verr, url))

    db_cha_df = pd.DataFrame()
    if url_failed_dc_ids:  # if some datacenter does not return station, warn with INFO
        dc_df_fromdb = \
            datacenters_df.loc[datacenters_df[DataCenter.id.key].isin(url_failed_dc_ids)]
        logger.info(
            formatmsg(
                "Fetching stations from database for %d (of %d) data-center(s)",
                "download errors occurred"), len(dc_df_fromdb),
            len(datacenters_df))
        logger.info(
            dc_df_fromdb[DataCenter.dataselect_url.key].to_string(index=False))
        db_cha_df = get_channels_df_from_db(session, dc_df_fromdb, net, sta,
                                            loc, cha, starttime, endtime,
                                            min_sample_rate)

    # build two dataframes which we will concatenate afterwards
    web_cha_df = pd.DataFrame()
    if ret:  # pd.concat complains for empty list
        try:
            web_cha_df = filter_channels_df(
                pd.concat(ret, axis=0, ignore_index=True, copy=False), net,
                sta, loc, cha, min_sample_rate)

            # this raises FailedDownload if we cannot save any element:
            web_cha_df = save_stations_and_channels(session, web_cha_df,
                                                    eidavalidator, update,
                                                    db_bufsize)
        except FailedDownload as qexc:
            # the failure is fatal only if there are no channels from the database:
            if db_cha_df.empty:
                raise
            else:
                logger.warning(qexc)

    if db_cha_df.empty and web_cha_df.empty:
        # no channel at all, neither from the web nor from the database:
        raise FailedDownload(
            formatmsg("No station found",
                      ("Unable to fetch stations from all data-centers, "
                       "no data to fetch from the database. "
                       "Check config and log for details")))
    # return the non-empty data frame or, if both are not empty, their concatenation:
    ret = None
    if db_cha_df.empty:
        ret = web_cha_df
    elif web_cha_df.empty:
        ret = db_cha_df
    else:
        ret = pd.concat((web_cha_df, db_cha_df),
                        axis=0,
                        ignore_index=True,
                        sort=False)
    # the columns for the channels dataframe that will be returned
    return ret[[
        c.key for c in (Channel.id, Channel.station_id, Station.latitude,
                        Station.longitude, Station.datacenter_id,
                        Station.start_time, Station.end_time, Station.network,
                        Station.station, Channel.location, Channel.channel)
    ]].copy()
Exemplo n.º 10
0
def prepare_for_download(session,
                         segments_df,
                         dc_dataselect_manager,
                         timespan,
                         retry_seg_not_found,
                         retry_url_err,
                         retry_mseed_err,
                         retry_client_err,
                         retry_server_err,
                         retry_timespan_err,
                         retry_timespan_warn=False):
    """Prepares the segments for download: merges `segments_df` with the segments
    already downloaded (fetched from the database), sets the time bounds of each
    request, and discards the segments which do not have to be downloaded again
    according to the given retry flags.

    Note that new columns are assigned to the data frame passed as `segments_df`
    before merging, i.e. the argument is modified in place.

    :param session: the sql-alchemy session bound to an existing database
    :param segments_df: pandas DataFrame resulting from `get_arrivaltimes`
    :param dc_dataselect_manager: object whose attribute `opendataonly` tells if we are
        downloading open data only
    :param timespan: forwarded to `set_requested_timebounds`
    :param retry_seg_not_found: forwarded, together with all other `retry_*` arguments,
        to `set_segments_to_retry`, which sets the segments to be downloaded again
    :return: the tuple `(segments_df, request_timebounds_need_update)`: the first item
        is the data frame of the segments to download, the second item is the value
        returned by `set_requested_timebounds`
    :raise NothingToDownload: if there is no segment left to download
    """
    opendataonly = dc_dataselect_manager.opendataonly
    # fetch  already downloaded segments and return the corresponding dataframe.
    # which will have also the boolean column SEG.RETRY, which is True for suspiciously
    # restricted (SR) segments, i.e. segments whose download code MIGHT denote that they
    # are restricted (see `s2scodes.restricted_data`):
    db_seg_df = fetch_already_downloaded_segments_df(session, segments_df,
                                                     opendataonly)
    # store now the ids of the SR segments, we will use them later. If open data, `db_seg_df`
    # does not have the column SEG.RETRY so set the ids to a (empty) DataFrame for consistency:
    force_retry_ids = pd.DataFrame() if opendataonly else db_seg_df[SEG.ID][
        db_seg_df[SEG.RETRY]]
    # Now update the SEG.RETRY column (or create it) according to the flags set:
    set_segments_to_retry(db_seg_df, opendataonly, retry_seg_not_found,
                          retry_url_err, retry_mseed_err, retry_client_err,
                          retry_server_err, retry_timespan_err,
                          retry_timespan_warn)

    # Now merge/update existing dataframe (`segments_df`) with the db values (`db_seg_df`).
    # Do it in two steps, 1) and 2):
    # 1) set columns and defaults (for int types, sets np.nan).
    # Note that if we have something to retry (db_seg_df[SEG.RETRY].any()), we add also
    # a column SEG.DSCODE with None/nan as default: checking if that column exists
    # will be the way later to know if we need to update rows or only insert new rows.
    cols2set = OrderedDict([(SEG.ID, np.nan), (SEG.RETRY, True),
                            (SEG.REQSTIME, pd.NaT), (SEG.REQETIME, pd.NaT)] +
                           ([(SEG.DSCODE,
                              np.nan)] if db_seg_df[SEG.RETRY].any() else []))
    for colname, default_ in cols2set.items():
        segments_df[colname] = default_
    # 2) assign/override values of cols2set from db_seg_df to segments_df,
    # matching rows via the [SEG.CHAID, SEG.EVID] cols:
    segments_df = mergeupdate(segments_df, db_seg_df, [SEG.CHAID, SEG.EVID],
                              list(cols2set.keys()))

    request_timebounds_need_update = set_requested_timebounds(
        segments_df, timespan)

    oldlen = len(segments_df)
    # keep only the segments to be downloaded (SEG.RETRY is True by default, i.e. for
    # segments not on the database). Do a copy to avoid SettingWithCopyWarning.
    # Moreover, copy should re-allocate contiguous arrays which might be faster
    # (and less memory consuming after unused memory is released)
    segments_df = segments_df[segments_df[SEG.RETRY]].copy()
    if oldlen != len(segments_df):
        reason = "already downloaded, no retry"
        logger.info(formatmsg("%d segments discarded", reason),
                    oldlen - len(segments_df))

    if segments_df.empty:
        raise NothingToDownload(
            "Nothing to download: all segments already downloaded "
            "according to the current configuration")

    check_suspiciously_duplicated_segment(segments_df)

    # Last step: the policy later will be to UPDATE (=overwrite existing segments on the database)
    # only segments whose download code changed (this is why the column SEG.DSCODE is
    # set above) because yes, it might save a lot of time. E.g., suppose
    # retry_server_error=true and a segment
    # on the db with download code=500 => update it only if the server returns some code != 500.
    # However, if we are downloading with credentials, we need to force updating SR segments which
    # were downloaded with no credentials, by definition of SR (suspiciously restricted).
    # Thus, if we have those segments (`not force_retry_ids.empty`) and we are
    # performing a download on an already existing database (`SEG.DSCODE in segments_df.columns`),
    # for those SR segments we will set the value of the column `SEG.DSCODE` to None/nan:
    # as we will never get any response code = None from the server, those SR segments
    # will always be updated
    if not force_retry_ids.empty and SEG.DSCODE in segments_df.columns:
        segments_df.loc[segments_df[SEG.ID].isin(force_retry_ids),
                        SEG.DSCODE] = np.nan

    # SEG.RETRY was needed only to select the segments to download: remove it
    segments_df.drop([SEG.RETRY], axis=1, inplace=True)

    return segments_df, request_timebounds_need_update