def get_eida_datacenters_df(responsetext):
    """Returns a DataFrame of the EIDA data centers parsed from `responsetext`
    (the EIDA routing service response text)"""
    # For convenience and readability, define once the mapped column names representing the
    # dataframe columns that we need:
    DC_SURL = DataCenter.station_url.key  # pylint: disable=invalid-name
    DC_DURL = DataCenter.dataselect_url.key  # pylint: disable=invalid-name
    DC_ORG = DataCenter.organization_name.key  # pylint: disable=invalid-name

    dclist = []
    for url, postdata in eidarsiter(responsetext):  # @UnusedVariable
        try:
            fdsn = Fdsnws(url)
            dclist.append({DC_SURL: fdsn.url(Fdsnws.STATION),
                           DC_DURL: fdsn.url(Fdsnws.DATASEL),
                           DC_ORG: 'eida'})
        except ValueError:
            logger.warning("Discarding data center (non FDSN url: '%s' "
                           "as returned from the routing service)", url)
    if not dclist:
        raise FailedDownload(Exception("No datacenters found in response text / file"))
    datacenters_df = pd.DataFrame(dclist)
    return datacenters_df
def get_eidars_response_text(routing_service_url):
    """Returns the EIDA routing service response text (str) for the given routing
    service URL, falling back to a locally saved copy if the service is not reachable"""
    # IMPORTANT NOTE:
    # We issue a "basic" query to the EIDA rs, with no params other than 'service' and
    # 'format'. The reason is that, as of Jan 2019, the service is buggy when supplying
    # some arguments (e.g., long lists of channels).
    # Also, this way we can save a local file (independent from the custom query)
    # and read from that file in case of request failure.
    # The drawback is that we might later ask some data centers for data they do not have:
    # this is information that the routing service would provide us if queried with all
    # parameters (net, sta, start, etcetera) ... too bad
    query_args = {'service': 'dataselect', 'format': 'post'}
    url = urljoin(routing_service_url, **query_args)

    try:
        responsetext, status, msg = urlread(url, decode='utf8', raise_http_err=True)
        if not responsetext:
            raise URLException(Exception("Empty data response"))  # fall below
    except URLException as urlexc:
        responsetext, last_mod_time_str = _get_local_routing_service()
        msg = ("Eida routing service error, reading routes from file "
               "(last updated: %s)" % last_mod_time_str)
        logger.info(formatmsg(msg, "eida routing service error"))
        logger.warning(formatmsg("Eida routing service error", urlexc.exc, url))

    return responsetext
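# Usage sketch (illustration only, not part of the original module): the two functions
# above are typically combined as below. `routing_service_url` is whatever EIDA routing
# service endpoint the caller has configured (an assumption here, passed as argument):
def _example_get_eida_datacenters(routing_service_url):
    responsetext = get_eidars_response_text(routing_service_url)
    # DataFrame with station_url, dataselect_url and organization_name columns:
    return get_eida_datacenters_df(responsetext)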
def logwarn_dataframe(dataframe, msg, cols_to_print_on_err, max_row_count=30):
    '''Prints (using logger.warning) the given dataframe. Does not check if the
    dataframe is empty'''
    len_df = len(dataframe)
    if len_df > max_row_count:
        footer = "\n... (showing first %d rows only)" % max_row_count
        dataframe = dataframe.iloc[:max_row_count]
    else:
        footer = ""
    msg = "{}:\n{}{}".format(msg,
                             dataframe.to_string(columns=cols_to_print_on_err,
                                                 index=False),
                             footer)
    logger.warning(msg)
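# Usage sketch (illustration only, not part of the original module): report two
# hypothetical malformed rows, printing only the 'net' and 'sta' columns:
def _example_logwarn_dataframe():
    bad_rows = pd.DataFrame([{'net': 'GE', 'sta': 'APE', 'lat': None},
                             {'net': 'XX', 'sta': 'ABC', 'lat': None}])
    logwarn_dataframe(bad_rows, "Discarding station rows (missing latitude)",
                      ['net', 'sta'])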
def warn(self, request, exc):
    '''Issues a logger.warning if the given error is not already reported

    :param request: the Request object
    :param exc: the reported Exception or string message
    '''
    url = get_host(request)
    item = (url, err2str(exc))  # use err2str to uniquely identify exc
    if item not in self:
        if not self:
            logger.warning('Detailed inventory download errors '
                           '(showing only first of each type per data center):')
        self.add(item)
        request_str = url2str(request)
        logger.warning(formatmsg("Inventory download error", exc, request_str))
def __init__(self, datacenters_df, authorizer, show_progress=False):
    '''Initializes a new DcDataselectManager'''
    DC_ID = DataCenter.id.key  # pylint: disable=invalid-name
    DC_DSURL = DataCenter.dataselect_url.key  # pylint: disable=invalid-name
    # There is a handy function datacenters_df.set_index(keys_col)[values_col].to_dict,
    # but we use iterrows because we convert each data center url to its Fdsnws object:
    dcid2fdsn = {int(row[DC_ID]): Fdsnws(row[DC_DSURL])
                 for _, row in datacenters_df.iterrows()}
    # Note: Fdsnws might raise, but at this point datacenters_df is assumed to be
    # well formed
    errors = {}  # urls mapped to their exception
    if authorizer.token:
        token = authorizer.token
        self._data, errors = self._get_data_from_token(dcid2fdsn, token, show_progress)
        self._restricted_id = [did for did in self._data if did not in errors]
    elif authorizer.userpass:
        user, password = authorizer.userpass
        self._data, errors = self._get_data_from_userpass(dcid2fdsn, user, password)
        self._restricted_id = list(dcid2fdsn.keys())
    else:  # no authorization required
        self._data, errors = self._get_data_open(dcid2fdsn)
        self._restricted_id = []

    if errors:
        # map data center sites to their error, not dc ids:
        errors = {dcid2fdsn[dcid].site: err for dcid, err in errors.items()}
        logger.info(formatmsg('Downloading open data only from: %s' % ", ".join(errors),
                              'Unable to acquire credentials for restricted data'))
        for url, exc in errors.items():
            logger.warning(formatmsg("Downloading open data only, "
                                     "Unable to acquire credentials for restricted data",
                                     str(exc), url))
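# Illustration only (not part of the original module): the dict comprehension above turns
# each data center row into an Fdsnws helper keyed by the data center id. The URL below
# is an assumption chosen for the sketch:
def _example_dcid2fdsn():
    toy_df = pd.DataFrame([{DataCenter.id.key: 1,
                            DataCenter.dataselect_url.key:
                                'http://geofon.gfz-potsdam.de/fdsnws/dataselect/1/query'}])
    return {int(row[DataCenter.id.key]): Fdsnws(row[DataCenter.dataselect_url.key])
            for _, row in toy_df.iterrows()}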
def response2normalizeddf(url, raw_data, dbmodel_key):
    """Returns a normalized and harmonized dataframe from raw_data. dbmodel_key can be
    'event', 'station' or 'channel'. Raises ValueError if the resulting dataframe is
    empty or if a ValueError is raised from sub-functions

    :param url: url (string) or `Request` object. Used only to log the specified url in
        case of warnings
    :param raw_data: valid FDSN data in text format. For info see:
        https://www.fdsn.org/webservices/FDSN-WS-Specifications-1.1.pdf#page=12
    """
    dframe = response2df(raw_data)
    oldlen, dframe = len(dframe), normalize_fdsn_dframe(dframe, dbmodel_key)
    # stations_df surely not empty:
    if oldlen > len(dframe):
        logger.warning(formatmsg("%d row(s) discarded", "malformed text data", url),
                       oldlen - len(dframe))
    return dframe
def warn(self, request, url, code, exc):
    '''Issues a logger.warning if the given error is not already reported

    :param request: the Request object
    :param url: string, usually the request's url host, to identify same data centers
    :param code: the error code
    :param exc: the reported Exception
    '''
    item = (url, code, str(exc.__class__.__name__))
    if item not in self:
        if not self:
            logger.warning('Detailed segment download errors '
                           '(showing only first of each type per data center):')
        self.add(item)
        request_str = url2str(request)
        logger.warning(formatmsg("Segment download error, code %s" % str(code),
                                 exc, request_str))
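# Note (assumption, not from the original module): both `warn` methods above rely only on
# set-like behaviour of their enclosing class (`item not in self`, `self.add(item)`),
# i.e. they deduplicate by a hashable key so each error type is logged once per data
# center. A minimal host class could therefore look like the sketch below, where
# `report_once` is a hypothetical helper name:
class _ExampleErrorLogger(set):
    """Illustrative only: stores the keys of already-reported errors"""
    def report_once(self, key, message):
        if key not in self:
            self.add(key)
            logger.warning(message)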
def get_channels_df(session, datacenters_df, eidavalidator,  # <- can be None
                    net, sta, loc, cha, starttime, endtime,
                    min_sample_rate, update,
                    max_thread_workers, timeout, blocksize, db_bufsize,
                    show_progress=False):
    """Returns a dataframe representing a query to the station web services of the data
    centers in `datacenters_df` with the given arguments (falling back to the internal db
    for the data centers whose request failed). The dataframe will have as columns the
    `key` attribute of the following db columns:
    ```
    [Channel.id, Channel.station_id, Station.latitude, Station.longitude,
     Station.datacenter_id, Station.start_time, Station.end_time, Station.network,
     Station.station, Channel.location, Channel.channel]
    ```

    :param datacenters_df: the first item resulting from `get_datacenters_df`
        (pandas DataFrame)
    :param eidavalidator: the second item resulting from `get_datacenters_df`
        (can be None)
    :param net: a list of strings denoting the networks, or an empty list for no
        filtering. Each string follows FDSN specifications (e.g. 'GE', 'G?'). The same
        holds for `sta`, `loc` and `cha` (see also :func:`filter_channels_df`)
    :param min_sample_rate: minimum sampling rate, set to a negative value for
        no filtering (all channels)
    """
    postdata = get_post_data(net, sta, loc, cha, starttime, endtime)

    ret = []
    url_failed_dc_ids = []
    iterable = ((id_, Request(url,
                              data=('format=text\nlevel=channel\n' +
                                    post_data_str).encode('utf8')))
                for url, id_, post_data_str in zip(datacenters_df[DataCenter.station_url.key],
                                                   datacenters_df[DataCenter.id.key],
                                                   cycle([postdata])))

    with get_progressbar(show_progress, length=len(datacenters_df)) as pbar:
        for obj, result, exc, url in read_async(iterable,
                                                urlkey=lambda obj: obj[-1],
                                                blocksize=blocksize,
                                                max_workers=max_thread_workers,
                                                decode='utf8', timeout=timeout):
            pbar.update(1)
            dcen_id = obj[0]
            if exc:
                url_failed_dc_ids.append(dcen_id)
                logger.warning(formatmsg("Unable to fetch stations", exc, url))
            else:
                try:
                    dframe = response2normalizeddf(url, result[0], "channel")
                    if not dframe.empty:
                        dframe[Station.datacenter_id.key] = dcen_id
                        ret.append(dframe)
                except ValueError as verr:
                    logger.warning(formatmsg("Discarding response data", verr, url))

    db_cha_df = pd.DataFrame()
    if url_failed_dc_ids:  # some data center did not return stations: inform (INFO level)
        dc_df_fromdb = \
            datacenters_df.loc[datacenters_df[DataCenter.id.key].isin(url_failed_dc_ids)]
        logger.info(formatmsg("Fetching stations from database for %d (of %d) "
                              "data-center(s)", "download errors occurred"),
                    len(dc_df_fromdb), len(datacenters_df))
        logger.info(dc_df_fromdb[DataCenter.dataselect_url.key].to_string(index=False))
        db_cha_df = get_channels_df_from_db(session, dc_df_fromdb, net, sta, loc, cha,
                                            starttime, endtime, min_sample_rate)

    # build two dataframes which we will concatenate afterwards
    web_cha_df = pd.DataFrame()
    if ret:  # pd.concat complains for empty list
        try:
            web_cha_df = filter_channels_df(
                pd.concat(ret, axis=0, ignore_index=True, copy=False),
                net, sta, loc, cha, min_sample_rate)

            # this raises FailedDownload if we cannot save any element:
            web_cha_df = save_stations_and_channels(session, web_cha_df, eidavalidator,
                                                    update, db_bufsize)
        except FailedDownload as qexc:
            if db_cha_df.empty:
                raise
            else:
                logger.warning(qexc)

    if db_cha_df.empty and web_cha_df.empty:
        # nothing fetched from the web services nor from the database:
        raise FailedDownload(formatmsg("No station found",
                                       ("Unable to fetch stations from all data-centers, "
                                        "no data to fetch from the database. "
                                        "Check config and log for details")))

    ret = None
    if db_cha_df.empty:
        ret = web_cha_df
    elif web_cha_df.empty:
        ret = db_cha_df
    else:
        ret = pd.concat((web_cha_df, db_cha_df), axis=0, ignore_index=True, sort=False)

    # return only the columns needed for the channels dataframe:
    return ret[[c.key for c in (Channel.id, Channel.station_id, Station.latitude,
                                Station.longitude, Station.datacenter_id,
                                Station.start_time, Station.end_time, Station.network,
                                Station.station, Channel.location,
                                Channel.channel)]].copy()
def filter_channels_df(channels_df, net, sta, loc, cha, min_sample_rate):
    '''Filters out `channels_df` according to the given parameters. Raises
    `FailedDownload` if the returned filtered data frame would be empty

    Note that the `net, sta, loc, cha` filters will be considered only if they are
    negations (i.e., with leading exclamation mark: "!A*"), because the 'positive'
    filters are FDSN standard and are supposed to have been used already when producing
    `channels_df`

    Example:

    filter_channels_df(d, [], ['ABC'], [''], ['!A*', 'HH?', 'HN?'])

    basically takes the dataframe `d`, finds the column related to the `channels` key and
    removes all rows whose channel starts with 'A', returning the new filtered data frame

    Arguments are usually the output of :func:`stream2segment.download.utils.nslc_lists`

    :param net: an iterable of strings denoting networks.
    :param sta: an iterable of strings denoting stations.
    :param loc: an iterable of strings denoting locations.
    :param cha: an iterable of strings denoting channels.
    :param min_sample_rate: numeric, minimum sample rate. If negative or zero, this
        parameter is ignored
    '''
    # create a dict of regexps for the pandas dataframe. FDSNWS does not support NOT
    # operators, thus concatenate the expressions with OR
    dffilter = None
    sa_cols = (Station.network, Station.station, Channel.location, Channel.channel)

    for lst, sa_col in zip((net, sta, loc, cha), sa_cols):
        if not lst:
            continue
        lst = [_ for _ in lst if _[0:1] == '!']  # take only negation expressions
        if not lst:
            continue
        condition = ("^%s$" if len(lst) == 1 else "^(?:%s)$") % \
            "|".join(strconvert.wild2re(x[1:]) for x in lst)
        flt = channels_df[sa_col.key].str.match(re.compile(condition))
        if dffilter is None:
            dffilter = flt
        else:
            dffilter &= flt

    if min_sample_rate > 0:
        # account for Nones, thus negate the predicate below:
        flt = ~(channels_df[Channel.sample_rate.key] >= min_sample_rate)
        if dffilter is None:
            dffilter = flt
        else:
            dffilter &= flt

    ret = channels_df if dffilter is None else \
        channels_df[~dffilter].copy()  # pylint: disable=invalid-unary-operand-type

    if ret.empty:
        raise FailedDownload("No channel matches user defined filters "
                             "(network, channel, sample rate, ...)")

    discarded_sr = len(channels_df) - len(ret)
    if discarded_sr:
        logger.warning(("%d channel(s) discarded according to current config. filters "
                        "(network, channel, sample rate, ...)"), discarded_sr)

    return ret
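# Usage sketch (illustration only, not part of the original module): drop channels
# starting with 'A' and channels below 100 Hz from a toy dataframe. Column names are
# taken from the ORM `.key` attributes used above; the row values are made up:
def _example_filter_channels():
    toy_df = pd.DataFrame({Station.network.key: ['GE', 'GE'],
                           Station.station.key: ['APE', 'MATE'],
                           Channel.location.key: ['', ''],
                           Channel.channel.key: ['AHZ', 'HHZ'],
                           Channel.sample_rate.key: [20.0, 100.0]})
    # returns only the 'HHZ' row (the 'AHZ' row matches '!A*' and has a low sample rate):
    return filter_channels_df(toy_df, [], [], [], ['!A*'], min_sample_rate=100)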
def download_save_segments(session, segments_df, dc_dataselect_manager, chaid2mseedid,
                           download_id, update_datacenters, update_request_timebounds,
                           max_thread_workers, timeout, download_blocksize, db_bufsize,
                           show_progress=False):
    """Downloads and saves the segments. segments_df MUST not be empty (this is not
    checked for)

    :param segments_df: the dataframe resulting from `prepare_for_download`. The
        Dataframe might or might not have the column 'download_code'. If it has, it will
        skip writing to db segments whose code did not change: in this case, nans stored
        under 'download_code' in segments_df indicate new segments, or segments for which
        the update has to be forced, whatever code is obtained (e.g., queryauth when
        previously a simple query was used)
    :param chaid2mseedid: dict of channel ids (int) mapped to mseed ids
        (strings in "Network.station.location.channel" format)
    """
    # set queryauth column here, outside the loop:
    restricted_enable_dcids = dc_dataselect_manager.restricted_enabled_ids
    if restricted_enable_dcids:
        segments_df[SEG.QAUTH] = \
            segments_df[SEG.DCID].isin(dc_dataselect_manager.restricted_enabled_ids)
    else:
        segments_df[SEG.QAUTH] = False

    segmanager = get_dbmanager(session, update_datacenters, update_request_timebounds,
                               db_bufsize)
    stats = DownloadStats()

    # Define the groupby columns: we first group by (datacenter, stime, etime), meaning
    # that rows with those values equal will be yielded in the same sub-dataframe. If a
    # 413 error is returned, we collect those rows in a new dataframe and retry grouping
    # segment by segment, i.e. by (datacenter, stime, etime, channel). Unfortunately, for
    # performance reasons we do not have the (net, sta, loc, cha) columns, but we do have
    # channel_id, which basically comprises them.
    # NOTE: SEG_START and SEG_END MUST ALWAYS BE IN THE SECOND AND THIRD POSITION!!!
    groupsby = [
        [SEG.DCID, SEG.START, SEG.END],
        [SEG.DCID, SEG.START, SEG.END, SEG.CHAID]
    ]

    # These are the column names to be set on a dataframe from a received response,
    # mapped to their default value. Use nan to let pandas understand the column is
    # numeric; None is used for string (object) types (how it is converted should be
    # checked). For numpy types, see:
    # https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html#specifying-and-constructing-data-types
    defaultvalues = {SEG.DATA: None, SEG.SRATE: np.nan, SEG.MGAP: np.nan,
                     SEG.DATAID: None, SEG.DSCODE: np.nan, SEG.STIME: pd.NaT,
                     SEG.ETIME: pd.NaT}
    defaultvalues[SEG.DOWNLID] = download_id
    defaultvalues_nodata = dict(defaultvalues)  # copy
    col_dscode, col_data = SEG.DSCODE, SEG.DATA
    toupdate = SEG.DSCODE in segments_df.columns
    code_not_found = s2scodes.seg_not_found
    skipped_same_code = 0
    # report segment errors only once per error type and data center:
    seg_logger = SegmentLogger()

    with get_progressbar(show_progress, length=len(segments_df)) as pbar:

        skipped_dataframes = []  # store dataframes with a 413 error and retry later
        for group_ in groupsby:

            if segments_df.empty:  # for safety (if this is the second loop or greater)
                break

            is_last_iteration = group_ == groupsby[-1]
            seg_groups = segments_df.groupby(group_, sort=False)
            for data, exc, code, request, dframe in \
                    get_responses(seg_groups, dc_dataselect_manager, chaid2mseedid,
                                  max_thread_workers, timeout, download_blocksize):

                num_segments = len(dframe)
                if code == 413 and not is_last_iteration and num_segments > 1:
                    skipped_dataframes.append(dframe)
                    continue

                pbar.update(num_segments)
                url = get_host(request)
                url_stats = stats[url]

                if exc is None and data != b'':
                    # set default values on the dataframe:
                    dframe = dframe.assign(**defaultvalues)  # assign returns a copy
                    populate_dataframe(data, code, dframe, chaid2mseedid)
                    # group by download code, count them, and add the counts to stats:
                    for kode, kount in get_counts(dframe, col_dscode, code_not_found):
                        url_stats[kode] += kount
                else:
                    # here we are if: exc is not None OR data == b''
                    url_stats[code] += num_segments
                    if toupdate and code is not None and \
                            (dframe[col_dscode] == code).sum():
                        # If there are rows to update, discard those for which the code
                        # is the same as in the database. If we requested a different
                        # time window, we should update the time windows, but there is no
                        # point in this overhead. The condition `code is not None` should
                        # never happen, but we keep it for safety: we have set the
                        # download code column of `dframe` to None/nan to mark segments
                        # to update nevertheless, on the assumption that we never get a
                        # response code of None (see comment L.94). Thus, if for some
                        # weird reason the response code is None, then update the segment
                        # anyway (as we wanted to)
                        dframe = dframe[dframe[col_dscode] != code]
                        skipped_same_code += num_segments - len(dframe)
                        if dframe.empty:  # nothing to update on the db
                            continue
                    # update dict of default values, and set it to the dataframe:
                    defaultvalues_nodata.update({col_dscode: code, col_data: data})
                    dframe = dframe.assign(**defaultvalues_nodata)  # assign returns a copy

                    if exc is not None:
                        # log segment errors only once per error type and data center,
                        # otherwise the log is hundreds of Mb and it's unreadable:
                        seg_logger.warn(request, url, code, exc)

                segmanager.add(dframe)

            segmanager.flush()  # flush remaining stuff to insert / update, if any

            if skipped_dataframes:
                segments_df = pd.concat(skipped_dataframes, axis=0, ignore_index=True,
                                        copy=True, verify_integrity=False)
                skipped_dataframes = []
            else:
                # break the next loop, if any
                segments_df = pd.DataFrame()

    segmanager.close()  # flush remaining stuff to insert / update

    if skipped_same_code:
        logger.warning(formatmsg(("%d already saved segment(s) with no waveform data "
                                  "skipped with no messages, only their count is "
                                  "reported in statistics") % skipped_same_code,
                                 "Still receiving the same download code"))
    return stats