def test_download_save_segments_timebounds(self, mock_updatedf, mock_insertdf, mseed_unpack, db, tt_ak135_tts): # prepare: # mseed unpack takes no starttime and endtime arguments, so that mseed_unpack.side_effect = lambda *a, **v: unpack(*a, **v) mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v) mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v) # mock event response: it's the same as self._evt_urlread_sideeffect but modify the dates # as NOW. This means, any segment downloaded later will # be out-of-bound utcnow = datetime.utcnow() utcnow_iso = utcnow.isoformat().replace("T", " ") urlread_sideeffect = """#EventID | Time | Latitude | Longitude | Depth/km | Author | Catalog | Contributor | ContributorID | MagType | Magnitude | MagAuthor | EventLocationName 20160508_0000129|%s|1|1|60.0|AZER|EMSC-RTS|AZER|505483|ml|3|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN 20160508_0000004|%s|90|90|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|4|EMSC|CROATIA """ % (utcnow_iso, utcnow_iso) events_df = self.get_events_df(urlread_sideeffect, db.session) # restore urlread_side_effect: urlread_sideeffect = None net, sta, loc, cha = [], [], [], [] datacenters_df, eidavalidator = \ self.get_datacenters_df(urlread_sideeffect, db.session, self.service, self.routing_service, net, sta, loc, cha, db_bufsize=self.db_buf_size) channels_df = self.get_channels_df(urlread_sideeffect, db.session, datacenters_df, eidavalidator, net, sta, loc, cha, None, None, 10, False, None, None, -1, self.db_buf_size) # just to be sure. If failing, we might have changed the class default: assert len(channels_df) == 12 # events_df # id magnitude latitude longitude depth_km time # 0 20160508_0000129 3.0 1.0 1.0 60.0 2016-05-08 05:17:11.500 # 1 20160508_0000004 4.0 2.0 2.0 2.0 2016-05-08 01:45:30.300 # channels_df (index not shown): # columns: # id station_id latitude longitude datacenter_id start_time end_time network station location channel # data (not aligned with columns): # 1 1 1.0 1.0 1 2003-01-01 NaT GE FLT1 HHE # 2 1 1.0 1.0 1 2003-01-01 NaT GE FLT1 HHN # 3 1 1.0 1.0 1 2003-01-01 NaT GE FLT1 HHZ # 4 2 90.0 90.0 1 2009-01-01 NaT n1 s c1 # 5 2 90.0 90.0 1 2009-01-01 NaT n1 s c2 # 6 2 90.0 90.0 1 2009-01-01 NaT n1 s c3 # 7 3 1.0 1.0 2 2003-01-01 NaT IA BAKI BHE # 8 3 1.0 1.0 2 2003-01-01 NaT IA BAKI BHN # 9 3 1.0 1.0 2 2003-01-01 NaT IA BAKI BHZ # 10 4 90.0 90.0 2 2009-01-01 NaT n2 s c1 # 11 4 90.0 90.0 2 2009-01-01 NaT n2 s c2 # 12 4 90.0 90.0 2 2009-01-01 NaT n2 s c3 assert all(_ in channels_df.columns for _ in [Station.network.key, Station.station.key, Channel.location.key, Channel.channel.key]) chaid2mseedid = chaid2mseedid_dict(channels_df) # check that we removed the columns: assert not any(_ in channels_df.columns for _ in [Station.network.key, Station.station.key, Channel.location.key, Channel.channel.key]) # take all segments: # use minmag and maxmag ttable = tt_ak135_tts segments_df = merge_events_stations(events_df, channels_df, dict(minmag=10, maxmag=10, minmag_radius=10, maxmag_radius=10), tttable=ttable) assert len(pd.unique(segments_df['arrival_time'])) == 2 h = 9 # segments_df (index not shown). 
Note that # cid sid did n s l c ed event_id depth_km time <- LAST TWO ARE Event related columns that will be removed after arrival_time calculations # 1 1 1 GE FLT1 HHE 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 2 1 1 GE FLT1 HHN 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 3 1 1 GE FLT1 HHZ 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 7 3 2 IA BAKI BHE 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 8 3 2 IA BAKI BHN 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 9 3 2 IA BAKI BHZ 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 4 2 1 n1 s c1 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 5 2 1 n1 s c2 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 6 2 1 n1 s c3 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 10 4 2 n2 s c1 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 11 4 2 n2 s c2 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 12 4 2 n2 s c3 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # LEGEND: # cid = channel_id # sid = station_id # scid = datacenter_id # n, s, l, c = network, station, location, channel # ed = event_distance_deg # define a dc_dataselect_manager for open data only: dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False) wtimespan = [1, 2] # in minutes expected = len(segments_df) # no segment on db, we should have all segments to download orig_segments_df = segments_df.copy() segments_df, request_timebounds_need_update = \ prepare_for_download(db.session, orig_segments_df, dc_dataselect_manager, wtimespan, retry_seg_not_found=True, retry_url_err=True, retry_mseed_err=True, retry_client_err=True, retry_server_err=True, retry_timespan_err=True, retry_timespan_warn=True) # segments_df # COLUMNS: # channel_id datacenter_id network station location channel event_distance_deg event_id arrival_time start_time end_time id download_status_code run_id # DATA (not aligned with columns): # channel_id datacenter_id network station location channel event_distance_deg event_id arrival_time start_time end_time id download_status_code run_id # GE.FLT1..HHE 1 1 GE FLT1 HHE 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # GE.FLT1..HHN 2 1 GE FLT1 HHN 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # GE.FLT1..HHZ 3 1 GE FLT1 HHZ 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # IA.BAKI..BHE 7 2 IA BAKI BHE 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # IA.BAKI..BHN 8 2 IA BAKI BHN 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # IA.BAKI..BHZ 9 2 IA BAKI BHZ 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # n1.s..c1 4 1 n1 s c1 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n1.s..c2 5 1 n1 s c2 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n1.s..c3 6 1 n1 s c3 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n2.s..c1 10 2 n2 s c1 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n2.s..c2 11 2 n2 s c2 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n2.s..c3 12 2 n2 s c3 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # self._segdata is the folder file of a "valid" 3-channel miniseed # The channels are: # Thus, no match will be found and all segments will be written with a None # download 
status code # setup urlread: first three rows: ok # rows[3:6]: 413, retry them # rows[6:9]: malformed_data # rows[9:12] 413, retry them # then retry: # rows[3]: empty_data # rows[4]: data_with_gaps (but seed_id should notmatch) # rows[5]: data_with_gaps (seed_id should notmatch) # rows[9]: URLError # rows[10]: Http 500 error # rows[11]: 413 # NOTE THAT THIS RELIES ON THE FACT THAT THREADS ARE EXECUTED IN THE ORDER OF THE DATAFRAME # WHICH SEEMS TO BE THE CASE AS THERE IS ONE SINGLE PROCESS # self._seg_data[:2] is a way to mock data corrupted urlread_sideeffect = [self._seg_data, 413, self._seg_data[:2], 413, '', self._seg_data_gaps, self._seg_data_gaps, URLError("++urlerror++"), 500, 413] # Let's go: ztatz = self.download_save_segments(urlread_sideeffect, db.session, segments_df, dc_dataselect_manager, chaid2mseedid, self.run.id, False, request_timebounds_need_update, 1, 2, 3, db_bufsize=self.db_buf_size) # get columns from db which we are interested on to check cols = [Segment.id, Segment.channel_id, Segment.datacenter_id, Segment.download_code, Segment.maxgap_numsamples, Segment.sample_rate, Segment.data_seed_id, Segment.data, Segment.download_id, Segment.request_start, Segment.request_end, Segment.start_time, Segment.end_time ] db_segments_df = dbquery2df(db.session.query(*cols)) assert Segment.download_id.key in db_segments_df.columns OUTTIME_ERR, OUTTIME_WARN = s2scodes.timespan_err, s2scodes.timespan_warn # assert no segment has data (time out of bounds): assert len(db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) & (db_segments_df[Segment.data.key].str.len() > 0), Segment.data.key]) == 0 # assert the number of "correctly" downloaded segments, i.e. with data (4) has now # code = TIMEBOUND_ERR assert len(db_segments_df[db_segments_df[Segment.download_code.key] == OUTTIME_ERR]) == 4 # re-sort db_segments_df to match the segments_df: ret = [] for cha in segments_df[Segment.channel_id.key]: ret.append(db_segments_df[db_segments_df[Segment.channel_id.key] == cha]) db_segments_df = pd.concat(ret, axis=0) # db_segments_df: # id channel_id datacenter_id download_status_code max_gap_ovlap_ratio sample_rate data_seed_id data run_id start_time end_time # 0 1 1 1 -3 0.0001 100.0 GE.FLT1..HHE b'' 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 1 2 2 1 -3 0.0001 100.0 GE.FLT1..HHN b'' 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 2 3 3 1 -3 0.0001 100.0 GE.FLT1..HHZ b'' 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 6 7 7 2 200.0 NaN NaN None 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 7 8 8 2 NaN NaN NaN None None 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 8 9 9 2 -3 20.0 20.0 IA.BAKI..BHZ b'' 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 3 4 4 1 -2.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 4 5 5 1 -2.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 5 6 6 1 -2.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 9 10 10 2 -1.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 10 11 11 2 500.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 11 12 12 2 413.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # now modify the first row time bounds: # first we need to assign the database id to our segments_df, to prevent # db contraint error when writing to db: # `download_save_segments` below needs toi UPDATE the segments and it does it by # checking if an id is present. 
# check that the channel_ids align:
assert (segments_df[Segment.channel_id.key].values ==
        db_segments_df[Segment.channel_id.key].values).all()
# so that we can simply do this:
segments_df[Segment.id.key] = db_segments_df[Segment.id.key]
# first read the miniseed:
stream = read(BytesIO(self._seg_data))
tstart = stream[0].stats.starttime.datetime
tend = stream[0].stats.endtime.datetime
segments_df.loc[segments_df[Segment.channel_id.key] == 1,
                Segment.request_start.key] = tstart
segments_df.loc[segments_df[Segment.channel_id.key] == 1,
                Segment.request_end.key] = tstart + (tend - tstart) / 2
segments_df.loc[segments_df[Segment.channel_id.key] == 2,
                Segment.request_start.key] = tstart
segments_df.loc[segments_df[Segment.channel_id.key] == 2,
                Segment.request_end.key] = tend
# build a segments_df with the three segments of the first station (channel ids 1, 2, 3);
# copy at the end to avoid a pandas SettingWithCopyWarning:
new_segments_df = \
    segments_df.loc[segments_df[Segment.channel_id.key].isin([1, 2, 3]), :].copy()
# change urlread_side_effect to provide, for the first three segments, the same
# sequence of bytes. The sequence actually is OK, but in the first case it will be
# PARTIALLY saved, in the second case TOTALLY, and in the third case NOT AT ALL:
urlread_sideeffect = [self._seg_data, self._seg_data, self._seg_data]
# define a dc_dataselect_manager for open data only:
dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False)
ztatz = self.download_save_segments(urlread_sideeffect, db.session, new_segments_df,
                                    dc_dataselect_manager, chaid2mseedid,
                                    self.run.id, False,
                                    request_timebounds_need_update,
                                    1, 2, 3, db_bufsize=self.db_buf_size)
db_segments_df = dbquery2df(db.session.query(*cols))
# re-sort db_segments_df to match the segments_df:
ret = [db_segments_df[db_segments_df[Segment.channel_id.key] == cha]
       for cha in segments_df[Segment.channel_id.key]]
db_segments_df = pd.concat(ret, axis=0)
# assert the 1st segment, whose time range has been modified, has data BUT its
# download code is the time-span warning (OUTTIME_WARN): the data is only
# partially saved (see comment above):
df__ = db_segments_df.loc[db_segments_df[Segment.channel_id.key] == 1, :]
assert len(df__) == 1
row__ = df__.iloc[0]
assert row__[Segment.download_code.key] == OUTTIME_WARN
assert len(row__[Segment.data.key]) > 0
# assert the 2nd segment, whose time range has been modified, has data AND
# download code 200 (ok):
df__ = db_segments_df.loc[db_segments_df[Segment.channel_id.key] == 2, :]
assert len(df__) == 1
row__ = df__.iloc[0]
assert row__[Segment.download_code.key] == 200
assert len(row__[Segment.data.key]) > 0
# assert the 3rd segment, whose time range has NOT been modified, has no data
# AND its download code is still the time-span error (OUTTIME_ERR):
df__ = db_segments_df.loc[db_segments_df[Segment.channel_id.key] == 3, :]
assert len(df__) == 1
row__ = df__.iloc[0]
assert row__[Segment.download_code.key] == OUTTIME_ERR
assert len(row__[Segment.data.key]) == 0
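# Illustrative sketch (hypothetical helper, not used by the tests above): this test
# and the following ones re-order the dataframe read back from the database so that
# its rows follow the channel order of `segments_df`, using a filter+concat per
# channel id. That repeated pattern could be written once as:
import pandas as pd

def sort_like(db_df, channel_ids, channel_col):
    """Return `db_df` with its rows re-ordered to follow `channel_ids`
    (one block of rows per channel id, in the given order)."""
    return pd.concat([db_df[db_df[channel_col] == cid] for cid in channel_ids],
                     axis=0)

# e.g.: db_segments_df = sort_like(db_segments_df,
#                                  segments_df[Segment.channel_id.key],
#                                  Segment.channel_id.key)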
def test_download_save_segments(self, mock_updatedf, mock_insertdf, mseed_unpack, db, tt_ak135_tts): # prepare: # mseed unpack takes no starttime and endtime arguments, so that # we do not discard any correct chunk mseed_unpack.side_effect = lambda *a, **v: unpack(a[0]) mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v) mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v) urlread_sideeffect = None # use defaults from class events_df = self.get_events_df(urlread_sideeffect, db.session) net, sta, loc, cha = [], [], [], [] datacenters_df, eidavalidator = \ self.get_datacenters_df(urlread_sideeffect, db.session, self.service, self.routing_service, net, sta, loc, cha, db_bufsize=self.db_buf_size) channels_df = self.get_channels_df(urlread_sideeffect, db.session, datacenters_df, eidavalidator, net, sta, loc, cha, None, None, 10, False, None, None, -1, self.db_buf_size) assert len(channels_df) == 12 # just to be sure. If failing, we might have changed the class default # events_df # id magnitude latitude longitude depth_km time # 0 20160508_0000129 3.0 1.0 1.0 60.0 2016-05-08 05:17:11.500 # 1 20160508_0000004 4.0 2.0 2.0 2.0 2016-05-08 01:45:30.300 # channels_df (index not shown): # columns: # id station_id latitude longitude datacenter_id start_time end_time network station location channel # data (not aligned with columns): # 1 1 1.0 1.0 1 2003-01-01 NaT GE FLT1 HHE # 2 1 1.0 1.0 1 2003-01-01 NaT GE FLT1 HHN # 3 1 1.0 1.0 1 2003-01-01 NaT GE FLT1 HHZ # 4 2 90.0 90.0 1 2009-01-01 NaT n1 s c1 # 5 2 90.0 90.0 1 2009-01-01 NaT n1 s c2 # 6 2 90.0 90.0 1 2009-01-01 NaT n1 s c3 # 7 3 1.0 1.0 2 2003-01-01 NaT IA BAKI BHE # 8 3 1.0 1.0 2 2003-01-01 NaT IA BAKI BHN # 9 3 1.0 1.0 2 2003-01-01 NaT IA BAKI BHZ # 10 4 90.0 90.0 2 2009-01-01 NaT n2 s c1 # 11 4 90.0 90.0 2 2009-01-01 NaT n2 s c2 # 12 4 90.0 90.0 2 2009-01-01 NaT n2 s c3 assert all(_ in channels_df.columns for _ in [Station.network.key, Station.station.key, Channel.location.key, Channel.channel.key]) chaid2mseedid = chaid2mseedid_dict(channels_df) # check that we removed the columns: assert not any(_ in channels_df.columns for _ in [Station.network.key, Station.station.key, Channel.location.key, Channel.channel.key]) # take all segments: # use minmag and maxmag ttable = tt_ak135_tts segments_df = merge_events_stations(events_df, channels_df, dict(minmag=10, maxmag=10, minmag_radius=10, maxmag_radius=10), tttable=ttable) assert len(pd.unique(segments_df['arrival_time'])) == 2 h = 9 # segments_df (index not shown). 
Note that # cid sid did n s l c ed event_id depth_km time <- LAST TWO ARE Event related columns that will be removed after arrival_time calculations # 1 1 1 GE FLT1 HHE 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 2 1 1 GE FLT1 HHN 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 3 1 1 GE FLT1 HHZ 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 7 3 2 IA BAKI BHE 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 8 3 2 IA BAKI BHN 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 9 3 2 IA BAKI BHZ 0.0 20160508_0000129 60.0 2016-05-08 05:17:11.500 # 4 2 1 n1 s c1 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 5 2 1 n1 s c2 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 6 2 1 n1 s c3 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 10 4 2 n2 s c1 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 11 4 2 n2 s c2 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # 12 4 2 n2 s c3 0.0 20160508_0000004 2.0 2016-05-08 01:45:30.300 # LEGEND: # cid = channel_id # sid = station_id # scid = datacenter_id # n, s, l, c = network, station, location, channel # ed = event_distance_deg # define a dc_dataselect_manager for open data only: dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False) wtimespan = [1,2] expected = len(segments_df) # no segment on db, we should have all segments to download orig_segments_df = segments_df.copy() segments_df, request_timebounds_need_update = \ prepare_for_download(db.session, orig_segments_df, dc_dataselect_manager, wtimespan, retry_seg_not_found=True, retry_url_err=True, retry_mseed_err=True, retry_client_err=True, retry_server_err=True, retry_timespan_err=True, retry_timespan_warn=True) # segments_df # COLUMNS: # channel_id datacenter_id network station location channel event_distance_deg event_id arrival_time start_time end_time id download_status_code run_id # DATA (not aligned with columns): # channel_id datacenter_id network station location channel event_distance_deg event_id arrival_time start_time end_time id download_status_code run_id # GE.FLT1..HHE 1 1 GE FLT1 HHE 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # GE.FLT1..HHN 2 1 GE FLT1 HHN 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # GE.FLT1..HHZ 3 1 GE FLT1 HHZ 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # IA.BAKI..BHE 7 2 IA BAKI BHE 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # IA.BAKI..BHN 8 2 IA BAKI BHN 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # IA.BAKI..BHZ 9 2 IA BAKI BHZ 0.0 1 2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12 None None 1 # n1.s..c1 4 1 n1 s c1 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n1.s..c2 5 1 n1 s c2 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n1.s..c3 6 1 n1 s c3 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n2.s..c1 10 2 n2 s c1 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n2.s..c2 11 2 n2 s c2 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # n2.s..c3 12 2 n2 s c3 0.0 2 2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31 None None 1 # self._segdata is the folder file of a "valid" 3-channel miniseed # The channels are: # Thus, no match will be found and all segments will be written with a None # download status code # 
setup urlread: first three rows: ok # rows[3:6]: 413, retry them # rows[6:9]: malformed_data # rows[9:12] 413, retry them # then retry: # rows[3]: empty_data # rows[4]: data_with_gaps (but seed_id should notmatch) # rows[5]: data_with_gaps (seed_id should notmatch) # rows[9]: URLError # rows[10]: Http 500 error # rows[11]: 413 # NOTE THAT THIS RELIES ON THE FACT THAT THREADS ARE EXECUTED IN THE ORDER OF THE DATAFRAME # WHICH SEEMS TO BE THE CASE AS THERE IS ONE SINGLE PROCESS # self._seg_data[:2] is a way to mock data corrupted urlread_sideeffect = [self._seg_data, 413, self._seg_data[:2], 413, '', self._seg_data_gaps, self._seg_data_gaps, URLError("++urlerror++"), 500, 413] # Let's go: ztatz = self.download_save_segments(urlread_sideeffect, db.session, segments_df, dc_dataselect_manager, chaid2mseedid, self.run.id, False, request_timebounds_need_update, 1, 2, 3, db_bufsize=self.db_buf_size) # get columns from db which we are interested on to check cols = [Segment.id, Segment.channel_id, Segment.datacenter_id, Segment.download_code, Segment.maxgap_numsamples, \ Segment.sample_rate, Segment.data_seed_id, Segment.data, Segment.download_id, Segment.request_start, Segment.request_end, Segment.start_time, Segment.end_time ] db_segments_df = dbquery2df(db.session.query(*cols)) assert Segment.download_id.key in db_segments_df.columns # change data column otherwise we cannot display db_segments_df. # When there is data just print "data" db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) & (db_segments_df[Segment.data.key].str.len() > 0), Segment.data.key] = b'data' # assert we have 4 segments with "data" properly set: assert len(db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) & (db_segments_df[Segment.data.key].str.len() > 0), Segment.data.key]) == 4 # re-sort db_segments_df to match the segments_df: ret = [] for cha in segments_df[Segment.channel_id.key]: ret.append(db_segments_df[db_segments_df[Segment.channel_id.key] == cha]) db_segments_df = pd.concat(ret, axis=0) # db_segments_df: # id channel_id datacenter_id download_status_code max_gap_ovlap_ratio sample_rate data_seed_id data run_id start_time end_time # 0 1 1 1 200.0 0.0001 100.0 GE.FLT1..HHE data 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 1 2 2 1 200.0 0.0001 100.0 GE.FLT1..HHN data 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 2 3 3 1 200.0 0.0001 100.0 GE.FLT1..HHZ data 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 6 7 7 2 200.0 NaN NaN None 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 7 8 8 2 NaN NaN NaN None None 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 8 9 9 2 200.0 20.0 20.0 IA.BAKI..BHZ data 1 2016-05-08 05:16:12 2016-05-08 05:19:12 # 3 4 4 1 -2.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 4 5 5 1 -2.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 5 6 6 1 -2.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 9 10 10 2 -1.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 10 11 11 2 500.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 # 11 12 12 2 413.0 NaN NaN None None 1 2016-05-08 01:44:31 2016-05-08 01:47:31 assert len(ztatz) == len(datacenters_df) assert len(db_segments_df) == len(segments_df) assert mock_updatedf.call_count == 0 dsc = db_segments_df[Segment.download_code.key] exp_dsc = np.array([200, 200, 200, 200, np.nan, 200, -2, -2, -2, -1, 500, 413]) assert ((dsc == exp_dsc) | (np.isnan(dsc) & np.isnan(exp_dsc))).all() # as we have 12 segments and a buf size of self.db_buf_size(=1, but it might change in the # 
# future), the expected number of `mock_insertdf` calls is not hard-coded but
# computed below from the buffer size, so the test keeps working if the buffer
# size changes in the future.
# Test that we correctly called mock_insertdf. Note that we assume that the
# latter is called ONLY inside DbManager. To test that, as the number of items
# to be added (the length of the dataframes) varies, we need to implement a
# counter here:
mock_insertdf_call_count = 0
_bufsize = 0
for c in mock_insertdf.call_args_list:
    c_args = c[0]
    df_ = c_args[0]
    _bufsize += len(df_)
    if _bufsize >= self.db_buf_size:
        mock_insertdf_call_count += 1
        _bufsize = 0
assert mock_insertdf.call_count == mock_insertdf_call_count
# assert data is consistent:
COL = Segment.data.key
assert (db_segments_df.iloc[:3][COL] == b'data').all()
assert (db_segments_df.iloc[3:4][COL] == b'').all()
assert pd.isnull(db_segments_df.iloc[4:5][COL]).all()
assert (db_segments_df.iloc[5:6][COL] == b'data').all()
assert pd.isnull(db_segments_df.iloc[6:][COL]).all()
# assert download status code is consistent:
URLERR_CODE, MSEEDERR_CODE = s2scodes.url_err, s2scodes.mseed_err
# this also asserts that we grouped requests by datacenter, start time and end time
COL = Segment.download_code.key
assert (db_segments_df.iloc[:4][COL] == 200).all()
assert pd.isnull(db_segments_df.iloc[4:5][COL]).all()
assert (db_segments_df.iloc[5:6][COL] == 200).all()
assert (db_segments_df.iloc[6:9][COL] == MSEEDERR_CODE).all()
assert (db_segments_df.iloc[9][COL] == URLERR_CODE).all()
assert (db_segments_df.iloc[10][COL] == 500).all()
assert (db_segments_df.iloc[11][COL] == 413).all()
# assert gaps are only in the given position:
COL = Segment.maxgap_numsamples.key
assert (db_segments_df.iloc[:3][COL] < 0.01).all()
assert pd.isnull(db_segments_df.iloc[3:5][COL]).all()
assert (db_segments_df.iloc[5][COL] == 20).all()
assert pd.isnull(db_segments_df.iloc[6:][COL]).all()

# now mock retry:
segments_df, request_timebounds_need_update = \
    prepare_for_download(db.session, orig_segments_df, dc_dataselect_manager,
                         wtimespan,
                         retry_seg_not_found=True,
                         retry_url_err=True,
                         retry_mseed_err=True,
                         retry_client_err=True,
                         retry_server_err=True,
                         retry_timespan_err=True,
                         retry_timespan_warn=True)
assert request_timebounds_need_update is False

COL = Segment.download_code.key
mask = (db_segments_df[COL] >= 400) | pd.isnull(db_segments_df[COL]) \
    | (db_segments_df[COL].isin([URLERR_CODE, MSEEDERR_CODE]))
assert len(segments_df) == len(db_segments_df[mask])

urlread_sideeffect = [413]
mock_updatedf.reset_mock()
mock_insertdf.reset_mock()
# define a dc_dataselect_manager for open data only:
dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False)

# Let's go:
ztatz = self.download_save_segments(urlread_sideeffect, db.session, segments_df,
                                    dc_dataselect_manager, chaid2mseedid,
                                    self.run.id, False,
                                    request_timebounds_need_update,
                                    1, 2, 3, db_bufsize=self.db_buf_size)
# get the db columns we are interested in checking:
cols = [Segment.download_code, Segment.channel_id]
db_segments_df = dbquery2df(db.session.query(*cols))
# change the data column so that db_segments_df can be displayed: when there is
# data, just print b'data' (not needed here, as the data column is not queried):
# db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) &
#                    (db_segments_df[Segment.data.key].str.len() > 0),
#                    Segment.data.key] = b'data'
# re-sort db_segments_df to match the segments_df:
ret = []
for cha in segments_df[Segment.channel_id.key]:
    ret.append(db_segments_df[db_segments_df[Segment.channel_id.key] == cha])
db_segments_df = pd.concat(ret, axis=0)
assert (db_segments_df[COL] == 413).all()
assert len(ztatz) == len(datacenters_df)
assert len(db_segments_df) == len(segments_df)
# Same as above, but with updatedf: test that we correctly called mock_updatedf.
# Note that we assume that the latter is called ONLY inside download.main.DbManager.
# To test that, as the number of items to be added (the length of the dataframes)
# varies, we need to implement a counter here:
mock_updatedf_call_count = 0
_bufsize = 0
for c in mock_updatedf.call_args_list:
    c_args = c[0]
    df_ = c_args[0]
    _bufsize += len(df_)
    if _bufsize >= self.db_buf_size:
        mock_updatedf_call_count += 1
        _bufsize = 0
assert mock_updatedf.call_count == mock_updatedf_call_count
assert mock_insertdf.call_count == 0
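# Illustrative sketch (hypothetical helper, not used by the tests): the call-count
# assertions in this test (for both insertdf and updatedf) mirror a simple
# buffered-flush model: a DbManager-like writer accumulates the dataframes it
# receives and flushes (i.e. calls insertdf or updatedf) every time the accumulated
# number of rows reaches the buffer size.
def expected_flush_count(dataframe_lengths, buf_size):
    """Return how many flushes a buffer of `buf_size` rows would perform, given the
    lengths of the dataframes passed to the writer (a trailing partial buffer is not
    counted, matching the counting loops used in the assertions above)."""
    flushes, buffered = 0, 0
    for length in dataframe_lengths:
        buffered += length
        if buffered >= buf_size:
            flushes += 1
            buffered = 0
    return flushes

# e.g. with 12 single-row dataframes and the default buf_size of 1 used in these
# tests: expected_flush_count([1] * 12, 1) == 12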
def test_retry2(self, mock_get_opener, mock_get_data_from_token, mock_get_data_from_userpass, mock_get_data_open, mock_updatedf, mock_insertdf, mock_mseed_unpack, mock_download_save_segments, mock_save_inventories, mock_get_channels_df, mock_get_datacenters_df, mock_get_events_df, # fixtures: db, clirunner, pytestdir, yamlfile): mock_get_events_df.side_effect = lambda *a, **v: self.get_events_df(None, *a, **v) mock_get_datacenters_df.side_effect = \ lambda *a, **v: self.get_datacenters_df(None, *a, **v) mock_get_channels_df.side_effect = lambda *a, **v: self.get_channels_df(None, *a, **v) mock_save_inventories.side_effect = lambda *a, **v: self.save_inventories(None, *a, **v) RESPONSES = [URLError('abc')] mock_download_save_segments.side_effect = \ lambda *a, **v: self.download_save_segments(RESPONSES, *a, **v) # mseed unpack is mocked by accepting only first arg (so that time bounds are not # considered) mock_mseed_unpack.side_effect = lambda *a, **v: unpack(a[0]) mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v) mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v) # prevlen = len(db.session.query(Segment).all()) # patching class methods while preserving the original call requires storing once # the original methods (as class attributes). Sets the side effect of the mocked method # to those class attributes as to preserve the original functionality # and be able to assert mock_* functions are called and so on # For info see: # https://stackoverflow.com/a/29563665 mock_get_data_open.side_effect = self.dc_get_data_open mock_get_data_from_userpass.side_effect = self.dc_get_data_from_userpass mock_get_data_from_token.side_effect = \ lambda *a, **kw: self.dc_get_data_from_token(['a:b', 'c:d'], *a, **kw) # TEST 1: provide a file with valid token: tokenfile = pytestdir.newfile(create=True) with open(tokenfile, 'w') as fh: fh.write('BEGIN PGP MESSAGE') # mock yaml_load to override restricted_data: # USERPASS good for both datacenter: mock_get_data_open.reset_mock() mock_get_data_from_token.reset_mock() mock_get_data_from_userpass.reset_mock() mock_get_opener.reset_mock() mock_get_data_from_token.side_effect = \ lambda *a, **kw: self.dc_get_data_from_token(['uzer:pazzword', 'uzer:pazzword'], *a, **kw) yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile), retry_client_err=False) result = clirunner.invoke(cli, ['download', '-c', yaml_file, '--dburl', db.dburl, '--start', '2016-05-08T00:00:00', '--end', '2016-05-08T9:00:00']) assert clirunner.ok(result) # get db data, sort by index and reset index to assure comparison across data frames: seg_df = dbquery2df(db.session.query(Segment.id, Segment.download_code, Segment.queryauth, Segment.download_id))\ .sort_values(by=[Segment.id.key]).reset_index(drop=True) # seg_df: # id download_code queryauth download_id # 1 -1 True 2 # 2 -1 True 2 # 3 -1 True 2 # 4 -1 True 2 # 5 -1 True 2 # 6 -1 True 2 # 7 -1 True 2 # 8 -1 True 2 # 9 -1 True 2 # 10 -1 True 2 # 11 -1 True 2 # 12 -1 True 2 urlerr, mseederr = s2scodes.url_err, s2scodes.mseed_err # according to our mock, we should have all urlerr codes: assert (seg_df[Segment.download_code.key] == urlerr).all() assert (seg_df[Segment.queryauth.key] == True).all() DOWNLOADID = 2 assert (seg_df[Segment.download_id.key] == DOWNLOADID).all() # other assertions: assert 'restricted_data: %s' % os.path.abspath(tokenfile) in result.output assert 'STEP 5 of 8: Acquiring credentials from token' in result.output # assert we print that we are downloading open and restricted data: assert 
re.search(r'STEP 7 of 8\: Downloading \d+ segments and saving to db', result.output) assert not mock_get_data_open.called assert mock_get_data_from_token.called assert not mock_get_data_from_userpass.called # no credentials failed: assert "Downloading open data only from: " not in result.output # Ok, test retry: new_seg_df = seg_df.copy() # first get run id # we have 12 segments, change the download codes. The second boolean # value denotes queryauth (True or False): code_queryauth = [(204, False), (204, True), (404, False), (404, True), (401, False), (401, True), (403, False), (403, True), (400, True), (400, False), (None, False), (None, True)] for id_, (dc_, qa_) in zip(seg_df[Segment.id.key].tolist(), code_queryauth): seg = db.session.query(Segment).filter(Segment.id == id_).first() seg.download_code = dc_ seg.queryauth = qa_ # set expected values (see also yamlfile below) # remember that any segment download will give urlerr as code expected_new_download_code = dc_ expected_download_id = DOWNLOADID if dc_ in (204, 404, 401, 403) and qa_ is False: # to retry becaue they failed due to authorization problems # (or most likely they did) expected_new_download_code = urlerr expected_download_id = DOWNLOADID + 1 elif dc_ is None or (dc_ < 400 and dc_ >= 500): # to retry because of the flags (see yamlfile below) expected_new_download_code = urlerr expected_download_id = DOWNLOADID + 1 expected_query_auth = qa_ if dc_ == 400 else True new_seg_df.loc[new_seg_df[Segment.id.key] == id_, :] = \ (id_, expected_new_download_code, expected_query_auth, expected_download_id) db.session.commit() # re-download and check what we have retried: yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile), retry_seg_not_found=True, retry_client_err=False) result = clirunner.invoke(cli, ['download', '-c', yaml_file, '--dburl', db.dburl, '--start', '2016-05-08T00:00:00', '--end', '2016-05-08T9:00:00']) DOWNLOADID += 1 assert clirunner.ok(result) # get db data, sort by index and reset index to assure comparison across data frames: seg_df2 = dbquery2df(db.session.query(Segment.id, Segment.download_code, Segment.queryauth, Segment.download_id))\ .sort_values(by=[Segment.id.key]).reset_index(drop=True) # seg_df2: # id download_code queryauth download_id # 1 -1 True 3 # 2 204 True 2 # 3 -1 True 3 # 4 404 True 2 # 5 -1 True 3 # 6 401 True 2 # 7 -1 True 3 # 8 403 True 2 # 9 400 True 2 # 10 400 False 2 # 11 -1 True 3 # 12 -1 True 3 pd.testing.assert_frame_equal(seg_df2, new_seg_df) # Another retry without modifyiung the segments but setting retry_client_err to True # re-download and check what we have retried: yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile), retry_seg_not_found=True, retry_client_err=True) result = clirunner.invoke(cli, ['download', '-c', yaml_file, '--dburl', db.dburl, '--start', '2016-05-08T00:00:00', '--end', '2016-05-08T9:00:00']) DOWNLOADID += 1 assert clirunner.ok(result) # get db data, sort by index and reset index to assure comparison across data frames: seg_df3 = dbquery2df(db.session.query(Segment.id, Segment.download_code, Segment.queryauth, Segment.download_id))\ .sort_values(by=[Segment.id.key]).reset_index(drop=True) expected_df = seg_df2.copy() # modify all 4xx codes as they are updated. 
# Note that old urlerr codes have the old download id (do not override):
old_4xx = expected_df[Segment.download_code.key].between(400, 499.999)
expected_df.loc[old_4xx, Segment.download_id.key] = DOWNLOADID
expected_df.loc[old_4xx, Segment.queryauth.key] = True
expected_df.loc[old_4xx, Segment.download_code.key] = urlerr
# seg_df3:
#    id  download_code  queryauth  download_id
#    1   -1             True       3
#    2   204            True       2
#    3   -1             True       3
#    4   -1             True       4
#    5   -1             True       3
#    6   -1             True       4
#    7   -1             True       3
#    8   -1             True       4
#    9   -1             True       4
#    10  -1             True       4
#    11  -1             True       3
#    12  -1             True       3
pd.testing.assert_frame_equal(seg_df3, expected_df)
old_urlerr_segids = seg_df2[seg_df2[Segment.download_code.key] == urlerr][Segment.id.key]
new_urlerr_df = expected_df[expected_df[Segment.id.key].isin(old_urlerr_segids)]
assert (new_urlerr_df[Segment.download_id.key] == 3).all()
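# Illustrative sketch (hypothetical, not used by the test): the expectation-building
# loop in test_retry2 encodes when a segment is expected to be re-downloaded on the
# second run (retry_seg_not_found=True, retry_client_err=False): either it previously
# got a "no data" / client error code (204, 404, 401, 403) without authentication,
# i.e. it most likely failed because the request was not authorized, or its download
# code is missing (None):
def expected_to_be_retried(download_code, queryauth):
    """Return True if a segment with the given previous code and queryauth flag is
    expected to be retried under the flags above."""
    if download_code in (204, 404, 401, 403) and not queryauth:
        return True
    return download_code is None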
def test_retry(self, mock_get_opener, mock_get_data_from_token, mock_get_data_from_userpass, mock_get_data_open, mock_updatedf, mock_insertdf, mock_mseed_unpack, mock_download_save_segments, mock_save_inventories, mock_get_channels_df, mock_get_datacenters_df, mock_get_events_df, # fixtures: db, clirunner, pytestdir, yamlfile): mock_get_events_df.side_effect = lambda *a, **v: self.get_events_df(None, *a, **v) mock_get_datacenters_df.side_effect = \ lambda *a, **v: self.get_datacenters_df(None, *a, **v) mock_get_channels_df.side_effect = lambda *a, **v: self.get_channels_df(None, *a, **v) mock_save_inventories.side_effect = lambda *a, **v: self.save_inventories(None, *a, **v) mock_download_save_segments.side_effect = \ lambda *a, **v: self.download_save_segments([URLError('abc')], *a, **v) # mseed unpack is mocked by accepting only first arg (so that time bounds are # not considered) mock_mseed_unpack.side_effect = lambda *a, **v: unpack(a[0]) mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v) mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v) # prevlen = len(db.session.query(Segment).all()) # mock our opener m = Mock() mockopen = Mock() mockopen.read = lambda *a, **v: b'' mockopen.msg = 'abc' mockopen.code = 204 m.open = lambda *a, **v: mockopen # m.read = lambda *a, **v: '' mock_get_opener.side_effect = lambda *a, **v: m # patching class methods while preserving the original call requires storing once # the original methods (as class attributes). Sets the side effect of the mocked method # to those class attributes as to preserve the original functionality # and be able to assert mock_* functions are called and so on # For info see: # https://stackoverflow.com/a/29563665 mock_get_data_open.side_effect = self.dc_get_data_open mock_get_data_from_userpass.side_effect = self.dc_get_data_from_userpass mock_get_data_from_token.side_effect = \ lambda *a, **kw: self.dc_get_data_from_token([URLError('a'), 'abc'], *a, **kw) # TEST 1: provide a file with valid token: tokenfile = pytestdir.newfile(create=True) with open(tokenfile, 'w') as fh: fh.write('BEGIN PGP MESSAGE') # mock yaml_load to override restricted_data: # launch two download runs with different responses for token auth query: for tokenquery_mocked_return_values, dc_token_failed in \ ([[URLError('a'), 'uzer:pazzword'], "http://geofon.gfz-potsdam.de"], [['uzer:pazzword', URLError('a')], 'http://ws.resif.fr']): # set how many times self.mock_urlopen has been called: mock_urlopen_call_count = self.mock_urlopen.call_count # TEST 2: USERPASS good for just one datacenter: mock_get_data_open.reset_mock() mock_get_data_from_token.reset_mock() mock_get_data_from_userpass.reset_mock() mock_get_opener.reset_mock() mock_get_data_from_token.side_effect = \ lambda *a, **kw: self.dc_get_data_from_token(tokenquery_mocked_return_values, *a, **kw) yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile), retry_client_err=False) result = clirunner.invoke(cli, ['download', '-c', yaml_file, '--dburl', db.dburl, '--start', '2016-05-08T00:00:00', '--end', '2016-05-08T9:00:00']) assert clirunner.ok(result) assert 'restricted_data: %s' % os.path.abspath(tokenfile) in result.output assert 'STEP 5 of 8: Acquiring credentials from token' in result.output # assert we print that we are downloading open and restricted data: assert re.search(r'STEP 7 of 8\: Downloading \d+ segments and saving to db', result.output) assert not mock_get_data_open.called assert mock_get_data_from_token.called assert not mock_get_data_from_userpass.called 
assert "Downloading open data only from: %s" % dc_token_failed dc_token_ok = 'http://ws.resif.fr' \ if dc_token_failed == "http://geofon.gfz-potsdam.de" else \ "http://geofon.gfz-potsdam.de" assert mock_get_opener.call_count == 1 assert mock_get_opener.call_args_list[0][0][:] == (dc_token_ok, 'uzer', 'pazzword') dc_id = {Fdsnws(i[1]).site: i[0] for i in db.session.query(DataCenter.id, DataCenter.dataselect_url)} # assert urlopen has been called only once with query and not queryauth: # get the segments dataframe we (re)downloaded: segments_df_to_download = mock_download_save_segments.call_args_list[-1][0][1] dc2download = pd.unique(segments_df_to_download['datacenter_id']).tolist() # set the expected call count based on the datacenters of (re)downloaded segments: if dc_id[dc_token_failed] not in dc2download: assert self.mock_urlopen.call_count == 0 else: assert self.mock_urlopen.call_count >= 1 for i in range(self.mock_urlopen.call_count): i+=1 assert self.mock_urlopen.call_args_list[-i][0][0].get_full_url() == \ dc_token_failed + "/fdsnws/dataselect/1/query"
def test_restricted(self, mock_get_opener, mock_get_data_from_token, mock_get_data_from_userpass, mock_get_data_open, mock_updatedf, mock_insertdf, mock_mseed_unpack, mock_download_save_segments, mock_save_inventories, mock_get_channels_df, mock_get_datacenters_df, mock_get_events_df, # fixtures: db, clirunner, pytestdir, yamlfile): mock_get_events_df.side_effect = lambda *a, **v: self.get_events_df(None, *a, **v) mock_get_datacenters_df.side_effect = \ lambda *a, **v: self.get_datacenters_df(None, *a, **v) mock_get_channels_df.side_effect = lambda *a, **v: self.get_channels_df(None, *a, **v) mock_save_inventories.side_effect = lambda *a, **v: self.save_inventories(None, *a, **v) mock_download_save_segments.side_effect = \ lambda *a, **v: self.download_save_segments(None, *a, **v) # mseed unpack is mocked by accepting only first arg # (so that time bounds are not considered) mock_mseed_unpack.side_effect = lambda *a, **v: unpack(a[0]) mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v) mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v) # prevlen = len(db.session.query(Segment).all()) # patching class methods while preserving the original call requires storing once # the original methods (as class attributes). Sets the side effect of the mocked method # to those class attributes as to preserve the original functionality # and be able to assert mock_* functions are called and so on # For info see: # https://stackoverflow.com/a/29563665 mock_get_data_open.side_effect = self.dc_get_data_open mock_get_data_from_userpass.side_effect = self.dc_get_data_from_userpass mock_get_data_from_token.side_effect = \ lambda *a, **kw: self.dc_get_data_from_token([URLError('a'), 'abc'], *a, **kw) # TEST 1: provide a file with valid token: tokenfile = pytestdir.newfile(create=True) with open(tokenfile, 'w') as fh: fh.write('BEGIN PGP MESSAGE') # mock yaml_load to override restricted_data: yaml_file = yamlfile(restricted_data=os.path.abspath(tokenfile)) # The run table is populated with a run_id in the constructor of this class # for checking run_ids, store here the number of runs we have in the table: runs = len(db.session.query(Download.id).all()) result = clirunner.invoke(cli, ['download', '-c', yaml_file, '--dburl', db.dburl, '--start', '2016-05-08T00:00:00', '--end', '2016-05-08T9:00:00']) assert clirunner.ok(result) assert 'Downloading 12 segments (open data only)' in result.output assert 'STEP 5 of 8: Acquiring credentials from token' in result.output # note that due to (probably) dict order in py2-3 we need to test both of these: if not ('Downloading open data only from: http://geofon.gfz-potsdam.de, ' 'http://ws.resif.fr (Unable to acquire credentials for restricted data)') in \ result.output: assert ('Downloading open data only from: http://ws.resif.fr, ' 'http://geofon.gfz-potsdam.de (Unable to acquire credentials for restricted data)') in \ result.output # assert we print that we are downloading open data only (all errors): assert 'STEP 7 of 8: Downloading 12 segments (open data only)' in result.output assert not mock_get_data_open.called assert mock_get_data_from_token.called assert not mock_get_data_from_userpass.called assert not mock_get_opener.called # some assertions to check data properly written # These are important because they confirm that data has been downloaded anyway # (the test does not differentiate between restricted or open data) assert len(db.session.query(Download.id).all()) == runs + 1 runs += 1 segments = db.session.query(Segment).all() assert 
len(segments) == 12
segments = db.session.query(Segment).filter(Segment.has_data).all()
assert len(segments) == 4
assert len(db.session.query(Station).filter(Station.has_inventory).all()) == 2
assert mock_updatedf.called  # called while saving inventories
assert mock_insertdf.called
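# Illustrative sketch (hypothetical, not part of the code under test): the tests in
# this module create a "valid-looking" token simply by writing a PGP header to a
# file (see tokenfile above), whereas an empty file is rejected as an invalid token.
# A minimal check of that kind could look like:
def looks_like_pgp_token(path):
    """Return True if the file at `path` starts with a PGP message header."""
    with open(path) as fh:
        return fh.read().lstrip().startswith('BEGIN PGP MESSAGE')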
def test_opendata_and_errors(self, mock_get_data_from_token, mock_get_data_from_userpass, mock_get_data_open, mock_updatedf, mock_insertdf, mock_mseed_unpack, mock_download_save_segments, mock_save_inventories, mock_get_channels_df, mock_get_datacenters_df, mock_get_events_df, # fixtures: db, clirunner, pytestdir, yamlfile): mock_get_events_df.side_effect = lambda *a, **v: self.get_events_df(None, *a, **v) mock_get_datacenters_df.side_effect = \ lambda *a, **v: self.get_datacenters_df(None, *a, **v) mock_get_channels_df.side_effect = lambda *a, **v: self.get_channels_df(None, *a, **v) mock_save_inventories.side_effect = lambda *a, **v: self.save_inventories(None, *a, **v) mock_download_save_segments.side_effect = \ lambda *a, **v: self.download_save_segments(None, *a, **v) # mseed unpack is mocked by accepting only first arg # (so that time bounds are not considered) mock_mseed_unpack.side_effect = lambda *a, **v: unpack(a[0]) mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v) mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v) # prevlen = len(db.session.query(Segment).all()) # patching class methods while preserving the original call requires storing once # the original methods (as class attributes). Sets the side effect of the mocked method # to those class attributes as to preserve the original functionality # and be able to assert mock_* functions are called and so on # For info see: # https://stackoverflow.com/a/29563665 mock_get_data_open.side_effect = self.dc_get_data_open mock_get_data_from_userpass.side_effect = self.dc_get_data_from_userpass mock_get_data_from_token.side_effect = self.dc_get_data_from_token # TEST 1: NORMAL CASE (NO AUTH): # mock yaml_load to override restricted_data: yaml_file = yamlfile(restricted_data='') # The run table is populated with a run_id in the constructor of this class # for checking run_ids, store here the number of runs we have in the table: runs = len(db.session.query(Download.id).all()) result = clirunner.invoke(cli, ['download', '-c', yaml_file, '--dburl', db.dburl, '--start', '2016-05-08T00:00:00', '--end', '2016-05-08T9:00:00']) assert clirunner.ok(result) assert 'Downloading 12 segments (open data only)' in result.output assert mock_get_data_open.called assert not mock_get_data_from_token.called assert not mock_get_data_from_userpass.called # some assertions to check data properly written assert len(db.session.query(Download.id).all()) == runs + 1 runs += 1 segments = db.session.query(Segment).all() assert len(segments) == 12 segments = db.session.query(Segment).filter(Segment.has_data).all() assert len(segments) == 4 assert len(db.session.query(Station).filter(Station.has_inventory).all()) == 2 assert mock_updatedf.called # called while saving inventories assert mock_insertdf.called # TEST 1: USERPASS AND EIDA (PROBLEM): # test that we provide userpass and eida: error: # mock yaml_load to override restricted_data: mock_get_data_open.reset_mock() mock_get_data_from_token.reset_mock() mock_get_data_from_userpass.reset_mock() yaml_file = yamlfile(restricted_data=['user', 'password'], dataws='eida') result = clirunner.invoke(cli, ['download', '-c', yaml_file, '--dburl', db.dburl, '--start', '2016-05-08T00:00:00', '--end', '2016-05-08T9:00:00']) assert not clirunner.ok(result) assert ('Error: Invalid value for "restricted_data": ' 'downloading from EIDA requires a token') in result.output # TEST 2: TOKEN FILE NOT EXISTING mock_get_data_open.reset_mock() mock_get_data_from_token.reset_mock() 
mock_get_data_from_userpass.reset_mock()
yaml_file = yamlfile(restricted_data='abcdg465du97_Sdr4fvssgflero', dataws='eida')
result = clirunner.invoke(cli, ['download', '-c', yaml_file, '--dburl', db.dburl,
                                '--start', '2016-05-08T00:00:00',
                                '--end', '2016-05-08T9:00:00'])
assert not clirunner.ok(result)
assert 'Invalid token. If you passed a file path' in result.output

# TEST 3: TOKEN FILE EXISTS, BUT IS INVALID (e.g. empty)
filepath = pytestdir.newfile(create=True)
mock_get_data_open.reset_mock()
mock_get_data_from_token.reset_mock()
mock_get_data_from_userpass.reset_mock()
yaml_file = yamlfile(restricted_data=os.path.abspath(filepath), dataws='eida')
result = clirunner.invoke(cli, ['download', '-c', yaml_file, '--dburl', db.dburl,
                                '--start', '2016-05-08T00:00:00',
                                '--end', '2016-05-08T9:00:00'])
assert not clirunner.ok(result)
assert 'Invalid token. If you passed a file path' in result.output
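# Illustrative sketch (hypothetical refactoring, not applied above): every test in
# this section invokes the `download` command with the same time window and db URL.
# The repetition could be factored into a small helper such as:
def run_download(clirunner, yaml_file, dburl):
    """Invoke the download command with the time window used throughout this module."""
    return clirunner.invoke(cli, ['download', '-c', yaml_file, '--dburl', dburl,
                                  '--start', '2016-05-08T00:00:00',
                                  '--end', '2016-05-08T9:00:00'])

# e.g.: result = run_download(clirunner, yaml_file, db.dburl)
#       assert clirunner.ok(result)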