    def test_prepare_for_download(self, db, tt_ak135_tts):
        # prepare:
        urlread_sideeffect = None  # use defaults from class
        events_df = self.get_events_df(urlread_sideeffect, db.session)
        net, sta, loc, cha = [], [], [], []
        datacenters_df, eidavalidator = \
            self.get_datacenters_df(urlread_sideeffect, db.session, None, self.routing_service,
                                    net, sta, loc, cha, db_bufsize=self.db_buf_size)
        channels_df = self.get_channels_df(urlread_sideeffect, db.session,
                                           datacenters_df, eidavalidator, net,
                                           sta, loc, cha, None, None, 100,
                                           False, None, None, -1,
                                           self.db_buf_size)
        assert len(
            channels_df
        ) == 12  # just to be sure. If failing, we might have changed the class default
        # events_df
        #    id  magnitude  latitude  longitude  depth_km                    time
        # 0  1   3.0        1.0       1.0        60.0     2016-05-08 05:17:11.500
        # 1  2   4.0        90.0      90.0       2.0      2016-05-08 01:45:30.300

        # channels_df:
        #    id  station_id  latitude  longitude  datacenter_id start_time end_time network station location channel
        # 0  1   1           1.0       1.0        1             2003-01-01 NaT       GE      FLT1             HHE
        # 1  2   1           1.0       1.0        1             2003-01-01 NaT       GE      FLT1             HHN
        # 2  3   1           1.0       1.0        1             2003-01-01 NaT       GE      FLT1             HHZ
        # 3  4   2           90.0      90.0       1             2009-01-01 NaT       n1      s                c1
        # 4  5   2           90.0      90.0       1             2009-01-01 NaT       n1      s                c2
        # 5  6   2           90.0      90.0       1             2009-01-01 NaT       n1      s                c3
        # 6   7   3           1.0       1.0        2             2003-01-01 NaT       IA      BAKI             BHE
        # 7   8   3           1.0       1.0        2             2003-01-01 NaT       IA      BAKI             BHN
        # 8   9   3           1.0       1.0        2             2003-01-01 NaT       IA      BAKI             BHZ
        # 9   10  4           90.0      90.0       2             2009-01-01 NaT       n2      s                c1
        # 10  11  4           90.0      90.0       2             2009-01-01 NaT       n2      s                c2
        # 11  12  4           90.0      90.0       2             2009-01-01 NaT       n2      s                c3

        # take all segments:
        segments_df = merge_events_stations(events_df,
                                            channels_df,
                                            dict(minmag=10,
                                                 maxmag=10,
                                                 minmag_radius=100,
                                                 maxmag_radius=200),
                                            tttable=tt_ak135_tts)

        # segments_df:
        #    channel_id  station_id  datacenter_id network station location channel  event_distance_deg  event_id  depth_km                    time               arrival_time
        # 0  1           1           1              GE      FLT1             HHE     500.555             1         60.0     2016-05-08 05:17:11.500 2017-05-10 12:39:13.463745
        # 1  2           1           1              GE      FLT1             HHN     500.555             1         60.0     2016-05-08 05:17:11.500 2017-05-10 12:39:13.463745
        # 2  3           1           1              GE      FLT1             HHZ     500.555             1         60.0     2016-05-08 05:17:11.500 2017-05-10 12:39:13.463745
        # 3  4           2           1              n1      s                c1      89.000              1         60.0     2016-05-08 05:17:11.500 NaT
        # 4  5           2           1              n1      s                c2      89.000              1         60.0     2016-05-08 05:17:11.500 NaT
        # 5  6           2           1              n1      s                c3      89.0                1         60.0     2016-05-08 05:17:11.500 NaT
        # 6  7           3           2              IA      BAKI             BHE     0.0                 1         60.0     2016-05-08 05:17:11.500 NaT
        # 7  8           3           2              IA      BAKI             BHN     0.0                 1         60.0     2016-05-08 05:17:11.500 NaT
        # 8  9           3           2              IA      BAKI             BHZ     0.0                 1         60.0     2016-05-08 05:17:11.500 NaT
        # 9  10          4           2              n2      s                c1      89.0                1         60.0     2016-05-08 05:17:11.500 NaT
        # 10  11          4           2              n2      s                c2      89.0                1         60.0     2016-05-08 05:17:11.500 NaT
        # 11  12          4           2              n2      s                c3      89.0                1         60.0     2016-05-08 05:17:11.500 NaT
        # 12  1           1           1              GE      FLT1             HHE     89.0                2         2.0      2016-05-08 01:45:30.300 NaT
        # 13  2           1           1              GE      FLT1             HHN     89.0                2         2.0      2016-05-08 01:45:30.300 NaT
        # 14  3           1           1              GE      FLT1             HHZ     89.0                2         2.0      2016-05-08 01:45:30.300 NaT
        # 15  4           2           1              n1      s                c1      0.0                 2         2.0      2016-05-08 01:45:30.300 NaT
        # 16  5           2           1              n1      s                c2      0.0                 2         2.0      2016-05-08 01:45:30.300 NaT
        # 17  6           2           1              n1      s                c3      0.0                 2         2.0      2016-05-08 01:45:30.300 NaT
        # 18  7           3           2              IA      BAKI             BHE     89.0                2         2.0      2016-05-08 01:45:30.300 NaT
        # 19  8           3           2              IA      BAKI             BHN     89.0                2         2.0      2016-05-08 01:45:30.300 NaT
        # 20  9           3           2              IA      BAKI             BHZ     89.0                2         2.0      2016-05-08 01:45:30.300 NaT
        # 21  10          4           2              n2      s                c1      0.0                 2         2.0      2016-05-08 01:45:30.300 NaT
        # 22  11          4           2              n2      s                c2      0.0                 2         2.0      2016-05-08 01:45:30.300 NaT
        # 23  12          4           2              n2      s                c3      0.0                 2         2.0      2016-05-08 01:45:30.300 NaT

        # make a copy of evts_stations_df because we will modify the data frame in place
        #         segments_df =  self.get_arrivaltimes(urlread_sideeffect, evts_stations_df.copy(),
        #                                                    [1,2], ['P', 'Q'],
        #                                                         'ak135', mp_max_workers=1)

        expected = len(
            segments_df
        )  # no segment on db, we should have all segments to download
        wtimespan = [1, 2]
        assert Segment.id.key not in segments_df.columns
        assert Segment.download_id.key not in segments_df.columns
        orig_seg_df = segments_df.copy()
        # define a dc_dataselect_manager for open data only:
        dc_dataselect_manager = DcDataselectManager(datacenters_df,
                                                    Authorizer(None), False)
        segments_df, request_timebounds_need_update = \
            prepare_for_download(db.session, orig_seg_df, dc_dataselect_manager, wtimespan,
                                 retry_seg_not_found=True,
                                 retry_url_err=True,
                                 retry_mseed_err=True,
                                 retry_client_err=True,
                                 retry_server_err=True,
                                 retry_timespan_err=True,
                                 retry_timespan_warn=True)
        assert request_timebounds_need_update is False
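        # (False: there are no segments on the db yet, hence no previously stored request
        # time window needs to be updated)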

        # segments_df (not exactly the actual dataframe: some columns are removed, but the relevant data is correct):
        #    channel_id  datacenter_id network station location channel  event_distance_deg  event_id            arrival_time          start_time            end_time
        # 0  1           1              GE      FLT1             HHE     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12
        # 1  2           1              GE      FLT1             HHN     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12
        # 2  3           1              GE      FLT1             HHZ     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12
        # 3  4           1              n1      s                c1      89.0                1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12
        # 4  5           1              n1      s                c2      89.0                1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12
        # 5  6           1              n1      s                c3      89.0                1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12
        # 6  7           2              IA      BAKI             BHE     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12
        # 7  8           2              IA      BAKI             BHN     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12
        # 8  9           2              IA      BAKI             BHZ     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12
        # 9  10          2              n2      s                c1      89.0                1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12
        # 10  11          2              n2      s                c2      89.0                1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12
        # 11  12          2              n2      s                c3      89.0                1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12
        # 12  1           1              GE      FLT1             HHE     89.0                2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31
        # 13  2           1              GE      FLT1             HHN     89.0                2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31
        # 14  3           1              GE      FLT1             HHZ     89.0                2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31
        # 15  4           1              n1      s                c1      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31
        # 16  5           1              n1      s                c2      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31
        # 17  6           1              n1      s                c3      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31
        # 18  7           2              IA      BAKI             BHE     89.0                2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31
        # 19  8           2              IA      BAKI             BHN     89.0                2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31
        # 20  9           2              IA      BAKI             BHZ     89.0                2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31
        # 21  10          2              n2      s                c1      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31
        # 22  11          2              n2      s                c2      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31
        # 23  12          2              n2      s                c3      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31

        assert Segment.id.key in segments_df.columns
        assert Segment.download_id.key not in segments_df.columns
        assert len(segments_df) == expected
        # assert len(db.session.query(Segment.id).all()) == len(segments_df)

        assert all(x[0] is None
                   for x in db.session.query(Segment.download_code).all())
        assert all(x[0] is None for x in db.session.query(Segment.data).all())

        # mock already downloaded segments:
        # set each of the first 7 to a particular download status code
        urlerr, mseederr, outtime_err, outtime_warn = \
            s2scodes.url_err, s2scodes.mseed_err, s2scodes.timespan_err, s2scodes.timespan_warn
        downloadstatuscodes = [
            None, urlerr, mseederr, 413, 505, outtime_err, outtime_warn
        ]
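        # each code above corresponds, in order, to one of the retry_* flags passed to
        # prepare_for_download below: seg_not_found (None), url_err, mseed_err,
        # client_err (413), server_err (505), timespan_err, timespan_warn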
        for i, download_code in enumerate(downloadstatuscodes):
            dic = segments_df.iloc[i].to_dict()
            dic['download_code'] = download_code
            dic['download_id'] = self.run.id
            # hack for deleting unused columns:
            for col in [
                    Station.network.key, Station.station.key,
                    Channel.location.key, Channel.channel.key
            ]:
                if col in dic:
                    del dic[col]
            # convert numpy values to python scalars:
            # pandas 20+ seems to keep numpy types in to_dict
            # https://github.com/pandas-dev/pandas/issues/13258
            # (this was not the case in pandas 0.19.2)
            # SQLAlchemy does not like that.
            # (Curiously, our pd2sql methods still work fine; we should check why.)
            # So, quick and dirty:
            for k in dic.keys():
                if hasattr(dic[k], "item"):
                    dic[k] = dic[k].item()
            # postgres complains about nan primary keys
            if math.isnan(dic.get(Segment.id.key, 0)):
                del dic[Segment.id.key]

            # now we can safely add it:
            db.session.add(Segment(**dic))

        db.session.commit()

        assert len(db.session.query(
            Segment.id).all()) == len(downloadstatuscodes)

        # Now the db holds one segment per download status code above (7 in total,
        # including None), while the remaining segments are not on the db. Assure all
        # retry combinations work:
        # the number of segments to download regardless of the retry flags is the number
        # of segments not saved on the db, i.e.:
        to_download_anyway = len(segments_df) - len(downloadstatuscodes)
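        # (with the mock data above: 24 - 7 = 17)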
        for c in product([True, False], [True, False], [True, False],
                         [True, False], [True, False], [True, False],
                         [True, False]):
            s_df, request_timebounds_need_update = \
                prepare_for_download(db.session, orig_seg_df, dc_dataselect_manager, wtimespan,
                                     retry_seg_not_found=c[0],
                                     retry_url_err=c[1],
                                     retry_mseed_err=c[2],
                                     retry_client_err=c[3],
                                     retry_server_err=c[4],
                                     retry_timespan_err=c[5],
                                     retry_timespan_warn=c[6])
            to_download_in_this_case = sum(c)  # count the Trues (booleans sum as ints in Python)
            assert len(s_df) == to_download_anyway + to_download_in_this_case
            assert request_timebounds_need_update is False

        # now change the window time span and see that everything is to be downloaded again:
        # do it for any retry combinations, as it should ALWAYS return "everything has to be
        # re-downloaded"
        wtimespan[1] += 5
        for c in product([True, False], [True, False], [True, False],
                         [True, False], [True, False], [True, False],
                         [True, False]):
            s_df, request_timebounds_need_update = \
                prepare_for_download(db.session, orig_seg_df, dc_dataselect_manager, wtimespan,
                                     retry_seg_not_found=c[0],
                                     retry_url_err=c[1],
                                     retry_mseed_err=c[2],
                                     retry_client_err=c[3],
                                     retry_server_err=c[4],
                                     retry_timespan_err=c[5],
                                     retry_timespan_warn=c[6])
            assert len(s_df) == len(orig_seg_df)
            assert request_timebounds_need_update is True  # because we changed wtimespan

        # now test that we raise a NothingToDownload
        # first, write all remaining segments to db, with 204 code so they will not be
        # re-downloaded
        for i in range(len(segments_df)):
            download_code = 204
            dic = segments_df.iloc[i].to_dict()
            dic['download_code'] = download_code
            dic['download_id'] = self.run.id
            # hack for deleting unused columns:
            for col in [
                    Station.network.key, Station.station.key,
                    Channel.location.key, Channel.channel.key
            ]:
                if col in dic:
                    del dic[col]
            # convert numpy values to python scalars:
            # pandas 20+ seems to keep numpy types in to_dict
            # https://github.com/pandas-dev/pandas/issues/13258
            # (this was not the case in pandas 0.19.2)
            # SQLAlchemy does not like that.
            # (Curiously, our pd2sql methods still work fine; we should check why.)
            # So, quick and dirty:
            for k in dic.keys():
                if hasattr(dic[k], "item"):
                    dic[k] = dic[k].item()
            # postgres complains about nan primary keys
            if math.isnan(dic.get(Segment.id.key, 0)):
                del dic[Segment.id.key]

            # now we can safely add it:
            # brutal approach: add and commit; on error, rollback.
            # An error means we already wrote the segment to the db and a unique
            # constraint violation was raised
            try:
                db.session.add(Segment(**dic))
                db.session.commit()
            except SQLAlchemyError as _err:
                db.session.rollback()
        # test that we have the correct number of segments saved:
        assert db.session.query(Segment.id).count() == len(orig_seg_df)
        # try to test a NothingToDownload:
        # reset the old wtimespan otherwise everything will be flagged to be redownloaded:
        wtimespan[1] -= 5
        with pytest.raises(NothingToDownload):
            segments_df, request_timebounds_need_update = \
                prepare_for_download(db.session, orig_seg_df, dc_dataselect_manager, wtimespan,
                                     retry_seg_not_found=False,
                                     retry_url_err=False,
                                     retry_mseed_err=False,
                                     retry_client_err=False,
                                     retry_server_err=False,
                                     retry_timespan_err=False,
                                     retry_timespan_warn=False)

    def test_prepare_for_download_sametimespans(self, db, tt_ak135_tts):
        # prepare: the event web service returns two events very close to each other
        urlread_sideeffect = """#EventID | Time | Latitude | Longitude | Depth/km | Author | Catalog | Contributor | ContributorID | MagType | Magnitude | MagAuthor | EventLocationName
20160508_0000129|2016-05-08 05:17:11.500000|1|1|2.01|AZER|EMSC-RTS|AZER|505483|ml|3|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 05:17:12.300000|1.001|1.001|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|4|EMSC|CROATIA
"""
        events_df = self.get_events_df(urlread_sideeffect, db.session)
        net, sta, loc, cha = [], [], [], []
        urlread_sideeffect = None
        datacenters_df, eidavalidator = \
            self.get_datacenters_df(urlread_sideeffect, db.session, None, self.routing_service,
                                    net, sta, loc, cha, db_bufsize=self.db_buf_size)
        channels_df = self.get_channels_df(urlread_sideeffect, db.session,
                                           datacenters_df, eidavalidator, net,
                                           sta, loc, cha, None, None, 100,
                                           False, None, None, -1,
                                           self.db_buf_size)
        assert len(
            channels_df
        ) == 12  # just to be sure. If failing, we might have changed the class default
        # events_df
        #    id  magnitude  latitude  longitude  depth_km                    time
        # 0  1   3.0        1.0       1.0        60.0     2016-05-08 05:17:11.500
        # 1  2   4.0        90.0      90.0       2.0      2016-05-08 01:45:30.300

        # channels_df:
        #    id  station_id  latitude  longitude  datacenter_id start_time end_time network station location channel
        # 0  1   1           1.0       1.0        1             2003-01-01 NaT       GE      FLT1             HHE
        # 1  2   1           1.0       1.0        1             2003-01-01 NaT       GE      FLT1             HHN
        # 2  3   1           1.0       1.0        1             2003-01-01 NaT       GE      FLT1             HHZ
        # 3  4   2           90.0      90.0       1             2009-01-01 NaT       n1      s                c1
        # 4  5   2           90.0      90.0       1             2009-01-01 NaT       n1      s                c2
        # 5  6   2           90.0      90.0       1             2009-01-01 NaT       n1      s                c3
        # 6   7   3           1.0       1.0        2             2003-01-01 NaT       IA      BAKI             BHE
        # 7   8   3           1.0       1.0        2             2003-01-01 NaT       IA      BAKI             BHN
        # 8   9   3           1.0       1.0        2             2003-01-01 NaT       IA      BAKI             BHZ
        # 9   10  4           90.0      90.0       2             2009-01-01 NaT       n2      s                c1
        # 10  11  4           90.0      90.0       2             2009-01-01 NaT       n2      s                c2
        # 11  12  4           90.0      90.0       2             2009-01-01 NaT       n2      s                c3

        # take all segments:
        segments_df = merge_events_stations(events_df,
                                            channels_df,
                                            dict(minmag=10,
                                                 maxmag=10,
                                                 minmag_radius=100,
                                                 maxmag_radius=200),
                                            tttable=tt_ak135_tts)

        # segments_df:
        #    channel_id  station_id  datacenter_id network station location channel  event_distance_deg  event_id  depth_km                    time               arrival_time
        # 0  1           1           1              GE      FLT1             HHE     500.555             1         60.0     2016-05-08 05:17:11.500 2017-05-10 12:39:13.463745
        # 1  2           1           1              GE      FLT1             HHN     500.555             1         60.0     2016-05-08 05:17:11.500 2017-05-10 12:39:13.463745
        # 2  3           1           1              GE      FLT1             HHZ     500.555             1         60.0     2016-05-08 05:17:11.500 2017-05-10 12:39:13.463745
        # 3  4           2           1              n1      s                c1      89.000              1         60.0     2016-05-08 05:17:11.500 NaT
        # 4  5           2           1              n1      s                c2      89.000              1         60.0     2016-05-08 05:17:11.500 NaT
        # 5  6           2           1              n1      s                c3      89.0                1         60.0     2016-05-08 05:17:11.500 NaT
        # 6  7           3           2              IA      BAKI             BHE     0.0                 1         60.0     2016-05-08 05:17:11.500 NaT
        # 7  8           3           2              IA      BAKI             BHN     0.0                 1         60.0     2016-05-08 05:17:11.500 NaT
        # 8  9           3           2              IA      BAKI             BHZ     0.0                 1         60.0     2016-05-08 05:17:11.500 NaT
        # 9  10          4           2              n2      s                c1      89.0                1         60.0     2016-05-08 05:17:11.500 NaT
        # 10  11          4           2              n2      s                c2      89.0                1         60.0     2016-05-08 05:17:11.500 NaT
        # 11  12          4           2              n2      s                c3      89.0                1         60.0     2016-05-08 05:17:11.500 NaT
        # 12  1           1           1              GE      FLT1             HHE     89.0                2         2.0      2016-05-08 01:45:30.300 NaT
        # 13  2           1           1              GE      FLT1             HHN     89.0                2         2.0      2016-05-08 01:45:30.300 NaT
        # 14  3           1           1              GE      FLT1             HHZ     89.0                2         2.0      2016-05-08 01:45:30.300 NaT
        # 15  4           2           1              n1      s                c1      0.0                 2         2.0      2016-05-08 01:45:30.300 NaT
        # 16  5           2           1              n1      s                c2      0.0                 2         2.0      2016-05-08 01:45:30.300 NaT
        # 17  6           2           1              n1      s                c3      0.0                 2         2.0      2016-05-08 01:45:30.300 NaT
        # 18  7           3           2              IA      BAKI             BHE     89.0                2         2.0      2016-05-08 01:45:30.300 NaT
        # 19  8           3           2              IA      BAKI             BHN     89.0                2         2.0      2016-05-08 01:45:30.300 NaT
        # 20  9           3           2              IA      BAKI             BHZ     89.0                2         2.0      2016-05-08 01:45:30.300 NaT
        # 21  10          4           2              n2      s                c1      0.0                 2         2.0      2016-05-08 01:45:30.300 NaT
        # 22  11          4           2              n2      s                c2      0.0                 2         2.0      2016-05-08 01:45:30.300 NaT
        # 23  12          4           2              n2      s                c3      0.0                 2         2.0      2016-05-08 01:45:30.300 NaT

        # define a dc_dataselect_manager for open data only:
        dc_dataselect_manager = DcDataselectManager(datacenters_df,
                                                    Authorizer(None), False)

        expected = len(
            segments_df
        )  # no segment on db, we should have all segments to download
        wtimespan = [1, 2]
        assert Segment.id.key not in segments_df.columns
        assert Segment.download_id.key not in segments_df.columns
        orig_seg_df = segments_df.copy()
        segments_df, request_timebounds_need_update = \
            prepare_for_download(db.session, orig_seg_df, dc_dataselect_manager, wtimespan,
                                 retry_seg_not_found=True,
                                 retry_url_err=True,
                                 retry_mseed_err=True,
                                 retry_client_err=True,
                                 retry_server_err=True,
                                 retry_timespan_err=True,
                                 retry_timespan_warn=True)

        assert request_timebounds_need_update is False

        logmsg = self.log_msg()
        # the number of duplicates should equal the number of segments divided by the
        # number of events (2), as the two events are very close to each other
        expected_dupes = len(segments_df) / len(events_df)
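        # (with the mock data above: 24 segments / 2 events = 12 expected duplicates)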
        assert ("%d suspiciously duplicated segments found" %
                expected_dupes) in logmsg

    def test_download_save_segments_timebounds(self, mock_updatedf, mock_insertdf, mseed_unpack,
                                               db, tt_ak135_tts):
        # prepare:
        # mseed unpack takes no starttime and endtime arguments, so that
        # we do not discard any correct chunk
        mseed_unpack.side_effect = lambda *a, **v: unpack(*a, **v)
        mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v)
        mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v)

        # mock event response: it's the same as self._evt_urlread_sideeffect, but with the
        # dates modified to NOW. This means any segment downloaded later will
        # be out-of-bounds
        utcnow = datetime.utcnow()
        utcnow_iso = utcnow.isoformat().replace("T", " ")
        urlread_sideeffect = """#EventID | Time | Latitude | Longitude | Depth/km | Author | Catalog | Contributor | ContributorID | MagType | Magnitude | MagAuthor | EventLocationName
20160508_0000129|%s|1|1|60.0|AZER|EMSC-RTS|AZER|505483|ml|3|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|%s|90|90|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|4|EMSC|CROATIA
""" % (utcnow_iso, utcnow_iso)
        events_df = self.get_events_df(urlread_sideeffect, db.session)
        # restore urlread_side_effect:
        urlread_sideeffect = None
        net, sta, loc, cha = [], [], [], []
        datacenters_df, eidavalidator = \
            self.get_datacenters_df(urlread_sideeffect, db.session, self.service,
                                    self.routing_service, net, sta, loc, cha,
                                    db_bufsize=self.db_buf_size)
        channels_df = self.get_channels_df(urlread_sideeffect, db.session,
                                           datacenters_df,
                                           eidavalidator,
                                           net, sta, loc, cha, None, None, 10,
                                           False, None, None, -1, self.db_buf_size)
        # just to be sure. If failing, we might have changed the class default:
        assert len(channels_df) == 12
    # events_df
#                  id  magnitude  latitude  longitude  depth_km  time
# 0  20160508_0000129        3.0       1.0        1.0      60.0  2016-05-08 05:17:11.500
# 1  20160508_0000004        4.0       2.0        2.0       2.0  2016-05-08 01:45:30.300

# channels_df (index not shown):
# columns:
# id  station_id  latitude  longitude  datacenter_id start_time end_time network station location channel
# data (not aligned with columns):
# 1   1  1.0   1.0   1 2003-01-01 NaT  GE  FLT1    HHE
# 2   1  1.0   1.0   1 2003-01-01 NaT  GE  FLT1    HHN
# 3   1  1.0   1.0   1 2003-01-01 NaT  GE  FLT1    HHZ
# 4   2  90.0  90.0  1 2009-01-01 NaT  n1  s       c1
# 5   2  90.0  90.0  1 2009-01-01 NaT  n1  s       c2
# 6   2  90.0  90.0  1 2009-01-01 NaT  n1  s       c3
# 7   3  1.0   1.0   2 2003-01-01 NaT  IA  BAKI    BHE
# 8   3  1.0   1.0   2 2003-01-01 NaT  IA  BAKI    BHN
# 9   3  1.0   1.0   2 2003-01-01 NaT  IA  BAKI    BHZ
# 10  4  90.0  90.0  2 2009-01-01 NaT  n2  s       c1
# 11  4  90.0  90.0  2 2009-01-01 NaT  n2  s       c2
# 12  4  90.0  90.0  2 2009-01-01 NaT  n2  s       c3

        assert all(_ in channels_df.columns for _ in [Station.network.key, Station.station.key,
                                                      Channel.location.key, Channel.channel.key])
        chaid2mseedid = chaid2mseedid_dict(channels_df)
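        # chaid2mseedid maps each channel db id to its "NET.STA.LOC.CHA" seed id;
        # as asserted right below, building it also removes the network, station,
        # location and channel columns from channels_df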
        # check that we removed the columns:
        assert not any(_ in channels_df.columns for _ in
                       [Station.network.key, Station.station.key,
                        Channel.location.key, Channel.channel.key])

        # take all segments:
        # use minmag and maxmag
        ttable = tt_ak135_tts
        segments_df = merge_events_stations(events_df, channels_df, dict(minmag=10, maxmag=10,
                                            minmag_radius=10, maxmag_radius=10), tttable=ttable)

        assert len(pd.unique(segments_df['arrival_time'])) == 2
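        # (two distinct arrival times, one per event)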


# segments_df (index not shown). Note that the last two columns are event-related and
# will be removed after the arrival_time calculation:
# cid sid did n   s    l  c    ed   event_id          depth_km                time
# 1   1   1   GE  FLT1    HHE  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 2   1   1   GE  FLT1    HHN  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 3   1   1   GE  FLT1    HHZ  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 7   3   2   IA  BAKI    BHE  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 8   3   2   IA  BAKI    BHN  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 9   3   2   IA  BAKI    BHZ  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 4   2   1   n1  s       c1   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 5   2   1   n1  s       c2   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 6   2   1   n1  s       c3   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 10  4   2   n2  s       c1   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 11  4   2   n2  s       c2   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 12  4   2   n2  s       c3   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300

# LEGEND:
# cid = channel_id
# sid = station_id
# did = datacenter_id
# n, s, l, c = network, station, location, channel
# ed = event_distance_deg

        # define a dc_dataselect_manager for open data only:
        dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False)

        wtimespan = [1, 2]  # in minutes
        expected = len(segments_df)  # no segment on db, we should have all segments to download
        orig_segments_df = segments_df.copy()
        segments_df, request_timebounds_need_update = \
            prepare_for_download(db.session, orig_segments_df, dc_dataselect_manager, wtimespan,
                                 retry_seg_not_found=True,
                                 retry_url_err=True,
                                 retry_mseed_err=True,
                                 retry_client_err=True,
                                 retry_server_err=True,
                                 retry_timespan_err=True,
                                 retry_timespan_warn=True)

# segments_df
# COLUMNS:
# channel_id  datacenter_id network station location channel event_distance_deg event_id arrival_time start_time end_time id download_status_code run_id
# DATA (not aligned with columns):
#               channel_id  datacenter_id network station location channel  event_distance_deg  event_id            arrival_time          start_time            end_time    id download_status_code  run_id
# GE.FLT1..HHE  1           1              GE      FLT1             HHE     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# GE.FLT1..HHN  2           1              GE      FLT1             HHN     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# GE.FLT1..HHZ  3           1              GE      FLT1             HHZ     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# IA.BAKI..BHE  7           2              IA      BAKI             BHE     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# IA.BAKI..BHN  8           2              IA      BAKI             BHN     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# IA.BAKI..BHZ  9           2              IA      BAKI             BHZ     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# n1.s..c1      4           1              n1      s                c1      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n1.s..c2      5           1              n1      s                c2      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n1.s..c3      6           1              n1      s                c3      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n2.s..c1      10          2              n2      s                c1      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n2.s..c2      11          2              n2      s                c2      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n2.s..c3      12          2              n2      s                c3      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1

        # self._seg_data is the file content of a "valid" 3-channel miniseed.
        # Segments whose seed id does not match any channel in it will be written
        # with a None download status code

        # setup urlread: first three rows: ok
        # rows[3:6]: 413, retry them
        # rows[6:9]: malformed_data
        # rows[9:12]: 413, retry them
        # then retry:
        # rows[3]: empty_data
        # rows[4]: data_with_gaps (but seed_id should not match)
        # rows[5]: data_with_gaps (seed_id should not match)
        # rows[9]: URLError
        # rows[10]: Http 500 error
        # rows[11]: 413

        # NOTE THAT THIS RELIES ON THE FACT THAT THREADS ARE EXECUTED IN THE ORDER OF THE DATAFRAME
        # WHICH SEEMS TO BE THE CASE AS THERE IS ONE SINGLE PROCESS
        # self._seg_data[:2] is a way to mock data corrupted
        urlread_sideeffect = [self._seg_data, 413, self._seg_data[:2], 413,
                              '', self._seg_data_gaps, self._seg_data_gaps,
                              URLError("++urlerror++"), 500, 413]
        # Let's go:
        ztatz = self.download_save_segments(urlread_sideeffect, db.session, segments_df,
                                            dc_dataselect_manager,
                                            chaid2mseedid,
                                            self.run.id, False,
                                            request_timebounds_need_update,
                                            1, 2, 3, db_bufsize=self.db_buf_size)
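        # ztatz holds the download statistics returned by download_save_segments
        # (not inspected in this test)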
        # get columns from db which we are interested on to check
        cols = [Segment.id, Segment.channel_id, Segment.datacenter_id,
                Segment.download_code, Segment.maxgap_numsamples,
                Segment.sample_rate, Segment.data_seed_id, Segment.data, Segment.download_id,
                Segment.request_start, Segment.request_end, Segment.start_time, Segment.end_time
                ]
        db_segments_df = dbquery2df(db.session.query(*cols))
        assert Segment.download_id.key in db_segments_df.columns

        OUTTIME_ERR, OUTTIME_WARN = s2scodes.timespan_err, s2scodes.timespan_warn
        # assert no segment has data (time out of bounds):
        assert len(db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) &
                                      (db_segments_df[Segment.data.key].str.len() > 0),
                                      Segment.data.key]) == 0
        # assert that the 4 segments which would otherwise have been downloaded correctly
        # (i.e., with data) now have code = OUTTIME_ERR:
        assert len(db_segments_df[db_segments_df[Segment.download_code.key] == OUTTIME_ERR]) == 4

        # re-sort db_segments_df to match the segments_df:
        ret = []
        for cha in segments_df[Segment.channel_id.key]:
            ret.append(db_segments_df[db_segments_df[Segment.channel_id.key] == cha])
        db_segments_df = pd.concat(ret, axis=0)

# db_segments_df:
#    id  channel_id  datacenter_id  download_status_code  max_gap_ovlap_ratio  sample_rate data_seed_id     data  run_id          start_time            end_time
# 0  1   1           1              -3                    0.0001               100.0        GE.FLT1..HHE    b''   1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 1  2   2           1              -3                    0.0001               100.0        GE.FLT1..HHN    b''   1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 2  3   3           1              -3                    0.0001               100.0        GE.FLT1..HHZ    b''   1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 6  7   7           2              200.0                 NaN                  NaN          None                  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 7  8   8           2              NaN                   NaN                  NaN          None            None  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 8  9   9           2              -3                    20.0                 20.0         IA.BAKI..BHZ    b''   1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 3  4   4           1             -2.0                   NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 4  5   5           1             -2.0                   NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 5  6   6           1             -2.0                   NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 9  10  10          2              -1.0                  NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 10 11  11          2              500.0                 NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 11 12  12          2              413.0                 NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31

        # now modify the first row's time bounds.
        # First we need to assign the database id to our segments_df, to prevent
        # a db constraint error when writing to the db:
        # `download_save_segments` below needs to UPDATE the segments and it does so by
        # checking if an id is present.
        # check that the channel_ids align:
        assert (segments_df[Segment.channel_id.key].values ==
                db_segments_df[Segment.channel_id.key].values).all()
        # so that we can simply do this:
        segments_df[Segment.id.key] = db_segments_df[Segment.id.key]

        # first read the miniseed:
        stream = read(BytesIO(self._seg_data))
        tstart = stream[0].stats.starttime.datetime
        tend = stream[0].stats.endtime.datetime
        segments_df.loc[segments_df[Segment.channel_id.key] == 1,
                        Segment.request_start.key] = tstart
        segments_df.loc[segments_df[Segment.channel_id.key] == 1,
                        Segment.request_end.key] = tstart + (tend-tstart)/2

        segments_df.loc[segments_df[Segment.channel_id.key] == 2,
                        Segment.request_start.key] = tstart
        segments_df.loc[segments_df[Segment.channel_id.key] == 2,
                        Segment.request_end.key] = tend
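        # summary of the expected outcomes after the re-download below (see the asserts at
        # the end of this test):
        # - channel 1: request window covers only part of the data -> data saved, but
        #   download code = timespan warning
        # - channel 2: request window covers the whole data -> data saved, code = 200 (ok)
        # - channel 3: request window unchanged (still out of bounds) -> no data,
        #   code = timespan error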

        # build a segments_df of the three segments belonging to the same channel
        # copy at the end to avoid pandas settingwithcopy warning
        new_segments_df = \
            segments_df.loc[segments_df[Segment.channel_id.key].isin([1, 2, 3]), :].copy()
        # change urlread_sideeffect to provide, for the first three segments, the same
        # sequence of bytes. The sequence is actually OK, but in the first case it will be
        # PARTIALLY saved, in the second case TOTALLY, and in the third case NOT AT ALL:
        urlread_sideeffect = [self._seg_data, self._seg_data, self._seg_data]
        # define a dc_dataselect_manager for open data only:
        dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False)
        ztatz = self.download_save_segments(urlread_sideeffect, db.session, new_segments_df,
                                            dc_dataselect_manager,
                                            chaid2mseedid,
                                            self.run.id, False,
                                            request_timebounds_need_update,
                                            1, 2, 3, db_bufsize=self.db_buf_size)
        db_segments_df = dbquery2df(db.session.query(*cols))
        # re-sort db_segments_df to match the segments_df:
        ret = [db_segments_df[db_segments_df[Segment.channel_id.key] == cha]
               for cha in segments_df[Segment.channel_id.key]]
        db_segments_df = pd.concat(ret, axis=0)

        # assert the 1st segment, whose time range has been modified, has data, BUT
        # download_status_code is now TIMEBOUNDS_WARNING (data only partially within the
        # requested time window)
        df__ = db_segments_df.loc[db_segments_df[Segment.channel_id.key] == 1, :]
        assert len(df__) == 1
        row__ = df__.iloc[0]
        assert row__[Segment.download_code.key] == OUTTIME_WARN
        assert len(row__[Segment.data.key]) > 0

        # assert the 2nd segment whose time range has been modified has data, AND
        # download_status_code 200 (ok)
        df__ = db_segments_df.loc[db_segments_df[Segment.channel_id.key] == 2, :]
        assert len(df__) == 1
        row__ = df__.iloc[0]
        assert row__[Segment.download_code.key] == 200
        assert len(row__[Segment.data.key]) > 0

        # assert the 3rd segment whose time range has NOT been modified has no data,
        # AND download_status_code is still TIMEBOUNDS_ERROR
        df__ = db_segments_df.loc[db_segments_df[Segment.channel_id.key] == 3, :]
        assert len(df__) == 1
        row__ = df__.iloc[0]
        assert row__[Segment.download_code.key] == OUTTIME_ERR
        assert len(row__[Segment.data.key]) == 0

    def test_download_save_segments(self, mock_updatedf, mock_insertdf, mseed_unpack, db,
                                    tt_ak135_tts):
        # prepare:
        # mseed unpack takes no starttime and endtime arguments, so that
        # we do not discard any correct chunk
        mseed_unpack.side_effect = lambda *a, **v: unpack(a[0])
        mock_insertdf.side_effect = lambda *a, **v: insertdf(*a, **v)
        mock_updatedf.side_effect = lambda *a, **v: updatedf(*a, **v)

        urlread_sideeffect = None  # use defaults from class
        events_df = self.get_events_df(urlread_sideeffect, db.session)
        net, sta, loc, cha = [], [], [], []
        datacenters_df, eidavalidator = \
            self.get_datacenters_df(urlread_sideeffect, db.session, self.service,
                                    self.routing_service, net, sta, loc, cha,
                                    db_bufsize=self.db_buf_size)
        channels_df = self.get_channels_df(urlread_sideeffect, db.session,
                                           datacenters_df,
                                           eidavalidator,
                                           net, sta, loc, cha, None, None, 10,
                                           False, None, None, -1, self.db_buf_size)
        assert len(channels_df) == 12  # just to be sure. If failing, we might have changed the class default
    # events_df
#                  id  magnitude  latitude  longitude  depth_km  time
# 0  20160508_0000129        3.0       1.0        1.0      60.0  2016-05-08 05:17:11.500
# 1  20160508_0000004        4.0       2.0        2.0       2.0  2016-05-08 01:45:30.300

# channels_df (index not shown):
# columns:
# id  station_id  latitude  longitude  datacenter_id start_time end_time network station location channel
# data (not aligned with columns):
# 1   1  1.0   1.0   1 2003-01-01 NaT  GE  FLT1    HHE
# 2   1  1.0   1.0   1 2003-01-01 NaT  GE  FLT1    HHN
# 3   1  1.0   1.0   1 2003-01-01 NaT  GE  FLT1    HHZ
# 4   2  90.0  90.0  1 2009-01-01 NaT  n1  s       c1
# 5   2  90.0  90.0  1 2009-01-01 NaT  n1  s       c2
# 6   2  90.0  90.0  1 2009-01-01 NaT  n1  s       c3
# 7   3  1.0   1.0   2 2003-01-01 NaT  IA  BAKI    BHE
# 8   3  1.0   1.0   2 2003-01-01 NaT  IA  BAKI    BHN
# 9   3  1.0   1.0   2 2003-01-01 NaT  IA  BAKI    BHZ
# 10  4  90.0  90.0  2 2009-01-01 NaT  n2  s       c1
# 11  4  90.0  90.0  2 2009-01-01 NaT  n2  s       c2
# 12  4  90.0  90.0  2 2009-01-01 NaT  n2  s       c3

        assert all(_ in channels_df.columns for _ in [Station.network.key, Station.station.key,
                                                      Channel.location.key, Channel.channel.key])
        chaid2mseedid = chaid2mseedid_dict(channels_df)
        # check that we removed the columns:
        assert not any(_ in channels_df.columns for _ in
                       [Station.network.key, Station.station.key,
                        Channel.location.key, Channel.channel.key])

        # take all segments:
        # use minmag and maxmag
        ttable = tt_ak135_tts
        segments_df = merge_events_stations(events_df, channels_df, dict(minmag=10, maxmag=10,
                                            minmag_radius=10, maxmag_radius=10), tttable=ttable)

        assert len(pd.unique(segments_df['arrival_time'])) == 2


# segments_df (index not shown). Note that the last two columns are event-related and
# will be removed after the arrival_time calculation:
# cid sid did n   s    l  c    ed   event_id          depth_km                time
# 1   1   1   GE  FLT1    HHE  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 2   1   1   GE  FLT1    HHN  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 3   1   1   GE  FLT1    HHZ  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 7   3   2   IA  BAKI    BHE  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 8   3   2   IA  BAKI    BHN  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 9   3   2   IA  BAKI    BHZ  0.0  20160508_0000129  60.0 2016-05-08 05:17:11.500
# 4   2   1   n1  s       c1   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 5   2   1   n1  s       c2   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 6   2   1   n1  s       c3   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 10  4   2   n2  s       c1   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 11  4   2   n2  s       c2   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300
# 12  4   2   n2  s       c3   0.0  20160508_0000004  2.0  2016-05-08 01:45:30.300

# LEGEND:
# cid = channel_id
# sid = station_id
# did = datacenter_id
# n, s, l, c = network, station, location, channel
# ed = event_distance_deg

        # define a dc_dataselect_manager for open data only:
        dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False)

        wtimespan = [1, 2]
        expected = len(segments_df)  # no segment on db, we should have all segments to download
        orig_segments_df = segments_df.copy()
        segments_df, request_timebounds_need_update = \
            prepare_for_download(db.session, orig_segments_df, dc_dataselect_manager, wtimespan,
                                 retry_seg_not_found=True,
                                 retry_url_err=True,
                                 retry_mseed_err=True,
                                 retry_client_err=True,
                                 retry_server_err=True,
                                 retry_timespan_err=True,
                                 retry_timespan_warn=True)

# segments_df
# COLUMNS:
# channel_id  datacenter_id network station location channel event_distance_deg event_id arrival_time start_time end_time id download_status_code run_id
# DATA (not aligned with columns):
#               channel_id  datacenter_id network station location channel  event_distance_deg  event_id            arrival_time          start_time            end_time    id download_status_code  run_id
# GE.FLT1..HHE  1           1              GE      FLT1             HHE     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# GE.FLT1..HHN  2           1              GE      FLT1             HHN     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# GE.FLT1..HHZ  3           1              GE      FLT1             HHZ     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# IA.BAKI..BHE  7           2              IA      BAKI             BHE     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# IA.BAKI..BHN  8           2              IA      BAKI             BHN     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# IA.BAKI..BHZ  9           2              IA      BAKI             BHZ     0.0                 1        2016-05-08 05:17:12.500 2016-05-08 05:16:12 2016-05-08 05:19:12  None  None                 1
# n1.s..c1      4           1              n1      s                c1      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n1.s..c2      5           1              n1      s                c2      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n1.s..c3      6           1              n1      s                c3      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n2.s..c1      10          2              n2      s                c1      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n2.s..c2      11          2              n2      s                c2      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1
# n2.s..c3      12          2              n2      s                c3      0.0                 2        2016-05-08 01:45:31.300 2016-05-08 01:44:31 2016-05-08 01:47:31  None  None                 1

        # self._seg_data is the file content of a "valid" 3-channel miniseed.
        # Segments whose seed id does not match any channel in it will be written
        # with a None download status code

        # setup urlread: first three rows: ok
        # rows[3:6]: 413, retry them
        # rows[6:9]: malformed_data
        # rows[9:12]: 413, retry them
        # then retry:
        # rows[3]: empty_data
        # rows[4]: data_with_gaps (but seed_id should not match)
        # rows[5]: data_with_gaps (seed_id should not match)
        # rows[9]: URLError
        # rows[10]: Http 500 error
        # rows[11]: 413

        # NOTE THAT THIS RELIES ON THE FACT THAT THREADS ARE EXECUTED IN THE ORDER OF THE DATAFRAME
        # WHICH SEEMS TO BE THE CASE AS THERE IS ONE SINGLE PROCESS
        # self._seg_data[:2] is a way to mock data corrupted
        urlread_sideeffect = [self._seg_data, 413, self._seg_data[:2], 413,
                              '', self._seg_data_gaps, self._seg_data_gaps,
                              URLError("++urlerror++"), 500, 413]
        # Let's go:
        ztatz = self.download_save_segments(urlread_sideeffect, db.session, segments_df,
                                            dc_dataselect_manager,
                                            chaid2mseedid,
                                            self.run.id, False,
                                            request_timebounds_need_update,
                                            1, 2, 3, db_bufsize=self.db_buf_size)
        # get columns from db which we are interested on to check
        cols = [Segment.id, Segment.channel_id, Segment.datacenter_id,
                Segment.download_code, Segment.maxgap_numsamples,
                Segment.sample_rate, Segment.data_seed_id, Segment.data, Segment.download_id,
                Segment.request_start, Segment.request_end, Segment.start_time,
                Segment.end_time]
        db_segments_df = dbquery2df(db.session.query(*cols))
        assert Segment.download_id.key in db_segments_df.columns

        # replace the data column content, otherwise we cannot display db_segments_df:
        # when there is (non-empty) data, just show b'data'
        db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) &
                           (db_segments_df[Segment.data.key].str.len() > 0),
                           Segment.data.key] = b'data'

        # assert we have 4 segments with "data" properly set:
        assert len(db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) &
                                      (db_segments_df[Segment.data.key].str.len() > 0),
                                      Segment.data.key]) == 4

        # re-sort db_segments_df to match the segments_df:
        ret = []
        for cha in segments_df[Segment.channel_id.key]:
            ret.append(db_segments_df[db_segments_df[Segment.channel_id.key] == cha])
        db_segments_df = pd.concat(ret, axis=0)

# db_segments_df:
#    id  channel_id  datacenter_id  download_code         maxgap_numsamples    sample_rate data_seed_id     data  download_id     start_time            end_time
# 0  1   1           1              200.0                 0.0001               100.0        GE.FLT1..HHE    data  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 1  2   2           1              200.0                 0.0001               100.0        GE.FLT1..HHN    data  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 2  3   3           1              200.0                 0.0001               100.0        GE.FLT1..HHZ    data  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 6  7   7           2              200.0                 NaN                  NaN          None                  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 7  8   8           2              NaN                   NaN                  NaN          None            None  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 8  9   9           2              200.0                 20.0                 20.0         IA.BAKI..BHZ    data  1      2016-05-08 05:16:12 2016-05-08 05:19:12
# 3  4   4           1             -2.0                   NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 4  5   5           1             -2.0                   NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 5  6   6           1             -2.0                   NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 9  10  10          2              -1.0                  NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 10 11  11          2              500.0                 NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31
# 11 12  12          2              413.0                 NaN                  NaN          None            None  1      2016-05-08 01:44:31 2016-05-08 01:47:31

        assert len(ztatz) == len(datacenters_df)
        assert len(db_segments_df) == len(segments_df)
        assert mock_updatedf.call_count == 0

        dsc = db_segments_df[Segment.download_code.key]
        exp_dsc = np.array([200, 200, 200, 200, np.nan, 200, -2, -2, -2, -1, 500, 413])
        assert ((dsc == exp_dsc) | (np.isnan(dsc) & np.isnan(exp_dsc))).all()
        # we have 12 segments and a buffer size of self.db_buf_size (=1, but it might change
        # in the future), so the expected number of insert calls is computed below rather
        # than hard-coded

        # test that we correctly called mock_insertdf. Note that we assume that the
        # latter is called ONLY inside DbManager. To test that, as the number of rows
        # to be added (length of the dataframes) varies, we need to implement a counter here:
        mock_insertdf_call_count = 0
        _bufsize = 0
        for c in mock_insertdf.call_args_list:
            c_args = c[0]
            df_ = c_args[0]
            _bufsize += len(df_)
            if _bufsize >= self.db_buf_size:
                mock_insertdf_call_count += 1
                _bufsize = 0

        assert mock_insertdf.call_count == mock_insertdf_call_count

        # assert data is consistent
        COL = Segment.data.key
        assert (db_segments_df.iloc[:3][COL] == b'data').all()
        assert (db_segments_df.iloc[3:4][COL] == b'').all()
        assert pd.isnull(db_segments_df.iloc[4:5][COL]).all()
        assert (db_segments_df.iloc[5:6][COL] == b'data').all()
        assert pd.isnull(db_segments_df.iloc[6:][COL]).all()

        # assert download status code is consistent
        URLERR_CODE, MSEEDERR_CODE = s2scodes.url_err, s2scodes.mseed_err

        # this also asserts that requests were grouped by data-center, start time and end time
        COL = Segment.download_code.key
        assert (db_segments_df.iloc[:4][COL] == 200).all()
        assert pd.isnull(db_segments_df.iloc[4:5][COL]).all()
        assert (db_segments_df.iloc[5:6][COL] == 200).all()
        assert (db_segments_df.iloc[6:9][COL] == MSEEDERR_CODE).all()
        assert (db_segments_df.iloc[9][COL] == URLERR_CODE).all()
        assert (db_segments_df.iloc[10][COL] == 500).all()
        assert (db_segments_df.iloc[11][COL] == 413).all()

        # assert gaps are only in the given position
        COL = Segment.maxgap_numsamples.key
        assert (db_segments_df.iloc[:3][COL] < 0.01).all()
        assert pd.isnull(db_segments_df.iloc[3:5][COL]).all()
        assert (db_segments_df.iloc[5][COL] == 20).all()
        assert pd.isnull(db_segments_df.iloc[6:][COL]).all()

        # now mock retry:
        segments_df, request_timebounds_need_update = \
            prepare_for_download(db.session, orig_segments_df, dc_dataselect_manager, wtimespan,
                                 retry_seg_not_found=True,
                                 retry_url_err=True,
                                 retry_mseed_err=True,
                                 retry_client_err=True,
                                 retry_server_err=True,
                                 retry_timespan_err=True,
                                 retry_timespan_warn=True)

        assert request_timebounds_need_update is False

        COL = Segment.download_code.key
        mask = (db_segments_df[COL] >= 400) | pd.isnull(db_segments_df[COL]) \
            | (db_segments_df[COL].isin([URLERR_CODE, MSEEDERR_CODE]))
        assert len(segments_df) == len(db_segments_df[mask])

        urlread_sideeffect = [413]
        mock_updatedf.reset_mock()
        mock_insertdf.reset_mock()
        # define a dc_dataselect_manager for open data only:
        dc_dataselect_manager = DcDataselectManager(datacenters_df, Authorizer(None), False)
        # Let's go:
        ztatz = self.download_save_segments(urlread_sideeffect, db.session, segments_df,
                                            dc_dataselect_manager,
                                            chaid2mseedid,
                                            self.run.id, False,
                                            request_timebounds_need_update,
                                            1, 2, 3, db_bufsize=self.db_buf_size)
        # get columns from db which we are interested on to check
        cols = [Segment.download_code, Segment.channel_id]
        db_segments_df = dbquery2df(db.session.query(*cols))

        # change data column otherwise we cannot display db_segments_df. When there is data
        # just print "data"
        # db_segments_df.loc[(~pd.isnull(db_segments_df[Segment.data.key])) &
        # (db_segments_df[Segment.data.key].str.len() > 0), Segment.data.key] = b'data'

        # re-sort db_segments_df to match the segments_df:
        ret = []
        for cha in segments_df[Segment.channel_id.key]:
            ret.append(db_segments_df[db_segments_df[Segment.channel_id.key] == cha])
        db_segments_df = pd.concat(ret, axis=0)

        assert (db_segments_df[COL] == 413).all()
        assert len(ztatz) == len(datacenters_df)
        assert len(db_segments_df) == len(segments_df)

        # same as above, but with updatedf: test that we correctly called mock_updatedf.
        # Note that we assume that the latter is called ONLY inside download.main.DbManager.
        # To test that, as the number of rows to be added (length of the dataframes) varies,
        # we need to implement a counter here:
        mock_updatedf_call_count = 0
        _bufsize = 0
        for c in mock_updatedf.call_args_list:
            c_args = c[0]
            df_ = c_args[0]
            _bufsize += len(df_)
            if _bufsize >= self.db_buf_size:
                mock_updatedf_call_count += 1
                _bufsize = 0

        assert mock_updatedf.call_count == mock_updatedf_call_count

        assert mock_insertdf.call_count == 0
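
The two counters in the test above (for mock_insertdf and mock_updatedf) implement the same buffer-flush idea. Below is a minimal standalone sketch of it, assuming a DbManager-like buffer that issues one write each time the pending row count reaches db_buf_size; expected_flush_count is a hypothetical helper, not part of the library:

def expected_flush_count(chunk_lengths, buf_size):
    """Return how many writes a buffered db writer would issue, assuming it
    flushes whenever the pending row count reaches `buf_size` and then resets
    the pending count to zero (as the inline counters in the test do)."""
    calls, pending = 0, 0
    for length in chunk_lengths:
        pending += length
        if pending >= buf_size:
            calls += 1
            pending = 0
    return calls

# e.g. 12 dataframes of one row each with buf_size=1 => 12 expected writes:
assert expected_flush_count([1] * 12, 1) == 12
# with buf_size=5 the sketch reports 2 flushes (rows 1-5 and 6-10); the last
# 2 rows stay below the threshold:
assert expected_flush_count([1] * 12, 5) == 2
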
Example #5
def run(session,
        download_id,
        eventws,
        starttime,
        endtime,
        dataws,
        eventws_params,
        network,
        station,
        location,
        channel,
        min_sample_rate,
        search_radius,
        update_metadata,
        inventory,
        timespan,
        retry_seg_not_found,
        retry_url_err,
        retry_mseed_err,
        retry_client_err,
        retry_server_err,
        retry_timespan_err,
        tt_table,
        advanced_settings,
        authorizer,
        isterminal=False):
    """
        Downloads waveforms related to events to a specific path.
        This function is not intended to be called directly (PENDING: update doc?)

        :raise: :class:`FailedDownload` exceptions
    """

    # REMINDER: **Any function here EXPECTS ITS DATAFRAME INPUT TO BE NON-EMPTY.**
    # Thus, any function returning a dataframe is responsible for returning a well
    # formed (non-empty) dataframe; if that is not possible, the function should raise
    # either:
    # 1) a NothingToDownload, to stop the routine silently (logged as info) and proceed
    #    to the inventories (if set), or
    # 2) a FailedDownload, to stop the download immediately and propagate the exception

    dbbufsize = advanced_settings['db_buf_size']
    update_md_only = update_metadata == 'only'
    if update_md_only:
        update_metadata = True

    process = psutil.Process(os.getpid()) if isterminal else None
    # calculate steps (note that bool math works, e.g: 8 - True == 7):
    __steps = 3 if update_md_only else \
        (6 + inventory + (True if authorizer.token else False))
    stepiter = iter(range(1, __steps + 1))

    # custom function for logging.info different steps:
    def stepinfo(text, *args, **kwargs):
        step = next(stepiter)
        logger.info("\nSTEP %d of %d: {}".format(text), step, __steps, *args,
                    **kwargs)
        if process is not None:
            percent = process.memory_percent()
            logger.warning("(%.1f%% memory used)", percent)

    try:
        if not update_md_only:

            stepinfo("Fetching events")
            events_df = get_events_df(session, eventws, eventws_params,
                                      starttime, endtime, dbbufsize,
                                      advanced_settings['e_timeout'],
                                      isterminal)

        # Get datacenters, store them in the db, returns the dc instances (db rows) correctly
        # added
        stepinfo("Fetching data-centers")
        # get data-centers (might raise FailedDownload):
        datacenters_df, eidavalidator = \
            get_datacenters_df(session, dataws, advanced_settings['routing_service_url'],
                               network, station, location, channel, starttime, endtime,
                               dbbufsize)

        stepinfo("Fetching stations and channels from %d %s",
                 len(datacenters_df),
                 "data-center" if len(datacenters_df) == 1 else "data-centers")
        # get channels (might raise FailedDownload):
        channels_df = get_channels_df(
            session, datacenters_df, eidavalidator, network, station, location,
            channel, starttime, endtime, min_sample_rate, update_metadata,
            advanced_settings['max_thread_workers'],
            advanced_settings['s_timeout'],
            advanced_settings['download_blocksize'], dbbufsize, isterminal)

        if not update_md_only:
            # get channel id to mseed id dict and purge channels_df
            # the dict will be used to download the segments later, but we use it now to drop
            # unnecessary columns and save space (and time)
            chaid2mseedid = chaid2mseedid_dict(channels_df,
                                               drop_mseedid_columns=True)

            stepinfo("Selecting stations within search area from %d events",
                     len(events_df))
            # merge events and stations (might raise FailedDownload):
            segments_df = merge_events_stations(events_df, channels_df,
                                                search_radius, tt_table,
                                                isterminal)
            # help gc by deleting the (only) refs to unused dataframes
            del events_df
            del channels_df

            if authorizer.token:
                stepinfo(
                    "Acquiring credentials from token in order to download restricted data"
                )
            dc_dataselect_manager = DcDataselectManager(
                datacenters_df, authorizer, isterminal)

            stepinfo("%d segments found. Checking already downloaded segments",
                     len(segments_df))
            # might raise NothingToDownload:
            segments_df, request_timebounds_need_update = \
                prepare_for_download(session, segments_df, dc_dataselect_manager,
                                     timespan, retry_seg_not_found,
                                     retry_url_err, retry_mseed_err, retry_client_err,
                                     retry_server_err, retry_timespan_err,
                                     retry_timespan_warn=False)

            # prepare_for_download raises a NothingToDownload if there is no data, so if we are
            # here segments_df is not empty
            stepinfo(
                "Downloading %d segments %sand saving to db", len(segments_df),
                '(open data only) '
                if dc_dataselect_manager.opendataonly else '')
            # free memory. Although maybe unnecessary, let's do our best to free resources
            # because the next step is memory consuming:
            # https://stackoverflow.com/questions/30021923/how-to-delete-a-sqlalchemy-mapped-object-from-memory
            session.expunge_all()
            session.close()

            d_stats = download_save_segments(
                session, segments_df, dc_dataselect_manager, chaid2mseedid,
                download_id, update_metadata, request_timebounds_need_update,
                advanced_settings['max_thread_workers'],
                advanced_settings['w_timeout'],
                advanced_settings['download_blocksize'], dbbufsize, isterminal)
            del segments_df  # help gc?
            session.close()  # frees memory?
            logger.info("")
            logger.info(
                ("** Segments download summary **\n"
                 "Number of segments per data center url (row) and response "
                 "type (column):\n%s") % (str(d_stats) or "Nothing to show"))

    except NothingToDownload as ntdexc:
        # we are here if some function raised a NothingToDownload (e.g., in prepare_for_download
        # there is nothing to do according to the current config). Print the message as info;
        # note that inventories might still be downloaded (see the finally clause below)
        logger.info(str(ntdexc))
        # comment out: DO NOT RAISE:
        # raise
    except FailedDownload as dexc:
        # We are here if we raised a FailedDownload. Same behaviour as NothingToDownload,
        # except we log an error message, and we prevent downloading inventories by forcing
        # the flag to be false
        inventory = False
        logger.error(dexc)
        raise
    except:  # @IgnorePep8
        inventory = False
        raise
    finally:
        if inventory:
            # free memory. Although maybe unnecessary, let's do our best to free resources
            # because the next step might be memory consuming:
            # https://stackoverflow.com/questions/30021923/how-to-delete-a-sqlalchemy-mapped-object-from-memory
            session.expunge_all()
            session.close()

            # query station id, network station, datacenter_url
            # for those stations with empty inventory_xml
            # AND at least one segment non empty/null
            # Download inventories for those stations only
            sta_df = dbquery2df(
                query4inventorydownload(session, update_metadata))
            # stations = session.query(Station).filter(~withdata(Station.inventory_xml)).all()
            if sta_df.empty:
                stepinfo("Skipping: No station inventory to download")
            else:
                stepinfo("Downloading %d station inventories", len(sta_df))
                n_downloaded, n_empty, n_errors = \
                    save_inventories(session, sta_df,
                                     advanced_settings['max_thread_workers'],
                                     advanced_settings['i_timeout'],
                                     advanced_settings['download_blocksize'], dbbufsize,
                                     isterminal)
                logger.info(("** Station inventories download summary **\n"
                             "- downloaded     %7d \n"
                             "- discarded      %7d (empty response)\n"
                             "- not downloaded %7d (client/server errors)"),
                            n_downloaded, n_empty, n_errors)
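
As a side note on the control flow of run above: the REMINDER convention (every step either hands a non-empty dataframe to the next step or raises) can be sketched as follows. The class bodies and fetch_step below are illustrative placeholders only, not the library's actual classes or signatures:

class NothingToDownload(Exception):
    """Stop the routine silently (logged as info); inventories may still be fetched."""


class FailedDownload(Exception):
    """Abort the download; the exception is propagated to the caller."""


def fetch_step(dataframe):
    # every step is responsible for handing a non-empty dataframe to the next one
    if dataframe is None:
        raise FailedDownload("step failed")
    if len(dataframe) == 0:
        raise NothingToDownload("nothing to download with the current config")
    return dataframe


def toy_run(dataframe, inventory=True):
    try:
        fetch_step(dataframe)
    except NothingToDownload as exc:
        print("info:", exc)      # logged, not re-raised
    except FailedDownload:
        inventory = False        # prevent the inventory step
        raise
    finally:
        if inventory:
            print("downloading station inventories (if any)")
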
Example #6
    def test_merge_event_stations_mag_independent_circle(self, db, tt_ak135_tts):
        # get events with lat lon (1,1), (2,2,) ... (n, n)
        urlread_sideeffect = """#EventID | Time | Latitude | Longitude | Depth/km | Author | Catalog | Contributor | ContributorID | MagType | Magnitude | MagAuthor | EventLocationName
20160508_0000129|2016-05-08 05:17:11.500000|1|1|60.0|AZER|EMSC-RTS|AZER|505483|ml|3|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|90|90|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|4|EMSC|CROATIA
"""
        events_df = self.get_events_df(urlread_sideeffect, db.session)

        net, sta, loc, cha = [], [], [], []
        datacenters_df, eidavalidator = \
            self.get_datacenters_df(None, db.session, None, self.routing_service,
                                    net, sta, loc, cha, db_bufsize=self.db_buf_size)

        # url read for channels: Note: first response data raises, second has an error and
        # that error is skipped (other channels are added), and last two channels are from two
        # stations (BLA|BLA|...) with only different start time (thus stations should both be
        # added)
        urlread_sideeffect  = ["""#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime
A|a||HHZ|1|1|622.0|0.0|0.0|-90.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|50.0|2008-02-12T00:00:00|
A|b||HHE|2|2|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|
""",
"""#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime
A|c||HHZ|3|3|622.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2008-02-12T00:00:00|
BLA|e||HHZ|7|7|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|2019-01-01T00:00:00
BLA|e||HHZ|8|8|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2019-01-01T00:00:00|
""",  URLError('wat'), socket.timeout()]

        channels_df = self.get_channels_df(urlread_sideeffect, db.session,
                                           datacenters_df,
                                           eidavalidator,
                                           net, sta, loc, cha, None, None, 10,
                                           False, None, None, -1, self.db_buf_size)
        assert len(channels_df) == 5
        # events_df
#    id  magnitude  latitude  longitude  depth_km                    time
# 0  1   3.0        1.0       1.0        60.0     2016-05-08 05:17:11.500
# 1  2   4.0        90.0      90.0       2.0      2016-05-08 01:45:30.300

    # channels_df:
#     id station_id  latitude  longitude  datacenter_id start_time   end_time
# 0   1           1       1.0        1.0              1 2008-02-12        NaT
# 1   2           2       2.0        2.0              1 2009-01-01        NaT
# 2   3           3       3.0        3.0              2 2008-02-12        NaT
# 3   4           4       7.0        7.0              2 2009-01-01 2019-01-01
# 4   5           5       8.0        8.0              2 2019-01-01        NaT


        tt_table = tt_ak135_tts
        # the search radius here is magnitude-independent: every station within
        # [min, max] = [0, 10] degrees of an event is taken, regardless of the
        # event magnitude
        df = merge_events_stations(events_df, channels_df, dict(min=0, max=10),
                                   tttable=tt_table)
        # the first event results in 4 potential segments
        # (the last channel has been opened too late),
        # the second event results in 0 potential segments
        # (too far away):
        assert len(df) == 4

        # now let's see: the channel with id = 4 is 8.48 degrees away
        # from the first event. By setting max=8:
        df = merge_events_stations(events_df, channels_df, dict(min=0, max=8),
                                   tttable=tt_table)
        # we should get:
        assert len(df) == 3

        # now let's restrict again: the search_radius min is increased to 1.414, meaning that
        # we skip the first two channels (distances = 0 and 1.413, respectively),
        # leaving us with 3 - 2 = 1 potential segment only:
        df = merge_events_stations(events_df, channels_df, dict(min=1.414, max=8),
                                   tttable=tt_table)
        # we should get:
        assert len(df) == 1

        # now let's take all combinations (2 events x 4 channels = 8 potential segments).
        df = merge_events_stations(events_df, channels_df, dict(min=0, max=90),
                                   tttable=tt_table)
        # we should get:
        assert len(df) == 8
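
To see where distances such as 1.413 and 8.48 degrees in the comments above come from, here is a small self-contained sketch of the magnitude-independent selection, using a plain great-circle (haversine) distance in degrees; gc_degrees is an illustrative helper, not the function used internally by merge_events_stations:

from math import asin, cos, degrees, radians, sin, sqrt


def gc_degrees(lat1, lon1, lat2, lon2):
    """Great-circle distance between two points, in degrees of arc."""
    phi1, phi2 = radians(lat1), radians(lat2)
    dphi = radians(lat2 - lat1)
    dlmb = radians(lon2 - lon1)
    a = sin(dphi / 2) ** 2 + cos(phi1) * cos(phi2) * sin(dlmb / 2) ** 2
    return degrees(2 * asin(sqrt(a)))


event = (1.0, 1.0)                            # first event of the test above
stations = [(1, 1), (2, 2), (3, 3), (7, 7)]   # the four channels opened before the event
dists = [gc_degrees(event[0], event[1], la, lo) for la, lo in stations]
# dists is approximately [0.0, 1.413, 2.83, 8.47]
kept = [d for d in dists if 0 <= d <= 10]     # dict(min=0, max=10): all 4 kept
kept8 = [d for d in dists if 0 <= d <= 8]     # dict(min=0, max=8): 3 kept
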
Example #7
    def test_merge_event_stations(self, db, tt_ak135_tts):
        # get events with lat lon (1,1), (2,2,) ... (n, n)
        urlread_sideeffect = """#EventID | Time | Latitude | Longitude | Depth/km | Author | Catalog | Contributor | ContributorID | MagType | Magnitude | MagAuthor | EventLocationName
20160508_0000129|2016-05-08 05:17:11.500000|1|1|60.0|AZER|EMSC-RTS|AZER|505483|ml|3|AZER|CASPIAN SEA, OFFSHR TURKMENISTAN
20160508_0000004|2016-05-08 01:45:30.300000|2|2|2.0|EMSC|EMSC-RTS|EMSC|505183|ml|4|EMSC|CROATIA
"""
        events_df = self.get_events_df(urlread_sideeffect, db.session)

        net, sta, loc, cha = [], [], [], []
        datacenters_df, eidavalidator = \
            self.get_datacenters_df(None, db.session, None, self.routing_service,
                                    net, sta, loc, cha, db_bufsize=self.db_buf_size)

        # url read for channels: Note: first response data raises, second has an error and
        # that error is skipped (other channels are added), and last two channels are from two
        # stations (BLA|BLA|...) with only different start time (thus stations should both be
        # added)
        urlread_sideeffect  = ["""#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime
A|a||HHZ|1|1|622.0|0.0|0.0|-90.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|50.0|2008-02-12T00:00:00|
A|b||HHE|2|2|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|
""",
"""#Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime
A|c||HHZ|3|3|622.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2008-02-12T00:00:00|
BLA|e||HHZ|7|7|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2009-01-01T00:00:00|2019-01-01T00:00:00
BLA|e||HHZ|8|8|485.0|0.0|90.0|0.0|GFZ:HT1980:CMG-3ESP/90/g=2000|838860800.0|0.1|M/S|100.0|2019-01-01T00:00:00|
""",  URLError('wat'), socket.timeout()]

        channels_df = self.get_channels_df(urlread_sideeffect, db.session,
                                           datacenters_df,
                                           eidavalidator,
                                           net, sta, loc, cha, None, None, 10,
                                           False, None, None, -1, self.db_buf_size)
        assert len(channels_df) == 5

    # events_df
#    id  magnitude  latitude  longitude  depth_km                    time
# 0  1   3.0        1.0       1.0        60.0     2016-05-08 05:17:11.500
# 1  2   4.0        2.0       2.0       2.0      2016-05-08 01:45:30.300

    # channels_df:
#     id station_id  latitude  longitude  datacenter_id start_time   end_time
# 0   1           1       1.0        1.0              1 2008-02-12        NaT
# 1   2           2       2.0        2.0              1 2009-01-01        NaT
# 2   3           3       3.0        3.0              2 2008-02-12        NaT
# 3   4           4       7.0        7.0              2 2009-01-01 2019-01-01
# 4   5           5       8.0        8.0              2 2019-01-01        NaT

        tt_table = tt_ak135_tts
        # for magnitude < 10 the search radius is 0; for magnitude > 10 it is 200.
        # We only have magnitudes < 10, and two events located exactly on a station
        # (=> dist = 0): those two event/station pairs are taken, the others are dropped
        df = merge_events_stations(events_df, channels_df, dict(minmag=10, maxmag=10,
                                   minmag_radius=0, maxmag_radius=200), tttable=tt_table)

        assert len(df) == 2

        # for magnitude < 1 the search radius is 100; for magnitude > 1 it is 2000.
        # Our magnitudes are all > 1 and every event-station distance is well within
        # that radius, so ALL channels might be taken BUT: one station's start time is
        # in 2019, thus it does not fall into the case above!
        df = merge_events_stations(events_df, channels_df, dict(minmag=1, maxmag=1,
                                   minmag_radius=100, maxmag_radius=2000), tttable=tt_table)

        assert len(df) == (len(channels_df)-1) * len(events_df)
        # sanity check: the channel whose start time is out of the events' time bounds is in channels_df:
        assert not channels_df[channels_df[Station.start_time.key] ==
                               datetime(2019, 1, 1)].empty
        # we need to get the channel id from channels_df because in df we removed unnecessary
        # columns (including start and end time)
        ch_id = channels_df[channels_df[Station.start_time.key] ==
                            datetime(2019, 1, 1)][Channel.id.key].iloc[0]
        # the Channel.id.key column of channels_df is named Segment.channel_id.key in df:
        assert df[df[Segment.channel_id.key] == ch_id].empty

        # this is a more complex case: we want to drop the first event by setting a very low
        # search radius (minmag_radius=1) for magnitudes <= 3 (the first event magnitude)
        # and a high radius (maxmag_radius=40) for the other event (magnitude=4)
        df = merge_events_stations(events_df, channels_df, dict(minmag=3, maxmag=4,
                                   minmag_radius=1, maxmag_radius=40), tttable=tt_table)

        # assert all segments belong to the second event, except the first one which belongs
        # to the 1st event. The second event is retrievable by its latitude (2)
        # FIXME: more fine grained tests based on distance?
        evid = events_df[events_df[Event.latitude.key] == 2][Event.id.key].iloc[0]
        assert np.array_equal((df[Segment.event_id.key] == evid),
                              [False, True, True, True, True])

        # test arrival times are properly set: set all event locations to [0, 0] as well
        # as station locations, so that the travel time (and thus the arrival time)
        # depends only on the event depth
        _events_df = events_df
        _channels_df = channels_df
        events_df = events_df.copy()
        events_df.loc[:, Event.latitude.key] = 0
        events_df.loc[:, Event.longitude.key] = 0
        event_ids = pd.unique(events_df[Event.id.key])
        # We have two events, set the depth of the first one to zero the other to 60
        evtid1, evtid2 = event_ids[0], event_ids[1]
        evttime1 = events_df[events_df[Event.id.key] == evtid1][Event.time.key].iloc[0]
        evttime2 = events_df[events_df[Event.id.key] == evtid2][Event.time.key].iloc[0]
        events_df.loc[events_df[Event.id.key] == evtid1, Event.depth_km.key] = 0
        events_df.loc[events_df[Event.id.key] == evtid2, Event.depth_km.key] = 60

        channels_df = channels_df.copy()
        channels_df.loc[:, Station.latitude.key] = 0
        channels_df.loc[:, Station.longitude.key] = 0
        df = merge_events_stations(events_df, channels_df, dict(minmag=3, maxmag=4,
                                   minmag_radius=1, maxmag_radius=40), tttable=tt_table)
        # assert for events of depth 0 arrival times are equal to event times
        assert (df[df[Segment.event_id.key] == evtid1][Segment.arrival_time.key]
                == evttime1).all()
        # assert for events of depth > 0 arrival times are GREATER than event times
        assert (df[df[Segment.event_id.key] == evtid2][Segment.arrival_time.key]
                > evttime2).all()

        # now set the first event depth out of bounds:
        events_df.loc[events_df[Event.id.key] == evtid1, Event.depth_km.key] = 600000
        df = merge_events_stations(events_df, channels_df, dict(minmag=3, maxmag=4,
                                   minmag_radius=1, maxmag_radius=40), tttable=tt_table)
        # the out-of-bounds depth yields NaN arrival times; as NaNs are dropped from the
        # returned dataframe, assert we do not have segments with
        # event_id == evtid1:
        assert df[df[Segment.event_id.key] == evtid1][Segment.arrival_time.key].empty
        # still assert for events of depth > 0 arrival times are GREATER than event times
        assert (df[df[Segment.event_id.key] == evtid2][Segment.arrival_time.key] > evttime2).all()
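
The merge_events_stations calls above that pass minmag, maxmag, minmag_radius and maxmag_radius reason about a search radius that depends on the event magnitude. A minimal sketch of that mapping, under the assumption of a linear ramp clamped at minmag_radius below minmag and at maxmag_radius above maxmag (illustrative only, not necessarily the library's exact implementation):

def search_radius_deg(mag, minmag, maxmag, minmag_radius, maxmag_radius):
    """Magnitude-dependent search radius in degrees (assumed linear ramp)."""
    if minmag == maxmag:
        # degenerate configuration, e.g. dict(minmag=10, maxmag=10, ...): a step function
        return minmag_radius if mag < minmag else maxmag_radius
    if mag <= minmag:
        return minmag_radius
    if mag >= maxmag:
        return maxmag_radius
    frac = (mag - minmag) / float(maxmag - minmag)
    return minmag_radius + frac * (maxmag_radius - minmag_radius)


# dict(minmag=10, maxmag=10, minmag_radius=0, maxmag_radius=200) with magnitudes
# 3 and 4 (both < 10) gives radius 0: only stations lying exactly on an event
# are kept, i.e. the two event/station pairs of the first assertion (len(df) == 2)
assert search_radius_deg(3, 10, 10, 0, 200) == 0
assert search_radius_deg(4, 10, 10, 0, 200) == 0

# dict(minmag=3, maxmag=4, minmag_radius=1, maxmag_radius=40): the magnitude-3
# event gets a 1-degree radius, the magnitude-4 event a 40-degree radius
assert search_radius_deg(3, 3, 4, 1, 40) == 1
assert search_radius_deg(4, 3, 4, 1, 40) == 40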