Example #1
 def _func(time, ind, df, st):
     """ return waveforms from df of bulk parameters """
     match_chars = {"*", "?", "[", "]"}
     ar = np.ones(len(ind))  # indices of ind to use to load data
     _t1, _t2 = time[0], time[1]
     df = df[(df.t1 == time[0]) & (df.t2 == time[1])]
     # determine which rows use wildcard matching or other select features
     uses_matches = [_column_contains(df[x], match_chars) for x in NSLC]
     match_ar = np.array(uses_matches).any(axis=0)
     df_match = df[match_ar]
     df_no_match = df[~match_ar]
     # handle columns that need matches (more expensive)
     if not df_match.empty:
         match_bulk = df_match.to_records(index=False)
         mar = np.array(
             [filter_index(ind,
                           *tuple(b)[:4]) for b in match_bulk])
         ar = np.logical_and(ar, mar.any(axis=0))
     # handle columns that do not need matches
     if not df_no_match.empty:
         nslc1 = set(get_seed_id_series(df_no_match))
         nslc2 = get_seed_id_series(ind)
         ar = np.logical_and(ar, nslc2.isin(nslc1))
     # get a list of used traces, combine and trim
     st = obspy.Stream([x for x, y in zip(st, ar) if y])
     return st.slice(starttime=to_utc(_t1), endtime=to_utc(_t2))
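A minimal, self-contained sketch of the two-path NSLC filter used above (hypothetical data, not the obsplus implementation): rows of the request without wildcards use cheap set membership on joined "net.sta.loc.chan" strings, while wildcard rows fall back to fnmatch.

from fnmatch import fnmatch

import pandas as pd

NSLC = ("network", "station", "location", "channel")

index = pd.DataFrame(
    [("UU", "TMU", "", "HHZ"), ("UU", "SRU", "01", "BHZ")], columns=NSLC)
bulk = pd.DataFrame(
    [("UU", "TMU", "", "HHZ"), ("UU", "*", "01", "BH?")], columns=NSLC)

# join the NSLC columns into "net.sta.loc.chan" strings for both frames
seed_index = index.apply(".".join, axis=1)
seed_bulk = bulk.apply(".".join, axis=1)

# mark bulk rows that contain wildcard characters in any NSLC column
uses_wildcards = bulk.apply(lambda s: s.str.contains(r"[\*\?\[\]]")).any(axis=1)

# exact rows: fast set membership on the joined seed ids
keep = seed_index.isin(set(seed_bulk[~uses_wildcards]))
# wildcard rows: match each pattern with fnmatch (more expensive)
for pattern in seed_bulk[uses_wildcards]:
    keep |= seed_index.apply(lambda sid: fnmatch(sid, pattern))

print(index[keep])  # both index rows are requested in this toy case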
Example #2
 def _func(time, ind, df):
     """ return waveforms from df of bulk parameters """
     match_chars = {"*", "?", "[", "]"}
     t1, t2 = time[0], time[1]
     # filter index based on start/end times
     in_time = ~((ind["starttime"] > t2) | (ind["endtime"] < t1))
     ind = ind[in_time]
     # create indices used to load data
     ar = np.ones(len(ind))  # indices of ind to use to load data
     df = df[(df.t1 == time[0]) & (df.t2 == time[1])]
     # determine which rows use wildcard matching or other select features
     uses_matches = [_column_contains(df[x], match_chars) for x in NSLC]
     match_ar = np.array(uses_matches).any(axis=0)
     df_match = df[match_ar]
     df_no_match = df[~match_ar]
     # handle columns that need matches (more expensive)
     if not df_match.empty:
         match_bulk = df_match.to_records(index=False)
         mar = np.array(
             [filter_index(ind,
                           *tuple(b)[:4]) for b in match_bulk])
         ar = np.logical_and(ar, mar.any(axis=0))
     # handle columns that do not need matches
     if not df_no_match.empty:
         nslc1 = set(get_seed_id_series(df_no_match))
         nslc2 = get_seed_id_series(ind)
         ar = np.logical_and(ar, nslc2.isin(nslc1))
     return self._index2stream(ind[ar], t1, t2)
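The main difference from example #1 is the time pre-filter on the index: a row is kept unless it starts after the requested window ends or ends before it starts. A tiny sketch of that overlap test with illustrative numbers (not obsplus internals):

import pandas as pd

ind = pd.DataFrame({"starttime": [0.0, 50.0, 120.0],
                    "endtime": [40.0, 110.0, 200.0]})
t1, t2 = 45.0, 115.0  # requested window
# reject rows lying entirely before or entirely after the window
in_time = ~((ind["starttime"] > t2) | (ind["endtime"] < t1))
print(ind[in_time])  # only the middle row overlaps [45, 115]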
Example #3
 def test_dataframe_missing_columns(self, pick_df):
     """Dataframe without required columns should raise ValueError."""
     new = pick_df.drop(columns=["network", "location"])
     with pytest.raises(ValueError):
         upd.get_seed_id_series(new)
     # But it should work if only the required subset is there
     out = upd.get_seed_id_series(new, subset=["station", "channel"])
     assert len(out) == len(pick_df)
     split = out.str.split(".", expand=True)
     assert (split[0] == pick_df["station"]).all()
     assert (split[1] == pick_df["channel"]).all()
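A hedged usage sketch of the subset behavior this test exercises; it assumes upd refers to obsplus' pandas utilities module (obsplus.utils.pd) and that only the requested subset columns are needed, which are joined with dots.

import pandas as pd
import obsplus.utils.pd as upd  # module path assumed

picks = pd.DataFrame({"station": ["TMU", "SRU"], "channel": ["HHZ", "BHZ"]})
seed = upd.get_seed_id_series(picks, subset=["station", "channel"])
print(seed.tolist())  # expected: ["TMU.HHZ", "SRU.BHZ"]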
Example #4
    def set_stations(self, stations: fetcher_station_type):
        """
        Set the station state in fetcher.

        Parameters
        ----------
        stations
            Data representing stations, from which a client or dataframe
            can be inferred.
        """
        try:
            self.station_client = get_station_client(stations)
        except TypeError:
            self.station_client = getattr(self, "station_client", None)
        try:
            # since it's common for inventories to have far-out end dates,
            # this can raise a warning; these are safe to ignore.
            with suppress_warnings(category=TimeOverflowWarning):
                self.station_df = stations_to_df(stations)
        except TypeError:
            # if unable to get station info from stations, use the waveform client
            try:
                self.station_df = stations_to_df(self.waveform_client)
            except TypeError:
                # if no waveforms, try events
                try:
                    self.station_df = stations_to_df(self.event_client)
                except TypeError:
                    self.station_df = None
        # make sure seed_id is set
        if self.station_df is not None:
            self.station_df["seed_id"] = get_seed_id_series(self.station_df)
Example #5
 def _index2stream(self,
                   index,
                   starttime=None,
                   endtime=None,
                   merge=True) -> Stream:
     """ return the waveforms in the index """
      # get the absolute path to each data file
     files: pd.Series = (str(self.bank_path) + index.path).unique()
     # make sure start and endtimes are in UTCDateTime
     starttime = to_utc(starttime) if starttime else None
     endtime = to_utc(endtime) if endtime else None
     # iterate the files to read and try to load into waveforms
     kwargs = dict(format=self.format, starttime=starttime, endtime=endtime)
     func = partial(_try_read_stream, **kwargs)
     stt = obspy.Stream()
     chunksize = (len(files) // self._max_workers) or 1
     for st in self._map(func, files, chunksize=chunksize):
         if st is not None:
             stt += st
     # sort out nullish nslc codes
     stt = replace_null_nlsc_codes(stt)
     # filter out any traces not in index (this can happen when files hold
     # multiple traces).
     nslc = set(get_seed_id_series(index))
     stt.traces = [x for x in stt if x.id in nslc]
     # trim, merge, attach response
     stt = self._prep_output_stream(stt, starttime, endtime, merge=merge)
     return stt
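A hedged usage sketch of the public entry point that ends up calling _index2stream; the bank path and the NSLC/time values below are placeholders.

import obsplus
from obspy import UTCDateTime

bank = obsplus.WaveBank("path/to/wavebank")  # placeholder path
st = bank.get_waveforms(
    network="UU", station="*", channel="HH?",
    starttime=UTCDateTime("2019-01-01"),
    endtime=UTCDateTime("2019-01-02"),
)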
Example #6
def archive_to_sds(
    bank: Union[Path, str, "obsplus.WaveBank"],
    sds_path: Union[Path, str],
    starttime: Optional[UTCDateTime] = None,
    endtime: Optional[UTCDateTime] = None,
    overlap: float = 30,
    type_code: str = "D",
    stream_processor: Optional[callable] = None,
):
    """
    Create a SeisComP Data Structure (SDS) archive from a waveform source.

    Parameters
    ----------
    bank
        A wavebank or path to such.
    sds_path
        The path for the new sds archive to be created.
    starttime
        If not None, the starttime to convert data from bank.
    endtime
        If not None, the endtime to convert data from bank.
    overlap
        The overlap to use for each file.
    type_code
        The single-character data type code used in the SDS file names (defaults to "D").
    stream_processor
        A callable that takes a single stream as input and returns a
        single stream. May return an empty stream to skip a stream.

    Notes
    -----
    see: https://www.seiscomp3.org/doc/applications/slarchive/SDS.html
    """
    sds_path = Path(sds_path)
    # create a fetcher object for yielding continuous waveforms
    bank = obsplus.WaveBank(bank)
    bank.update_index()
    # get starttime/endtimes
    index = bank.read_index()
    ts1 = index.starttime.min() if not starttime else starttime
    t1 = _nearest_day(ts1)
    t2 = to_utc(index.endtime.max() if not endtime else endtime)
    nslcs = get_seed_id_series(index).unique()
    # iterate over nslc and get data for selected channel
    for nslc in nslcs:
        nslc_dict = {n: v for n, v in zip(NSLC, nslc.split("."))}
        # yield waveforms in desired chunks
        ykwargs = dict(starttime=t1,
                       endtime=t2,
                       overlap=overlap,
                       duration=86400)
        ykwargs.update(nslc_dict)
        for st in bank.yield_waveforms(**ykwargs):
            if stream_processor:  # apply stream processor if needed.
                st = stream_processor(st)
            if st:
                path = _get_sds_filename(st, sds_path, type_code, **nslc_dict)
                st.write(str(path), "mseed")
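A hedged usage sketch (both paths are placeholders): convert an existing wavebank into an SDS archive, detrending each day-long stream on the way through.

def _detrend(st):
    """Example stream processor: remove a linear trend in place."""
    return st.detrend("linear")

archive_to_sds("path/to/wavebank", "path/to/sds_archive",
               stream_processor=_detrend)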
Example #7
def make_origins(
        events: catalog_or_event,
        inventory: obspy.Inventory,
        depth: float = 1.0,
        phase_hints: Optional[Iterable] = ("P", "p"),
) -> catalog_or_event:
    """
    Iterate a catalog or single events and ensure each has an origin.

    If no origins are found for an event, create one with the time set to
    the earliest pick and the location set to that of the station recording
    that pick. Events are modified in place.

    This may be useful for location codes that need a starting location.

    Parameters
    ----------
    events
        The events to scan and add origins to where necessary.
    inventory
        An inventory object which contains all the stations referenced in
        quakeml elements of events.
    depth
        The default depth for created origins. Should be in meters. See the
        obspy docs for Origin or the quakeml standard for more details.
    phase_hints
        List of acceptable phase hints to use for identifying the earliest
        pick. By default will only search for "P" or "p" phase hints.

    Returns
    -------
    Either a Catalog or Event object (same as input).
    """
    # ensure input is an iterable of events
    cat = [events] if isinstance(events, Event) else events
    # load inv dataframe and make sure it has a seed_id column
    df = obsplus.stations_to_df(inventory)
    nslc_series = get_seed_id_series(df)
    for event in cat:
        if not event.origins:  # make new origin
            picks = event.picks_to_df()
            picks = picks.loc[(~(picks["evaluation_status"] == "rejected"))
                              & (picks["phase_hint"].isin(phase_hints))]
            if not len(picks):
                msg = f"{event} has no acceptable picks to create origin"
                raise ValidationError(msg)
            # get first pick, determine time/station used
            first_pick = picks.loc[picks["time"].idxmin()]
            seed_id = first_pick["seed_id"]
            # find channel corresponding to pick
            df_chan = df[nslc_series == seed_id]
            if not len(df_chan):
                raise ValidationError(f"{seed_id} not found in inventory")
            ser = df_chan.iloc[0]
            # create origin
            ori = _create_first_pick_origin(first_pick, ser, depth=depth)
            event.origins.append(ori)
    return events
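A hedged usage sketch; the QuakeML and StationXML file names are hypothetical stand-ins for a catalog whose events may lack origins and the inventory of the stations its picks reference.

import obspy

cat = obspy.read_events("picks_only.xml")    # hypothetical catalog file
inv = obspy.read_inventory("stations.xml")   # hypothetical inventory file
cat = make_origins(cat, inv, depth=2_000.0)  # depth in meters, per the docstring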
Example #8
def _filter_index_to_bulk(time, index_df, bulk_df) -> pd.DataFrame:
    """
    Using index_df, apply the conditions in bulk_df and return the subset of
    index_df whose rows meet the requested conditions.

    Parameters
    ----------
    time
        A tuple of mintime, maxtime
    index_df
        A dataframe indexing a waveform resource. Can be an index of traces
        in a stream or an index from a wavebank.
    bulk_df
        The dataframe containing bulk requests.
    """
    match_chars = {"*", "?", "[", "]"}
    # filter out any index times not in current time pair
    too_late = index_df["starttime"] > time[1]
    too_early = index_df["endtime"] < time[0]
    index_df = index_df[~(too_early | too_late)]
    ar = np.ones(len(index_df))  # marks the rows of index_df to keep
    # filter out any request times which are not for the current time pair
    is_starttime = bulk_df["starttime"] == time[0]
    is_endtime = bulk_df["endtime"] == time[1]
    bulk_df = bulk_df[is_starttime & is_endtime]
    # determine which rows use wildcard matching; these must be handled separately
    uses_matches = [_column_contains(bulk_df[x], match_chars) for x in NSLC]
    match_ar = np.array(uses_matches).any(axis=0)
    df_match = bulk_df[match_ar]
    df_no_match = bulk_df[~match_ar]
    # handle columns that need matches (more expensive)
    if not df_match.empty:
        match_bulk = df_match.to_records(index=False)
        mar = np.array(
            [filter_index(index_df,
                          *tuple(b)[:4]) for b in match_bulk])
        ar = np.logical_and(ar, mar.any(axis=0))
    # handle columns that do not need matches
    if not df_no_match.empty:
        nslc1 = set(get_seed_id_series(df_no_match))
        nslc2 = get_seed_id_series(index_df)
        ar = np.logical_and(ar, nslc2.isin(nslc1))
    # return only the index rows that satisfy the bulk request
    return index_df[ar]
Example #9
def _get_waveform_df(stream: wave_type) -> pd.DataFrame:
    """
    Convert a stream or sequence of traces into a dataframe.

    Parameters
    ----------
    stream
        The streams to index

    Notes
    -----
    This is private because it is probably not quite polished enough to include
    in the public API. More thought is needed on how to do this properly.
    """
    stats_columns = list(NSLC) + ["starttime", "endtime", "sampling_rate"]
    trace_contents = [{i: tr.stats[i] for i in stats_columns} for tr in stream]
    df = pd.DataFrame(trace_contents, columns=stats_columns)
    # ensure time columns have proper datetime64 dtypes
    df["starttime"] = to_datetime64(df["starttime"])
    df["endtime"] = to_datetime64(df["endtime"])
    df["sampling_period"] = to_timedelta64(1 / df["sampling_rate"])
    df["seed_id"] = get_seed_id_series(df)
    df["trace"] = [ObjectWrapper(tr) for tr in stream]
    return df
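A quick usage sketch with obspy's bundled example stream; it assumes _get_waveform_df and its helpers (NSLC, to_datetime64, to_timedelta64, get_seed_id_series, ObjectWrapper) are in scope from the module where they are defined.

import obspy

df = _get_waveform_df(obspy.read())
print(df[["seed_id", "starttime", "endtime", "sampling_rate"]])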
Example #10
 def test_one_subset_raises(self, pick_df):
     """At least two columns are required in subset."""
     with pytest.raises(ValueError):
         upd.get_seed_id_series(pick_df, subset=["network"])
Example #11
 def test_bad_subset(self, pick_df):
     """ Bad subset should raise valuerror."""
     with pytest.raises(ValueError):
         upd.get_seed_id_series(pick_df, subset=["network", "monkey"])
Example #12
 def test_seed_id_basic(self, pick_df):
     """Standard usage."""
     seed = upd.get_seed_id_series(pick_df)
     assert (seed == pick_df["seed_id"]).all()