Example #1
File: wavebank.py Project: seisman/obsplus
def _func(time, ind, df):
    """Return waveforms from a df of bulk parameters."""
    # note: defined inside a WaveBank method, so it closes over self
    match_chars = {"*", "?", "[", "]"}
    t1, t2 = time[0], time[1]
    # filter index based on start/end times
    in_time = ~((ind["starttime"] > t2) | (ind["endtime"] < t1))
    ind = ind[in_time]
    # create indices used to load data
    ar = np.ones(len(ind))  # indices of ind to use to load data
    df = df[(df.t1 == time[0]) & (df.t2 == time[1])]
    # determine which columns use any matching or other select features
    uses_matches = [_column_contains(df[x], match_chars) for x in NSLC]
    match_ar = np.array(uses_matches).any(axis=0)
    df_match = df[match_ar]
    df_no_match = df[~match_ar]
    # handle columns that need matches (more expensive)
    if not df_match.empty:
        match_bulk = df_match.to_records(index=False)
        mar = np.array([filter_index(ind, *tuple(b)[:4]) for b in match_bulk])
        ar = np.logical_and(ar, mar.any(axis=0))
    # handle columns that do not need matches
    if not df_no_match.empty:
        nslc1 = set(get_seed_id_series(df_no_match))
        nslc2 = get_seed_id_series(ind)
        ar = np.logical_and(ar, nslc2.isin(nslc1))
    return self._index2stream(ind[ar], t1, t2)
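
Several of these examples rely on a small helper, _column_contains, to detect which bulk columns contain wildcard characters. Below is a minimal sketch of such a helper, assuming it simply regex-escapes the match characters and checks each string; the real obsplus helper may differ in signature and details.

import re

import pandas as pd


def _column_contains(ser, chars):
    """Return a boolean array, True where a string contains any char in chars."""
    safe = {re.escape(x) for x in chars}  # escape regex metacharacters like "*"
    return ser.astype(str).str.contains("|".join(safe)).values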
Example #2
def _func(time, ind, df, st):
    """Return waveforms from a df of bulk parameters."""
    match_chars = {"*", "?", "[", "]"}
    ar = np.ones(len(ind))  # indices of ind to use to load data
    _t1, _t2 = time[0], time[1]
    df = df[(df.t1 == time[0]) & (df.t2 == time[1])]
    # determine which columns use any matching or other select features
    uses_matches = [_column_contains(df[x], match_chars) for x in NSLC]
    match_ar = np.array(uses_matches).any(axis=0)
    df_match = df[match_ar]
    df_no_match = df[~match_ar]
    # handle columns that need matches (more expensive)
    if not df_match.empty:
        match_bulk = df_match.to_records(index=False)
        mar = np.array([filter_index(ind, *tuple(b)[:4]) for b in match_bulk])
        ar = np.logical_and(ar, mar.any(axis=0))
    # handle columns that do not need matches
    if not df_no_match.empty:
        nslc1 = set(get_seed_id_series(df_no_match))
        nslc2 = get_seed_id_series(ind)
        ar = np.logical_and(ar, nslc2.isin(nslc1))
    # get a list of used traces, combine and trim
    st = obspy.Stream([x for x, y in zip(st, ar) if y])
    return st.slice(starttime=to_utc(_t1), endtime=to_utc(_t2))
Example #3
    def get_waveforms(
        self,
        network=None,
        station=None,
        location=None,
        channel=None,
        starttime=None,
        endtime=None,
    ) -> obspy.Stream:
        """
        Get waveforms from the cache, read from disk and cache if needed.

        See obsplus.WaveBank.get_waveforms for parameter descriptions.
        """
        filt = filter_index(self.index, network, station, location, channel,
                            starttime, endtime)
        ser = self.index[filt].set_index("unique_key")["st_call"]
        # drop duplicates
        ser = ser[~ser.index.duplicated()]
        # no waveforms found, return empty waveforms
        if not len(ser):
            return obspy.Stream()

        st = reduce(add, (x() for x in ser))
        if starttime is not None or endtime is not None:
            # use the given start/end time or fall back to far-out constants
            starttime = starttime or 0
            endtime = endtime or 32503680000  # POSIX timestamp for 3000-01-01
            return st.trim(starttime=starttime, endtime=endtime)
        else:
            return st
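
For context, here is a short usage sketch of the public obsplus WaveBank interface that the cached get_waveforms above mirrors; the bank path is hypothetical.

import obspy
import obsplus

# the path is hypothetical; point it at a directory of waveform files
bank = obsplus.WaveBank("path/to/waveforms")
st = bank.get_waveforms(
    network="UU",
    station="*",
    channel="HHZ",
    starttime=obspy.UTCDateTime("2007-08-06"),
    endtime=obspy.UTCDateTime("2007-08-07"),
)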
Example #4
File: fetcher.py Project: seisman/obsplus
    def _get_bulk_args(self,
                       starttime=None,
                       endtime=None,
                       **kwargs) -> bulk_waveform_arg_type:
        """
        Get the bulk waveform arguments based on given start/end times.

        This method also takes into account data availability as contained
        in the stations data.

        Parameters
        ----------
        starttime
            Start times for query.
        endtime
            End times for query.

        Returns
        -------
        List of tuples of the form:
            [(network, station, location, channel, starttime, endtime)]
        """
        station_df = self.station_df.copy()
        inv = station_df[filter_index(station_df, **kwargs)]
        # replace None/Nan with larger number
        inv.loc[inv["end_date"].isnull(), "end_date"] = LARGEDT64
        inv["end_date"] = inv["end_date"].astype("datetime64[ns]")
        # get start/end of the inventory
        inv_start = inv["start_date"].min()
        inv_end = inv["end_date"].max()
        # remove stations/channels that don't have data for the requested time
        min_time = to_datetime64(starttime, default=inv_start).min()
        max_time = to_datetime64(endtime, default=inv_end).max()
        con1 = inv["start_date"] > max_time
        con2 = inv["end_date"] < min_time
        df = inv[~(con1 | con2)].set_index("seed_id")[list(NSLC)]
        if df.empty:  # return empty list if no data found
            return []
        if isinstance(starttime, pd.Series):
            # Have to get clever here to make sure only active stations get
            # used and indices are not duplicated. Note: .loc needs list-like
            # (not set) indexers in modern pandas.
            start_keys = list(set(starttime.index).intersection(df.index))
            end_keys = list(set(endtime.index).intersection(df.index))
            new_start = starttime.loc[start_keys]
            new_end = endtime.loc[end_keys]
            df["starttime"] = new_start.loc[~new_start.index.duplicated()]
            df["endtime"] = new_end.loc[~new_end.index.duplicated()]
        else:
            df["starttime"] = starttime
            df["endtime"] = endtime
        # remove any rows that don't have defined start/end times
        out = df[~(df["starttime"].isnull() | df["endtime"].isnull())]
        # ensure we have UTCDateTime objects
        out["starttime"] = [to_utc(x) for x in out["starttime"]]
        out["endtime"] = [to_utc(x) for x in out["endtime"]]
        # convert to list of tuples and return
        return [tuple(x) for x in out.to_records(index=False)]
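
The returned bulk arguments follow the standard obspy bulk-request shape documented above; an illustrative entry (values made up for this example) looks like:

import obspy

bulk = [
    ("UU", "SRU", "", "HHZ",
     obspy.UTCDateTime("2011-01-01"), obspy.UTCDateTime("2011-01-02")),
]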
Example #5
def test_filter_index(self, crandall_dataset):
    """Tests for filtering an index with the filter_index function."""
    # this is mainly here to test the time filtering, because the bank
    # operations pass this off to the HDF5 kernel.
    index = crandall_dataset.waveform_client.read_index(network="UU")
    t1_ns = int(index["starttime"].astype(np.int64).mean())
    t1 = np.datetime64(t1_ns, "ns")
    t2 = index["endtime"].max()
    kwargs = dict(network="UU", station="*", location="*", channel="*")
    bool_ind = filter_index(index, starttime=t1, endtime=t2, **kwargs)
    assert bool_ind.any()  # at least one row should match
Example #6
def stream_bulk_split(st: Stream,
                      bulk: List[waveform_request_type],
                      fill_value: Any = None) -> List[Stream]:
    """
    Split a stream into a list of streams that meet requirements in bulk.

    This is similar to the get_waveforms_bulk methods of waveform clients, but
    rather than merging any overlapping data, each request gets its own stream.

    Parameters
    ----------
    st
        A stream object
    bulk
        A bulk request. Wildcards not currently supported on str params.
    fill_value
        If not None fill any missing data in time range with this value.

    Returns
    -------
    List of streams, each meeting the corresponding request in bulk.
    """
    # return nothing if empty bulk or stream args
    bulk = _get_bulk(bulk)
    if not bulk or len(st) == 0:
        return []

    # get dataframe of stream contents
    sdf = _stream_data_to_df(st)
    # iterate stream, return output
    out = []
    for barg in bulk:
        assert len(barg) == 6, f"{barg} is not a valid bulk arg, must have len 6"
        need = filter_index(sdf, *barg)
        traces = [tr for tr, bo in zip(st, need) if bo]
        new_st = obspy.Stream(traces)
        t1, t2 = to_utc(barg[-2]), to_utc(barg[-1])
        new = new_st.slice(starttime=t1, endtime=t2)
        # apply fill if needed
        if fill_value is not None:
            new = new.trim(starttime=t1, endtime=t2, fill_value=fill_value, pad=True)
        if new is None or not len(new):
            out.append(obspy.Stream())
            continue
        new = merge_traces(new)
        out.append(new)
    assert len(out) == len(bulk), "output is not the same length as bulk"
    return out
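
A short usage sketch follows; the import path is an assumption, as the function's module has moved between obsplus versions.

import obspy
from obsplus.utils import stream_bulk_split  # import path is an assumption

st = obspy.read()  # obspy's bundled example stream (BW.RJOB)
t1 = st[0].stats.starttime
bulk = [("BW", "RJOB", "", "EHZ", t1, t1 + 10)]
streams = stream_bulk_split(st, bulk)
assert len(streams) == len(bulk)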
Example #7
def _filter_index_to_bulk(time, index_df, bulk_df) -> pd.DataFrame:
    """
    Using an index_df, apply conditions in request_df and return array indicating
    if values in index meet requested conditions.

    Parameters
    ----------
    time
        A tuple of mintime, maxtime
    index_df
        A dataframe indexing a waveform resource. Can be an index of traces
        in a stream or an index from a wavebank.
    bulk_df
        The dataframe containing bulk requests.
    """
    match_chars = {"*", "?", "[", "]"}
    # filter out any index times not in current time pair
    too_late = index_df["starttime"] > time[1]
    too_early = index_df["endtime"] < time[0]
    index_df = index_df[~(too_early | too_late)]
    ar = np.ones(len(index_df))  # rows of index_df to use to load data
    # filter out any request times which are not for the current time pair
    is_starttime = bulk_df["starttime"] == time[0]
    is_endtime = bulk_df["endtime"] == time[1]
    bulk_df = bulk_df[is_starttime & is_endtime]
    # determine which columns use matching. These must be handled separately.
    uses_matches = [_column_contains(bulk_df[x], match_chars) for x in NSLC]
    match_ar = np.array(uses_matches).any(axis=0)
    df_match = bulk_df[match_ar]
    df_no_match = bulk_df[~match_ar]
    # handle columns that need matches (more expensive)
    if not df_match.empty:
        match_bulk = df_match.to_records(index=False)
        mar = np.array([filter_index(index_df, *tuple(b)[:4]) for b in match_bulk])
        ar = np.logical_and(ar, mar.any(axis=0))
    # handle columns that do not need matches
    if not df_no_match.empty:
        nslc1 = set(get_seed_id_series(df_no_match))
        nslc2 = get_seed_id_series(index_df)
        ar = np.logical_and(ar, nslc2.isin(nslc1))
    # return only the rows of the index that satisfy the bulk request
    return index_df[ar]
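
A plausible driver for this function applies it once per unique (starttime, endtime) pair in the bulk request; the grouping below is an assumption, not verbatim obsplus code.

# hypothetical driver: filter the index once per unique time pair
unique_times = bulk_df[["starttime", "endtime"]].drop_duplicates()
sub_dfs = [
    _filter_index_to_bulk((t1, t2), index_df, bulk_df)
    for t1, t2 in unique_times.itertuples(index=False)
]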
Example #8
File: wavebank.py Project: seisman/obsplus
    def read_index(
        self,
        network: Optional[str] = None,
        station: Optional[str] = None,
        location: Optional[str] = None,
        channel: Optional[str] = None,
        starttime: Optional[utc_time_type] = None,
        endtime: Optional[utc_time_type] = None,
        **kwargs,
    ) -> pd.DataFrame:
        """
        Return a dataframe of the index, optionally applying filters.

        Parameters
        ----------
        {waveform_params}
        kwargs
            Keyword arguments passed to the pandas.read_hdf function.
        """
        self.ensure_bank_path_exists()
        if starttime is not None and endtime is not None:
            if starttime > endtime:
                msg = "starttime cannot be greater than endtime."
                raise ValueError(msg)
        if not self.index_path.exists():
            self.update_index()
        # if no file was created (dealing with empty bank) return empty index
        if not self.index_path.exists():
            return pd.DataFrame(columns=self.index_columns)
        # grab index from cache
        index = self._index_cache(starttime, endtime, buffer=self.buffer, **kwargs)
        # filter and return
        filt = filter_index(index,
                            network=network,
                            station=station,
                            location=location,
                            channel=channel)
        return index[filt]
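
A short usage sketch (the bank path is hypothetical):

import obsplus

bank = obsplus.WaveBank("path/to/waveforms")  # hypothetical path
df = bank.read_index(network="UU", channel="HH?")  # wildcards are supported
print(df[["network", "station", "starttime", "endtime"]].head())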