Exemplo n.º 1
0
 def test_resample_timeseries_dataframe(self):
     '''
     Exercise resample_timeseries_dataframe on the fixture frame ``self.df``:
     four argument combinations that must be rejected with an AssertionError,
     followed by one accepted call whose output length is checked.
     '''
     # Arguments shared by all of the rejected calls unless overridden below.
     daily_window = dict(interval="1D",
                         start_time='1/1/2019',
                         end_time='1/5/2019')
     rejected_calls = [
         dict(daily_window, dt_col="z", merge_mode='max'),
         dict(daily_window, dt_col="z", merge_mode='dummy'),
         dict(daily_window, dt_col="a", merge_mode='max'),
         # start_time is overridden so that it falls after end_time.
         dict(daily_window, dt_col="datetime", merge_mode='max',
              start_time='1/10/2019'),
     ]
     for bad_kwargs in rejected_calls:
         with pytest.raises(AssertionError):
             resample_timeseries_dataframe(self.df, **bad_kwargs)

     # A valid call: 3-day bins between the given start and end times.
     res_df = resample_timeseries_dataframe(self.df,
                                            dt_col="datetime",
                                            interval="3D",
                                            start_time='12/20/2018',
                                            end_time='1/5/2019',
                                            merge_mode='max')
     assert len(res_df) == 6
Exemplo n.º 2
0
    def resample(self, interval, start_time, end_time, merge_mode="mean"):
        '''
        Resample every time series in the dataset onto a new interval. The rows
        belonging to each id in ``self._id_list`` are resampled separately and
        the results are concatenated back into ``self.df``.

        :param interval: pandas offset aliases, indicating time interval of the output dataframe.
        :param start_time: start time of the output dataframe.
        :param end_time: end time of the output dataframe.
        :param merge_mode: if current interval is smaller than output interval,
            we need to merge the values in a mode. "max", "min", "mean"
            or "sum" are supported for now.
        :return: the tsdataset instance.

        Note: It is preferred to call `impute` right after `resample`.
        '''
        resampled_parts = []
        for series_id in self._id_list:
            # Isolate the rows of this id; the id column is dropped before
            # resampling and re-attached afterwards.
            belongs_to_id = self.df[self.id_col] == series_id
            single_series = self.df[belongs_to_id].drop(self.id_col, axis=1)
            resampled = resample_timeseries_dataframe(df=single_series,
                                                      dt_col=self.dt_col,
                                                      interval=interval,
                                                      start_time=start_time,
                                                      end_time=end_time,
                                                      merge_mode=merge_mode)
            resampled[self.id_col] = series_id
            resampled_parts.append(resampled.copy())
        self.df = pd.concat(resampled_parts)
        return self
Exemplo n.º 3
0
    def resample(self, interval, start_time=None, end_time=None, merge_mode="mean"):
        '''
        Resample the dataset onto a new interval. Rows are grouped by id_col and
        each group is resampled independently; the recorded frequency of the
        dataset is then updated to the new interval.

        :param interval: pandas offset aliases, indicating time interval of the output dataframe.
        :param start_time: start time of the output dataframe.
        :param end_time: end time of the output dataframe.
        :param merge_mode: if current interval is smaller than output interval,
            we need to merge the values in a mode. "max", "min", "mean"
            or "sum" are supported for now.

        :return: the tsdataset instance.
        '''

        def _resample_group(group_df):
            # Resample the rows of one id with the caller's settings.
            return resample_timeseries_dataframe(df=group_df,
                                                 dt_col=self.dt_col,
                                                 interval=interval,
                                                 start_time=start_time,
                                                 end_time=end_time,
                                                 id_col=self.id_col,
                                                 merge_mode=merge_mode)

        per_id_groups = self.df.groupby([self.id_col])
        self.df = per_id_groups.apply(_resample_group)
        # The new interval becomes the dataset's known frequency.
        self._freq = pd.Timedelta(interval)
        self._freq_certainty = True
        # Discard the index produced by groupby/apply in favour of a fresh one.
        self.df.reset_index(drop=True, inplace=True)
        return self
Exemplo n.º 4
0
 def test_resample_timeseries_dataframe_modes(self):
     '''
     Resample a three-row hourly frame into 2-hour bins once per merge mode and
     check the three resulting values: the first is NaN, the second depends on
     the merge mode, and the third is always 3.
     '''
     df = pd.DataFrame({
         'data': [1, 2, 3],
         'datetime': ["2020-11-09T08", "2020-11-09T09", "2020-11-09T11"],
     })
     df['datetime'] = pd.to_datetime(df['datetime'])

     # (merge_mode, expected value of the second output row)
     expected_by_mode = (
         ('max', 2),
         ('min', 1),
         ('mean', 1.5),
         ('sum', 3),
     )
     for mode, expected_second in expected_by_mode:
         res_df = resample_timeseries_dataframe(
             df,
             dt_col="datetime",
             interval="2H",
             start_time='2020-11-09T07',
             end_time='2020-11-09T10',
             merge_mode=mode)
         assert np.isnan(res_df['data'][0])
         assert res_df['data'][1] == expected_second
         assert res_df['data'][2] == 3
Exemplo n.º 5
0
 def test_resample_timeseries_dataframe_ms(self):
     '''
     Resample millisecond-resolution data into 2ms bins with several start/end
     settings (including none at all) and check the number of output rows and
     the number of missing values in each case.
     '''
     df = pd.DataFrame({
         'data': [1, 2, 3],
         'datetime': ["2020-11-09T07:52:00.007",
                      "2020-11-09T07:52:00.008",
                      "2020-11-09T07:52:00.010"],
     })
     df['datetime'] = pd.to_datetime(df['datetime'])

     # (time-bound keyword arguments, expected row count, expected NaN count)
     scenarios = (
         (dict(start_time='2020-11-09T07:52:00.005',
               end_time='2020-11-09T07:52:00.010'), 4, 1),
         (dict(start_time='2020-11-09T07:52:00.006',
               end_time='2020-11-09T07:52:00.010'), 3, 0),
         (dict(start_time='2020-11-09T07:52:00.007',
               end_time='2020-11-09T07:52:00.010'), 3, 0),
         # No explicit bounds: the function's defaults are used.
         (dict(), 3, 0),
     )
     for bounds, expected_rows, expected_missing in scenarios:
         res_df = resample_timeseries_dataframe(
             df,
             dt_col="datetime",
             interval="2ms",
             merge_mode='max',
             **bounds)
         assert len(res_df) == expected_rows
         assert res_df['data'].isna().sum() == expected_missing
Exemplo n.º 6
0
    def resample(self,
                 interval,
                 start_time=None,
                 end_time=None,
                 merge_mode="mean"):
        '''
        Resample the dataset onto a new interval. Rows are grouped by id_col and
        each group is resampled independently. Non-numeric target/feature
        columns are cast to float32 beforehand, and the recorded frequency of
        the dataset is updated to the new interval afterwards.

        :param interval: pandas offset aliases, indicating time interval of the output dataframe.
        :param start_time: start time of the output dataframe.
        :param end_time: end time of the output dataframe.
        :param merge_mode: if current interval is smaller than output interval,
            we need to merge the values in a mode. "max", "min", "mean"
            or "sum" are supported for now.

        :return: the tsdataset instance.

        :raises AssertionError: if ``self._is_pd_datetime`` is false.
        :raises RuntimeError: if a non-numeric target or feature column cannot
            be converted to float32.
        '''
        # Adjacent string literals are used instead of a backslash continuation
        # so that source indentation does not leak into the message.
        assert self._is_pd_datetime, (
            "The time series data does not have a Pandas datetime format "
            "(you can use pandas.to_datetime to convert a string into a datetime format)."
        )

        from pandas.api.types import is_numeric_dtype
        # Columns that are not numeric yet and therefore need a cast.
        type_error_list = [
            val for val in self.target_col + self.feature_col
            if not is_numeric_dtype(self.df[val])
        ]
        try:
            for val in type_error_list:
                self.df[val] = self.df[val].astype(np.float32)
        except Exception as err:
            # Chain the original failure so the offending value stays visible.
            raise RuntimeError(
                "All the columns of target_col "
                "and extra_feature_col should be of numeric type.") from err
        self.df = self.df.groupby([self.id_col]) \
            .apply(lambda df: resample_timeseries_dataframe(df=df,
                                                            dt_col=self.dt_col,
                                                            interval=interval,
                                                            start_time=start_time,
                                                            end_time=end_time,
                                                            id_col=self.id_col,
                                                            merge_mode=merge_mode))
        # The new interval becomes the dataset's known frequency.
        self._freq = pd.Timedelta(interval)
        self._freq_certainty = True
        self.df.reset_index(drop=True, inplace=True)
        return self