def test_resample_timeseries_dataframe(self):
    """Check argument validation and output length of resample_timeseries_dataframe."""
    # Each keyword set below is expected to be rejected with an AssertionError.
    # All of them share interval="1D" and end_time='1/5/2019'.
    rejected_kwargs = [
        {"dt_col": "z", "start_time": '1/1/2019', "merge_mode": 'max'},
        # NOTE(review): this case also passes dt_col="z" like the one above, so
        # it may not isolate the merge_mode check -- confirm intent.
        {"dt_col": "z", "start_time": '1/1/2019', "merge_mode": 'dummy'},
        {"dt_col": "a", "start_time": '1/1/2019', "merge_mode": 'max'},
        # start_time is later than end_time here.
        {"dt_col": "datetime", "start_time": '1/10/2019', "merge_mode": 'max'},
    ]
    for kwargs in rejected_kwargs:
        with pytest.raises(AssertionError):
            resample_timeseries_dataframe(self.df,
                                          interval="1D",
                                          end_time='1/5/2019',
                                          **kwargs)

    # A valid call on a 3-day grid must produce six rows.
    resampled = resample_timeseries_dataframe(self.df,
                                              dt_col="datetime",
                                              interval="3D",
                                              start_time='12/20/2018',
                                              end_time='1/5/2019',
                                              merge_mode='max')
    assert len(resampled) == 6
def resample(self, interval, start_time, end_time, merge_mode="mean"):
    '''
    Resample every univariate time series (one per id in id_col) onto a new
    interval and replace self.df with the concatenated result.

    :param interval: pandas offset aliases, indicating time interval of the
           output dataframe.
    :param start_time: start time of the output dataframe.
    :param end_time: end time of the output dataframe.
    :param merge_mode: if current interval is smaller than output interval,
           we need to merge the values in a mode. "max", "min", "mean"
           or "sum" are supported for now.

    :return: the tsdataset instance.

    Note: It is preferred to call `impute` right after `resample`.
    '''
    def _resample_one(series_id):
        # Select the rows of this series and hide the id column from the
        # resampler; the id is written back onto the result afterwards.
        rows = self.df[self.df[self.id_col] == series_id]
        resampled = resample_timeseries_dataframe(df=rows.drop(self.id_col, axis=1),
                                                  dt_col=self.dt_col,
                                                  interval=interval,
                                                  start_time=start_time,
                                                  end_time=end_time,
                                                  merge_mode=merge_mode)
        resampled[self.id_col] = series_id
        return resampled.copy()

    self.df = pd.concat([_resample_one(series_id) for series_id in self._id_list])
    return self
def resample(self, interval, start_time=None, end_time=None, merge_mode="mean"):
    '''
    Resample on a new interval for each univariate time series distinguished
    by id_col and feature_col.

    :param interval: pandas offset aliases, indicating time interval of the
           output dataframe. It is also converted with pd.Timedelta and
           stored as the dataset frequency.
    :param start_time: start time of the output dataframe.
    :param end_time: end time of the output dataframe.
    :param merge_mode: if current interval is smaller than output interval,
           we need to merge the values in a mode. "max", "min", "mean"
           or "sum" are supported for now.

    :return: the tsdataset instance.

    :raises ValueError: if interval can not be parsed by pd.Timedelta. The
            dataset is left untouched in that case.
    '''
    # Convert the interval before touching any state: if it can not be
    # parsed we fail here instead of leaving a resampled self.df next to a
    # stale self._freq.
    freq = pd.Timedelta(interval)

    def _resample_group(group_df):
        return resample_timeseries_dataframe(df=group_df,
                                             dt_col=self.dt_col,
                                             interval=interval,
                                             start_time=start_time,
                                             end_time=end_time,
                                             id_col=self.id_col,
                                             merge_mode=merge_mode)

    resampled = self.df.groupby([self.id_col]).apply(_resample_group)

    # Commit all state together once every step has succeeded.
    self.df = resampled.reset_index(drop=True)
    self._freq = freq
    self._freq_certainty = True
    return self
def test_resample_timeseries_dataframe_modes(self):
    """Check the merged values produced by each supported merge_mode."""
    df = pd.DataFrame({
        'data': [1, 2, 3],
        'datetime': ["2020-11-09T08", "2020-11-09T09", "2020-11-09T11"],
    })
    df['datetime'] = pd.to_datetime(df['datetime'])

    # (merge_mode, expected value of the second output row). In every mode
    # the first output row must be NaN and the third must equal 3.
    cases = [
        ('max', 2),
        ('min', 1),
        ('mean', 1.5),
        ('sum', 3),
    ]
    for mode, expected_second in cases:
        res_df = resample_timeseries_dataframe(df,
                                               dt_col="datetime",
                                               interval="2H",
                                               start_time='2020-11-09T07',
                                               end_time='2020-11-09T10',
                                               merge_mode=mode)
        merged = res_df['data']
        assert np.isnan(merged[0])
        assert merged[1] == expected_second
        assert merged[2] == 3
def test_resample_timeseries_dataframe_ms(self):
    """Check resampling on a millisecond grid for several start times."""
    df = pd.DataFrame({
        'data': [1, 2, 3],
        'datetime': ["2020-11-09T07:52:00.007",
                     "2020-11-09T07:52:00.008",
                     "2020-11-09T07:52:00.010"],
    })
    df['datetime'] = pd.to_datetime(df['datetime'])

    # (start_time, expected row count, expected number of NaN values);
    # end_time is fixed for all of these calls.
    bounded_cases = [
        ('2020-11-09T07:52:00.005', 4, 1),
        ('2020-11-09T07:52:00.006', 3, 0),
        ('2020-11-09T07:52:00.007', 3, 0),
    ]
    for start, expected_len, expected_nan in bounded_cases:
        res_df = resample_timeseries_dataframe(df,
                                               dt_col="datetime",
                                               interval="2ms",
                                               start_time=start,
                                               end_time='2020-11-09T07:52:00.010',
                                               merge_mode='max')
        assert len(res_df) == expected_len
        assert res_df['data'].isna().sum() == expected_nan

    # Without explicit start_time / end_time.
    res_df = resample_timeseries_dataframe(df,
                                           dt_col="datetime",
                                           interval="2ms",
                                           merge_mode='max')
    assert len(res_df) == 3
    assert res_df['data'].isna().sum() == 0
def resample(self, interval, start_time=None, end_time=None, merge_mode="mean"):
    '''
    Resample on a new interval for each univariate time series distinguished
    by id_col and feature_col.

    Columns listed in target_col and feature_col that are not of a numeric
    dtype are cast to np.float32 before resampling.

    :param interval: pandas offset aliases, indicating time interval of the
           output dataframe. It is also converted with pd.Timedelta and
           stored as the dataset frequency.
    :param start_time: start time of the output dataframe.
    :param end_time: end time of the output dataframe.
    :param merge_mode: if current interval is smaller than output interval,
           we need to merge the values in a mode. "max", "min", "mean"
           or "sum" are supported for now.

    :return: the tsdataset instance.

    :raises AssertionError: if the dataset is not flagged as having a pandas
            datetime column (self._is_pd_datetime is falsy).
    :raises RuntimeError: if a non-numeric target/feature column can not be
            cast to np.float32.
    :raises ValueError: if interval can not be parsed by pd.Timedelta.
    '''
    from pandas.api.types import is_numeric_dtype

    # Raised explicitly rather than through an `assert` statement so that the
    # check is not stripped under `python -O`; the exception type is unchanged.
    if not self._is_pd_datetime:
        raise AssertionError(
            "The time series data does not have a Pandas datetime format "
            "(you can use pandas.to_datetime to convert a string into a datetime format).")

    non_numeric_cols = [col for col in self.target_col + self.feature_col
                        if not is_numeric_dtype(self.df[col])]
    # Cast into a temporary mapping first so that a failure on a later column
    # does not leave earlier columns already converted in self.df.
    try:
        converted = {col: self.df[col].astype(np.float32) for col in non_numeric_cols}
    except Exception as err:
        raise RuntimeError(
            "All the columns of target_col "
            "and extra_feature_col should be of numeric type.") from err

    # Parse the interval before mutating anything, so an unparseable interval
    # can not leave a resampled self.df next to a stale self._freq.
    freq = pd.Timedelta(interval)

    for col, values in converted.items():
        self.df[col] = values

    def _resample_group(group_df):
        return resample_timeseries_dataframe(df=group_df,
                                             dt_col=self.dt_col,
                                             interval=interval,
                                             start_time=start_time,
                                             end_time=end_time,
                                             id_col=self.id_col,
                                             merge_mode=merge_mode)

    resampled = self.df.groupby([self.id_col]).apply(_resample_group)

    self.df = resampled.reset_index(drop=True)
    self._freq = freq
    self._freq_certainty = True
    return self