def test_roll_timeseries_dataframe(self):
    """Check the output shapes of ``roll_timeseries_dataframe``.

    Exercises a list horizon, an int horizon, ``horizon=0`` (no ground
    truth) and a frame with a missing value in the rolled column.
    The expected sample counts depend on the row count of
    ``self.easy_data`` -- presumably 10 rows; confirm against the fixture.
    """
    # Discrete horizon: y carries one step per entry of the horizon list.
    x, y = roll_timeseries_dataframe(self.easy_data,
                                     lookback=self.lookback,
                                     horizon=[1, 3],
                                     feature_col=["A"],
                                     target_col=["B"])
    assert x.shape == (8 - self.lookback, self.lookback, 2)
    assert y.shape == (8 - self.lookback, 2, 1)

    # Continuous horizon: y carries `horizon` consecutive steps.
    x, y = roll_timeseries_dataframe(self.easy_data,
                                     lookback=self.lookback,
                                     horizon=4,
                                     feature_col=["A", "C"],
                                     target_col=["B"])
    assert x.shape == (7 - self.lookback, self.lookback, 3)
    assert y.shape == (7 - self.lookback, 4, 1)

    # horizon=0: only x is produced, y is None.
    x, y = roll_timeseries_dataframe(self.easy_data,
                                     lookback=2,
                                     horizon=0,
                                     feature_col=[],
                                     target_col=["A"])
    assert x.shape == (9, 2, 1)
    assert y is None

    # Blank out the first value of "A" with a single .loc write. The former
    # chained form (`df["A"][0] = None`) assigns into a temporary Series and
    # does not reach the frame when pandas Copy-on-Write is active.
    self.easy_data.loc[self.easy_data.index[0], "A"] = None

    # One sample fewer is expected once the first row holds a missing value.
    x, y = roll_timeseries_dataframe(self.easy_data,
                                     lookback=2,
                                     horizon=0,
                                     feature_col=[],
                                     target_col=["A"])
    assert x.shape == (8, 2, 1)
    assert y is None

    x, y = roll_timeseries_dataframe(self.easy_data,
                                     lookback=2,
                                     horizon=2,
                                     feature_col=["C"],
                                     target_col=["A"])
    assert x.shape == (6, 2, 2)
    assert y.shape == (6, 2, 1)
def roll(self, lookback, horizon, feature_col=None, target_col=None, id_sensitive=False):
    '''
    Sampling by rolling for machine learning/deep learning models.

    :param lookback: int, lookback value.
    :param horizon: int or list,
           if `horizon` is an int, we will sample `horizon` step
           continuously after the forecasting point.
           if `horizon` is a list, we will sample discretely according
           to the input list.
           specially, when `horizon` is set to 0, ground truth will be generated as None.
    :param feature_col: str or list, indicates the feature col name. Default to None,
           where we will take all available feature in rolling.
    :param target_col: str or list, indicates the target col name. Default to None,
           where we will take all target in rolling. it should be a subset of target_col
           you used to initialize the tsdataset.
    :param id_sensitive: bool,
           if `id_sensitive` is False, we will rolling on each id's sub dataframe
           and fuse the samplings.
           The shape of rolling will be
           x: (num_sample, lookback, num_feature_col + num_target_col)
           y: (num_sample, horizon, num_target_col)
           where num_sample is the summation of sample number of each dataframe

           if `id_sensitive` is True, the per-id results are joined on the last axis.
           The shape of rolling will be
           x: (num_sample, lookback, new_num_feature_col + new_num_target_col)
           y: (num_sample, horizon, new_num_target_col)
           where new_num_feature_col is the product of the number of id and the
           number of feature_col, new_num_target_col is the product of the number
           of id and the number of target_col. In x all target columns come first
           (grouped by id), followed by all feature columns (grouped by id).

    :return: the tsdataset instance.
    '''
    feature_col = _to_list(feature_col, "feature_col") if feature_col is not None \
        else self.feature_col
    target_col = _to_list(target_col, "target_col") if target_col is not None \
        else self.target_col

    if self.roll_addional_feature:
        # Split the requested columns into plain and additional features while
        # keeping the requested order. Going through set() would make the
        # column order depend on string hashing, which varies between runs.
        additional_names = set(self.roll_addional_feature)
        requested = list(dict.fromkeys(feature_col))
        additional_feature_col = [col for col in requested if col in additional_names]
        feature_col = [col for col in requested if col not in additional_names]
        self.roll_feature = feature_col + additional_feature_col
    else:
        additional_feature_col = None
        self.roll_feature = feature_col
    self.roll_target = target_col

    num_id = len(self._id_list)
    num_feature_col = len(self.roll_feature)
    num_target_col = len(self.roll_target)
    self.id_sensitive = id_sensitive

    roll_feature_df = None if self.roll_feature_df is None \
        else self.roll_feature_df[additional_feature_col]

    # get rolling result for each sub dataframe
    rolling_result = [
        roll_timeseries_dataframe(df=self.df[self.df[self.id_col] == id_name],
                                  roll_feature_df=roll_feature_df,
                                  lookback=lookback,
                                  horizon=horizon,
                                  feature_col=feature_col,
                                  target_col=target_col)
        for id_name in self._id_list
    ]

    # concat the result on required axis
    concat_axis = 2 if id_sensitive else 0
    self.numpy_x = np.concatenate([result[0] for result in rolling_result],
                                  axis=concat_axis)
    if horizon != 0:
        self.numpy_y = np.concatenate([result[1] for result in rolling_result],
                                      axis=concat_axis)
    else:
        self.numpy_y = None

    # target first
    if self.id_sensitive:
        # After the concat each id owns one contiguous slab on the last axis,
        # expected to be laid out as [targets..., features...]. Gather every
        # id's targets, then every id's features. These are *source* positions,
        # which is what fancy indexing needs; the previous code indexed with
        # destination positions and therefore scrambled the columns.
        slab_width = num_target_col + num_feature_col
        target_positions = [i * slab_width + t
                            for i in range(num_id)
                            for t in range(num_target_col)]
        feature_positions = [i * slab_width + num_target_col + f
                             for i in range(num_id)
                             for f in range(num_feature_col)]
        self.numpy_x = self.numpy_x[:, :, target_positions + feature_positions]

    return self
def roll(self, lookback, horizon, feature_col=None, target_col=None, id_sensitive=False):
    '''
    Sampling by rolling for machine learning/deep learning models.

    :param lookback: int, lookback value.
    :param horizon: int or list,
           if `horizon` is an int, we will sample `horizon` step
           continuously after the forecasting point.
           if `horizon` is a list, we will sample discretely according
           to the input list.
           specially, when `horizon` is set to 0, ground truth will be generated as None.
    :param feature_col: str or list, indicates the feature col name. Default to None,
           where we will take all available feature in rolling.
    :param target_col: str or list, indicates the target col name. Default to None,
           where we will take all target in rolling. it should be a subset of target_col
           you used to initialize the tsdataset.
    :param id_sensitive: bool,
           if `id_sensitive` is False, we will rolling on each id's sub dataframe
           and fuse the samplings.
           The shape of rolling will be
           x: (num_sample, lookback, num_feature_col + num_target_col)
           y: (num_sample, horizon, num_target_col)
           where num_sample is the summation of sample number of each dataframe

           if `id_sensitive` is True, we will rolling on the wide dataframe whose
           columns are cartesian product of id_col and feature_col
           The shape of rolling will be
           x: (num_sample, lookback, new_num_feature_col + new_num_target_col)
           y: (num_sample, horizon, new_num_target_col)
           where num_sample is the sample number of the wide dataframe,
           new_num_feature_col is the product of the number of id and the number of
           feature_col. new_num_target_col is the product of the number of id and
           the number of target_col.

    :return: the tsdataset instance.

    roll() can be called by:

    >>> # Here is a df example:
    >>> # id        datetime      value   "extra feature 1"   "extra feature 2"
    >>> # 00        2019-01-01    1.9     1                   2
    >>> # 01        2019-01-01    2.3     0                   9
    >>> # 00        2019-01-02    2.4     3                   4
    >>> # 01        2019-01-02    2.6     0                   2
    >>> tsdataset = TSDataset.from_pandas(df, dt_col="datetime",
    >>>                                   target_col="value", id_col="id",
    >>>                                   extra_feature_col=["extra feature 1",
    >>>                                                      "extra feature 2"])
    >>> horizon, lookback = 1, 1
    >>> tsdataset.roll(lookback=lookback, horizon=horizon, id_sensitive=False)
    >>> x, y = tsdataset.to_numpy()
    >>> print(x, y) # x = [[[1.9, 1, 2 ]], [[2.3, 0, 9 ]]] y = [[[ 2.4 ]], [[ 2.6 ]]]
    >>> print(x.shape, y.shape) # x.shape = (2, 1, 3) y.shape = (2, 1, 1)
    >>> tsdataset.roll(lookback=lookback, horizon=horizon, id_sensitive=True)
    >>> x, y = tsdataset.to_numpy()
    >>> print(x, y) # x = [[[ 1.9, 2.3, 1, 2, 0, 9 ]]] y = [[[ 2.4, 2.6]]]
    >>> print(x.shape, y.shape) # x.shape = (1, 1, 6) y.shape = (1, 1, 2)
    '''
    feature_col = _to_list(feature_col, "feature_col") if feature_col is not None \
        else self.feature_col
    target_col = _to_list(target_col, "target_col") if target_col is not None \
        else self.target_col

    if self.roll_addional_feature:
        # Split the requested columns into plain and additional features while
        # keeping the requested order. Going through set() would make the
        # column order depend on string hashing, which varies between runs.
        additional_names = set(self.roll_addional_feature)
        requested = list(dict.fromkeys(feature_col))
        additional_feature_col = [col for col in requested if col in additional_names]
        feature_col = [col for col in requested if col not in additional_names]
        self.roll_feature = feature_col + additional_feature_col
    else:
        additional_feature_col = None
        self.roll_feature = feature_col
    self.roll_target = target_col

    num_id = len(self._id_list)
    num_feature_col = len(self.roll_feature)
    num_target_col = len(self.roll_target)
    self.id_sensitive = id_sensitive

    roll_feature_df = None if self.roll_feature_df is None \
        else self.roll_feature_df[additional_feature_col]

    # One (x, y) rolling result per id, looked up by id below.
    rolling_result = self.df.groupby([self.id_col]).apply(
        lambda sub_df: roll_timeseries_dataframe(df=sub_df,
                                                 roll_feature_df=roll_feature_df,
                                                 lookback=lookback,
                                                 horizon=horizon,
                                                 feature_col=feature_col,
                                                 target_col=target_col))

    # concat the result on required axis
    concat_axis = 2 if id_sensitive else 0
    self.numpy_x = np.concatenate(
        [rolling_result[id_name][0] for id_name in self._id_list],
        axis=concat_axis).astype(np.float64)
    if horizon != 0:
        self.numpy_y = np.concatenate(
            [rolling_result[id_name][1] for id_name in self._id_list],
            axis=concat_axis).astype(np.float64)
    else:
        self.numpy_y = None

    # target first
    if self.id_sensitive:
        # After the concat each id owns one contiguous slab on the last axis,
        # laid out as [targets..., features...] (see the docstring example).
        # Gather every id's targets, then every id's features; this is the
        # same permutation the former reduce + argsort construction produced.
        slab_width = num_target_col + num_feature_col
        target_positions = [i * slab_width + t
                            for i in range(num_id)
                            for t in range(num_target_col)]
        feature_positions = [i * slab_width + num_target_col + f
                             for i in range(num_id)
                             for f in range(num_feature_col)]
        self.numpy_x = self.numpy_x[:, :, target_positions + feature_positions]

    # scaler index: position of every rolled target inside self.target_col,
    # repeated once per id when the ids are stacked on the last axis
    repeat_factor = num_id if self.id_sensitive else 1
    self.scaler_index = [self.target_col.index(col)
                         for col in self.roll_target] * repeat_factor

    return self