def multiple_dataloaders_with_covariates(data_with_covariates, request):
    training_cutoff = "2016-09-01"
    max_encoder_length = 36
    max_prediction_length = 6

    params = request.param
    params.setdefault("target", "volume")

    training = TimeSeriesDataSet(
        data_with_covariates[lambda x: x.date < training_cutoff],
        time_idx="time_idx",
        # weight="weight",
        group_ids=["agency", "sku"],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        add_relative_time_idx=True,
        **params,  # fixture parametrization
    )

    validation = TimeSeriesDataSet.from_dataset(
        training, data_with_covariates, min_prediction_idx=training.index.time.max() + 1
    )

    batch_size = 4
    train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
    val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader)
def dataloaders_with_covariates(data_with_covariates):
    training_cutoff = "2016-09-01"
    max_encoder_length = 36
    max_prediction_length = 6

    training = TimeSeriesDataSet(
        data_with_covariates[lambda x: x.date < training_cutoff],
        time_idx="time_idx",
        target="volume",
        # weight="weight",
        group_ids=["agency", "sku"],
        time_varying_known_reals=["discount"],
        time_varying_unknown_reals=["volume"],
        static_categoricals=["agency"],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        add_relative_time_idx=True,
        target_normalizer=GroupNormalizer(groups=["agency", "sku"], coerce_positive=False),
    )

    validation = TimeSeriesDataSet.from_dataset(
        training, data_with_covariates, min_prediction_idx=training.index.time.max() + 1
    )

    batch_size = 4
    train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
    val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader)
def dataloaders_fixed_window_without_covariates():
    data = generate_ar_data(seasonality=10.0, timesteps=400, n_series=10)
    validation = data.series.iloc[:2]

    max_encoder_length = 60
    max_prediction_length = 20

    training = TimeSeriesDataSet(
        data[lambda x: ~x.series.isin(validation)],
        time_idx="time_idx",
        target="value",
        categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
        group_ids=["series"],
        static_categoricals=[],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        time_varying_unknown_reals=["value"],
        target_normalizer=EncoderNormalizer(),
    )

    validation = TimeSeriesDataSet.from_dataset(
        training,
        data[lambda x: x.series.isin(validation)],
        stop_randomization=True,
    )

    batch_size = 4
    train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
    val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader)
def make_dataloaders(data_with_covariates, **kwargs):
    training_cutoff = "2016-09-01"
    max_encoder_length = 4
    max_prediction_length = 3

    kwargs.setdefault("target", "volume")
    kwargs.setdefault("group_ids", ["agency", "sku"])
    kwargs.setdefault("add_relative_time_idx", True)
    kwargs.setdefault("time_varying_unknown_reals", ["volume"])

    training = TimeSeriesDataSet(
        data_with_covariates[lambda x: x.date < training_cutoff].copy(),
        time_idx="time_idx",
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        **kwargs,  # fixture parametrization
    )

    validation = TimeSeriesDataSet.from_dataset(
        training, data_with_covariates.copy(), min_prediction_idx=training.index.time.max() + 1
    )

    train_dataloader = training.to_dataloader(train=True, batch_size=2, num_workers=0)
    val_dataloader = validation.to_dataloader(train=False, batch_size=2, num_workers=0)
    test_dataloader = validation.to_dataloader(train=False, batch_size=1, num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader, test=test_dataloader)
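# Minimal usage sketch of the factory above. The `data_with_covariates` frame is an
# assumption here (it is supplied by a fixture elsewhere), and the keyword overrides are
# only illustrative; any TimeSeriesDataSet argument can be passed through **kwargs.
loaders = make_dataloaders(data_with_covariates, target="volume", static_categoricals=["agency"])
x, y = next(iter(loaders["train"]))  # x: dict of encoder/decoder tensors, y: (target, weight)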
def test_model(model_path):
    """
    Tests results of given model on dataset
    """
    DATA_PATH = 'data/data.csv'
    # preprocessing script location and URL (same values as in save_time_series)
    PREPROCESS_URL = 'https://raw.githubusercontent.com/AWarno/CancerOptimization/main/preprocess_data.py'
    FILE_PATH = 'data/preprocess_data.py'

    if not os.path.isfile(FILE_PATH):
        wget.download(PREPROCESS_URL, FILE_PATH)

    dataset = pd.read_csv(DATA_PATH)
    dataset['target'] = dataset['target'].astype(float)
    dataset['time_idx'] = dataset['time_idx'].astype(int)

    time_series = TimeSeriesDataSet.load('models/dataset_time_set')
    validation = TimeSeriesDataSet.from_dataset(time_series, dataset)
    all_dataloader = validation.to_dataloader(train=False, num_workers=0)

    model = TemporalFusionTransformer.load_from_checkpoint(model_path)

    actuals = torch.cat([y[0] for (x, y) in iter(all_dataloader)])
    predictions = model.predict(all_dataloader)

    print(f'test mape is {((actuals - predictions).abs() / actuals).mean()}')
    print(f'max mape {((actuals - predictions).abs() / actuals).max()}')
    res = (actuals - predictions).abs() / actuals
    print(f'max 99 mape {np.quantile(res, .99)}')
    # print("result", res)
    res = np.array([int(x) for x in res])
def transform_data(self, data, past_lags, index_label, target_label, train_val_split):
    self.past_lags = past_lags
    self.oldest_lag = int(max(self.past_lags)) + 1
    self.index_label = index_label
    self.target_label = target_label

    # external train and validation sets
    X = data[[index_label]]
    y = data[[target_label]]
    self.training = (
        X.loc[:int(len(data) * train_val_split)],
        y.loc[:int(len(data) * train_val_split)],
    )
    self.validation = (
        X.loc[int(len(data) * train_val_split):],
        y.loc[int(len(data) * train_val_split):],
    )

    # internal train and validation sets; they use dataloaders to optimize the training routine
    # time index are epoch values
    # data["time_idx"] = (data[self.index_label] - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
    data["time_idx"] = data.index
    data['group_id'] = 'series'

    max_prediction_length = self.oldest_lag
    max_encoder_length = self.oldest_lag
    # training_cutoff = data["time_idx"].max() - max_prediction_length

    self.intern_training = TimeSeriesDataSet(
        data[:int(len(data) * train_val_split)],
        time_idx="time_idx",
        group_ids=["group_id"],
        target=self.target_label,
        min_encoder_length=0,
        max_encoder_length=max_encoder_length,
        min_prediction_length=1,
        max_prediction_length=max_prediction_length,
        static_categoricals=["group_id"],
        # time_varying_unknown_reals=[self.target_label],
        # the docs say that max_lag < max_encoder_length
        # lags={self.target_label: list(self.past_lags[1:-1] + 1)},
        add_relative_time_idx=True,
        add_target_scales=True,
        add_encoder_length=True,
        # allow_missings=True
    )

    # create validation set (predict=True), which means to predict the last
    # max_prediction_length points in time for each series
    self._intern_validation = TimeSeriesDataSet.from_dataset(
        self.intern_training, data, predict=True, stop_randomization=True
    )

    # store the last input to use as encoder data for the next predictions
    self.last_period = data.iloc[-(self.oldest_lag * 2 + 1):].copy()
def save_time_series(self):
    """
    Downloads the preprocessing file and creates data in a format suited for temporal fusion
    """
    PREPROCESS_URL = 'https://raw.githubusercontent.com/AWarno/CancerOptimization/main/preprocess_data.py'
    FILE_PATH = 'data/preprocess_data.py'
    DATA_PATH = 'data/data.csv'
    FEATURES = ['dose', 'time']
    GROUP_ID = 'series'

    # Data file already exists, so we don't need to generate it
    if os.path.isfile(DATA_PATH):
        return

    # Preprocessing file already exists, so we don't need to download it again
    if not os.path.isfile(FILE_PATH):
        wget.download(PREPROCESS_URL, FILE_PATH)

    os.system('python ' + FILE_PATH)

    dataset = pd.read_csv(DATA_PATH)
    n = dataset[GROUP_ID].astype(int).max()
    dataset['target'] = dataset['target'].astype(float)
    dataset['time_idx'] = dataset['time_idx'].astype(int)

    training = TimeSeriesDataSet(
        dataset[dataset[GROUP_ID].apply(lambda x: int(x) < int(n * 0.7))],
        time_idx='time_idx',
        target='target',
        group_ids=[GROUP_ID],
        min_encoder_length=20,
        max_encoder_length=20,
        min_prediction_length=1,
        max_prediction_length=1,
        static_categoricals=[],
        static_reals=[],
        time_varying_known_categoricals=[],
        variable_groups={},
        time_varying_known_reals=['time_idx'],
        time_varying_unknown_categoricals=[],
        time_varying_unknown_reals=['target'] + FEATURES,
        add_relative_time_idx=True,
        add_target_scales=False,
        add_encoder_length=True,
        categorical_encoders={GROUP_ID: NaNLabelEncoder().fit(dataset.series)},
    )

    training.save(self.TIMESERIES_PATH)
def get_examples(self, year: int, encoder_length: dict, prediction_length: dict) -> TimeSeriesDataSet:
    solar_df_filtered = self.solar_df[(self.solar_df['year'] == year)]

    group_length = 2 * encoder_length['max'] - 1
    num_groups = int(np.floor(solar_df_filtered.shape[0] / group_length))
    solar_df_filtered = solar_df_filtered[1:(num_groups * group_length + 1)]
    solar_df_filtered['group'] = np.repeat(np.arange(num_groups), group_length)

    examples = TimeSeriesDataSet(
        solar_df_filtered,
        group_ids=["group"],
        target="power",
        time_idx="time_idx",
        min_encoder_length=encoder_length['min'],
        max_encoder_length=encoder_length['max'],
        min_prediction_length=prediction_length['min'],
        max_prediction_length=prediction_length['max'],
        time_varying_unknown_reals=["power"],
        time_varying_known_reals=[
            "cloudcover_low",
            "cloudcover_mid",
            "cloudcover_high",
        ],
        time_varying_known_categoricals=["seasons"],
        allow_missings=True,
    )
    return examples
def test_predict_dependency(model, dataloaders_with_covariates, data_with_covariates, kwargs):
    train_dataset = dataloaders_with_covariates["train"].dataset
    dataset = TimeSeriesDataSet.from_dataset(
        train_dataset,
        data_with_covariates[lambda x: x.agency == data_with_covariates.agency.iloc[0]],
        predict=True,
    )
    model.predict_dependency(dataset, variable="discount", values=[0.1, 0.0], **kwargs)
    model.predict_dependency(dataset, variable="agency", values=data_with_covariates.agency.unique()[:2], **kwargs)
def test_prediction_with_dataloder_raw(data_with_covariates, tmp_path):
    # tests correct concatenation of raw output
    test_data = data_with_covariates.copy()
    np.random.seed(2)
    test_data = test_data.sample(frac=0.5)

    dataset = TimeSeriesDataSet(
        test_data,
        time_idx="time_idx",
        max_encoder_length=8,
        max_prediction_length=10,
        min_prediction_length=1,
        min_encoder_length=1,
        target="volume",
        group_ids=["agency", "sku"],
        constant_fill_strategy=dict(volume=0.0),
        allow_missing_timesteps=True,
        time_varying_unknown_reals=["volume"],
        time_varying_known_reals=["time_idx"],
        target_normalizer=GroupNormalizer(groups=["agency", "sku"]),
    )

    net = TemporalFusionTransformer.from_dataset(
        dataset,
        learning_rate=1e-6,
        hidden_size=4,
        attention_head_size=1,
        dropout=0.2,
        hidden_continuous_size=2,
        log_interval=1,
        log_val_interval=1,
        log_gradient_flow=True,
    )
    logger = TensorBoardLogger(tmp_path)
    trainer = pl.Trainer(max_epochs=1, gradient_clip_val=1e-6, logger=logger)
    trainer.fit(net, train_dataloaders=dataset.to_dataloader(batch_size=4, num_workers=0))

    # choose small batch size to provoke issue
    res = net.predict(dataset.to_dataloader(batch_size=2, num_workers=0), mode="raw")
    # check that interpretation works
    net.interpret_output(res)["attention"]
    assert net.interpret_output(res.iget(slice(1)))["attention"].size() == torch.Size(
        (1, net.hparams.max_encoder_length)
    )
def predict(self, data):
    """
    Transforms data and predicts output based on trained model

    Parameters: self, list of protocols
    Return: list of results for each protocol based on trained model
    """
    print(data)
    self.save_time_series()
    dataset = self.prepare_data(data)

    time_series = TimeSeriesDataSet.load(self.TIMESERIES_PATH)
    validation = TimeSeriesDataSet.from_dataset(time_series, dataset)
    val_dataloader = validation.to_dataloader(train=False, num_workers=0)

    res = self.model.predict(val_dataloader)
    # print("result", res)
    res = np.array([int(x) for x in res])
    return res
def test_dataset(test_data):
    training = TimeSeriesDataSet(
        test_data,
        time_idx="time_idx",
        target="volume",
        time_varying_known_reals=["price_regular"],
        group_ids=["agency", "sku"],
        static_categoricals=["agency"],
        max_encoder_length=5,
        max_prediction_length=2,
        randomize_length=None,
    )
    return training
def _create_dataset(self, df, valid_p=0.2):
    df = df_utils.check_dataframe(df)
    df = self._handle_missing_data(df)
    df = df[["ds", "y"]]
    df["time_idx"] = range(df.shape[0])
    df["series"] = 0
    self.n_data = df.shape[0]
    self.set_auto_batch_epoch(self.n_data)

    training_cutoff = df.shape[0] - int(valid_p * df.shape[0])

    training = TimeSeriesDataSet(
        df.iloc[:training_cutoff],
        time_idx="time_idx",
        target="y",
        categorical_encoders={"series": NaNLabelEncoder().fit(df.series)},
        group_ids=["series"],
        min_encoder_length=self.context_length,
        max_encoder_length=self.context_length,
        max_prediction_length=self.prediction_length,
        min_prediction_length=self.prediction_length,
        time_varying_unknown_reals=["y"],
        target_normalizer=GroupNormalizer(groups=["series"]),
        randomize_length=None,
        add_relative_time_idx=False,
        add_target_scales=False,
    )

    validation = TimeSeriesDataSet.from_dataset(training, df, min_prediction_idx=training_cutoff)

    train_dataloader = training.to_dataloader(
        train=True, batch_size=self.batch_size, num_workers=self.num_workers
    )
    val_dataloader = validation.to_dataloader(
        train=False, batch_size=self.batch_size, num_workers=self.num_workers
    )

    return training, train_dataloader, val_dataloader
def load_data(
    self,
    data: DataFrame,
    time_idx: Optional[str] = None,
    target: Optional[Union[str, List[str]]] = None,
    group_ids: Optional[List[str]] = None,
    parameters: Optional[Dict[str, Any]] = None,
    **time_series_dataset_kwargs: Any,
):
    if self.training:
        time_series_dataset = TimeSeriesDataSet(
            data,
            time_idx=time_idx,
            group_ids=group_ids,
            target=target,
            **time_series_dataset_kwargs,
        )
        parameters = time_series_dataset.get_parameters()

        # Add some sample data so that we can recreate the `TimeSeriesDataSet` later on
        parameters["data_sample"] = data.iloc[[0]].to_dict()

        self.parameters = parameters
    else:
        if parameters is None:
            raise MisconfigurationException(
                "Loading data for evaluation or inference requires parameters from the train data. Either "
                "construct the train data at the same time as evaluation and inference or provide the train "
                "`datamodule.parameters` to `from_data_frame` in the `parameters` argument."
            )
        parameters = copy(parameters)
        parameters.pop("data_sample")
        time_series_dataset = TimeSeriesDataSet.from_parameters(
            parameters,
            data,
            stop_randomization=True,
        )
    return time_series_dataset
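# A minimal sketch of the parameter round trip that load_data relies on. `train_df` and
# `test_df` are hypothetical pandas DataFrames with "time_idx", "series" and "y" columns,
# used only for illustration.
train_dataset = TimeSeriesDataSet(
    train_df,
    time_idx="time_idx",
    group_ids=["series"],
    target="y",
    max_encoder_length=12,
    max_prediction_length=3,
)
params = train_dataset.get_parameters()  # serialisable dict describing the dataset
# later, e.g. at inference time, rebuild a compatible dataset on new data
inference_dataset = TimeSeriesDataSet.from_parameters(params, test_df, stop_randomization=True)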
def optimize(cls, dataset: TimeSeriesDataSet, num_steps: int, **kwargs):
    model = FullyConnectedModelWithCovariates.from_dataset(dataset, **kwargs)
    dataloader = dataset.to_dataloader()
    optimizer = torch.optim.Adam(model.parameters(), model.hparams.learning_rate)
    criteria = torch.nn.L1Loss()

    for step in range(num_steps):
        optimizer.zero_grad()
        x_train, y_train = next(iter(dataloader))

        # Forward pass
        y_pred = model(x_train)['prediction']

        # Compute loss
        loss = criteria(y_pred, y_train[0])
        print('Step {}: train loss: {}'.format(step, loss.item()))

        # Backward pass
        loss.backward()
        optimizer.step()

    return model
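# Hypothetical call of the loop above, assuming `dataset` is a TimeSeriesDataSet with the
# covariates FullyConnectedModelWithCovariates expects and that its from_dataset accepts
# these keyword arguments; the values are illustrative only.
trained_model = FullyConnectedModelWithCovariates.optimize(dataset, num_steps=20, hidden_size=10)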
test_data = pd.DataFrame(
    dict(
        value=np.arange(30),
        group=np.repeat(np.arange(3), 10),
        time_idx=np.tile(np.arange(10), 3),
    )
)
test_data

# %%
from pytorch_forecasting import TimeSeriesDataSet

# create the dataset from the pandas dataframe
dataset = TimeSeriesDataSet(
    test_data,
    group_ids=["group"],
    target="value",
    time_idx="time_idx",
    min_encoder_length=5,
    max_encoder_length=5,
    min_prediction_length=2,
    max_prediction_length=2,
    time_varying_unknown_reals=["value"],
)

# %%
dataset.get_parameters()

# %% [markdown]
# Now, we take a look at the output of the dataloader. Its `x` will be fed to the model's
# forward method, which is why it is so important to understand it.

# %%
# convert the dataset to a dataloader
dataloader = dataset.to_dataloader(batch_size=4)
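# %% [markdown]
# For example, we can pull one batch and inspect the dictionary keys and tensor shapes
# (the shapes shown in the comments are what this small dataset would produce).

# %%
x, y = next(iter(dataloader))
print(x.keys())  # encoder_cat, encoder_cont, encoder_target, encoder_lengths, decoder_* , groups, target_scale, ...
print(x["encoder_cont"].shape)  # (batch_size, encoder_length, number of continuous features)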
max_encoder_length = 150
max_prediction_length = 20

training_cutoff = data["time_idx"].max() - max_prediction_length

context_length = max_encoder_length
prediction_length = max_prediction_length

training = TimeSeriesDataSet(
    data[lambda x: x.time_idx < training_cutoff],
    time_idx="time_idx",
    target="value",
    categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
    group_ids=["series"],
    min_encoder_length=context_length,
    max_encoder_length=context_length,
    max_prediction_length=prediction_length,
    min_prediction_length=prediction_length,
    time_varying_unknown_reals=["value"],
    randomize_length=None,
    add_relative_time_idx=False,
    add_target_scales=False,
)

validation = TimeSeriesDataSet.from_dataset(training, data, min_prediction_idx=training_cutoff)
batch_size = 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=2)
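# The matching validation dataloader is not shown in the excerpt above; a plausible
# continuation, mirroring the training call, would be:
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=2)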
class TFTWrapper(BaseWrapper):
    def __init__(self, quantiles):
        super().__init__(quantiles)
        self.intern_training = None
        self._intern_validation = None
        self.training = None
        self.validation = None
        self.model = None
        self.trainer = None
        self.oldest_lag = None
        self.last_period = None
        self.quantiles = quantiles

    def transform_data(self, data, past_lags, index_label, target_label, train_val_split):
        self.past_lags = past_lags
        self.oldest_lag = int(max(self.past_lags)) + 1
        self.index_label = index_label
        self.target_label = target_label

        # external train and validation sets
        X = data[[index_label]]
        y = data[[target_label]]
        self.training = (
            X.loc[:int(len(data) * train_val_split)],
            y.loc[:int(len(data) * train_val_split)],
        )
        self.validation = (
            X.loc[int(len(data) * train_val_split):],
            y.loc[int(len(data) * train_val_split):],
        )

        # internal train and validation sets; they use dataloaders to optimize the training routine
        # time index are epoch values
        # data["time_idx"] = (data[self.index_label] - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")
        data["time_idx"] = data.index
        data['group_id'] = 'series'

        max_prediction_length = self.oldest_lag
        max_encoder_length = self.oldest_lag
        # training_cutoff = data["time_idx"].max() - max_prediction_length

        self.intern_training = TimeSeriesDataSet(
            data[:int(len(data) * train_val_split)],
            time_idx="time_idx",
            group_ids=["group_id"],
            target=self.target_label,
            min_encoder_length=0,
            max_encoder_length=max_encoder_length,
            min_prediction_length=1,
            max_prediction_length=max_prediction_length,
            static_categoricals=["group_id"],
            # time_varying_unknown_reals=[self.target_label],
            # the docs say that max_lag < max_encoder_length
            # lags={self.target_label: list(self.past_lags[1:-1] + 1)},
            add_relative_time_idx=True,
            add_target_scales=True,
            add_encoder_length=True,
            # allow_missings=True
        )

        # create validation set (predict=True), which means to predict the last
        # max_prediction_length points in time for each series
        self._intern_validation = TimeSeriesDataSet.from_dataset(
            self.intern_training, data, predict=True, stop_randomization=True
        )

        # store the last input to use as encoder data for the next predictions
        self.last_period = data.iloc[-(self.oldest_lag * 2 + 1):].copy()

    def train(
        self,
        max_epochs=25,
        hidden_size=16,
        lstm_layers=1,
        dropout=0.1,
        attention_head_size=4,
        reduce_on_plateau_patience=4,
        hidden_continuous_size=8,
        learning_rate=1e-3,
        gradient_clip_val=0.1,
    ):
        # configure network and trainer

        # create dataloaders for model
        batch_size = 128
        train_dataloader = self.intern_training.to_dataloader(
            train=True, batch_size=batch_size
        )
        val_dataloader = self._intern_validation.to_dataloader(
            train=False, batch_size=batch_size * 10
        )

        pl.seed_everything(42)
        early_stop_callback = EarlyStopping(
            monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min"
        )
        # lr_logger = LearningRateMonitor()

        trainer = pl.Trainer(
            max_epochs=max_epochs,
            gpus=0,
            weights_summary=None,
            gradient_clip_val=gradient_clip_val,
            # limit_train_batches=30,  # comment in for training, running validation every 30 batches
            # fast_dev_run=True,  # comment in to check that network or dataset has no serious bugs
            callbacks=[early_stop_callback],
        )

        self.model = TemporalFusionTransformer.from_dataset(
            self.intern_training,
            learning_rate=learning_rate,
            hidden_size=hidden_size,
            attention_head_size=attention_head_size,
            dropout=dropout,
            hidden_continuous_size=hidden_continuous_size,
            lstm_layers=lstm_layers,
            output_size=len(self.quantiles),  # 3 quantiles by default
            loss=QuantileLoss(self.quantiles),
            reduce_on_plateau_patience=reduce_on_plateau_patience,
        )

        # res = trainer.tuner.lr_find(
        #     self.model,
        #     train_dataloader=train_dataloader,
        #     val_dataloaders=val_dataloader,
        #     max_lr=10.0,
        #     min_lr=1e-6,
        # )

        # self.model = TemporalFusionTransformer.from_dataset(
        #     self.intern_training,
        #     learning_rate=res.suggestion(),  # using the suggested learning rate
        #     hidden_size=hidden_size,
        #     attention_head_size=attention_head_size,
        #     dropout=dropout,
        #     hidden_continuous_size=hidden_continuous_size,
        #     output_size=len(self.quantiles),  # 3 quantiles by default
        #     loss=QuantileLoss(self.quantiles),
        #     reduce_on_plateau_patience=reduce_on_plateau_patience,
        # )

        # fit network
        trainer.fit(
            self.model,
            train_dataloader=train_dataloader,
            val_dataloaders=val_dataloader,
        )

    def _auto_feed(self, X, future_steps, quantile=False):
        """
        Performs auto-feed over the X values to predict the future steps.
        """

        def append_new_data(cur_X, new_value, date_step):
            new_date = cur_X[self.index_label].iloc[-1] + date_step
            new_entry = {
                self.index_label: new_date,
                self.target_label: new_value,
                'time_idx': cur_X['time_idx'].iloc[-1] + 1,
                'group_id': 'series',
            }
            return cur_X.append(new_entry, ignore_index=True)

        # prediction or quantile mode
        mode = 'quantiles' if quantile else 'prediction'

        # interval between dates (last two dates in the dataset)
        cur_X = X.copy()
        date_step = cur_X[self.index_label].iloc[-1] - cur_X[self.index_label].iloc[-2]

        y = []

        # if future_steps is less than or equal to the oldest lag, the model can predict it by default
        if future_steps <= self.oldest_lag:
            predict = self.model.predict(cur_X, mode=mode)[0].numpy().tolist()
            return predict[:future_steps]
        else:
            # shortcut the auto-feed prediction with a more reliable prediction
            predict = self.model.predict(cur_X, mode=mode)[0].numpy().tolist()
            for new_value in predict:
                cur_X = append_new_data(cur_X, new_value, date_step)
            y = predict

        for _ in range(self.oldest_lag, future_steps):
            predict = self.model.predict(cur_X, mode=mode)[0][0]
            if quantile:
                y.append(predict.numpy().tolist())
                new_value = y[-1][1]  # get quantile 0.5
            else:
                y.append(float(predict.numpy()))
                new_value = y[-1]
            cur_X = append_new_data(cur_X, new_value, date_step)

        return y

    def _verify_target_column(self, data):
        if self.target_label not in data.columns:
            data[self.target_label] = 0

    def predict(self, X, future_steps, history, quantile=False):
        predictions = []

        self._verify_target_column(X)

        for i in range(len(X)):
            X_temp = history.append(X.iloc[:i], ignore_index=True)
            time_idx = list(range(len(X_temp)))  # refactor to use real time idx
            time_idx = [idx + self.last_period["time_idx"].max() for idx in time_idx]
            X_temp[self.index_label] = pd.to_datetime(X_temp[self.index_label])
            X_temp[self.index_label] = X_temp[self.index_label].dt.tz_localize(None)
            X_temp["time_idx"] = time_idx
            X_temp['group_id'] = 'series'

            y = self._auto_feed(X_temp, future_steps, quantile)
            predictions.append(y)

        return predictions

    def next(self, X, future_steps, quantile=False):
        self._verify_target_column(X)

        # pre-process the data
        X[self.index_label] = pd.to_datetime(X[self.index_label])
        X[self.index_label] = X[self.index_label].dt.tz_localize(None)
        X['group_id'] = 'series'

        temp_data = self.last_period.iloc[-(self.oldest_lag + 1):].copy()
        cur_X = temp_data.append(X, ignore_index=True)

        time_idx = list(range(len(cur_X)))  # refactor to use real time idx
        cur_X["time_idx"] = time_idx
        cur_X.index = list(range(len(cur_X)))

        y = self._auto_feed(cur_X, future_steps, quantile)

        return y
def predict(self, future_dataframe):
    """
    Predicts based on the future_dataframe. Should be called only after make_future_dataframe is called.

    Args:
        future_dataframe: DataFrame from the make_future_dataframe function

    Returns:
        forecast dataframe
    """
    if self.fitted is False:
        log.warning("Model has not been fitted. Predictions will be random.")

    future_dataframe = future_dataframe.copy(deep=True)

    testing = TimeSeriesDataSet(
        future_dataframe,
        time_idx="time_idx",
        target="y",
        categorical_encoders={"series": NaNLabelEncoder().fit(future_dataframe.series)},
        group_ids=["series"],
        min_encoder_length=self.context_length,
        max_encoder_length=self.context_length,
        max_prediction_length=self.prediction_length,
        min_prediction_length=self.prediction_length,
        time_varying_known_reals=["time_idx"],
        time_varying_unknown_reals=["y"],
        target_normalizer=GroupNormalizer(groups=["series"], transformation="softplus", center=False),
        add_relative_time_idx=True,
        add_target_scales=True,
        add_encoder_length=True,
    )

    new_raw_predictions, new_x = self.model.predict(testing, mode="raw", return_x=True)

    y_predicted = self.model.to_prediction(new_raw_predictions).detach().cpu()  # [0, : new_x["decoder_lengths"][0]]
    y_predicted = y_predicted.detach().numpy()

    def pad_with(vector, pad_width, iaxis, kwargs):
        pad_value = kwargs.get("padder", np.nan)
        vector[: pad_width[0]] = pad_value
        vector[-pad_width[1]:] = pad_value

    y_pred_padded = np.pad(y_predicted, self.prediction_length, pad_with)[
        self.prediction_length : -1, self.prediction_length : -self.prediction_length
    ]
    y_pred_padded = np.vstack(
        [np.roll(y_pred_padded[:, i], i, axis=0) for i in range(y_pred_padded.shape[1])]
    ).T

    result = pd.DataFrame(
        np.ones(shape=(len(future_dataframe), (2 + self.prediction_length))) * np.nan,
        columns=["ds", "y"] + [f"yhat{i}" for i in range(1, self.prediction_length + 1)],
    )
    result["ds"] = future_dataframe["ds"]
    result.loc[: len(future_dataframe) - (self.periods + 1), "y"] = (
        future_dataframe["y"].iloc[: len(future_dataframe) - (self.periods)].values
    )

    first_part = result.iloc[: self.context_length]
    second_part = result.iloc[self.context_length :]
    second_part.loc[:, [col for col in second_part.columns[2:]]] = y_pred_padded
    result = pd.concat([first_part, second_part])

    for i in range(1, self.prediction_length + 1):
        result[f"residual{i}"] = result[f"yhat{i}"] - result["y"]

    return result
data["cumFoam"] = data["cumFoam"] + 1e6 training = TimeSeriesDataSet( data[lambda x: x.date <= 300], time_idx="date", group_ids=["symbol"], target="open", #["open", "high", "low", "volume"], allow_missings=True, #group_ids=["agency", "sku"], min_encoder_length= 50, #max_encoder_length // 2, # allow encoder lengths from 0 to max_prediction_length max_encoder_length=100, #max_encoder_length, min_prediction_length=20, max_prediction_length=20, #max_prediction_length, static_categoricals=["symbol"], #static_reals=["avg_population_2017", "avg_yearly_household_income_2017"], #time_varying_known_categoricals=["special_days", "month"], #variable_groups={"special_days": special_days}, # group of categorical variables can be treated as one variable time_varying_known_reals=meta["knownReals"], #time_varying_unknown_categoricals=[],#meta["features"], time_varying_unknown_reals=meta["unknownReals"], #target_normalizer=NaNLabelEncoder(add_nan=True), target_normalizer=None, #GroupNormalizer( # groups=["symbol"], transformation="softplus", center=False #), # use softplus with beta=1.0 and normalize by group #add_relative_time_idx=True, #add_target_scales=True, add_encoder_length=True, categorical_encoders={"symbol": NaNLabelEncoder(add_nan=True)}) validation = TimeSeriesDataSet.from_dataset(training,
max_encoder_length = 60
max_prediction_length = 20

df_train_nbeats = df_train.copy()
df_train_nbeats = df_train_nbeats.reset_index()
df_train_nbeats = df_train_nbeats.reset_index()
df_train_nbeats["group"] = 0
df_train_nbeats_sub, df_train_nbeats_val = utilities.split_ts(df_train_nbeats)

nbeats_training = TimeSeriesDataSet(
    df_train_nbeats_sub,
    time_idx="index",
    target="y",
    categorical_encoders={"group": NaNLabelEncoder().fit(df_train_nbeats_sub["group"])},
    group_ids=["group"],
    time_varying_unknown_reals=["y"],
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
)

nbeats_validation = TimeSeriesDataSet.from_dataset(nbeats_training, df_train_nbeats_val)

# %%
batch_size = 128
nbeats_train_dataloader = nbeats_training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
nbeats_val_dataloader = nbeats_validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)
validation = data.series.sample(20)

max_encoder_length = 60
max_prediction_length = 20

training_cutoff = data["time_idx"].max() - max_prediction_length

training = TimeSeriesDataSet(
    data[lambda x: ~x.series.isin(validation)],
    time_idx="time_idx",
    target="value",
    categorical_encoders={"series": NaNLabelEncoder().fit(data.series)},
    group_ids=["series"],
    static_categoricals=["static"],
    min_encoder_length=max_encoder_length,
    max_encoder_length=max_encoder_length,
    min_prediction_length=max_prediction_length,
    max_prediction_length=max_prediction_length,
    time_varying_unknown_reals=["value"],
    time_varying_known_reals=["time_idx"],
    target_normalizer=GroupNormalizer(groups=["series"]),
    add_relative_time_idx=False,
    add_target_scales=True,
    randomize_length=None,
)

validation = TimeSeriesDataSet.from_dataset(
    training,
    data[lambda x: x.series.isin(validation)],
    # predict=True,
    stop_randomization=True,
)
training = TimeSeriesDataSet(
    data[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="volume",
    group_ids=["agency", "sku"],
    min_encoder_length=max_encoder_length // 2,  # allow encoder lengths from 0 to max_prediction_length
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=["agency", "sku"],
    static_reals=["avg_population_2017", "avg_yearly_household_income_2017"],
    time_varying_known_categoricals=["special_days", "month"],
    variable_groups={"special_days": special_days},  # group of categorical variables can be treated as one variable
    time_varying_known_reals=["time_idx", "price_regular", "discount_in_percent"],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=[
        "volume",
        "log_volume",
        "industry_volume",
        "soda_volume",
        "avg_max_temp",
        "avg_volume_by_agency",
        "avg_volume_by_sku",
    ],
    target_normalizer=GroupNormalizer(
        groups=["agency", "sku"], transformation="softplus", center=False
    ),  # use softplus with beta=1.0 and normalize by group
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)
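# A plausible continuation, mirroring the other snippets above: derive the validation set
# from the training dataset's parameters and build the dataloaders. Batch sizes are illustrative.
validation = TimeSeriesDataSet.from_dataset(training, data, predict=True, stop_randomization=True)
batch_size = 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=0)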