def test_from_dataset_equivalence(test_data):
    """Validation sets built via ``from_dataset`` from different data slices
    must produce identical batches when ``predict=True``."""
    training = TimeSeriesDataSet(
        test_data[lambda x: x.time_idx < x.time_idx.max() - 1],
        time_idx="time_idx",
        target="volume",
        time_varying_known_reals=["price_regular", "time_idx"],
        group_ids=["agency", "sku"],
        static_categoricals=["agency"],
        max_encoder_length=3,
        max_prediction_length=2,
        min_prediction_length=1,
        min_encoder_length=0,
        randomize_length=None,
        add_encoder_length=True,
        add_relative_time_idx=True,
        add_target_scales=True,
    )
    validation1 = TimeSeriesDataSet.from_dataset(training, test_data, predict=True)
    validation2 = TimeSeriesDataSet.from_dataset(
        training,
        test_data[lambda x: x.time_idx > x.time_idx.min() + 2],
        predict=True,
    )
    # ensure validation1 and validation2 datasets are exactly the same despite different data inputs
    loader_a = validation1.to_dataloader(train=False)
    loader_b = validation2.to_dataloader(train=False)
    for (x_a, y_a), (x_b, y_b) in zip(loader_a, loader_b):
        for key, value_a in x_a.items():
            value_b = x_b[key]
            if isinstance(value_a, (tuple, list)):
                # sequence-valued entries are compared element-wise
                assert len(value_a) == len(value_b)
                for item_a, item_b in zip(value_a, value_b):
                    assert torch.isclose(item_a, item_b).all()
            else:
                assert torch.isclose(value_a, value_b).all()
        # targets must match as well
        assert torch.isclose(y_a[0], y_b[0]).all()
def test_new_group_ids(test_data, kwargs):
    """Test for new group ids in dataset"""
    first_agency = test_data["agency"].iloc[0]
    # NaN-aware encoders so that unseen group ids map to the NaN category
    encoders = {
        "agency": NaNLabelEncoder(add_nan=True),
        "sku": NaNLabelEncoder(add_nan=True),
    }
    train_dataset = TimeSeriesDataSet(
        test_data[lambda x: x.agency == first_agency],
        time_idx="time_idx",
        target="volume",
        group_ids=["agency", "sku"],
        max_encoder_length=5,
        max_prediction_length=2,
        min_prediction_length=1,
        min_encoder_length=1,
        categorical_encoders=encoders,
        **kwargs,
    )
    # sampling from the training dataset must work
    next(iter(train_dataset.to_dataloader()))
    # build a test dataset containing group ids never observed during training
    test_dataset = TimeSeriesDataSet.from_dataset(train_dataset, test_data)
    # iterating through it must not raise
    for _ in test_dataset.to_dataloader():
        pass
def create_dataset(self, df: pandas.DataFrame) -> Tuple[TimeSeriesDataSet, TimeSeriesDataSet]:
    """Build the training and validation ``TimeSeriesDataSet`` pair.

    Args:
        df: full history frame containing the configured time index column.

    Returns:
        Tuple of (training dataset, validation dataset). The validation set
        is built with ``predict=True``, i.e. it predicts the last
        ``max_prediction_length`` points in time for each series.
    """
    data_spec = self.create_data_spec()
    preprocess_spec = dict(
        add_relative_time_idx=True,  # add as feature
        add_target_scales=True,  # add as feature
        add_encoder_length=True,  # add as feature
    )
    prediction_spec = self.create_prediction_spec()
    # BUG FIX: the original read the bare global ``cfg``; the config object
    # lives on ``self`` (as the next line's ``self.cfg`` shows).
    time_index_col = self.cfg.get("time_index")
    training_cutoff = df[time_index_col].max() - self.cfg.get("max_prediction_length")
    trainset = TimeSeriesDataSet(
        # filter on the configured time index column rather than the
        # hard-coded ``time_idx`` attribute, keeping the cutoff consistent
        # with how it was computed above
        df[lambda x: x[time_index_col] <= training_cutoff],
        **data_spec,
        **preprocess_spec,
        **prediction_spec,
    )
    # create validation set (predict=True) which means to predict the
    # last max_prediction_length points in time for each series
    validset = TimeSeriesDataSet.from_dataset(
        trainset, df, predict=True, stop_randomization=True
    )
    return trainset, validset
def test_min_prediction_idx(test_dataset, test_data, min_prediction_idx):
    """Decoder time indices must never fall below ``min_prediction_idx``."""
    dataset = TimeSeriesDataSet.from_dataset(
        test_dataset,
        test_data,
        min_prediction_idx=min_prediction_idx,
        min_encoder_length=1,
        max_prediction_length=10,
    )
    dataloader = dataset.to_dataloader(num_workers=0, batch_size=1000)
    for batch_x, _ in dataloader:
        assert batch_x["decoder_time_idx"].min() >= min_prediction_idx
def test_from_dataset(test_dataset, test_data):
    """A dataset derived via ``from_dataset`` must yield valid batches."""
    derived = TimeSeriesDataSet.from_dataset(test_dataset, test_data)
    first_batch = next(iter(derived.to_dataloader(num_workers=0)))
    check_dataloader_output(derived, first_batch)
"avg_max_temp", "avg_volume_by_agency", "avg_volume_by_sku", ], target_normalizer=GroupNormalizer( groups=["agency", "sku"], coerce_positive=1.0 ), # use softplus with beta=1.0 and normalize by group add_relative_time_idx=True, # add as feature add_target_scales=True, # add as feature add_encoder_length=True, # add as feature ) # create validation set (predict=True) which means to predict the # last max_prediction_length points in time for each series validation = TimeSeriesDataSet.from_dataset(training, data, predict=True, stop_randomization=True) # create dataloaders for model batch_size = 128 train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0) val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers=0) #%% """ Training the Temporal Fusion Transformer with PyTorch Lightning """