def test_GroupNormalizer(kwargs, groups):
    data = pd.DataFrame(dict(a=[1, 1, 2, 2, 3], b=[1.1, 1.1, 1.0, 5.0, 1.1]))
    defaults = dict(method="standard", transformation=None, center=True, scale_by_group=False)
    defaults.update(kwargs)
    kwargs = defaults
    kwargs["groups"] = groups
    kwargs["scale_by_group"] = kwargs["scale_by_group"] and len(kwargs["groups"]) > 0

    if kwargs.get("transformation") in ["relu", "softplus"]:
        data.b = data.b - 2.0
    normalizer = GroupNormalizer(**kwargs)
    encoded = normalizer.fit_transform(data["b"], data)

    test_data = dict(
        prediction=torch.tensor([encoded[0]]),
        target_scale=torch.tensor(normalizer.get_parameters([1])).unsqueeze(0),
    )

    if kwargs.get("transformation") in ["relu", "softplus", "log1p"]:
        assert (normalizer(test_data) >= 0).all(), "Inverse transform should yield only positive values"
    else:
        assert torch.isclose(
            normalizer(test_data), torch.tensor(data.b.iloc[0]), atol=1e-5
        ).all(), "Inverse transform should reverse transform"
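# The test above is driven by pytest parametrization supplying `kwargs` and
# `groups`. A minimal sketch of such a decorator stack; the specific parameter
# combinations below are illustrative assumptions, not the project's actual list:
@pytest.mark.parametrize(
    "kwargs",
    [
        {},
        dict(method="robust"),
        dict(transformation="log1p"),
        dict(transformation="softplus", center=False),
        dict(scale_by_group=True),
    ],
)
@pytest.mark.parametrize("groups", [[], ["a"]])
def test_GroupNormalizer(kwargs, groups): ...  # body as defined above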
def dataloaders_with_covariates(data_with_covariates):
    training_cutoff = "2016-09-01"
    max_encoder_length = 36
    max_prediction_length = 6

    training = TimeSeriesDataSet(
        data_with_covariates[lambda x: x.date < training_cutoff],
        time_idx="time_idx",
        target="volume",
        # weight="weight",
        group_ids=["agency", "sku"],
        time_varying_known_reals=["discount"],
        time_varying_unknown_reals=["volume"],
        static_categoricals=["agency"],
        max_encoder_length=max_encoder_length,
        max_prediction_length=max_prediction_length,
        add_relative_time_idx=True,
        target_normalizer=GroupNormalizer(groups=["agency", "sku"], coerce_positive=False),
    )

    # validation starts right after the last training time index
    validation = TimeSeriesDataSet.from_dataset(
        training, data_with_covariates, min_prediction_idx=training.index.time.max() + 1
    )
    batch_size = 4
    train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
    val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

    return dict(train=train_dataloader, val=val_dataloader)
def dataloaders_with_covariates(data_with_covariates):
    return make_dataloaders(
        data_with_covariates,
        time_varying_known_reals=["discount"],
        time_varying_unknown_reals=["volume"],
        static_categoricals=["agency"],
        add_relative_time_idx=True,
        target_normalizer=GroupNormalizer(groups=["agency", "sku"]),
    )
def dataloaders_with_covariates(data_with_covariates):
    return make_dataloaders(
        data_with_covariates.copy(),
        target="target",
        time_varying_known_reals=["discount"],
        time_varying_unknown_reals=["target"],
        static_categoricals=["agency"],
        add_relative_time_idx=False,
        target_normalizer=GroupNormalizer(groups=["agency", "sku"], center=False),
    )
def dataloaders_with_covariates(data_with_covariates):
    data_with_covariates["target"] = data_with_covariates["volume"].clip(1e-3, 1.0)
    return make_dataloaders(
        data_with_covariates,
        target="target",
        time_varying_known_reals=["discount"],
        time_varying_unknown_reals=["target"],
        static_categoricals=["agency"],
        add_relative_time_idx=True,
        target_normalizer=GroupNormalizer(groups=["agency", "sku"], center=False),
    )
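# The fixture variants above delegate to a `make_dataloaders` helper that is
# not shown here. A minimal sketch of such a helper, reconstructed from the
# fully expanded `dataloaders_with_covariates` fixture at the top of this group;
# the cutoff, window lengths, group ids, and validation split are assumptions
# carried over from that fixture:
def make_dataloaders(data_with_covariates, target="volume", **kwargs):
    training_cutoff = "2016-09-01"

    training = TimeSeriesDataSet(
        data_with_covariates[lambda x: x.date < training_cutoff],
        time_idx="time_idx",
        target=target,
        group_ids=["agency", "sku"],
        max_encoder_length=36,
        max_prediction_length=6,
        **kwargs,  # covariates, normalizers, add_relative_time_idx, ...
    )
    # validation starts right after the last training time index
    validation = TimeSeriesDataSet.from_dataset(
        training, data_with_covariates, min_prediction_idx=training.index.time.max() + 1
    )

    batch_size = 4
    train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
    val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)
    return dict(train=train_dataloader, val=val_dataloader)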
def test_GroupNormalizer(kwargs, groups):
    data = pd.DataFrame(dict(a=[1, 1, 2, 2, 3], b=[1.1, 1.1, 1.0, 5.0, 1.1]))
    defaults = dict(
        method="standard",
        log_scale=False,
        coerce_positive=False,
        center=True,
        log_zero_value=0.0,
        scale_by_group=False,
    )
    defaults.update(kwargs)
    kwargs = defaults
    kwargs["groups"] = groups
    kwargs["scale_by_group"] = kwargs["scale_by_group"] and len(kwargs["groups"]) > 0

    if kwargs["coerce_positive"] and kwargs["log_scale"]:
        # coercing to positive and log-scaling are mutually exclusive
        with pytest.raises(AssertionError):
            normalizer = GroupNormalizer(**kwargs)
    else:
        if kwargs["coerce_positive"]:
            data.b = data.b - 2.0
        normalizer = GroupNormalizer(**kwargs)
        encoded = normalizer.fit_transform(data["b"], data)

        test_data = dict(
            prediction=torch.tensor([encoded.iloc[0]]),
            target_scale=torch.tensor(normalizer.get_parameters([1])).unsqueeze(0),
        )

        if kwargs["coerce_positive"]:
            assert (normalizer(test_data) >= 0).all(), "Inverse transform should yield only positive values"
        else:
            assert torch.isclose(
                normalizer(test_data), torch.tensor(data.b.iloc[0]), atol=1e-5
            ).all(), "Inverse transform should reverse transform"
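# Note: the variant above uses the older GroupNormalizer signature. To my
# understanding, later pytorch-forecasting releases folded `log_scale`,
# `log_zero_value`, and `coerce_positive` into the single `transformation`
# argument used by the first `test_GroupNormalizer` variant in this section.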
def create_data_spec(self) -> dict:
    cfg = self.cfg
    data_spec = dict(
        time_idx=cfg.get("time_index"),
        target=cfg.get("target"),
        group_ids=cfg.get("target_keys"),
        target_normalizer=GroupNormalizer(
            groups=cfg.get("target_keys"), coerce_positive=1.0
        ),  # use softplus with beta=1.0 and normalize by group
        static_categoricals=cfg.get("static").categorical,
        static_reals=cfg.get("static").numerical,
        time_varying_known_categoricals=cfg.get("known").categorical,
        variable_groups=cfg.get("variable_groups"),
        time_varying_known_reals=cfg.get("known").numerical,
        time_varying_unknown_categoricals=cfg.get("unknown").categorical,
        time_varying_unknown_reals=cfg.get("unknown").numerical,
    )
    return data_spec
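# Hedged usage sketch: the returned spec is presumably unpacked into a
# TimeSeriesDataSet. `spec_builder`, `train_df`, and the window lengths are
# hypothetical names/values for illustration only.
data_spec = spec_builder.create_data_spec()
training = TimeSeriesDataSet(
    train_df,
    max_encoder_length=36,
    max_prediction_length=6,
    **data_spec,
)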
def test_MultiNormalizer_fitted():
    data = pd.DataFrame(
        dict(a=[1, 1, 2, 2, 3], b=[1.1, 1.1, 1.0, 5.0, 1.1], c=[1.1, 1.1, 1.0, 5.0, 1.1])
    )

    normalizer = MultiNormalizer([GroupNormalizer(groups=["a"]), TorchNormalizer()])

    with pytest.raises(NotFittedError):
        check_is_fitted(normalizer)

    normalizer.fit(data, data)

    try:
        check_is_fitted(normalizer.normalizers[0])
        check_is_fitted(normalizer.normalizers[1])
        check_is_fitted(normalizer)
    except NotFittedError:
        pytest.fail(f"{NotFittedError}")
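# Usage note (illustrative): once fitted, each sub-normalizer can be used on
# its own with the usual interface, e.g. retrieving the scale parameters of the
# first sub-normalizer for group value 1, mirroring the GroupNormalizer tests:
# scale = torch.tensor(normalizer.normalizers[0].get_parameters([1])).unsqueeze(0)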
            time_varying_known_reals=[
                "time_idx", "price_regular", "price_actual", "discount", "discount_in_percent"
            ],
            time_varying_unknown_categoricals=[],
            time_varying_unknown_reals=[
                "volume", "log_volume", "industry_volume", "soda_volume", "avg_max_temp"
            ],
            constant_fill_strategy={"volume": 0},
            categorical_encoders={"sku": NaNLabelEncoder(add_nan=True)},
        ),
        dict(static_categoricals=["agency", "sku"]),
        dict(randomize_length=True, min_encoder_length=2),
        dict(target_normalizer=EncoderNormalizer(), min_encoder_length=2),
        dict(target_normalizer=GroupNormalizer(transformation="log1p")),
        dict(target_normalizer=GroupNormalizer(groups=["agency", "sku"], transformation="softplus", center=False)),
        dict(target="agency"),
        # test multiple targets
        dict(target=["industry_volume", "volume"]),
        dict(target=["agency", "volume"]),
        dict(target=["agency", "volume"], min_encoder_length=1, min_prediction_length=1),
        dict(target=["agency", "volume"], weight="volume"),
        # test weights
        dict(target="volume", weight="volume"),
    ],
    scope="session",
"labor_day", "independence_day", "revolution_day_memorial", "regional_games", "fifa_u_17_world_cup", "football_gold_cup", "beer_capital", "music_fest", ] ), ), dict(time_varying_known_reals=["time_idx", "price_regular", "discount_in_percent"]), dict(time_varying_unknown_reals=["volume", "log_volume", "industry_volume", "soda_volume", "avg_max_temp"]), dict( target_normalizer=GroupNormalizer( groups=["agency", "sku"], log_scale=True, scale_by_group=True, log_zero_value=1.0 ) ), dict(target_normalizer=EncoderNormalizer(), min_encoder_length=2), dict(randomize_length=True, min_encoder_length=2, min_prediction_length=1), dict(predict_mode=True), dict(add_target_scales=True), dict(add_encoder_length=True), dict(add_encoder_length=True), dict(add_relative_time_idx=True), dict(weight="volume"), dict( scalers=dict(time_idx=GroupNormalizer(), price_regular=StandardScaler()), categorical_encoders=dict(month=NaNLabelEncoder()), time_varying_known_categoricals=["month"], time_varying_known_reals=["time_idx", "price_regular"],
"fifa_u_17_world_cup", "football_gold_cup", "beer_capital", "music_fest", ]), ), dict(time_varying_known_reals=[ "time_idx", "price_regular", "discount_in_percent" ]), dict(time_varying_unknown_reals=[ "volume", "log_volume", "industry_volume", "soda_volume", "avg_max_temp" ]), dict(target_normalizer=GroupNormalizer( groups=["agency", "sku"], transformation="log1p", scale_by_group=True, )), dict(target_normalizer=EncoderNormalizer(), min_encoder_length=2), dict(randomize_length=True, min_encoder_length=2, min_prediction_length=1), dict(predict_mode=True), dict(add_target_scales=True), dict(add_encoder_length=True), dict(add_encoder_length=True), dict(add_relative_time_idx=True), dict(weight="volume"), dict( scalers=dict(time_idx=GroupNormalizer(), price_regular=StandardScaler()),
            time_varying_known_reals=[
                "time_idx", "price_regular", "price_actual", "discount", "discount_in_percent"
            ],
            time_varying_unknown_categoricals=[],
            time_varying_unknown_reals=[
                "volume", "log_volume", "industry_volume", "soda_volume", "avg_max_temp"
            ],
            constant_fill_strategy={"volume": 0},
            dropout_categoricals=["sku"],
        ),
        dict(static_categoricals=["agency", "sku"]),
        dict(randomize_length=True, min_encoder_length=2),
        dict(target_normalizer=EncoderNormalizer(), min_encoder_length=2),
        dict(target_normalizer=GroupNormalizer(log_scale=True)),
        dict(target_normalizer=GroupNormalizer(groups=["agency", "sku"], coerce_positive=1.0)),
        dict(target="agency"),
    ])
def multiple_dataloaders_with_covariates(data_with_covariates, request):
    training_cutoff = "2016-09-01"
    max_encoder_length = 36
    max_prediction_length = 6

    params = request.param
    params.setdefault("target", "volume")

    training = TimeSeriesDataSet(
        data_with_covariates[lambda x: x.date < training_cutoff],
        time_idx="time_idx",
"independence_day", "revolution_day_memorial", "regional_games", "fifa_u_17_world_cup", "football_gold_cup", "beer_capital", "music_fest", ] ), ), dict(time_varying_known_reals=["time_idx", "price_regular", "discount_in_percent"]), dict(time_varying_unknown_reals=["volume", "log_volume", "industry_volume", "soda_volume", "avg_max_temp"]), dict( target_normalizer=GroupNormalizer( groups=["agency", "sku"], transformation="log1p", scale_by_group=True, ) ), dict(target_normalizer=EncoderNormalizer(), min_encoder_length=2), dict(randomize_length=True, min_encoder_length=2, min_prediction_length=1), dict(predict_mode=True), dict(add_target_scales=True), dict(add_encoder_length=True), dict(add_encoder_length=True), dict(add_relative_time_idx=True), dict(weight="volume"), dict( scalers=dict(time_idx=GroupNormalizer(), price_regular=StandardScaler()), categorical_encoders=dict(month=NaNLabelEncoder()), time_varying_known_categoricals=["month"],
"regional_games", "fifa_u_17_world_cup", "football_gold_cup", "beer_capital", "music_fest", ]), ), dict(time_varying_known_reals=[ "time_idx", "price_regular", "discount_in_percent" ]), dict(time_varying_unknown_reals=[ "volume", "log_volume", "industry_volume", "soda_volume", "avg_max_temp" ]), dict(target_normalizer=GroupNormalizer(groups=["agency", "sku"], log_scale=True, scale_by_group=True, log_zero_value=1.0)), dict(target_normalizer=EncoderNormalizer(), min_encoder_length=2), dict(randomize_length=True, min_encoder_length=2, min_prediction_length=1), dict(predict_mode=True), dict(add_target_scales=True), dict(add_encoder_length=True), dict(add_encoder_length=True), dict(add_relative_time_idx=True), dict(weight="volume"), dict( scalers=dict(time_idx=GroupNormalizer(), price_regular=StandardScaler()), categorical_encoders=dict(month=NaNLabelEncoder()),
variable_groups={"special_days": special_days}, time_varying_known_reals=[ "time_idx", "price_regular", "discount_in_percent" ], time_varying_unknown_categoricals=[], time_varying_unknown_reals=[ "volume", "log_volume", "industry_volume", "soda_volume", "avg_max_temp", "avg_volume_by_agency", "avg_volume_by_sku", ], target_normalizer=GroupNormalizer( groups=["agency", "sku"], coerce_positive=1.0 ), # use softplus with beta=1.0 and normalize by group add_relative_time_idx=True, # add as feature add_target_scales=True, # add as feature add_encoder_length=True, # add as feature ) # create validation set (predict=True) which means to predict the # last max_prediction_length points in time for each series validation = TimeSeriesDataSet.from_dataset(training, data, predict=True, stop_randomization=True) # create dataloaders for model batch_size = 128 train_dataloader = training.to_dataloader(train=True,