Пример #1
0
def test_GroupNormalizer(kwargs, groups):
    """Check that a fitted ``GroupNormalizer`` can invert its own encoding.

    A normalizer is built from baseline options overridden by ``kwargs``,
    fitted on a small five-row frame, and then called on the first encoded
    value together with the parameters from ``get_parameters([1])``.

    Args:
        kwargs: Option overrides for ``GroupNormalizer``; the mapping passed
            in is not modified.
        groups: Value used for the ``groups`` option. Group-wise scaling is
            switched off when it is empty.
    """
    frame = pd.DataFrame({"a": [1, 1, 2, 2, 3], "b": [1.1, 1.1, 1.0, 5.0, 1.1]})

    options = {
        "method": "standard",
        "transformation": None,
        "center": True,
        "scale_by_group": False,
    }
    options.update(kwargs)
    options["groups"] = groups
    # scaling by group only makes sense when there is at least one group
    options["scale_by_group"] = options["scale_by_group"] and len(groups) > 0

    transformation = options.get("transformation")
    if transformation in ("relu", "softplus"):
        # shift the sample so that some of its values become negative
        frame["b"] = frame["b"] - 2.0

    normalizer = GroupNormalizer(**options)
    encoded = normalizer.fit_transform(frame["b"], frame)

    sample = {
        "prediction": torch.tensor([encoded[0]]),
        "target_scale": torch.tensor(normalizer.get_parameters([1])).unsqueeze(0),
    }
    decoded = normalizer(sample)

    if transformation in ("relu", "softplus", "log1p"):
        # for these transformations only the sign of the output is checked
        assert (decoded >= 0).all(), "Inverse transform should yield only positive values"
    else:
        expected = torch.tensor(frame.b.iloc[0])
        assert torch.isclose(
            decoded, expected, atol=1e-5
        ).all(), "Inverse transform should reverse transform"
Пример #2
0
def dataloaders_with_covariates(data_with_covariates):
    """Build train and validation dataloaders for the ``volume`` target.

    Rows dated before the cutoff form the training ``TimeSeriesDataSet``.
    The validation set is derived from it over the full frame and starts
    one step after the largest training time index.

    Args:
        data_with_covariates: Frame providing at least the ``date``,
            ``time_idx``, ``volume``, ``agency``, ``sku`` and ``discount``
            columns referenced below.

    Returns:
        dict: ``{"train": ..., "val": ...}`` dataloaders with batch size 4.
    """
    cutoff = "2016-09-01"
    encoder_length = 36
    prediction_length = 6
    batch_size = 4

    train_frame = data_with_covariates[lambda frame: frame.date < cutoff]
    normalizer = GroupNormalizer(groups=["agency", "sku"], coerce_positive=False)
    training = TimeSeriesDataSet(
        train_frame,
        time_idx="time_idx",
        target="volume",
        # weight="weight",
        group_ids=["agency", "sku"],
        time_varying_known_reals=["discount"],
        time_varying_unknown_reals=["volume"],
        static_categoricals=["agency"],
        max_encoder_length=encoder_length,
        max_prediction_length=prediction_length,
        add_relative_time_idx=True,
        target_normalizer=normalizer,
    )

    # validation predictions begin right after the last training time index
    first_validation_idx = training.index.time.max() + 1
    validation = TimeSeriesDataSet.from_dataset(
        training,
        data_with_covariates,
        min_prediction_idx=first_validation_idx,
    )

    return {
        "train": training.to_dataloader(train=True, batch_size=batch_size, num_workers=0),
        "val": validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0),
    }
Пример #3
0
def dataloaders_with_covariates(data_with_covariates):
    """Return the result of ``make_dataloaders`` for the given frame.

    The call uses ``discount`` as a known real, ``volume`` as an unknown
    real, ``agency`` as a static categorical, a relative time index, and a
    ``GroupNormalizer`` grouped by agency and sku.

    Args:
        data_with_covariates: Frame forwarded unchanged to
            ``make_dataloaders``.
    """
    normalizer = GroupNormalizer(groups=["agency", "sku"])
    options = {
        "time_varying_known_reals": ["discount"],
        "time_varying_unknown_reals": ["volume"],
        "static_categoricals": ["agency"],
        "add_relative_time_idx": True,
        "target_normalizer": normalizer,
    }
    return make_dataloaders(data_with_covariates, **options)
Пример #4
0
def dataloaders_with_covariates(data_with_covariates):
    """Return the result of ``make_dataloaders`` for the ``target`` column.

    A copy of the incoming frame is handed over, so the caller's frame is
    not the object passed on. The target normalizer is grouped by agency
    and sku and created with ``center=False``; no relative time index is
    added.

    Args:
        data_with_covariates: Frame that is copied before being forwarded.
    """
    frame = data_with_covariates.copy()
    normalizer = GroupNormalizer(groups=["agency", "sku"], center=False)
    options = {
        "target": "target",
        "time_varying_known_reals": ["discount"],
        "time_varying_unknown_reals": ["target"],
        "static_categoricals": ["agency"],
        "add_relative_time_idx": False,
        "target_normalizer": normalizer,
    }
    return make_dataloaders(frame, **options)
Пример #5
0
def dataloaders_with_covariates(data_with_covariates):
    """Return the result of ``make_dataloaders`` for a clipped target.

    A ``target`` column holding ``volume`` clipped to ``[1e-3, 1.0]`` is
    written into the incoming frame itself (in place) before the frame is
    forwarded.

    Args:
        data_with_covariates: Frame with a ``volume`` column; it gains or
            overwrites a ``target`` column as a side effect.
    """
    clipped_volume = data_with_covariates["volume"].clip(1e-3, 1.0)
    # NOTE: this mutates the caller's frame rather than a copy
    data_with_covariates["target"] = clipped_volume

    normalizer = GroupNormalizer(groups=["agency", "sku"], center=False)
    options = {
        "target": "target",
        "time_varying_known_reals": ["discount"],
        "time_varying_unknown_reals": ["target"],
        "static_categoricals": ["agency"],
        "add_relative_time_idx": True,
        "target_normalizer": normalizer,
    }
    return make_dataloaders(data_with_covariates, **options)
Пример #6
0
def test_GroupNormalizer(kwargs, groups):
    """Check ``GroupNormalizer`` round-tripping with the legacy options.

    Baseline options are overridden by ``kwargs``. Combining
    ``coerce_positive`` with ``log_scale`` is expected to raise an
    ``AssertionError`` at construction. Otherwise the normalizer is fitted
    on a five-row frame and called on the first encoded value together
    with the parameters from ``get_parameters([1])``.

    Args:
        kwargs: Option overrides for ``GroupNormalizer``; the mapping passed
            in is not modified.
        groups: Value used for the ``groups`` option. Group-wise scaling is
            switched off when it is empty.
    """
    frame = pd.DataFrame({"a": [1, 1, 2, 2, 3], "b": [1.1, 1.1, 1.0, 5.0, 1.1]})

    options = {
        "method": "standard",
        "log_scale": False,
        "coerce_positive": False,
        "center": True,
        "log_zero_value": 0.0,
        "scale_by_group": False,
    }
    options.update(kwargs)
    options["groups"] = groups
    # scaling by group only makes sense when there is at least one group
    options["scale_by_group"] = options["scale_by_group"] and len(groups) > 0

    force_positive = options["coerce_positive"]

    if force_positive and options["log_scale"]:
        # this option combination must be rejected by the constructor
        with pytest.raises(AssertionError):
            GroupNormalizer(**options)
        return

    if force_positive:
        # shift the sample so that some of its values become negative
        frame["b"] = frame["b"] - 2.0

    normalizer = GroupNormalizer(**options)
    encoded = normalizer.fit_transform(frame["b"], frame)

    sample = {
        "prediction": torch.tensor([encoded.iloc[0]]),
        "target_scale": torch.tensor(normalizer.get_parameters([1])).unsqueeze(0),
    }
    decoded = normalizer(sample)

    if force_positive:
        assert (decoded >= 0).all(), "Inverse transform should yield only positive values"
    else:
        expected = torch.tensor(frame.b.iloc[0])
        assert torch.isclose(
            decoded, expected, atol=1e-5
        ).all(), "Inverse transform should reverse transform"
Пример #7
0
 def create_data_spec(self) -> dict:
     """Assemble dataset keyword arguments from ``self.cfg``.

     Index, target and grouping keys are read directly from the config.
     The ``static``, ``known`` and ``unknown`` config sections each
     contribute their ``categorical`` and ``numerical`` members.

     Returns:
         dict: Keyword arguments describing the dataset, including a
         ``GroupNormalizer`` grouped by the configured target keys.
     """
     cfg = self.cfg
     group_keys = cfg.get("target_keys")
     static = cfg.get("static")
     known = cfg.get("known")
     unknown = cfg.get("unknown")

     # use softplus with beta=1.0 and normalize by group
     normalizer = GroupNormalizer(groups=group_keys, coerce_positive=1.0)

     return {
         "time_idx": cfg.get("time_index"),
         "target": cfg.get("target"),
         "group_ids": group_keys,
         "target_normalizer": normalizer,
         "static_categoricals": static.categorical,
         "static_reals": static.numerical,
         "time_varying_known_categoricals": known.categorical,
         "variable_groups": cfg.get("variable_groups"),
         "time_varying_known_reals": known.numerical,
         "time_varying_unknown_categoricals": unknown.categorical,
         "time_varying_unknown_reals": unknown.numerical,
     }
Пример #8
0
def test_MultiNormalizer_fitted():
    """Check the fitted state of a ``MultiNormalizer`` before and after ``fit``.

    Before fitting, ``check_is_fitted`` must raise ``NotFittedError`` for
    the composite normalizer. After ``fit`` neither the composite nor its
    two member normalizers may raise.
    """
    data = pd.DataFrame(
        dict(
            a=[1, 1, 2, 2, 3],
            b=[1.1, 1.1, 1.0, 5.0, 1.1],
            c=[1.1, 1.1, 1.0, 5.0, 1.1],
        )
    )

    normalizer = MultiNormalizer([GroupNormalizer(groups=["a"]), TorchNormalizer()])

    with pytest.raises(NotFittedError):
        check_is_fitted(normalizer)

    normalizer.fit(data, data)

    try:
        check_is_fitted(normalizer.normalizers[0])
        check_is_fitted(normalizer.normalizers[1])
        check_is_fitted(normalizer)
    except NotFittedError as err:
        # Report the raised instance: formatting the exception class itself
        # would hide which estimator was reported as unfitted.
        pytest.fail(f"normalizer reported as not fitted after fit(): {err}")
Пример #9
0
         time_varying_known_reals=[
             "time_idx", "price_regular", "price_actual", "discount",
             "discount_in_percent"
         ],
         time_varying_unknown_categoricals=[],
         time_varying_unknown_reals=[
             "volume", "log_volume", "industry_volume", "soda_volume",
             "avg_max_temp"
         ],
         constant_fill_strategy={"volume": 0},
         categorical_encoders={"sku": NaNLabelEncoder(add_nan=True)},
     ),
     dict(static_categoricals=["agency", "sku"]),
     dict(randomize_length=True, min_encoder_length=2),
     dict(target_normalizer=EncoderNormalizer(), min_encoder_length=2),
     dict(target_normalizer=GroupNormalizer(transformation="log1p")),
     dict(target_normalizer=GroupNormalizer(groups=["agency", "sku"],
                                            transformation="softplus",
                                            center=False)),
     dict(target="agency"),
     # test multiple targets
     dict(target=["industry_volume", "volume"]),
     dict(target=["agency", "volume"]),
     dict(target=["agency", "volume"],
          min_encoder_length=1,
          min_prediction_length=1),
     dict(target=["agency", "volume"], weight="volume"),
     # test weights
     dict(target="volume", weight="volume"),
 ],
 scope="session",
Пример #10
0
             "labor_day",
             "independence_day",
             "revolution_day_memorial",
             "regional_games",
             "fifa_u_17_world_cup",
             "football_gold_cup",
             "beer_capital",
             "music_fest",
         ]
     ),
 ),
 dict(time_varying_known_reals=["time_idx", "price_regular", "discount_in_percent"]),
 dict(time_varying_unknown_reals=["volume", "log_volume", "industry_volume", "soda_volume", "avg_max_temp"]),
 dict(
     target_normalizer=GroupNormalizer(
         groups=["agency", "sku"], log_scale=True, scale_by_group=True, log_zero_value=1.0
     )
 ),
 dict(target_normalizer=EncoderNormalizer(), min_encoder_length=2),
 dict(randomize_length=True, min_encoder_length=2, min_prediction_length=1),
 dict(predict_mode=True),
 dict(add_target_scales=True),
 dict(add_encoder_length=True),
 dict(add_encoder_length=True),
 dict(add_relative_time_idx=True),
 dict(weight="volume"),
 dict(
     scalers=dict(time_idx=GroupNormalizer(), price_regular=StandardScaler()),
     categorical_encoders=dict(month=NaNLabelEncoder()),
     time_varying_known_categoricals=["month"],
     time_varying_known_reals=["time_idx", "price_regular"],
Пример #11
0
         "fifa_u_17_world_cup",
         "football_gold_cup",
         "beer_capital",
         "music_fest",
     ]),
 ),
 dict(time_varying_known_reals=[
     "time_idx", "price_regular", "discount_in_percent"
 ]),
 dict(time_varying_unknown_reals=[
     "volume", "log_volume", "industry_volume", "soda_volume",
     "avg_max_temp"
 ]),
 dict(target_normalizer=GroupNormalizer(
     groups=["agency", "sku"],
     transformation="log1p",
     scale_by_group=True,
 )),
 dict(target_normalizer=EncoderNormalizer(), min_encoder_length=2),
 dict(randomize_length=True,
      min_encoder_length=2,
      min_prediction_length=1),
 dict(predict_mode=True),
 dict(add_target_scales=True),
 dict(add_encoder_length=True),
 dict(add_encoder_length=True),
 dict(add_relative_time_idx=True),
 dict(weight="volume"),
 dict(
     scalers=dict(time_idx=GroupNormalizer(),
                  price_regular=StandardScaler()),
Пример #12
0
        time_varying_known_reals=[
            "time_idx", "price_regular", "price_actual", "discount",
            "discount_in_percent"
        ],
        time_varying_unknown_categoricals=[],
        time_varying_unknown_reals=[
            "volume", "log_volume", "industry_volume", "soda_volume",
            "avg_max_temp"
        ],
        constant_fill_strategy={"volume": 0},
        dropout_categoricals=["sku"],
    ),
    dict(static_categoricals=["agency", "sku"]),
    dict(randomize_length=True, min_encoder_length=2),
    dict(target_normalizer=EncoderNormalizer(), min_encoder_length=2),
    dict(target_normalizer=GroupNormalizer(log_scale=True)),
    dict(target_normalizer=GroupNormalizer(groups=["agency", "sku"],
                                           coerce_positive=1.0)),
    dict(target="agency"),
])
def multiple_dataloaders_with_covariates(data_with_covariates, request):
    training_cutoff = "2016-09-01"
    max_encoder_length = 36
    max_prediction_length = 6

    params = request.param
    params.setdefault("target", "volume")

    training = TimeSeriesDataSet(
        data_with_covariates[lambda x: x.date < training_cutoff],
        time_idx="time_idx",
Пример #13
0
             "independence_day",
             "revolution_day_memorial",
             "regional_games",
             "fifa_u_17_world_cup",
             "football_gold_cup",
             "beer_capital",
             "music_fest",
         ]
     ),
 ),
 dict(time_varying_known_reals=["time_idx", "price_regular", "discount_in_percent"]),
 dict(time_varying_unknown_reals=["volume", "log_volume", "industry_volume", "soda_volume", "avg_max_temp"]),
 dict(
     target_normalizer=GroupNormalizer(
         groups=["agency", "sku"],
         transformation="log1p",
         scale_by_group=True,
     )
 ),
 dict(target_normalizer=EncoderNormalizer(), min_encoder_length=2),
 dict(randomize_length=True, min_encoder_length=2, min_prediction_length=1),
 dict(predict_mode=True),
 dict(add_target_scales=True),
 dict(add_encoder_length=True),
 dict(add_encoder_length=True),
 dict(add_relative_time_idx=True),
 dict(weight="volume"),
 dict(
     scalers=dict(time_idx=GroupNormalizer(), price_regular=StandardScaler()),
     categorical_encoders=dict(month=NaNLabelEncoder()),
     time_varying_known_categoricals=["month"],
         "regional_games",
         "fifa_u_17_world_cup",
         "football_gold_cup",
         "beer_capital",
         "music_fest",
     ]),
 ),
 dict(time_varying_known_reals=[
     "time_idx", "price_regular", "discount_in_percent"
 ]),
 dict(time_varying_unknown_reals=[
     "volume", "log_volume", "industry_volume", "soda_volume",
     "avg_max_temp"
 ]),
 dict(target_normalizer=GroupNormalizer(groups=["agency", "sku"],
                                        log_scale=True,
                                        scale_by_group=True,
                                        log_zero_value=1.0)),
 dict(target_normalizer=EncoderNormalizer(), min_encoder_length=2),
 dict(randomize_length=True,
      min_encoder_length=2,
      min_prediction_length=1),
 dict(predict_mode=True),
 dict(add_target_scales=True),
 dict(add_encoder_length=True),
 dict(add_encoder_length=True),
 dict(add_relative_time_idx=True),
 dict(weight="volume"),
 dict(
     scalers=dict(time_idx=GroupNormalizer(),
                  price_regular=StandardScaler()),
     categorical_encoders=dict(month=NaNLabelEncoder()),
    variable_groups={"special_days": special_days},
    time_varying_known_reals=[
        "time_idx", "price_regular", "discount_in_percent"
    ],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=[
        "volume",
        "log_volume",
        "industry_volume",
        "soda_volume",
        "avg_max_temp",
        "avg_volume_by_agency",
        "avg_volume_by_sku",
    ],
    target_normalizer=GroupNormalizer(
        groups=["agency", "sku"], coerce_positive=1.0
    ),  # use softplus with beta=1.0 and normalize by group
    add_relative_time_idx=True,  # add as feature
    add_target_scales=True,  # add as feature
    add_encoder_length=True,  # add as feature
)

# create validation set (predict=True) which means to predict the
# last max_prediction_length points in time for each series
validation = TimeSeriesDataSet.from_dataset(training,
                                            data,
                                            predict=True,
                                            stop_randomization=True)
# create dataloaders for model
batch_size = 128
train_dataloader = training.to_dataloader(train=True,