def test_add_time_features_df():
    """Tests add_time_features_df"""
    # create indexed input
    date_list = pd.date_range(
        start=datetime.datetime(2019, 1, 1),
        periods=100,
        freq="H").tolist()
    df0 = pd.DataFrame({TIME_COL: date_list}, index=date_list)
    df = add_time_features_df(df=df0, time_col=TIME_COL, conti_year_origin=2018)
    assert df["year"][0] == 2019
    assert df.shape[0] == df0.shape[0]

    hourly_data = generate_df_with_reg_for_tests(
        freq="H",
        periods=24 * 500,
        train_start_date=datetime.datetime(2018, 7, 1),
        conti_year_origin=2018)
    cols = [TIME_COL, "regressor1", "regressor_bool", "regressor_categ"]
    train_df = hourly_data["train_df"]
    df = add_time_features_df(df=train_df[cols], time_col=TIME_COL, conti_year_origin=2018)
    assert df["year"][0] == 2018
    assert (df["dow_hr"][:3] == ["7_00", "7_01", "7_02"]).all()
    assert df.shape[0] == train_df.shape[0]
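# A minimal standalone sketch of the function under test, outside the pytest
# harness. The import path is assumed from greykite's source layout (adjust to
# the installed version); `conti_year_origin` anchors the continuous-time
# growth features at the given year.
def _example_add_time_features():  # hypothetical helper, not part of the suite
    import pandas as pd
    from greykite.common.features.timeseries_features import add_time_features_df

    ts = pd.DataFrame({"ts": pd.date_range("2019-01-01", periods=48, freq="H")})
    enriched = add_time_features_df(df=ts, time_col="ts", conti_year_origin=2019)
    return enriched[["year", "dow_hr"]].head()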
@pytest.fixture  # decorator assumed: consumed as a fixture by the tests
def df():
    data = generate_df_with_reg_for_tests(
        freq="D",
        periods=20 * 7,
        train_frac=0.9,
        remove_extra_cols=True,
        mask_test_actuals=True)
    reg_cols = ["regressor1", "regressor2", "regressor3"]
    keep_cols = [TIME_COL, VALUE_COL] + reg_cols
    df = data["df"][keep_cols]
    return df
@pytest.fixture  # decorator assumed: consumed as a fixture by the tests
def df_config():
    data = generate_df_with_reg_for_tests(
        freq="W-MON",
        periods=140,
        remove_extra_cols=True,
        mask_test_actuals=True)
    reg_cols = ["regressor1", "regressor2", "regressor_categ"]
    keep_cols = [TIME_COL, VALUE_COL] + reg_cols
    df = data["df"][keep_cols]

    model_template = "SILVERKITE"
    evaluation_metric = EvaluationMetricParam(
        cv_selection_metric=EvaluationMetricEnum.MeanAbsoluteError.name,
        agg_periods=7,
        agg_func=np.max,
        null_model_params={
            "strategy": "quantile",
            "constant": None,
            "quantile": 0.5
        })
    evaluation_period = EvaluationPeriodParam(
        test_horizon=10,
        periods_between_train_test=5,
        cv_horizon=4,
        cv_min_train_periods=80,
        cv_expanding_window=False,
        cv_periods_between_splits=20,
        cv_periods_between_train_test=3,
        cv_max_splits=3)
    model_components = ModelComponentsParam(
        regressors={"regressor_cols": reg_cols},
        custom={
            "fit_algorithm_dict": {
                "fit_algorithm": "ridge",
                "fit_algorithm_params": {"cv": 2}
            }
        })
    computation = ComputationParam(verbose=2)
    forecast_horizon = 27
    coverage = 0.90
    config = ForecastConfig(
        model_template=model_template,
        computation_param=computation,
        coverage=coverage,
        evaluation_metric_param=evaluation_metric,
        evaluation_period_param=evaluation_period,
        forecast_horizon=forecast_horizon,
        model_components_param=model_components)
    return {
        "df": df,
        "config": config,
        "model_template": model_template,
        "reg_cols": reg_cols,
    }
def test_generate_df_with_reg_for_tests():
    """Basic test of generate_df_with_reg_for_tests"""
    data = generate_df_with_reg_for_tests(
        freq="D",
        periods=20,
        train_frac=0.75,
        remove_extra_cols=True,
        mask_test_actuals=True)
    # test remove_extra_cols
    assert data["df"].shape == (20, 7)
    # test mask_test_actuals
    assert not data["train_df"][TIME_COL].isna().any()
    assert not data["train_df"][VALUE_COL].isna().any()
    assert not data["test_df"][TIME_COL].isna().any()
    assert data["test_df"][VALUE_COL].isna().all()
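# These tests rely on the following keys of the dict returned by
# `generate_df_with_reg_for_tests`: "df" (the full dataset), "train_df" and
# "test_df" (the train/test split), "fut_time_num" (the number of future
# periods, used elsewhere as the forecast horizon), and "regressor_cols"
# (names of the generated regressor columns).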
@pytest.fixture  # decorator assumed: consumed as a fixture by the tests
def df():
    data = generate_df_with_reg_for_tests(
        freq="H",
        periods=300 * 24,
        train_start_date=datetime.datetime(2018, 7, 1),
        remove_extra_cols=True,
        mask_test_actuals=True)
    df = data["df"]
    time_col = NEW_TIME_COL
    value_col = NEW_VALUE_COL
    df.rename({TIME_COL: time_col, VALUE_COL: value_col}, axis=1, inplace=True)
    regressor_cols = ["regressor1", "regressor2", "regressor_categ"]
    keep_cols = [time_col, value_col] + regressor_cols
    return df[keep_cols]
def test_get_basic_pipeline_apply_reg():
    """Tests get_basic_pipeline fit and predict methods on a dataset
    with regressors, and checks if pipeline parameters can be set.
    """
    df = generate_df_with_reg_for_tests("D", 50)
    # adds degenerate columns
    df["train_df"]["cst1"] = "constant"
    df["train_df"]["cst2"] = 1.0
    df["test_df"]["cst1"] = "constant"
    df["test_df"]["cst2"] = 1.0
    pipeline = get_basic_pipeline(
        estimator=SilverkiteEstimator(),
        score_func=EvaluationMetricEnum.MeanSquaredError.name,
        score_func_greater_is_better=False,
        agg_periods=None,
        agg_func=None,
        relative_error_tolerance=None,
        coverage=0.95,
        null_model_params=None,
        regressor_cols=[
            "regressor1",
            "regressor2",
            "regressor3",
            "regressor_bool",
            "regressor_categ",
            "cst1",
            "cst2"
        ])
    pipeline.fit(df["train_df"])
    assert pipeline.named_steps["degenerate"].drop_cols == []
    pipeline.predict(df["test_df"])

    # drops degenerate columns, normalizes
    pipeline.set_params(
        degenerate__drop_degenerate=True,
        input__regressors_numeric__normalize__normalize_algorithm="PowerTransformer",
    )
    pipeline.fit(df["train_df"])
    # (column order is swapped by column selectors and feature union)
    assert pipeline.named_steps["degenerate"].drop_cols == ["cst2", "cst1"]
    predictions = pipeline.predict(df["test_df"])
    assert predictions.shape[0] == df["test_df"].shape[0]

    with pytest.raises(
            ValueError,
            match="Invalid parameter unknown_param for estimator NormalizeTransformer"):
        pipeline.set_params(
            degenerate__drop_degenerate=True,
            input__regressors_numeric__normalize__unknown_param="PowerTransformer",
        )
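# Note on the test above: "degenerate" columns are (near-)constant columns
# such as "cst1" and "cst2". With the default drop_degenerate=False, the
# pipeline's "degenerate" step keeps them (drop_cols == []); after setting
# degenerate__drop_degenerate=True, the step records and drops them.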
@pytest.fixture  # decorator assumed: consumed as a fixture by the tests
def data():
    """Generates dataset for test cases

    :return: pd.DataFrame with columns of type:
        datetime, number, number, boolean, object, category
    """
    df = generate_df_with_reg_for_tests(
        freq="D",
        periods=50,
        remove_extra_cols=False)["df"]
    df["dow_categorical"] = df["str_dow"].astype("category")
    df = df[[
        TIME_COL,
        "regressor1",
        "regressor2",
        "regressor_bool",
        "str_dow",
        "dow_categorical"
    ]]
    return df
def test_run_template_4():
    """Runs custom template with monthly data and auto-regression"""
    data = generate_df_with_reg_for_tests(
        freq="MS",
        periods=48,
        remove_extra_cols=True,
        mask_test_actuals=True)
    reg_cols = ["regressor1", "regressor2", "regressor_categ"]
    keep_cols = [TIME_COL, VALUE_COL] + reg_cols
    df = data["df"][keep_cols]
    forecast_horizon = data["test_df"].shape[0]
    model_components = ModelComponentsParam(
        custom=dict(
            fit_algorithm_dict=dict(fit_algorithm="linear"),
            extra_pred_cols=["ct2"]),
        autoregression=dict(autoreg_dict=dict(lag_dict=dict(orders=[1]))),
        uncertainty=dict(uncertainty_dict=None))
    config = ForecastConfig(
        model_template=ModelTemplateEnum.SK.name,
        forecast_horizon=forecast_horizon,
        coverage=0.9,
        model_components_param=model_components,
    )
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        result = Forecaster().run_forecast_config(
            df=df,
            config=config,
        )
        rmse = EvaluationMetricEnum.RootMeanSquaredError.get_metric_name()
        assert result.backtest.test_evaluation[rmse] == pytest.approx(4.95, rel=1e-1)
        check_forecast_pipeline_result(
            result,
            coverage=0.9,
            strategy=None,
            score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
            greater_is_better=False)
def test_run_template_2():
    """Runs custom template with all options"""
    data = generate_df_with_reg_for_tests(
        freq="D",
        periods=400,
        remove_extra_cols=True,
        mask_test_actuals=True)
    reg_cols = ["regressor1", "regressor2", "regressor_categ"]
    keep_cols = [TIME_COL, VALUE_COL] + reg_cols
    df = data["df"][keep_cols]
    forecast_horizon = data["test_df"].shape[0]

    daily_event_df_dict = generate_holiday_events(
        countries=["UnitedStates"],
        holidays_to_model_separately=["New Year's Day"],
        year_start=2017,
        year_end=2022,
        pre_num=2,
        post_num=2)
    event_pred_cols = get_event_pred_cols(daily_event_df_dict)
    model_components = ModelComponentsParam(
        seasonality={
            "fs_components_df": pd.DataFrame({
                "name": ["tow", "tom", "toq", "toy"],
                "period": [7.0, 1.0, 1.0, 1.0],
                "order": [2, 1, 1, 5],
                "seas_names": ["weekly", "monthly", "quarterly", "yearly"]
            })
        },
        events={
            "daily_event_df_dict": daily_event_df_dict
        },
        changepoints={
            "changepoints_dict": {
                "method": "auto",
                "yearly_seasonality_order": 3,
                "regularization_strength": 0.5,
                "resample_freq": "14D",
                "potential_changepoint_distance": "56D",
                "no_changepoint_proportion_from_end": 0.2
            },
            "seasonality_changepoints_dict": {
                "potential_changepoint_distance": "60D",
                "regularization_strength": 0.5,
                "no_changepoint_proportion_from_end": 0.2
            },
        },
        autoregression=None,
        uncertainty={
            "uncertainty_dict": None,
        },
        custom={
            "origin_for_time_vars": None,
            # growth, regressors, events
            "extra_pred_cols": [["ct1"] + reg_cols + event_pred_cols],
            "fit_algorithm_dict": {
                "fit_algorithm": "ridge",
                "fit_algorithm_params": {"cv": 2}
            },
            "min_admissible_value": min(df[VALUE_COL]) - abs(max(df[VALUE_COL])),
            "max_admissible_value": max(df[VALUE_COL]) * 2,
        }
    )
    config = ForecastConfig(
        model_template=ModelTemplateEnum.SK.name,
        forecast_horizon=forecast_horizon,
        coverage=0.9,
        model_components_param=model_components,
    )
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        result = Forecaster().run_forecast_config(
            df=df,
            config=config,
        )
        rmse = EvaluationMetricEnum.RootMeanSquaredError.get_metric_name()
        q80 = EvaluationMetricEnum.Quantile80.get_metric_name()
        assert result.backtest.test_evaluation[rmse] == pytest.approx(2.692, rel=1e-2)
        assert result.backtest.test_evaluation[q80] == pytest.approx(1.531, rel=1e-2)
        assert result.backtest.test_evaluation[PREDICTION_BAND_COVERAGE] == pytest.approx(0.823, rel=1e-2)
        assert result.forecast.train_evaluation[rmse] == pytest.approx(2.304, rel=1e-2)
        assert result.forecast.train_evaluation[q80] == pytest.approx(0.921, rel=1e-2)
        assert result.forecast.train_evaluation[PREDICTION_BAND_COVERAGE] == pytest.approx(0.897, rel=1e-2)
        check_forecast_pipeline_result(
            result,
            coverage=0.9,
            strategy=None,
            score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
            greater_is_better=False)
def test_silverkite_template_custom(model_components_param):
    """Tests simple_silverkite_template with custom parameters,
    and data that has regressors"""
    data = generate_df_with_reg_for_tests(
        freq="H",
        periods=300 * 24,
        remove_extra_cols=True,
        mask_test_actuals=True)
    df = data["df"]
    time_col = "some_time_col"
    value_col = "some_value_col"
    df.rename({
        TIME_COL: time_col,
        VALUE_COL: value_col
    }, axis=1, inplace=True)

    metric = EvaluationMetricEnum.MeanAbsoluteError
    # anomaly adjustment adds 10.0 to every record
    adjustment_size = 10.0
    anomaly_df = pd.DataFrame({
        START_DATE_COL: [df[time_col].min()],
        END_DATE_COL: [df[time_col].max()],
        ADJUSTMENT_DELTA_COL: [adjustment_size],
        METRIC_COL: [value_col]
    })
    anomaly_info = {
        "value_col": VALUE_COL,
        "anomaly_df": anomaly_df,
        "start_date_col": START_DATE_COL,
        "end_date_col": END_DATE_COL,
        "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
        "filter_by_dict": {METRIC_COL: VALUE_COL},
        "adjustment_method": "add"
    }
    metadata = MetadataParam(
        time_col=time_col,
        value_col=value_col,
        freq="H",
        date_format="%Y-%m-%d-%H",
        train_end_date=datetime.datetime(2019, 7, 1),
        anomaly_info=anomaly_info
    )
    evaluation_metric = EvaluationMetricParam(
        cv_selection_metric=metric.name,
        cv_report_metrics=[EvaluationMetricEnum.MedianAbsolutePercentError.name],
        agg_periods=24,
        agg_func=np.max,
        null_model_params={
            "strategy": "quantile",
            "constant": None,
            "quantile": 0.8
        },
        relative_error_tolerance=0.01
    )
    evaluation_period = EvaluationPeriodParam(
        test_horizon=1,
        periods_between_train_test=2,
        cv_horizon=3,
        cv_min_train_periods=4,
        cv_expanding_window=True,
        cv_periods_between_splits=5,
        cv_periods_between_train_test=6,
        cv_max_splits=7
    )
    computation = ComputationParam(
        hyperparameter_budget=10,
        n_jobs=None,
        verbose=1
    )
    forecast_horizon = 20
    coverage = 0.7

    template = SilverkiteTemplate()
    params = template.apply_template_for_pipeline_params(
        df=df,
        config=ForecastConfig(
            model_template=ModelTemplateEnum.SK.name,
            metadata_param=metadata,
            forecast_horizon=forecast_horizon,
            coverage=coverage,
            evaluation_metric_param=evaluation_metric,
            evaluation_period_param=evaluation_period,
            model_components_param=model_components_param,
            computation_param=computation
        )
    )
    pipeline = params.pop("pipeline", None)
    expected_params = dict(
        df=df,
        time_col=time_col,
        value_col=value_col,
        date_format=metadata.date_format,
        freq=metadata.freq,
        train_end_date=metadata.train_end_date,
        anomaly_info=metadata.anomaly_info,
        # model
        regressor_cols=template.regressor_cols,
        estimator=None,
        hyperparameter_grid=template.hyperparameter_grid,
        hyperparameter_budget=computation.hyperparameter_budget,
        n_jobs=computation.n_jobs,
        verbose=computation.verbose,
        # forecast
        forecast_horizon=forecast_horizon,
        coverage=coverage,
        test_horizon=evaluation_period.test_horizon,
        periods_between_train_test=evaluation_period.periods_between_train_test,
        agg_periods=evaluation_metric.agg_periods,
        agg_func=evaluation_metric.agg_func,
        relative_error_tolerance=evaluation_metric.relative_error_tolerance,
        # evaluation
        score_func=metric.name,
        score_func_greater_is_better=metric.get_metric_greater_is_better(),
        cv_report_metrics=evaluation_metric.cv_report_metrics,
        null_model_params=evaluation_metric.null_model_params,
        # CV
        cv_horizon=evaluation_period.cv_horizon,
        cv_min_train_periods=evaluation_period.cv_min_train_periods,
        cv_expanding_window=evaluation_period.cv_expanding_window,
        cv_periods_between_splits=evaluation_period.cv_periods_between_splits,
        cv_periods_between_train_test=evaluation_period.cv_periods_between_train_test,
        cv_max_splits=evaluation_period.cv_max_splits
    )
    assert_basic_pipeline_equal(pipeline, template.pipeline)
    assert_equal(params, expected_params)
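# Pattern verified above: `apply_template_for_pipeline_params` returns one
# dict of everything the downstream forecast pipeline needs; the "pipeline"
# entry is popped and compared structurally via `assert_basic_pipeline_equal`,
# and the remaining keys are compared against `expected_params`.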
def test_gcd_train_end_date_regressor():
    """Tests train_end_date for data with regressors"""
    data = generate_df_with_reg_for_tests(
        freq="D",
        periods=30,
        train_start_date=datetime.datetime(2018, 1, 1),
        remove_extra_cols=True,
        mask_test_actuals=True)
    regressor_cols = ["regressor1", "regressor2", "regressor_categ"]
    keep_cols = [TIME_COL, VALUE_COL] + regressor_cols
    df = data["df"][keep_cols].copy()
    # Setting NaN values at the end
    df.loc[df.tail(2).index, "regressor1"] = np.nan
    df.loc[df.tail(4).index, "regressor2"] = np.nan
    df.loc[df.tail(6).index, "regressor_categ"] = np.nan
    df.loc[df.tail(8).index, VALUE_COL] = np.nan
    # last date with a value
    result_train_end_date = datetime.datetime(2018, 1, 22)

    # default train_end_date, default regressor_cols
    with pytest.warns(UserWarning) as record:
        canonical_data_dict = get_canonical_data(
            df=df,
            train_end_date=None,
            regressor_cols=None)
        assert f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({result_train_end_date})." in record[0].message.args[0]
    assert canonical_data_dict["df"].shape == df.shape
    assert canonical_data_dict["fit_df"].shape == (22, 2)
    assert canonical_data_dict["regressor_cols"] == []
    assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL]
    assert canonical_data_dict["train_end_date"] == result_train_end_date
    assert canonical_data_dict["last_date_for_val"] == result_train_end_date
    assert canonical_data_dict["last_date_for_reg"] is None

    # train_end_date later than last date in df, all available regressor_cols
    with pytest.warns(UserWarning) as record:
        train_end_date = datetime.datetime(2018, 2, 10)
        canonical_data_dict = get_canonical_data(
            df=df,
            train_end_date=train_end_date,
            regressor_cols=regressor_cols)
        assert f"Input timestamp for the parameter 'train_end_date' " \
               f"({train_end_date}) either exceeds the last available timestamp or" \
               f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({result_train_end_date})." in record[0].message.args[0]
    assert canonical_data_dict["fit_df"].shape == (22, 5)
    assert canonical_data_dict["regressor_cols"] == regressor_cols
    assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL] + regressor_cols
    assert canonical_data_dict["train_end_date"] == result_train_end_date
    assert canonical_data_dict["last_date_for_val"] == datetime.datetime(2018, 1, 22)
    assert canonical_data_dict["last_date_for_reg"] == datetime.datetime(2018, 1, 28)

    # train_end_date in between last date in df and last date before null
    # user passes no regressor_cols
    with pytest.warns(UserWarning) as record:
        train_end_date = datetime.datetime(2018, 1, 25)
        canonical_data_dict = get_canonical_data(
            df=df,
            train_end_date=train_end_date,
            regressor_cols=None)
        assert f"Input timestamp for the parameter 'train_end_date' " \
               f"({train_end_date}) either exceeds the last available timestamp or" \
               f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({result_train_end_date})." in record[0].message.args[0]
    assert canonical_data_dict["fit_df"].shape == (22, 2)
    assert canonical_data_dict["regressor_cols"] == []
    assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL]
    assert canonical_data_dict["train_end_date"] == datetime.datetime(2018, 1, 22)
    assert canonical_data_dict["last_date_for_val"] == datetime.datetime(2018, 1, 22)
    assert canonical_data_dict["last_date_for_reg"] is None

    # train end date equal to last date before null
    # user requests a subset of the regressor_cols
    train_end_date = datetime.datetime(2018, 1, 22)
    regressor_cols = ["regressor2"]
    canonical_data_dict = get_canonical_data(
        df=df,
        train_end_date=train_end_date,
        regressor_cols=regressor_cols)
    assert canonical_data_dict["fit_df"].shape == (22, 3)
    assert canonical_data_dict["regressor_cols"] == regressor_cols
    assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL] + regressor_cols
    assert canonical_data_dict["train_end_date"] == datetime.datetime(2018, 1, 22)
    assert canonical_data_dict["last_date_for_val"] == datetime.datetime(2018, 1, 22)
    assert canonical_data_dict["last_date_for_reg"] == datetime.datetime(2018, 1, 26)

    # train_end_date smaller than last date before null
    # user requests regressor_cols that do not exist in df
    with pytest.warns(UserWarning) as record:
        train_end_date = datetime.datetime(2018, 1, 20)
        regressor_cols = ["regressor1", "regressor4", "regressor5"]
        canonical_data_dict = get_canonical_data(
            df=df,
            train_end_date=train_end_date,
            regressor_cols=regressor_cols)
        assert canonical_data_dict["fit_df"].shape == (20, 3)
        assert canonical_data_dict["regressor_cols"] == ["regressor1"]
        assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL, "regressor1"]
        assert canonical_data_dict["train_end_date"] == datetime.datetime(2018, 1, 20)
        assert canonical_data_dict["last_date_for_val"] == datetime.datetime(2018, 1, 22)
        assert canonical_data_dict["last_date_for_reg"] == datetime.datetime(2018, 1, 28)
        assert (f"The following columns are not available to use as "
                f"regressors: ['regressor4', 'regressor5']") in record[0].message.args[0]
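# For reference, the NaN tails set above imply these last non-null dates in
# the 30-day range starting 2018-01-01: value_col ends 2018-01-22 (= 30 - 8
# days), regressor1 ends 2018-01-28, regressor2 ends 2018-01-26, and
# regressor_categ ends 2018-01-24. `last_date_for_reg` is the latest of these
# dates among the requested regressors, which matches the assertions above.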
def test_run_template_5():
    """Runs custom template with monthly data, auto-regression and lagged regressors"""
    data = generate_df_with_reg_for_tests(
        freq="MS",
        periods=48,
        remove_extra_cols=True,
        mask_test_actuals=True)
    reg_cols_all = ["regressor1", "regressor2", "regressor_categ"]
    reg_cols = ["regressor1"]
    keep_cols = [TIME_COL, VALUE_COL] + reg_cols_all
    df = data["df"][keep_cols]
    forecast_horizon = data["test_df"].shape[0]
    model_components = ModelComponentsParam(
        custom=dict(
            fit_algorithm_dict=dict(fit_algorithm="linear"),
            extra_pred_cols=reg_cols),
        autoregression=dict(autoreg_dict=dict(lag_dict=dict(orders=[1]))),
        lagged_regressors={
            "lagged_regressor_dict": [
                {"regressor2": "auto"},
                {"regressor_categ": {"lag_dict": {"orders": [5]}}}
            ]},
        uncertainty=dict(uncertainty_dict=None))
    config = ForecastConfig(
        model_template=ModelTemplateEnum.SK.name,
        forecast_horizon=forecast_horizon,
        coverage=0.9,
        model_components_param=model_components,
    )
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        result = Forecaster().run_forecast_config(
            df=df,
            config=config,
        )
        rmse = EvaluationMetricEnum.RootMeanSquaredError.get_metric_name()
        assert result.backtest.test_evaluation[rmse] == pytest.approx(4.46, rel=1e-1)
        check_forecast_pipeline_result(
            result,
            coverage=0.9,
            strategy=None,
            score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
            greater_is_better=False)
    # Checks lagged regressor columns
    actual_pred_cols = set(result.model[-1].model_dict["pred_cols"])
    actual_x_mat_cols = set(result.model[-1].model_dict["x_mat"].columns)
    expected_pred_cols = {
        "regressor1",
        "y_lag1",
        "regressor_categ_lag5"
    }
    expected_x_mat_cols = {
        "regressor1",
        "y_lag1",
        "regressor_categ_lag5[T.c2]"
    }
    assert expected_pred_cols.issubset(actual_pred_cols)
    assert expected_x_mat_cols.issubset(actual_x_mat_cols)
def test_forecast_pipeline_rolling_evaluation_prophet():
    """Checks the output rolling evaluation with Prophet template"""
    data = generate_df_with_reg_for_tests(
        freq="D",
        periods=30,
        remove_extra_cols=True,
        mask_test_actuals=True)
    reg_cols = ["regressor1", "regressor2", "regressor3"]
    keep_cols = [TIME_COL, VALUE_COL] + reg_cols
    df = data["df"][keep_cols]
    hyperparameter_grid = {
        "estimator__weekly_seasonality": [True],
        "estimator__daily_seasonality": [True, False],
        "estimator__n_changepoints": [0],  # to speed up test case, remove for better fit
        "estimator__uncertainty_samples": [10],  # to speed up test case
        "estimator__add_regressor_dict": [{
            "regressor1": {
                "prior_scale": 10,
                "standardize": True,
                "mode": "additive"
            },
            "regressor2": {
                "prior_scale": 15,
                "standardize": False,
                "mode": "additive"
            },
            "regressor3": {}
        }]
    }
    pipeline_params = mock_pipeline(
        df=df,
        forecast_horizon=3,
        regressor_cols=["regressor1", "regressor2", "regressor3"],
        estimator=ProphetEstimator(),
        hyperparameter_grid=hyperparameter_grid)
    tscv = RollingTimeSeriesSplit(
        forecast_horizon=3,
        expanding_window=True,
        max_splits=1)
    rolling_evaluation = forecast_pipeline_rolling_evaluation(
        pipeline_params=pipeline_params,
        tscv=tscv)
    expected_splits_n = tscv.max_splits
    assert len(rolling_evaluation.keys()) == expected_splits_n
    assert set(rolling_evaluation.keys()) == {"split_0"}

    split0_output = rolling_evaluation["split_0"]
    assert round(split0_output["runtime_sec"], 3) == split0_output["runtime_sec"]

    pipeline_result = split0_output["pipeline_result"]
    # Calculates expected pipeline
    train, test = list(tscv.split(X=df))[0]
    df_train = df.loc[train]
    pipeline_params_updated = pipeline_params
    pipeline_params_updated["test_horizon"] = 0
    pipeline_params_updated["df"] = df_train
    expected_pipeline_result = forecast_pipeline(**pipeline_params_updated)

    assert pipeline_result.backtest is None
    # Checks output is identical when there is only 1 split
    pipeline_grid_search = summarize_grid_search_results(
        pipeline_result.grid_search,
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name)
    expected_grid_search = summarize_grid_search_results(
        expected_pipeline_result.grid_search,
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name)
    assert_equal(
        pipeline_grid_search["mean_test_MAPE"],
        expected_grid_search["mean_test_MAPE"])
    assert_equal(
        pipeline_result.grid_search.cv.__dict__,
        expected_pipeline_result.grid_search.cv.__dict__)
    # Checks forecast df has the correct number of rows
    expected_rows = pipeline_result.timeseries.fit_df.shape[0] + tscv.forecast_horizon
    assert pipeline_result.forecast.df.shape[0] == expected_rows
@pytest.fixture  # decorator assumed: consumed as a fixture by the tests
def daily_data_reg():
    return generate_df_with_reg_for_tests(freq="D", periods=500)
def test_train_end_date_with_regressors():
    """Tests make_future_dataframe and train_end_date with regressors"""
    data = generate_df_with_reg_for_tests(
        freq="D",
        periods=30,
        train_start_date=datetime.datetime(2018, 1, 1),
        remove_extra_cols=True,
        mask_test_actuals=True)
    regressor_cols = ["regressor1", "regressor2", "regressor_categ"]
    keep_cols = [TIME_COL, VALUE_COL] + regressor_cols
    df = data["df"][keep_cols].copy()
    # Setting NaN values at the end
    df.loc[df.tail(2).index, "regressor1"] = np.nan
    df.loc[df.tail(4).index, "regressor2"] = np.nan
    df.loc[df.tail(6).index, "regressor_categ"] = np.nan
    df.loc[df.tail(8).index, VALUE_COL] = np.nan

    # default train_end_date, default regressor_cols
    with pytest.warns(UserWarning) as record:
        ts = UnivariateTimeSeries()
        ts.load_data(
            df,
            TIME_COL,
            VALUE_COL,
            train_end_date=None,
            regressor_cols=None)
        assert f"{ts.original_value_col} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({ts.train_end_date})." in record[0].message.args[0]
    assert ts.train_end_date == dt(2018, 1, 22)
    assert ts.fit_df.shape == (22, 2)
    assert ts.last_date_for_val == df[df[VALUE_COL].notnull()][TIME_COL].max()
    assert ts.last_date_for_reg is None
    result = ts.make_future_dataframe(periods=10, include_history=True)
    expected = pd.DataFrame({
        TIME_COL: pd.date_range(start=dt(2018, 1, 1), periods=32, freq="D"),
        VALUE_COL: np.concatenate([ts.fit_y, np.repeat(np.nan, 10)])
    })
    expected.index = expected[TIME_COL]
    expected.index.name = None
    assert_frame_equal(result, expected)

    # train_end_date later than last date in df, all available regressor_cols
    with pytest.warns(UserWarning) as record:
        ts = UnivariateTimeSeries()
        train_end_date = dt(2018, 2, 10)
        ts.load_data(
            df,
            TIME_COL,
            VALUE_COL,
            train_end_date=train_end_date,
            regressor_cols=regressor_cols)
        assert f"Input timestamp for the parameter 'train_end_date' " \
               f"({train_end_date}) either exceeds the last available timestamp or" \
               f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({ts.train_end_date})." in record[0].message.args[0]
    assert ts.last_date_for_val == dt(2018, 1, 22)
    assert ts.last_date_for_reg == dt(2018, 1, 28)
    result = ts.make_future_dataframe(periods=10, include_history=False)
    expected = df.copy()[22:28]
    expected.loc[expected.tail(6).index, VALUE_COL] = np.nan
    expected.index = expected[TIME_COL]
    expected.index.name = None
    assert_frame_equal(result, expected)

    # train_end_date in between last date in df and last date before null
    # user passes no regressor_cols
    with pytest.warns(UserWarning) as record:
        ts = UnivariateTimeSeries()
        train_end_date = dt(2018, 1, 25)
        regressor_cols = []
        ts.load_data(
            df,
            TIME_COL,
            VALUE_COL,
            train_end_date=train_end_date,
            regressor_cols=regressor_cols)
        assert f"Input timestamp for the parameter 'train_end_date' " \
               f"({train_end_date}) either exceeds the last available timestamp or" \
               f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({ts.train_end_date})." in record[0].message.args[0]
    assert ts.train_end_date == dt(2018, 1, 22)
    assert ts.last_date_for_reg is None
    result = ts.make_future_dataframe(periods=10, include_history=True)
    expected = pd.DataFrame({
        TIME_COL: pd.date_range(start=dt(2018, 1, 1), periods=32, freq="D"),
        VALUE_COL: np.concatenate([ts.fit_y, np.repeat(np.nan, 10)])
    })
    expected.index = expected[TIME_COL]
    expected.index.name = None
    assert_frame_equal(result, expected)

    # train end date equal to last date before null
    # user requests a subset of the regressor_cols
    with pytest.warns(UserWarning) as record:
        ts = UnivariateTimeSeries()
        train_end_date = dt(2018, 1, 22)
        regressor_cols = ["regressor2"]
        ts.load_data(
            df,
            TIME_COL,
            VALUE_COL,
            train_end_date=train_end_date,
            regressor_cols=regressor_cols)
        assert ts.train_end_date == dt(2018, 1, 22)
        assert ts.last_date_for_reg == dt(2018, 1, 26)
        result = ts.make_future_dataframe(periods=10, include_history=True)
        assert "Provided periods '10' is more than allowed ('4') due to the length of " \
               "regressor columns. Using '4'." in record[0].message.args[0]
        expected = ts.df.copy()[[TIME_COL, VALUE_COL, "regressor2"]]
        expected = expected[expected.index <= ts.last_date_for_reg]
        assert_frame_equal(result, expected)

    # train_end_date smaller than last date before null
    # user requests regressor_cols that do not exist in df
    with pytest.warns(UserWarning) as record:
        ts = UnivariateTimeSeries()
        train_end_date = dt(2018, 1, 20)
        regressor_cols = ["regressor1", "regressor4", "regressor5"]
        ts.load_data(
            df,
            TIME_COL,
            VALUE_COL,
            train_end_date=train_end_date,
            regressor_cols=regressor_cols)
        assert ts.train_end_date == dt(2018, 1, 20)
        assert ts.last_date_for_reg == dt(2018, 1, 28)
        assert (f"The following columns are not available to use as "
                f"regressors: ['regressor4', 'regressor5']") in record[0].message.args[0]
        result = ts.make_future_dataframe(periods=10, include_history=True)
        expected = ts.df.copy()[[TIME_COL, VALUE_COL, "regressor1"]]
        expected = expected[expected.index <= ts.last_date_for_reg]
        assert_frame_equal(result, expected)
def test_prophet_template_custom():
    """Tests prophet_template with custom values, with long range input"""
    # prepares input data
    data = generate_df_with_reg_for_tests(
        freq="H",
        periods=300 * 24,
        remove_extra_cols=True,
        mask_test_actuals=True)
    df = data["df"]
    time_col = "some_time_col"
    value_col = "some_value_col"
    df.rename({
        cst.TIME_COL: time_col,
        cst.VALUE_COL: value_col
    }, axis=1, inplace=True)

    # prepares params and calls template
    metric = EvaluationMetricEnum.MeanAbsoluteError
    # anomaly adjustment adds 10.0 to every record
    adjustment_size = 10.0
    anomaly_df = pd.DataFrame({
        cst.START_DATE_COL: [df[time_col].min()],
        cst.END_DATE_COL: [df[time_col].max()],
        cst.ADJUSTMENT_DELTA_COL: [adjustment_size],
        cst.METRIC_COL: [value_col]
    })
    anomaly_info = {
        "value_col": cst.VALUE_COL,
        "anomaly_df": anomaly_df,
        "start_date_col": cst.START_DATE_COL,
        "end_date_col": cst.END_DATE_COL,
        "adjustment_delta_col": cst.ADJUSTMENT_DELTA_COL,
        "filter_by_dict": {cst.METRIC_COL: cst.VALUE_COL},
        "adjustment_method": "add"
    }
    metadata = MetadataParam(
        time_col=time_col,
        value_col=value_col,
        freq="H",
        date_format="%Y-%m-%d-%H",
        train_end_date=datetime.datetime(2019, 7, 1),
        anomaly_info=anomaly_info,
    )
    evaluation_metric = EvaluationMetricParam(
        cv_selection_metric=metric.name,
        cv_report_metrics=[EvaluationMetricEnum.MedianAbsolutePercentError.name],
        agg_periods=24,
        agg_func=np.max,
        null_model_params={
            "strategy": "quantile",
            "constant": None,
            "quantile": 0.8
        },
        relative_error_tolerance=0.01)
    evaluation_period = EvaluationPeriodParam(
        test_horizon=1,
        periods_between_train_test=2,
        cv_horizon=3,
        cv_min_train_periods=4,
        cv_expanding_window=True,
        cv_periods_between_splits=5,
        cv_periods_between_train_test=6,
        cv_max_splits=7)
    model_components = ModelComponentsParam(
        seasonality={
            "yearly_seasonality": [True],
            "weekly_seasonality": [False],
            "daily_seasonality": [4],
            "add_seasonality_dict": [{
                "yearly": {
                    "period": 365.25,
                    "fourier_order": 20,
                    "prior_scale": 20.0
                },
                "quarterly": {
                    "period": 365.25 / 4,
                    "fourier_order": 15
                },
                "weekly": {
                    "period": 7,
                    "fourier_order": 35,
                    "prior_scale": 30.0
                }
            }]
        },
        growth={"growth_term": "linear"},
        events={
            "holiday_lookup_countries": ["UnitedStates", "UnitedKingdom", "India"],
            "holiday_pre_num_days": [2],
            "holiday_post_num_days": [3],
            "holidays_prior_scale": [5.0]
        },
        regressors={
            "add_regressor_dict": [{
                "regressor1": {
                    "prior_scale": 10.0,
                    "mode": "additive"
                },
                "regressor2": {
                    "prior_scale": 20.0,
                    "mode": "multiplicative"
                },
            }]
        },
        changepoints={
            "changepoint_prior_scale": [0.05],
            "changepoints": [None],
            "n_changepoints": [50],
            "changepoint_range": [0.9]
        },
        uncertainty={
            "mcmc_samples": [500],
            "uncertainty_samples": [2000]
        },
        hyperparameter_override={
            "input__response__null__impute_algorithm": "ts_interpolate",
            "input__response__null__impute_params": {"orders": [7, 14]},
            "input__regressors_numeric__normalize__normalize_algorithm": "RobustScaler",
        })
    computation = ComputationParam(
        hyperparameter_budget=10,
        n_jobs=None,
        verbose=1)
    forecast_horizon = 20
    coverage = 0.7
    config = ForecastConfig(
        model_template=ModelTemplateEnum.PROPHET.name,
        metadata_param=metadata,
        forecast_horizon=forecast_horizon,
        coverage=coverage,
        evaluation_metric_param=evaluation_metric,
        evaluation_period_param=evaluation_period,
        model_components_param=model_components,
        computation_param=computation)
    template = ProphetTemplate()
    params = template.apply_template_for_pipeline_params(
        df=df,
        config=config)
    pipeline = params.pop("pipeline", None)
    # Adding start_year and end_year based on the input df
    model_components.events["start_year"] = df[time_col].min().year
    model_components.events["end_year"] = df[time_col].max().year
    expected_params = dict(
        df=df,
        time_col=time_col,
        value_col=value_col,
        date_format=metadata.date_format,
        freq=metadata.freq,
        train_end_date=metadata.train_end_date,
        anomaly_info=metadata.anomaly_info,
        # model
        regressor_cols=template.regressor_cols,
        estimator=None,
        hyperparameter_grid=template.hyperparameter_grid,
        hyperparameter_budget=computation.hyperparameter_budget,
        n_jobs=computation.n_jobs,
        verbose=computation.verbose,
        # forecast
        forecast_horizon=forecast_horizon,
        coverage=coverage,
        test_horizon=evaluation_period.test_horizon,
        periods_between_train_test=evaluation_period.periods_between_train_test,
        agg_periods=evaluation_metric.agg_periods,
        agg_func=evaluation_metric.agg_func,
        # evaluation
        score_func=metric.name,
        score_func_greater_is_better=metric.get_metric_greater_is_better(),
        cv_report_metrics=evaluation_metric.cv_report_metrics,
        null_model_params=evaluation_metric.null_model_params,
        relative_error_tolerance=evaluation_metric.relative_error_tolerance,
        # CV
        cv_horizon=evaluation_period.cv_horizon,
        cv_min_train_periods=evaluation_period.cv_min_train_periods,
        cv_expanding_window=evaluation_period.cv_expanding_window,
        cv_periods_between_splits=evaluation_period.cv_periods_between_splits,
        cv_periods_between_train_test=evaluation_period.cv_periods_between_train_test,
        cv_max_splits=evaluation_period.cv_max_splits)
    assert_basic_pipeline_equal(pipeline, template.pipeline)
    assert_equal(params, expected_params)
def test_forecast_pipeline_rolling_evaluation_silverkite():
    """Checks the output rolling evaluation with Silverkite template"""
    data = generate_df_with_reg_for_tests(
        freq="1D",
        periods=20 * 7,  # short-term: 20 weeks of data
        remove_extra_cols=True,
        mask_test_actuals=True)
    regressor_cols = ["regressor1", "regressor2", "regressor_categ"]
    keep_cols = [TIME_COL, VALUE_COL] + regressor_cols
    df = data["df"][keep_cols]
    coverage = 0.1
    hyperparameter_grid = {
        "estimator__origin_for_time_vars": [None],  # inferred from training data
        "estimator__fs_components_df": [
            pd.DataFrame({
                "name": ["tow"],
                "period": [7.0],
                "order": [3],
                "seas_names": ["weekly"]
            })
        ],
        # two cases: no growth term and single growth term
        "estimator__extra_pred_cols": [regressor_cols, regressor_cols + ["ct_sqrt"]],
        "estimator__fit_algorithm_dict": [{"fit_algorithm": "linear"}]
    }
    pipeline_params = mock_pipeline(
        df=df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        date_format=None,  # not recommended, but possible to specify
        freq=None,
        regressor_cols=regressor_cols,
        estimator=SilverkiteEstimator(),
        hyperparameter_grid=hyperparameter_grid,
        hyperparameter_budget=1,
        n_jobs=1,
        forecast_horizon=2 * 7,
        coverage=coverage,
        test_horizon=2 * 7,
        periods_between_train_test=2 * 7,
        agg_periods=7,
        agg_func=np.mean,
        score_func=mean_absolute_error,  # callable score_func
        null_model_params=None,
        cv_horizon=1 * 7,
        cv_expanding_window=True,
        cv_min_train_periods=8 * 7,
        cv_periods_between_splits=7,
        cv_periods_between_train_test=3 * 7,
        cv_max_splits=2)
    tscv = RollingTimeSeriesSplit(
        forecast_horizon=2 * 7,
        min_train_periods=10 * 7,
        expanding_window=True,
        use_most_recent_splits=True,
        periods_between_splits=2 * 7,
        periods_between_train_test=2 * 7,
        max_splits=3)
    rolling_evaluation = forecast_pipeline_rolling_evaluation(
        pipeline_params=pipeline_params,
        tscv=tscv)
    expected_splits_n = tscv.max_splits
    assert len(rolling_evaluation.keys()) == expected_splits_n
    assert set(rolling_evaluation.keys()) == {"split_0", "split_1", "split_2"}

    time_col = pipeline_params["time_col"]
    for split_num, (train, test) in enumerate(tscv.split(X=df)):
        split_output = rolling_evaluation[f"split_{split_num}"]
        assert round(split_output["runtime_sec"], 3) == split_output["runtime_sec"]

        pipeline_result = split_output["pipeline_result"]
        # Checks every split uses all the available data for training
        ts = pipeline_result.timeseries
        train_end_date = df.iloc[train[-1]][time_col]
        assert ts.train_end_date == train_end_date

        assert pipeline_result.backtest is None

        # Checks every split has forecast for train+test periods passed by tscv
        forecast = pipeline_result.forecast
        assert forecast.df.shape[0] == \
            ts.fit_df.shape[0] + tscv.periods_between_train_test + tscv.forecast_horizon
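# Shape of the rolling-evaluation output checked in both rolling-evaluation
# tests above: a dict keyed "split_0", "split_1", ... (one entry per tscv
# split), where each value carries "runtime_sec" (stored already rounded to
# 3 decimals) and "pipeline_result", a full pipeline result whose backtest is
# None because each split trains on all data available up to its cutoff.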
def test_silverkite_with_components_daily_data():
    """Tests get_components, plot_components, plot_trend,
    plot_seasonalities with daily data and missing input values.
    """
    daily_data = generate_df_with_reg_for_tests(
        freq="D",
        periods=20,
        train_start_date=datetime.datetime(2018, 1, 1),
        conti_year_origin=2018)
    train_df = daily_data["train_df"].copy()
    train_df.loc[[2, 4, 7], cst.VALUE_COL] = np.nan  # creates missing values

    params_daily = params_components()  # SilverkiteEstimator parameters
    # converts into parameters for `forecast_silverkite`
    coverage = params_daily.pop("coverage")
    # removes daily seasonality terms
    params_daily["fs_components_df"] = pd.DataFrame({
        "name": ["tow", "ct1"],
        "period": [7.0, 1.0],
        "order": [4, 5],
        "seas_names": ["weekly", "yearly"]
    })
    model = BaseSilverkiteEstimator(
        coverage=coverage,
        uncertainty_dict=params_daily["uncertainty_dict"])

    with pytest.raises(NotFittedError, match="Call `fit` before calling `plot_components`."):
        model.plot_components()

    with pytest.warns(Warning):
        # suppress warnings from conf_interval.py and sklearn
        # a subclass's fit() method will have these steps
        model.fit(X=train_df, time_col=cst.TIME_COL, value_col=cst.VALUE_COL)
        silverkite = SilverkiteForecast()
        model.model_dict = silverkite.forecast(
            df=train_df,
            time_col=cst.TIME_COL,
            value_col=cst.VALUE_COL,
            **params_daily)
        model.finish_fit()

    # Tests plot_components
    with pytest.warns(Warning) as record:
        title = "Custom component plot"
        model._set_silverkite_diagnostics_params()
        fig = model.plot_components(names=["trend", "YEARLY_SEASONALITY", "DUMMY"], title=title)
        expected_rows = 3
        assert len(fig.data) == expected_rows + 1  # includes changepoints
        assert [fig.data[i].name for i in range(expected_rows)] == \
            [cst.VALUE_COL, "trend", "YEARLY_SEASONALITY"]
        assert fig.layout.xaxis.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis2.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis3.title["text"] == "Time of year"
        assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
        assert fig.layout.yaxis2.title["text"] == "trend"
        assert fig.layout.yaxis3.title["text"] == "yearly"
        assert fig.layout.title["text"] == title
        assert f"The following components have not been specified in the model: " \
               f"{{'DUMMY'}}, plotting the rest." in record[0].message.args[0]

    # Missing component error
    with pytest.raises(
            ValueError,
            match="None of the provided components have been specified in the model."):
        model.plot_components(names=["DUMMY"])

    # Tests plot_trend
    title = "Custom trend plot"
    fig = model.plot_trend(title=title)
    expected_rows = 2
    assert len(fig.data) == expected_rows + 1  # includes changepoints
    assert [fig.data[i].name for i in range(expected_rows)] == [cst.VALUE_COL, "trend"]
    assert fig.layout.xaxis.title["text"] == cst.TIME_COL
    assert fig.layout.xaxis2.title["text"] == cst.TIME_COL
    assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
    assert fig.layout.yaxis2.title["text"] == "trend"
    assert fig.layout.title["text"] == title

    # Tests plot_seasonalities
    with pytest.warns(Warning):
        # suppresses the warning on seasonalities removed
        title = "Custom seasonality plot"
        fig = model.plot_seasonalities(title=title)
        expected_rows = 3
        assert len(fig.data) == expected_rows
        assert [fig.data[i].name for i in range(expected_rows)] == \
            [cst.VALUE_COL, "WEEKLY_SEASONALITY", "YEARLY_SEASONALITY"]
        assert fig.layout.xaxis.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis2.title["text"] == "Day of week"
        assert fig.layout.xaxis3.title["text"] == "Time of year"
        assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
        assert fig.layout.yaxis2.title["text"] == "weekly"
        assert fig.layout.yaxis3.title["text"] == "yearly"
        assert fig.layout.title["text"] == title

    # Component plot error if `fit_algorithm` is "rf" or "gradient_boosting"
    params_daily["fit_algorithm"] = "rf"
    model = BaseSilverkiteEstimator(
        coverage=coverage,
        uncertainty_dict=params_daily["uncertainty_dict"])
    with pytest.warns(Warning):
        # suppress warnings from conf_interval.py and sklearn
        # a subclass's fit() method will have these steps
        model.fit(X=train_df, time_col=cst.TIME_COL, value_col=cst.VALUE_COL)
        model.model_dict = silverkite.forecast(
            df=train_df,
            time_col=cst.TIME_COL,
            value_col=cst.VALUE_COL,
            **params_daily)
        model.finish_fit()
    assert model.coef_ is None
    with pytest.raises(
            NotImplementedError,
            match="Component plot has only been implemented for additive linear models."):
        model.plot_components()
    with pytest.raises(
            NotImplementedError,
            match="Component plot has only been implemented for additive linear models."):
        model.plot_trend()
    with pytest.raises(
            NotImplementedError,
            match="Component plot has only been implemented for additive linear models."):
        model.plot_seasonalities()
def test_run_auto_arima_template_custom():
    """Tests running auto arima template through the pipeline"""
    data = generate_df_with_reg_for_tests(
        freq="D",
        periods=50,
        train_frac=0.8,
        conti_year_origin=2018,
        remove_extra_cols=True,
        mask_test_actuals=True)
    # select relevant columns for testing
    relevant_cols = [cst.TIME_COL, cst.VALUE_COL, "regressor1", "regressor2", "regressor3"]
    df = data["df"][relevant_cols]
    forecast_horizon = data["fut_time_num"]

    # Model components - custom holidays; other params as defaults
    model_components = ModelComponentsParam(
        # Everything except `custom` and `hyperparameter_override` is ignored
        seasonality={
            "seasonality_mode": ["additive"],
            "yearly_seasonality": ["auto"],
            "weekly_seasonality": [True],
            "daily_seasonality": ["auto"],
        },
        growth={"growth_term": ["linear"]},
        events={
            "holiday_pre_num_days": [1],
            "holiday_post_num_days": [1],
            "holidays_prior_scale": [1.0]
        },
        changepoints={
            "changepoint_prior_scale": [0.05],
            "n_changepoints": [1],
            "changepoint_range": [0.5],
        },
        regressors={
            "add_regressor_dict": [{
                "regressor1": {
                    "prior_scale": 10,
                    "standardize": True,
                    "mode": "additive"
                },
                "regressor2": {
                    "prior_scale": 15,
                    "standardize": False,
                    "mode": "additive"
                },
                "regressor3": {}
            }]
        },
        uncertainty={"uncertainty_samples": [10]},
        custom={
            "max_order": [10],
            "information_criterion": ["bic"]
        })
    metadata = MetadataParam(
        time_col=cst.TIME_COL,
        value_col=cst.VALUE_COL,
        freq="D",
    )
    evaluation_period = EvaluationPeriodParam(
        test_horizon=5,  # speeds up test case
        periods_between_train_test=5,
        cv_horizon=0,  # speeds up test case
    )
    config = ForecastConfig(
        model_template=ModelTemplateEnum.AUTO_ARIMA.name,
        metadata_param=metadata,
        forecast_horizon=forecast_horizon,
        coverage=0.95,
        model_components_param=model_components,
        evaluation_period_param=evaluation_period,
    )
    result = Forecaster().run_forecast_config(
        df=df,
        config=config,
    )
    forecast_df = result.forecast.df_test.reset_index(drop=True)
    expected_cols = ["ts", "actual", "forecast", "forecast_lower", "forecast_upper"]
    assert list(forecast_df.columns) == expected_cols
    assert result.backtest.coverage == 0.95, "coverage is not correct"
    # NB: coverage is poor because of very small dataset size and low uncertainty_samples
    assert result.backtest.train_evaluation[cst.PREDICTION_BAND_COVERAGE] is not None
    assert result.backtest.test_evaluation[cst.PREDICTION_BAND_COVERAGE] is not None
    assert result.backtest.train_evaluation["MSE"] is not None
    assert result.backtest.test_evaluation["MSE"] is not None
    assert result.forecast.train_evaluation[cst.PREDICTION_BAND_COVERAGE] is not None
    assert result.forecast.train_evaluation["MSE"] is not None
def test_plot_components():
    """Tests plot_components.

    Because component plots are implemented in `base_silverkite_estimator.py`,
    the bulk of the testing is done there. This file only tests inheritance and
    compatibility of the trained_model generated by this estimator's fit.
    """
    daily_data = generate_df_with_reg_for_tests(
        freq="D",
        periods=20,
        train_start_date=datetime.datetime(2018, 1, 1),
        conti_year_origin=2018)
    train_df = daily_data.get("train_df").copy()
    params_daily = params_components()
    fit_algorithm = params_daily.pop("fit_algorithm", "linear")
    fit_algorithm_params = params_daily.pop("fit_algorithm_params", None)
    params_daily["fit_algorithm_dict"] = {
        "fit_algorithm": fit_algorithm,
        "fit_algorithm_params": fit_algorithm_params,
    }
    # removing daily seasonality terms
    params_daily["fs_components_df"] = pd.DataFrame({
        "name": ["tow", "ct1"],
        "period": [7.0, 1.0],
        "order": [4, 5],
        "seas_names": ["weekly", "yearly"]})
    model = SilverkiteEstimator(**params_daily)
    with pytest.warns(Warning):
        # suppresses sklearn warning on `iid` parameter for ridge hyperparameter_grid search
        model.fit(train_df)

    # Test plot_components
    with pytest.warns(Warning) as record:
        title = "Custom component plot"
        fig = model.plot_components(names=["trend", "YEARLY_SEASONALITY", "DUMMY"], title=title)
        expected_rows = 3
        assert len(fig.data) == expected_rows + 1  # includes changepoints
        assert [fig.data[i].name for i in range(expected_rows)] == \
            [cst.VALUE_COL, "trend", "YEARLY_SEASONALITY"]
        assert fig.layout.xaxis.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis2.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis3.title["text"] == "Time of year"
        assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
        assert fig.layout.yaxis2.title["text"] == "trend"
        assert fig.layout.yaxis3.title["text"] == "yearly"
        assert fig.layout.title["text"] == title
        assert f"The following components have not been specified in the model: " \
               f"{{'DUMMY'}}, plotting the rest." in record[0].message.args[0]

    # Test plot_trend
    title = "Custom trend plot"
    fig = model.plot_trend(title=title)
    expected_rows = 2
    assert len(fig.data) == expected_rows + 1  # includes changepoints
    assert [fig.data[i].name for i in range(expected_rows)] == [cst.VALUE_COL, "trend"]
    assert fig.layout.xaxis.title["text"] == cst.TIME_COL
    assert fig.layout.xaxis2.title["text"] == cst.TIME_COL
    assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
    assert fig.layout.yaxis2.title["text"] == "trend"
    assert fig.layout.title["text"] == title

    # Test plot_seasonalities
    with pytest.warns(Warning):
        # suppresses the warning on seasonalities removed
        title = "Custom seasonality plot"
        fig = model.plot_seasonalities(title=title)
        expected_rows = 3
        assert len(fig.data) == expected_rows
        assert [fig.data[i].name for i in range(expected_rows)] == \
            [cst.VALUE_COL, "WEEKLY_SEASONALITY", "YEARLY_SEASONALITY"]
        assert fig.layout.xaxis.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis2.title["text"] == "Day of week"
        assert fig.layout.xaxis3.title["text"] == "Time of year"
        assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
        assert fig.layout.yaxis2.title["text"] == "weekly"
        assert fig.layout.yaxis3.title["text"] == "yearly"
        assert fig.layout.title["text"] == title
def test_get_quantiles_and_overlays():
    """Tests get_quantiles_and_overlays"""
    dl = DataLoaderTS()
    peyton_manning_ts = dl.load_peyton_manning_ts()

    # no columns are requested
    with pytest.raises(
            ValueError,
            match="Must enable at least one of: show_mean, show_quantiles, show_overlays."):
        peyton_manning_ts.get_quantiles_and_overlays(groupby_time_feature="doy")

    # show_mean only
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dow",
        show_mean=True,
        mean_col_name="custom_name")
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[MEAN_COL_GROUP], ["custom_name"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "dow"
    assert grouped_df.shape == (7, 1)
    assert grouped_df.index[0] == 1

    # show_quantiles only (bool)
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=180,
        show_quantiles=True)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[QUANTILE_COL_GROUP, QUANTILE_COL_GROUP], ["Q0.1", "Q0.9"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "ts_downsample"
    assert grouped_df.shape == (17, 2)
    assert grouped_df.index[0] == pd.Timestamp(2007, 12, 10)

    # show_quantiles only (list)
    custom_col = pd.Series(np.random.choice(list("abcd"), size=peyton_manning_ts.df.shape[0]))
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_custom_column=custom_col,
        show_quantiles=[0, 0.25, 0.5, 0.75, 1],
        quantile_col_prefix="prefix")
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[QUANTILE_COL_GROUP] * 5,
             ["prefix0", "prefix0.25", "prefix0.5", "prefix0.75", "prefix1"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "groups"
    assert grouped_df.shape == (4, 5)
    assert grouped_df.index[0] == "a"
    # checks quantile computation
    df = peyton_manning_ts.df.copy()
    df["custom_col"] = custom_col.values
    quantile_df = df.groupby("custom_col")[VALUE_COL].agg([np.nanmin, np.nanmedian, np.nanmax])
    assert_equal(grouped_df["quantile"]["prefix0"], quantile_df["nanmin"], check_names=False)
    assert_equal(grouped_df["quantile"]["prefix0.5"], quantile_df["nanmedian"], check_names=False)
    assert_equal(grouped_df["quantile"]["prefix1"], quantile_df["nanmax"], check_names=False)

    # show_overlays only (bool), no overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="doy",
        show_overlays=True)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 9, [f"overlay{i}" for i in range(9)]],
            names=["category", "name"]))
    assert grouped_df.index.name == "doy"
    assert grouped_df.shape == (366, 9)
    assert grouped_df.index[0] == 1

    # show_overlays only (int below the available number), time feature overlay label
    np.random.seed(123)
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="doy",
        show_overlays=4,
        overlay_label_time_feature="year")
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 4, ["2007", "2011", "2012", "2014"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "doy"
    assert grouped_df.shape == (366, 4)
    assert grouped_df.index[0] == 1

    # show_overlays only (int above the available number), custom overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dom",
        show_overlays=200,
        overlay_label_custom_column=custom_col)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 4, ["a", "b", "c", "d"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "dom"
    assert grouped_df.shape == (31, 4)
    assert grouped_df.index[0] == 1

    # show_overlays only (list of indices), sliding window overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dom",
        show_overlays=[0, 4],
        overlay_label_sliding_window_size=365 * 2)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 2,
             ["2007-12-10 00:00:00", "2015-12-08 00:00:00"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "dom"
    assert grouped_df.shape == (31, 2)
    assert grouped_df.index[0] == 1

    # show_overlays only (np.ndarray), sliding window overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dom",
        show_overlays=np.arange(0, 6, 2),
        overlay_label_sliding_window_size=365 * 2)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 3,
             ["2007-12-10 00:00:00", "2011-12-09 00:00:00", "2015-12-08 00:00:00"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "dom"
    assert grouped_df.shape == (31, 3)
    assert grouped_df.index[0] == 1

    # show_overlays only (list of column names), sliding window overlay label
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_time_feature="dom",
        show_overlays=["2007-12-10 00:00:00", "2015-12-08 00:00:00"],
        overlay_label_sliding_window_size=365 * 2)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[OVERLAY_COL_GROUP] * 2,
             ["2007-12-10 00:00:00", "2015-12-08 00:00:00"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "dom"
    assert grouped_df.shape == (31, 2)
    assert grouped_df.index[0] == 1

    # Show all 3 (no overlay label)
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=50,  # 50 per group (50 overlays)
        show_mean=True,
        show_quantiles=[0.05, 0.5, 0.95],  # 3 quantiles
        show_overlays=True)
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[MEAN_COL_GROUP] + [QUANTILE_COL_GROUP] * 3 + [OVERLAY_COL_GROUP] * 50,
             ["mean", "Q0.05", "Q0.5", "Q0.95"] + [f"overlay{i}" for i in range(50)]],
            names=["category", "name"]))
    assert grouped_df.index.name == "ts_downsample"
    assert grouped_df.shape == (60, 54)
    assert grouped_df.index[-1] == pd.Timestamp(2016, 1, 7)

    # Show all 3 (with overlay label).
    # Pass overlay_pivot_table_kwargs.
    grouped_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=180,
        show_mean=True,
        show_quantiles=[0.05, 0.5, 0.95],  # 3 quantiles
        show_overlays=True,
        overlay_label_time_feature="dow",  # 7 possible values
        aggfunc="median")
    assert_equal(
        grouped_df.columns,
        pd.MultiIndex.from_arrays(
            [[MEAN_COL_GROUP] + [QUANTILE_COL_GROUP] * 3 + [OVERLAY_COL_GROUP] * 7,
             ["mean", "Q0.05", "Q0.5", "Q0.95", "1", "2", "3", "4", "5", "6", "7"]],
            names=["category", "name"]))
    assert grouped_df.index.name == "ts_downsample"
    assert grouped_df.shape == (17, 11)
    assert grouped_df.index[-1] == pd.Timestamp(2015, 10, 29)
    assert np.linalg.norm(grouped_df[OVERLAY_COL_GROUP].mean()) > 1.0  # not centered

    with pytest.raises(
            TypeError,
            match="pivot_table\\(\\) got an unexpected keyword argument 'aggfc'"):
        peyton_manning_ts.get_quantiles_and_overlays(
            groupby_sliding_window_size=180,
            show_mean=True,
            show_quantiles=[0.05, 0.5, 0.95],
            show_overlays=True,
            overlay_label_time_feature="dow",
            aggfc=np.nanmedian)  # unrecognized parameter

    # center_values with show_mean=True
    centered_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=180,
        show_mean=True,
        show_quantiles=[0.05, 0.5, 0.95],
        show_overlays=True,
        overlay_label_time_feature="dow",
        aggfunc="median",
        center_values=True)
    assert np.linalg.norm(centered_df[[MEAN_COL_GROUP, OVERLAY_COL_GROUP]].mean()) < 1e-8  # centered at 0
    assert_equal(
        centered_df[QUANTILE_COL_GROUP],
        grouped_df[QUANTILE_COL_GROUP] - grouped_df[MEAN_COL_GROUP].mean()[0])

    # center_values with show_mean=False
    centered_df = peyton_manning_ts.get_quantiles_and_overlays(
        groupby_sliding_window_size=180,
        show_mean=False,
        show_quantiles=[0.05, 0.5, 0.95],
        show_overlays=True,
        overlay_label_time_feature="dow",
        aggfunc="median",
        center_values=True)
    assert np.linalg.norm(centered_df[[OVERLAY_COL_GROUP]].mean()) < 1e-8  # centered at 0
    overall_mean = peyton_manning_ts.df[VALUE_COL].mean()
    assert_equal(centered_df[QUANTILE_COL_GROUP], grouped_df[QUANTILE_COL_GROUP] - overall_mean)

    # new value_col
    df = generate_df_with_reg_for_tests(freq="D", periods=700)["df"]
    ts = UnivariateTimeSeries()
    ts.load_data(df=df)
    grouped_df = ts.get_quantiles_and_overlays(
        groupby_time_feature="dow",
        show_mean=True,
        show_quantiles=True,
        show_overlays=True,
        overlay_label_time_feature="woy",
        value_col="regressor1")
    df_dow = add_groupby_column(
        df=ts.df,
        time_col=TIME_COL,
        groupby_time_feature="dow")
    dow_mean = df_dow["df"].groupby("dow").agg(
        mean=pd.NamedAgg(column="regressor1", aggfunc=np.nanmean))
    assert_equal(grouped_df["mean"], dow_mean, check_names=False)
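# Column layout verified above: `get_quantiles_and_overlays` returns a
# DataFrame with a two-level column MultiIndex (category in
# {MEAN_COL_GROUP, QUANTILE_COL_GROUP, OVERLAY_COL_GROUP}, then the column
# name), indexed by the groupby key. With `center_values=True`, the mean and
# overlay columns are shifted to zero mean and the quantile columns are
# shifted by the corresponding mean.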
def test_get_forecast_time_properties():
    """Tests get_forecast_time_properties"""
    num_training_points = 365  # one year of daily data
    data = generate_df_for_tests(freq="D", periods=num_training_points)
    df = data["df"]
    result = get_forecast_time_properties(
        df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq="D",
        forecast_horizon=0)
    default_origin = get_default_origin_for_time_vars(df, TIME_COL)
    assert result == {
        "period": TimeEnum.ONE_DAY_IN_SECONDS.value,
        "simple_freq": SimpleTimeFrequencyEnum.DAY,
        "num_training_points": num_training_points,
        "num_training_days": num_training_points,
        "days_per_observation": 1,
        "forecast_horizon": 0,
        "forecast_horizon_in_timedelta": timedelta(days=0),
        "forecast_horizon_in_days": 0,
        "start_year": 2018,
        "end_year": 2019,
        "origin_for_time_vars": default_origin
    }

    # longer forecast_horizon
    result = get_forecast_time_properties(
        df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq="D",
        forecast_horizon=365)
    default_origin = get_default_origin_for_time_vars(df, TIME_COL)
    assert result == {
        "period": TimeEnum.ONE_DAY_IN_SECONDS.value,
        "simple_freq": SimpleTimeFrequencyEnum.DAY,
        "num_training_points": num_training_points,
        "num_training_days": num_training_points,
        "days_per_observation": 1,
        "forecast_horizon": 365,
        "forecast_horizon_in_timedelta": timedelta(days=365),
        "forecast_horizon_in_days": 365,
        "start_year": 2018,
        "end_year": 2020,
        "origin_for_time_vars": default_origin
    }

    # two years of hourly data
    num_training_points = 2 * 365 * 24
    data = generate_df_for_tests(freq="H", periods=num_training_points)
    df = data["df"]
    result = get_forecast_time_properties(
        df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq="H",
        forecast_horizon=0)
    default_origin = get_default_origin_for_time_vars(df, TIME_COL)
    assert result == {
        "period": TimeEnum.ONE_HOUR_IN_SECONDS.value,
        "simple_freq": SimpleTimeFrequencyEnum.HOUR,
        "num_training_points": num_training_points,
        "num_training_days": num_training_points / 24,
        "days_per_observation": 1 / 24,
        "forecast_horizon": 0,
        "forecast_horizon_in_timedelta": timedelta(days=0),
        "forecast_horizon_in_days": 0,
        "start_year": 2018,
        "end_year": 2020,
        "origin_for_time_vars": default_origin
    }

    # longer forecast_horizon
    result = get_forecast_time_properties(
        df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq="H",
        forecast_horizon=365 * 24)
    default_origin = get_default_origin_for_time_vars(df, TIME_COL)
    assert result == {
        "period": TimeEnum.ONE_HOUR_IN_SECONDS.value,
        "simple_freq": SimpleTimeFrequencyEnum.HOUR,
        "num_training_points": num_training_points,
        "num_training_days": num_training_points / 24,
        "days_per_observation": 1 / 24,
        "forecast_horizon": 365 * 24,
        "forecast_horizon_in_timedelta": timedelta(days=365),
        "forecast_horizon_in_days": 365,
        "start_year": 2018,
        "end_year": 2021,
        "origin_for_time_vars": default_origin
    }

    # ``forecast_horizon=None``
    result = get_forecast_time_properties(
        df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq="H",
        forecast_horizon=None)
    default_origin = get_default_origin_for_time_vars(df, TIME_COL)
    assert result == {
        "period": TimeEnum.ONE_HOUR_IN_SECONDS.value,
        "simple_freq": SimpleTimeFrequencyEnum.HOUR,
        "num_training_points": num_training_points,
        "num_training_days": num_training_points / 24,
        "days_per_observation": 1 / 24,
        "forecast_horizon": 24,
        "forecast_horizon_in_timedelta": timedelta(days=1),
        "forecast_horizon_in_days": 1,
        "start_year": 2018,
        "end_year": 2020,
        "origin_for_time_vars": default_origin
    }

    # weekly df with regressors
    num_training_points = 50
    data = generate_df_with_reg_for_tests(
        freq="W-SUN",
        periods=num_training_points,
        train_start_date=datetime.datetime(2018, 11, 30),
        remove_extra_cols=True,
        mask_test_actuals=True)
    df = data["df"]
    train_df = data["train_df"]
    forecast_horizon = data["fut_time_num"]
    regressor_cols = [col for col in df.columns if col not in [TIME_COL, VALUE_COL]]
    result = get_forecast_time_properties(
        df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq="W-SUN",
        regressor_cols=regressor_cols,
        forecast_horizon=forecast_horizon)
    default_origin = get_default_origin_for_time_vars(df, TIME_COL)
    assert result == {
        "period": TimeEnum.ONE_WEEK_IN_SECONDS.value,
        "simple_freq": SimpleTimeFrequencyEnum.WEEK,
        "num_training_points": train_df.shape[0],  # size of training set
        "num_training_days": train_df.shape[0] * 7,
        "days_per_observation": 7,
        "forecast_horizon": 9,
        "forecast_horizon_in_timedelta": timedelta(days=63),
        "forecast_horizon_in_days": 63.0,
        "start_year": 2018,
        "end_year": 2019,
        "origin_for_time_vars": default_origin
    }

    # checks `num_training_days` with `train_end_date`
    data = generate_df_with_reg_for_tests(
        freq="H",
        periods=300 * 24,
        train_start_date=datetime.datetime(2018, 7, 1),
        remove_extra_cols=True,
        mask_test_actuals=True)
    df = data["df"]
    train_end_date = datetime.datetime(2019, 2, 1)
    result = get_forecast_time_properties(
        df=df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq="H",
        regressor_cols=data["regressor_cols"],
        train_end_date=train_end_date,
        forecast_horizon=forecast_horizon)
    period = 3600  # seconds between observations
    time_delta = train_end_date - df[TIME_COL].min()  # train end - train start
    num_training_days = (
        time_delta.days
        + (time_delta.seconds + period) / TimeEnum.ONE_DAY_IN_SECONDS.value)
    assert result["num_training_days"] == num_training_days

    # checks `num_training_days` without `train_end_date`
    result = get_forecast_time_properties(
        df=df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq="H",
        regressor_cols=data["regressor_cols"],
        train_end_date=None,
        forecast_horizon=forecast_horizon)
    time_delta = (
        datetime.datetime(2019, 2, 26) - df[TIME_COL].min()
    )  # by default, train end is the last date with nonnull value_col
    num_training_days = (
        time_delta.days
        + (time_delta.seconds + period) / TimeEnum.ONE_DAY_IN_SECONDS.value)
    assert result["num_training_days"] == num_training_days
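
# A short sketch restating the `num_training_days` arithmetic verified above
# for hourly data: the training span is (train end - train start), plus one
# extra period so the first observation counts as a full interval. `df` and
# `train_end_date` are assumed to be as in the test above.
def example_num_training_days(df, train_end_date):
    period = 3600  # seconds between hourly observations
    time_delta = train_end_date - df[TIME_COL].min()
    return (time_delta.days
            + (time_delta.seconds + period) / TimeEnum.ONE_DAY_IN_SECONDS.value)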
def test_run_prophet_template_custom():
    """Tests running prophet template through the pipeline"""
    data = generate_df_with_reg_for_tests(
        freq="D",
        periods=50,
        train_frac=0.8,
        conti_year_origin=2018,
        remove_extra_cols=True,
        mask_test_actuals=True)
    # select relevant columns for testing
    relevant_cols = [cst.TIME_COL, cst.VALUE_COL, "regressor1", "regressor2", "regressor3"]
    df = data["df"][relevant_cols]
    forecast_horizon = data["fut_time_num"]

    # Model components - custom holidays; other params as defaults
    model_components = ModelComponentsParam(
        seasonality={
            "seasonality_mode": ["additive"],
            "yearly_seasonality": ["auto"],
            "weekly_seasonality": [True],
            "daily_seasonality": ["auto"],
        },
        growth={"growth_term": ["linear"]},
        events={
            "holiday_pre_num_days": [1],
            "holiday_post_num_days": [1],
            "holidays_prior_scale": [1.0]
        },
        changepoints={
            "changepoint_prior_scale": [0.05],
            "n_changepoints": [1],
            "changepoint_range": [0.5],
        },
        regressors={
            "add_regressor_dict": [{
                "regressor1": {
                    "prior_scale": 10,
                    "standardize": True,
                    "mode": "additive"
                },
                "regressor2": {
                    "prior_scale": 15,
                    "standardize": False,
                    "mode": "additive"
                },
                "regressor3": {}
            }]
        },
        uncertainty={"uncertainty_samples": [10]})
    metadata = MetadataParam(
        time_col=cst.TIME_COL,
        value_col=cst.VALUE_COL,
        freq="D",
    )
    evaluation_period = EvaluationPeriodParam(
        test_horizon=5,  # speeds up test case
        periods_between_train_test=5,
        cv_horizon=0,  # speeds up test case
    )
    config = ForecastConfig(
        model_template=ModelTemplateEnum.PROPHET.name,
        metadata_param=metadata,
        forecast_horizon=forecast_horizon,
        coverage=0.95,
        model_components_param=model_components,
        evaluation_period_param=evaluation_period,
    )
    result = Forecaster().run_forecast_config(
        df=df,
        config=config,
    )

    forecast_df = result.forecast.df_test.reset_index(drop=True)
    expected_cols = ["ts", "actual", "forecast", "forecast_lower", "forecast_upper"]
    assert list(forecast_df.columns) == expected_cols
    assert result.backtest.coverage == 0.95, "coverage is not correct"
    # NB: coverage is poor because of very small dataset size and low uncertainty_samples
    assert result.backtest.train_evaluation[cst.PREDICTION_BAND_COVERAGE] == \
        pytest.approx(0.677, rel=1e-3), \
        "training coverage is None or less than expected"
    assert result.backtest.test_evaluation[cst.PREDICTION_BAND_COVERAGE] == \
        pytest.approx(0.800, rel=1e-3), \
        "testing coverage is None or less than expected"
    assert result.backtest.train_evaluation["MSE"] == \
        pytest.approx(3.7849, rel=1e-3), \
        "training MSE is None or more than expected"
    assert result.backtest.test_evaluation["MSE"] == \
        pytest.approx(2.9609, rel=1e-3), \
        "testing MSE is None or more than expected"
    assert result.forecast.train_evaluation[cst.PREDICTION_BAND_COVERAGE] == \
        pytest.approx(0.7805, rel=1e-3), \
        "forecast coverage is None or less than expected"
    assert result.forecast.train_evaluation["MSE"] == \
        pytest.approx(4.1806, rel=1e-3), \
        "forecast MSE is None or more than expected"

    # ensure regressors were used in the model
    prophet_estimator = result.model.steps[-1][-1]
    regressors = prophet_estimator.model.extra_regressors
    assert regressors.keys() == {"regressor1", "regressor2", "regressor3"}
    assert regressors["regressor1"]["prior_scale"] == 10.0
    assert regressors["regressor1"]["standardize"] is True
    assert regressors["regressor1"]["mode"] == "additive"
    assert regressors["regressor2"]["prior_scale"] == 15.0
    assert regressors["regressor3"]["standardize"] == "auto"
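
# Usage sketch (a hypothetical helper, not from the test): running the
# PROPHET template with default model components, keeping only the metadata,
# horizon, and coverage. Assumes `df` contains `cst.TIME_COL` and
# `cst.VALUE_COL` at daily frequency.
def example_prophet_defaults(df, forecast_horizon):
    config = ForecastConfig(
        model_template=ModelTemplateEnum.PROPHET.name,
        metadata_param=MetadataParam(
            time_col=cst.TIME_COL,
            value_col=cst.VALUE_COL,
            freq="D"),
        forecast_horizon=forecast_horizon,
        coverage=0.95)
    return Forecaster().run_forecast_config(df=df, config=config)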
def test_plot_components():
    """Tests plot_components.

    Because component plots are implemented in `base_silverkite_estimator.py`,
    the bulk of the testing is done there. This file only tests inheritance
    and compatibility of the trained_model generated by this estimator's fit.
    """
    daily_data = generate_df_with_reg_for_tests(
        freq="D",
        periods=20,
        train_start_date=datetime.datetime(2018, 1, 1),
        conti_year_origin=2018)
    train_df = daily_data.get("train_df").copy()
    model = SimpleSilverkiteEstimator(
        fit_algorithm_dict={"fit_algorithm": "linear"},
        yearly_seasonality=True,
        quarterly_seasonality=False,
        monthly_seasonality=False,
        weekly_seasonality=True,
        daily_seasonality=False,
    )
    model.fit(train_df)

    # Test plot_components
    with pytest.warns(Warning) as record:
        title = "Custom component plot"
        fig = model.plot_components(
            names=["trend", "YEARLY_SEASONALITY", "DUMMY"],
            title=title)
        expected_rows = 3
        assert len(fig.data) == expected_rows
        assert [fig.data[i].name for i in range(expected_rows)] == \
            [cst.VALUE_COL, "trend", "YEARLY_SEASONALITY"]
        assert fig.layout.xaxis.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis2.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis3.title["text"] == "Time of year"
        assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
        assert fig.layout.yaxis2.title["text"] == "trend"
        assert fig.layout.yaxis3.title["text"] == "yearly"
        assert fig.layout.title["text"] == title
        assert f"The following components have not been specified in the model: " \
               f"{{'DUMMY'}}, plotting the rest." in record[0].message.args[0]

    # Test plot_trend
    title = "Custom trend plot"
    fig = model.plot_trend(title=title)
    expected_rows = 2
    assert len(fig.data) == expected_rows
    assert [fig.data[i].name for i in range(expected_rows)] == [cst.VALUE_COL, "trend"]
    assert fig.layout.xaxis.title["text"] == cst.TIME_COL
    assert fig.layout.xaxis2.title["text"] == cst.TIME_COL
    assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
    assert fig.layout.yaxis2.title["text"] == "trend"
    assert fig.layout.title["text"] == title

    # Test plot_seasonalities
    with pytest.warns(Warning):  # suppresses the warning on seasonalities removed
        title = "Custom seasonality plot"
        fig = model.plot_seasonalities(title=title)
        expected_rows = 3
        assert len(fig.data) == expected_rows
        assert [fig.data[i].name for i in range(expected_rows)] == \
            [cst.VALUE_COL, "WEEKLY_SEASONALITY", "YEARLY_SEASONALITY"]
        assert fig.layout.xaxis.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis2.title["text"] == "Day of week"
        assert fig.layout.xaxis3.title["text"] == "Time of year"
        assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
        assert fig.layout.yaxis2.title["text"] == "weekly"
        assert fig.layout.yaxis3.title["text"] == "yearly"
        assert fig.layout.title["text"] == title
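
# Usage sketch of the fit-and-plot workflow tested above; a hypothetical
# helper, assuming `train_df` has the standard time/value columns produced
# by the test data generators.
def example_plot_components(train_df):
    model = SimpleSilverkiteEstimator(
        fit_algorithm_dict={"fit_algorithm": "linear"},
        yearly_seasonality=True,
        weekly_seasonality=True)
    model.fit(train_df)
    # pick components by name, as in the test above
    return model.plot_components(names=["trend", "YEARLY_SEASONALITY"])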
def test_run_forecast_config_custom():
    """Tests `run_forecast_config` on weekly data with custom config:

     - numeric and categorical regressors
     - coverage
     - null model
    """
    data = generate_df_with_reg_for_tests(
        freq="W-MON",
        periods=140,
        remove_extra_cols=True,
        mask_test_actuals=True)
    reg_cols = ["regressor1", "regressor2", "regressor_categ"]
    keep_cols = [TIME_COL, VALUE_COL] + reg_cols
    df = data["df"][keep_cols]

    metric = EvaluationMetricEnum.MeanAbsoluteError
    evaluation_metric = EvaluationMetricParam(
        cv_selection_metric=metric.name,
        agg_periods=7,
        agg_func=np.max,
        null_model_params={
            "strategy": "quantile",
            "constant": None,
            "quantile": 0.5
        })
    evaluation_period = EvaluationPeriodParam(
        test_horizon=10,
        periods_between_train_test=5,
        cv_horizon=4,
        cv_min_train_periods=80,
        cv_expanding_window=False,
        cv_periods_between_splits=20,
        cv_periods_between_train_test=3,
        cv_max_splits=3)
    model_components = ModelComponentsParam(
        regressors={"regressor_cols": reg_cols},
        custom={
            "fit_algorithm_dict": {
                "fit_algorithm": "ridge",
                "fit_algorithm_params": {"cv": 2}
            }
        })
    computation = ComputationParam(verbose=2)
    forecast_horizon = 27
    coverage = 0.90

    forecast_config = ForecastConfig(
        model_template=ModelTemplateEnum.SILVERKITE.name,
        computation_param=computation,
        coverage=coverage,
        evaluation_metric_param=evaluation_metric,
        evaluation_period_param=evaluation_period,
        forecast_horizon=forecast_horizon,
        model_components_param=model_components)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        forecaster = Forecaster()
        result = forecaster.run_forecast_config(df=df, config=forecast_config)

        mse = EvaluationMetricEnum.RootMeanSquaredError.get_metric_name()
        q80 = EvaluationMetricEnum.Quantile80.get_metric_name()
        assert result.backtest.test_evaluation[mse] == pytest.approx(2.976, rel=1e-2)
        assert result.backtest.test_evaluation[q80] == pytest.approx(1.360, rel=1e-2)
        assert result.forecast.train_evaluation[mse] == pytest.approx(2.224, rel=1e-2)
        assert result.forecast.train_evaluation[q80] == pytest.approx(0.941, rel=1e-2)
        check_forecast_pipeline_result(
            result,
            coverage=coverage,
            strategy=None,
            score_func=metric.name,
            greater_is_better=False)

    with pytest.raises(KeyError, match="missing_regressor"):
        model_components = ModelComponentsParam(
            regressors={"regressor_cols": ["missing_regressor"]})
        forecaster = Forecaster()
        result = forecaster.run_forecast_config(
            df=df,
            config=ForecastConfig(
                model_template=ModelTemplateEnum.SILVERKITE.name,
                model_components_param=model_components))
        check_forecast_pipeline_result(
            result,
            coverage=None,
            strategy=None,
            score_func=metric.get_metric_func(),
            greater_is_better=False)
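
# Sketch of the null-model baseline configured above, in isolation. With the
# "quantile" strategy and quantile=0.5, the baseline effectively predicts the
# training median (the strategy follows sklearn's DummyRegressor semantics),
# and evaluation metrics are reported relative to it.
example_evaluation_metric = EvaluationMetricParam(
    cv_selection_metric=EvaluationMetricEnum.MeanAbsoluteError.name,
    null_model_params={
        "strategy": "quantile",
        "constant": None,
        "quantile": 0.5})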
def test_silverkite_with_components_hourly_data():
    """Tests get_components, plot_components, plot_trend,
    plot_seasonalities with hourly data
    """
    hourly_data = generate_df_with_reg_for_tests(
        freq="H",
        periods=24 * 4,
        train_start_date=datetime.datetime(2018, 1, 1),
        conti_year_origin=2018)
    train_df = hourly_data.get("train_df").copy()
    params_hourly = params_components()  # converts into parameters for `forecast_silverkite`
    coverage = params_hourly.pop("coverage")
    model = BaseSilverkiteEstimator(
        coverage=coverage,
        uncertainty_dict=params_hourly["uncertainty_dict"])
    model.fit(X=train_df, time_col=cst.TIME_COL, value_col=cst.VALUE_COL)
    silverkite = SilverkiteForecast()
    model.model_dict = silverkite.forecast(
        df=train_df,
        time_col=cst.TIME_COL,
        value_col=cst.VALUE_COL,
        **params_hourly)
    model.finish_fit()

    # Test plot_components
    with pytest.warns(Warning) as record:
        title = "Custom component plot"
        fig = model.plot_components(
            names=["trend", "DAILY_SEASONALITY", "DUMMY"],
            title=title)
        expected_rows = 3 + 1  # includes changepoints
        assert len(fig.data) == expected_rows
        assert [fig.data[i].name for i in range(expected_rows)] == \
            [cst.VALUE_COL, "trend", "DAILY_SEASONALITY", "trend change point"]
        assert fig.layout.xaxis.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis2.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis3.title["text"] == "Hour of day"
        assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
        assert fig.layout.yaxis2.title["text"] == "trend"
        assert fig.layout.yaxis3.title["text"] == "daily"
        assert fig.layout.title["text"] == title
        assert f"The following components have not been specified in the model: " \
               f"{{'DUMMY'}}, plotting the rest." in record[0].message.args[0]

    # Test plot_trend
    title = "Custom trend plot"
    fig = model.plot_trend(title=title)
    expected_rows = 2
    assert len(fig.data) == expected_rows + 1  # includes changepoints
    assert [fig.data[i].name for i in range(expected_rows)] == [cst.VALUE_COL, "trend"]
    assert fig.layout.xaxis.title["text"] == cst.TIME_COL
    assert fig.layout.xaxis2.title["text"] == cst.TIME_COL
    assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
    assert fig.layout.yaxis2.title["text"] == "trend"
    assert fig.layout.title["text"] == title

    # Test plot_seasonalities
    with pytest.warns(Warning):  # suppresses the warning on seasonalities removed
        title = "Custom seasonality plot"
        fig = model.plot_seasonalities(title=title)
        expected_rows = 4
        assert len(fig.data) == expected_rows
        assert [fig.data[i].name for i in range(expected_rows)] == \
            [cst.VALUE_COL, "DAILY_SEASONALITY", "WEEKLY_SEASONALITY", "YEARLY_SEASONALITY"]
        assert fig.layout.xaxis.title["text"] == cst.TIME_COL
        assert fig.layout.xaxis2.title["text"] == "Hour of day"
        assert fig.layout.xaxis3.title["text"] == "Day of week"
        assert fig.layout.xaxis4.title["text"] == "Time of year"
        assert fig.layout.yaxis.title["text"] == cst.VALUE_COL
        assert fig.layout.yaxis2.title["text"] == "daily"
        assert fig.layout.yaxis3.title["text"] == "weekly"
        assert fig.layout.yaxis4.title["text"] == "yearly"
        assert fig.layout.title["text"] == title
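
# Sketch of the fit pattern used in the test above: as exercised there,
# `BaseSilverkiteEstimator.fit` does not train a model on its own, so the
# test injects a `model_dict` produced by `SilverkiteForecast().forecast`
# and calls `finish_fit()` before plotting. `params` stands for the
# `forecast_silverkite` keyword arguments; the default constructor is an
# assumption for illustration.
def example_inject_model_dict(train_df, params):
    model = BaseSilverkiteEstimator()
    model.fit(X=train_df, time_col=cst.TIME_COL, value_col=cst.VALUE_COL)
    model.model_dict = SilverkiteForecast().forecast(
        df=train_df,
        time_col=cst.TIME_COL,
        value_col=cst.VALUE_COL,
        **params)
    model.finish_fit()
    return model.plot_seasonalities()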