Example #1
# Runs the forecast
result = forecaster.run_forecast_config(
    df=df,
    config=ForecastConfig(
        model_template=ModelTemplateEnum.SILVERKITE.name,
        forecast_horizon=365,  # forecasts 365 steps ahead
        coverage=0.95,  # 95% prediction intervals
        metadata_param=metadata,
        evaluation_period_param=evaluation_period))

# Summarizes the cv result
cv_results = summarize_grid_search_results(
    grid_search=result.grid_search,
    decimals=1,
    # The below saves space in the printed output. Remove to show all available metrics and columns.
    cv_report_metrics=None,
    column_order=[
        "rank", "mean_test", "split_test", "mean_train", "split_train",
        "mean_fit_time", "mean_score_time", "params"
    ])
# Transposes to save space in the printed output
cv_results["params"] = cv_results["params"].astype(str)
cv_results.set_index("params", drop=True, inplace=True)
cv_results.transpose()

# %%
# By default, all metrics in `~greykite.common.evaluation.ElementwiseEvaluationMetricEnum`
# are computed on each CV train/test split.
# The configuration of CV evaluation metrics can be found at
# `Evaluation Metric <../../pages/stepbystep/0400_configuration.html#evaluation-metric>`_.
# Here, we show the Mean Absolute Percentage Error (MAPE) across splits.
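# A minimal illustrative sketch (assuming `result` from the run above): the same MAPE
# values can also be read directly from the underlying sklearn CV results, which use the
# short metric name in their keys.
cv_raw = result.grid_search.cv_results_
mean_test_mape = cv_raw["mean_test_MAPE"]      # MAPE averaged over the CV test splits
split0_test_mape = cv_raw["split0_test_MAPE"]  # MAPE on the first CV test split
print(mean_test_mape, split0_test_mape)
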
Example #2
def test_run_forecast_config_with_single_simple_silverkite_template():
    # The generic names of single simple silverkite templates are not added to `ModelTemplateEnum`,
    # so we test whether they are recognized.
    data = generate_df_for_tests(freq="D", periods=365)
    df = data["df"]
    metric = EvaluationMetricEnum.MeanAbsoluteError
    evaluation_metric = EvaluationMetricParam(cv_selection_metric=metric.name,
                                              agg_periods=7,
                                              agg_func=np.max,
                                              null_model_params={
                                                  "strategy": "quantile",
                                                  "constant": None,
                                                  "quantile": 0.5
                                              })

    evaluation_period = EvaluationPeriodParam(test_horizon=10,
                                              periods_between_train_test=5,
                                              cv_horizon=4,
                                              cv_min_train_periods=80,
                                              cv_expanding_window=False,
                                              cv_periods_between_splits=20,
                                              cv_periods_between_train_test=3,
                                              cv_max_splits=2)

    model_components = ModelComponentsParam(
        hyperparameter_override=[{
            "estimator__yearly_seasonality": 1
        }, {
            "estimator__yearly_seasonality": 2
        }])
    computation = ComputationParam(verbose=2)
    forecast_horizon = 27
    coverage = 0.90

    single_template_class = SimpleSilverkiteTemplateOptions(
        freq=SILVERKITE_COMPONENT_KEYWORDS.FREQ.value.DAILY,
        seas=SILVERKITE_COMPONENT_KEYWORDS.SEAS.value.NONE)

    forecast_config = ForecastConfig(
        model_template=[single_template_class, "DAILY_ALGO_SGD", "SILVERKITE_DAILY_90"],
        computation_param=computation,
        coverage=coverage,
        evaluation_metric_param=evaluation_metric,
        evaluation_period_param=evaluation_period,
        forecast_horizon=forecast_horizon,
        model_components_param=model_components)

    forecaster = Forecaster()
    result = forecaster.run_forecast_config(df=df, config=forecast_config)

    summary = summarize_grid_search_results(result.grid_search)
    # `single_template_class` is 1 template, "DAILY_ALGO_SGD" is 1 template,
    # and "SILVERKITE_DAILY_90" contains 4 templates, i.e. 6 templates in total.
    # With 2 items in `hyperparameter_override`, there are (1 + 1 + 4) * 2 = 12 cases.
    assert summary.shape[0] == 12

    # Tests functionality for single template class only.
    forecast_config = ForecastConfig(model_template=single_template_class,
                                     computation_param=computation,
                                     coverage=coverage,
                                     evaluation_metric_param=evaluation_metric,
                                     evaluation_period_param=evaluation_period,
                                     forecast_horizon=forecast_horizon)

    forecaster = Forecaster()
    pipeline_parameters = forecaster.apply_forecast_config(
        df=df, config=forecast_config)
    assert_equal(actual=pipeline_parameters["hyperparameter_grid"],
                 expected={
                     "estimator__time_properties": [None],
                     "estimator__origin_for_time_vars": [None],
                     "estimator__train_test_thresh": [None],
                     "estimator__training_fraction": [None],
                     "estimator__fit_algorithm_dict": [{
                         "fit_algorithm":
                         "linear",
                         "fit_algorithm_params":
                         None
                     }],
                     "estimator__holidays_to_model_separately": [[]],
                     "estimator__holiday_lookup_countries": [[]],
                     "estimator__holiday_pre_num_days": [0],
                     "estimator__holiday_post_num_days": [0],
                     "estimator__holiday_pre_post_num_dict": [None],
                     "estimator__daily_event_df_dict": [None],
                     "estimator__changepoints_dict": [None],
                     "estimator__seasonality_changepoints_dict": [None],
                     "estimator__yearly_seasonality": [0],
                     "estimator__quarterly_seasonality": [0],
                     "estimator__monthly_seasonality": [0],
                     "estimator__weekly_seasonality": [0],
                     "estimator__daily_seasonality": [0],
                     "estimator__max_daily_seas_interaction_order": [0],
                     "estimator__max_weekly_seas_interaction_order": [2],
                     "estimator__autoreg_dict": [None],
                     "estimator__min_admissible_value": [None],
                     "estimator__max_admissible_value": [None],
                     "estimator__uncertainty_dict": [None],
                     "estimator__growth_term": ["linear"],
                     "estimator__regressor_cols": [[]],
                     "estimator__feature_sets_enabled": [False],
                     "estimator__extra_pred_cols": [[]]
                 },
                 ignore_keys={"estimator__time_properties": None})
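
Example #3
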
def assert_proper_grid_search(
        grid_search,
        expected_grid_size=None,
        lower_bound=None,
        upper_bound=None,
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
        greater_is_better=False,
        cv_report_metrics_names=None):
    """Checks fitted hyperparameter grid search result.

    Parameters
    ----------
    grid_search : `sklearn.model_selection.RandomizedSearchCV`
        Fitted RandomizedSearchCV object
    expected_grid_size : `int` or None, default None
        Expected number of options evaluated in grid search.
        If None, does not check the expected size.
    lower_bound : `float` or None, default None
        Lower bound on CV test set error.
        If None, does not check the test error.
    upper_bound : `float` or None, default None
        Upper bound on CV test set error.
        If None, does not check the test error.
    score_func : `str` or callable, default ``EvaluationMetricEnum.MeanAbsolutePercentError.name``
        Score function used to select optimal model in CV.
        The same as passed to ``forecast_pipeline`` and grid search.
        If a callable, takes arrays ``y_true``, ``y_pred`` and returns a float.
        If a string, must be either a
        `~greykite.common.evaluation.EvaluationMetricEnum` member name
        or `~greykite.common.constants.FRACTION_OUTSIDE_TOLERANCE`.
    greater_is_better : `bool`, default False
        Whether higher values are better.
        Must be explicitly passed for testing (not derived from ``score_func``).
    cv_report_metrics_names : `list` [`str`] or None, default None
        Additional metrics besides ``metric`` calculated during CV.
        If None, no other metrics are checked in the result.

        Unlike in ``forecast_pipeline``, these are the expected names
        in the CV output, such as:

            - ``enum.get_metric_name()``
            - ``"CUSTOM_SCORE_FUNC_NAME"``
            - ``"FRACTION_OUTSIDE_TOLERANCE_NAME"``

    Raises
    ------
    AssertionError
        If grid search did not run as expected.
    """
    _, _, short_name = get_score_func_with_aggregation(
        score_func=score_func,  # string or callable
        greater_is_better=greater_is_better,
        # Dummy value, doesn't matter because we ignore the returned `score_func`
        relative_error_tolerance=0.01)
    # attributes are populated
    assert hasattr(grid_search, "best_estimator_")
    assert hasattr(grid_search, "cv_results_")
    if callable(grid_search.refit):
        # `grid_search.refit` is a callable if `grid_search` comes from
        # `forecast_pipeline`.
        # Checks if `best_index_` and `refit` match `metric` and `greater_is_better`.
        assert grid_search.best_index_ == grid_search.refit(
            grid_search.cv_results_)
        split_scores = grid_search.cv_results_[f"mean_test_{short_name}"]
        expected_best_score = max(split_scores) if greater_is_better else min(split_scores)
        assert split_scores[grid_search.best_index_] == expected_best_score
        assert split_scores[grid_search.best_index_] is not None
        assert not np.isnan(split_scores[grid_search.best_index_])
        assert_refit(grid_search.refit,
                     expected_metric=short_name,
                     expected_greater_is_better=greater_is_better)
    elif grid_search.refit is True:
        # In single metric evaluation, refit_metric is "score".
        short_name = "score"
        # `best_score_` is populated, and the optimal score is the highest
        # test set score. Metrics where `greater_is_better=False` are
        # assumed to be negated in the ``scoring`` parameter so that
        # higher values are better.
        assert hasattr(grid_search, "best_score_")
        best_score = grid_search.best_score_
        test_scores = grid_search.cv_results_[f"mean_test_{short_name}"]
        best_score2 = test_scores[grid_search.best_index_]
        assert best_score == max(test_scores)
        assert best_score2 == max(test_scores)

    if expected_grid_size is not None:
        assert len(grid_search.cv_results_[f"mean_test_{short_name}"]) == expected_grid_size
    # Parameters are populated
    assert_equal(grid_search.cv_results_["params"][grid_search.best_index_],
                 grid_search.best_params_)

    # All metrics are computed
    if cv_report_metrics_names is None:
        cv_report_metrics_names = []
    for expected_metric in cv_report_metrics_names + [short_name]:
        assert f"mean_test_{expected_metric}" in grid_search.cv_results_.keys()
        assert f"std_test_{expected_metric}" in grid_search.cv_results_.keys()
        assert f"mean_train_{expected_metric}" in grid_search.cv_results_.keys(
        )
        assert f"std_train_{expected_metric}" in grid_search.cv_results_.keys()

    if lower_bound is not None or upper_bound is not None:
        grid_results = summarize_grid_search_results(grid_search,
                                                     score_func=score_func)
        if lower_bound is not None:
            assert all(grid_results[f"mean_test_{short_name}"] >= lower_bound)
        if upper_bound is not None:
            assert all(grid_results[f"mean_test_{short_name}"] <= upper_bound)
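
# A minimal usage sketch (the bounds and grid size below are assumptions, not taken from
# the tests above): validates a fitted grid search, e.g. `result.grid_search` from
# `forecast_pipeline`, against the CV selection metric.
assert_proper_grid_search(
    grid_search=result.grid_search,
    expected_grid_size=4,   # e.g. 4 hyperparameter combinations were searched (assumption)
    lower_bound=0.0,        # MAPE is non-negative
    upper_bound=50.0,       # loose upper bound on mean CV test MAPE (assumption)
    score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
    greater_is_better=False,
    cv_report_metrics_names=None)

Example #4
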
def check_forecast_pipeline_result(
        result,
        coverage=0.95,
        strategy=None,
        interactive=False,
        expected_grid_size=None,
        lower_bound_cv=None,
        upper_bound_cv=None,
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
        greater_is_better=False,
        cv_report_metrics_names=None,
        relative_error_tolerance=None):
    """Helper function that validates forecast_pipeline output.
    Raises an AssertionError if the results do not match the expected values.

    Parameters
    ----------
    result : :class:`~greykite.framework.pipeline.pipeline.ForecastResult`
        ``forecast_pipeline`` output to check
    coverage : `float` or None, default 0.95
        The ``coverage`` passed to ``forecast_pipeline``
    strategy : `str` or None, default None
        Null model strategy.
        If None, not checked.
    interactive : `bool`, default False
        Whether to plot and print results.
    expected_grid_size : `int` or None, default None
        Expected number of options evaluated in grid search.
        If None, does not check the expected size.
    lower_bound_cv : `float` or None, default None
        Lower bound on CV test set error.
        If None, does not check the test error.
    upper_bound_cv : `float` or None, default None
        Upper bound on CV test set error.
        If None, does not check the test error.
    score_func : `str` or callable, default ``EvaluationMetricEnum.MeanAbsolutePercentError.name``
        Score function used to select optimal model in CV.
        The same as passed to ``forecast_pipeline`` and grid search.
        If a callable, takes arrays ``y_true``, ``y_pred`` and returns a float.
        If a string, must be either a
        `~greykite.common.evaluation.EvaluationMetricEnum` member name
        or `~greykite.common.constants.FRACTION_OUTSIDE_TOLERANCE`.
    greater_is_better : `bool`, default False
        Whether higher values are better.
        Must be explicitly passed for testing (not derived from ``score_func``).
    cv_report_metrics_names : `list` [`str`] or None, default None
        Additional metrics besides ``metric`` calculated during CV.
        If None, no other metrics are checked in the result.

        Unlike in ``forecast_pipeline``, these are the expected names
        in the CV output, such as:

            - ``enum.get_metric_name()``
            - ``"CUSTOM_SCORE_FUNC_NAME"``
            - ``"FRACTION_OUTSIDE_TOLERANCE_NAME"``
    relative_error_tolerance : `float` or None
        The ``relative_error_tolerance`` passed to ``forecast_pipeline``
    """
    assert isinstance(result.grid_search, RandomizedSearchCV)
    assert isinstance(result.model, Pipeline)
    assert isinstance(result.backtest, UnivariateForecast)
    assert isinstance(result.forecast, UnivariateForecast)

    assert_proper_grid_search(result.grid_search,
                              expected_grid_size=expected_grid_size,
                              lower_bound=lower_bound_cv,
                              upper_bound=upper_bound_cv,
                              score_func=score_func,
                              greater_is_better=greater_is_better,
                              cv_report_metrics_names=cv_report_metrics_names)

    ts = result.timeseries
    assert ts.df[VALUE_COL].equals(ts.y)
    assert result.backtest.train_evaluation is not None
    assert result.backtest.test_evaluation is not None
    if coverage is None:
        assert result.forecast.coverage is None
        assert result.backtest.coverage is None
        assert result.backtest.train_evaluation[
            PREDICTION_BAND_COVERAGE] is None
        assert result.backtest.test_evaluation[PREDICTION_BAND_COVERAGE] is None
        expected_cols = [TIME_COL, ACTUAL_COL, PREDICTED_COL]
        assert list(result.backtest.df.columns) == expected_cols
        assert list(result.forecast.df.columns) == expected_cols
    else:
        assert round(result.forecast.coverage, 3) == round(coverage, 3)
        assert round(result.backtest.coverage, 3) == round(coverage, 3)
        assert result.backtest.train_evaluation[
            PREDICTION_BAND_COVERAGE] is not None
        assert result.backtest.test_evaluation[
            PREDICTION_BAND_COVERAGE] is not None

    assert result.forecast.train_evaluation is not None

    # Tests if null model params are set for CV
    estimator = result.model.steps[-1][-1]
    if estimator.null_model is not None and strategy is not None:
        assert estimator.null_model.strategy == strategy

    # Tests if relative_error_tolerance is set for backtest/forecast
    if relative_error_tolerance is not None:
        assert result.backtest.relative_error_tolerance == relative_error_tolerance
        assert result.forecast.relative_error_tolerance == relative_error_tolerance

    if interactive:
        print("backtest_train_evaluation", result.backtest.train_evaluation)
        print("backtest_test_evaluation", result.backtest.test_evaluation)
        print("forecast_train_evaluation", result.forecast.train_evaluation)
        print("forecast_test_evaluation", result.forecast.test_evaluation)
        print(
            summarize_grid_search_results(
                result.grid_search,
                score_func=score_func,
                score_func_greater_is_better=greater_is_better))
        plotly.offline.plot(ts.plot())
        plotly.offline.plot(result.backtest.plot())
        plotly.offline.plot(result.forecast.plot())
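
# A minimal usage sketch (the values below are illustrative assumptions, with `result`
# taken to be a `forecast_pipeline` output): validates the full pipeline output,
# including the grid search, backtest, and forecast.
check_forecast_pipeline_result(
    result,
    coverage=0.95,                # must match the coverage passed to the pipeline
    strategy=None,                # skip the null-model strategy check
    interactive=False,            # set to True to print metrics and plot the results
    expected_grid_size=None,      # skip the grid-size check
    lower_bound_cv=0.0,
    upper_bound_cv=None,
    score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
    greater_is_better=False,
    relative_error_tolerance=None)
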
Example #5
def test_summarize_grid_search_results(pipeline_results):
    """Tests summarize_grid_search_results"""
    # Tests EvaluationMetricEnum `score_func`, `cv_report_metrics=CV_REPORT_METRICS_ALL`
    grid_search = pipeline_results["1"].grid_search
    metric = EvaluationMetricEnum.MeanAbsolutePercentError
    cv_result = summarize_grid_search_results(
        grid_search=grid_search,
        only_changing_params=True,
        combine_splits=True,
        score_func=metric.name,
        score_func_greater_is_better=metric.get_metric_greater_is_better())
    assert cv_result.shape == (4, 60)
    # The proper scores are extracted
    short_name = metric.get_metric_name()
    expected = grid_search.cv_results_[f"mean_test_{short_name}"]
    assert_equal(np.array(cv_result[f"mean_test_{short_name}"]), expected)
    # Rank direction is correct
    assert cv_result[f"rank_test_{short_name}"].idxmin(
    ) == cv_result[f"mean_test_{short_name}"].idxmin()
    assert all(cv_result[f"mean_test_{short_name}"] > 0)
    assert [
        "rank_test_MAE", "rank_test_MSE", "rank_test_MedAPE", "rank_test_MAPE",
        "mean_test_MAE", "mean_test_MSE", "mean_test_MedAPE", "mean_test_MAPE",
        "split_test_MAPE", "split_test_MSE", "split_test_MAE",
        "split_test_MedAPE", "mean_train_MAE", "mean_train_MSE",
        "mean_train_MedAPE", "mean_train_MAPE", "params",
        "param_estimator__strategy", "param_estimator__quantile",
        "param_estimator__constant", "split_train_MAPE", "split_train_MSE",
        "split_train_MAE", "split_train_MedAPE", "mean_fit_time",
        "std_fit_time", "mean_score_time", "std_score_time", "split0_test_MAE",
        "split1_test_MAE", "split2_test_MAE", "std_test_MAE",
        "split0_train_MAE", "split1_train_MAE", "split2_train_MAE",
        "std_train_MAE", "split0_test_MSE", "split1_test_MSE",
        "split2_test_MSE", "std_test_MSE", "split0_train_MSE",
        "split1_train_MSE", "split2_train_MSE", "std_train_MSE",
        "split0_test_MedAPE", "split1_test_MedAPE", "split2_test_MedAPE",
        "std_test_MedAPE", "split0_train_MedAPE", "split1_train_MedAPE",
        "split2_train_MedAPE", "std_train_MedAPE", "split0_test_MAPE",
        "split1_test_MAPE", "split2_test_MAPE", "std_test_MAPE",
        "split0_train_MAPE", "split1_train_MAPE", "split2_train_MAPE",
        "std_train_MAPE"
    ] == list(cv_result.columns)

    # `combine_splits=False`
    cv_result = summarize_grid_search_results(
        grid_search=grid_search,
        only_changing_params=True,
        combine_splits=False,
        score_func=metric.name,
        score_func_greater_is_better=metric.get_metric_greater_is_better(),
        cv_report_metrics=CV_REPORT_METRICS_ALL)
    assert cv_result.shape == (4, 52)  # no train/test split summary for 4 metrics
    assert "split_test_MedAPE" not in cv_result.columns

    # cv_report_metrics=list, different column_order
    cv_result = summarize_grid_search_results(
        grid_search=grid_search,
        only_changing_params=True,
        combine_splits=False,
        score_func=metric.name,
        score_func_greater_is_better=metric.get_metric_greater_is_better(),
        cv_report_metrics=[EvaluationMetricEnum.MeanSquaredError.name],
        column_order=["mean", "time", ".*"])
    assert cv_result.shape == (4, 30)  # only two metrics in the summary
    assert [
        "mean_fit_time", "mean_score_time", "mean_test_MSE", "mean_train_MSE",
        "mean_test_MAPE", "mean_train_MAPE", "std_fit_time", "std_score_time",
        "param_estimator__strategy", "param_estimator__quantile",
        "param_estimator__constant", "params", "split0_test_MSE",
        "split1_test_MSE", "split2_test_MSE", "std_test_MSE", "rank_test_MSE",
        "split0_train_MSE", "split1_train_MSE", "split2_train_MSE",
        "std_train_MSE", "split0_test_MAPE", "split1_test_MAPE",
        "split2_test_MAPE", "std_test_MAPE", "rank_test_MAPE",
        "split0_train_MAPE", "split1_train_MAPE", "split2_train_MAPE",
        "std_train_MAPE"
    ] == list(cv_result.columns)
    # These metrics are computed but not requested in summary
    assert "rank_test_MedAPE" not in cv_result.columns
    assert "mean_test_MAE" not in cv_result.columns

    # cv_report_metrics=None, different column order
    cv_result = summarize_grid_search_results(
        grid_search=grid_search,
        only_changing_params=True,
        combine_splits=False,
        score_func=metric.name,
        score_func_greater_is_better=metric.get_metric_greater_is_better(),
        cv_report_metrics=None,
        column_order=["split", "rank", "mean", "params"])
    assert cv_result.shape == (4, 12)  # only one metric in the summary
    assert [
        "split0_test_MAPE", "split1_test_MAPE", "split2_test_MAPE",
        "split0_train_MAPE", "split1_train_MAPE", "split2_train_MAPE",
        "rank_test_MAPE", "mean_fit_time", "mean_score_time", "mean_test_MAPE",
        "mean_train_MAPE", "params"
    ] == list(cv_result.columns)
    assert "rank_test_MSE" not in cv_result.columns

    # Tests FRACTION_OUTSIDE_TOLERANCE `score_func`
    grid_search = pipeline_results["2"].grid_search
    cv_result = summarize_grid_search_results(
        grid_search=grid_search,
        only_changing_params=True,
        score_func=FRACTION_OUTSIDE_TOLERANCE,
        score_func_greater_is_better=False)
    assert cv_result.shape == (4, 242)
    # The proper scores are extracted
    short_name = FRACTION_OUTSIDE_TOLERANCE_NAME
    expected = grid_search.cv_results_[f"mean_test_{short_name}"]
    assert_equal(np.array(cv_result[f"mean_test_{short_name}"]), expected)
    # Rank direction is correct
    assert cv_result[f"rank_test_{short_name}"].idxmin(
    ) == cv_result[f"mean_test_{short_name}"].idxmin()
    assert all(cv_result[f"mean_test_{short_name}"] > 0)

    # Tests callable `score_func`, greater_is_better=True, split scores
    grid_search = pipeline_results["3"].grid_search
    cv_max_splits = 2
    cv_result = summarize_grid_search_results(
        grid_search=grid_search,
        only_changing_params=True,
        score_func=mean_absolute_error,
        score_func_greater_is_better=True)
    assert cv_result.shape == (4, 20)
    # the proper scores are extracted
    short_name = CUSTOM_SCORE_FUNC_NAME
    expected = grid_search.cv_results_[f"mean_test_{short_name}"]
    assert_equal(np.array(cv_result[f"mean_test_{short_name}"]), expected)
    # Rank direction is correct
    assert cv_result[f"rank_test_{short_name}"].idxmin(
    ) == cv_result[f"mean_test_{short_name}"].idxmax()  # NB: max
    assert all(cv_result[f"mean_test_{short_name}"] > 0)
    assert len(cv_result["params"]
               [0]) == 2  # two params have multiple options in the grid
    assert len(cv_result[f"split_test_{short_name}"][0]) == cv_max_splits
    # no rounding is applied
    assert cv_result[f"mean_test_{short_name}"][1] == pytest.approx(2.430402,
                                                                    rel=1e-5)
    assert cv_result[f"mean_train_{short_name}"][1] == pytest.approx(1.839883,
                                                                     rel=1e-5)
    assert cv_result[f"std_test_{short_name}"][1] == pytest.approx(0.16548,
                                                                   rel=1e-5)
    assert cv_result[f"split_test_{short_name}"][1][0] == pytest.approx(
        2.26492, rel=1e-5)
    assert cv_result[f"split_train_{short_name}"][1][0] == pytest.approx(
        1.84082, rel=1e-5)
    expected = grid_search.cv_results_
    for k, v in cv_result.items():
        if k in expected and k not in ("params", f"rank_test_{short_name}"):
            assert_equal(pd.Series(expected[k], name=k), v)

    # decimals=2, and only_changing_params=False
    cv_result = summarize_grid_search_results(
        grid_search=grid_search,
        only_changing_params=False,
        decimals=2,
        score_func=mean_absolute_error,
        score_func_greater_is_better=False)
    assert cv_result.shape == (4, 20)
    # only_changing_params=False, so all params in hyperparameter_grid are included
    assert len(cv_result["params"][0]) == 4
    # rounding is applied
    assert cv_result[f"mean_test_{short_name}"][1] == 2.43
    assert cv_result[f"mean_train_{short_name}"][1] == 1.84
    assert cv_result[f"std_test_{short_name}"][1] == 0.17
    assert cv_result[f"split_test_{short_name}"][1][0] == 2.26
    assert cv_result[f"split_train_{short_name}"][1][0] == 1.84
Example #6
def test_forecast_pipeline_rolling_evaluation_prophet():
    """Checks the output rolling evaluation with Prophet template"""
    data = generate_df_with_reg_for_tests(freq="D",
                                          periods=30,
                                          remove_extra_cols=True,
                                          mask_test_actuals=True)
    reg_cols = ["regressor1", "regressor2", "regressor3"]
    keep_cols = [TIME_COL, VALUE_COL] + reg_cols
    df = data["df"][keep_cols]

    hyperparameter_grid = {
        "estimator__weekly_seasonality": [True],
        "estimator__daily_seasonality": [True, False],
        "estimator__n_changepoints":
        [0],  # to speed up test case, remove for better fit
        "estimator__uncertainty_samples": [10],  # to speed up test case
        "estimator__add_regressor_dict": [{
            "regressor1": {
                "prior_scale": 10,
                "standardize": True,
                "mode": 'additive'
            },
            "regressor2": {
                "prior_scale": 15,
                "standardize": False,
                "mode": 'additive'
            },
            "regressor3": {}
        }]
    }
    pipeline_params = mock_pipeline(
        df=df,
        forecast_horizon=3,
        regressor_cols=["regressor1", "regressor2", "regressor3"],
        estimator=ProphetEstimator(),
        hyperparameter_grid=hyperparameter_grid)
    tscv = RollingTimeSeriesSplit(forecast_horizon=3,
                                  expanding_window=True,
                                  max_splits=1)
    rolling_evaluation = forecast_pipeline_rolling_evaluation(
        pipeline_params=pipeline_params, tscv=tscv)

    expected_splits_n = tscv.max_splits
    assert len(rolling_evaluation.keys()) == expected_splits_n
    assert set(rolling_evaluation.keys()) == {"split_0"}

    split0_output = rolling_evaluation["split_0"]
    assert round(split0_output["runtime_sec"], 3) == split0_output["runtime_sec"]

    pipeline_result = split0_output["pipeline_result"]
    # Calculates expected pipeline
    train, test = list(tscv.split(X=df))[0]
    df_train = df.loc[train]
    pipeline_params_updated = pipeline_params
    pipeline_params_updated["test_horizon"] = 0
    pipeline_params_updated["df"] = df_train
    expected_pipeline_result = forecast_pipeline(**pipeline_params_updated)

    assert pipeline_result.backtest is None
    # Checks output is identical when there is only 1 split
    pipeline_grid_search = summarize_grid_search_results(
        pipeline_result.grid_search,
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name)
    expected_grid_search = summarize_grid_search_results(
        expected_pipeline_result.grid_search,
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name)
    assert_equal(pipeline_grid_search["mean_test_MAPE"],
                 expected_grid_search["mean_test_MAPE"])
    assert_equal(pipeline_result.grid_search.cv.__dict__,
                 expected_pipeline_result.grid_search.cv.__dict__)
    # Checks forecast df has the correct number of rows
    expected_rows = pipeline_result.timeseries.fit_df.shape[0] + tscv.forecast_horizon
    assert pipeline_result.forecast.df.shape[0] == expected_rows
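
# A short sketch (assuming the objects above): each entry of `rolling_evaluation` holds the
# runtime and the full pipeline result for one rolling split, so per-split CV summaries can
# be collected in a loop.
for split_name, split_output in rolling_evaluation.items():
    split_summary = summarize_grid_search_results(
        split_output["pipeline_result"].grid_search,
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name)
    print(split_name, split_output["runtime_sec"], split_summary["mean_test_MAPE"].min())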