示例#1
0
def test_get_integer():
    """Checks `get_integer`: default fallback, pass-through, float conversion and validation errors."""
    with pytest.warns(Warning) as caught:
        # None is replaced by the default value
        from_default = get_integer(None, "val", min_value=10, default_value=20)
        assert from_default == 20

        # an integer within range is returned as is
        from_int = get_integer(11, "val", min_value=10, default_value=20)
        assert from_int == 11

        # a float is converted to an integer
        from_float = get_integer(10.5, "val", min_value=10, default_value=20)
        assert from_float == 10

        # the first recorded warning must report the conversion
        first_message = caught[0].message.args[0]
        assert "val converted to integer 10 from 10.5" in first_message

    # (expected error pattern, positional args, keyword args) for each invalid call
    invalid_calls = [
        ("val must be an integer", ("q", "val"), {}),
        ("val must be >= 1", (0, "val"), {"min_value": 1}),
        ("val must be >= 1", (None, "val"), {"min_value": 1, "default_value": 0}),
    ]
    for pattern, args, kwargs in invalid_calls:
        with pytest.raises(ValueError, match=pattern):
            get_integer(*args, **kwargs)
示例#2
0
    def pipeline_wrapper(
            # NOTE: this signature must stay identical to that of forecast_pipeline().
            # Parameters are listed explicitly instead of using **kwargs
            # so that each one can be checked directly in the body.
            # input
            df: pd.DataFrame,
            time_col=TIME_COL,
            value_col=VALUE_COL,
            date_format=None,
            tz=None,
            freq=None,
            train_end_date=None,
            anomaly_info=None,
            # model
            pipeline=None,
            regressor_cols=None,
            lagged_regressor_cols=None,
            estimator=SimpleSilverkiteEstimator(),
            hyperparameter_grid=None,
            hyperparameter_budget=None,
            n_jobs=COMPUTATION_N_JOBS,
            verbose=1,
            # forecast
            forecast_horizon=None,
            coverage=0.95,
            test_horizon=None,
            periods_between_train_test=None,
            agg_periods=None,
            agg_func=None,
            # evaluation
            score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
            score_func_greater_is_better=False,
            cv_report_metrics=None,
            null_model_params=None,
            relative_error_tolerance=None,
            # CV
            cv_horizon=None,
            cv_min_train_periods=None,
            cv_expanding_window=False,
            cv_use_most_recent_splits=False,
            cv_periods_between_splits=None,
            cv_periods_between_train_test=0,
            cv_max_splits=3):
        """Validates the pipeline inputs, fills in default time parameters,
        and forwards everything to ``pipeline_function``.

        The horizon, backtest and CV period parameters are replaced by the
        values returned from ``get_default_time_parameters``. All other
        parameters are forwarded unchanged, except ``hyperparameter_budget``
        which is passed through ``get_integer`` when it is not None.

        Returns
        -------
        result
            Whatever ``pipeline_function`` returns for the validated parameters.

        Raises
        ------
        ValueError
            If ``coverage`` is outside [0, 1], if ``relative_error_tolerance`` is negative,
            if both CV and backtest are disabled, or if ``test_horizon`` exceeds
            the number of rows in ``df``.

        Warns
        -----
        When backtest is disabled, when ``df`` has fewer rows than twice the
        forecast horizon, when ``test_horizon`` exceeds ``forecast_horizon``,
        and when ``test_horizon`` exceeds one third of the rows in ``df``.
        """
        # range checks on parameters that are not resolved below
        if coverage is not None and (coverage < 0 or coverage > 1):
            raise ValueError(f"coverage must be between 0 and 1, found {coverage}")
        if relative_error_tolerance is not None and relative_error_tolerance < 0:
            raise ValueError(f"relative_error_tolerance must non-negative, found {relative_error_tolerance}")

        # resolves forecast horizon, test, and cross-validation parameters
        min_gap = min_gap_in_seconds(df=df, time_col=time_col)
        n_obs = df.shape[0]
        resolved = get_default_time_parameters(
            period=min_gap,
            num_observations=n_obs,
            forecast_horizon=forecast_horizon,
            test_horizon=test_horizon,
            periods_between_train_test=periods_between_train_test,
            cv_horizon=cv_horizon,
            cv_min_train_periods=cv_min_train_periods,
            cv_periods_between_train_test=cv_periods_between_train_test)
        (forecast_horizon,
         test_horizon,
         periods_between_train_test,
         cv_horizon,
         cv_min_train_periods,
         cv_periods_between_train_test) = (
            resolved.get(key) for key in (
                "forecast_horizon",
                "test_horizon",
                "periods_between_train_test",
                "cv_horizon",
                "cv_min_train_periods",
                "cv_periods_between_train_test"))

        # the budget, when given, must be an integer >= 1
        if hyperparameter_budget is not None:
            hyperparameter_budget = get_integer(
                hyperparameter_budget,
                "hyperparameter_budget",
                min_value=1)

        # at least one of CV / backtest is required
        backtest_disabled = test_horizon == 0
        cv_disabled = cv_horizon == 0 or cv_max_splits == 0
        if cv_disabled and backtest_disabled:
            raise ValueError("Either CV or backtest must be enabled."
                             " Set cv_horizon and cv_max_splits to nonzero values to enable CV."
                             " Set test_horizon to nonzero value to enable backtest."
                             " It's important to check model"
                             " performance on historical data.")
        if backtest_disabled:
            warnings.warn("No data selected for test (test_horizon=0). "
                          "It is important to check out of sample performance")

        # compares the horizons with the amount of available data
        if n_obs < 2 * forecast_horizon:
            warnings.warn("Not enough training data to forecast the full forecast_horizon."
                          " Exercise extra caution with"
                          f" forecasted values after {n_obs // 2} periods.")

        if test_horizon > n_obs:
            raise ValueError(f"test_horizon ({test_horizon}) is too large."
                             " Must be less than the number "
                             f"of input data points: {n_obs})")

        if test_horizon > forecast_horizon:
            warnings.warn("test_horizon should never be larger than forecast_horizon.")

        one_third = n_obs // 3
        if test_horizon > one_third:
            warnings.warn("test_horizon should be <= than 1/3 of the data set size to allow enough data to train"
                          f" a backtest model. Consider reducing to {one_third}. If this is smaller"
                          " than the forecast_horizon, you will need to make a trade-off between setting"
                          " test_horizon=forecast_horizon and having enough data left over to properly"
                          " train a realistic backtest model.")

        # records the resolved horizons
        for label, horizon in (
                ("forecast_horizon", forecast_horizon),
                ("test_horizon", test_horizon),
                ("cv_horizon", cv_horizon)):
            log_message(f"{label}: {horizon}", LoggingLevelEnum.INFO)

        return pipeline_function(
            df,
            # input
            time_col=time_col,
            value_col=value_col,
            date_format=date_format,
            tz=tz,
            freq=freq,
            train_end_date=train_end_date,
            anomaly_info=anomaly_info,
            # model
            pipeline=pipeline,
            regressor_cols=regressor_cols,
            lagged_regressor_cols=lagged_regressor_cols,
            estimator=estimator,
            hyperparameter_grid=hyperparameter_grid,
            hyperparameter_budget=hyperparameter_budget,
            n_jobs=n_jobs,
            verbose=verbose,
            # forecast
            forecast_horizon=forecast_horizon,
            coverage=coverage,
            test_horizon=test_horizon,
            periods_between_train_test=periods_between_train_test,
            agg_periods=agg_periods,
            agg_func=agg_func,
            # evaluation
            score_func=score_func,
            score_func_greater_is_better=score_func_greater_is_better,
            cv_report_metrics=cv_report_metrics,
            null_model_params=null_model_params,
            relative_error_tolerance=relative_error_tolerance,
            # CV
            cv_horizon=cv_horizon,
            cv_min_train_periods=cv_min_train_periods,
            cv_expanding_window=cv_expanding_window,
            cv_use_most_recent_splits=cv_use_most_recent_splits,
            cv_periods_between_splits=cv_periods_between_splits,
            cv_periods_between_train_test=cv_periods_between_train_test,
            cv_max_splits=cv_max_splits
        )
示例#3
0
def get_default_time_parameters(
        period,
        num_observations,
        forecast_horizon=None,
        test_horizon=None,
        periods_between_train_test=None,
        cv_horizon=None,
        cv_min_train_periods=None,
        cv_expanding_window=False,
        cv_periods_between_splits=None,
        cv_periods_between_train_test=None,
        cv_max_splits=3):
    """Resolves the forecast horizon, backtest, and cross-validation period parameters,
    replacing unspecified (None) values by defaults based on the input data.

    Provides defaults to users of
    `~greykite.framework.pipeline.pipeline.forecast_pipeline` that do not
    depend on model configuration (the template).

    Parameters
    ----------
    period: `float`
        Period of each observation (i.e. average time between observations, in seconds).
        Only used to choose the default ``forecast_horizon``.
    num_observations: `int`
        Number of observations in the input data.
    forecast_horizon: `int` or None, default None
        Number of periods to forecast into the future. Must be >= 1.
        If None, the value is taken from ``get_default_horizon_from_period``.
    test_horizon: `int` or None, default None
        Number of periods held back from end of df for test. Must be >= 0.
        If None, default is ``forecast_horizon``. Set to 0 to skip backtest.
        A value >= ``num_observations`` is replaced by
        ``floor(0.2 * num_observations)``.
    periods_between_train_test : `int` or None, default None
        Number of periods gap between train and test. Must be >= 0.
        If None, default is 0.
    cv_horizon: `int` or None, default None
        Number of periods in each CV test set. Must be >= 0.
        If None, default is ``forecast_horizon``. Set to 0 to skip CV.
        A value >= ``num_observations`` is replaced by
        ``floor(0.2 * num_observations)``.
    cv_min_train_periods: `int` or None, default None
        Minimum number of periods for training each CV fold.
        Returned exactly as provided; no default is applied by this function.
    cv_expanding_window: `bool`, default False
        Accepted for signature compatibility; not used by this function.
    cv_periods_between_splits: `int` or None, default None
        Accepted for signature compatibility; not used by this function.
    cv_periods_between_train_test: `int` or None, default None
        Number of periods gap between train and test in a CV split. Must be >= 0.
        If None, default is ``periods_between_train_test``.
    cv_max_splits: `int` or None, default 3
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    time_params : `dict` [`str`, `int` or None]
        Resolved values under the keys "forecast_horizon", "test_horizon",
        "periods_between_train_test", "cv_horizon", "cv_min_train_periods",
        and "cv_periods_between_train_test".
    """
    requested_horizon = forecast_horizon
    if requested_horizon is None:
        requested_horizon = get_default_horizon_from_period(
            period=period,
            num_observations=num_observations)
    forecast_horizon = get_integer(
        val=requested_horizon,
        name="forecast_horizon",
        min_value=1)

    def _resolve_horizon(value, label):
        """Resolves a horizon that defaults to ``forecast_horizon``, capped below the data size."""
        horizon = get_integer(
            val=value,
            name=label,
            min_value=0,
            default_value=forecast_horizon)
        if horizon >= num_observations:
            # not enough data for this horizon: falls back to a default 80/20 split
            return math.floor(num_observations * 0.2)
        return horizon

    test_horizon = _resolve_horizon(test_horizon, "test_horizon")
    # RollingTimeSeriesSplit handles the case of no CV splits, so it is not handled in detail here.
    # The cap is a temporary patch for cv_horizon == num_observations, which throws an error
    # in RollingTimeSeriesSplit.
    cv_horizon = _resolve_horizon(cv_horizon, "cv_horizon")

    def _resolve_gap(value, label, fallback):
        """Resolves a non-negative train/test gap, using ``fallback`` if ``value`` is None."""
        return get_integer(
            val=value,
            name=label,
            min_value=0,
            default_value=fallback)

    periods_between_train_test = _resolve_gap(
        periods_between_train_test,
        "periods_between_train_test",
        0)
    cv_periods_between_train_test = _resolve_gap(
        cv_periods_between_train_test,
        "cv_periods_between_train_test",
        periods_between_train_test)

    time_params = {
        "forecast_horizon": forecast_horizon,
        "test_horizon": test_horizon,
        "periods_between_train_test": periods_between_train_test,
        "cv_horizon": cv_horizon,
        "cv_min_train_periods": cv_min_train_periods,
        "cv_periods_between_train_test": cv_periods_between_train_test,
    }
    return time_params
示例#4
0
    def __init__(self,
                 forecast_horizon,
                 min_train_periods=None,
                 expanding_window=False,
                 use_most_recent_splits=False,
                 periods_between_splits=None,
                 periods_between_train_test=0,
                 max_splits=3):
        """Initializes attributes of RollingTimeSeriesSplit

        The period parameters are validated with ``get_integer``, which also
        fills in the defaults described below when a value is None.

        Parameters
        ----------
        forecast_horizon : `int`
            How many periods in each CV test set. Must be >= 1.

        min_train_periods : `int` or None, optional
            Minimum number of periods for training. Must be >= 1.
            If ``expanding_window`` is False, every training period has this size.
            If None, default is 2 * ``forecast_horizon``.
            A warning is issued if the value is smaller than that default.

        expanding_window : `bool`, default False
            If True, training window for each CV split is fixed to the first available date.
            Otherwise, train start date is sliding, determined by ``min_train_periods``.

        use_most_recent_splits: `bool`, default False
            If True, splits from the end of the dataset are used.
            Else a sampling strategy is applied. Check
            `~greykite.sklearn.cross_validation.RollingTimeSeriesSplit._sample_splits`
            for details.

        periods_between_splits : `int` or None
            Number of periods to slide the test window. Must be >= 1.
            If None, default is ``forecast_horizon`` (non-overlapping test sets).

        periods_between_train_test : `int`
            Number of periods gap between train and test within a CV split. Must be >= 0.
            If None, default is 0.

        max_splits : `int` or None
            Maximum number of CV splits. Given the above configuration, samples up to max_splits train/test splits,
            preferring splits toward the end of available data. If None, uses all splits.
            Stored as provided, without validation.
        """
        super().__init__()
        self.forecast_horizon = get_integer(forecast_horizon,
                                            name="forecast_horizon",
                                            min_value=1)

        # by default, use at least twice the forecast horizon for training.
        # The threshold is derived from the validated integer horizon so that the default,
        # the check below, and the warning message all refer to the same number.
        recommended_train_periods = 2 * self.forecast_horizon
        self.min_train_periods = get_integer(min_train_periods,
                                             name="min_train_periods",
                                             min_value=1,
                                             default_value=recommended_train_periods)

        # by default, use fixed size training window
        self.expanding_window = expanding_window

        # by default, does not force most recent splits
        self.use_most_recent_splits = use_most_recent_splits

        # by default, use non-overlapping test sets
        self.periods_between_splits = get_integer(
            periods_between_splits,
            name="periods_between_splits",
            min_value=1,
            default_value=self.forecast_horizon)

        # by default, use test set immediately following train set
        self.periods_between_train_test = get_integer(
            periods_between_train_test,
            name="periods_between_train_test",
            min_value=0,
            default_value=0)

        if self.min_train_periods < recommended_train_periods:
            # reports the enforced threshold, not a multiple of the raw (possibly non-integer) argument
            warnings.warn(
                f"`min_train_periods` is too small for your `forecast_horizon`. Should be at least"
                f" {recommended_train_periods}=2*`forecast_horizon`.")

        self.max_splits = max_splits
        self.min_splits = 1  # CV ensures there is always at least one split
        # test end index for the first CV split, before applying offset to ensure last data point in X is used
        self.__starting_test_index = (self.forecast_horizon +
                                      self.min_train_periods +
                                      self.periods_between_train_test - 1)