Example #1
    def transform(self, X):
        """Replaces outliers with NaN.

        Parameters
        ----------
        X : `pandas.DataFrame`
            Data to transform; each column is a time series.
            Columns are expected to be numeric.

        Returns
        -------
        X_outlier : `pandas.DataFrame`
            A copy of the data frame, with outliers replaced by NaN.
        """
        assert isinstance(X, pd.DataFrame)
        result = X.copy()
        if self.z_cutoff is not None:
            if self.use_fit_baseline:
                if self._is_fitted is None:
                    raise NotFittedError(
                        "This instance is not fitted yet. Call 'fit' with appropriate arguments "
                        "before calling 'transform'.")
                mean = self.mean
                std = self.std
            else:
                mean = X.mean()
                std = X.std()
            outlier_indices = np.abs(X - mean) > std * self.z_cutoff
            if np.any(outlier_indices):
                total_na = outlier_indices.sum().sum()
                log_message(f"Detected {total_na} outlier(s).",
                            LoggingLevelEnum.INFO)
            result = result.mask(outlier_indices)
        return result
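A minimal standalone sketch of the masking rule above, using plain pandas/numpy with the data's own mean/std as the baseline (the data and cutoff are hypothetical):

import numpy as np
import pandas as pd

# Six typical points and one obvious outlier.
X = pd.DataFrame({"y": [1.0, 2.0, 1.5, 1.8, 2.2, 1.1, 100.0]})
z_cutoff = 2.0

mean = X.mean()
std = X.std()
outlier_indices = np.abs(X - mean) > std * z_cutoff  # same rule as `transform`
result = X.mask(outlier_indices)                     # flagged entries become NaN
print(result["y"].isna().sum())  # 1 (the value 100.0 is masked)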
Example #2
    def summary(self):
        """Creates human readable string of how the model works, including relevant diagnostics
        These details cannot be extracted from the forecast alone
        Prints model configuration. Extend this in child class to print the trained model parameters.

        Log message is printed to the cst.LOGGER_NAME logger.
        """
        log_message(self,
                    LoggingLevelEnum.DEBUG)  # print model input parameters
Example #3
    def summary(self):
        """Prints input parameters and Prophet model parameters.

        The message is logged at the INFO level; nothing is returned.
        """
        super().summary()
        if self.model is not None:
            # `pformat` (from the `pprint` module) returns the formatted string,
            # whereas `pprint` prints to stdout and returns None.
            log_message(pformat(self.model.params), LoggingLevelEnum.INFO)
Example #4
    def transform(self, X):
        """Imputes missing values in input time series.

        Checks the % of data points that are null, and provides a warning if
        it exceeds ``self.max_frac``.

        Parameters
        ----------
        X : `pandas.DataFrame`
            Data to transform; each column is a time series.
            Columns are expected to be numeric.

        Returns
        -------
        X_imputed : `pandas.DataFrame`
            A copy of the data frame, with missing values imputed.
        """
        if self._is_fitted is None:
            raise NotFittedError(
                "This instance is not fitted yet. Call 'fit' with appropriate arguments "
                "before calling 'transform'.")
        assert isinstance(X, pd.DataFrame)

        self.null_frac = X.isna().mean()  # fraction of NaNs in each column
        if np.any(self.null_frac > self.max_frac):
            warnings.warn(f"Input data has many null values. Missing {self.null_frac.max():.2%} of one input.",
                          RuntimeWarning)
        if any(self.null_frac > 0.0):
            log_message(f"Missing data detected: {self.null_frac.mean():.2%} of all input values "
                        f"are null. (If future external regressor(s) are used, some missing values in "
                        f"`value_col` are expected.)",
                        LoggingLevelEnum.INFO)

        if self.impute_algorithm is not None:
            if self.impute_algorithm == "interpolate":
                # Uses `pandas.DataFrame.interpolate`
                X_imputed = X.interpolate(**self.impute_params)
            elif self.impute_algorithm == "ts_interpolate":
                # Uses `impute_with_lags_multi`
                impute_info = impute_with_lags_multi(df=X, **self.impute_params)
                X_imputed = impute_info["df"]
                self.missing_info = impute_info["missing_info"]
            else:
                raise ValueError(f"`impute_algorithm` '{self.impute_algorithm}' is not recognized."
                                 f"Must be one of 'ts_interpolate', 'interpolate'")

            if self.impute_all:
                # A second pass is taken to make sure there are no NaNs.
                X_imputed = X_imputed.interpolate(**DEFAULT_PARAMS["interpolate"])
        else:
            # no-op
            X_imputed = X.copy()
        return X_imputed
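The "interpolate" branch above delegates to `pandas.DataFrame.interpolate`; a minimal sketch of that behavior on hypothetical data:

import pandas as pd

X = pd.DataFrame({"y": [1.0, None, 3.0, None, 5.0]})
X_imputed = X.interpolate(method="linear")  # mirrors the "interpolate" branch
print(X_imputed["y"].tolist())  # [1.0, 2.0, 3.0, 4.0, 5.0]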
Example #5
def fill_missing_dates(df, time_col=TIME_COL, freq=None):
    """Looks for gaps in df[time_col] and returns a pandas.DataFrame
        with the missing rows added in.
        Warning: if freq doesn't match intended freq, then values may be removed.

    Parameters
    ----------
    df : `pandas.DataFrame`
        Dataframe with column ``time_col``.
    time_col : `str`, default TIME_COL
        Time column name.
    freq : `str` or None, default None
        Timeseries frequency, a DateOffset alias.
        If None, inferred automatically by `pandas.infer_freq`.

    Returns
    -------
    full_df : `pandas.DataFrame`
        ``df`` with rows added for missing timestamps
    added_timepoints : `int`
        The number of rows added to ``df``
    dropped_timepoints : `int`
        The number of rows removed from ``df``.
        If the timestamps in ``df`` are not evenly spaced,
        irregular timestamps may be removed.
    """
    freq = freq if freq is not None else pd.infer_freq(df[time_col])
    df = df.reset_index(drop=True)
    complete_dates = pd.DataFrame({
        time_col:
        pd.date_range(start=min(df[time_col]),
                      end=max(df[time_col]),
                      freq=freq)
    })
    full_df = pd.merge(complete_dates, df, how="left", on=time_col)

    # counts the timestamps in one but not the other
    before = set(df[time_col].values)
    after = set(full_df[time_col].values)
    added_timepoints = len(after - before)
    dropped_timepoints = len(before - after)
    if added_timepoints > 0:
        log_message(
            f"Added {added_timepoints} missing dates. There were {len(before)} values originally.",
            LoggingLevelEnum.INFO)
    if dropped_timepoints > 0:
        warnings.warn(
            f"Dropped {dropped_timepoints} dates when filling gaps in input data. Provide data frequency"
            f" and make sure data points are evenly spaced.")

    return full_df, added_timepoints, dropped_timepoints
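A usage sketch on hypothetical daily data with one missing date (assumes `fill_missing_dates` and its logging helpers above are in scope):

import pandas as pd

df = pd.DataFrame({
    "ts": pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-04"]),
    "y": [1.0, 2.0, 4.0],
})
full_df, added, dropped = fill_missing_dates(df, time_col="ts", freq="D")
print(added, dropped)  # 1 0 -- a NaN row is added for 2020-01-03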
Example #6
def aggregate_array(ts_values, agg_periods=7, agg_func=np.sum):
    """Aggregates input array.
    Divides array from left to right into bins of size agg_periods, and applies agg_func to each block.
    Drops records from the left if needed to ensure all bins are full.

    :param ts_values: list, np.array, or pd.Series to aggregate
    :param agg_periods: number of periods to combine in aggregation
    :param agg_func: aggregation function, e.g. np.max, np.sum. Must take an array and return a number
    :return: array, aggregated so that every agg_periods periods are combined into one

    Examples:
    >>> aggregate_array([1.0, 2.0, 3.0, 4.0], 2, np.sum)
    array([3., 7.])
    >>> aggregate_array(pd.Series([1.0, 2.0, 3.0, 4.0, 5.0]), 2, np.sum)
    array([5., 9.])
    >>> aggregate_array(np.array([1.0, 2.0, 3.0, 4.0, 5.0]), 2, np.max)
    array([3., 5.])
    """
    ts_values = np.array(ts_values)
    n_periods = len(ts_values)

    drop_first_periods = n_periods % agg_periods  # drop these periods from the front, to ensure all bins are full
    if drop_first_periods == n_periods:
        drop_first_periods = 0
        warnings.warn(
            f"Requested agg_periods={agg_periods}, but there are only {n_periods} periods."
            " Using all of them for aggregation."
        )
    elif drop_first_periods > 0:
        log_message(
            f"Requested agg_periods={agg_periods} for data of length {n_periods}. Dropping first"
            f" {drop_first_periods} records before aggregation",
            LoggingLevelEnum.INFO)

    # creates dummy time index for aggregation
    dates = pd.date_range("2018-01-01",
                          periods=n_periods - drop_first_periods,
                          freq="1D")
    ts = pd.Series(ts_values[drop_first_periods:], index=dates)
    aggregated_array = ts.resample(f"{agg_periods}D", closed="left") \
        .agg(lambda x: agg_func(x)) \
        .values
    return aggregated_array
Example #7
    def predict(self, X, y=None):
        """Creates forecast for dates specified in X

        To enable caching, every subclass must call this at the beginning
        of its ``.predict()``. Before returning the result, the subclass
        ``.predict()`` must set ``self.cached_predictions_`` to the return value.

        Parameters
        ----------
        X : `pandas.DataFrame`
            Input timeseries with timestamp column and any additional regressors.
            Timestamps are the dates for prediction.
            Value column, if provided in X, is ignored.
        y : ignored

        Returns
        -------
        predictions : `pandas.DataFrame`
            Forecasted values for the dates in X. Columns:

                - TIME_COL dates
                - PREDICTED_COL predictions
                - PREDICTED_LOWER_COL lower bound of predictions, optional
                - PREDICTED_UPPER_COL upper bound of predictions, optional
                - [other columns], optional

            ``PREDICTED_LOWER_COL`` and ``PREDICTED_UPPER_COL`` are present
            if ``self.coverage`` is not None.
        """
        if self.cached_predictions_ is not None and X.equals(
                self.last_predicted_X_):
            log_message("Returning cached predictions.",
                        LoggingLevelEnum.DEBUG)
            return self.cached_predictions_
        else:
            # Updates `last_predicted_X_` to the new value.
            # To enable caching, the subclass must set
            # `self.cached_predictions_` to the returned result.
            self.last_predicted_X_ = X
            return None
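A sketch of the caching contract described in the docstring. The subclass name and `_compute_forecast` helper are hypothetical, and the base class above is assumed importable; Example #14 below follows the same pattern with a real estimator:

class MyEstimator(BaseForecastEstimator):  # hypothetical subclass
    def predict(self, X, y=None):
        # Returns the cached result if `X` matches the last prediction input.
        cached_predictions = super().predict(X=X)
        if cached_predictions is not None:
            return cached_predictions
        predictions = self._compute_forecast(X)  # hypothetical helper
        self.cached_predictions_ = predictions   # enables caching for the next call
        return predictions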
Example #8
def get_hyperparameter_searcher(
        hyperparameter_grid,
        model,
        cv=None,
        hyperparameter_budget=None,
        n_jobs=1,
        verbose=1,
        **kwargs) -> RandomizedSearchCV:
    """Returns RandomizedSearchCV object for hyperparameter tuning via cross validation

    `sklearn.model_selection.RandomizedSearchCV` runs a full grid search if
    ``hyperparameter_budget`` is sufficient to exhaust the full
    ``hyperparameter_grid``, otherwise it samples uniformly at random from the space.

    Parameters
    ----------
    hyperparameter_grid : `dict` or `list` [`dict`]
        Dictionary with parameters names (string) as keys and distributions
        or lists of parameters to try. Distributions must provide a ``rvs``
        method for sampling (such as those from scipy.stats.distributions).
        Lists of parameters are sampled uniformly.

        May also be a list of such dictionaries to avoid undesired combinations of parameters.
        Passed as ``param_distributions`` to `sklearn.model_selection.RandomizedSearchCV`,
        see docs for more info.
    model : estimator object
        An object of that type is instantiated for each grid point. This is assumed to implement
        the scikit-learn estimator interface.
    cv : `int`, cross-validation generator, iterable, or None, default None
        Determines the cross-validation splitting strategy.
        See `sklearn.model_selection.RandomizedSearchCV`.
    hyperparameter_budget : `int` or None, default None
        Maximum number of hyperparameter sets to try within the ``hyperparameter_grid`` search space.
        If None, uses defaults:

            * exhaustive grid search if all values are constant
            * 10 if any value is a distribution to sample from

    n_jobs : `int` or None, default 1
        Number of jobs to run in parallel
        (the maximum number of concurrently running workers).
        ``-1`` uses all CPUs. ``-2`` uses all CPUs but one.
        ``None`` is treated as 1 unless in a `joblib.Parallel` backend context
        that specifies otherwise.
    verbose : `int`, default 1
        Verbosity level during CV.

        * if > 0, prints number of fits
        * if > 1, prints fit parameters, total score + fit time
        * if > 2, prints train/test scores
    kwargs : additional parameters
        Keyword arguments to pass to `~greykite.framework.pipeline.utils.get_scoring_and_refit`.
        Accepts the following parameters:

            - ``"score_func"``
            - ``"score_func_greater_is_better"``
            - ``"cv_report_metrics"``
            - ``"agg_periods"``
            - ``"agg_func"``
            - ``"relative_error_tolerance"``

    Returns
    -------
    grid_search : `sklearn.model_selection.RandomizedSearchCV`
        Object that can run randomized search on hyper parameters.
    """
    if hyperparameter_budget is None:
        # sets reasonable defaults when hyperparameter_budget is not provided
        try:
            # exhaustive search if explicit values are provided
            hyperparameter_budget = len(ParameterGrid(hyperparameter_grid))
            log_message(f"Setting hyperparameter_budget to {hyperparameter_budget} for full grid search.",
                        LoggingLevelEnum.DEBUG)
        except TypeError:  # parameter value is not iterable
            # sets budget to 10 if distribution for randomized search is provided
            hyperparameter_budget = 10
            log_message(f"Setting hyperparameter_budget to {hyperparameter_budget} to sample from"
                        f" provided distributions (and lists).", LoggingLevelEnum.WARNING)

    scoring, refit = get_scoring_and_refit(**kwargs)

    # note: RandomizedSearchCV operates like GridSearchCV when hyperparameter_grid contains no distributions
    grid_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=hyperparameter_grid,  # a fixed list or distribution to sample from
        n_iter=hyperparameter_budget,             # samples uniformly, up to hyperparameter_budget
        scoring=scoring,                          # model evaluation criteria (note: if None, uses the score function of the estimator)
        n_jobs=n_jobs,                            # parallelism
        refit=refit,                              # selects the best model
        cv=cv,
        verbose=verbose,
        pre_dispatch="2*n_jobs",                  # controls memory consumption
        return_train_score=True                   # NB: could be False for speedup
    )
    return grid_search
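The try/except above hinges on `len(ParameterGrid(...))`: it counts an exhaustive grid of list values but raises `TypeError` when a value is a distribution. A small demonstration (the grid values are hypothetical):

from scipy.stats import uniform
from sklearn.model_selection import ParameterGrid

# Exhaustive count works for list-valued grids.
print(len(ParameterGrid({"alpha": [0.1, 1.0], "fit_intercept": [True, False]})))  # 4

# A distribution is not iterable, so the count raises TypeError.
try:
    budget = len(ParameterGrid({"alpha": uniform(0.0, 1.0)}))
except TypeError:
    budget = 10  # fall back to a fixed sampling budget, as above
print(budget)  # 10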
Example #9
    def _get_estimators(self):
        """Gets the estimators for forecast one-by-one.

        If the given parameters indicate that multiple estimators are needed
        for the forecast one-by-one algorithm, these estimators are initialized
        with the proper parameters.

        Sets ``self.estimator_class``, ``self.estimators``, ``self.pred_indices``
        and ``self.estimator_map_list``.
        """
        # Only estimators in ``ONEBYONE_ESTIMATORS`` support forecast one-by-one.
        if self.estimator not in ONEBYONE_ESTIMATORS:
            raise ValueError(
                f"Estimator {self.estimator} does not support forecast"
                f" one-by-one.")

        self.estimator_class = ONEBYONE_ESTIMATORS[self.estimator]["class"]
        if self.estimator_params is None:
            self.estimator_params = {}

        # Sets estimator base parameters, so the prediction confidence intervals can be pulled.
        if "score_func" not in self.estimator_params:
            self.estimator_params["score_func"] = self.score_func
        if "coverage" not in self.estimator_params:
            self.estimator_params["coverage"] = self.coverage
        if "null_model_params" not in self.estimator_params:
            self.estimator_params["null_model_params"] = self.null_model_params

        # Checks if any provided parameters depend on forecast horizon.
        params_depending_on_horizon = ONEBYONE_ESTIMATORS[
            self.estimator]["params_depending_on_horizon"]
        depending_on_horizon = False
        if params_depending_on_horizon is not None:
            for param, values in params_depending_on_horizon.items():
                if param in self.estimator_params:
                    input_value = self.estimator_params.get(param)
                    if input_value in values:
                        depending_on_horizon = True
        if not depending_on_horizon:
            log_message(
                message="No parameters depending on forecast horizon found. "
                "Forecast one-by-one is not activated.",
                level=LoggingLevelEnum.INFO)

        # Checks if forecast horizon is a parameter in ``estimator_params``.
        # Forecast horizon should be different for different models.
        # If forecast horizon is a parameter, it needs to be removed.
        # It will be added back differently for each estimator.
        forecast_horizon_param = ONEBYONE_ESTIMATORS[
            self.estimator]["forecast_horizon_param"]
        if depending_on_horizon:
            if forecast_horizon_param in self.estimator_params:
                del self.estimator_params[forecast_horizon_param]

        # Initializes estimator instances.
        if depending_on_horizon and self.estimator_map is not False:
            if self.estimator_map is None or self.estimator_map is True:
                self.estimator_map_list = [1] * self.forecast_horizon
            elif isinstance(self.estimator_map, int):
                estimator_map = [
                    self.estimator_map
                    for _ in range(self.forecast_horizon // self.estimator_map)
                ]
                if self.forecast_horizon % self.estimator_map:
                    estimator_map.append(self.forecast_horizon %
                                         self.estimator_map)
                self.estimator_map_list = estimator_map
            else:
                if sum(self.estimator_map) != self.forecast_horizon:
                    raise ValueError(
                        "Sum of forecast one by one estimator map must equal to forecast horizon."
                    )
                self.estimator_map_list = deepcopy(self.estimator_map)
            self.estimators = []
            self.pred_indices = [0]
            current_horizon = 0
            for i in self.estimator_map_list:
                current_horizon += i
                self.estimator_params[forecast_horizon_param] = current_horizon
                self.estimators.append(
                    deepcopy(self.estimator_class(**self.estimator_params)))
                self.pred_indices.append(current_horizon)
        else:
            self.estimator_map_list = [self.forecast_horizon]
            if forecast_horizon_param is not None:
                self.estimator_params[
                    forecast_horizon_param] = self.forecast_horizon
            self.estimators = [
                deepcopy(self.estimator_class(**self.estimator_params))
            ]
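A standalone sketch of the integer `estimator_map` arithmetic above, with hypothetical values forecast_horizon=7 and estimator_map=3:

forecast_horizon = 7
estimator_map = 3  # each model covers up to 3 future periods

# Full blocks, plus a remainder block if the horizon is not divisible.
blocks = [estimator_map] * (forecast_horizon // estimator_map)
if forecast_horizon % estimator_map:
    blocks.append(forecast_horizon % estimator_map)
print(blocks)  # [3, 3, 1]

# Each estimator is trained with the cumulative horizon; `pred_indices`
# marks the prediction rows each estimator is responsible for.
pred_indices = [0]
current_horizon = 0
for block in blocks:
    current_horizon += block
    pred_indices.append(current_horizon)
print(pred_indices)  # [0, 3, 6, 7]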
Example #10
    def __get_template_class(
            self,
            config: Optional[ForecastConfig] = None
    ) -> Type[TemplateInterface]:
        """Extracts template class (e.g. `SimpleSilverkiteTemplate`) from the config.

        Parameters
        ----------
        config : :class:`~greykite.framework.templates.model_templates.ForecastConfig` or None
            Config object for template class to use.
            See :class:`~greykite.framework.templates.model_templates.ForecastConfig`.

        Returns
        -------
        template_class : Type[`~greykite.framework.templates.template_interface.TemplateInterface`]
            An implementation of `~greykite.framework.templates.template_interface.TemplateInterface`.
        """
        config = self.__get_config_with_default_model_template_and_components(
            config)

        if isinstance(config.model_template, list):
            # Parses `config.model_template` to extract the template class, with validation.
            # Handles a list of model templates.
            template_classes = [
                self.__get_template_class(config=ForecastConfig(
                    model_template=mt)) for mt in config.model_template
            ]
            for tc in template_classes:
                if tc != template_classes[0]:
                    raise ValueError(
                        "All model templates must use the same template class. "
                        f"Found {template_classes}")
            template_class = template_classes[0]
            if not template_class().allow_model_template_list:
                raise ValueError(
                    f"The template class {template_class} does not allow `model_template` to be a list. "
                    f"Pass a string instead.")
        else:
            # Handles other situations (string, data class).
            try:
                # Tries to look up in `self.model_template_enum`.
                template_class = self.model_template_enum[
                    config.model_template].value.template_class
            except (KeyError, TypeError):
                # Template is not found in the enum.
                # NB: The logic in this clause is written for the default `self.model_template_enum`,
                #   which contains only one template class that is a subclass of SimpleSilverkiteTemplate.
                #   If a custom `self.model_template_enum` is provided it may be useful to override this logic.
                valid_names = ", ".join(
                    self.model_template_enum.__dict__["_member_names_"])
                # Checks if template enum has a template class that supports generic naming
                #   i.e. a subclass of `SimpleSilverkiteTemplate`.
                subclass_simple_silverkite = [
                    mte for mte in self.model_template_enum if issubclass(
                        mte.value.template_class, SimpleSilverkiteTemplate)
                ]
                if len(subclass_simple_silverkite) > 0:
                    try:
                        log_message(
                            f"Model template {config.model_template} is not found in the template enum. "
                            f"Checking if model template is suitable for `SimpleSilverkiteTemplate`.",
                            LoggingLevelEnum.DEBUG)
                        SimpleSilverkiteTemplate().check_template_type(
                            config.model_template)
                        possible_template_classes = unique_elements_in_list([
                            mte.value.template_class
                            for mte in subclass_simple_silverkite
                        ])
                        if len(possible_template_classes) > 1:
                            log_message(
                                f"Multiple template classes could be used for the model "
                                f"template {config.model_template}: {possible_template_classes}",
                                LoggingLevelEnum.DEBUG)
                        # arbitrarily take a class that supports generic naming
                        template_class = subclass_simple_silverkite[
                            0].value.template_class
                        log_message(
                            f"Using template class {template_class} for the model "
                            f"template {config.model_template}",
                            LoggingLevelEnum.DEBUG)
                    except ValueError:
                        raise ValueError(
                            f"Model Template '{config.model_template}' is not recognized! Must be one of: {valid_names}"
                            " or satisfy the `SimpleSilverkiteTemplate` rules."
                        )
                else:
                    raise ValueError(
                        f"Model Template '{config.model_template}' is not recognized! Must be one of: {valid_names}."
                    )

        # Validates `model_components_param` compatibility with the template.
        if (isinstance(config.model_components_param, list)
                and not template_class().allow_model_components_param_list):
            raise ValueError(
                f"Model template {config.model_template} does not support a list of `ModelComponentsParam`."
            )

        return template_class
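A minimal sketch of the enum lookup-with-fallback pattern used above; the enum members here are hypothetical stand-ins:

from enum import Enum

class ModelTemplateEnum(Enum):  # hypothetical stand-in
    SILVERKITE = "silverkite_template"
    PROPHET = "prophet_template"

model_template = "SILVERKITE_DAILY_90"
try:
    template = ModelTemplateEnum[model_template].value
except (KeyError, TypeError):
    # Not in the enum; fall back to generic-name handling, as above.
    valid_names = ", ".join(ModelTemplateEnum.__members__)
    print(f"'{model_template}' not in enum. Valid names: {valid_names}")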
Example #11
    def pipeline_wrapper(
            # The arguments to this wrapper must be identical to forecast_pipeline() function.
            # We don't use **kwargs
            # because it's easier to check parameters directly.
            # input
            df: pd.DataFrame,
            time_col=TIME_COL,
            value_col=VALUE_COL,
            date_format=None,
            tz=None,
            freq=None,
            train_end_date=None,
            anomaly_info=None,
            # model
            pipeline=None,
            regressor_cols=None,
            lagged_regressor_cols=None,
            estimator=SimpleSilverkiteEstimator(),
            hyperparameter_grid=None,
            hyperparameter_budget=None,
            n_jobs=COMPUTATION_N_JOBS,
            verbose=1,
            # forecast
            forecast_horizon=None,
            coverage=0.95,
            test_horizon=None,
            periods_between_train_test=None,
            agg_periods=None,
            agg_func=None,
            # evaluation
            score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
            score_func_greater_is_better=False,
            cv_report_metrics=None,
            null_model_params=None,
            relative_error_tolerance=None,
            # CV
            cv_horizon=None,
            cv_min_train_periods=None,
            cv_expanding_window=False,
            cv_use_most_recent_splits=False,
            cv_periods_between_splits=None,
            cv_periods_between_train_test=0,
            cv_max_splits=3):
        if coverage is not None and (coverage < 0 or coverage > 1):
            raise ValueError(f"coverage must be between 0 and 1, found {coverage}")
        if relative_error_tolerance is not None and relative_error_tolerance < 0:
            raise ValueError(f"relative_error_tolerance must non-negative, found {relative_error_tolerance}")

        # default values for forecast horizon, test, and cross-validation parameters
        period = min_gap_in_seconds(df=df, time_col=time_col)
        num_observations = df.shape[0]
        default_time_params = get_default_time_parameters(
            period=period,
            num_observations=num_observations,
            forecast_horizon=forecast_horizon,
            test_horizon=test_horizon,
            periods_between_train_test=periods_between_train_test,
            cv_horizon=cv_horizon,
            cv_min_train_periods=cv_min_train_periods,
            cv_periods_between_train_test=cv_periods_between_train_test)
        forecast_horizon = default_time_params.get("forecast_horizon")
        test_horizon = default_time_params.get("test_horizon")
        periods_between_train_test = default_time_params.get("periods_between_train_test")
        cv_horizon = default_time_params.get("cv_horizon")
        cv_min_train_periods = default_time_params.get("cv_min_train_periods")
        cv_periods_between_train_test = default_time_params.get("cv_periods_between_train_test")

        # ensures the values are integers in the proper domain
        if hyperparameter_budget is not None:
            hyperparameter_budget = get_integer(
                hyperparameter_budget,
                "hyperparameter_budget",
                min_value=1)

        if (cv_horizon == 0 or cv_max_splits == 0) and test_horizon == 0:
            raise ValueError("Either CV or backtest must be enabled."
                             " Set cv_horizon and cv_max_splits to nonzero values to enable CV."
                             " Set test_horizon to nonzero value to enable backtest."
                             " It's important to check model"
                             " performance on historical data.")

        if test_horizon == 0:
            warnings.warn("No data selected for test (test_horizon=0). "
                          "It is important to check out of sample performance")

        # checks horizon against data size
        if num_observations < forecast_horizon * 2:
            warnings.warn(f"Not enough training data to forecast the full forecast_horizon."
                          " Exercise extra caution with"
                          f" forecasted values after {num_observations // 2} periods.")

        if test_horizon > num_observations:
            raise ValueError(f"test_horizon ({test_horizon}) is too large."
                             " Must be less than the number "
                             f"of input data points: {num_observations})")

        if test_horizon > forecast_horizon:
            warnings.warn(f"test_horizon should never be larger than forecast_horizon.")

        if test_horizon > num_observations // 3:
            warnings.warn(f"test_horizon should be <= than 1/3 of the data set size to allow enough data to train"
                          f" a backtest model. Consider reducing to {num_observations // 3}. If this is smaller"
                          f" than the forecast_horizon, you will need to make a trade-off between setting"
                          f" test_horizon=forecast_horizon and having enough data left over to properly"
                          f" train a realistic backtest model.")

        log_message(f"forecast_horizon: {forecast_horizon}", LoggingLevelEnum.INFO)
        log_message(f"test_horizon: {test_horizon}", LoggingLevelEnum.INFO)
        log_message(f"cv_horizon: {cv_horizon}", LoggingLevelEnum.INFO)

        return pipeline_function(
            df,
            time_col=time_col,
            value_col=value_col,
            date_format=date_format,
            tz=tz,
            freq=freq,
            train_end_date=train_end_date,
            anomaly_info=anomaly_info,
            pipeline=pipeline,
            regressor_cols=regressor_cols,
            lagged_regressor_cols=lagged_regressor_cols,
            estimator=estimator,
            hyperparameter_grid=hyperparameter_grid,
            hyperparameter_budget=hyperparameter_budget,
            n_jobs=n_jobs,
            verbose=verbose,
            forecast_horizon=forecast_horizon,
            coverage=coverage,
            test_horizon=test_horizon,
            periods_between_train_test=periods_between_train_test,
            agg_periods=agg_periods,
            agg_func=agg_func,
            score_func=score_func,
            score_func_greater_is_better=score_func_greater_is_better,
            cv_report_metrics=cv_report_metrics,
            null_model_params=null_model_params,
            relative_error_tolerance=relative_error_tolerance,
            cv_horizon=cv_horizon,
            cv_min_train_periods=cv_min_train_periods,
            cv_expanding_window=cv_expanding_window,
            cv_use_most_recent_splits=cv_use_most_recent_splits,
            cv_periods_between_splits=cv_periods_between_splits,
            cv_periods_between_train_test=cv_periods_between_train_test,
            cv_max_splits=cv_max_splits
        )
Example #12
def forecast_pipeline(
        # input
        df: pd.DataFrame,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        date_format=None,
        tz=None,
        freq=None,
        train_end_date=None,
        anomaly_info=None,
        # model
        pipeline=None,
        regressor_cols=None,
        lagged_regressor_cols=None,
        estimator=SimpleSilverkiteEstimator(),
        hyperparameter_grid=None,
        hyperparameter_budget=None,
        n_jobs=COMPUTATION_N_JOBS,
        verbose=1,
        # forecast
        forecast_horizon=None,
        coverage=0.95,
        test_horizon=None,
        periods_between_train_test=None,
        agg_periods=None,
        agg_func=None,
        # evaluation
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
        score_func_greater_is_better=False,
        cv_report_metrics=CV_REPORT_METRICS_ALL,
        null_model_params=None,
        relative_error_tolerance=None,
        # CV
        cv_horizon=None,
        cv_min_train_periods=None,
        cv_expanding_window=False,
        cv_use_most_recent_splits=False,
        cv_periods_between_splits=None,
        cv_periods_between_train_test=None,
        cv_max_splits=3):
    """Computation pipeline for end-to-end forecasting.

    Trains a forecast model end-to-end:

        1. checks input data
        2. runs cross-validation to select optimal hyperparameters (e.g. the best model)
        3. evaluates best model on test set
        4. provides forecast of best model (re-trained on all data) into the future

    Returns forecasts with methods to plot and see diagnostics.
    Also returns the fitted pipeline and CV results.

    Provides a high degree of customization over training and evaluation parameters:

        1. model
        2. cross validation
        3. evaluation
        4. forecast horizon

    See test cases for examples.

    Parameters
    ----------
    df : `pandas.DataFrame`
        Timeseries data to forecast.
        Contains columns [``time_col``, ``value_col``], and optional regressor columns.
        Regressor columns should include future values for prediction.

    time_col : `str`, default TIME_COL in constants.py
        Name of the timestamp column in ``df``.

    value_col : `str`, default VALUE_COL in constants.py
        Name of the value column in ``df`` (the values to forecast).

    date_format : `str` or None, default None
        strftime format to parse time column, e.g. ``%m/%d/%Y``.
        Note that ``%f`` will parse all the way up to nanoseconds.
        If None (recommended), inferred by `pandas.to_datetime`.

    tz : `str` or None, default None
        Passed to `pandas.tz_localize` to localize the timestamps.

    freq : `str` or None, default None
        Frequency of input data. Used to generate future dates for prediction.
        Frequency strings can have multiples, e.g. '5H'.
        See https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
        for a list of frequency aliases.
        If None, inferred by `pandas.infer_freq`.
        Provide this parameter if ``df`` has missing timepoints.

    train_end_date : `datetime.datetime`, optional, default None
        Last date to use for fitting the model. Forecasts are generated after this date.
        If None, it is set to the last date with a non-null value in
        ``value_col`` of ``df``.

    anomaly_info : `dict` or `list` [`dict`] or None, default None
        Anomaly adjustment info. Anomalies in ``df``
        are corrected before any forecasting is done.

        If None, no adjustments are made.

        A dictionary containing the parameters to
        `~greykite.common.features.adjust_anomalous_data.adjust_anomalous_data`.
        See that function for details.
        The possible keys are:

            ``"value_col"`` : `str`
                The name of the column in ``df`` to adjust. You may adjust the value
                to forecast as well as any numeric regressors.
            ``"anomaly_df"`` : `pandas.DataFrame`
                Adjustments to correct the anomalies.
            ``"start_date_col"``: `str`, default START_DATE_COL
                Start date column in ``anomaly_df``.
            ``"end_date_col"``: `str`, default END_DATE_COL
                End date column in ``anomaly_df``.
            ``"adjustment_delta_col"``: `str` or None, default None
                Impact column in ``anomaly_df``.
            ``"filter_by_dict"``: `dict` or None, default None
                Used to filter ``anomaly_df`` to the relevant anomalies for
                the ``value_col`` in this dictionary.
                Key specifies the column name, value specifies the filter value.
            ``"filter_by_value_col""``: `str` or None, default None
                Adds ``{filter_by_value_col: value_col}`` to ``filter_by_dict``
                if not None, for the ``value_col`` in this dictionary.
            ``"adjustment_method"`` : `str` ("add" or "subtract"), default "add"
                How to make the adjustment, if ``adjustment_delta_col`` is provided.

        Accepts a list of such dictionaries to adjust multiple columns in ``df``.

    pipeline : `sklearn.pipeline.Pipeline` or None, default None
        Pipeline to fit. The final named step must be called "estimator".
        If None, will use the default Pipeline from
        `~greykite.framework.pipeline.utils.get_basic_pipeline`.

    regressor_cols : `list` [`str`] or None, default None
        A list of regressor columns used in the training and prediction DataFrames.
        It should contain only the regressors that are being used in the grid search.
        If None, no regressor columns are used.
        Regressor columns that are unavailable in ``df`` are dropped.

    lagged_regressor_cols : `list` [`str`] or None, default None
        A list of additional columns needed for lagged regressors in the training and prediction DataFrames.
        This list can have overlap with ``regressor_cols``.
        If None, no additional columns are added to the DataFrame.
        Lagged regressor columns that are unavailable in ``df`` are dropped.

    estimator : instance of an estimator that implements `greykite.algo.models.base_forecast_estimator.BaseForecastEstimator`
        Estimator to use as the final step in the pipeline.
        Ignored if ``pipeline`` is provided.

    forecast_horizon : `int` or None, default None
        Number of periods to forecast into the future. Must be > 0.
        If None, default is determined from the input data frequency.

    coverage : `float` or None, default 0.95
        Intended coverage of the prediction bands (0.0 to 1.0).
        If None, the upper/lower predictions are not returned.
        Ignored if ``pipeline`` is provided. Uses the coverage of the ``pipeline`` estimator instead.

    test_horizon : `int` or None, default None
        Number of periods held back from the end of ``df`` for test.
        The rest is used for cross validation.
        If None, default is forecast_horizon. Set to 0 to skip backtest.

    periods_between_train_test : `int` or None, default None
        Number of periods for the gap between train and test data.
        If None, default is 0.

    agg_periods : `int` or None, default None
        Number of periods to aggregate before evaluation.

        Model is fit and forecasted on the dataset's original frequency.

        Before evaluation, the actual and forecasted values are aggregated,
        using rolling windows of size ``agg_periods`` and the function
        ``agg_func``. (e.g. if the dataset is hourly, use ``agg_periods=24, agg_func=np.sum``,
        to evaluate performance on the daily totals).

        If None, does not aggregate before evaluation.

        Currently, this is only used when calculating CV metrics and
        the R2_null_model_score metric in backtest/forecast. No pre-aggregation
        is applied for the other backtest/forecast evaluation metrics.

    agg_func : callable or None, default None
        Takes an array and returns a number, e.g. np.max, np.sum.

        Defines how to aggregate rolling windows of actual and predicted values
        before evaluation.

        Ignored if ``agg_periods`` is None.

        Currently, this is only used when calculating CV metrics and
        the R2_null_model_score metric in backtest/forecast. No pre-aggregation
        is applied for the other backtest/forecast evaluation metrics.

    score_func : `str` or callable, default ``EvaluationMetricEnum.MeanAbsolutePercentError.name``
        Score function used to select optimal model in CV.
        If a callable, takes arrays ``y_true``, ``y_pred`` and returns a float.
        If a string, must be either a
        `~greykite.common.evaluation.EvaluationMetricEnum` member name
        or `~greykite.common.constants.FRACTION_OUTSIDE_TOLERANCE`.

    score_func_greater_is_better : `bool`, default False
        True if ``score_func`` is a score function, meaning higher is better,
        and False if it is a loss function, meaning lower is better.
        Must be provided if ``score_func`` is a callable (custom function).
        Ignored if ``score_func`` is a string, because the direction is known.

    cv_report_metrics : `str`, or `list` [`str`], or None, default `~greykite.framework.constants.CV_REPORT_METRICS_ALL`
        Additional metrics to compute during CV, besides the one specified by ``score_func``.

            - If the string constant `greykite.framework.constants.CV_REPORT_METRICS_ALL`,
              computes all metrics in ``EvaluationMetricEnum``. Also computes
              ``FRACTION_OUTSIDE_TOLERANCE`` if ``relative_error_tolerance`` is not None.
              The results are reported by the short name (``.get_metric_name()``) for ``EvaluationMetricEnum``
              members and ``FRACTION_OUTSIDE_TOLERANCE_NAME`` for ``FRACTION_OUTSIDE_TOLERANCE``.
              These names appear in the keys of ``forecast_result.grid_search.cv_results_``
              returned by this function.
            - If a list of strings, each of the listed metrics is computed. Valid strings are
              `~greykite.common.evaluation.EvaluationMetricEnum` member names
              and `~greykite.common.constants.FRACTION_OUTSIDE_TOLERANCE`.

              For example::

                ["MeanSquaredError", "MeanAbsoluteError", "MeanAbsolutePercentError", "MedianAbsolutePercentError", "FractionOutsideTolerance2"]

            - If None, no additional metrics are computed.

    null_model_params : `dict` or None, default None
        Defines baseline model to compute ``R2_null_model_score`` evaluation metric.
        ``R2_null_model_score`` is the improvement in the loss function relative
        to a null model. It can be used to evaluate model quality with respect to
        a simple baseline. For details, see
        `~greykite.common.evaluation.r2_null_model_score`.

        The null model is a `~sklearn.dummy.DummyRegressor`,
        which returns constant predictions.

        Valid keys are "strategy", "constant", "quantile".
        See `~sklearn.dummy.DummyRegressor`. For example::

            null_model_params = {
                "strategy": "mean",
            }
            null_model_params = {
                "strategy": "median",
            }
            null_model_params = {
                "strategy": "quantile",
                "quantile": 0.8,
            }
            null_model_params = {
                "strategy": "constant",
                "constant": 2.0,
            }

        If None, ``R2_null_model_score`` is not calculated.

        Note: CV model selection always optimizes ``score_func``, not
        the ``R2_null_model_score``.

    relative_error_tolerance : `float` or None, default None
        Threshold to compute the ``Outside Tolerance`` metric,
        defined as the fraction of forecasted values whose relative
        error is strictly greater than ``relative_error_tolerance``.
        For example, 0.05 allows for 5% relative error.
        If `None`, the metric is not computed.

    hyperparameter_grid : `dict`, `list` [`dict`] or None, default None
        Sets properties of the steps in the pipeline,
        and specifies combinations to search over.
        Should be valid input to `sklearn.model_selection.GridSearchCV` (param_grid)
        or `sklearn.model_selection.RandomizedSearchCV` (param_distributions).

        Prefix transform/estimator attributes by the name of the step in the pipeline.
        See details at: https://scikit-learn.org/stable/modules/compose.html#nested-parameters

        If None, uses the default pipeline parameters.

    hyperparameter_budget : `int` or None, default None
        Max number of hyperparameter sets to try within the ``hyperparameter_grid`` search space

        Runs a full grid search if ``hyperparameter_budget`` is sufficient to exhaust full
        ``hyperparameter_grid``, otherwise samples uniformly at random from the space.

        If None, uses defaults:

            * full grid search if all values are constant
            * 10 if any value is a distribution to sample from

    n_jobs : `int` or None, default `~greykite.framework.constants.COMPUTATION_N_JOBS`
        Number of jobs to run in parallel
        (the maximum number of concurrently running workers).
        ``-1`` uses all CPUs. ``-2`` uses all CPUs but one.
        ``None`` is treated as 1 unless in a `joblib.Parallel` backend context
        that specifies otherwise.

    verbose : `int`, default 1
        Verbosity level during CV.

        * if > 0, prints number of fits
        * if > 1, prints fit parameters, total score + fit time
        * if > 2, prints train/test scores

    cv_horizon : `int` or None, default None
        Number of periods in each CV test set.
        If None, default is ``forecast_horizon``.
        Set either ``cv_horizon`` or ``cv_max_splits`` to 0 to skip CV.

    cv_min_train_periods : `int` or None, default None
        Minimum number of periods for training each CV fold.
        If ``cv_expanding_window`` is False, every training period is this size.
        If None, default is 2 * ``cv_horizon``.

    cv_expanding_window : `bool`, default False
        If True, training window for each CV split is fixed to the first available date.
        Otherwise, train start date is sliding, determined by ``cv_min_train_periods``.

    cv_use_most_recent_splits : `bool`, default False
        If True, splits from the end of the dataset are used.
        Else a sampling strategy is applied. Check
        `~greykite.sklearn.cross_validation.RollingTimeSeriesSplit._sample_splits`
        for details.

    cv_periods_between_splits : `int` or None, default None
        Number of periods to slide the test window between CV splits.
        If None, default is ``cv_horizon``.

    cv_periods_between_train_test : `int` or None, default None
        Number of periods for the gap between train and test in a CV split.
        If None, default is ``periods_between_train_test``.

    cv_max_splits : `int` or None, default 3
        Maximum number of CV splits.
        Given the above configuration, samples up to ``cv_max_splits`` train/test splits,
        preferring splits toward the end of available data. If None, uses all splits.
        Set either ``cv_horizon`` or ``cv_max_splits`` to 0 to skip CV.

    Returns
    -------
    forecast_result : :class:`~greykite.framework.pipeline.pipeline.ForecastResult`
        Forecast result. See :class:`~greykite.framework.pipeline.pipeline.ForecastResult`
        for details.

            * If ``cv_horizon=0``, ``forecast_result.grid_search.best_estimator_``
              and ``forecast_result.grid_search.best_params_`` attributes are defined
              according to the provided single set of parameters. There must be a single
              set of parameters to skip cross-validation.
            * If ``test_horizon=0``, ``forecast_result.backtest`` is None.
    """
    if hyperparameter_grid is None or hyperparameter_grid == []:
        hyperparameter_grid = {}
    # When hyperparameter_grid is a singleton list, unlist it
    if isinstance(hyperparameter_grid, list) and len(hyperparameter_grid) == 1:
        hyperparameter_grid = hyperparameter_grid[0]

    # Loads full dataset
    ts = UnivariateTimeSeries()
    ts.load_data(
        df=df,
        time_col=time_col,
        value_col=value_col,
        freq=freq,
        date_format=date_format,
        tz=tz,
        train_end_date=train_end_date,
        regressor_cols=regressor_cols,
        lagged_regressor_cols=lagged_regressor_cols,
        anomaly_info=anomaly_info)

    # Splits data into training and test sets. ts.df uses standardized column names
    if test_horizon == 0:
        train_df = ts.fit_df
        train_y = ts.fit_y
        test_df = pd.DataFrame(columns=list(df.columns))
    else:
        # Make sure to refit best_pipeline appropriately
        train_df, test_df, train_y, test_y = train_test_split(
            ts.fit_df,
            ts.fit_y,
            train_size=ts.fit_df.shape[0] - test_horizon - periods_between_train_test,
            test_size=test_horizon + periods_between_train_test,
            shuffle=False)  # this is important since this is timeseries forecasting!
    log_message(f"Train size: {train_df.shape[0]}. Test size: {test_df.shape[0]}", LoggingLevelEnum.INFO)

    # Defines default training pipeline
    if pipeline is None:
        pipeline = get_basic_pipeline(
            estimator=estimator,
            score_func=score_func,
            score_func_greater_is_better=score_func_greater_is_better,
            agg_periods=agg_periods,
            agg_func=agg_func,
            relative_error_tolerance=relative_error_tolerance,
            coverage=coverage,
            null_model_params=null_model_params,
            regressor_cols=ts.regressor_cols,
            lagged_regressor_cols=ts.lagged_regressor_cols)

    # Searches for the best parameters, and refits model with selected parameters on the entire training set
    if cv_horizon == 0 or cv_max_splits == 0:
        # No cross-validation. Only one set of hyperparameters is allowed.
        try:
            if len(ParameterGrid(hyperparameter_grid)) > 1:
                raise ValueError(
                    "CV is required to identify the best model because there are multiple options "
                    "in `hyperparameter_grid`. Either provide a single option or set `cv_horizon` and `cv_max_splits` "
                    "to nonzero values.")
        except TypeError:  # Parameter value is not iterable
            raise ValueError(
                "CV is required to identify the best model because `hyperparameter_grid` contains "
                "a distribution. Either remove the distribution or set `cv_horizon` and `cv_max_splits` "
                "to nonzero values.")

        # Fits model to entire train set. Params must be set manually since it's not done by grid search
        params = {k: v[0] for k, v in hyperparameter_grid.items()}  # unpack lists, `v` is a singleton list with the parameter value
        best_estimator = pipeline.set_params(**params).fit(train_df, train_y)

        # Wraps this model in a dummy RandomizedSearchCV object to return the backtest model
        grid_search = get_hyperparameter_searcher(
            hyperparameter_grid=hyperparameter_grid,
            model=pipeline,
            cv=None,  # no cross-validation
            hyperparameter_budget=hyperparameter_budget,
            n_jobs=n_jobs,
            verbose=verbose,
            score_func=score_func,
            score_func_greater_is_better=score_func_greater_is_better,
            cv_report_metrics=cv_report_metrics,
            agg_periods=agg_periods,
            agg_func=agg_func,
            relative_error_tolerance=relative_error_tolerance)
        # Sets relevant attributes. Others are undefined (cv_results_, best_score_, best_index_, scorer_, refit_time_)
        grid_search.best_estimator_ = best_estimator
        grid_search.best_params_ = params
        grid_search.n_splits_ = 0
    else:
        # Defines cross-validation splitter
        cv = RollingTimeSeriesSplit(
            forecast_horizon=cv_horizon,
            min_train_periods=cv_min_train_periods,
            expanding_window=cv_expanding_window,
            use_most_recent_splits=cv_use_most_recent_splits,
            periods_between_splits=cv_periods_between_splits,
            periods_between_train_test=cv_periods_between_train_test,
            max_splits=cv_max_splits)

        # Defines grid search approach for CV
        grid_search = get_hyperparameter_searcher(
            hyperparameter_grid=hyperparameter_grid,
            model=pipeline,
            cv=cv,
            hyperparameter_budget=hyperparameter_budget,
            n_jobs=n_jobs,
            verbose=verbose,
            score_func=score_func,
            score_func_greater_is_better=score_func_greater_is_better,
            cv_report_metrics=cv_report_metrics,
            agg_periods=agg_periods,
            agg_func=agg_func,
            relative_error_tolerance=relative_error_tolerance)
        grid_search.fit(train_df, train_y)
        best_estimator = grid_search.best_estimator_

    # Evaluates historical performance, fits model to all data (train+test)
    if test_horizon > 0:
        backtest_train_end_date = train_df[TIME_COL].max()
        # Uses pd.date_range because pd.Timedelta does not work for complicated frequencies e.g. "W-MON"
        backtest_test_start_date = pd.date_range(
            start=backtest_train_end_date,
            periods=periods_between_train_test + 2,  # Adds 2 as start parameter is inclusive
            freq=ts.freq)[-1]
        backtest = get_forecast(
            df=ts.fit_df,  # Backtest needs to happen on fit_df, not on the entire df
            trained_model=best_estimator,
            train_end_date=backtest_train_end_date,
            test_start_date=backtest_test_start_date,
            forecast_horizon=test_horizon,
            xlabel=time_col,
            ylabel=value_col,
            relative_error_tolerance=relative_error_tolerance)
        best_pipeline = clone(best_estimator)  # Copies optimal parameters
        best_pipeline.fit(ts.fit_df, ts.y)  # Refits this model on entire training dataset
    else:
        backtest = None  # Backtest training metrics are the same as forecast training metrics
        best_pipeline = best_estimator  # best_model is already fit to all data

    # Makes future predictions
    periods = forecast_horizon + periods_between_train_test
    future_df = ts.make_future_dataframe(
        periods=periods,
        include_history=True)

    forecast_train_end_date = ts.train_end_date
    # Uses pd.date_range because pd.Timedelta does not work for complicated frequencies e.g. "W-MON"
    forecast_test_start_date = pd.date_range(
        start=forecast_train_end_date,
        periods=periods_between_train_test + 2,  # Adds 2 as start parameter is inclusive
        freq=ts.freq)[-1]
    forecast = get_forecast(
        df=future_df,
        trained_model=best_pipeline,
        train_end_date=forecast_train_end_date,
        test_start_date=forecast_test_start_date,
        forecast_horizon=forecast_horizon,
        xlabel=time_col,
        ylabel=value_col,
        relative_error_tolerance=relative_error_tolerance)

    result = ForecastResult(
        timeseries=ts,
        grid_search=grid_search,
        model=best_pipeline,
        backtest=backtest,
        forecast=forecast
    )
    return result
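A minimal end-to-end call, assuming Greykite is installed and `forecast_pipeline` is importable; the data, column names, and horizon are illustrative, and `result.forecast.df` is assumed to hold the joined actual/forecast frame:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "ts": pd.date_range("2020-01-01", periods=365, freq="D"),
    "y": np.arange(365) + np.random.default_rng(0).normal(scale=5.0, size=365),
})
result = forecast_pipeline(
    df=df,
    time_col="ts",
    value_col="y",
    forecast_horizon=30,
    coverage=0.95,
    cv_max_splits=2)
print(result.forecast.df.tail())  # forecasted values with prediction bands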
Example #13
def forecast_pipeline_rolling_evaluation(pipeline_params: Dict,
                                         tscv: RollingTimeSeriesSplit):
    """Runs ``forecast_pipeline`` on a rolling window basis.

    Parameters
    ----------
    pipeline_params : `Dict`
        A dictionary containing the input to the
        :func:`~greykite.framework.pipeline.pipeline.forecast_pipeline`.
    tscv : `~greykite.sklearn.cross_validation.RollingTimeSeriesSplit`
        Cross-validation object that determines the rolling window evaluation.
        See :class:`~greykite.sklearn.cross_validation.RollingTimeSeriesSplit` for details.

    Returns
    -------
    rolling_evaluation : `dict`
        Stores benchmarking results for each split, e.g. ``split_0`` contains the result
        for the first split, ``split_1`` for the second, and so on.
        The number of splits is determined by the input parameters.
        Every split is a dictionary with keys "runtime_sec" and "pipeline_result".
    """
    if pipeline_params["forecast_horizon"] != tscv.forecast_horizon:
        raise ValueError(
            "Forecast horizon in 'pipeline_params' does not match that of the 'tscv'."
        )

    if pipeline_params[
            "periods_between_train_test"] != tscv.periods_between_train_test:
        raise ValueError(
            "'periods_between_train_test' in 'pipeline_params' does not match that of the 'tscv'."
        )

    df = pipeline_params["df"]
    time_col = pipeline_params.get("time_col", TIME_COL)
    date_format = pipeline_params.get("date_format")
    # Disables backtest. For rolling evaluation we know the actual values in the forecast
    # period, so out-of-sample performance can be calculated using pipeline_result.forecast.
    pipeline_params["test_horizon"] = 0

    rolling_evaluation = {}
    with tqdm(list(tscv.split(X=df)), ncols=800, leave=True) as progress_bar:
        for (split_num, (train, test)) in enumerate(progress_bar):
            # The description is displayed to the left of the progress bar
            progress_bar.set_description(f"Split '{split_num}' ")
            train_end_date = pd.to_datetime(df.iloc[train[-1]][time_col],
                                            format=date_format,
                                            infer_datetime_format=True)
            pipeline_params["train_end_date"] = train_end_date

            start_time = timeit.default_timer()
            pipeline_result = forecast_pipeline(**pipeline_params)
            runtime = timeit.default_timer() - start_time

            pipeline_output = dict(runtime_sec=round(runtime, 3),
                                   pipeline_result=pipeline_result)
            rolling_evaluation[f"split_{split_num}"] = pipeline_output

            log_message(f"Completed evaluation for split {split_num}.",
                        LoggingLevelEnum.DEBUG)

    return rolling_evaluation
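
A hedged usage sketch (the ``RollingTimeSeriesSplit`` constructor arguments are assumed from the attributes referenced in these examples, not taken from the source):

# A sketch, assuming `df` is the same DataFrame passed to `forecast_pipeline`.
tscv = RollingTimeSeriesSplit(
    forecast_horizon=30,
    periods_between_train_test=0,
    max_splits=3)
pipeline_params = dict(
    df=df,
    forecast_horizon=30,
    periods_between_train_test=0)
rolling_evaluation = forecast_pipeline_rolling_evaluation(
    pipeline_params=pipeline_params,
    tscv=tscv)
# Collects per-split runtimes for comparison.
runtimes = {split: res["runtime_sec"] for split, res in rolling_evaluation.items()}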
Example #14
    def predict(self, X, y=None):
        """Creates forecast for dates specified in X

        Forecast one-by-one predicts the specified forecast horizon with the
        specified estimator-horizon mapping. If the number of rows in ``X``
        after the training end date differs from the forecast horizon, only
        the last model (trained with the longest forecast horizon) is used.

        Parameters
        ----------
        X : `pandas.DataFrame`
            Input timeseries with timestamp column and any additional regressors.
            Timestamps are the dates for prediction.
            Value column, if provided in X, is ignored.
        y : ignored

        Returns
        -------
        predictions : `pandas.DataFrame`
            Forecasted values for the dates in X. Columns:

                - TIME_COL dates
                - PREDICTED_COL predictions
                - PREDICTED_LOWER_COL lower bound of predictions, optional
                - PREDICTED_UPPER_COL upper bound of predictions, optional
                - [other columns], optional

            ``PREDICTED_LOWER_COL`` and ``PREDICTED_UPPER_COL`` are present
            if ``self.coverage`` is not None.
        """
        # Returns the cached result if applicable
        cached_predictions = super().predict(X=X)
        if cached_predictions is not None:
            return cached_predictions

        # Only one model.
        if len(self.estimators) == 1:
            return self.estimators[0].predict(X=X)

        # Forecast one-by-one is forecast-horizon-sensitive.
        # Checks future forecast horizon length to decide
        # how to make forecasts.
        is_future = pd.to_datetime(X[self.time_col_]) > self.train_end_date
        x_future = X.loc[is_future]

        # If the future prediction length is different from the forecast horizon,
        # use the last model only.
        if len(x_future) != self.forecast_horizon:
            log_message(
                message=f"The future x length is {len(x_future)}, "
                f"which doesn't match the model forecast horizon {self.forecast_horizon}, "
                f"using only the model with the longest forecast horizon for prediction.",
                level=LoggingLevelEnum.WARNING)
            return self.estimators[-1].predict(X)

        # From now on assume X is exactly the forecast horizon.
        # Makes predictions according to the estimator map.
        predictions = [
            estimator.predict(
                x_future.iloc[self.pred_indices[i]:self.pred_indices[i + 1]])
            for i, estimator in enumerate(self.estimators)
        ]
        # The past df is always forecasted with the last estimator.
        if not is_future.all():
            past_prediction = self.estimators[-1].predict(X.loc[~is_future])
            predictions = [past_prediction] + predictions

        return pd.concat(predictions)
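
To make the slicing above concrete, here is a plain-Python sketch of how ``pred_indices`` partitions the horizon among estimators (the boundary values are hypothetical; the real ones come from the estimator-horizon mapping built at fit time):

# Suppose forecast_horizon = 7 and three estimators were fit for
# horizons 1, 3, and 7. A plausible pred_indices would be:
pred_indices = [0, 1, 3, 7]
horizon_rows = list(range(7))  # stand-in for the 7 future rows of X
for i in range(len(pred_indices) - 1):
    chunk = horizon_rows[pred_indices[i]:pred_indices[i + 1]]
    print(f"estimator {i} predicts rows {chunk}")
# estimator 0 predicts rows [0]
# estimator 1 predicts rows [1, 2]
# estimator 2 predicts rows [3, 4, 5, 6]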
Example #15
def test_log_message():
    with LogCapture(LOGGER_NAME) as log_capture:
        log_message("Test log message.", LoggingLevelEnum.CRITICAL)
        log_capture.check((LOGGER_NAME, "CRITICAL", "Test log message."))
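
This test pins down the behavior assumed of ``log_message`` throughout these examples: it forwards a message to the named logger at the given level. A minimal sketch of such a wrapper (not the library's actual implementation):

import logging
from enum import Enum

LOGGER_NAME = "example_logger"  # assumed; the library defines its own name

class LoggingLevelEnum(Enum):
    """Maps level names to stdlib logging levels (assumed)."""
    DEBUG = logging.DEBUG
    INFO = logging.INFO
    WARNING = logging.WARNING
    CRITICAL = logging.CRITICAL

def log_message(message, level):
    """Logs ``message`` to the LOGGER_NAME logger at ``level``."""
    logging.getLogger(LOGGER_NAME).log(level.value, message)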
Example #16
    def summary(self):
        """Logs a warning that the benchmark summary is not implemented."""
        log_message("Benchmark summary is not implemented yet.",
                    LoggingLevelEnum.WARNING)
Example #17
def flexible_grouping_evaluation(df,
                                 map_func_dict=None,
                                 groupby_col=None,
                                 agg_kwargs=None,
                                 extend_col_names=True,
                                 unpack_list=True,
                                 list_names_dict=None):
    """Flexible aggregation. Generates additional columns for evaluation via
    ``map_func_dict``, groups by ``groupby_col``, then aggregates according
    to ``agg_kwargs``.

    This function calls `pandas.DataFrame.apply` and
    `pandas.core.groupby.DataFrameGroupBy.agg` internally.

    Parameters
    ----------
    df : `pandas.DataFrame`
        DataFrame to transform / aggregate
    map_func_dict : `dict` [`str`, `callable`] or None, default None
        Row-wise transformation functions to create new columns.
        If None, no new columns are added.

        key: new column name
        value: row-wise function to apply to ``df`` to generate the column value.
               Signature (row: `pandas.Series`) -> transformed value: `float`.

        For example::

            map_func_dict = {
                "residual": lambda row: row["predicted"] - row["actual"],
                "squared_error": lambda row: (row["predicted"] - row["actual"])**2
            }

    groupby_col : `str` or None, default None
        Which column to group by.
        Can be in ``df`` or generated by ``map_func_dict``.
        If None, no grouping or aggregation is done.
    agg_kwargs : `dict` or None, default None
        Passed as keyword args to `pandas.core.groupby.DataFrameGroupBy.aggregate` after creating
        new columns and grouping by ``groupby_col``. Must be provided if ``groupby_col is not None``.
        To fully customize output column names, pass a dictionary as shown below.

        For example::

            # Example 1, named aggregation to explicitly name output columns.
            # Assume ``df`` contains ``abs_percent_err``, ``abs_err`` columns.
            # Output columns are "MedAPE", "MAPE", "MAE", etc. in a single level index.
            from functools import partial
            agg_kwargs = {
                # output column name: (column to aggregate, aggregation function)
                "MedAPE": pd.NamedAgg(column="abs_percent_err", aggfunc=np.nanmedian),
                "MAPE": pd.NamedAgg(column="abs_percent_err", aggfunc=np.nanmean),
                "MAE": pd.NamedAgg(column="abs_err", aggfunc=np.nanmean),
                "q95_abs_err": pd.NamedAgg(column="abs_err", aggfunc=partial(np.nanquantile, q=0.95)),
                "q05_abs_err": pd.NamedAgg(column="abs_err", aggfunc=partial(np.nanquantile, q=0.05)),
            }

            # Example 2, multi-level aggregation using `func` parameter
            # to `pandas.core.groupby.DataFrameGroupBy.aggregate`.
            # Assume ``df`` contains ``y1``, ``y2`` columns.
            agg_kwargs = {
                "func": {
                    "y1": [np.nanmedian, np.nanmean],
                    "y2": [np.nanmedian, np.nanmax],
                }
            }
            # `extend_col_names` controls the output column names
            extend_col_names = True  # output columns are "y1_nanmean", "y1_nanmedian", "y2_nanmean", "y2_nanmax"
            extend_col_names = False  # output columns are "nanmean", "nanmedian", "nanmean", "nanmax"

    extend_col_names : `bool` or None, default True
        How to flatten index after aggregation.
        In some cases, the column index after aggregation is a multi-index.
        This parameter controls how to flatten an index with 2 levels to 1 level.

            - If None, the index is not flattened.
            - If True, column name is a composite: ``{index0}_{index1}``
              Use this option if index1 is not unique.
            - If False, column name is simply ``{index1}``

        Ignored if the column index after aggregation has only one level (e.g.
        if named aggregation is used in ``agg_kwargs``).

    unpack_list : `bool`, default True
        Whether to unpack (flatten) columns that contain list/tuple after aggregation,
        to create one column per element of the list/tuple.
        If True, ``list_names_dict`` can be used to rename the unpacked columns.

    list_names_dict : `dict` [`str`, `list` [`str`]] or None, default None
        If ``unpack_list`` is True, this dictionary can optionally be
        used to rename the unpacked columns.

            - Key = column name after aggregation, before unpacking.
              E.g. ``{index0}_{index1}`` or ``{index1}`` depending on ``extend_col_names``.
            - Value = list of names to use for the unpacked columns. Length must match
              the length of the lists contained in the column.

        If a particular list/tuple column is not found in this dictionary, appends
        0, 1, 2, ..., n-1 to the original column name, where n = list length.

        For example, if the column contains a tuple of length 4 corresponding to
        quantiles 0.1, 0.25, 0.75, 0.9, then the following would be appropriate::

            aggfunc = lambda grp: partial(np.nanquantile, q=[0.1, 0.25, 0.75, 0.9])(grp).tolist()
            agg_kwargs = {
                "value_Q": pd.NamedAgg(column="value", aggfunc=aggfunc)
            }
            list_names_dict = {
                # the key is the column name after aggregation, before unpacking
                "value_Q" : ["Q0.10", "Q0.25", "Q0.75", "Q0.90"]
            }
            # Output columns are "Q0.10", "Q0.25", "Q0.75", "Q0.90"

            # In this example, if list_names_dict=None, the default output column names
            # would be: "value_Q0", "value_Q1", "value_Q2", "value_Q3"

    Returns
    -------
    df_transformed : `pandas.DataFrame`
        df after transformation and optional aggregation.

        If ``groupby_col`` is None, returns ``df`` with additional columns as the keys in ``map_func_dict``.
        Otherwise, ``df`` is grouped by ``groupby_col`` and this becomes the index. Columns
        are determined by ``agg_kwargs`` and ``extend_col_names``.
    """
    if groupby_col and not agg_kwargs:
        raise ValueError(
            "Must specify `agg_kwargs` if grouping is requested via `groupby_col`."
        )
    if agg_kwargs and not groupby_col:
        log_message(
            "`agg_kwargs` is ignored because `groupby_col` is None. "
            "Specify `groupby_col` to allow aggregation.",
            LoggingLevelEnum.WARNING)

    df = df.copy()
    if map_func_dict is not None:
        for col_name, func in map_func_dict.items():
            df[col_name] = df.apply(func, axis=1)

    if groupby_col is not None:
        groups = df.groupby(groupby_col)
        with warnings.catch_warnings():
            # Ignores pandas FutureWarning. Use NamedAgg in pandas 0.25+.
            warnings.filterwarnings(
                "ignore",
                message="using a dict with renaming is deprecated",
                category=FutureWarning)
            df_transformed = groups.agg(**agg_kwargs)
        if extend_col_names is not None and df_transformed.columns.nlevels > 1:
            # Flattens multi-level column index
            if extend_col_names:
                # By concatenating names
                df_transformed.columns = [
                    "_".join(col).strip("_") for col in df_transformed.columns
                ]
            else:
                # By using level 1 names
                df_transformed.columns = list(
                    df_transformed.columns.get_level_values(1))
                if np.any(df_transformed.columns.duplicated()):
                    warnings.warn(
                        "Column names are not unique. Use `extend_col_names=True` "
                        "to uniquely identify every column.")
    else:
        # No grouping is requested
        df_transformed = df

    if unpack_list and df_transformed.shape[0] > 0:
        # Identifies the columns that contain list elements
        which_list_cols = df_transformed.iloc[0].apply(
            lambda x: isinstance(x, (list, tuple)))
        list_cols = list(which_list_cols[which_list_cols].index)
        for col in list_cols:
            if isinstance(df_transformed[col], pd.DataFrame):
                warnings.warn(
                    f"Skipping list unpacking for `{col}`. There are multiple columns "
                    f"with this name. Make sure column names are unique to enable unpacking."
                )
                continue
            # Unpacks the column, creating one column for each list entry
            list_df = pd.DataFrame(df_transformed[col].to_list())
            n_cols = list_df.shape[1]
            # Adds column names
            if list_names_dict is not None and col in list_names_dict:
                found_length = len(list_names_dict[col])
                if found_length != n_cols:
                    raise ValueError(
                        f"list_names_dict['{col}'] has length {found_length}, "
                        f"but there are {n_cols} columns to name. Example row(s):\n"
                        f"{list_df.head(2)}")
                list_df.columns = list(list_names_dict[col])
            else:
                list_df.columns = [f"{col}{i}" for i in range(n_cols)]
            # Replaces the original column with the new ones
            list_df.index = df_transformed.index
            del df_transformed[col]
            df_transformed = pd.concat([df_transformed, list_df], axis=1)

        if list_names_dict:
            unused_names = sorted(
                list(set(list_names_dict.keys()) - set(list_cols)))
            if len(unused_names) > 0:
                warnings.warn(
                    "These names from `list_names_dict` are not used, because the "
                    "column (key) is not found in the dataframe after aggregation:\n"
                    f"{unused_names}.\nAvailable columns are:\n"
                    f"{list_cols}.")

    return df_transformed
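
A short usage sketch of ``flexible_grouping_evaluation`` (the column names here are hypothetical):

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "dow": ["Mon", "Mon", "Tue", "Tue"],
    "actual": [10.0, 12.0, 9.0, 11.0],
    "predicted": [11.0, 11.5, 10.0, 10.0],
})
result = flexible_grouping_evaluation(
    df,
    map_func_dict={
        # Row-wise absolute error, added as a new column before grouping.
        "abs_err": lambda row: abs(row["predicted"] - row["actual"]),
    },
    groupby_col="dow",
    agg_kwargs={
        "MAE": pd.NamedAgg(column="abs_err", aggfunc=np.nanmean),
    })
# `result` is indexed by "dow" with a single "MAE" column.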
Example #18
    def summary(self):
        """Prints input parameters and DummyRegressor model parameters."""
        log_message(self, LoggingLevelEnum.DEBUG)
        log_message(self.model, LoggingLevelEnum.DEBUG)
Example #19
    def split(self, X, y=None, groups=None):
        """Generates indices to split data into training and test CV folds according to rolling
          window time series cross validation

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
            Must have `shape` method.

        y : array-like, shape (n_samples,), optional
            The target variable for supervised learning problems. Always ignored, exists for compatibility.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set. Always ignored, exists for compatibility.

        Yields
        ------
        train : `numpy.array`
            The training set indices for that split.

        test : `numpy.array`
            The testing set indices for that split.
        """
        num_samples = X.shape[0]
        indices = np.arange(num_samples)

        n_splits_without_capping = self.get_n_splits_without_capping(X=X)
        n_splits = self.get_n_splits(X=X)
        if n_splits_without_capping == 0:
            warnings.warn(
                "There are no CV splits under the requested settings. Decrease `forecast_horizon` and/or"
                " `min_train_periods`. Using default 90/10 CV split")
        elif n_splits == 1:
            warnings.warn("There is only one CV split")
        elif n_splits >= 10:
            warnings.warn(
                f"There is a high number of CV splits ({n_splits}). If training is slow, increase "
                f"`periods_between_splits` or `min_train_periods`, or decrease `max_splits`"
            )

        log_message(f"There are {n_splits} CV splits.", LoggingLevelEnum.INFO)

        if n_splits_without_capping == 0:  # uses default split
            default_split_ratio = 0.9
            train_samples = int(round(num_samples * default_split_ratio))
            yield indices[:train_samples], indices[train_samples:]
        else:  # determines which splits to keep so that up to max_splits are returned
            splits_to_keep = self._sample_splits(n_splits_without_capping)

            last_index = num_samples - 1
            test_end_index = self.__starting_test_index + self._get_offset(X=X)
            current_split_index = 0
            while test_end_index <= last_index:
                test_start_index = test_end_index - self.forecast_horizon + 1
                train_end_index = test_start_index - self.periods_between_train_test - 1
                train_start_index = 0 if self.expanding_window else train_end_index - self.min_train_periods + 1
                assert train_start_index >= 0  # guaranteed by n_splits > 0

                if current_split_index in splits_to_keep:
                    log_message(
                        f"CV split: Train {train_start_index} to {train_end_index}. "
                        f"Test {test_start_index} to {test_end_index}.",
                        LoggingLevelEnum.DEBUG)
                    yield (indices[train_start_index:train_end_index + 1],
                           indices[test_start_index:test_end_index + 1])

                test_end_index += self.periods_between_splits
                current_split_index += 1
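
Finally, a hedged usage sketch of ``split`` (the constructor arguments are assumed from the attributes the method references, not taken from the source):

import numpy as np

tscv = RollingTimeSeriesSplit(
    forecast_horizon=7,
    min_train_periods=60,
    periods_between_splits=7,
    periods_between_train_test=0,
    max_splits=3)
X = np.zeros((100, 1))  # any input with a `shape` attribute works
for train, test in tscv.split(X=X):
    print(f"train: {train[0]}..{train[-1]}, test: {test[0]}..{test[-1]}")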