예제 #1
0
def test_predict_score_df():
    """Checks that ``_PredictScorerDF`` can score a fitted ``DummyEstimator``.

    With MAPE as the score function the score is expected to be ``np.nan``
    (with a warning) for both signs tested. With MAE and sign -1 the
    expected score is -5.0.
    """
    n_days = 20
    X = pd.DataFrame({
        TIME_COL: pd.date_range("2018-01-01", periods=n_days, freq="D"),
        # The first value is 0, so MAPE will divide by 0.
        VALUE_COL: np.arange(n_days),
    })
    model = DummyEstimator()
    model.fit(X)

    def method_caller(estimator, method, *args, **kwargs):
        """Look up ``method`` on ``estimator`` and invoke it with the given arguments."""
        bound_method = getattr(estimator, method)
        return bound_method(*args, **kwargs)

    undefined_score_msg = "Score is undefined for this split, setting to `np.nan`."
    with pytest.warns(Warning, match=undefined_score_msg):
        # Same check for a positive and a negative sign.
        for sign in (1, -1):
            mape_scorer = _PredictScorerDF(mean_absolute_percent_error, sign, {})
            mape_score = mape_scorer._score(method_caller, model, X, X[VALUE_COL])
            assert np.isnan(mape_score)

    mae_scorer = _PredictScorerDF(mean_absolute_error, -1, {})
    mae_score = mae_scorer._score(method_caller, model, X, X[VALUE_COL])
    model.predict(X)
    # MAE of 9.5 vs [0, 1, 2, ..., 19]
    assert mae_score == -5.0
예제 #2
0
def test_score():
    """Checks ``DummyEstimator.score`` against ``score_func`` applied by hand.

    The expected value is the MAE between the actuals and a constant
    forecast of 3.0.
    """
    X = pd.DataFrame({
        TIME_COL: pd.date_range("2018-01-01", periods=3, freq="1D"),
        VALUE_COL: [2.0, 3.0, 4.0],
    })
    model = DummyEstimator(strategy="mean", score_func=mean_absolute_error)
    model.fit(X)  # prediction is 3.0

    actual = pd.Series([1.0, 2.0, 3.0])
    constant_forecast = np.repeat(3.0, X.shape[0])
    expected_score = mean_absolute_error(actual, constant_forecast)

    assert model.score(X, actual) == expected_score
예제 #3
0
def test_quantile_model():
    """Checks the quantile strategy when the input uses custom column names.

    The prediction frame is expected to use ``TIME_COL`` / ``PREDICTED_COL``
    and to hold the constant value 8.0 for every row.
    """
    time_name, value_name = "time_name", "value_name"
    X = pd.DataFrame({
        time_name: pd.date_range("2018-01-01", periods=11, freq="D"),
        value_name: np.arange(11),
    })

    model = DummyEstimator(strategy="quantile", quantile=0.8)
    model.fit(X, time_col=time_name, value_col=value_name)
    predicted = model.predict(X)

    n_rows = X.shape[0]
    expected = pd.DataFrame({
        TIME_COL: X[time_name],
        PREDICTED_COL: np.repeat(8.0, n_rows),
    })

    assert predicted.equals(expected)
예제 #4
0
def test_constant_model():
    """Checks that the constant strategy predicts the configured constant for every row."""
    constant = 1.0
    X = pd.DataFrame({
        TIME_COL: pd.date_range("2018-01-01", periods=3, freq="1D"),
        VALUE_COL: [2, 3, 4],
    })

    model = DummyEstimator(strategy="constant", constant=constant)
    model.fit(X)
    predicted = model.predict(X)

    n_rows = X.shape[0]
    expected = pd.DataFrame({
        TIME_COL: X[TIME_COL],
        PREDICTED_COL: np.repeat(constant, n_rows),
    })

    assert predicted.equals(expected)
예제 #5
0
def test_fit_predict1():
    """Checks fitting with ``sample_weight`` and predicting on a different frame.

    The model is trained on three rows and asked to predict four timestamps;
    the expected prediction is 3.75 for each of them.
    """
    train_df = pd.DataFrame({
        TIME_COL: pd.date_range("2018-01-01", periods=3, freq="1D"),
        VALUE_COL: [2, 3, 5],
    })
    # The prediction frame has timestamps only, and one more row than the training frame.
    df_test = pd.DataFrame({
        TIME_COL: pd.date_range("2018-01-01", periods=4, freq="1D"),
    })

    model = DummyEstimator()
    model.fit(train_df, sample_weight=[1, 1, 2])
    predicted = model.predict(df_test)

    n_test_rows = df_test.shape[0]
    expected = pd.DataFrame({
        TIME_COL: df_test[TIME_COL],
        PREDICTED_COL: np.repeat(3.75, n_test_rows),
    })

    assert predicted.equals(expected)
def test_pipeline_union(X, fs):
    """Tests PandasFeatureUnion on a pipeline of transformers and estimator.

    Shows that the null model extracted from the estimator in a pipeline is
    equivalent to a null model trained directly, and to a dummy estimator
    placed in a pipeline itself.
    """
    z_cutoff = 2.0

    # Pipeline ending in the full estimator; predictions come from its fitted null model.
    silverkite = SimpleSilverkiteEstimator(
        score_func=mean_squared_error,
        coverage=0.80,
        null_model_params={"strategy": "mean"})
    estimator_pipeline = Pipeline([
        ("input", fs),
        ("estimator", silverkite),
    ])
    estimator_pipeline.set_params(input__response__outlier__z_cutoff=z_cutoff)
    estimator_pipeline.fit(X)
    _, fitted_estimator = estimator_pipeline.steps[-1]
    null_from_estimator = fitted_estimator.null_model.predict(X)

    # Pipeline ending in the dummy estimator itself.
    dummy_pipeline = Pipeline([
        ("input", fs),
        ("dummy", DummyEstimator(score_func=mean_squared_error, strategy="mean")),
    ])
    dummy_pipeline.fit(X)
    null_from_pipeline = dummy_pipeline.predict(X)

    # Same transformations applied step by step, without Pipeline.
    selected = ColumnSelector([VALUE_COL]).fit_transform(X)
    without_outliers = ZscoreOutlierTransformer(z_cutoff=z_cutoff).fit_transform(selected)
    imputed = NullTransformer().fit_transform(without_outliers)
    manual_input = pd.concat([X[TIME_COL], imputed], axis=1)
    manual_model = DummyEstimator(strategy="mean")
    manual_model.fit(manual_input)
    null_by_hand = manual_model.predict(manual_input)

    assert null_from_estimator.equals(null_by_hand)
    assert null_from_pipeline.equals(null_by_hand)
예제 #7
0
def test_fit_predict():
    """Checks fit/predict of a default ``DummyEstimator``.

    Two inputs are tried, one without and one with an ``np.nan`` value;
    in both cases the expected prediction is 3.0 for every row.
    """
    value_series = (
        [2, 3, 4],
        [2, 3, np.nan, 4],  # with np.nan value
    )
    for values in value_series:
        X = pd.DataFrame({
            TIME_COL: pd.date_range("2018-01-01", periods=len(values), freq="1D"),
            VALUE_COL: values,
        })

        # A fresh model is trained for every input.
        model = DummyEstimator()
        model.fit(X)
        predicted = model.predict(X)

        expected = pd.DataFrame({
            TIME_COL: X[TIME_COL],
            PREDICTED_COL: np.repeat(3.0, X.shape[0]),
        })

        assert predicted.equals(expected)
예제 #8
0
class BaseForecastEstimator(BaseEstimator, RegressorMixin, ABC):
    """A base class for forecast models. Fits a timeseries and predicts future values.

    Parameters
    ----------
    score_func : callable, optional, default=mean_squared_error
        Function to calculate model R2_null_model_score score,
        with signature (actual, predicted).
        `actual`, `predicted` are array-like with the same shape.
        Smaller values are better.

    coverage : float, optional, default=0.95
        Intended coverage of the prediction bands (0.0 to 1.0).
        If None, the upper/lower predictions are not returned by `predict`.

        Every subclass must use `coverage` to set prediction band width. This ensures a common
        BaseForecastEstimator interface for parameters used during fitting and forecast evaluation.

    null_model_params : dict with arguments passed to the null model (`DummyEstimator`), optional, default=None
        Dictionary keys must be in ("strategy", "constant", "quantile").
        Defines null model. Model score is R2_null_model_score of model error relative to null model,
        evaluated by score_func.
        If None, model score is score_func of the model itself.
        The dictionary is not modified by ``fit``.

    Attributes
    ----------
    null_model : DummyEstimator or None
        Null model used to measure model score.
        None if ``null_model_params`` is None when ``self.fit()`` is called.
    time_col_ : str
        Name of input data time column
    value_col_ : str
        Name of input data value column
    last_predicted_X_ : `pandas.DataFrame` or None
        The ``X`` last passed to ``self.predict()``.
        Used to speed up predictions if the same ``X`` is passed repeatedly.
        Resets to None when ``self.fit()`` is called.
    cached_predictions_ : `pandas.DataFrame` or None
        The return value of the last call to ``self.predict()``.
        Used to speed up predictions if the same ``X`` is passed repeatedly.
        Resets to None when ``self.fit()`` is called.
    """
    @abstractmethod
    def __init__(self,
                 score_func=mean_squared_error,
                 coverage=0.95,
                 null_model_params=None):
        """Initializes attributes common to every BaseForecastEstimator

        Subclasses must also have these parameters. Every subclass must call:

            super().__init__(score_func=score_func, coverage=coverage, null_model_params=null_model_params)

        """
        self.score_func = score_func
        self.coverage = coverage
        self.null_model_params = null_model_params

        # initializes attributes defined in fit
        self.null_model = None
        self.time_col_ = None
        self.value_col_ = None

        # initializes attributes defined in predict
        self.last_predicted_X_ = None  # the most recent X passed to self.predict()
        self.cached_predictions_ = None  # the most recent return value of self.predict()

    @abstractmethod
    def fit(self,
            X,
            y=None,
            time_col=cst.TIME_COL,
            value_col=cst.VALUE_COL,
            **fit_params):
        """Fits a model to training data
        Also fits the null model, if specified, for use in evaluating the `score` function

        Every subclass must call this::
            super().fit(X, y=y, time_col=time_col, value_col=value_col, **fit_params)

        Parameters
        ----------
        X : `pandas.DataFrame`
            Input timeseries, with timestamp column,
            value column, and any additional regressors.
            The value column is the response, included in
            ``X`` to allow transformation by `sklearn.pipeline`.
        y : ignored
            The original timeseries values, ignored.
            (The y for fitting is included in X.)
        time_col : `str`
            Time column name in X.
        value_col : `str`
            Value column name in X.
        fit_params : `dict`
            Additional parameters supported by subclass `fit` or null model.
            Only ``sample_weight`` is forwarded to the null model.
        """
        self.time_col_ = time_col  # to be used in `predict` to select proper column
        self.value_col_ = value_col
        # Null model must be initialized here, otherwise scikit-learn
        # grid search will not be able to set the parameters.
        # See https://scikit-learn.org/stable/developers/develop.html#instantiation.
        if self.null_model_params is not None:
            # Adds score function to a copy of the null model parameters.
            # `self.null_model_params` is a constructor parameter (and may be
            # a dict shared with the caller), so it must not be mutated by `fit`.
            null_model_params = dict(self.null_model_params)
            null_model_params["score_func"] = self.score_func
            self.null_model = DummyEstimator(**null_model_params)
            # Passes `sample_weight` rather than `**fit_params` to avoid unexpected keyword argument from the main
            #   estimator's parameters
            sample_weight = fit_params.get("sample_weight")
            self.null_model.fit(X,
                                y=y,
                                time_col=time_col,
                                value_col=value_col,
                                sample_weight=sample_weight)
        else:
            # Drops a null model left over from an earlier fit, so that `score`
            # does not compare against a model trained on other data/parameters.
            self.null_model = None

        # Clears the cached result, because it is no longer valid for the updated model
        self.last_predicted_X_ = None
        self.cached_predictions_ = None

    @abstractmethod
    def predict(self, X, y=None):
        """Creates forecast for dates specified in X

        To enable caching, every subclass must call this at the beginning
        of its ``.predict()``. Before returning the result, the subclass
        ``.predict()`` must set ``self.cached_predictions_`` to the return value.

        Parameters
        ----------
        X : `pandas.DataFrame`
            Input timeseries with timestamp column and any additional regressors.
            Timestamps are the dates for prediction.
            Value column, if provided in X, is ignored.
        y : ignored

        Returns
        -------
        predictions : `pandas.DataFrame` or None
            This base implementation returns the cached predictions if ``X``
            equals the ``X`` of the previous call, otherwise None.

            A subclass returns the forecasted values for the dates in X. Columns:

                - TIME_COL dates
                - PREDICTED_COL predictions
                - PREDICTED_LOWER_COL lower bound of predictions, optional
                - PREDICTED_UPPER_COL upper bound of predictions, optional
                - [other columns], optional

            ``PREDICTED_LOWER_COL`` and ``PREDICTED_UPPER_COL`` are present
            if ``self.coverage`` is not None.
        """
        if self.cached_predictions_ is not None and X.equals(
                self.last_predicted_X_):
            log_message("Returning cached predictions.",
                        LoggingLevelEnum.DEBUG)
            return self.cached_predictions_

        # Updates `last_predicted_X_` to the new value.
        # To enable caching, the subclass must set
        # `self.cached_predictions_` to the returned result.
        self.last_predicted_X_ = X
        return None

    def summary(self):
        """Creates human readable string of how the model works, including relevant diagnostics
        These details cannot be extracted from the forecast alone
        Prints model configuration. Extend this in child class to print the trained model parameters.

        Log message is printed to the cst.LOGGER_NAME logger.
        """
        log_message(self,
                    LoggingLevelEnum.DEBUG)  # print model input parameters

    def score(self, X, y, sample_weight=None):
        """Default scorer for the estimator (Used in GridSearchCV/RandomizedSearchCV if scoring=None)

        Notes
        -----
        If a null model was fitted (``null_model_params`` is not None), returns
        R2_null_model_score of model error relative to null model, evaluated by score_func.

        If null_model_params is None, returns score_func of the model itself.

        By default, grid search (with no `scoring` parameter) optimizes improvement of ``score_func``
        against null model.

        To optimize a different score function, pass `scoring` to GridSearchCV/RandomizedSearchCV.

        Parameters
        ----------
        X : `pandas.DataFrame`
            Input timeseries with timestamp column and any additional regressors.
            Value column, if provided in X, is ignored
        y : `pandas.Series` or  `numpy.array`
            Actual value, used to compute error
        sample_weight : `pandas.Series` or  `numpy.array`
            ignored

        Returns
        -------
        score : `float` or None
            Comparison of predictions against null predictions, according to specified score function
        """
        y_pred = self.predict(X)[cst.PREDICTED_COL]
        if self.null_model is None:
            # No null model to compare against: score the model directly.
            return self.score_func(y, y_pred)

        y_pred_null = self.null_model.predict(X)[cst.PREDICTED_COL]
        return r2_null_model_score(y,
                                   y_pred,
                                   y_pred_null=y_pred_null,
                                   loss_func=self.score_func)