Example #1
import numpy as np
import pytest
from sklearn.dummy import DummyRegressor
from sklearn.metrics import get_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from hcrystalball.model_selection import FinerTimeSplit
from hcrystalball.wrappers import ExponentialSmoothingWrapper
from hcrystalball.wrappers import get_sklearn_wrapper

# Import path for get_best_not_failing_model assumed; it is not shown in this listing.
from hcrystalball.model_selection.utils import get_best_not_failing_model


def test_get_best_not_failing_model(X_y_optional, negative_data, best_model_name, rank, expected_error):
    X, y = X_y_optional
    # clamp non-positive values: the multiplicative-trend model needs y > 0
    y[y < 1] = 1
    if negative_data:
        y[-1] = -1
    models = [
        ExponentialSmoothingWrapper(freq="D", trend="mul"),
        get_sklearn_wrapper(DummyRegressor, strategy="constant", constant=-5000),
    ]
    # when an error is expected, drop the dummy fallback so every remaining model fails
    models = models if expected_error is None else models[:1]
    grid_search = GridSearchCV(
        estimator=Pipeline([("model", "passthrough")]),
        param_grid=[{"model": models}],
        scoring=get_scorer("neg_mean_absolute_error"),
        cv=FinerTimeSplit(n_splits=1, horizon=5),
        refit=False,
        error_score=np.nan,
    )

    grid_search.fit(X, y)

    if expected_error:
        with pytest.raises(expected_error):
            get_best_not_failing_model(grid_search, X, y)
    else:
        best_param_rank = get_best_not_failing_model(grid_search, X, y)
        assert isinstance(best_param_rank, dict)
        assert best_param_rank["params"]["model"].__class__.__name__ == best_model_name
        assert best_param_rank["rank"] == rank
Example #2
import types

import numpy as np
import pytest

from hcrystalball.model_selection import FinerTimeSplit


def test_cv_finertimesplit_split_pandas_container_data(ts_data, expected_error):
    n_splits = 2
    horizon = 3
    fts = FinerTimeSplit(n_splits=n_splits, horizon=horizon)
    if expected_error is None:
        result = fts.split(ts_data)
        assert isinstance(result, types.GeneratorType)
        result = list(result)
        assert len(result) == n_splits
        for i, isplit in enumerate(result):
            assert len(isplit) == 2
            assert len(isplit[0]) == len(ts_data) - (n_splits - i) * horizon
            assert len(isplit[1]) == horizon
            assert np.array_equal(isplit[0], np.arange(len(ts_data) - (n_splits - i) * horizon))
            assert np.array_equal(isplit[1], np.arange(horizon) + len(ts_data) - (n_splits - i) * horizon)
    else:
        with pytest.raises(expected_error):
            _ = list(fts.split(ts_data))
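The assertions above pin down FinerTimeSplit's expanding-window behaviour. As a quick sanity check, a toy run (the ten-point series is ours, not from the source):

import numpy as np
import pandas as pd
from hcrystalball.model_selection import FinerTimeSplit

# Ten daily points, two splits with horizon three: the training window expands,
# the test window is always the next `horizon` positional indices.
data = pd.Series(np.arange(10.0), index=pd.date_range("2020-01-01", periods=10))
for train_idx, test_idx in FinerTimeSplit(n_splits=2, horizon=3).split(data):
    print(train_idx, test_idx)
# [0 1 2 3] [4 5 6]
# [0 1 2 3 4 5 6] [7 8 9]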
Example #3
# Same imports as Example #2 (types, numpy, pytest, FinerTimeSplit).
def test_cv_finertimesplit_split_input_data_types(test_data, expected_error):
    n_splits = 2
    horizon = 3
    fts = FinerTimeSplit(n_splits=n_splits, horizon=horizon)
    if expected_error is None:
        result = list(fts.split(test_data))
        assert len(result) == n_splits
        for i, isplit in enumerate(result):
            assert len(isplit) == 2
            assert len(isplit[0]) == len(test_data) - (n_splits - i) * horizon
            assert len(isplit[1]) == horizon
            assert np.array_equal(isplit[0], np.arange(len(test_data) - (n_splits - i) * horizon))
            assert np.array_equal(
                isplit[1],
                np.arange(horizon) + len(test_data) - (n_splits - i) * horizon,
            )
    else:
        with pytest.raises(expected_error):
            _ = list(fts.split(test_data))
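Because the assertions are purely length-based, the splitter should return the same positional indices for any container it accepts. A sketch under that assumption (this test is precisely about which input types are allowed, so treat the numpy case as unverified):

import numpy as np
import pandas as pd
from hcrystalball.model_selection import FinerTimeSplit

fts = FinerTimeSplit(n_splits=2, horizon=3)
for container in (np.zeros(10), pd.Series(np.zeros(10))):
    train_idx, test_idx = next(fts.split(container))
    assert list(train_idx) == [0, 1, 2, 3]
    assert list(test_idx) == [4, 5, 6]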
Example #4
import pytest


# The `request` argument and the factory shape suggest a pytest fixture;
# the decorator below is assumed, as the listing does not show it.
@pytest.fixture
def grid_search(request):
    from sklearn.dummy import DummyRegressor
    from sklearn.metrics import mean_absolute_error
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    from hcrystalball.feature_extraction import HolidayTransformer
    from hcrystalball.feature_extraction import SeasonalityTransformer
    from hcrystalball.metrics import make_ts_scorer
    from hcrystalball.model_selection import FinerTimeSplit
    from hcrystalball.wrappers import get_sklearn_wrapper

    scoring = make_ts_scorer(mean_absolute_error, greater_is_better=False)

    bad_dummy = get_sklearn_wrapper(DummyRegressor,
                                    strategy="constant",
                                    constant=42,
                                    name="bad_dummy",
                                    lags=2)
    good_dummy = get_sklearn_wrapper(DummyRegressor,
                                     strategy="mean",
                                     name="good_dummy",
                                     lags=2)

    parameters = [
        {
            "model": [good_dummy]
        },
        {
            "model": [bad_dummy],
            "model__strategy": ["constant"],
            "model__constant": [42],
        },
    ]

    holiday_model = Pipeline([
        ("holiday", HolidayTransformer(country_code_column="Holidays_code")),
        ("seasonality", SeasonalityTransformer(week_day=True, freq="D")),
        ("model", good_dummy),
    ])
    cv = FinerTimeSplit(n_splits=2, horizon=5)
    grid_search = GridSearchCV(holiday_model,
                               parameters,
                               cv=cv,
                               scoring=scoring)

    return grid_search
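A test consuming this fixture just declares it as an argument. A minimal sketch in which the toy data is our own; only the "Holidays_code" column name is dictated by the fixture's HolidayTransformer:

def test_grid_search_fits(grid_search):
    import numpy as np
    import pandas as pd

    # 30 daily points; the scalar country code is broadcast over the index.
    X = pd.DataFrame(
        {"Holidays_code": "DE"},
        index=pd.date_range("2020-01-01", periods=30, freq="D"),
    )
    y = pd.Series(np.arange(30.0), index=X.index)

    grid_search.fit(X, y)
    assert "mean_test_score" in grid_search.cv_results_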
Example #5
    def fit(self, X, y=None):
        """Fit the stacking ensemble model

        Parameters
        ----------
        X: pandas.DataFrame
            Input features.

        y: numpy.ndarray
            Target vector.

        Returns
        -------
        StackingEnsemble
            A fitted StackingEnsemble instance
        """
        self._check_base_learners_names(self.base_learners)

        # Fit the base learners and the meta_model
        if (not self.fitted) or self.fit_meta_model_always:
            splitter = FinerTimeSplit(horizon=self.train_horizon,
                                      n_splits=self.train_n_splits)

            n_train_meta = self.train_n_splits * self.train_horizon
            X_meta = pd.DataFrame(
                index=X.index[-n_train_meta:],
                columns=[get_estimator_name(bl) for bl in self.base_learners],
            )
            y_meta = y[-n_train_meta:]
            # Collect the base learners' out-of-sample predictions for the meta-model
            for ind_train, ind_pred in splitter.split(X):
                X_train = X.iloc[ind_train, :]
                X_pred = X.iloc[ind_pred, :]
                y_train = y[ind_train]

                self._fit_base_learners(X_train, y_train)
                X_meta.loc[X_pred.index, :] = self._predict_features_for_meta_models(X_pred)
            # Add dummy horizon variables for the meta-model
            if self.horizons_as_features:
                X_meta = pd.concat(
                    [
                        X_meta,
                        self._create_horizons_as_features(
                            cross_results_index=X_meta.index,
                            horizon=self.train_horizon,
                            n_splits=self.train_n_splits,
                        ),
                    ],
                    axis=1,
                )
            if self.weekdays_as_features:
                X_meta = pd.concat(
                    [
                        X_meta,
                        self._create_weekdays_as_features(
                            cross_results_index=X_meta.index)
                    ],
                    axis=1,
                )

            self._fit_columns = X_meta.columns
            self.meta_model.fit(X_meta.values, y_meta)

        # Fit the base learners on the whole training set
        self._fit_base_learners(X, y)
        self.fitted = True

        return self
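The cross-validation loop above is the heart of the scheme: each fold refits the base learners on an expanding window and stores their out-of-sample predictions as meta-features, so the meta-model never sees in-sample base-learner output. A schematic re-implementation with plain sklearn estimators (all names and toy data here are ours, not hcrystalball's API):

import numpy as np
import pandas as pd
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
X = pd.DataFrame({"x": np.arange(30.0)}, index=pd.date_range("2020-01-01", periods=30))
y = X["x"].to_numpy() + rng.normal(size=30)

horizon, n_splits = 5, 2
n_train_meta = n_splits * horizon  # meta-model trains on the last 10 points

base_learners = {"mean": DummyRegressor(strategy="mean"), "ols": LinearRegression()}
X_meta = pd.DataFrame(index=X.index[-n_train_meta:], columns=list(base_learners), dtype=float)

for i in range(n_splits):
    cut = len(X) - (n_splits - i) * horizon  # end of this fold's training window
    for name, model in base_learners.items():
        model.fit(X.iloc[:cut], y[:cut])
        col = X_meta.columns.get_loc(name)
        X_meta.iloc[i * horizon:(i + 1) * horizon, col] = model.predict(X.iloc[cut:cut + horizon])

meta_model = LinearRegression().fit(X_meta.values, y[-n_train_meta:])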