def test_get_best_not_failing_model(X_y_optional, negative_data, best_model_name, rank, expected_error):
    X, y = X_y_optional
    # data contains 0
    y[y < 1] = 1
    if negative_data:
        y[-1] = -1
    models = [
        ExponentialSmoothingWrapper(freq="D", trend="mul"),
        get_sklearn_wrapper(DummyRegressor, strategy="constant", constant=-5000),
    ]
    models = models if expected_error is None else models[:1]
    grid_search = GridSearchCV(
        estimator=Pipeline([("model", "passthrough")]),
        param_grid=[{"model": models}],
        scoring=get_scorer("neg_mean_absolute_error"),
        cv=FinerTimeSplit(n_splits=1, horizon=5),
        refit=False,
        error_score=np.nan,
    )
    grid_search.fit(X, y)

    if expected_error:
        with pytest.raises(expected_error):
            get_best_not_failing_model(grid_search, X, y)
    else:
        best_param_rank = get_best_not_failing_model(grid_search, X, y)
        assert isinstance(best_param_rank, dict)
        assert best_param_rank["params"]["model"].__class__.__name__ == best_model_name
        assert best_param_rank["rank"] == rank
def test_cv_finertimesplit_split_pandas_container_data(ts_data, expected_error):
    n_splits = 2
    horizon = 3
    fts = FinerTimeSplit(n_splits=n_splits, horizon=horizon)

    if expected_error is None:
        result = fts.split(ts_data)
        assert isinstance(result, types.GeneratorType)
        result = list(result)
        assert len(result) == n_splits
        for i, isplit in enumerate(result):
            assert len(isplit) == 2
            assert len(isplit[0]) == len(ts_data) - (n_splits - i) * horizon
            assert len(isplit[1]) == horizon
            assert np.array_equal(isplit[0], np.arange(len(ts_data) - (n_splits - i) * horizon))
            assert np.array_equal(isplit[1], np.arange(horizon) + len(ts_data) - (n_splits - i) * horizon)
    else:
        with pytest.raises(expected_error):
            _ = list(fts.split(ts_data))
def test_cv_finertimesplit_split_input_data_types(test_data, expected_error):
    n_splits = 2
    horizon = 3
    fts = FinerTimeSplit(n_splits=n_splits, horizon=horizon)

    if expected_error is None:
        result = list(fts.split(test_data))
        assert len(result) == n_splits
        for i, isplit in enumerate(result):
            assert len(isplit) == 2
            assert len(isplit[0]) == len(test_data) - (n_splits - i) * horizon
            assert len(isplit[1]) == horizon
            assert np.array_equal(isplit[0], np.arange(len(test_data) - (n_splits - i) * horizon))
            assert np.array_equal(
                isplit[1],
                np.arange(horizon) + len(test_data) - (n_splits - i) * horizon,
            )
    else:
        with pytest.raises(expected_error):
            _ = list(fts.split(test_data))
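# Illustrative sketch (not part of the original tests): based solely on the
# assertions above, FinerTimeSplit yields (train_indices, test_indices) pairs
# with an expanding train window and a fixed-length test window. The helper
# name is hypothetical and prefixed with "_" so pytest does not collect it.
def _example_finertimesplit_usage():
    import numpy as np

    from hcrystalball.model_selection import FinerTimeSplit

    data = np.arange(10)
    splits = list(FinerTimeSplit(n_splits=2, horizon=3).split(data))
    # Expected, following the assertions above:
    #   split 0: train [0 1 2 3],       test [4 5 6]
    #   split 1: train [0 1 2 3 4 5 6], test [7 8 9]
    return splits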
def grid_search(request):
    from sklearn.dummy import DummyRegressor
    from sklearn.metrics import mean_absolute_error
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    from hcrystalball.feature_extraction import HolidayTransformer
    from hcrystalball.feature_extraction import SeasonalityTransformer
    from hcrystalball.metrics import make_ts_scorer
    from hcrystalball.model_selection import FinerTimeSplit
    from hcrystalball.wrappers import get_sklearn_wrapper

    scoring = make_ts_scorer(mean_absolute_error, greater_is_better=False)

    bad_dummy = get_sklearn_wrapper(DummyRegressor, strategy="constant", constant=42, name="bad_dummy", lags=2)
    good_dummy = get_sklearn_wrapper(DummyRegressor, strategy="mean", name="good_dummy", lags=2)

    parameters = [
        {"model": [good_dummy]},
        {
            "model": [bad_dummy],
            "model__strategy": ["constant"],
            "model__constant": [42],
        },
    ]

    holiday_model = Pipeline(
        [
            ("holiday", HolidayTransformer(country_code_column="Holidays_code")),
            ("seasonality", SeasonalityTransformer(week_day=True, freq="D")),
            ("model", good_dummy),
        ]
    )
    cv = FinerTimeSplit(n_splits=2, horizon=5)
    grid_search = GridSearchCV(holiday_model, parameters, cv=cv, scoring=scoring)

    return grid_search
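# Hedged usage sketch (not from the original suite): a test consuming the
# grid_search fixture would typically build exogenous data carrying the
# "Holidays_code" column expected by the HolidayTransformer above plus a
# target series, then fit and inspect the best parameters. The data values
# and the country code "DE" are illustrative assumptions; the leading
# underscore keeps this helper out of pytest collection, whereas a real test
# would be named test_* and receive the fixture by argument name.
def _example_grid_search_usage(grid_search):
    import numpy as np
    import pandas as pd

    index = pd.date_range("2020-01-01", periods=30, freq="D")
    X = pd.DataFrame({"Holidays_code": "DE"}, index=index)
    y = pd.Series(np.arange(30, dtype=float), index=index)

    grid_search.fit(X, y)
    return grid_search.best_params_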
def fit(self, X, y=None):
    """Fit the stacking ensemble model

    Parameters
    ----------
    X: pandas.DataFrame
        Input features.

    y: numpy.ndarray
        Target vector.

    Returns
    -------
    StackingEnsemble
        A fitted StackingEnsemble instance
    """
    self._check_base_learners_names(self.base_learners)

    # Fit the base learners and the meta_model
    if (not self.fitted) or self.fit_meta_model_always:
        splitter = FinerTimeSplit(horizon=self.train_horizon, n_splits=self.train_n_splits)
        n_train_meta = self.train_n_splits * self.train_horizon
        X_meta = pd.DataFrame(
            index=X.index[-n_train_meta:],
            columns=[get_estimator_name(bl) for bl in self.base_learners],
        )
        y_meta = y[-n_train_meta:]

        # Get base learners predictions
        for ind_train, ind_pred in splitter.split(X):
            X_train = X.iloc[ind_train, :]
            X_pred = X.iloc[ind_pred, :]
            y_train = y[ind_train]
            self._fit_base_learners(X_train, y_train)
            X_meta.loc[X_pred.index, :] = self._predict_features_for_meta_models(X_pred)

        # Add dummy horizon variable for meta model
        if self.horizons_as_features:
            X_meta = pd.concat(
                [
                    X_meta,
                    self._create_horizons_as_features(
                        cross_results_index=X_meta.index,
                        horizon=self.train_horizon,
                        n_splits=self.train_n_splits,
                    ),
                ],
                axis=1,
            )
        if self.weekdays_as_features:
            X_meta = pd.concat(
                [X_meta, self._create_weekdays_as_features(cross_results_index=X_meta.index)],
                axis=1,
            )
        self._fit_columns = X_meta.columns
        self.meta_model.fit(X_meta.values, y_meta)

    # Fit the base learners on the whole training set
    self._fit_base_learners(X, y)
    self.fitted = True

    return self
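# Hedged usage sketch (not part of the library source): the constructor
# arguments mirror the attributes referenced in fit() above (base_learners,
# meta_model, train_horizon, train_n_splits); the import path, the exogenous
# column and the wrapper setup are assumptions for illustration only.
def _example_stacking_ensemble_usage():
    import numpy as np
    import pandas as pd
    from sklearn.dummy import DummyRegressor
    from sklearn.linear_model import LinearRegression

    from hcrystalball.ensemble import StackingEnsemble
    from hcrystalball.wrappers import get_sklearn_wrapper

    index = pd.date_range("2020-01-01", periods=60, freq="D")
    X = pd.DataFrame({"trend": np.arange(60, dtype=float)}, index=index)
    y = np.random.default_rng(0).normal(size=60)

    ensemble = StackingEnsemble(
        base_learners=[
            get_sklearn_wrapper(DummyRegressor, strategy="mean", name="dummy_mean"),
            get_sklearn_wrapper(DummyRegressor, strategy="median", name="dummy_median"),
        ],
        meta_model=LinearRegression(),
        train_n_splits=2,
        train_horizon=5,
    )
    # Base learners are first fit out-of-sample to build the meta-model's
    # training features, then refit on the full training set (see fit() above).
    ensemble.fit(X, y)
    return ensemble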