def test_pipeline():
    """Test results of TransformedTargetForecaster.

    The pipeline forecast is compared with the forecast obtained by applying
    the same transformers and forecaster by hand, one step at a time.
    """
    y = load_airline()
    y_train, y_test = temporal_train_test_split(y)
    fh = np.arange(len(y_test)) + 1

    pipeline = TransformedTargetForecaster(
        [
            ("t1", ExponentTransformer()),
            ("t2", TabularToSeriesAdaptor(MinMaxScaler())),
            ("forecaster", NaiveForecaster()),
        ]
    )
    pipeline.fit(y_train, fh=fh)
    actual = pipeline.predict()

    def _manual_prediction(series, horizon):
        # fitting: push the target through both transformers in pipeline order
        exponent = ExponentTransformer()
        transformed = exponent.fit_transform(series.copy())
        scaler = TabularToSeriesAdaptor(MinMaxScaler())
        transformed = scaler.fit_transform(transformed)
        naive = NaiveForecaster()
        naive.fit(transformed, fh=horizon)

        # predicting: undo the transformations in reverse order
        prediction = naive.predict()
        prediction = scaler.inverse_transform(prediction)
        return exponent.inverse_transform(prediction)

    expected = _manual_prediction(y_train, fh)
    np.testing.assert_array_equal(actual, expected)
def test_predict_time_index_with_X(
    Forecaster, index_type, fh_type, is_relative, steps
):
    """Check that predicted time index matches forecasting horizon.

    Runs once per column count reported by ``_get_n_columns`` for the
    forecaster's "scitype:y" tag, fitting and predicting with exogenous data.
    """
    probe = _construct_instance(Forecaster)
    column_counts = _get_n_columns(probe.get_tag("scitype:y"))

    # only the exogenous part of the generated problem is used below
    _, X = make_forecasting_problem(index_type=index_type, make_X=True)

    for n_columns in column_counts:
        forecaster = _construct_instance(Forecaster)
        y = _make_series(n_columns=n_columns, index_type=index_type)
        cutoff = y.index[len(y) // 2]
        fh = _make_fh(cutoff, steps, fh_type, is_relative)
        y_train, y_test, X_train, X_test = temporal_train_test_split(y, X, fh=fh)

        # Some estimators may not support all time index types and fh types,
        # hence NotImplementedError is tolerated here.
        try:
            forecaster.fit(y_train, X_train, fh=fh)
            y_pred = forecaster.predict(X=X_test)
            _assert_correct_pred_time_index(y_pred.index, y_train.index[-1], fh)
        except NotImplementedError:
            pass
def test_update_predict_predicted_index(
    self,
    estimator_instance,
    n_columns,
    fh_int_oos,
    window_length,
    step_length,
    update_params,
):
    """Check predicted index in update_predict.

    The index of the object returned by ``update_predict`` must equal the
    index computed by ``_get_expected_index_for_update_predict``.
    """
    y = _make_series(n_columns=n_columns, all_positive=True, index_type="datetime")
    y_train, y_test = temporal_train_test_split(y)

    splitter = SlidingWindowSplitter(
        fh_int_oos,
        window_length=window_length,
        step_length=step_length,
        start_with_window=False,
    )

    estimator_instance.fit(y_train, fh=fh_int_oos)
    y_pred = estimator_instance.update_predict(
        y_test, cv=splitter, update_params=update_params
    )
    assert isinstance(y_pred, (pd.Series, pd.DataFrame))

    expected_index = _get_expected_index_for_update_predict(
        y_test, fh_int_oos, step_length
    )
    np.testing.assert_array_equal(y_pred.index, expected_index)
def test_pipeline():
    """Check TransformedTargetForecaster with deseasonalizing and detrending.

    The pipeline forecast is compared with the forecast obtained by applying
    the same transformers and forecaster by hand, one step at a time.
    """
    y = load_airline()
    y_train, y_test = temporal_train_test_split(y)
    fh = np.arange(len(y_test)) + 1

    pipeline = TransformedTargetForecaster(
        [
            ("t1", Deseasonalizer(sp=12, model="multiplicative")),
            ("t2", Detrender(PolynomialTrendForecaster(degree=1))),
            ("forecaster", NaiveForecaster()),
        ]
    )
    pipeline.fit(y_train, fh)
    actual = pipeline.predict()

    def _manual_prediction(series, horizon):
        # fitting: push the target through both transformers in pipeline order
        deseasonalizer = Deseasonalizer(sp=12, model="multiplicative")
        transformed = deseasonalizer.fit_transform(series.copy())
        detrender = Detrender(PolynomialTrendForecaster(degree=1))
        transformed = detrender.fit_transform(transformed)
        naive = NaiveForecaster()
        naive.fit(transformed, horizon)

        # predicting: undo the transformations in reverse order
        prediction = naive.predict()
        prediction = detrender.inverse_transform(prediction)
        return deseasonalizer.inverse_transform(prediction)

    expected = _manual_prediction(y_train, fh)
    np.testing.assert_array_equal(actual, expected)
def test_weights_for_airline_averaging():
    """Check that a fitted OnlineEnsembleForecaster has equal weights of 1/3."""
    y = load_airline()
    y_train, _ = temporal_train_test_split(y)

    # all three members share the same multiplicative, period-12 seasonality
    seasonal = {"seasonal": "multiplicative", "sp": 12}
    members = [
        ("ses", ExponentialSmoothing(**seasonal)),
        ("holt", ExponentialSmoothing(trend="add", damped=False, **seasonal)),
        ("damped", ExponentialSmoothing(trend="add", damped=True, **seasonal)),
    ]
    ensemble = OnlineEnsembleForecaster(members)
    ensemble.fit(y_train)

    uniform = np.array([1 / 3, 1 / 3, 1 / 3])
    np.testing.assert_allclose(ensemble.weights, uniform, rtol=1e-8)
def test__y_and_cutoff(self, estimator_instance, n_columns):
    """Check cutoff and _y.

    Both must be None after construction, reflect the training data after
    ``fit``, and reflect training plus test data after ``update``.
    """
    forecaster = estimator_instance
    y = _make_series(n_columns=n_columns)
    y_train, y_test = temporal_train_test_split(y, train_size=0.75)

    # nothing is stored on a freshly constructed estimator
    assert forecaster._y is None
    assert forecaster.cutoff is None

    # fit must populate _y and move the cutoff to the end of the training data
    forecaster.fit(y_train, fh=FH0)

    # TODO: re-enable ``assert isinstance(f._y, pd.Series)``
    # why: fails for multivariates because they are DataFrames
    # solution: look for a general solution for Series and DataFrames
    assert len(forecaster._y) > 0
    assert forecaster.cutoff == y_train.index[-1]

    # the stored series carries the training index
    np.testing.assert_array_equal(forecaster._y.index, y_train.index)

    # update must extend _y and move the cutoff to the end of the new data
    forecaster.update(y_test, update_params=False)
    combined_index = np.append(y_train.index, y_test.index)
    np.testing.assert_array_equal(forecaster._y.index, combined_index)
    assert forecaster.cutoff == y_test.index[-1]
def test_predict_time_index_with_X(
    self, estimator_instance, n_columns, index_fh_comb, fh_int_oos
):
    """Check that predicted time index matches forecasting horizon.

    ``index_fh_comb`` unpacks to ``(index_type, fh_type, is_relative)``; the
    check returns early, doing nothing, when ``fh_type`` is "timedelta".
    """
    index_type, fh_type, is_relative = index_fh_comb
    if fh_type == "timedelta":
        return None
        # todo: ensure check_estimator works with pytest.skip like below
        # pytest.skip(
        #    "ForecastingHorizon with timedelta values "
        #     "is currently experimental and not supported everywhere"
        # )

    # only the exogenous part of the generated problem is used below
    _, X = make_forecasting_problem(index_type=index_type, make_X=True)

    y = _make_series(n_columns=n_columns, index_type=index_type)
    cutoff = y.index[len(y) // 2]
    fh = _make_fh(cutoff, fh_int_oos, fh_type, is_relative)
    y_train, _, X_train, X_test = temporal_train_test_split(y, X, fh=fh)

    # Some estimators may not support all time index types and fh types,
    # hence NotImplementedError is tolerated here.
    try:
        estimator_instance.fit(y_train, X_train, fh=fh)
        y_pred = estimator_instance.predict(X=X_test)
        _assert_correct_pred_time_index(y_pred.index, y_train.index[-1], fh)
    except NotImplementedError:
        pass
def test_dummy_regressor_mean_prediction_endogenous_only(
    fh, window_length, strategy, scitype
):
    """Check reduction with a mean DummyRegressor predicts the target mean.

    The DummyRegressor ignores the input feature data X, hence it can be used
    for testing reduction from forecasting to both tabular and time series
    regression. The DummyRegressor also supports the 'multioutput' strategy.
    """
    y = make_forecasting_problem()
    fh = check_fh(fh)
    y_train, _ = temporal_train_test_split(y, fh=fh)

    reducer = make_reduction(
        DummyRegressor(strategy="mean"),
        scitype=scitype,
        window_length=window_length,
        strategy=strategy,
    )
    reducer.fit(y_train, fh=fh)
    actual = reducer.predict()

    if strategy != "recursive":
        # For the non-recursive strategies, the data is split taking into
        # account the steps ahead we want to predict.
        n_skipped = window_length + max(fh) - 1
    else:
        # For the recursive strategy, the first-step ahead is always used as
        # the target vector in the regression problem during training,
        # regardless of the actual forecasting horizon.
        n_skipped = window_length

    # In the sliding-window transformation, the first values of the target
    # series make up the first window and are not used in the transformed
    # target vector, so the expected result is the mean of the remaining values.
    expected = np.mean(y_train[n_skipped:])
    np.testing.assert_array_almost_equal(actual, expected)
def test_oh_setting(Forecaster):
    """Check cutoff and _y.

    For every column count reported by ``_get_n_columns``, both attributes
    must be None after construction, reflect the training data after ``fit``,
    and reflect training plus test data after ``update``.
    """
    probe = _construct_instance(Forecaster)
    column_counts = _get_n_columns(probe.get_tag("scitype:y"))

    for n_columns in column_counts:
        forecaster = _construct_instance(Forecaster)
        y = _make_series(n_columns=n_columns)
        y_train, y_test = temporal_train_test_split(y, train_size=0.75)

        # nothing is stored on a freshly constructed estimator
        assert forecaster._y is None
        assert forecaster.cutoff is None

        # fit must populate _y and move the cutoff to the end of training data
        forecaster.fit(y_train, fh=FH0)

        # TODO: re-enable ``assert isinstance(f._y, pd.Series)``
        # why: fails for multivariates because they are DataFrames
        # solution: look for a general solution for Series and DataFrames
        assert len(forecaster._y) > 0
        assert forecaster.cutoff == y_train.index[-1]

        # the stored series carries the training index
        np.testing.assert_array_equal(forecaster._y.index, y_train.index)

        # update must extend _y and move the cutoff to the end of the new data
        forecaster.update(y_test, update_params=False)
        combined_index = np.append(y_train.index, y_test.index)
        np.testing.assert_array_equal(forecaster._y.index, combined_index)
        assert forecaster.cutoff == y_test.index[-1]
def test_y_test_index_input():
    """Check that ``y_test.index`` can be passed as an absolute horizon.

    Converted relative to the last training index, the horizon must equal
    ``1..len(y_test)``.
    """
    y = make_forecasting_problem()
    y_train, y_test = temporal_train_test_split(y, train_size=0.75)

    horizon = FH(y_test.index, relative=False)
    last_train_point = y_train.index[-1]

    np.testing.assert_array_equal(
        horizon.relative(last_train_point),
        np.arange(len(y_test)) + 1,
    )
def test_forecaster_with_initial_level():
    """Check ThetaForecaster with ``initial_level=0.1`` on log1p airline data.

    Predictions must be within 5% relative tolerance of the held-out values.
    """
    y = np.log1p(load_airline())
    y_train, y_test = temporal_train_test_split(y)
    horizon = np.arange(len(y_test)) + 1

    theta = ThetaForecaster(initial_level=0.1, sp=12)
    theta.fit(y_train)
    y_pred = theta.predict(fh=horizon)

    np.testing.assert_allclose(y_pred, y_test, rtol=0.05)
def test_update_predict_single(
    self, estimator_instance, n_columns, fh_int_oos, update_params
):
    """Check correct time index of update-predict.

    After ``update_predict_single`` the predicted index is validated against
    the last index of the data passed to the update.
    """
    y = _make_series(n_columns=n_columns)
    y_train, y_test = temporal_train_test_split(y)

    estimator_instance.fit(y_train, fh=fh_int_oos)
    y_pred = estimator_instance.update_predict_single(
        y_test, update_params=update_params
    )

    new_cutoff = y_test.index[-1]
    _assert_correct_pred_time_index(y_pred.index, new_cutoff, fh_int_oos)
def test_predictive_performance_on_airline():
    """Check ThetaForecaster accuracy on the log1p-transformed airline data.

    Predictions must be within 5% relative tolerance of the held-out values.
    """
    y = np.log1p(load_airline())
    y_train, y_test = temporal_train_test_split(y)
    horizon = np.arange(len(y_test)) + 1

    theta = ThetaForecaster(sp=12)
    theta.fit(y_train)
    y_pred = theta.predict(fh=horizon)

    # Performance on this particular dataset should be reasonably good.
    np.testing.assert_allclose(y_pred, y_test, rtol=0.05)
def test_update_predict_single(Forecaster, fh, update_params):
    """Check correct time index of update-predict.

    Runs once per column count reported by ``_get_n_columns`` for the
    forecaster's "scitype:y" tag.
    """
    probe = _construct_instance(Forecaster)
    column_counts = _get_n_columns(probe.get_tag("scitype:y"))

    for n_columns in column_counts:
        forecaster = _construct_instance(Forecaster)
        y = _make_series(n_columns=n_columns)
        y_train, y_test = temporal_train_test_split(y)

        forecaster.fit(y_train, fh=fh)
        y_pred = forecaster.update_predict_single(
            y_test, update_params=update_params
        )

        new_cutoff = y_test.index[-1]
        _assert_correct_pred_time_index(y_pred.index, new_cutoff, fh)
def test_fh(index_type, fh_type, is_relative, steps):
    """Check forecasting-horizon conversions against hand-built expectations.

    A horizon is built from ``steps`` around a cutoff taken from generated
    data, then its absolute, relative, indexer, in-sample and out-of-sample
    representations are compared with values derived directly from the data.

    Parameters
    ----------
    index_type : key of INDEX_TYPE_LOOKUP
        Index type of the generated series.
    fh_type : key of INDEX_TYPE_LOOKUP
        Index type of the generated forecasting horizon.
    is_relative : bool
        Whether the horizon is built as relative to the cutoff.
    steps : int or array-like of int
        Steps relative to the cutoff; a single int is wrapped into an array.
    """
    # generate data
    y = make_forecasting_problem(index_type=index_type)
    assert isinstance(y.index, INDEX_TYPE_LOOKUP.get(index_type))

    # split data
    y_train, _ = temporal_train_test_split(y, test_size=10)

    # choose cutoff point
    cutoff = y_train.index[-1]

    # generate fh
    fh = _make_fh(cutoff, steps, fh_type, is_relative)
    assert isinstance(fh.to_pandas(), INDEX_TYPE_LOOKUP.get(fh_type))

    # get expected outputs
    if isinstance(steps, int):
        steps = np.array([steps])
    # pd.Int64Index was deprecated in pandas 1.4 and removed in 2.0; an
    # int64-typed pd.Index is the equivalent construction on all versions.
    fh_relative = pd.Index(steps, dtype="int64").sort_values()
    fh_absolute = y.index[np.where(y.index == cutoff)[0] + steps].sort_values()
    fh_indexer = fh_relative - 1
    fh_oos = fh.to_pandas()[fh_relative > 0]
    is_oos = len(fh_oos) == len(fh)
    fh_ins = fh.to_pandas()[fh_relative <= 0]
    is_ins = len(fh_ins) == len(fh)

    # check outputs
    # check absolute representation
    _assert_index_equal(fh_absolute, fh.to_absolute(cutoff).to_pandas())
    assert not fh.to_absolute(cutoff).is_relative

    # check relative representation
    _assert_index_equal(fh_relative, fh.to_relative(cutoff).to_pandas())
    assert fh.to_relative(cutoff).is_relative

    # check index-like representation
    _assert_index_equal(fh_indexer, fh.to_indexer(cutoff))

    # check in-sample representation
    # we only compare the numpy array here because the expected solution is
    # formatted in a slightly different way than the generated solution
    np.testing.assert_array_equal(
        fh_ins.to_numpy(), fh.to_in_sample(cutoff).to_pandas()
    )
    assert fh.to_in_sample(cutoff).is_relative == is_relative
    assert fh.is_all_in_sample(cutoff) == is_ins

    # check out-of-sample representation
    np.testing.assert_array_equal(
        fh_oos.to_numpy(), fh.to_out_of_sample(cutoff).to_pandas()
    )
    assert fh.to_out_of_sample(cutoff).is_relative == is_relative
    assert fh.is_all_out_of_sample(cutoff) == is_oos
def test_update_predict_predicted_indices(
    Forecaster, fh, window_length, step_length, y
):
    """Check the predictions returned by ``update_predict`` on supplied data.

    A NotImplementedError raised by ``update_predict`` or by the check itself
    is tolerated.
    """
    y_train, y_test = temporal_train_test_split(y)
    splitter = SlidingWindowSplitter(
        fh, window_length=window_length, step_length=step_length
    )

    forecaster = _construct_instance(Forecaster)
    forecaster.fit(y_train, fh=fh)

    try:
        y_pred = forecaster.update_predict(y_test, cv=splitter)
        check_update_predict_y_pred(y_pred, y_test, fh, step_length)
    except NotImplementedError:
        pass
def test_pred_errors_against_y_test(fh):
    """Check held-out airline values lie strictly within prediction errors.

    For each error series from ``_compute_pred_errors(alpha=0.1)``, every
    selected test value must lie strictly between ``y_pred - error`` and
    ``y_pred + error``.
    """
    y = load_airline()
    y_train, y_test = temporal_train_test_split(y)

    theta = ThetaForecaster()
    theta.fit(y_train, fh)
    y_pred = theta.predict(return_pred_int=False)
    pred_errors = theta._compute_pred_errors(alpha=0.1)

    # a single Series is wrapped so the loop below works for both return shapes
    error_list = [pred_errors] if isinstance(pred_errors, pd.Series) else pred_errors

    # horizon steps are 1-based, positional indexing is 0-based
    y_true = y_test.iloc[check_fh(fh) - 1]

    for error in error_list:
        assert np.all(y_true > y_pred - error)
        assert np.all(y_true < y_pred + error)
def test_reductions_airline_data(forecaster, expected):
    """Test reduction forecasters on the airline dataset.

    Predictions made with linear estimators are compared with values
    calculated by Lovkush Agarwal on their local machine in Mar 2021.
    """
    y = load_airline()
    y_train, y_test = temporal_train_test_split(y, test_size=24)

    # the horizon is given in absolute terms, as the index of the test data
    horizon = ForecastingHorizon(y_test.index, is_relative=False)

    fitted = forecaster.fit(y_train, fh=horizon)
    actual = fitted.predict(horizon)

    np.testing.assert_almost_equal(actual, expected)
def calculate_smape(df_, regressor, forecast_horizon, window_length):
    """Fit a recursive reduced-regression forecaster and return its sMAPE.

    The target is the last column of ``df_`` after forward-filling missing
    values; the final 12 observations are held out for evaluation.

    Parameters
    ----------
    df_ : pd.DataFrame
        Input data; it is not modified.
    regressor : object
        Passed to ``select_regressor`` to obtain the regressor to reduce to.
    forecast_horizon : object
        Not used by this function; kept so existing callers keep working.
    window_length : int
        Window length passed to ``ReducedRegressionForecaster``.

    Returns
    -------
    The value returned by ``smape_loss(y_pred, y_test)``.
    """
    # ``ffill`` returns a new frame, so the caller's data stays untouched; it
    # also replaces ``fillna(method='ffill')``, which pandas has deprecated.
    df = df_.ffill()

    # the original index is dropped, so the target gets a fresh default index
    y = df.iloc[:, -1].reset_index(drop=True)
    y_train, y_test = temporal_train_test_split(y, test_size=12)

    # relative horizon 1..len(y_test)
    fh = np.arange(y_test.shape[0]) + 1

    regressor = select_regressor(regressor)
    forecaster = ReducedRegressionForecaster(
        regressor=regressor, window_length=window_length, strategy='recursive'
    )
    forecaster.fit(y_train, fh=fh)
    y_pred = forecaster.predict(fh)

    return smape_loss(y_pred, y_test)
def test_factory_method_direct():
    """Check ReducedForecaster matches DirectRegressionForecaster.

    Both are built around the same LinearRegression instance and must give
    equal predictions on the airline data.
    """
    y = load_airline()
    y_train, y_test = temporal_train_test_split(y, test_size=24)
    horizon = ForecastingHorizon(y_test.index, is_relative=False)

    regressor = LinearRegression()
    via_factory = ReducedForecaster(regressor, scitype="regressor", strategy="direct")
    via_class = DirectRegressionForecaster(regressor)

    actual = via_factory.fit(y_train, fh=horizon).predict(horizon)
    expected = via_class.fit(y_train, fh=horizon).predict(horizon)

    np.testing.assert_array_equal(actual, expected)
def test_split_by_fh(index_type, fh_type, is_relative, values):
    """Test temporal_train_test_split.

    Splits a 20-point series by a forecasting horizon built around the index
    at position 10; returns early, doing nothing, for "timedelta" horizons.
    """
    if fh_type == "timedelta":
        return None
        # todo: ensure check_estimator works with pytest.skip like below
        # pytest.skip(
        #    "ForecastingHorizon with timedelta values "
        #     "is currently experimental and not supported everywhere"
        # )

    y = _make_series(20, index_type=index_type)
    cutoff = y.index[10]
    horizon = _make_fh(cutoff, values, fh_type, is_relative)

    result = temporal_train_test_split(y, fh=horizon)
    _check_train_test_split_y(horizon, result)
def test_update_predict_predicted_indices(Forecaster, fh, window_length, step_length):
    """Check the predictions returned by ``update_predict`` on generated data.

    A NotImplementedError raised by ``update_predict`` or by the check itself
    is tolerated.
    """
    y = make_forecasting_problem(all_positive=True, index_type="datetime")
    y_train, y_test = temporal_train_test_split(y)
    splitter = SlidingWindowSplitter(
        fh, window_length=window_length, step_length=step_length
    )

    forecaster = _construct_instance(Forecaster)
    forecaster.fit(y_train, fh=fh)

    try:
        y_pred = forecaster.update_predict(y_test, cv=splitter)
        _check_update_predict_y_pred(y_pred, y_test, fh, step_length)
    except NotImplementedError:
        pass
def test_score(self, estimator_instance, n_columns, fh_int_oos):
    """Check score method.

    ``score`` must return exactly the symmetric mean absolute percentage
    error computed directly from the estimator's own predictions.
    """
    y = _make_series(n_columns=n_columns)
    y_train, y_test = temporal_train_test_split(y)

    estimator_instance.fit(y_train, fh=fh_int_oos)
    y_pred = estimator_instance.predict()

    # zero-based positions of the horizon steps within the test data
    positions = check_fh(fh_int_oos).to_indexer()
    y_true = y_test.iloc[positions]

    expected = mean_absolute_percentage_error(y_pred, y_true, symmetric=True)

    # compare expected score with actual score
    actual = estimator_instance.score(y_test.iloc[positions], fh=fh_int_oos)
    assert actual == expected
def graph_model_exp_smoothing():
    """Write the "Exponential Smoothing" model graph.

    Reads ``from_excel`` and ``fh``, which are resolved outside this function.
    When ``from_excel`` is truthy the train/test/prediction data comes from
    ``get_data_from_excel``; otherwise a model is trained on a slice of the
    "zurich" / "Zch_Stampfenbachstrasse" time series and used to predict.
    """
    if from_excel:
        y_train, y_test, y_pred = get_data_from_excel("ExpSmoothing.PM10")
    else:
        # rows -1100 up to (not including) -900 of the loaded series
        ts: pd.DataFrame = get_time_series(
            get_engine(), "zurich", "Zch_Stampfenbachstrasse"
        )[-1100:-900]
        ts.drop(columns=["date", "Zch_Stampfenbachstrasse.PM2.5"], inplace=True)

        imputed = impute_simple_imputer(ts, False)
        smoothed = moving_average(imputed, False)
        y, x = expsmoothing.transform_data(smoothed, False)

        y_train, y_test, x_train, x_test = temporal_train_test_split(
            y, x, test_size=fh
        )
        model = expsmoothing.train_model_expSmooting(y_train, x_train, False)

        # horizon 1..fh as evenly spaced values (np.linspace yields floats)
        horizon = np.linspace(1, fh, fh)
        y_pred = model.predict(X=x_test, fh=horizon)

    write_model_graph(y_train, y_test, y_pred, "Exponential Smoothing")
def test_multioutput_direct_equivalence_tabular_linear_regression(fh):
    """Check direct and multioutput reductions agree for linear regression.

    Multioutput and direct strategies with a linear regression regressor
    should produce the same predictions.
    """
    y, X = make_forecasting_problem(make_X=True)
    y_train, _, X_train, X_test = temporal_train_test_split(y, X, fh=fh)

    estimator = LinearRegression()
    direct = make_reduction(estimator, strategy="direct")
    multioutput = make_reduction(estimator, strategy="multioutput")

    pred_direct = direct.fit(y_train, X_train, fh=fh).predict(fh, X_test)
    pred_multioutput = multioutput.fit(y_train, X_train, fh=fh).predict(fh, X_test)

    np.testing.assert_array_almost_equal(
        pred_direct.to_numpy(), pred_multioutput.to_numpy()
    )
def test_factory_method_ts_direct():
    """Check ReducedForecaster matches DirectTimeSeriesRegressionForecaster.

    Both are built around the same tabularize + LinearRegression pipeline and
    must give equal predictions on the airline data.
    """
    y = load_airline()
    y_train, y_test = temporal_train_test_split(y, test_size=24)
    horizon = ForecastingHorizon(y_test.index, is_relative=False)

    ts_regressor = Pipeline(
        [("tabularize", Tabularizer()), ("model", LinearRegression())]
    )
    via_factory = ReducedForecaster(
        ts_regressor, scitype="ts_regressor", strategy="direct"
    )
    via_class = DirectTimeSeriesRegressionForecaster(ts_regressor)

    actual = via_factory.fit(y_train, fh=horizon).predict(horizon)
    expected = via_class.fit(y_train, fh=horizon).predict(horizon)

    np.testing.assert_array_equal(actual, expected)
def _check_update_predict_predicted_index(
    Forecaster, fh, window_length, step_length, update_params
):
    """Check the index of the predictions returned by ``update_predict``.

    The predicted index must equal the index computed by
    ``_get_expected_index_for_update_predict``.
    """
    y = make_forecasting_problem(all_positive=True, index_type="datetime")
    y_train, y_test = temporal_train_test_split(y)

    splitter = SlidingWindowSplitter(
        fh,
        window_length=window_length,
        step_length=step_length,
        start_with_window=False,
    )

    forecaster = _construct_instance(Forecaster)
    forecaster.fit(y_train, fh=fh)
    y_pred = forecaster.update_predict(
        y_test, cv=splitter, update_params=update_params
    )
    assert isinstance(y_pred, (pd.Series, pd.DataFrame))

    expected_index = _get_expected_index_for_update_predict(y_test, fh, step_length)
    np.testing.assert_array_equal(y_pred.index, expected_index)
def test_VAR_against_statsmodels():
    """Compares Sktime's and Statsmodel's VAR.

    Uses ``df``, which is resolved outside this function, as the data.
    """
    train, _ = temporal_train_test_split(df)
    fh = ForecastingHorizon([1, 3, 4, 5, 7, 9])

    # sktime side
    sktime_model = VAR()
    sktime_model.fit(train)
    y_pred = sktime_model.predict(fh=fh)

    # statsmodels side: forecast every step up to the last horizon value ...
    stats_fit = _VAR(train).fit()
    fh_int = fh.to_relative(train.index[-1])
    n_lags = stats_fit.k_ar
    all_steps = stats_fit.forecast(train.values[-n_lags:], steps=fh_int[-1])

    # ... then keep only the requested steps (horizon is 1-based)
    selected = [all_steps[step - 1] for step in fh_int]

    assert_allclose(y_pred, selected)
def prepare_model(self, timeseries: pd.DataFrame, output: bool = True) -> AutoARIMA:
    """Train a model on ``timeseries``, evaluate it, and return it.

    The data is transformed with ``transform_data`` and split with 10% held
    out. The model trained by ``train_model_autoarima`` is stored on
    ``self.model`` and scored with ``eval_model_mape``.

    Parameters
    ----------
    timeseries : pd.DataFrame
        Raw input passed to ``transform_data``.
    output : bool, default=True
        Forwarded to the helpers; when truthy, progress and the score are
        logged here as well.

    Returns
    -------
    AutoARIMA
        The object stored on ``self.model``.
    """
    if output:
        logger.info("Running script...")

    y, x = transform_data(timeseries, output)
    y_train, y_test, x_train, x_test = temporal_train_test_split(
        y, x, test_size=0.1
    )

    self.model = train_model_autoarima(y_train, x_train, output)

    # Drop the first target value and the last exogenous row before scoring.
    # NOTE(review): presumably this offsets targets against features by one
    # step — confirm against transform_data.
    shifted_y_test = pd.Series(data=np.delete(y_test, 0))
    trimmed_x_test = pd.DataFrame(data=x_test[:-1])
    score = eval_model_mape(self.model, shifted_y_test, trimmed_x_test, output)

    if output:
        logger.info(f"Score of model: {score:.04f}")
        # NOTE(review): timer_script is not defined in this method; presumably
        # it is a module-level name — confirm it exists.
        logger.info(f"Completed script in {timer_script}")

    return self.model
def test_skip_inverse_transform():
    """Test transformers with skip-inverse-transform tag in pipeline.

    A pipeline of HampelFilter, Imputer and NaiveForecaster is fitted on
    airline data containing a NaN and an outlier; the prediction must be a
    ``pd.Series``.
    """
    y = load_airline()

    # add nan and outlier
    y.iloc[3] = np.nan
    y.iloc[4] = y.iloc[4] * 20

    y_train, y_test = temporal_train_test_split(y)
    horizon = np.arange(len(y_test)) + 1

    pipeline = TransformedTargetForecaster(
        [
            ("t1", HampelFilter(window_length=12)),
            ("t2", Imputer(method="mean")),
            ("forecaster", NaiveForecaster()),
        ]
    )
    pipeline.fit(y_train, fh=horizon)
    y_pred = pipeline.predict()

    assert isinstance(y_pred, pd.Series)