def _make_fh(cutoff, steps, fh_type, is_relative):
    """Build a ``ForecastingHorizon`` fixture for the tests.

    Parameters
    ----------
    cutoff : index element
        Reference point; only used when an absolute horizon is requested.
        Its ``freq`` attribute is read for the "datetime" and "period" types.
    steps : int, np.integer, pd.Timedelta or array-like
        Step(s) relative to ``cutoff``. Scalars are wrapped into a
        one-element container before use.
    fh_type : str
        Key into ``INDEX_TYPE_LOOKUP`` selecting the index class.
    is_relative : bool
        Whether a relative or an absolute horizon is built.

    Returns
    -------
    ForecastingHorizon
    """
    from sktime.forecasting.tests._config import INDEX_TYPE_LOOKUP

    index_cls = INDEX_TYPE_LOOKUP[fh_type]

    # Wrap scalar steps so the index class always receives a sequence.
    if isinstance(steps, (int, np.integer)):
        steps = np.array([steps], dtype=int)
    elif isinstance(steps, pd.Timedelta):
        steps = [steps]

    if is_relative:
        return ForecastingHorizon(index_cls(steps), is_relative=is_relative)

    # Absolute horizon: shift the cutoff by the requested steps.
    if fh_type == "datetime":
        steps *= cutoff.freq
    index_kwargs = {"freq": cutoff.freq} if fh_type == "period" else {}
    absolute_values = cutoff + steps
    return ForecastingHorizon(index_cls(absolute_values, **index_kwargs), is_relative)
def _inverse_transform(self, X, y=None):
    """Logic used by `inverse_transform` to reverse transformation on `X`.

    If every index value of `X` is already part of the series stored in
    `self._Z`, the stored values are returned directly. Otherwise the lags in
    `self._lags` are processed in reverse order, one at a time, using values
    looked up from the stored series as starting points.

    Parameters
    ----------
    X : pd.Series or pd.DataFrame
        Data to be inverse transformed
    y : ignored argument for interface compatibility
        Additional data, e.g., labels for transformation

    Returns
    -------
    Xt : pd.Series or pd.DataFrame, same type as X
        inverse transformed version of X
    """
    Z = X
    is_df = isinstance(Z, pd.DataFrame)
    is_contained_by_fit_z, pad_z_inv = self._check_inverse_transform_index(Z)

    # If `Z` is entirely contained in fitted `_Z` we can just return
    # the values from the time series stored in `fit` as a shortcut
    if is_contained_by_fit_z:
        Z_inv = self._Z.loc[Z.index, :] if is_df else self._Z.loc[Z.index]

    else:
        # Work on a copy so the caller's data is not modified by `update` below
        Z_inv = Z.copy()
        # Walk through the lags from last to first
        for i, lag_info in enumerate(
            zip(self._lags[::-1], self._prior_cum_lags[::-1])
        ):
            lag, prior_cum_lag = lag_info
            # Lags that come after the current one in the reversed order
            _lags = self._lags[::-1][i + 1:]
            # presumably the stored series differenced by the remaining lags
            # - TODO confirm against `_diff_transform`
            _transformed = _diff_transform(self._Z, _lags)

            # Determine index values for initial values needed to reverse
            # the differencing for the specified lag
            if pad_z_inv:
                cutoff = Z_inv.index[0]
            else:
                cutoff = Z_inv.index[prior_cum_lag + lag]
            # Relative steps -1, -2, ..., -lag, made absolute w.r.t. `cutoff`
            fh = ForecastingHorizon(np.arange(-1, -(lag + 1), -1))
            index = fh.to_absolute(cutoff).to_pandas()

            if is_df:
                prior_n_timepoint_values = _transformed.loc[index, :]
            else:
                prior_n_timepoint_values = _transformed.loc[index]
            if pad_z_inv:
                # Put the looked-up values in front of the current data
                Z_inv = pd.concat([prior_n_timepoint_values, Z_inv])
            else:
                # Overwrite matching index entries in place
                Z_inv.update(prior_n_timepoint_values)

            # presumably undoes the differencing for this lag
            # - TODO confirm against `_inverse_diff`
            Z_inv = _inverse_diff(Z_inv, lag)

    # Restrict the result to the index of the input when padding was requested
    if pad_z_inv:
        Z_inv = Z_inv.loc[Z.index, :] if is_df else Z_inv.loc[Z.index]

    Xt = Z_inv

    return Xt
def test_to_absolute_freq(freqstr):
    """Check that the frequency string survives conversion to absolute."""
    dates = pd.date_range("2021-10-06", freq=freqstr, periods=3)
    train = pd.Series(1, index=dates)
    cutoff = train.index[-1]

    abs_fh = ForecastingHorizon([1, 2, 3]).to_absolute(cutoff)

    assert abs_fh._values.freqstr == freqstr
def test_to_absolute_int(idx: int, freq: str):
    """Check `to_absolute_int` against the relative horizon shifted by `idx`."""
    time_index = pd.date_range("2021-10-06", freq=freq, periods=5)
    train = pd.Series(1, index=time_index)
    fh = ForecastingHorizon([1, 2, 3])

    # The series start is the origin and position `idx` is the cutoff.
    result = fh.to_absolute_int(start=train.index[0], cutoff=train.index[idx])
    expected = fh + idx

    assert_array_equal(expected, result)
def _inverse_transform(self, Z, X=None):
    """Logic used by `inverse_transform` to reverse transformation on `Z`.

    If every index value of `Z` is already part of the series stored in
    `self._Z`, the stored values are returned directly. Otherwise the lags in
    `self._lags` are processed in reverse order, one at a time, using values
    looked up from the stored series as starting points.

    Parameters
    ----------
    Z : pd.Series or pd.DataFrame
        A time series to reverse the transformation on.
    X : optional (default=None)
        Not used in this method.

    Returns
    -------
    Z_inv : pd.Series or pd.DataFrame
        The reconstructed timeseries after the transformation has been
        reversed.
    """
    is_df = isinstance(Z, pd.DataFrame)
    is_contained_by_fit_z, pad_z_inv = self._check_inverse_transform_index(Z)

    # If `Z` is entirely contained in fitted `_Z` we can just return
    # the values from the time series stored in `fit` as a shortcut
    if is_contained_by_fit_z:
        Z_inv = self._Z.loc[Z.index, :] if is_df else self._Z.loc[Z.index]

    else:
        # Work on a copy so the caller's data is not modified by `update` below
        Z_inv = Z.copy()
        # Walk through the lags from last to first
        for i, lag_info in enumerate(
            zip(self._lags[::-1], self._prior_cum_lags[::-1])
        ):
            lag, prior_cum_lag = lag_info
            # Lags that come after the current one in the reversed order
            _lags = self._lags[::-1][i + 1:]
            # presumably the stored series differenced by the remaining lags
            # - TODO confirm against `_diff_transform`
            _transformed = _diff_transform(self._Z, _lags)

            # Determine index values for initial values needed to reverse
            # the differencing for the specified lag
            if pad_z_inv:
                cutoff = Z_inv.index[0]
            else:
                cutoff = Z_inv.index[prior_cum_lag + lag]
            # Relative steps -1, -2, ..., -lag, made absolute w.r.t. `cutoff`
            fh = ForecastingHorizon(np.arange(-1, -(lag + 1), -1))
            index = fh.to_absolute(cutoff).to_pandas()

            if is_df:
                prior_n_timepoint_values = _transformed.loc[index, :]
            else:
                prior_n_timepoint_values = _transformed.loc[index]
            if pad_z_inv:
                # Put the looked-up values in front of the current data
                Z_inv = pd.concat([prior_n_timepoint_values, Z_inv])
            else:
                # Overwrite matching index entries in place
                Z_inv.update(prior_n_timepoint_values)

            # presumably undoes the differencing for this lag
            # - TODO confirm against `_inverse_diff`
            Z_inv = _inverse_diff(Z_inv, lag)

    # Restrict the result to the index of the input when padding was requested
    if pad_z_inv:
        Z_inv = Z_inv.loc[Z.index, :] if is_df else Z_inv.loc[Z.index]

    return Z_inv
def test_to_relative(freq: str):
    """Check that an absolute datetime horizon converts to steps 0..4.

    Regression test for
    https://github.com/alan-turing-institute/sktime/issues/1935#issue-1114814142
    """
    # NOTE(review): the `freq` argument is overwritten here, so every call
    # runs with "2H" regardless of what is passed in - confirm this is intended.
    freq = "2H"
    timepoints = pd.date_range(start="2021-01-01", freq=freq, periods=5)

    absolute = ForecastingHorizon(timepoints, is_relative=False)
    relative = absolute.to_relative(cutoff=timepoints.min())

    assert_array_equal(relative, np.arange(5))
def test_relative_to_relative(freqstr):
    """Check that relative -> absolute -> relative gives back the horizon."""
    dates = pd.date_range("2021-10-06", freq=freqstr, periods=3)
    train = pd.Series(1, index=dates)
    fh = ForecastingHorizon([1, 2, 3])

    # Round trip through the absolute representation, using the last
    # training time point as cutoff in both directions.
    as_absolute = fh.to_absolute(train.index[-1])
    round_trip = as_absolute.to_relative(train.index[-1])

    assert_array_equal(fh, round_trip)
def test_estimator_fh(freqstr):
    """Check the prediction index of AutoETS fitted on an anchored frequency."""
    n_timepoints = 104
    max_step = 26

    values = np.random.uniform(low=2000, high=7000, size=(n_timepoints,))
    dates = pd.date_range("2019-01-02", freq=freqstr, periods=n_timepoints)
    train = pd.Series(values, index=dates)

    forecaster = AutoETS(auto=True, sp=52, n_jobs=-1, restrict=True)
    forecaster.fit(train)
    pred = forecaster.predict(np.arange(1, max_step + 1))

    # The predictions must be indexed by the same steps made absolute with
    # respect to the last training time point.
    relative_fh = ForecastingHorizon(np.arange(1, max_step + 1))
    expected_fh = relative_fh.to_absolute(train.index[-1])

    assert_array_equal(pred.index.to_numpy(), expected_fh.to_numpy())
def test_absolute_to_absolute_with_integer_horizon(freqstr):
    """Check that absolute -> relative -> absolute keeps values and frequency."""
    dates = pd.date_range("2021-10-06", freq=freqstr, periods=3)
    train = pd.Series(1, index=dates)

    abs_fh = ForecastingHorizon([1, 2, 3]).to_absolute(train.index[-1])

    # Round trip through the relative representation with the same cutoff.
    rel_fh = abs_fh.to_relative(train.index[-1])
    round_trip = rel_fh.to_absolute(train.index[-1])

    assert_array_equal(abs_fh, round_trip)
    assert round_trip._values.freqstr == freqstr
def test_relative_to_relative_with_timedelta_horizon(freqstr):
    """Check the relative -> absolute -> relative trip for timedelta horizons."""
    dates = pd.date_range("2021-10-06", freq=freqstr, periods=3)
    train = pd.Series(1, index=dates)

    # Build a three-step timedelta horizon whose first step is one interval
    # of the given frequency.
    count, unit = _get_intervals_count_and_unit(freq=freqstr)
    first_step = pd.to_timedelta(count, unit=unit)
    steps = pd.timedelta_range(first_step, freq=freqstr, periods=3)
    fh = ForecastingHorizon(steps)

    as_absolute = fh.to_absolute(train.index[-1])
    round_trip = as_absolute.to_relative(train.index[-1])

    assert_array_equal(round_trip, np.arange(1, 4))
def _get_end(y: ACCEPTED_Y_TYPES, fh: ForecastingHorizon) -> int:
    """Compute the end of the last training window for a forecasting horizon.

    `fh` is assumed to be ordered and already validated by the caller.

    Parameters
    ----------
    y : pd.Series, pd.DataFrame, np.ndarray, or pd.Index
        coerced and checked version of input y
    fh : ForecastingHorizon
        forecasting horizon; its last element is used as the largest step

    Returns
    -------
    end : int
        end of the training window
    """
    n_timepoints = y.shape[0]

    # A purely in-sample horizon never reaches past the data, so the last
    # split point is the end of the training data.
    if fh.is_all_in_sample():
        return n_timepoints + 1

    # Otherwise leave enough room for the last horizon step inside the data.
    return n_timepoints - fh[-1] + 1
def _get_end(y_index: pd.Index, fh: ForecastingHorizon) -> int:
    """Compute the end of the last training window for a forecasting horizon.

    For a time series `y` with index `y_index`, `y.iloc[end]` or
    `y.loc[y_index[end]]` gives the last element of the training window.

    `fh` is assumed to be ordered and already validated by the caller.

    Parameters
    ----------
    y_index : pd.Index
        Index of time series
    fh : ForecastingHorizon
        forecasting horizon with integer or timedelta values

    Returns
    -------
    end : int
        0-indexed integer end of the training window
    """
    n_timepoints = y_index.shape[0]
    assert isinstance(y_index, pd.Index)

    fh_is_int = array_is_int(fh)

    # A purely in-sample horizon needs no room after the training window;
    # otherwise the last horizon step must still fall inside the data.
    if fh.is_all_in_sample():
        fh_offset = 0 if fh_is_int else pd.Timedelta(0)
    else:
        fh_offset = fh[-1]

    if fh_is_int:
        return n_timepoints - fh_offset - 1
    # Non-integer horizon: look up the position of the shifted last time point.
    return y_index.get_loc(y_index[-1] - fh_offset)
def _impute_with_forecaster(forecaster, Z):
    """Use a given forecaster for imputation by in-sample predictions.

    Each series (or each column of a frame) that has missing values is
    forward- and backward-filled, the forecaster is fitted on the filled
    series, and the missing positions are overwritten with its predictions.
    `Z` is modified in place and also returned.

    Parameters
    ----------
    forecaster : Forecaster
        Forecaster to use for imputation. It is re-fitted once per series
        that contains missing values.
    Z : pd.Series or pd.DataFrame
        Series to impute.

    Returns
    -------
    zt : pd.Series or pd.DataFrame
        Series with imputed values (the same object as `Z`).

    Raises
    ------
    TypeError
        If `Z` is neither a pd.Series nor a pd.DataFrame.
    """

    def _predict_missing(z):
        """Return (index of NaNs, predictions) for `z`, or None if no NaNs."""
        if not _has_missing_values(z):
            return None
        # define fh based on index of missing values
        na_index = z.index[z.isna()]
        fh = ForecastingHorizon(values=na_index, is_relative=False)
        # fill NaN before fitting with ffill and backfill (heuristic);
        # `.ffill().bfill()` replaces the deprecated `fillna(method=...)`
        forecaster.fit(y=z.ffill().bfill(), fh=fh)
        return na_index, forecaster.predict()

    if isinstance(Z, pd.Series):
        imputed = _predict_missing(Z)
        if imputed is not None:
            na_index, y_pred = imputed
            Z.loc[na_index] = y_pred
    elif isinstance(Z, pd.DataFrame):
        for column in Z:
            imputed = _predict_missing(Z[column])
            if imputed is not None:
                na_index, y_pred = imputed
                # Write through the frame itself: assigning into `Z[column]`
                # may only change a temporary copy and leave `Z` untouched.
                Z.loc[na_index, column] = y_pred
    else:
        raise TypeError(
            "`Z` must be a pd.Series or pd.DataFrame, "
            f"but found: {type(Z).__name__}"
        )
    return Z
def _transform(self, X, y=None):
    """Compute the Theta-lines of `X`.

    Private `_transform` containing the core logic, called from `transform`.

    Parameters
    ----------
    X : pd.Series or pd.DataFrame
        Data to be transformed
    y : ignored argument for interface compatibility
        Additional data, e.g., labels for transformation

    Returns
    -------
    theta_lines : pd.Series or pd.DataFrame
        pd.Series with a single Theta-line if `self.theta` is a float or int,
        otherwise a pd.DataFrame of shape [len(X), len(self.theta)] with one
        column per theta value.
    """
    series = X
    thetas = _check_theta(self.theta)

    # Fit a polynomial trend and evaluate it on the series' own index.
    trend_forecaster = PolynomialTrendForecaster()
    trend_forecaster.fit(series)
    in_sample_fh = ForecastingHorizon(series.index, is_relative=False)
    trend = trend_forecaster.predict(in_sample_fh)

    # One column per theta coefficient.
    theta_lines = np.zeros((series.shape[0], len(thetas)))
    for col, coefficient in enumerate(thetas):
        theta_lines[:, col] = _theta_transform(series, trend, coefficient)

    if not isinstance(self.theta, (float, int)):
        return pd.DataFrame(theta_lines, columns=self.theta, index=series.index)
    return pd.Series(theta_lines.flatten(), index=series.index)
def transform(self, Z, X=None):
    """Compute the Theta-lines of `Z`.

    Parameters
    ----------
    Z : pd.Series
        Series to transform; checked to be univariate.
    X : pd.DataFrame, optional (default=None)
        Exogenous data; not used in this method.

    Returns
    -------
    theta_lines : pd.Series or pd.DataFrame
        pd.Series with a single Theta-line if `self.theta` is a float or int,
        otherwise a pd.DataFrame of shape [len(Z), len(self.theta)] with one
        column per theta value.
    """
    self.check_is_fitted()
    series = check_series(Z, enforce_univariate=True)
    thetas = _check_theta(self.theta)

    # Fit a polynomial trend and evaluate it on the series' own index.
    trend_forecaster = PolynomialTrendForecaster()
    trend_forecaster.fit(series)
    in_sample_fh = ForecastingHorizon(series.index, is_relative=False)
    trend = trend_forecaster.predict(in_sample_fh)

    # One column per theta coefficient.
    theta_lines = np.zeros((series.shape[0], len(thetas)))
    for col, coefficient in enumerate(thetas):
        theta_lines[:, col] = _theta_transform(series, trend, coefficient)

    if not isinstance(self.theta, (float, int)):
        return pd.DataFrame(theta_lines, columns=self.theta, index=series.index)
    return pd.Series(theta_lines.flatten(), index=series.index)
def check_fh(fh, enforce_relative=False):
    """Validate forecasting horizon.

    Parameters
    ----------
    fh : int, list, np.array, pd.Index or ForecastingHorizon
        Forecasting horizon specifying the time points to predict. Anything
        that is not already a ForecastingHorizon is wrapped as a relative one.
    enforce_relative : bool, optional (default=False)
        If True, checks if fh is relative.

    Returns
    -------
    fh : ForecastingHorizon
        Validated forecasting horizon.

    Raises
    ------
    ValueError
        If `fh` is empty, or if `enforce_relative` is True and `fh` is
        absolute.
    """
    from sktime.forecasting.base import ForecastingHorizon

    already_fh = isinstance(fh, ForecastingHorizon)
    horizon = fh if already_fh else ForecastingHorizon(fh, is_relative=True)

    # Emptiness is checked here rather than in the ForecastingHorizon
    # constructor: empty horizons are valid in some cases, but users should
    # not create forecasting horizons with no values.
    if len(horizon) == 0:
        raise ValueError(f"`fh` must not be empty, but found: {horizon}")

    if enforce_relative and not horizon.is_relative:
        raise ValueError("`fh` must be relative, but found absolute `fh`")

    return horizon
def test_VAR_against_statsmodels():
    """Compare the sktime VAR forecasts with those of the statsmodels VAR."""
    train, _ = temporal_train_test_split(df)
    fh = ForecastingHorizon([1, 3, 4, 5, 7, 9])

    sktime_model = VAR()
    sktime_model.fit(train)
    y_pred = sktime_model.predict(fh=fh)

    stats_fit = _VAR(train).fit()
    fh_int = fh.to_relative(train.index[-1])
    n_lags = stats_fit.k_ar

    # statsmodels forecasts every step up to the largest one; keep only the
    # steps contained in the horizon (step k sits at position k - 1).
    all_steps = stats_fit.forecast(train.values[-n_lags:], steps=fh_int[-1])
    y_pred_stats = [all_steps[step - 1] for step in fh_int]

    assert_allclose(y_pred, y_pred_stats)
def test_window_splitter_in_sample_fh_greater_than_window_length(CV):
    """Check the first split for an in-sample fh reaching before the window."""
    y = np.arange(10)
    cv = CV(ForecastingHorizon([-5, -3]), 3)

    train_windows, test_windows, _, _ = _check_cv(cv, y)

    np.testing.assert_array_equal(test_windows[0], np.array([0, 2]))
    np.testing.assert_array_equal(train_windows[0], np.array([3, 4, 5]))
def _transform(self, y, X=None):
    """Apply the sliding-window transform with a one-step-ahead horizon.

    For the recursive strategy the horizon of the sliding-window transform is
    always a single step ahead, regardless of the horizon used during
    prediction.
    """
    one_step_ahead = ForecastingHorizon([1])
    return _sliding_window_transform(
        y,
        self.window_length_,
        one_step_ahead,
        X,
        scitype=self._estimator_scitype,
    )
def test_window_splitter_in_sample_fh_smaller_than_window_length(CV):
    """Check the first split for an in-sample fh lying inside the window."""
    y = np.arange(10)
    cv = CV(ForecastingHorizon([-2, 0]), 3)

    train_windows, test_windows, _, _ = _check_cv(cv, y)

    np.testing.assert_array_equal(test_windows[0], np.array([0, 2]))
    np.testing.assert_array_equal(train_windows[0], np.array([0, 1, 2]))
def test_strategy_mean_seasonal_additional_combinations(n, window_length, sp):
    """Check the seasonal "mean" strategy on perfectly cyclic data.

    Uses a time series of n * window_length hourly values with a
    1:n-1 train/test split, for different combinations of the window length
    and the seasonal periodicity `sp`.
    """
    # given <window_length> hours of data with a seasonal periodicity of <sp> hours
    n_timepoints = n * window_length
    # Lower-case "h": the upper-case alias is deprecated in recent pandas.
    freq = pd.Timedelta("1h")
    one_cycle = [float(i) for i in range(1, sp + 1)]
    data = pd.Series(
        # `closed="left"` was dropped from this call: pandas 2.0 removed the
        # argument, and with only a start and `periods` it had no effect.
        index=pd.date_range("2021-06-01 00:00", periods=n_timepoints, freq=freq),
        data=(one_cycle * n_timepoints)[:n_timepoints],
    )

    # Split into train and test data
    train_data = data[:window_length]
    test_data = data[window_length:]

    # Forecast data does not retain the original frequency
    test_data.index.freq = None

    # For example, for n=2, periods=4 and sp=3:

    # print(train_data)
    # 2021-06-01 00:00:00    1.0
    # 2021-06-01 01:00:00    2.0
    # 2021-06-01 02:00:00    3.0
    # 2021-06-01 03:00:00    1.0
    # Freq: H, dtype: int64

    # print(test_data)
    # 2021-06-01 04:00:00    2.0  # (value of 3 hours earlier)
    # 2021-06-01 05:00:00    3.0  # (value of 3 hours earlier)
    # 2021-06-01 06:00:00    1.0  # (mean value of 3 and 6 hours earlier)
    # 2021-06-01 07:00:00    2.0  # (value of 6 hours earlier)
    # dtype: float64

    # let's forecast the next <2 x period> hours with a periodicity of <sp> hours
    fh = ForecastingHorizon(test_data.index, is_relative=False)
    model = NaiveForecaster(strategy="mean", sp=sp)
    model.fit(train_data)
    forecast_data = model.predict(fh)

    if sp >= window_length:
        # We expect a few forecasts yield NaN values
        for i in range(1 + len(test_data) // sp):
            test_data[i * sp:i * sp + sp - window_length] = np.nan
    # For sp < window_length we expect a perfect forecast given our perfectly
    # cyclic data, i.e. `test_data` is compared unchanged.
    pd.testing.assert_series_equal(forecast_data, test_data)
def test_auto_arima():
    """Check the prediction index of AutoARIMA with exogenous data.

    Regression test for bug 805, see
    https://github.com/alan-turing-institute/sktime/issues/805#issuecomment-891848228.
    """
    # (frequency of the time index, first expected prediction time point)
    scenarios = (
        ("1D", "January 6, 2021"),
        ("2D", "January 11, 2021"),
    )

    for freq, first_predicted in scenarios:
        time_index = pd.date_range("January 1, 2021", periods=8, freq=freq)
        X = pd.DataFrame(
            np.random.randint(0, 4, 24).reshape(8, 3),
            columns=["First", "Second", "Third"],
            index=time_index,
        )
        y = pd.Series([1, 3, 2, 4, 5, 2, 3, 1], index=time_index)

        # Fit on the first five time points, predict the remaining three.
        fh = ForecastingHorizon(X.index[5:], is_relative=False)
        forecaster = AutoARIMA(start_p=2, start_q=2, max_p=5, max_q=5)
        fitted = forecaster.fit(X=X[:5], y=y[:5])
        y_pred = fitted.predict(fh=fh, X=X[5:])

        expected_index = pd.date_range(first_predicted, periods=3, freq=freq)
        pd.testing.assert_index_equal(y_pred.index, expected_index)
def _check_inverse_transform_index(self, Z):
    """Check fitted series contains indices needed in inverse_transform.

    Parameters
    ----------
    Z : pd.Series or pd.DataFrame
        Series whose index is compared with the index of `self._Z`.

    Returns
    -------
    is_contained_by_fitted_z : bool
        True if every index value of `Z` is also in the index of `self._Z`.
    pad_z_inv : bool
        `self.drop_na`, or, if that is falsy, whether `Z` starts after the
        last index value of `self._Z`.

    Raises
    ------
    ValueError
        If `Z` starts before the first index value of `self._Z`, or if `Z` is
        not contained in `self._Z` and some of the prior index values needed
        for the reconstruction are missing from `self._Z`.
    """
    fitted_index = self._Z.index
    first_idx = Z.index.min()
    orig_first_idx, orig_last_idx = fitted_index.min(), fitted_index.max()

    if first_idx < orig_first_idx:
        raise ValueError(
            "Some indices of `Z` are prior to timeseries used in `fit`. "
            "Reconstruction via `inverse_transform` is not possible."
        )

    is_contained_by_fitted_z = Z.index.difference(fitted_index).shape[0] == 0
    # Only a series that is not contained in the fitted one can be "future".
    is_future = not is_contained_by_fitted_z and bool(first_idx > orig_last_idx)

    pad_z_inv = self.drop_na or is_future

    # Index values of the time points preceding the cutoff that are needed
    # as starting values: relative steps -1, ..., -(total cumulative lag).
    n_prior = self._cumulative_lags[-1]
    cutoff = Z.index[0] if pad_z_inv else Z.index[n_prior]
    fh = ForecastingHorizon(np.arange(-1, -(n_prior + 1), -1))
    required_index = fh.to_absolute(cutoff).to_pandas()
    missing_index = required_index.difference(fitted_index)

    if missing_index.shape[0] != 0 and not is_contained_by_fitted_z:
        raise ValueError(
            f"Inverse transform requires indices {required_index} "
            "to have been stored in `fit()`, "
            f"but the indices {missing_index} were not found."
        )

    return is_contained_by_fitted_z, pad_z_inv
def test_cutoff_window_splitter(y, cutoffs, fh, window_length):
    """Check CutoffSplitter, including the error for unsupported type mixes."""
    cv = CutoffSplitter(cutoffs, fh=fh, window_length=window_length)

    types_supported = _cutoffs_fh_window_length_types_are_supported(
        cutoffs=cutoffs,
        fh=ForecastingHorizon(fh),
        window_length=window_length,
    )

    if not types_supported:
        with pytest.raises(TypeError, match="Unsupported combination of types"):
            _check_cv(cv, y)
        return

    _, _, found_cutoffs, _ = _check_cv(cv, y)
    np.testing.assert_array_equal(found_cutoffs, cv.get_cutoffs(y))
def test_reductions_airline_data(forecaster, expected):
    """Check reduction forecasters on the airline dataset.

    Predictions made with linear estimators are compared with values
    calculated by Lovkush Agarwal on their local machine in Mar 2021.
    """
    y_train, y_test = temporal_train_test_split(load_airline(), test_size=24)
    fh = ForecastingHorizon(y_test.index, is_relative=False)

    fitted = forecaster.fit(y_train, fh=fh)
    actual = fitted.predict(fh)

    np.testing.assert_almost_equal(actual, expected)
def predict(Xpred=None, data_pars=None, compute_pars=None, out_pars=None, **kw):
    """Predict with the module-level `model` on the index of `Xpred`.

    Parameters
    ----------
    Xpred : object with an `index` attribute, optional (default=None)
        Data whose index defines the absolute forecasting horizon. If None,
        it is loaded with `get_dataset(data_pars, task_type="predict")`.
    data_pars : dict, optional (default=None)
        Dataset parameters. Only used when `Xpred` is None; in that case the
        key "train" is set to False on the passed dict before loading.
    compute_pars : dict, optional (default=None)
        Not used in this function.
    out_pars : dict, optional (default=None)
        Not used in this function.
    **kw
        Not used in this function.

    Returns
    -------
    ypred : result of `model.model.predict` for the horizon
    ypred_proba : None
        No probabilities are computed.
    """
    # Create a fresh dict per call: a `{}` default would be shared between
    # calls and keep the "train" entry written below.
    if data_pars is None:
        data_pars = {}

    if Xpred is None:
        data_pars["train"] = False
        Xpred = get_dataset(data_pars, task_type="predict")

    Xpred_fh = ForecastingHorizon(Xpred.index, is_relative=False)
    ypred = model.model.predict(Xpred_fh)

    ypred_proba = None  # No proba
    return ypred, ypred_proba
def test_factory_method_direct():
    """Check that the factory and the direct regression forecaster agree."""
    y_train, y_test = temporal_train_test_split(load_airline(), test_size=24)
    fh = ForecastingHorizon(y_test.index, is_relative=False)

    regressor = LinearRegression()
    from_factory = ReducedForecaster(
        regressor, scitype="regressor", strategy="direct"
    )
    from_class = DirectRegressionForecaster(regressor)

    actual = from_factory.fit(y_train, fh=fh).predict(fh)
    expected = from_class.fit(y_train, fh=fh).predict(fh)

    np.testing.assert_array_equal(actual, expected)
def _fit(self, y, X=None, fh=None):
    """Fit to training data.

    The data is split in time. The forecasters are fitted on the first part
    and predict the second part; the regressor is then fitted on those
    predictions with the second part of `y` as target. Finally the
    forecasters are fitted again on all data.

    Parameters
    ----------
    y : pd.Series
        Target time series to which to fit the forecaster.
    X : pd.DataFrame, optional, default=None
        Exogenous variables; split together with `y` and passed on to the
        forecasters when given.
    fh : int, list or np.array, optional, default=None
        The forecasters horizon with the steps ahead to to predict; used for
        the final fit on all data.

    Returns
    -------
    self : returns an instance of self.
    """
    _, forecasters = self._check_forecasters()
    self.regressor_ = check_regressor(
        regressor=self.regressor, random_state=self.random_state
    )

    # get training data for meta-model
    if X is None:
        y_train, y_test = temporal_train_test_split(y, test_size=self.test_size)
        X_train = X_test = None
    else:
        y_train, y_test, X_train, X_test = temporal_train_test_split(
            y, X, test_size=self.test_size
        )

    # fit ensemble models and predict the held-out part
    fh_holdout = ForecastingHorizon(y_test.index, is_relative=False)
    self._fit_forecasters(forecasters, y_train, X_train, fh_holdout)
    holdout_predictions = self._predict_forecasters(fh_holdout, X_test)
    X_meta = pd.concat(holdout_predictions, axis=1)

    # fit meta-model (regressor) on predictions of ensemble models
    # with y_test as endog/target
    self.regressor_.fit(X=X_meta, y=y_test)

    # for a sklearn Pipeline the weights are read from its last step
    is_pipeline = isinstance(self.regressor_, Pipeline)
    final_estimator = self.regressor_.steps[-1][1] if is_pipeline else self.regressor_
    self.weights_ = _get_weights(final_estimator)

    # fit forecasters with all data
    self._fit_forecasters(forecasters, y, X, fh)

    return self
def test_factory_method_ts_direct():
    """Check that the factory and the direct ts-regression forecaster agree."""
    y_train, y_test = temporal_train_test_split(load_airline(), test_size=24)
    fh = ForecastingHorizon(y_test.index, is_relative=False)

    pipeline_steps = [("tabularize", Tabularizer()), ("model", LinearRegression())]
    ts_regressor = Pipeline(pipeline_steps)
    from_factory = ReducedForecaster(
        ts_regressor, scitype="ts_regressor", strategy="direct"
    )
    from_class = DirectTimeSeriesRegressionForecaster(ts_regressor)

    actual = from_factory.fit(y_train, fh=fh).predict(fh)
    expected = from_class.fit(y_train, fh=fh).predict(fh)

    np.testing.assert_array_equal(actual, expected)
def _get_end(y: ACCEPTED_Y_TYPES, fh: ForecastingHorizon) -> int:
    """Compute the end of the last training window for a forecasting horizon.

    `fh` is assumed to be ordered and already validated by the caller.
    """
    n_timepoints = y.shape[0]

    # A purely in-sample horizon needs no room after the last split point;
    # otherwise the last horizon step must still lie within the data.
    last_step = 0 if fh.is_all_in_sample() else fh[-1]
    return n_timepoints - last_step + 1