def _split(self, y):
    """Yield ``(train, test)`` position windows for a sliding-window split.

    When ``initial_window`` is set, one split built from the initial window
    is yielded first; the windows produced by ``_split_windows`` follow.

    Parameters
    ----------
    y : object
        Series/index being split; forwarded to the window-length and
        end-point helpers.

    Yields
    ------
    train : np.ndarray
        Positions of the training window.
    test : np.ndarray
        Positions of the test window.

    Raises
    ------
    ValueError
        If ``initial_window`` is given while ``start_with_window`` is falsy,
        or if ``initial_window`` is not greater than ``window_length``.
    """
    step = check_step_length(self.step_length)
    window = check_window_length(self.window_length, "window_length")
    first_window = check_window_length(self.initial_window, "initial_window")
    fh = _check_fh(self.fh)
    _check_window_lengths(y, fh, window, first_window)

    if self.initial_window is not None:
        if not self.start_with_window:
            raise ValueError(
                "`start_with_window` must be True if `initial_window` is given"
            )
        if self.initial_window <= self.window_length:
            raise ValueError("`initial_window` must greater than `window_length`")

        # With in-sample horizons the first window is shifted forward so the
        # in-sample test points still fall inside the data.
        first_start = 0
        if not fh.is_all_out_of_sample() and abs(fh[0]) >= self.initial_window:
            first_start = abs(fh[0]) - self.initial_window + 1

        first_end = first_start + first_window
        first_train = np.arange(first_start, first_end)
        first_test = first_end + fh.to_numpy() - 1
        yield first_train, first_test

    first_split = self._get_start(fh)
    last_split = _get_end(y, fh)
    windows = self._split_windows(first_split, last_split, step, window, fh.to_numpy())
    for train, test in windows:
        yield train, test
def _fit(self, X, y=None):
    """Fit the transformer by drawing random interval indices.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, n_features]
        Each cell of X must contain a pandas.Series. Data to fit to.
    y : any container with attribute ``shape``, optional, default=None
        ``y.shape[0]`` is used as ``n_timepoints``; 1 is used if y is None.

    Returns
    -------
    self : RandomIntervalSegmenter
        This estimator.

    Raises
    ------
    ValueError
        If ``max_length`` is not bigger than the effective ``min_length``, or
        if either length is set while ``n_intervals == "random"``.
    """
    n_timepoints = 1 if y is None else y.shape[0]

    # The validated values are written back onto the estimator.
    self.min_length = check_window_length(self.min_length, n_timepoints, "min_length")
    self.max_length = check_window_length(self.max_length, n_timepoints, "max_length")

    # An unset minimum falls back to 2.
    min_length = 2 if self.min_length is None else self.min_length

    if self.max_length is not None and not min_length < self.max_length:
        raise ValueError("`max_length` must be bigger than `min_length`.")

    self.input_shape_ = X.shape

    # Retrieve time-series indexes from each column.
    # TODO generalise to columns with series of unequal length
    self._time_index = _get_time_index(X)

    # Compute random intervals for each column.
    # TODO if multiple columns are passed, introduce option to compute
    #  one set of shared intervals, or rely on ColumnTransformer?
    if self.n_intervals != "random":
        self.intervals_ = _rand_intervals_fixed_n(
            self._time_index,
            n_intervals=self.n_intervals,
            min_length=min_length,
            max_length=self.max_length,
            random_state=self.random_state,
        )
        return self

    if not (self.min_length is None and self.max_length is None):
        raise ValueError(
            "Setting `min_length` or `max_length` is not yet "
            "implemented for `n_intervals='random'`."
        )
    self.intervals_ = _rand_intervals_rand_n(
        self._time_index, random_state=self.random_state
    )
    return self
def _split(self, y: pd.Index) -> SPLIT_GENERATOR_TYPE:
    """Yield ``(train, test)`` windows for a sliding-window split of ``y``.

    Parameters
    ----------
    y : pd.Index
        Index being split; ``y.shape[0]`` is used as the number of timepoints.

    Yields
    ------
    The result of ``_split_for_initial_window`` first (only when
    ``initial_window`` is set), then one ``(train, test)`` pair per window
    returned by ``_split_windows``.
    """
    n_timepoints = y.shape[0]
    step = check_step_length(self.step_length)
    window = check_window_length(
        window_length=self.window_length,
        n_timepoints=n_timepoints,
        name="window_length",
    )
    first_window = check_window_length(
        window_length=self.initial_window,
        n_timepoints=n_timepoints,
        name="initial_window",
    )
    fh = _check_fh(self.fh)
    _check_window_lengths(y=y, fh=fh, window_length=window, initial_window=first_window)

    if self.initial_window is not None:
        yield self._split_for_initial_window(y)

    first_split = self._get_start(y=y, fh=fh)
    # NOTE(review): the `+ 2` offset is reproduced as-is; its rationale is not
    # visible from this block.
    last_split = _get_end(y_index=y, fh=fh) + 2
    step = self._get_step_length(x=step)

    windows = self._split_windows(
        start=first_split,
        end=last_split,
        step_length=step,
        window_length=window,
        y=y,
        fh=fh.to_numpy(),
    )
    for train, test in windows:
        yield train, test
def fit(self, X, y=None):
    """Fit the transformer by drawing random interval indices.

    Parameters
    ----------
    X : pandas DataFrame of shape [n_samples, n_features]
        Input data.
    y : pandas Series, shape (n_samples, ...), optional
        Targets for supervised learning. Not read by this method.

    Returns
    -------
    self : RandomIntervalSegmenter
        This estimator.

    Raises
    ------
    ValueError
        If ``max_length`` is not bigger than the effective ``min_length``, or
        if either length is set while ``n_intervals == "random"``.
    """
    # Validation only: the return values are deliberately not stored.
    check_window_length(self.min_length, "`min_length`")
    check_window_length(self.max_length, "`max_length`")

    # An unset minimum falls back to 2.
    min_length = 2 if self.min_length is None else self.min_length

    if self.max_length is not None and not min_length < self.max_length:
        raise ValueError("`max_length` must be bigger than `min_length`.")

    X = check_X(X, enforce_univariate=True)
    self.input_shape_ = X.shape

    # Retrieve time-series indexes from each column.
    # TODO generalise to columns with series of unequal length
    self._time_index = _get_time_index(X)

    # Compute random intervals for each column.
    # TODO if multiple columns are passed, introduce option to compute
    #  one set of shared intervals, or rely on ColumnTransformer?
    if self.n_intervals == "random":
        if not (self.min_length is None and self.max_length is None):
            raise ValueError(
                "Setting `min_length` or `max_length` is not yet "
                "implemented for `n_intervals='random'`."
            )
        intervals = _rand_intervals_rand_n(
            self._time_index, random_state=self.random_state
        )
    else:
        intervals = _rand_intervals_fixed_n(
            self._time_index,
            n_intervals=self.n_intervals,
            min_length=min_length,
            max_length=self.max_length,
            random_state=self.random_state,
        )
    self.intervals_ = intervals

    self._is_fitted = True
    return self
def _split(self, y: Optional[ACCEPTED_Y_TYPES]) -> SPLIT_GENERATOR_TYPE:
    """Yield ``(train, test)`` position windows for a sliding-window split.

    When ``initial_window`` is set, one split built from the initial window
    is yielded first; the windows produced by ``_split_windows`` follow.
    Window lengths for which ``is_timedelta_or_date_offset`` is true are
    converted to positions through ``y.get_loc``.

    Parameters
    ----------
    y : index-like
        Must provide ``shape``, item access and ``get_loc``.

    Yields
    ------
    train : np.ndarray
        Positions of the training window.
    test : np.ndarray
        Positions of the test window.

    Raises
    ------
    ValueError
        If ``initial_window`` is given while ``start_with_window`` is falsy,
        or if ``initial_window`` is not greater than ``window_length``.
    """
    n_timepoints = y.shape[0]
    step = check_step_length(self.step_length)
    window = check_window_length(self.window_length, n_timepoints, "window_length")
    first_window = check_window_length(
        self.initial_window, n_timepoints, "initial_window"
    )
    fh = _check_fh(self.fh)
    _check_window_lengths(y, fh, window, first_window)

    if self.initial_window is not None:
        if not self.start_with_window:
            raise ValueError(
                "`start_with_window` must be True if `initial_window` is given"
            )
        if self.initial_window <= self.window_length:
            raise ValueError("`initial_window` must greater than `window_length`")

        # Express the initial window as a position count for the comparison
        # against the first horizon step.
        threshold = (
            y.get_loc(y[0] + self.initial_window)
            if is_timedelta_or_date_offset(x=self.initial_window)
            else self.initial_window
        )

        # With in-sample horizons the first window is shifted forward so the
        # in-sample test points still fall inside the data.
        first_start = 0
        if not fh.is_all_out_of_sample() and abs(fh[0]) >= threshold:
            first_start = abs(fh[0]) - self.initial_window + 1

        first_end = (
            y.get_loc(y[first_start] + first_window)
            if is_timedelta_or_date_offset(x=first_window)
            else first_start + first_window
        )

        first_train = np.arange(first_start, first_end)
        first_test = first_end + fh.to_numpy() - 1
        yield first_train, first_test

    first_split = self._get_start(y=y, fh=fh)
    last_split = _get_end(y=y, fh=fh)
    step = self._get_step_length(x=step)
    windows = self._split_windows(
        first_split, last_split, step, window, y, fh.to_numpy()
    )
    for train, test in windows:
        yield train, test
def _fit(self, y, X=None, fh=None):
    """Fit the reduced estimator to the training data.

    Parameters
    ----------
    y : pd.Series
        Target time series to which to fit the forecaster.
    X : pd.DataFrame, optional (default=None)
        Exogenous data; forwarded to ``_transform`` together with ``y``.
    fh : int, list or np.array, optional (default=None)
        Forecasting horizon. Not read here; ``self.fh`` is used instead.

    Returns
    -------
    self : returns an instance of self.

    Raises
    ------
    NotImplementedError
        If ``self.fh`` is not entirely out-of-sample relative to the cutoff.
    """
    # Only out-of-sample predictions are supported. The horizon is needed for
    # fitting with this strategy, so it is checked before anything else.
    horizon_is_out_of_sample = self.fh.is_all_out_of_sample(self.cutoff)
    if not horizon_is_out_of_sample:
        raise NotImplementedError("In-sample predictions are not implemented.")

    self.window_length_ = check_window_length(
        self.window_length, n_timepoints=len(y)
    )

    targets, features = self._transform(y, X)

    # Fit a multi-output estimator to the transformed data.
    fitted = clone(self.estimator)
    self.estimator_ = fitted
    fitted.fit(features, targets)
    return self
def split_initial(self, y):
    """Split ``y`` into an initial training window and the remainder.

    This is useful during forecasting model selection where we want to fit
    the forecaster on some part of the data first before doing temporal
    cross-validation.

    Parameters
    ----------
    y : pd.Series

    Returns
    -------
    initial_training_window : np.array
        Positions ``0 .. initial_window - 1``.
    initial_test_window : np.array
        Positions ``initial_window .. len(y) - 1``.

    Raises
    ------
    ValueError
        If ``initial_window`` is None.
    """
    if self.initial_window is None:
        raise ValueError(
            "Please specify initial window, found: `initial_window`=None"
        )

    n_initial = check_window_length(self.initial_window)
    training_positions = np.arange(n_initial)
    test_positions = np.arange(n_initial, len(y))
    return training_positions, test_positions
def fit(self, y, X=None, fh=None):
    """Fit to training data.

    Stores the data and horizon, validates the step and window lengths, then
    delegates to ``_fit``.

    Parameters
    ----------
    y : pd.Series
        Target time series to which to fit the forecaster.
    X : pd.DataFrame, optional (default=None)
        Exogenous data; stored and forwarded to ``_fit``.
    fh : int, list or np.array, optional (default=None)
        The forecasting horizon with the steps ahead to predict.

    Returns
    -------
    self : returns an instance of self.
    """
    series_length = len(y)

    self._set_y_X(y, X)
    self._set_fh(fh)

    self.step_length_ = check_step_length(self.step_length)
    self.window_length_ = check_window_length(self.window_length, series_length)

    self._fit(y, X)

    self._is_fitted = True
    return self
def _fit(self, y, X=None, fh=None):
    """Fit the reduced estimator to the training data.

    Parameters
    ----------
    y : pd.Series
        Target time series to which to fit the forecaster.
    X : pd.DataFrame, optional (default=None)
        Exogenous data; forwarded to ``_transform`` together with ``y``.
    fh : int, list or np.array, optional (default=None)
        Forecasting horizon. Not read by this method.

    Returns
    -------
    self : returns an instance of self.
    """
    self.window_length_ = check_window_length(
        self.window_length, n_timepoints=len(y)
    )

    targets, features = self._transform(y, X)

    # Flatten the target to 1d to avoid scikit-learn's DataConversion warning.
    targets = targets.ravel()

    fitted = clone(self.estimator)
    self.estimator_ = fitted
    fitted.fit(features, targets)
    return self
def _split_windows(self, y):
    """Yield the single ``(training_window, test_window)`` split for ``y``.

    The training window ends one position before the value returned by
    ``_get_end``; it starts at 0 when no window length is set, otherwise
    ``window_length`` positions before its end.
    """
    window_length = check_window_length(self.window_length)
    fh = self._check_fh()
    stop = self._get_end(y) - 1

    if window_length is None:
        begin = 0
    else:
        begin = stop - window_length

    yield np.arange(begin, stop), stop + fh - 1
def _split(self, y):
    """Yield the single ``(train, test)`` split for ``y``.

    The training window ends one position before the value returned by
    ``_get_end``; it starts at 0 when no window length is set, otherwise
    ``window_length`` positions before its end.
    """
    window_length = check_window_length(self.window_length)
    fh = _check_fh(self.fh)
    stop = _get_end(y, fh) - 1

    if window_length is None:
        begin = 0
    else:
        begin = stop - window_length

    yield np.arange(begin, stop), stop + fh.to_numpy() - 1
def _split_windows(self, y):
    """Yield ``(training_window, test_window)`` pairs for sliding windows.

    One pair is produced per split point in
    ``range(_get_start(), _get_end(y), step_length)``. The training window
    covers the ``window_length`` positions before the split point and the
    test window is ``split_point + fh - 1``.
    """
    stride = check_step_length(self.step_length)
    width = check_window_length(self.window_length)
    fh = self._check_fh()

    stop = self._get_end(y)
    first = self._get_start()

    split_point = first
    for split_point in range(first, stop, stride):
        training_window = np.arange(split_point - width, split_point)
        yield training_window, split_point + fh - 1
def fit(self, y, X=None, fh=None):
    """Fit to training data using the direct reduction strategy.

    A separate regressor is fitted for each step of the forecasting horizon.

    Parameters
    ----------
    y : pd.Series
        Target time series to which to fit the forecaster.
    X : pd.DataFrame, optional (default=None)
        Exogenous variables. Not supported; must be None.
    fh : int, list or np.array, optional (default=None)
        The forecasting horizon with the steps ahead to predict.

    Returns
    -------
    self : returns an instance of self.

    Raises
    ------
    NotImplementedError
        If ``X`` is given, or if the horizon contains in-sample steps.
    """
    # Reject unsupported input before touching estimator state, so a failed
    # call does not leave `y`/`X` stored on an unfitted estimator.
    if X is not None:
        raise NotImplementedError("Exogenous variables `X` are not yet supported.")

    self._set_y_X(y, X)
    self._set_fh(fh)

    # The in-sample check needs the cutoff, so it runs after the data is set.
    if len(self.fh.to_in_sample(self.cutoff)) > 0:
        raise NotImplementedError("In-sample predictions are not implemented")

    self.step_length_ = check_step_length(self.step_length)
    self.window_length_ = check_window_length(self.window_length)

    # for the direct reduction strategy, a separate forecaster is fitted
    # for each step ahead of the forecasting horizon
    self._cv = SlidingWindowSplitter(
        fh=self.fh.to_relative(self.cutoff),
        window_length=self.window_length_,
        step_length=self.step_length_,
        start_with_window=True,
    )

    # transform data using rolling window split; distinct local names keep
    # the `y` and `X` arguments intact
    X_train, Y_train = self._transform(y, X)

    # iterate over forecasting horizon: column i of Y_train is the target
    # for the i-th horizon step
    self.regressors_ = []
    for i in range(len(self.fh)):
        regressor = clone(self.regressor)
        regressor.fit(X_train, Y_train[:, i])
        self.regressors_.append(regressor)

    self._is_fitted = True
    return self
def _split(self, y: ACCEPTED_Y_TYPES) -> SPLIT_GENERATOR_TYPE:
    """Yield the single ``(train, test)`` split for ``y``.

    The training window ends one position before the value returned by
    ``_get_end``. Its start is 0 when no window length is set, is located via
    ``y.get_loc`` when ``is_timedelta_or_date_offset`` holds for the window
    length, and is ``end - window_length`` otherwise.
    """
    n_timepoints = y.shape[0]
    window_length = check_window_length(self.window_length, n_timepoints)
    fh = _check_fh(self.fh)
    stop = _get_end(y, fh) - 1

    if window_length is None:
        begin = 0
    elif not is_timedelta_or_date_offset(x=window_length):
        begin = stop - window_length
    else:
        # Locate the label `window_length` before the last training label,
        # then step one position forward.
        begin = y.get_loc(y[stop - 1] - window_length) + 1

    yield np.arange(begin, stop), stop + fh.to_numpy() - 1
def _split_windows(self, y):
    """Yield one ``(training_window, test_window)`` pair per cutoff.

    Raises
    ------
    ValueError
        If the largest cutoff is not smaller than ``len(y)``, or if the
        largest cutoff plus the largest horizon step exceeds ``len(y)``.
    """
    # cutoffs
    cutoffs = check_cutoffs(self.cutoffs)
    series_length = len(y)
    if not np.max(cutoffs) < series_length:
        raise ValueError("`cutoffs` are out-of-bounds for given `y`.")

    fh = self._check_fh()
    if np.max(cutoffs) + np.max(fh) > series_length:
        raise ValueError("`fh` is out-of-bounds for given `cutoffs` and `y`.")

    width = check_window_length(self.window_length)

    for cutoff in cutoffs:
        # The `+ 1` makes the training window end at the cutoff itself.
        training_window = np.arange(cutoff - width, cutoff) + 1
        yield training_window, cutoff + fh
def _split(self, y):
    """Yield one ``(training_window, test_window)`` pair per cutoff.

    Raises
    ------
    ValueError
        If the largest cutoff is at least ``y.shape[0]``, or if the largest
        cutoff plus the largest horizon step exceeds ``y.shape[0]``.
    """
    # cutoffs
    cutoffs = check_cutoffs(self.cutoffs)
    if np.max(cutoffs) >= y.shape[0]:
        raise ValueError("`cutoffs` are incompatible with given `y`.")

    fh = _check_fh(self.fh)
    furthest_position = np.max(cutoffs) + np.max(fh)
    if furthest_position > y.shape[0]:
        raise ValueError("`fh` is incompatible with given `cutoffs` and `y`.")

    width = check_window_length(self.window_length)

    for cutoff in cutoffs:
        # The `+ 1` makes the training window end at the cutoff itself.
        training_window = np.arange(cutoff - width, cutoff) + 1
        yield training_window, cutoff + fh
def fit(self, y, X=None, fh=None):
    """Fit to training data using the recursive reduction strategy.

    A single regressor is fitted for a one-step-ahead horizon.

    Parameters
    ----------
    y : pd.Series
        Target time series to which to fit the forecaster.
    X : pd.DataFrame, optional (default=None)
        Exogenous variables. Not supported; must be None.
    fh : int, list or np.array, optional (default=None)
        The forecasting horizon with the steps ahead to predict.

    Returns
    -------
    self : returns an instance of self.

    Raises
    ------
    NotImplementedError
        If ``X`` is given.
    """
    # input checks
    exogenous_given = X is not None
    if exogenous_given:
        raise NotImplementedError("Exogenous variables `X` are not yet supported.")

    # set values
    self._set_y_X(y, X)
    self._set_fh(fh)
    self.step_length_ = check_step_length(self.step_length)
    self.window_length_ = check_window_length(self.window_length)

    # set up cv iterator: for the recursive strategy, a single estimator is
    # fit for a one-step-ahead forecasting horizon and then called
    # iteratively to predict multiple steps ahead
    splitter_kwargs = {
        "fh": 1,
        "window_length": self.window_length_,
        "step_length": self.step_length_,
        "start_with_window": True,
    }
    self._cv = SlidingWindowSplitter(**splitter_kwargs)

    # transform data into tabular form
    features, targets = self._transform(y, X)

    # fit base regressor on the flattened target
    fitted = clone(self.regressor)
    fitted.fit(features, targets.ravel())
    self.regressor_ = fitted

    self._is_fitted = True
    return self
def fit(self, y, X=None, fh=None):
    """Fit to training data using the multioutput reduction strategy.

    A single regressor is fitted simultaneously to all steps of the
    forecasting horizon.

    Parameters
    ----------
    y : pd.Series
        Target time series to which to fit the forecaster.
    X : pd.DataFrame, optional (default=None)
        Exogenous variables. Not supported; must be None.
    fh : int, list or np.array, optional (default=None)
        The forecasting horizon with the steps ahead to predict.

    Returns
    -------
    self : returns an instance of self.

    Raises
    ------
    NotImplementedError
        If ``X`` is given, or if the horizon contains in-sample steps.
    """
    # Reject unsupported input before touching estimator state, so a failed
    # call does not leave `y`/`X` stored on an unfitted estimator.
    if X is not None:
        raise NotImplementedError("Exogenous variables `X` are not yet supported.")

    self._set_y_X(y, X)
    self._set_fh(fh)

    # The in-sample check needs the cutoff, so it runs after the data is set.
    if len(self.fh.to_in_sample(self.cutoff)) > 0:
        raise NotImplementedError("In-sample predictions are not implemented")

    self.step_length_ = check_step_length(self.step_length)
    self.window_length_ = check_window_length(self.window_length)

    # for the multioutput reduction strategy, a single forecaster is fitted
    # simultaneously to all the future steps in the forecasting horizon
    # by reducing to a forecaster that can handle multi-dimensional outputs
    self._cv = SlidingWindowSplitter(
        fh=self.fh.to_relative(self.cutoff),
        window_length=self.window_length_,
        step_length=self.step_length_,
        start_with_window=True,
    )

    # transform data using rolling window split; a distinct local name keeps
    # the `X` argument intact
    X_train, Y_train = self._transform(y, X)

    # fit regressor to training data
    regressor = clone(self.regressor)
    regressor.fit(X_train, Y_train)
    self.regressor_ = regressor

    self._is_fitted = True
    return self
def _split(self, y: ACCEPTED_Y_TYPES) -> SPLIT_GENERATOR_TYPE:
    """Yield one ``(training_window, test_window)`` pair per cutoff.

    When ``is_timedelta_or_date_offset`` holds for the window length, the
    training start is located through ``y.get_loc`` and never precedes
    ``y[0]``; otherwise it is ``cutoff - window_length``.

    Raises
    ------
    ValueError
        If the largest cutoff is at least ``y.shape[0]``, or if the largest
        cutoff plus the largest horizon step exceeds ``y.shape[0]``.
    """
    cutoffs = check_cutoffs(self.cutoffs)
    if np.max(cutoffs) >= y.shape[0]:
        raise ValueError("`cutoffs` are incompatible with given `y`.")

    fh = _check_fh(self.fh)
    n_timepoints = y.shape[0]
    furthest_position = np.max(cutoffs) + np.max(fh)
    if furthest_position > y.shape[0]:
        raise ValueError("`fh` is incompatible with given `cutoffs` and `y`.")

    width = check_window_length(self.window_length, n_timepoints)

    for cutoff in cutoffs:
        if not is_timedelta_or_date_offset(x=width):
            first = cutoff - width
        else:
            earliest_label = max(y[0], y[cutoff] - width)
            first = y.get_loc(earliest_label)

        # The `+ 1` makes the training window end at the cutoff itself.
        training_window = np.arange(first, cutoff) + 1
        yield training_window, cutoff + fh
def _get_end(self, y):
    """Compute the (non-inclusive) end point of the last window.

    Parameters
    ----------
    y : sized container
        Series/index being split; only ``len(y)`` is used.

    Returns
    -------
    end : int
        ``len(y) + 1`` when every horizon step is in-sample (``<= 0``),
        otherwise ``len(y) - fh[-1] + 1``.

    Raises
    ------
    ValueError
        If a window length is set and the window plus the out-of-sample
        horizon does not fit into ``y``.
    """
    n_timepoints = len(y)
    fh = self._check_fh()
    window_length = check_window_length(self.window_length)

    # end point is end of last window
    is_in_sample = np.all(fh <= 0)
    if is_in_sample:
        end = n_timepoints + 1
        # An in-sample horizon needs no room beyond the window itself. Binding
        # `fh_max` here also keeps the feasibility check below from hitting an
        # unbound local, which previously crashed this path.
        fh_max = 0
    else:
        fh_max = fh[-1]
        end = n_timepoints - fh_max + 1  # non-inclusive end indexing

    # check if computed values are feasible with the provided index
    if window_length is not None and window_length + fh_max > n_timepoints:
        raise ValueError(
            "The window length and forecasting horizon are "
            "incompatible with the length of `y`"
        )
    return end
def _split(self, y: pd.Index) -> SPLIT_GENERATOR_TYPE:
    """Yield the single ``(train, test)`` split for ``y``.

    The training window is obtained from ``_get_train_window`` with the split
    point one past the value returned by ``_get_end``. The test window is
    ``end + fh`` when ``array_is_int(fh)`` holds; otherwise each horizon
    offset is added to the label ``y[end]`` and located via ``y.get_loc``.
    """
    n_timepoints = y.shape[0]
    window_length = check_window_length(self.window_length, n_timepoints)
    fh = _check_fh(self.fh)
    last = _get_end(y_index=y, fh=fh)

    if window_length is None:
        first = 0
    elif is_int(window_length):
        first = last - window_length + 1
    else:
        # First position whose label lies strictly after `y[last] - window_length`.
        inside_window = np.argwhere(y > y[last] - window_length)
        first = inside_window.flatten()[0]

    train = self._get_train_window(y=y, train_start=first, split_point=last + 1)

    if array_is_int(fh):
        test = last + fh.to_numpy()
    else:
        test = np.array([y.get_loc(y[last] + offset) for offset in fh.to_pandas()])

    yield train, test
def _split(self, y: ACCEPTED_Y_TYPES) -> SPLIT_GENERATOR_TYPE:
    """Yield one ``(training_window, test_window)`` pair per cutoff.

    Two combinations are handled: integer window length with integer cutoff,
    and timedelta/date-offset window length with datetime cutoff. For
    datetime cutoffs with a timedelta horizon, test labels earlier than
    ``y.min()`` are dropped and the remainder is converted to positions via
    ``y.get_loc``.

    Raises
    ------
    TypeError
        For any other combination of window-length and cutoff types.
    """
    n_timepoints = y.shape[0]
    cutoffs = check_cutoffs(cutoffs=self.cutoffs)
    fh = _check_fh(fh=self.fh)
    window_length = check_window_length(
        window_length=self.window_length, n_timepoints=n_timepoints
    )
    _check_cutoffs_fh_window_length(
        cutoffs=cutoffs, fh=fh, window_length=window_length
    )
    _check_cutoffs_and_y(cutoffs=cutoffs, y=y)
    _check_cutoffs_fh_y(cutoffs=cutoffs, fh=fh, y=y)

    max_fh = fh.max()
    max_cutoff = np.max(cutoffs)

    for cutoff in cutoffs:
        if is_int(x=window_length) and is_int(x=cutoff):
            train_start = cutoff - window_length
        elif is_timedelta_or_date_offset(x=window_length) and is_datetime(x=cutoff):
            earliest_label = max(y[0], cutoff - window_length)
            train_start = y.get_loc(earliest_label)
        else:
            raise TypeError(
                f"Unsupported combination of types: "
                f"`window_length`: {type(window_length)}, "
                f"`cutoff`: {type(cutoff)}"
            )

        # Integer cutoffs are already positions; others are looked up in `y`.
        train_stop = cutoff if is_int(x=cutoff) else y.get_loc(cutoff)
        training_window = np.arange(train_start, train_stop) + 1

        test_window = cutoff + fh.to_numpy()
        if is_datetime(x=max_cutoff) and is_timedelta(x=max_fh):
            kept_labels = test_window[test_window >= y.min()]
            test_window = np.array([y.get_loc(label) for label in kept_labels])

        yield training_window, test_window
def _fit(self, y, X=None, fh=None):
    """Fit to training data by validating and storing the window settings.

    Parameters
    ----------
    y : pd.Series
        Target time series to which to fit the forecaster.
    X : pd.DataFrame, default=None
        Exogenous variables are ignored.
    fh : int, list or np.array, default=None
        The forecasting horizon with the steps ahead to predict. Not read here.

    Returns
    -------
    self : returns an instance of self.

    Raises
    ------
    ValueError
        If the strategy is unknown, if ``window_length`` is smaller than
        ``sp`` (strategies "last"/"mean" with ``sp != 1``), if
        ``window_length`` is 1 for the "drift" strategy, or if the resulting
        window is longer than the stored training series.
    """
    # X_train is ignored
    n_timepoints = y.shape[0]

    allowed_strategies = ("last", "mean", "drift")
    if self.strategy not in allowed_strategies:
        raise ValueError(
            f"Unknown strategy: {self.strategy}. Expected "
            f"one of: {allowed_strategies}."
        )

    if self.strategy in ("last", "mean"):
        # the window must be at least as long as sp for seasonal mean/last
        seasonal_with_window = self.window_length is not None and self.sp != 1
        if seasonal_with_window and self.window_length < self.sp:
            raise ValueError(
                f"The `window_length`: "
                f"{self.window_length} is smaller than "
                f"`sp`: {self.sp}."
            )
        self.window_length_ = check_window_length(self.window_length, n_timepoints)
        self.sp_ = check_sp(self.sp)

        # if not given, set default window length
        if self.window_length is None:
            self.window_length_ = len(y)
    else:
        # remaining case: the "drift" strategy
        if self.sp != 1:
            warn("For the `drift` strategy, the `sp` value will be ignored.")

        self.window_length_ = check_window_length(self.window_length, n_timepoints)
        if self.window_length is None:
            self.window_length_ = len(y)
        if self.window_length == 1:
            raise ValueError(
                f"For the `drift` strategy, "
                f"the `window_length`: {self.window_length} "
                f"value must be greater than one."
            )

    # check window length against the stored training series
    if self.window_length_ > len(self._y):
        if self.strategy == "last" and self.sp != 1:
            param = "sp"
        else:
            param = "window_length_"
        raise ValueError(
            f"The {param}: {self.window_length_} is larger than "
            f"the training series."
        )

    return self
def _sliding_window_transform(y, window_length, fh, X=None, scitype="tabular-regressor"):
    """Transform time series data `y` and `X` using sliding window.

    See `test_sliding_window_transform_explicit` in test_reduce.py for explicit
    example.

    Parameters
    ----------
    y : pd.Series
        Endogenous time series
    window_length : int
        Window length for transformed feature variables
    fh : ForecastingHorizon
        Forecasting horizon for transformed target variable
    X : pd.DataFrame, optional (default=None)
        Exogenous series.
    scitype : str {"tabular-regressor", "time-series-regressor"}, optional
        Scitype of estimator to use with transformed data.
        - If "tabular-regressor", returns X as tabular 2d array
        - If "time-series-regressor", returns X as panel 3d array

    Returns
    -------
    yt : np.ndarray, shape = (n_timepoints - window_length, 1)
        Transformed target variable.
    Xt : np.ndarray, shape = (n_timepoints - window_length, n_variables,
    window_length)
        Transformed lagged values of target variable and exogenous variables,
        excluding contemporaneous values.

    Raises
    ------
    ValueError
        If ``window_length`` plus the last horizon step is not smaller than
        the number of timepoints.
    """
    # There are different ways to implement this transform. Pre-allocating an
    # array and filling it by iterating over the window length seems to be the
    # most efficient one.
    window_length = check_window_length(window_length, y.shape[0])

    z = _concat_y_X(y, X)
    n_timepoints, n_variables = z.shape

    fh = _check_fh(fh)
    fh_max = fh[-1]

    # Effective window length: feature window plus the furthest horizon step.
    span = window_length + fh_max
    if span >= n_timepoints:
        raise ValueError(
            "The `window_length` and `fh` are incompatible with the length of `y`"
        )

    # Pre-allocate array for sliding windows.
    Zt = np.zeros((n_timepoints + span, n_variables, span + 1))

    # Transform data: slot k holds a copy of z whose rows start at `span - k`.
    for k in range(span + 1):
        offset = span - k
        Zt[offset:offset + n_timepoints, :, k] = z

    # Truncate data, selecting only full windows, discarding incomplete ones.
    Zt = Zt[span:-span]

    # Return transformed feature and target variables separately. This excludes
    # contemporaneous values of the exogenous variables. Including them would
    # lead to unequal-length data, with more time points for exogenous series
    # than the target series, which is currently not supported.
    yt = Zt[:, 0, window_length + fh]
    Xt = Zt[:, :, :window_length]

    # For any scitype other than tabular regression, X stays a 3d array.
    if scitype != "tabular-regressor":
        return yt, Xt

    # Tabular regression needs X as a 2d array.
    return yt, Xt.reshape(Xt.shape[0], -1)
def test_check_window_length(window_length, n_timepoints, expected):
    """``check_window_length`` returns the expected value for the given input."""
    result = check_window_length(window_length, n_timepoints)
    assert result == expected
def test_window_length_bad_arg(window_length, n_timepoints):
    """``check_window_length`` raises ``ValueError`` for the given bad input."""
    expected_error = ValueError
    with pytest.raises(expected_error):
        check_window_length(window_length, n_timepoints)
def _get_start(self):
    """Return the first split point.

    This is the validated window length when ``start_with_window`` is truthy,
    otherwise 0.
    """
    window_length = check_window_length(self.window_length)
    return window_length if self.start_with_window else 0
def _fit(self, y, X=None, fh=None):
    """Fit one estimator per horizon step on an expanding feature window.

    Parameters
    ----------
    y : pd.Series
        Target time series to which to fit the forecaster.
    X : pd.DataFrame, optional (default=None)
        Exogenous variables. Not supported; must be None.
    fh : int, list or np.array, optional (default=None)
        Forecasting horizon. Not read here; ``self.fh`` is used instead.

    Returns
    -------
    self : Estimator
        A fitted instance of self.

    Raises
    ------
    NotImplementedError
        If ``X`` is given, or if the horizon contains in-sample steps.
    """
    # Exogenous variables are not yet supported for the dirrec strategy.
    if X is not None:
        raise NotImplementedError(
            f"{self.__class__.__name__} does not yet support exogenous "
            f"variables `X`."
        )

    in_sample_steps = self.fh.to_in_sample(self.cutoff)
    if len(in_sample_steps) > 0:
        raise NotImplementedError("In-sample predictions are not implemented")

    self.window_length_ = check_window_length(
        self.window_length, n_timepoints=len(y)
    )

    # Transform the data using sliding-window.
    yt, Xt = self._transform(y, X)

    is_tabular = self._estimator_scitype == "tabular-regressor"

    # We cast the 2d tabular array into a 3d panel array to handle the data
    # consistently for the reduction to tabular and time-series regression.
    if is_tabular:
        Xt = np.expand_dims(Xt, axis=1)

    # This only works without exogenous variables. To support exogenous
    # variables, we need additional values for X to fill the array
    # appropriately.
    targets_as_panel = np.expand_dims(yt, axis=1)
    X_full = np.concatenate([Xt, targets_as_panel], axis=2)

    self.estimators_ = []
    base_width = Xt.shape[2]

    for step in range(len(self.fh)):
        estimator = clone(self.estimator)

        # Slice data using expanding window: step i sees i extra columns.
        X_fit = X_full[:, :, :base_width + step]

        # Convert to 2d tabular array for reduction to tabular regression.
        if is_tabular:
            X_fit = X_fit.reshape(X_fit.shape[0], -1)

        estimator.fit(X_fit, yt[:, step])
        self.estimators_.append(estimator)

    self._is_fitted = True
    return self