def check_sp(sp, enforce_list=False):
    """Validate seasonal periodicity.

    Parameters
    ----------
    sp : int or [int/float]
        Seasonal periodicity
    enforce_list : bool, optional (default=False)
        If True, convert `sp` to a list if it is not one already.

    Returns
    -------
    sp : int or [int/float]
        Validated seasonal periodicity
    """
    if sp is not None:
        if enforce_list and is_int(sp) and sp >= 1:
            sp = [sp]
        elif (enforce_list and isinstance(sp, list)) or (is_int(sp) and sp >= 1):
            pass
        else:
            if enforce_list:
                raise ValueError("`sp` must be an int >= 1, [float/int] or None")
            else:
                raise ValueError("`sp` must be an int >= 1 or None")
    return sp
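# Usage sketch (illustrative, not part of the original module; assumes the
# `is_int` helper used above is importable):
#
#   check_sp(12)                          # -> 12
#   check_sp(12, enforce_list=True)       # -> [12]
#   check_sp([4, 12], enforce_list=True)  # -> [4, 12], returned unchanged
#   check_sp(0)                           # -> ValueError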
def check_fh_values(values):
    """Validate forecasting horizon values.

    Parameters
    ----------
    values : int, list of int, or array of int
        Forecasting horizon with steps ahead to predict.

    Raises
    ------
    TypeError : if values do not meet criteria

    Returns
    -------
    fh : numpy array of int
        Sorted and validated forecasting horizon.
    """
    # check single integer
    if is_int(values):
        values = np.array([values], dtype=int)

    # check array
    elif isinstance(values, np.ndarray):
        if values.ndim > 1:
            raise TypeError(f"`fh` must be a 1d array, but found shape: "
                            f"{values.shape}")
        if not np.issubdtype(values.dtype, np.integer):
            raise TypeError(f"If `fh` is passed as an array, it must "
                            f"be an array of integers, but found an "
                            f"array of type: {values.dtype}")

    # check list
    elif isinstance(values, list):
        if not np.all([is_int(h) for h in values]):
            raise TypeError("If `fh` is passed as a list, "
                            "it has to be a list of integers.")
        values = np.array(values, dtype=int)

    else:
        raise TypeError(f"`fh` has to be either a numpy array, list, "
                        f"or a single integer, but found: {type(values)}")

    # check fh is not empty
    if len(values) < 1:
        raise TypeError("`fh` cannot be empty, please specify at least one "
                        "step to forecast.")

    # check fh does not contain duplicates
    if len(values) != len(np.unique(values)):
        raise TypeError("`fh` should not contain duplicates.")

    # sort fh
    return np.sort(values)
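# Usage sketch (illustrative; assumes numpy is imported as np):
#
#   check_fh_values(3)            # -> array([3])
#   check_fh_values([3, 1, 2])    # -> array([1, 2, 3]), sorted
#   check_fh_values([1, 1, 2])    # -> TypeError, duplicates
#   check_fh_values(np.ones((2, 2), dtype=int))  # -> TypeError, not 1d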
def fit(self, X, y=None):
    """Fit transformer, generating random interval indices.

    Parameters
    ----------
    X : pandas DataFrame of shape [n_samples, n_features]
        Input data
    y : pandas Series, shape (n_samples, ...), optional
        Targets for supervised learning.

    Returns
    -------
    self : an instance of self.
    """
    X = check_X(X, enforce_univariate=True)
    self.input_shape_ = X.shape

    # Retrieve time-series indexes from each column.
    self._time_index = get_time_index(X)

    if isinstance(self.intervals, np.ndarray):
        self.intervals_ = self.intervals
    elif is_int(self.intervals):
        self.intervals_ = np.array_split(self._time_index, self.intervals)
    else:
        raise ValueError(
            f"Intervals must be either an integer or an array with "
            f"start and end points, but found: {self.intervals}")
    self._is_fitted = True
    return self
def _cutoffs_fh_window_length_types_are_supported(
    cutoffs: VALID_CUTOFF_TYPES,
    fh: FORECASTING_HORIZON_TYPES,
    window_length: ACCEPTED_WINDOW_LENGTH_TYPES,
) -> bool:
    """Check that combination of inputs is supported.

    Currently, only two cases are allowed: either all inputs are integers,
    or they are all datetime or timedelta.

    Parameters
    ----------
    cutoffs : np.array or pd.Index
        cutoff points, positive and integer- or datetime-index like
    fh : int, timedelta, list or np.array of ints or timedeltas
    window_length : int or timedelta or pd.DateOffset

    Returns
    -------
    True if all inputs are compatible, False otherwise
    """
    all_int = (array_is_int(cutoffs) and array_is_int(fh)
               and is_int(window_length))
    all_dates = (array_is_datetime64(cutoffs)
                 and array_is_timedelta_or_date_offset(fh)
                 and is_timedelta_or_date_offset(window_length))
    return all_int or all_dates
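# Usage sketch (illustrative; assumes numpy as np, pandas as pd, and the
# `array_is_*`/`is_*` helpers referenced above):
#
#   _cutoffs_fh_window_length_types_are_supported(
#       np.array([10, 13]), np.array([1, 2]), 3)               # -> True, all int
#   _cutoffs_fh_window_length_types_are_supported(
#       pd.to_datetime(["2021-01-10", "2021-01-13"]).values,
#       np.array([pd.Timedelta(days=1)]),
#       pd.Timedelta(days=3))                                  # -> True, all dates
#   _cutoffs_fh_window_length_types_are_supported(
#       np.array([10, 13]), np.array([pd.Timedelta(days=1)]), 3)  # -> False, mixed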
def check_window_length(window_length):
    """Validate window length."""
    if window_length is not None:
        if not is_int(window_length) or window_length < 1:
            raise ValueError(
                f"`window_length` must be a positive integer >= 1 or None, "
                f"but found: {window_length}")
    return window_length
def check_step_length(step_length):
    """Validate step length."""
    if step_length is not None:
        if not is_int(step_length) or step_length < 1:
            raise ValueError(
                f"`step_length` must be a positive integer >= 1 or None, "
                f"but found: {step_length}")
    return step_length
def check_window_properties(windows, allow_empty=False):
    """Helper function to test common properties of windows."""
    assert isinstance(windows, list)
    for window in windows:
        assert isinstance(window, np.ndarray)
        assert all(is_int(idx) for idx in window)
        assert window.ndim == 1
        if not allow_empty:
            assert len(window) > 0
def _guerrero(x, sp, bounds=None):
    r"""Returns lambda estimated by the Guerrero method [Guerrero].

    Parameters
    ----------
    x : ndarray
        Input array. Must be 1-dimensional.
    sp : int
        Seasonal periodicity value. Must be an integer >= 2.
    bounds : {None, (float, float)}, optional
        Bounds on lambda to be used in minimization.

    Returns
    -------
    lambda : float
        Lambda value that minimizes the coefficient of variation of
        variances of the time series in different periods after
        Box-Cox transformation [Guerrero].

    References
    ----------
    [Guerrero] V.M. Guerrero, "Time-series analysis supported by Power
    Transformations", Journal of Forecasting, Vol. 12, 37-48 (1993)
    https://doi.org/10.1002/for.3980120104
    """
    if sp is None or not is_int(sp) or sp < 2:
        raise ValueError(
            "Guerrero method requires an integer seasonal periodicity (sp) "
            "value >= 2.")

    x = np.asarray(x)
    if x.ndim != 1:
        raise ValueError("Data must be 1-dimensional.")

    num_obs = len(x)
    len_prefix = num_obs % sp

    x_trimmed = x[len_prefix:]
    x_mat = x_trimmed.reshape((-1, sp))
    x_mean = np.mean(x_mat, axis=1)

    # [Guerrero, Eq.(5)] uses an unbiased estimation for
    # the standard deviation
    x_std = np.std(x_mat, axis=1, ddof=1)

    def _eval_guerrero(lmb, x_std, x_mean):
        x_ratio = x_std / x_mean ** (1 - lmb)
        x_ratio_cv = variation(x_ratio)
        return x_ratio_cv

    optimizer = _make_boxcox_optimizer(bounds)
    return optimizer(_eval_guerrero, args=(x_std, x_mean))
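# Usage sketch (illustrative; `x` is a hypothetical 1-d monthly series,
# scipy.stats provides `variation`/`boxcox`, and `_make_boxcox_optimizer`
# is the internal helper used above):
#
#   lmbda = _guerrero(x, sp=12)          # lambda minimising the CV of the
#                                        # per-season std/mean ratios
#   x_bc = scipy.stats.boxcox(x, lmbda)  # apply the Box-Cox transform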
def _check_cutoffs_against_test_windows(cutoffs, windows, fh, y):
    # Only the last value of each window is checked: some windows may be
    # incomplete and miss their first values, but the last value is always
    # present.
    fh = check_fh(fh)
    if is_int(fh[-1]):
        expected = np.array([window[-1] - fh[-1] for window in windows])
    elif array_is_timedelta_or_date_offset(fh):
        expected = np.array([(y.index[window[-1]] - fh[-1]).to_datetime64()
                             for window in windows])
    else:
        raise ValueError(
            f"Provided `fh` type is not supported: {type(fh[-1])}")
    np.testing.assert_array_equal(cutoffs, expected)
def _check_cutoffs_fh_y(cutoffs: VALID_CUTOFF_TYPES,
                        fh: FORECASTING_HORIZON_TYPES,
                        y: ACCEPTED_Y_TYPES) -> None:
    """Check that combination of inputs is compatible.

    Currently, only two cases are allowed: either both `cutoffs` and `fh`
    are integers, or they are datetime or timedelta.

    Parameters
    ----------
    cutoffs : np.array or pd.Index
        Cutoff points, positive and integer- or datetime-index like.
        Type should match the type of `fh` input.
    fh : int, timedelta, list or np.array of ints or timedeltas
        Type should match the type of `cutoffs` input.
    y : pd.Series, pd.DataFrame, np.ndarray, or pd.Index
        Coerced and checked version of input y.

    Raises
    ------
    ValueError
        If max cutoff plus max `fh` is above the last observation in `y`.
    TypeError
        If `cutoffs` and `fh` type combination is not supported.
    """
    max_cutoff = np.max(cutoffs)
    max_fh = fh.max()

    msg = "`fh` is incompatible with given `cutoffs` and `y`."
    if is_int(x=max_cutoff) and is_int(x=max_fh):
        if max_cutoff + max_fh > y.shape[0]:
            raise ValueError(msg)
    elif is_datetime(x=max_cutoff) and is_timedelta(x=max_fh):
        if max_cutoff + max_fh > y.max():
            raise ValueError(msg)
    else:
        raise TypeError("Unsupported type of `cutoffs` and `fh`")
def _split(self, y: ACCEPTED_Y_TYPES) -> SPLIT_GENERATOR_TYPE:
    n_timepoints = y.shape[0]
    cutoffs = check_cutoffs(cutoffs=self.cutoffs)
    fh = _check_fh(fh=self.fh)
    window_length = check_window_length(window_length=self.window_length,
                                        n_timepoints=n_timepoints)
    _check_cutoffs_fh_window_length(cutoffs=cutoffs, fh=fh,
                                    window_length=window_length)
    _check_cutoffs_and_y(cutoffs=cutoffs, y=y)
    _check_cutoffs_fh_y(cutoffs=cutoffs, fh=fh, y=y)
    max_fh = fh.max()
    max_cutoff = np.max(cutoffs)

    for cutoff in cutoffs:
        if is_int(x=window_length) and is_int(x=cutoff):
            train_start = cutoff - window_length
        elif is_timedelta_or_date_offset(x=window_length) and is_datetime(
                x=cutoff):
            train_start = y.get_loc(max(y[0], cutoff - window_length))
        else:
            raise TypeError(f"Unsupported combination of types: "
                            f"`window_length`: {type(window_length)}, "
                            f"`cutoff`: {type(cutoff)}")

        if is_int(x=cutoff):
            training_window = np.arange(train_start, cutoff) + 1
        else:
            training_window = np.arange(train_start, y.get_loc(cutoff)) + 1

        test_window = cutoff + fh.to_numpy()
        if is_datetime(x=max_cutoff) and is_timedelta(x=max_fh):
            test_window = test_window[test_window >= y.min()]
            test_window = np.array(
                [y.get_loc(timestamp) for timestamp in test_window])
        yield training_window, test_window
def check_cutoffs(cutoffs):
    if not isinstance(cutoffs, np.ndarray):
        raise ValueError(
            f"`cutoffs` must be a np.array, but found: {type(cutoffs)}")
    if not all([is_int(cutoff) for cutoff in cutoffs]):
        raise ValueError("All cutoff points must be integers")
    if not cutoffs.ndim == 1:
        raise ValueError("`cutoffs` must be a 1-dimensional array")
    if not len(cutoffs) > 0:
        raise ValueError("Found empty `cutoffs` array")
    return np.sort(cutoffs)
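# Usage sketch (illustrative; assumes numpy is imported as np):
#
#   check_cutoffs(np.array([10, 3, 7]))  # -> array([ 3,  7, 10]), sorted
#   check_cutoffs([3, 7, 10])            # -> ValueError, must be np.ndarray
#   check_cutoffs(np.array([]))          # -> ValueError, empty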
def check_sp(sp):
    """Validate seasonal periodicity.

    Parameters
    ----------
    sp : int
        Seasonal periodicity

    Returns
    -------
    sp : int
        Validated seasonal periodicity
    """
    if sp is not None and (not is_int(sp) or sp < 1):
        raise ValueError("`sp` must be a positive integer >= 1 or None")
    return sp
def check_step_length(step_length) -> Optional[int]:
    """Validate step length.

    Parameters
    ----------
    step_length : int, timedelta, pd.DateOffset, or None
        Step length for the data set.

    Returns
    -------
    step_length : int, timedelta, pd.DateOffset, or None
        The validated step length, or None if None was passed.

    Raises
    ------
    ValueError
        If step_length is not positive, or is not an integer, timedelta,
        pd.DateOffset, or None.
    """
    if step_length is None:
        return None
    elif is_int(step_length):
        if step_length < 1:
            raise ValueError(f"`step_length` must be an integer >= 1, "
                             f"but found: {step_length}")
        else:
            return step_length
    elif is_timedelta(step_length):
        if step_length <= timedelta(0):
            raise ValueError(f"`step_length` must be a positive timedelta, "
                             f"but found: {step_length}")
        else:
            return step_length
    elif is_date_offset(step_length):
        if step_length + pd.Timestamp(0) <= pd.Timestamp(0):
            raise ValueError(
                f"`step_length` must be a positive pd.DateOffset, "
                f"but found: {step_length}")
        else:
            return step_length
    else:
        raise ValueError(
            f"`step_length` must be an integer, timedelta, pd.DateOffset, "
            f"or None, but found: {type(step_length)}")
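# Usage sketch (illustrative; assumes pandas as pd and datetime.timedelta
# are imported):
#
#   check_step_length(7)                        # -> 7
#   check_step_length(timedelta(days=7))        # -> timedelta(days=7)
#   check_step_length(pd.DateOffset(months=1))  # -> <DateOffset: months=1>
#   check_step_length(None)                     # -> None
#   check_step_length(0)                        # -> ValueError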
def _check_lags(lags):
    msg = " ".join([
        "`lags` should be provided as a positive integer scalar, or",
        "a list, tuple or np.ndarray of positive integers,",
        f"but found {type(lags)}.",
    ])
    non_positive_msg = "`lags` should be positive integers."
    if isinstance(lags, int):
        if lags <= 0:
            raise ValueError(non_positive_msg)
        lags = check_array([lags], ensure_2d=False)
    elif isinstance(lags, (list, tuple, np.ndarray)):
        if not all([is_int(lag) for lag in lags]):
            raise TypeError(msg)
        lags = check_array(lags, ensure_2d=False)
        if (lags <= 0).any():
            raise ValueError(non_positive_msg)
    else:
        raise TypeError(msg)
    return lags
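# Usage sketch (illustrative; assumes sklearn.utils.check_array and the
# `is_int` helper are in scope):
#
#   _check_lags(3)           # -> array([3])
#   _check_lags([1, 2, 12])  # -> array([ 1,  2, 12])
#   _check_lags([0, 1])      # -> ValueError, non-positive lag
#   _check_lags("3")         # -> TypeError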
def _split(self, y: pd.Index) -> SPLIT_GENERATOR_TYPE:
    n_timepoints = y.shape[0]
    window_length = check_window_length(self.window_length, n_timepoints)
    fh = _check_fh(self.fh)
    end = _get_end(y_index=y, fh=fh)

    if window_length is None:
        start = 0
    elif is_int(window_length):
        start = end - window_length + 1
    else:
        start = np.argwhere(y > y[end] - window_length).flatten()[0]

    train = self._get_train_window(y=y, train_start=start, split_point=end + 1)

    if array_is_int(fh):
        test = end + fh.to_numpy()
    else:
        test = np.array([y.get_loc(y[end] + x) for x in fh.to_pandas()])

    yield train, test
def check_step_length(step_length):
    """Validate step length.

    Parameters
    ----------
    step_length : int or None
        Step length for the data set.

    Returns
    -------
    step_length : int or None
        The validated step length, or None if None was passed.

    Raises
    ------
    ValueError
        If step_length is not None and is negative, zero, or not an integer.
    """
    if step_length is not None:
        if not is_int(step_length) or step_length < 1:
            raise ValueError(
                f"`step_length` must be a positive integer >= 1 or None, "
                f"but found: {step_length}")
    return step_length
def _check_n_splits(n_splits):
    assert is_int(n_splits)
    assert n_splits > 0
def _check_values(values: Union[VALID_FORECASTING_HORIZON_TYPES]) -> pd.Index:
    """Validate forecasting horizon values.

    Validation checks validity and also converts forecasting horizon values
    to supported pandas.Index types if possible.

    Parameters
    ----------
    values : int, list, array, certain pd.Index types
        Forecasting horizon with steps ahead to predict.

    Raises
    ------
    TypeError : Raised if `values` type is not supported

    Returns
    -------
    values : pd.Index
        Sorted and validated forecasting horizon values.
    """
    # If values are one of the supported pandas index types, we don't have to
    # do anything as the forecasting horizon directly wraps the index. Note
    # that isinstance() does not work here, because index types inherit from
    # each other, hence we check for type equality.
    if type(values) in VALID_INDEX_TYPES:
        pass

    # convert single integer to pandas index, no further checks needed
    elif is_int(values):
        return pd.Int64Index([values], dtype=int)

    elif is_timedelta_or_date_offset(values):
        return pd.Index([values])

    # convert np.array or list to pandas index
    elif is_array(values) and array_is_int(values):
        values = pd.Int64Index(values, dtype=int)

    elif is_array(values) and array_is_timedelta_or_date_offset(values):
        values = pd.Index(values)

    # otherwise, raise type error
    else:
        valid_types = (
            "int",
            "np.array",
            "list",
            *[f"pd.{index_type.__name__}" for index_type in VALID_INDEX_TYPES],
        )
        raise TypeError(
            f"Invalid `fh`. The type of the passed `fh` values is not "
            f"supported. Please use one of {valid_types}, but found: "
            f"{type(values)}")

    # check values does not contain duplicates
    if len(values) != values.nunique():
        raise ValueError(
            "Invalid `fh`. The `fh` values must not contain any duplicates.")

    # return sorted values
    return values.sort_values()
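# Usage sketch (illustrative; assumes numpy as np and pandas as pd; exact
# index reprs depend on the pandas version):
#
#   _check_values(2)                     # -> Int64Index([2])
#   _check_values([3, 1, 2])             # -> Int64Index([1, 2, 3]), sorted
#   _check_values(np.array([1, 1, 2]))   # -> ValueError, duplicates
#   _check_values(pd.Timedelta(days=7))  # -> TimedeltaIndex(['7 days'])
#   _check_values("2")                   # -> TypeError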
def check_n_splits_properties(n_splits):
    """Helper function to test common properties of n_splits."""
    assert is_int(n_splits)
    assert n_splits > 0
def check_cutoff_properties(cutoffs):
    """Helper function to test common properties of cutoffs."""
    assert isinstance(cutoffs, np.ndarray)
    assert all(is_int(cutoff) for cutoff in cutoffs)
    assert cutoffs.ndim == 1
    assert len(cutoffs) > 0