def __init__(self, models: Union[List[ForecastingModel], List[GlobalForecastingModel]]):
    raise_if_not(
        isinstance(models, list) and models,
        "Cannot instantiate EnsembleModel with an empty list of models",
        logger,
    )

    is_local_ensemble = all(
        isinstance(model, ForecastingModel)
        and not isinstance(model, GlobalForecastingModel)
        for model in models
    )
    self.is_global_ensemble = all(
        isinstance(model, GlobalForecastingModel) for model in models
    )

    raise_if_not(
        is_local_ensemble or self.is_global_ensemble,
        "All models must either be GlobalForecastingModel instances, or none of them should be.",
        logger,
    )

    raise_if(
        any([m._fit_called for m in models]),
        "Cannot instantiate EnsembleModel with trained/fitted models. "
        "Consider resetting all models with `my_model.untrained_model()`",
        logger,
    )

    super().__init__()
    self.models = models
    self.is_single_series = None
def fill_missing_values(series: TimeSeries, fill: Union[str, float] = 'auto',
                        **interpolate_kwargs) -> TimeSeries:
    """
    Fills missing values in the provided time series.

    Parameters
    ----------
    series
        The time series for which to fill missing values.
    fill
        The value used to replace the missing values.
        If set to 'auto', will auto-fill missing values using the `pandas.DataFrame.interpolate()` method.
    interpolate_kwargs
        Keyword arguments for `pandas.DataFrame.interpolate()`, only used when `fill` is set to 'auto'.
        See `the documentation
        <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.interpolate.html>`_
        for the list of supported parameters.

    Returns
    -------
    TimeSeries
        A new TimeSeries with all missing values filled according to the rules above.
    """
    raise_if_not(isinstance(fill, str) or isinstance(fill, float),
                 "`fill` should either be a string or a float",
                 logger)

    raise_if(isinstance(fill, str) and fill != 'auto',
             "invalid string for `fill`: can only be set to 'auto'",
             logger)

    if fill == 'auto':
        return _auto_fill(series, **interpolate_kwargs)
    return _const_fill(series, fill)
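# A minimal usage sketch for `fill_missing_values` (assuming the darts API documented
# above); the series values and the constant-fill value are illustrative only.
import numpy as np
import pandas as pd
from darts import TimeSeries
from darts.utils.missing_values import fill_missing_values

values = pd.Series([1.0, 2.0, np.nan, np.nan, 5.0],
                   index=pd.date_range("2020-01-01", periods=5, freq="D"))
series = TimeSeries.from_series(values)

# default 'auto' delegates to pandas.DataFrame.interpolate()
filled = fill_missing_values(series)

# alternatively, replace missing values with a constant
filled_const = fill_missing_values(series, fill=0.0)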
def __init__(self,
             fill: Union[str, float] = 'auto',
             name: str = "MissingValuesFiller",
             n_jobs: int = 1,
             verbose: bool = False):
    """
    Data transformer to fill missing values from a (sequence of) TimeSeries

    Parameters
    ----------
    fill
        The value used to replace the missing values.
        If set to 'auto', will auto-fill missing values using the `pandas.DataFrame.interpolate()` method.
    name
        A specific name for the transformer
    n_jobs
        The number of jobs to run in parallel. Parallel jobs are created only when a `Sequence[TimeSeries]`
        is passed as input to a method, parallelising operations regarding different `TimeSeries`.
        Defaults to `1` (sequential). Setting the parameter to `-1` means using all the available processors.
        Note: for a small amount of data, the parallelisation overhead could end up increasing the total
        required amount of time.
    verbose
        Optionally, whether to print operations progress
    """
    raise_if_not(
        isinstance(fill, str) or isinstance(fill, float),
        "`fill` should either be a string or a float",
        logger)

    raise_if(
        isinstance(fill, str) and fill != 'auto',
        "invalid string for `fill`: can only be set to 'auto'",
        logger)

    super().__init__(name=name, n_jobs=n_jobs, verbose=verbose)
    self._fill = fill
def _supports_range_index(self) -> bool:
    raise_if(
        self.trend and self.trend != "c",
        "'trend' is not None. Range indexing is not supported in that case.",
        logger,
    )
    return True
def ts_fit(series: TimeSeries,
           lmbda: Optional[Union[float, Sequence[float]]],
           method,
           *args,
           **kwargs) -> Union[Sequence[float], pd.core.series.Series]:
    component_mask = kwargs.get("component_mask", None)

    if lmbda is None:
        # Compute optimal lmbda for each dimension of the time series. In this case, the return type is
        # an ndarray and not a Sequence
        vals = BoxCox._reshape_in(series, component_mask=component_mask)
        lmbda = np.apply_along_axis(boxcox_normmax, axis=0, arr=vals, method=method)
    elif isinstance(lmbda, Sequence):
        raise_if(
            len(lmbda) != series.width,
            "lmbda should have one value per dimension (ie. column or variable) of the time series",
            logger,
        )
    else:
        # Replicate lmbda to match dimensions of the time series
        lmbda = [lmbda] * series.width

    return lmbda
def _build_train_dataset(
    self,
    target: Sequence[TimeSeries],
    past_covariates: Optional[Sequence[TimeSeries]],
    future_covariates: Optional[Sequence[TimeSeries]],
    max_samples_per_ts: Optional[int],
) -> MixedCovariatesSequentialDataset:
    raise_if(
        future_covariates is None and not self.add_relative_index,
        "TFTModel requires future covariates. The model applies multi-head attention queries on future "
        "inputs. Consider specifying a future encoder with `add_encoders` or setting `add_relative_index` "
        "to `True` at model creation (read TFT model docs for more information). "
        "These will automatically generate `future_covariates` from indexes.",
        logger,
    )

    return MixedCovariatesSequentialDataset(
        target_series=target,
        past_covariates=past_covariates,
        future_covariates=future_covariates,
        input_chunk_length=self.input_chunk_length,
        output_chunk_length=self.output_chunk_length,
        max_samples_per_ts=max_samples_per_ts,
    )
def __init__(self,
             forecasting_models: List[ForecastingModel],
             regression_train_n_points: int,
             regression_model=None):
    """
    Class for ensemble models using a regression model for ensembling individual models' predictions.
    The provided regression model must implement fit() and predict() methods
    (e.g. scikit-learn regression models). Note that here the regression model is used to learn how to
    best ensemble the individual forecasting models' forecasts. It is not the same usage of regression
    as in `RegressionModel`, where the regression model is used to produce forecasts based on the
    lagged series.

    Parameters
    ----------
    forecasting_models
        List of forecasting models whose predictions to ensemble
    regression_train_n_points
        The number of points to use to train the regression model
    regression_model
        Any regression model with predict() and fit() methods (e.g. from scikit-learn)
        Default: `darts.models.LinearRegressionModel(fit_intercept=False)`
    """
    super().__init__(forecasting_models)

    if regression_model is None:
        regression_model = LinearRegressionModel(lags_exog=0, fit_intercept=False)
    else:
        # wrap the user-provided (e.g. scikit-learn) regressor in a RegressionModel
        regression_model = RegressionModel(lags_exog=0, model=regression_model)

    raise_if(
        regression_model.lags is not None and regression_model.lags_exog != [0],
        ("`lags` of regression model must be `None` and `lags_exog` must be [0]. Given: {} and {}"
         .format(regression_model.lags, regression_model.lags_exog)))

    self.regression_model = regression_model
    self.train_n_points = regression_train_n_points
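# A brief usage sketch of RegressionEnsembleModel (model names from darts.models;
# the dataset and the number of regression training points are illustrative).
from darts.datasets import AirPassengersDataset
from darts.models import NaiveDrift, NaiveSeasonal, RegressionEnsembleModel

series = AirPassengersDataset().load()

ensemble = RegressionEnsembleModel(
    forecasting_models=[NaiveSeasonal(K=12), NaiveDrift()],
    regression_train_n_points=12,  # last 12 points train the ensembling regressor
)
ensemble.fit(series)
forecast = ensemble.predict(6)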
def fit(self, series: TimeSeries):
    super().fit(series)
    series._assert_univariate()
    series = self.training_series

    if self.version == "tsb":
        self.forecast_val = self.method(
            series.values(copy=False),
            h=1,
            future_xreg=None,
            alpha_d=self.alpha_d,
            alpha_p=self.alpha_p,
        )
    elif self.version == "sba":
        try:
            self.forecast_val = self.method(series.values(copy=False), h=1, future_xreg=None)
        except errors.TypingError:
            raise_if(
                True,
                '"sba" version is not supported with this version of statsforecast.',
            )
    else:
        self.forecast_val = self.method(series.values(copy=False), h=1, future_xreg=None)
    return self
def _generate_index(
    start: Optional[Union[pd.Timestamp, int]] = None,
    end: Optional[Union[pd.Timestamp, int]] = None,
    length: Optional[int] = None,
    freq: str = "D",
    name: str = None,
) -> Union[pd.DatetimeIndex, pd.RangeIndex]:
    """Returns an index with a given start point and length. Either a pandas DatetimeIndex with given
    frequency or a pandas RangeIndex.

    Parameters
    ----------
    start
        The start of the returned index. If a pandas Timestamp is passed, the index will be a pandas
        DatetimeIndex. If an integer is passed, the index will be a pandas RangeIndex index. Works only
        with either `length` or `end`.
    end
        Optionally, the end of the returned index. Works only with either `start` or `length`. If `start`
        is set, `end` must be of same type as `start`. Else, it can be either a pandas Timestamp or an
        integer.
    length
        Optionally, the length of the returned index. Works only with either `start` or `end`.
    freq
        The time difference between two adjacent entries in the returned index. Only effective if `start`
        is a pandas Timestamp. A DateOffset alias is expected; see
        `docs <https://pandas.pydata.org/pandas-docs/stable/user_guide/TimeSeries.html#dateoffset-objects>`_.
        The freq is optional for generating an integer index.
    """
    constructors = [
        arg_name
        for arg, arg_name in zip([start, end, length], ["start", "end", "length"])
        if arg is not None
    ]
    raise_if(
        len(constructors) != 2,
        "index can only be generated with exactly two of the following parameters: [`start`, `end`, `length`]. "
        f"Observed parameters: {constructors}. For generating an index with `end` and `length` consider setting "
        f"`start` to None.",
        logger,
    )
    raise_if(
        end is not None and start is not None and type(start) != type(end),
        "index generation with `start` and `end` requires equal object types of `start` and `end`",
        logger,
    )

    if isinstance(start, pd.Timestamp) or isinstance(end, pd.Timestamp):
        index = pd.date_range(start=start, end=end, periods=length, freq=freq, name=name)
    else:  # int
        index = pd.RangeIndex(
            start=start if start is not None else end - length + 1,
            stop=end + 1 if end is not None else start + length,
            step=1,
            name=name,
        )
    return index
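# A short illustrative sketch of how `_generate_index` behaves (it is an internal
# helper, so calling it directly is an assumption); exactly two of `start`, `end`
# and `length` must be provided.
import pandas as pd

# datetime index: 5 daily entries starting 2020-01-01
idx_dt = _generate_index(start=pd.Timestamp("2020-01-01"), length=5, freq="D")

# integer index: RangeIndex covering 10..14 (inclusive)
idx_int = _generate_index(start=10, length=5)

# end + length: leave `start` as None
idx_end = _generate_index(end=pd.Timestamp("2020-01-10"), length=5, freq="D")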
def __init__(self, version: str = "classic", alpha_d: float = None, alpha_p: float = None):
    """An implementation of the `Croston method <https://otexts.com/fpp3/counts.html>`_ for intermittent
    count series.

    Relying on the implementation of `Statsforecasts package <https://github.com/Nixtla/statsforecast>`_.

    Parameters
    ----------
    version
        - "classic" corresponds to classic Croston.
        - "optimized" corresponds to optimized classic Croston, which searches for the optimal ``alpha``
          smoothing parameter and can take longer to run. Otherwise, a fixed value of ``alpha=0.1`` is used.
        - "sba" corresponds to the adjustment of the Croston method known as the Syntetos-Boylan
          Approximation [1]_.
        - "tsb" corresponds to the adjustment of the Croston method proposed by Teunter, Syntetos and
          Babai [2]_. In this case, `alpha_d` and `alpha_p` must be set.
    alpha_d
        For the "tsb" version, the alpha smoothing parameter to apply on demand.
    alpha_p
        For the "tsb" version, the alpha smoothing parameter to apply on probability.

    References
    ----------
    .. [1] Aris A. Syntetos and John E. Boylan. The accuracy of intermittent demand estimates.
           International Journal of Forecasting, 21(2):303 – 314, 2005.
    .. [2] Ruud H. Teunter, Aris A. Syntetos, and M. Zied Babai. Intermittent demand: Linking forecasting
           to inventory obsolescence. European Journal of Operational Research, 214(3):606 – 615, 2011.
    """
    super().__init__()
    raise_if_not(
        version.lower() in ["classic", "optimized", "sba", "tsb"],
        'The provided "version" parameter must be set to "classic", "optimized", "sba" or "tsb".',
    )

    if version == "classic":
        self.method = croston_classic
    elif version == "optimized":
        self.method = croston_optimized
    elif version == "sba":
        self.method = croston_sba
    else:
        raise_if(
            alpha_d is None or alpha_p is None,
            'alpha_d and alpha_p must be specified when using "tsb".',
        )
        self.method = croston_tsb
        self.alpha_d = alpha_d
        self.alpha_p = alpha_p

    self.version = version
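# A minimal usage sketch for the Croston model (class name as in darts.models;
# the demand values and forecast horizon are illustrative).
import numpy as np
from darts import TimeSeries
from darts.models import Croston

# an intermittent demand series: mostly zeros with occasional counts
values = np.array([0, 0, 3, 0, 0, 0, 2, 0, 0, 5, 0, 0, 0, 1, 0, 0], dtype=float)
series = TimeSeries.from_values(values)

model = Croston(version="tsb", alpha_d=0.2, alpha_p=0.2)
model.fit(series)
forecast = model.predict(4)  # flat forecast of the estimated demand rate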
def _supports_range_index(self) -> bool:
    """Prophet does not support integer range index."""
    raise_if(
        True,
        "Prophet does not support integer range index. The index of the TimeSeries must be of type "
        "pandas.DatetimeIndex",
        logger,
    )
    return False
def remove_seasonality(
    ts: TimeSeries,
    freq: int = None,
    model: SeasonalityMode = SeasonalityMode.MULTIPLICATIVE,
    method: str = "naive",
    **kwargs,
) -> TimeSeries:
    """
    Adjusts the TimeSeries `ts` for a seasonality of order `frequency` using the `model` decomposition.

    Parameters
    ----------
    ts
        The TimeSeries to adjust.
    freq
        The seasonality period to use.
    model
        The type of decomposition to use.
        Must be a `from darts import SeasonalityMode` Enum member.
        Either SeasonalityMode.MULTIPLICATIVE or SeasonalityMode.ADDITIVE.
        Defaults to SeasonalityMode.MULTIPLICATIVE.
    method
        The method to be used to decompose the series.
        - "naive" : Seasonal decomposition using moving averages [1]_.
        - "STL" : Season-Trend decomposition using LOESS [2]_. Only compatible with ``ADDITIVE`` model type.
        Defaults to "naive".
    kwargs
        Other keyword arguments are passed down to the decomposition method.

    Returns
    -------
    TimeSeries
        A new TimeSeries instance that corresponds to the seasonality-adjusted 'ts'.

    References
    ----------
    .. [1] https://www.statsmodels.org/devel/generated/statsmodels.tsa.seasonal.seasonal_decompose.html
    .. [2] https://www.statsmodels.org/devel/generated/statsmodels.tsa.seasonal.STL.html
    """
    ts._assert_univariate()
    raise_if_not(
        model is not SeasonalityMode.NONE,
        "The model must be either MULTIPLICATIVE or ADDITIVE.",
    )
    raise_if(
        model not in [SeasonalityMode.ADDITIVE, ModelMode.ADDITIVE] and method == "STL",
        f"Only ADDITIVE seasonality is compatible with the STL method. Current model is {model}.",
        logger,
    )

    _, seasonality = extract_trend_and_seasonality(ts, freq, model, method, **kwargs)
    new_ts = remove_from_series(ts, seasonality, model)
    return new_ts
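# A brief usage sketch for `remove_seasonality` (assuming darts' AirPassengers dataset
# and the public API documented above; the 12-month period is illustrative).
from darts import SeasonalityMode
from darts.datasets import AirPassengersDataset
from darts.utils.statistics import remove_seasonality

series = AirPassengersDataset().load()

# remove the yearly (12-month) multiplicative seasonality
adjusted = remove_seasonality(series, freq=12, model=SeasonalityMode.MULTIPLICATIVE)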
def fit(self, training_series: TimeSeries, target_series: Optional[TimeSeries] = None) -> None:
    super().fit(training_series, target_series)

    # spare train_n_points points to serve as regression target
    raise_if(
        len(self.training_series) <= self.regression_model.train_n_points,
        "regression_train_n_points parameter too big (greater than or equal to"
        " the number of points in training_series)",
        logger)

    forecast_training = self.training_series[:-self.regression_model.train_n_points]
    forecast_target = self.target_series[:-self.regression_model.train_n_points]
    regression_target = self.target_series[-self.regression_model.train_n_points:]

    # fit the forecasting models
    for model in self.models:
        if isinstance(model, UnivariateForecastingModel):
            model.fit(forecast_training)
        else:
            model.fit(forecast_training, forecast_target)

    # predict train_n_points points for each model
    predictions = []
    for model in self.models:
        predictions.append(model.predict(self.regression_model.train_n_points))

    # train the regression model on the individual models' predictions
    self.regression_model.fit(train_features=predictions, train_target=regression_target)

    # prepare the forecasting models for further predicting by fitting them with the entire data

    # Some models (incl. Neural-Network based models) may need to be 'reset'
    # to allow being retrained from scratch
    self.models = [
        model.untrained_model() if hasattr(model, 'untrained_model') else model
        for model in self.models
    ]

    # fit the forecasting models
    for model in self.models:
        if isinstance(model, UnivariateForecastingModel):
            model.fit(self.training_series)
        else:
            model.fit(self.training_series, self.target_series)
def fit(self,
        data: TimeSeries,
        lmbda: Optional[Union[float, Sequence[float]]] = None,
        optim_method='mle') -> 'BoxCox':
    """
    Sets the `lmbda` parameter value.

    Parameters
    ----------
    data
        The time series to fit on
    lmbda
        If None is given, will automatically find an optimal value of lmbda (for each dimension of the
        time series) using `scipy.stats.boxcox_normmax` with `method=optim_method`.
        If a single float is given, the same lmbda value will be used for all dimensions of the series.
        Also allows to specify a different lmbda value for each dimension of the time series by passing
        a sequence of values.
    optim_method
        Specifies which method to use to find an optimal value for the lmbda parameter.
        Either 'mle' or 'pearsonr'.

    Returns
    -------
    Fitted transformer (self)
    """
    super().fit(data)

    raise_if(
        not isinstance(optim_method, str) or optim_method not in ['mle', 'pearsonr'],
        "optim_method parameter must be either 'mle' or 'pearsonr'",
        logger)

    if lmbda is None:
        # Compute optimal lmbda for each dimension of the time series
        lmbda = data._df.apply(boxcox_normmax, method=optim_method)
    elif isinstance(lmbda, Sequence):
        raise_if(
            len(lmbda) != data.width,
            "lmbda should have one value per dimension (ie. column or variable) of the time series",
            logger)
    else:
        # Replicate lmbda to match dimensions of the time series
        lmbda = [lmbda] * data.width

    self._lmbda = lmbda
    return self
def _fit_iterator(
    self, series: Sequence[TimeSeries]
) -> Iterator[Tuple[TimeSeries, Optional[Union[Sequence[float], float]]]]:
    if isinstance(self._lmbda, Sequence) and isinstance(self._lmbda[0], Sequence):
        # CASE 0: Sequence[Sequence[float]]
        raise_if(
            len(self._lmbda) != len(series),
            "with multiple time series the number of lmbdas sequences must equal the number of time series",
            logger)
        return zip(series, self._lmbda)
    else:
        # CASE 1: Sequence[float], float, None. Replicating the same value for each TS
        lmbda_gen = (self._lmbda for _ in range(len(series)))
        return zip(series, lmbda_gen)
def ts_fit(series: TimeSeries,
           lmbda: Optional[Union[float, Sequence[float]]],
           method) -> Union[Sequence[float], pd.core.series.Series]:
    if lmbda is None:
        # Compute optimal lmbda for each dimension of the time series. In this case, the return type is
        # a pd.core.series.Series, which does not inherit from collections.abc.Sequence
        lmbda = series._df.apply(boxcox_normmax, method=method)
    elif isinstance(lmbda, Sequence):
        raise_if(
            len(lmbda) != series.width,
            "lmbda should have one value per dimension (ie. column or variable) of the time series",
            logger)
    else:
        # Replicate lmbda to match dimensions of the time series
        lmbda = [lmbda] * series.width

    return lmbda
def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None:
    # by default our models are initialized as float32. For other dtypes, we need to cast to the correct
    # precision before parameters are loaded by PyTorch-Lightning
    dtype = checkpoint["model_dtype"]
    if dtype == torch.float16:
        self.half()
    elif dtype == torch.float32:
        self.float()
    elif dtype == torch.float64:
        self.double()
    else:
        raise_if(
            True,
            f"Trying to load dtype {dtype}. Loading for this type is not implemented yet. Please report this "
            f"issue on https://github.com/unit8co/darts",
            logger,
        )
def __init__(self,
             name: str = "BoxCox",
             lmbda: Optional[Union[float, Sequence[float], Sequence[Sequence[float]]]] = None,
             optim_method='mle',
             n_jobs: int = 1,
             verbose: bool = False):
    """
    Box-Cox data transformer.
    See https://otexts.com/fpp2/transformations.html#mathematical-transformations for more information.

    Parameters
    ----------
    name
        A specific name for the transformer
    lmbda
        If None is given, will automatically find an optimal value of lmbda (for each dimension of the
        time series, for each time series) using `scipy.stats.boxcox_normmax` with `method=optim_method`.
        If a single float is given, the same lmbda value will be used for all dimensions of the series,
        for all the series.
        Also allows to specify a different lmbda value for each dimension of the time series by passing
        a sequence of values (or a sequence of sequences of values in case of multiple time series).
    optim_method
        Specifies which method to use to find an optimal value for the lmbda parameter.
        Either 'mle' or 'pearsonr'. Ignored if `lmbda` is not None.
    n_jobs
        The number of jobs to run in parallel. Parallel jobs are created only when a `Sequence[TimeSeries]`
        is passed as input to a method, parallelising operations regarding different `TimeSeries`.
        Defaults to `1` (sequential). Setting the parameter to `-1` means using all the available processors.
        Note: for a small amount of data, the parallelisation overhead could end up increasing the total
        required amount of time.
    verbose
        Optionally, whether to print operations progress
    """
    super().__init__(name=name, n_jobs=n_jobs, verbose=verbose)

    raise_if(
        not isinstance(optim_method, str) or optim_method not in ['mle', 'pearsonr'],
        "optim_method parameter must be either 'mle' or 'pearsonr'",
        logger)

    self._lmbda = lmbda
    self._optim_method = optim_method
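# A minimal usage sketch of the BoxCox transformer (assuming darts' dataprocessing
# API; the dataset is illustrative).
from darts.datasets import AirPassengersDataset
from darts.dataprocessing.transformers import BoxCox

series = AirPassengersDataset().load()

transformer = BoxCox()  # lmbda is found automatically via scipy's boxcox_normmax
transformed = transformer.fit_transform(series)
restored = transformer.inverse_transform(transformed)  # back to the original scale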
def remove_trend(
    ts: TimeSeries,
    model: ModelMode = ModelMode.MULTIPLICATIVE,
    method: str = "naive",
    **kwargs,
) -> TimeSeries:
    """
    Adjusts the TimeSeries `ts` for a trend using the `model` decomposition.

    Parameters
    ----------
    ts
        The TimeSeries to adjust.
    model
        The type of decomposition to use.
        Must be a `from darts import ModelMode` Enum member.
        Either ModelMode.MULTIPLICATIVE or ModelMode.ADDITIVE.
        Defaults to ModelMode.MULTIPLICATIVE.
    method
        The method to be used to decompose the series.
        - "naive" : Seasonal decomposition using moving averages [1]_.
        - "STL" : Season-Trend decomposition using LOESS [2]_. Only compatible with ``ADDITIVE`` model type.
        Defaults to "naive".
    kwargs
        Other keyword arguments are passed down to the decomposition method.

    Returns
    -------
    TimeSeries
        A new TimeSeries instance that corresponds to the trend-adjusted 'ts'.
    """
    ts._assert_univariate()
    raise_if(
        model not in [SeasonalityMode.ADDITIVE, ModelMode.ADDITIVE] and method == "STL",
        f"Only ADDITIVE seasonality is compatible with the STL method. Current model is {model}.",
        logger,
    )

    trend, _ = extract_trend_and_seasonality(ts, model=model, method=method, **kwargs)
    new_ts = remove_from_series(ts, trend, model)
    return new_ts
def __init__(self, n: int, m: int, ranges: np.ndarray = None):
    """
    Parameters
    ----------
    n
        The width of the window, must be equal to the length of series1
    m
        The height of the window, must be equal to the length of series2
    ranges
        Ranges of active cells within a column [[start_column0, end_column0], ...]
        with shape (n, 2) and where start >= 0 and end <= m.
    """
    self.n = n
    self.m = m

    if ranges is not None:
        raise_if_not(
            ranges.shape == (n, 2),
            f"Expects a 2d array with [start, end] for each column and shape = ({n}, 2)",
        )

        ranges = np.insert(ranges, 0, [0, 1], axis=0)

        start = ranges[:, 0]
        end = ranges[:, 1]

        raise_if(np.any(start < 0), "Start must be >= 0")
        raise_if(np.any(end > m), "End must be <= m")

        diff = np.maximum(end - start, 0)
        self.length = np.sum(diff)

        ranges[1:] += 1
        ranges = ranges.flatten()
    else:
        ranges = np.zeros((n + 1) * 2, dtype=int)
        ranges[0::2] = self.m  # start
        ranges[1::2] = 0  # end

        ranges = array.array("i", ranges)
        ranges[0] = 0
        ranges[1] = 1
        self.length = 1

    self.column_ranges = array.array("i", ranges)
def __init__(self,
             forecasting_models: List[ForecastingModel],
             regression_train_n_points: int,
             regression_model=LinearRegression(n_jobs=-1, fit_intercept=False)):
    """
    Class for ensemble models using a regression model for ensembling individual models' predictions.
    The provided regression model must implement fit() and predict() methods
    (e.g. scikit-learn regression models).

    Parameters
    ----------
    forecasting_models
        List of forecasting models whose predictions to ensemble
    regression_train_n_points
        The number of points to use to train the regression model
    regression_model
        Any regression model with predict() and fit() methods (e.g. from scikit-learn)
        Default: `sklearn.linear_model.LinearRegression(n_jobs=-1, fit_intercept=False)`
    """
    super().__init__(forecasting_models)

    # wrap provided regression_model in a StandardRegressionModel (if not already the case)
    if isinstance(regression_model, StandardRegressionModel):
        # raise exception if train_n_points value is ambiguous
        model_train_n_points = regression_model.train_n_points
        raise_if(
            model_train_n_points is not None and regression_train_n_points != model_train_n_points,
            "Provided StandardRegressionModel.train_n_points parameter doesn't match specified"
            " regression_train_n_points parameter.",
            logger)

        # if it was None, set regression_model.train_n_points to regression_train_n_points
        regression_model.train_n_points = regression_train_n_points
    else:
        regression_model = StandardRegressionModel(regression_train_n_points, regression_model)

    self.regression_model = regression_model
def __init__(self,
             fill: Union[str, float] = 'auto',
             name: str = "MissingValuesFiller"):
    """
    Data transformer to fill missing values from time series

    Parameters
    ----------
    fill
        The value used to replace the missing values.
        If set to 'auto', will auto-fill missing values using the `pandas.DataFrame.interpolate()` method.
    name
        A specific name for the transformer
    """
    raise_if_not(isinstance(fill, str) or isinstance(fill, float),
                 "`fill` should either be a string or a float",
                 logger)

    raise_if(isinstance(fill, str) and fill != 'auto',
             "invalid string for `fill`: can only be set to 'auto'",
             logger)

    super().__init__(name)
    self._fill = fill
def fit(self, series: TimeSeries) -> None:
    super().fit(series)

    # spare train_n_points points to serve as regression target
    raise_if(
        len(self.training_series) <= self.train_n_points,
        "regression_train_n_points parameter too big (must be smaller or equal"
        " to the number of points in training_series)",
        logger)

    forecast_training = self.training_series[:-self.train_n_points]
    regression_target = self.training_series[-self.train_n_points:]

    # fit the forecasting models
    for model in self.models:
        model.fit(forecast_training)

    # predict train_n_points points for each model
    predictions = self.models[0].predict(self.train_n_points)
    for model in self.models[1:]:
        predictions = predictions.stack(model.predict(self.train_n_points))

    # train the regression model on the individual models' predictions
    self.regression_model.fit(series=regression_target, exog=predictions)

    # prepare the forecasting models for further predicting by fitting them with the entire data

    # Some models (incl. Neural-Network based models) may need to be 'reset'
    # to allow being retrained from scratch
    self.models = [
        model.untrained_model() if hasattr(model, 'untrained_model') else model
        for model in self.models
    ]

    # fit the forecasting models
    for model in self.models:
        model.fit(self.training_series)
def fit(
    self,
    series: Union[TimeSeries, Sequence[TimeSeries]],
    past_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
    future_covariates: Optional[Union[TimeSeries, Sequence[TimeSeries]]] = None,
):
    """
    Fits the model on the provided series.
    Note that `EnsembleModel.fit()` does NOT call `fit()` on each of its constituent forecasting models.
    It is left to classes inheriting from EnsembleModel to do so appropriately when overriding `fit()`
    """
    raise_if(
        not self.is_global_ensemble and not isinstance(series, TimeSeries),
        "The models are not GlobalForecastingModel's and do not support training on multiple series.",
        logger,
    )
    raise_if(
        not self.is_global_ensemble and past_covariates is not None,
        "The models are not GlobalForecastingModel's and do not support past covariates.",
        logger,
    )

    self.is_single_series = isinstance(series, TimeSeries)

    # check that if the series is a single series, then the covariates are as well, and vice versa
    error = False

    if past_covariates is not None:
        error = self.is_single_series != isinstance(past_covariates, TimeSeries)

    if future_covariates is not None:
        error = self.is_single_series != isinstance(future_covariates, TimeSeries)

    raise_if(
        error,
        "Both series and covariates have to be either univariate or multivariate.",
        logger,
    )

    super().fit(series, past_covariates, future_covariates)

    return self
def _extend_time_index_until(
    time_index: Union[pd.DatetimeIndex, pd.RangeIndex],
    until: Optional[Union[int, str, pd.Timestamp]],
    add_length: int,
) -> pd.DatetimeIndex:

    if not add_length and not until:
        return time_index

    raise_if(bool(add_length) and bool(until), "set only one of add_length and until")

    end = time_index[-1]
    freq = time_index.freq

    if add_length:
        raise_if_not(
            add_length >= 0,
            f"Expected `add_length`, by which to extend the time series, to be positive, got {add_length}",
        )

        try:
            end += add_length * freq
        except pd.errors.OutOfBoundsDatetime:
            raise_log(
                ValueError(
                    f"the add operation between {end} and {add_length * freq} will overflow"
                ),
                logger,
            )
    else:
        datetime_index = isinstance(time_index, pd.DatetimeIndex)

        if datetime_index:
            raise_if_not(
                isinstance(until, (str, pd.Timestamp)),
                "Expected a valid timestamp for a TimeSeries indexed by DatetimeIndex "
                f"for parameter `until`, got {type(until)}",
                logger,
            )
        else:
            raise_if_not(
                isinstance(until, int),
                "Expected an integer for a TimeSeries indexed by RangeIndex "
                f"for parameter `until`, got {type(until)}",
                logger,
            )

        timestamp = pd.Timestamp(until) if datetime_index else until

        raise_if_not(
            timestamp > end,
            f"Expected until, {timestamp} to lie past end of time index {end}",
        )

        ahead = timestamp - end
        raise_if_not(
            (ahead % freq) == pd.Timedelta(0),
            f"End date must correspond with frequency {freq} of the time axis",
            logger,
        )

        end = timestamp

    new_time_index = pd.date_range(start=time_index[0], end=end, freq=freq)
    return new_time_index
def _process_input_encoders(self, params: Dict) -> Tuple[List, List]:
    """Processes input and returns two lists of tuples `(encoder_id, attribute)` from relevant encoder
    parameters at model creation.

    Parameters
    ----------
    params
        The `add_encoders` dict used at model creation. Must follow this convention:
            `{encoder keyword: {temporal keyword: List[attributes]}}`

    Tuples of `(encoder_id, attribute)` are extracted from `add_encoders` to instantiate the
    `SingleEncoder` objects:

    * The `encoder_id` is extracted as follows:
        str(encoder_kw) + str(temporal_kw) -> 'cyclic' + 'past' -> `encoder_id` = 'cyclic_past'
        The `encoder_id` is used to map the parameters with the corresponding `SingleEncoder` objects.
    * The `attribute` is extracted from the values given under `temporal_kw`:
        `attribute` = 'month'
        ...
        The `attribute` tells the `SingleEncoder` which attribute of the index to encode.

    Raises
    ------
    ValueError
        1) if the outermost key is other than (`past`, `future`, `absolute`)
        2) if the innermost values are other than type `str` or `Sequence`
    """
    if not params:
        return [], []

    # check input for invalid encoder types
    invalid_encoders = [
        enc for enc in params if enc not in ENCODER_KEYS + TRANSFORMER_KEYS
    ]
    raise_if(
        len(invalid_encoders) > 0,
        f"Encountered invalid encoder types `{invalid_encoders}` in `add_encoders` parameter at model "
        f"creation. Supported encoder types are: `{ENCODER_KEYS + TRANSFORMER_KEYS}`.",
        logger,
    )

    encoders = {
        enc: params.get(enc, None) for enc in ENCODER_KEYS if params.get(enc, None)
    }

    # check input for invalid temporal types
    invalid_time_params = list()
    for encoder, t_types in encoders.items():
        invalid_time_params += [
            t_type for t_type in t_types.keys() if t_type not in VALID_TIME_PARAMS
        ]

    raise_if(
        len(invalid_time_params) > 0,
        f"Encountered invalid temporal types `{invalid_time_params}` in `add_encoders` parameter at model "
        f"creation. Supported temporal types are: `{VALID_TIME_PARAMS}`.",
        logger,
    )

    # convert into tuples of (encoder string identifier, encoder attribute)
    past_encoders, future_encoders = list(), list()
    for enc, enc_params in encoders.items():
        for enc_time, enc_attr in enc_params.items():
            raise_if_not(
                isinstance(enc_attr, VALID_ENCODER_DTYPES),
                f"Encountered value `{enc_attr}` of invalid type `{type(enc_attr)}` for encoder "
                f"`{enc}` in `add_encoders` at model creation. Supported data types are: "
                f"`{VALID_ENCODER_DTYPES}`.",
                logger,
            )
            attrs = [enc_attr] if isinstance(enc_attr, str) else enc_attr
            for attr in attrs:
                encoder_id = "_".join([enc, enc_time])
                if enc_time == PAST:
                    past_encoders.append((encoder_id, attr))
                else:
                    future_encoders.append((encoder_id, attr))

    for temp_enc, takes_temp, temp in [
        (past_encoders, self.takes_past_covariates, "past"),
        (future_encoders, self.takes_future_covariates, "future"),
    ]:
        if temp_enc and not takes_temp:
            logger.warning(
                f"Specified {temp} encoders in `add_encoders` at model creation but model does not "
                f"accept {temp} covariates. {temp} encoders will be ignored."
            )

    past_encoders = past_encoders if self.takes_past_covariates else []
    future_encoders = future_encoders if self.takes_future_covariates else []

    return past_encoders, future_encoders
def datetime_attribute_timeseries(
    time_index: Union[pd.DatetimeIndex, TimeSeries],
    attribute: str,
    one_hot: bool = False,
    cyclic: bool = False,
    until: Optional[Union[int, str, pd.Timestamp]] = None,
    add_length: int = 0,
    dtype=np.float64,
) -> TimeSeries:
    """
    Returns a new TimeSeries with index `time_index` and one or more dimensions containing
    (optionally one-hot encoded or cyclic encoded) pd.DatetimeIndex attribute information derived
    from the index.

    Parameters
    ----------
    time_index
        Either a `pd.DatetimeIndex` attribute which will serve as the basis of the new column(s), or
        a `TimeSeries` whose time axis will serve this purpose.
    attribute
        An attribute of `pd.DatetimeIndex`, or `week` / `weekofyear` / `week_of_year` - e.g. "month",
        "weekday", "day", "hour", "minute", "second". See all available attributes in
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.html#pandas.DatetimeIndex.
    one_hot
        Boolean value indicating whether to add the specified attribute as a one hot encoding
        (results in more columns).
    cyclic
        Boolean value indicating whether to add the specified attribute as a cyclic encoding.
        Alternative to one_hot encoding, enable only one of the two.
        (adds 2 columns, corresponding to sin and cos transformation)
    until
        Extend the time_index up until timestamp for datetime indexed series
        and int for range indexed series, should match or exceed forecasting window.
    add_length
        Extend the time_index by add_length, should match or exceed forecasting window.
        Set only one of until and add_length.
    dtype
        The desired NumPy dtype (np.float32 or np.float64) for the resulting series

    Returns
    -------
    TimeSeries
        New datetime attribute TimeSeries instance.
    """
    if isinstance(time_index, TimeSeries):
        time_index = time_index.time_index

    time_index = _extend_time_index_until(time_index, until, add_length)

    raise_if_not(
        hasattr(pd.DatetimeIndex, attribute)
        or (attribute in ["week", "weekofyear", "week_of_year"]),
        f"attribute `{attribute}` needs to be an attribute of pd.DatetimeIndex. "
        "See all available attributes in "
        "https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.html#pandas.DatetimeIndex",
        logger,
    )

    raise_if(one_hot and cyclic, "set only one of one_hot or cyclic to true", logger)

    num_values_dict = {
        "month": 12,
        "day": 31,
        "weekday": 7,
        "dayofweek": 7,
        "day_of_week": 7,
        "hour": 24,
        "minute": 60,
        "second": 60,
        "microsecond": 1000000,
        "nanosecond": 1000,
        "quarter": 4,
        "dayofyear": 365,
        "day_of_year": 365,
        "week": 52,
        "weekofyear": 52,
        "week_of_year": 52,
    }

    if attribute not in ["week", "weekofyear", "week_of_year"]:
        values = getattr(time_index, attribute)
    else:
        values = (
            time_index.isocalendar().set_index("week").index.astype("int64").rename("time")
        )

    if one_hot or cyclic:
        raise_if_not(
            attribute in num_values_dict,
            f"Given datetime attribute `{attribute}` not supported with one-hot or cyclical encoding. "
            f"Supported datetime attributes: {list(num_values_dict.keys())}",
            logger,
        )

    if one_hot:
        values_df = pd.get_dummies(values)
        # fill missing columns (in case not all values appear in time_index)
        for i in range(1, num_values_dict[attribute] + 1):
            if not (i in values_df.columns):
                values_df[i] = 0
        values_df = values_df[range(1, num_values_dict[attribute] + 1)]

        values_df.columns = [
            attribute + "_" + str(column_name) for column_name in values_df.columns
        ]
    else:
        if cyclic:
            if attribute == "day":
                periods = [
                    time_index[i].days_in_month for i in time_index.month
                ]
                freq = 2 * np.pi * np.reciprocal(periods)
            else:
                period = num_values_dict[attribute]
                freq = 2 * np.pi / period
            values_df = pd.DataFrame(
                {
                    attribute + "_sin": np.sin(freq * values),
                    attribute + "_cos": np.cos(freq * values),
                }
            )
        else:
            values_df = pd.DataFrame({attribute: values})

    values_df.index = time_index

    return TimeSeries.from_dataframe(values_df).astype(dtype)
def plot_acf(
    ts: TimeSeries,
    m: Optional[int] = None,
    max_lag: int = 24,
    alpha: float = 0.05,
    bartlett_confint: bool = True,
    fig_size: Tuple[int, int] = (10, 5),
    axis: Optional[plt.axis] = None,
) -> None:
    """
    Plots the ACF of `ts`, highlighting it at lag `m`, with corresponding significance interval.
    Uses :func:`statsmodels.tsa.stattools.acf` [1]_

    Parameters
    ----------
    ts
        The TimeSeries whose ACF should be plotted.
    m
        Optionally, a time lag to highlight on the plot.
    max_lag
        The maximal lag order to consider.
    alpha
        The confidence interval to display.
    bartlett_confint
        The boolean value indicating whether the confidence interval should be calculated using
        Bartlett's formula. If set to True, the confidence interval can be used in the model
        identification stage for fitting ARIMA models. If set to False, the confidence interval can be
        used to test for randomness (i.e. there is no time dependence in the data) of the data.
    fig_size
        The size of the figure to be displayed.
    axis
        Optionally, an axis object to plot the ACF on.

    References
    ----------
    .. [1] https://www.statsmodels.org/dev/generated/statsmodels.tsa.stattools.acf.html
    """
    ts._assert_univariate()
    raise_if(
        max_lag is None or not (1 <= max_lag < len(ts)),
        "max_lag must be greater than or equal to 1 and less than len(ts).",
    )
    raise_if(
        m is not None and not (0 <= m <= max_lag),
        "m must be greater than or equal to 0 and less than or equal to max_lag.",
    )
    raise_if(
        alpha is None or not (0 < alpha < 1),
        "alpha must be greater than 0 and less than 1.",
    )

    r, confint = acf(
        ts.values(),
        nlags=max_lag,
        fft=False,
        alpha=alpha,
        bartlett_confint=bartlett_confint,
    )

    if axis is None:
        plt.figure(figsize=fig_size)
        axis = plt

    for i in range(len(r)):
        axis.plot(
            (i, i),
            (0, r[i]),
            color=("#b512b8" if m is not None and i == m else "black"),
            lw=(1 if m is not None and i == m else 0.5),
        )

    # Adjusts the upper band of the confidence interval to center it on the x axis.
    upp_band = [confint[lag][1] - r[lag] for lag in range(1, max_lag + 1)]

    axis.fill_between(
        np.arange(1, max_lag + 1),
        upp_band,
        [-x for x in upp_band],
        color="#003DFD",
        alpha=0.25,
    )
    axis.plot((0, max_lag + 1), (0, 0), color="black")
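# A minimal usage sketch for `plot_acf` (assuming darts' statistics utilities and the
# AirPassengers dataset; the lag settings are illustrative).
import matplotlib.pyplot as plt
from darts.datasets import AirPassengersDataset
from darts.utils.statistics import plot_acf

series = AirPassengersDataset().load()

# highlight the seasonal lag m=12 and show up to 36 lags
plot_acf(series, m=12, max_lag=36, alpha=0.05)
plt.show()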
def __init__(
    self,
    fill: Union[str, float] = "auto",
    name: str = "MissingValuesFiller",
    n_jobs: int = 1,
    verbose: bool = False,
):
    """Data transformer to fill missing values from a (sequence of) deterministic ``TimeSeries``.

    Parameters
    ----------
    fill
        The value used to replace the missing values.
        If set to 'auto', will auto-fill missing values using the :func:`pd.DataFrame.interpolate()` method.
    name
        A specific name for the transformer
    n_jobs
        The number of jobs to run in parallel. Parallel jobs are created only when a ``Sequence[TimeSeries]``
        is passed as input to a method, parallelising operations regarding different ``TimeSeries``.
        Defaults to `1` (sequential). Setting the parameter to `-1` means using all the available processors.
        Note: for a small amount of data, the parallelisation overhead could end up increasing the total
        required amount of time.
    verbose
        Optionally, whether to print operations progress

    Examples
    --------
    >>> import numpy as np
    >>> from darts import TimeSeries
    >>> from darts.dataprocessing.transformers import MissingValuesFiller
    >>> values = np.arange(start=0, stop=1, step=0.1)
    >>> values[5:8] = np.nan
    >>> series = TimeSeries.from_values(values)
    >>> transformer = MissingValuesFiller()
    >>> series_filled = transformer.transform(series)
    >>> print(series_filled)
    <TimeSeries (DataArray) (time: 10, component: 1, sample: 1)>
    array([[[0. ]],
           [[0.1]],
           [[0.2]],
           [[0.3]],
           [[0.4]],
           [[0.5]],
           [[0.6]],
           [[0.7]],
           [[0.8]],
           [[0.9]]])
    Coordinates:
      * time       (time) int64 0 1 2 3 4 5 6 7 8 9
      * component  (component) object '0'
    Dimensions without coordinates: sample
    """
    raise_if_not(
        isinstance(fill, str) or isinstance(fill, float),
        "`fill` should either be a string or a float",
        logger,
    )

    raise_if(
        isinstance(fill, str) and fill != "auto",
        "invalid string for `fill`: can only be set to 'auto'",
        logger,
    )

    super().__init__(name=name, n_jobs=n_jobs, verbose=verbose)
    self._fill = fill
def plot_pacf(
    ts: TimeSeries,
    m: Optional[int] = None,
    max_lag: int = 24,
    method: str = "ywadjusted",
    alpha: float = 0.05,
    fig_size: Tuple[int, int] = (10, 5),
    axis: Optional[plt.axis] = None,
) -> None:
    """
    Plots the Partial ACF of `ts`, highlighting it at lag `m`, with corresponding significance interval.
    Uses :func:`statsmodels.tsa.stattools.pacf` [1]_

    Parameters
    ----------
    ts
        The TimeSeries whose ACF should be plotted.
    m
        Optionally, a time lag to highlight on the plot.
    max_lag
        The maximal lag order to consider.
    method
        The method to be used for the PACF calculation.
        - | "yw" or "ywadjusted" : Yule-Walker with sample-size adjustment in
          | denominator for acovf. Default.
        - "ywm" or "ywmle" : Yule-Walker without adjustment.
        - "ols" : regression of time series on lags of it and on constant.
        - "ols-inefficient" : regression of time series on lags using a single common sample
          to estimate all pacf coefficients.
        - "ols-adjusted" : regression of time series on lags with a bias adjustment.
        - "ld" or "ldadjusted" : Levinson-Durbin recursion with bias correction.
        - "ldb" or "ldbiased" : Levinson-Durbin recursion without bias correction.
    alpha
        The confidence interval to display.
    fig_size
        The size of the figure to be displayed.
    axis
        Optionally, an axis object to plot the ACF on.

    References
    ----------
    .. [1] https://www.statsmodels.org/dev/generated/statsmodels.tsa.stattools.pacf.html
    """
    ts._assert_univariate()
    raise_if(
        max_lag is None or not (1 <= max_lag < len(ts) // 2),
        "max_lag must be greater than or equal to 1 and less than len(ts)//2.",
    )
    raise_if(
        m is not None and not (0 <= m <= max_lag),
        "m must be greater than or equal to 0 and less than or equal to max_lag.",
    )
    raise_if(
        alpha is None or not (0 < alpha < 1),
        "alpha must be greater than 0 and less than 1.",
    )

    r, confint = pacf(ts.values(), nlags=max_lag, method=method, alpha=alpha)

    if axis is None:
        plt.figure(figsize=fig_size)
        axis = plt

    for i in range(len(r)):
        axis.plot(
            (i, i),
            (0, r[i]),
            color=("#b512b8" if m is not None and i == m else "black"),
            lw=(1 if m is not None and i == m else 0.5),
        )

    # Adjusts the upper band of the confidence interval to center it on the x axis.
    upp_band = [confint[lag][1] - r[lag] for lag in range(1, max_lag + 1)]

    axis.fill_between(
        np.arange(1, max_lag + 1),
        upp_band,
        [-x for x in upp_band],
        color="#003DFD",
        alpha=0.25,
    )
    axis.plot((0, max_lag + 1), (0, 0), color="black")