def get_values_or_raise(series_a: TimeSeries, series_b: TimeSeries) -> Tuple[np.ndarray, np.ndarray]:
    """
    Returns the numpy values of two time series, raising an exception if the two series
    cannot be compared.

    The check is an explicit `raise` rather than an `assert` statement, so it is still
    enforced when Python runs with assertions disabled (`python -O`).

    :param series_a: A TimeSeries.
    :param series_b: A TimeSeries; `series_a.has_same_time_as(series_b)` must hold.
    :return: A tuple `(series_a.values(), series_b.values())`.
    :raises AssertionError: If `series_a.has_same_time_as(series_b)` is false.
    """
    if not series_a.has_same_time_as(series_b):
        raise AssertionError(
            'The two time series must have same time index.'
            '\nFirst series: {}\nSecond series: {}'.format(series_a.time_index(), series_b.time_index())
        )
    return series_a.values(), series_b.values()
def sine_timeseries(
        value_frequency: float = 0.1,
        value_amplitude: float = 1,
        value_phase: float = 0,
        value_y_offset: float = 0,
        length: int = 10,
        freq: str = 'D',
        start_ts: pd.Timestamp = pd.Timestamp('2000-01-01')
) -> 'TimeSeries':
    """
    Creates a TimeSeries with a sinusoidal value progression with a given frequency, amplitude,
    phase and y offset.

    The value at step `i` (i = 0, ..., length - 1) is
    `value_amplitude * sin(2 * pi * value_frequency * i + value_phase) + value_y_offset`.

    :param value_frequency: The number of periods that take place within one time unit given in 'freq'.
    :param value_amplitude: The maximum difference between any value of the returned TimeSeries
                            and 'value_y_offset'.
    :param value_phase: The relative position within one period of the first value of the returned
                        TimeSeries (in radians).
    :param value_y_offset: The shift of the sine function along the y axis.
    :param length: The length of the returned TimeSeries.
    :param freq: The time difference between two adjacent entries in the returned TimeSeries.
                 A DateOffset alias is expected.
    :param start_ts: The time index of the first entry in the returned TimeSeries.
    :return: A sinusoidal TimeSeries parametrized as indicated above.
    """
    times = pd.date_range(periods=length, freq=freq, start=start_ts)

    # Evaluate the sine natively on the whole array: `np.vectorize` runs a Python-level loop
    # and refuses size-0 inputs, whereas this expression also works for `length == 0`.
    steps = np.arange(length, dtype=float)
    values = value_amplitude * np.sin(2 * np.pi * value_frequency * steps + value_phase) + value_y_offset

    return TimeSeries.from_times_and_values(times, values)
def linear_timeseries(
        start_value: float = 0,
        end_value: float = 1,
        length: int = 10,
        freq: str = 'D',
        start_ts: pd.Timestamp = pd.Timestamp('2000-01-01')
) -> 'TimeSeries':
    """
    Creates a TimeSeries whose values go linearly from 'start_value' (first entry) to
    'end_value' (last entry).

    The values are `length` evenly spaced samples, so two adjacent entries differ by
    ('end_value' - 'start_value') / ('length' - 1).

    :param start_value: The value of the first entry in the TimeSeries.
    :param end_value: The value of the last entry in the TimeSeries.
    :param length: The length of the returned TimeSeries.
    :param freq: The time difference between two adjacent entries in the returned TimeSeries.
                 A DateOffset alias is expected.
    :param start_ts: The time index of the first entry in the returned TimeSeries.
    :return: A linear TimeSeries created as indicated above.
    """
    index = pd.date_range(start=start_ts, periods=length, freq=freq)
    ramp = np.linspace(start_value, end_value, num=length)
    return TimeSeries.from_times_and_values(index, ramp)
def fillna_val(ts: 'TimeSeries', fill: float = 0) -> 'TimeSeries':
    """
    Replaces every missing value of `ts` with the constant `fill` (zero by default).

    :param ts: The TimeSeries whose missing values are to be replaced.
    :param fill: The value used to replace the missing values.
    :return: A new TimeSeries on the time index of `ts`, with all missing values set to `fill`.
    """
    index = ts.time_index()
    filled = ts.pd_series().fillna(value=fill)
    return TimeSeries.from_times_and_values(index, filled)
def simulate_forecast_ar(series: 'TimeSeries', model: 'AutoRegressiveModel', start: 'pd.Timestamp',
                         fcast_horizon_n: int, trim_to_series: bool = True, verbose=False) -> 'TimeSeries':
    """
    Simulates the forecasts that `model` would have produced on `series` over time.

    For every prediction time from `start` onwards, the model is re-fitted on the series
    truncated with `series.drop_after(pred_time)`, asked for `fcast_horizon_n` points, and
    only the last predicted point (and its timestamp) is kept.

    :param series: The TimeSeries to forecast.
    :param model: The AutoRegressiveModel to use.
    :param start: The first time at which a prediction is produced for a future time.
    :param fcast_horizon_n: The number of future values to predict at each prediction time.
    :param trim_to_series: Whether the predicted series has its end trimmed to match the end of
                           the main series or not.
    :param verbose: Whether to print progress or not.
    :return: A TimeSeries containing the forecasted values of `series` at horizon `fcast_horizon_n`,
             one per prediction time.
    """
    raise_if_not(start in series, 'The provided start timestamp is not in the time series.', logger)
    raise_if_not(start != series.end_time(),
                 'The provided start timestamp is the last timestamp of the time series', logger)

    # Last time from which the stepping below is still continued; when trimming, leave
    # room for the forecast horizon before the end of the series.
    if trim_to_series:
        last_pred_time = series.time_index()[-fcast_horizon_n - 2]
    else:
        last_pred_time = series.time_index()[-2]

    # All prediction times are materialised up front so that a progress bar knows the total.
    current = start
    pred_times = [current]
    while current <= last_pred_time:
        current = current + series.freq()
        pred_times.append(current)

    forecast_values = []
    forecast_times = []

    for pred_time in build_tqdm_iterator(pred_times, verbose):
        history = series.drop_after(pred_time)  # training series for this step
        model.fit(history)
        forecast = model.predict(fcast_horizon_n)
        forecast_values.append(forecast.values()[-1])  # keep only the N-th point
        forecast_times.append(forecast.end_time())     # ... and its timestamp

    return TimeSeries.from_times_and_values(pd.DatetimeIndex(forecast_times), np.array(forecast_values))
def coefficient_variation(true_series: TimeSeries, pred_series: TimeSeries, time_diff: bool = False) -> float:
    """
    Compute the coefficient of variation of the RMSE, expressed as a percentage:
    100 * RMSE(true_series, pred_series) / mean(true_series).

    :param true_series: A TimeSeries; its mean is used as the normalisation term.
    :param pred_series: A TimeSeries to be compared with `true_series`.
    :param time_diff: Forwarded unchanged to `rmse`.
    :return: A float, the RMSE of `pred_series` with respect to `true_series`, divided by the mean
             of `true_series` and multiplied by 100.
    """
    error = rmse(true_series, pred_series, time_diff)
    scale = true_series.mean()
    return 100 * error / scale
def auto_fillna(ts: 'TimeSeries', first: float = None, last: float = None,
                interpolate: str = 'linear', **kwargs) -> 'TimeSeries':
    """
    Automatically fills the missing values of the TimeSeries `ts`, assuming they are
    represented by np.nan.

    The rules for completion are given below.

    Missing values at the beginning are filled with the constant value `first`, when it is given.
    Missing values at the end are filled with the constant value `last`, when it is given.
    All remaining missing values are filled by the pandas interpolation wrapper, called with
    `method=interpolate` and `limit_direction='both'`. Defaults to linear interpolation.

    Add the option `fill_value` set to 'extrapolate' to fill the missing values at the beginning and the end
    with the regression function computed. `first` and `last` must then be None.

    .. todo: be more flexible on the filling methods.

    :param ts: A TimeSeries `ts`.
    :param first: The value to use for filling the beginning of the TimeSeries. When None, the leading
                  missing values are left to the interpolation step.
    :param last: The value to use for filling the ending of the TimeSeries. When None, the trailing
                 missing values are left to the interpolation step.
    :param interpolate: The interpolation method passed to pandas. Defaults to linear interpolation.
    :param kwargs: Extra keyword arguments forwarded to the pandas `interpolate` call.
    :return: A new TimeSeries with all missing values filled according to the rules above; `ts` itself
             is returned when `change_of_state` reports no transition.
    """
    # Second element of `change_of_state`: presumably the positions at which the entries switch
    # between missing and numeric (confirm against `change_of_state`).
    transitions = change_of_state(ts)[1]
    if not len(transitions):
        return ts

    filled = ts.pd_series()

    # Leading gap: constant-fill up to and including the first transition position.
    if np.isnan(ts.values()[0]):
        if first is not None:
            filled[:transitions[0] + 1] = first

    # Trailing gap: constant-fill everything after the last transition position.
    if np.isnan(ts.values()[-1]):
        if last is not None:
            filled[transitions[-1] + 1:] = last

    # Whatever is still missing is handled by pandas, in both directions and without length limit.
    filled.interpolate(method=interpolate, limit_direction='both', limit=len(filled), inplace=True, **kwargs)

    return TimeSeries.from_times_and_values(ts.time_index(), filled.values)
def get_train_val_series(series: TimeSeries, start: pd.Timestamp, nr_points_val: int,
                         nr_steps_iter: int = 1) -> List[Tuple[TimeSeries, TimeSeries]]:
    """
    Builds the list of (training_set, validation_set) pairs used for backtesting.

    Starting at `start`, the series is split after the current pointer; the validation set is made
    of the first `nr_points_val` points that follow the split. The pointer then advances by
    `nr_steps_iter` time steps, and the process stops at the first validation set that has fewer
    than `nr_points_val` points.

    .. todo: this is expanding training window, implement optional sliding window

    :param series: The full time series that needs to be split
    :param start: the start time of the earliest validation set
    :param nr_points_val: the number of points in the validation sets
    :param nr_steps_iter: the number of time steps to iterate between the successive validation sets
    :return: a list of (training_set, validation_set) pairs
    """
    raise_if_not(start in series, 'The provided start timestamp is not in the time series.', logger)
    raise_if_not(start != series.end_time(),
                 'The provided start timestamp is the last timestamp of the time series', logger)
    # TODO: maybe also check that valset_duration >= series frequency

    pairs = []
    split_point: pd.Timestamp = start

    while True:
        train_part, remainder = series.split_after(split_point)
        val_part = remainder.slice_n_points_after(remainder.start_time(), nr_points_val)

        # The pointer is advanced before the length check, exactly once per split.
        split_point = split_point + nr_steps_iter * series.freq()

        if len(val_part) < nr_points_val:
            break
        pairs.append((train_part, val_part))

    return pairs
def us_holiday_timeseries(
        length: int = 10,
        start_ts: pd.Timestamp = pd.Timestamp('2000-01-01')
) -> 'TimeSeries':
    """
    Creates a binary TimeSeries that equals 1 at every index that corresponds to a US holiday,
    and 0 otherwise. The frequency of the TimeSeries is daily.

    Holidays are taken from pandas' `USFederalHolidayCalendar` and are requested for exactly the
    date range covered by the series, so series outside the calendar's default date window are
    flagged correctly as well.

    :param length: The length of the returned TimeSeries.
    :param start_ts: The time index of the first entry in the returned TimeSeries.
    :return: Binary TimeSeries for US holidays.
    """
    times = pd.date_range(periods=length, freq='D', start=start_ts)

    calendar = USFederalHolidayCalendar()
    if len(times) > 0:
        # `holidays()` without bounds only covers the calendar's default start/end dates;
        # ask explicitly for the span of this series instead.
        us_holidays = calendar.holidays(start=times[0], end=times[-1])
    else:
        # Empty index: there are no bounds to pass, and nothing will match anyway.
        us_holidays = calendar.holidays()

    values = times.isin(us_holidays).astype(int)
    return TimeSeries.from_times_and_values(times, values)
def simulate_forecast_ar(series: TimeSeries, model: AutoRegressiveModel, start: pd.Timestamp,
                         fcast_horizon_n: int, trim_to_series: bool = True, verbose=False) -> TimeSeries:
    """
    Returns a TimeSeries containing the forecasts that would have been obtained from a given
    AutoRegressiveModel, on a given forecast time horizon.

    For every prediction time from `start` onwards, the model is re-fitted on
    `series.drop_end(pred_time)`, asked for `fcast_horizon_n` points, and only the last
    predicted point (and its timestamp) is kept.

    Nothing is written to stdout by this function itself; progress reporting is left entirely to
    the iterator returned by `_build_iterator(pred_times, verbose)`.

    :param series: the main series to forecast
    :param model: the AutoRegressiveModel to use
    :param start: when the forecasts start (i.e., the first time at which a prediction is produced
                  for a future time)
    :param fcast_horizon_n: the forecast horizon
    :param trim_to_series: whether the returned predicted series has the end trimmed to match the end
                           of the main series
    :param verbose: whether to print progress
    :return: a TimeSeries with one forecasted value (at horizon `fcast_horizon_n`) per prediction time
    :raises AssertionError: if `start` is not in `series`, or is its last timestamp
    """
    # Explicit raises (instead of `assert` statements) so the checks survive `python -O`.
    if start not in series:
        raise AssertionError('The provided start timestamp is not in the time series.')
    if start == series.end_time():
        raise AssertionError('The provided start timestamp is the last timestamp of the time series')

    # Last time from which the stepping below is still continued; when trimming, leave
    # room for the forecast horizon before the end of the series.
    if trim_to_series:
        last_pred_time = series.time_index()[-fcast_horizon_n - 2]
    else:
        last_pred_time = series.time_index()[-2]

    # build the prediction times in advance (to be able to use tqdm)
    pred_times = [start]
    while pred_times[-1] <= last_pred_time:
        pred_times.append(pred_times[-1] + series.freq())

    # what we'll return
    values = []
    times = []

    iterator = _build_iterator(pred_times, verbose)

    for pred_time in iterator:
        train = series.drop_end(pred_time)  # build the training series
        model.fit(train)
        pred = model.predict(fcast_horizon_n)
        values.append(pred.values()[-1])  # store the N-th point
        times.append(pred.end_time())  # store the N-th timestamp

    return TimeSeries.from_times_and_values(pd.DatetimeIndex(times), np.array(values))
def constant_timeseries(
        value: float = 1,
        length: int = 10,
        freq: str = 'D',
        start_ts: pd.Timestamp = pd.Timestamp('2000-01-01')
) -> 'TimeSeries':
    """
    Creates a TimeSeries that takes the same value at every index, with the given length,
    start date and frequency.

    :param value: The constant value that the TimeSeries object will assume at every index.
    :param length: The length of the returned TimeSeries.
    :param freq: The time difference between two adjacent entries in the returned TimeSeries.
                 A DateOffset alias is expected; see:
                 https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects.
    :param start_ts: The time index of the first entry in the returned TimeSeries.
    :return: A constant TimeSeries with value 'value'.
    """
    index = pd.date_range(start=start_ts, periods=length, freq=freq)
    constant = np.full(length, value)
    return TimeSeries.from_times_and_values(index, constant)
def gaussian_timeseries(
        length: int = 10,
        freq: str = 'D',
        mean: Union[float, np.ndarray] = 0,
        std: Union[float, np.ndarray] = 1,
        start_ts: pd.Timestamp = pd.Timestamp('2000-01-01')
) -> 'TimeSeries':
    """
    Creates a gaussian noise TimeSeries by sampling a gaussian distribution with mean 'mean' and
    standard deviation 'std'. Each value represents a sample of the distribution. When the mean is
    set to 0, it can be considered a white noise TimeSeries.

    :param mean: The mean of the gaussian distribution that is sampled at each step.
                 If a float value is given, the same mean is used at every step.
                 If a numpy.ndarray of floats with the same length as 'length' is given, a different
                 mean is used at each step.
    :param std: The standard deviation of the gaussian distribution that is sampled at each step.
                If a float value is given, the same standard deviation is used at every step.
                If a 'length' x 'length' numpy.ndarray of floats is given, it will be used as
                covariance matrix for a multivariate gaussian distribution.
    :param length: The length of the returned TimeSeries.
    :param freq: The time difference between two adjacent entries in the returned TimeSeries.
                 A DateOffset alias is expected.
    :param start_ts: The time index of the first entry in the returned TimeSeries.
    :return: A gaussian noise TimeSeries created as indicated above.
    """
    mean_is_array = isinstance(mean, np.ndarray)
    std_is_array = isinstance(std, np.ndarray)

    if mean_is_array:
        raise_if_not(mean.shape == (length,),
                     'If a vector of means is provided, it requires the same length as the TimeSeries.', logger)
    if std_is_array:
        raise_if_not(std.shape == (length, length),
                     'If a matrix of standard deviations is provided,'
                     ' its shape has to match the length of the TimeSeries.', logger)

    times = pd.date_range(periods=length, freq=freq, start=start_ts)

    if std_is_array:
        # A (length, length) matrix cannot be broadcast by `np.random.normal` against
        # `size=length`; draw one sample of the multivariate gaussian, as documented.
        mean_vector = np.broadcast_to(mean, (length,))
        values = np.random.multivariate_normal(mean_vector, std)
    else:
        values = np.random.normal(mean, std, size=length)

    return TimeSeries.from_times_and_values(times, values)
def random_walk_timeseries(
        length: int = 10,
        freq: str = 'D',
        mean: float = 0,
        std: float = 1,
        start_ts: pd.Timestamp = pd.Timestamp('2000-01-01')
) -> 'TimeSeries':
    """
    Creates a random walk TimeSeries as the cumulative sum of samples drawn from a gaussian
    distribution with mean 'mean' and standard deviation 'std'.

    The first value is one such random sample. Every subsequent value is equal to the previous
    value plus a new random sample.

    :param mean: The mean of the gaussian distribution that is sampled at each step.
    :param std: The standard deviation of the gaussian distribution that is sampled at each step.
    :param length: The length of the returned TimeSeries.
    :param freq: The time difference between two adjacent entries in the returned TimeSeries.
                 A DateOffset alias is expected.
    :param start_ts: The time index of the first entry in the returned TimeSeries.
    :return: A random walk TimeSeries created as indicated above.
    """
    index = pd.date_range(start=start_ts, periods=length, freq=freq)
    increments = np.random.normal(mean, std, size=length)
    walk = np.cumsum(increments)
    return TimeSeries.from_times_and_values(index, walk)
def simulate_forecast_regr(feature_series: List[TimeSeries], target_series: TimeSeries, model: RegressiveModel,
                           start: pd.Timestamp, fcast_horizon_n: int, trim_to_series: bool = True,
                           verbose=False) -> TimeSeries:
    """
    Returns a TimeSeries containing the forecasts that would have been obtained from a given
    RegressiveModel, on a given forecast time horizon.

    For every prediction time from `start` onwards, the model is re-fitted on the features and
    target truncated with `drop_after(pred_time)`, then asked to predict from the
    `fcast_horizon_n` feature points that follow; only the last predicted point (and its
    timestamp) is kept.

    .. todo: review and add to documentation.
    .. todo: optionally also return weights, when those are available in model
    .. todo: (getattr(model.model, 'coef_', None) is not None)

    :param feature_series: the feature time series of the regressive model
    :param target_series: the target time series of the regressive model (i.e., the series to predict)
    :param model: the RegressiveModel to use
    :param start: when the forecasts start (i.e., the first time at which a prediction is produced
                  for a future time)
    :param fcast_horizon_n: the forecast horizon
    :param trim_to_series: whether the returned predicted series has the end trimmed to match the end
                           of the main series
    :param verbose: whether to print progress
    :return: a TimeSeries with one forecasted value (at horizon `fcast_horizon_n`) per prediction time
    """
    same_index_flags = [s.has_same_time_as(target_series) for s in feature_series]
    raise_if_not(all(same_index_flags), 'All provided time series must have the same time index', logger)
    raise_if_not(start in target_series, 'The provided start timestamp is not in the time series.', logger)
    raise_if_not(start != target_series.end_time(),
                 'The provided start timestamp is the last timestamp of the time series', logger)

    # Last time from which the stepping below is still continued; when trimming, leave
    # room for the forecast horizon before the end of the series.
    if trim_to_series:
        last_pred_time = target_series.time_index()[-fcast_horizon_n - 2]
    else:
        last_pred_time = target_series.time_index()[-2]

    # All prediction times are materialised up front so that a progress bar knows the total.
    current = start
    pred_times = [current]
    while current <= last_pred_time:
        current = current + target_series.freq()
        pred_times.append(current)

    forecast_values = []
    forecast_times = []

    for pred_time in build_tqdm_iterator(pred_times, verbose):
        # Training data: everything up to the prediction time.
        train_features = []
        for s in feature_series:
            train_features.append(s.drop_after(pred_time))
        train_target = target_series.drop_after(pred_time)

        # Validation features: the `fcast_horizon_n` points that start one step after it.
        val_features = []
        for s in feature_series:
            val_features.append(s.slice_n_points_after(pred_time + target_series.freq(), fcast_horizon_n))

        model.fit(train_features, train_target)
        forecast = model.predict(val_features)
        forecast_values.append(forecast.values()[-1])  # keep only the N-th point
        forecast_times.append(forecast.end_time())     # ... and its timestamp

    return TimeSeries.from_times_and_values(pd.DatetimeIndex(forecast_times), np.array(forecast_values))