示例#1
0
def get_values_or_raise(series_a: TimeSeries,
                        series_b: TimeSeries) -> Tuple[np.ndarray, np.ndarray]:
    """
    Returns the numpy values of two time series, raising an exception if the series cannot be compared.

    The two series are considered comparable when `series_a.has_same_time_as(series_b)` holds.

    :param series_a: The first TimeSeries.
    :param series_b: The second TimeSeries.
    :return: The tuple `(series_a.values(), series_b.values())`.
    :raises AssertionError: If the two series do not have the same time index.
    """
    # Raise explicitly rather than through an `assert` statement: asserts are removed when Python
    # runs with -O, which would let mismatched series through silently. The exception type is kept
    # so that existing callers are unaffected.
    if not series_a.has_same_time_as(series_b):
        raise AssertionError('The two time series must have same time index.'
                             '\nFirst series: {}\nSecond series: {}'.format(
                                 series_a.time_index(), series_b.time_index()))
    return series_a.values(), series_b.values()
def sine_timeseries(
    value_frequency: float = 0.1,
    value_amplitude: float = 1,
    value_phase: float = 0,
    value_y_offset: float = 0,
    length: int = 10,
    freq: str = 'D',
    start_ts: pd.Timestamp = pd.Timestamp('2000-01-01')
) -> 'TimeSeries':
    """
    Creates a TimeSeries with a sinusoidal value progression with a given frequency, amplitude, phase and y offset.

    The value at step `i` (0-based) is
    `value_amplitude * sin(2 * pi * value_frequency * i + value_phase) + value_y_offset`.

    :param value_frequency: The number of periods that take place within one time unit given in 'freq'.
    :param value_amplitude: The maximum difference between any value of the returned TimeSeries and 'value_y_offset'.
    :param value_phase: The phase shift added to the argument of the sine function (in radians).
    :param value_y_offset: The shift of the sine function along the y axis.
    :param length: The length of the returned TimeSeries.
    :param freq: The time difference between two adjacent entries in the returned TimeSeries. A DateOffset alias is expected.
    :param start_ts: The time index of the first entry in the returned TimeSeries.
    :return: A sinusoidal TimeSeries parametrized as indicated above.
    """

    times = pd.date_range(periods=length, freq=freq, start=start_ts)

    # Evaluate the sine on the whole step vector at once. This replaces an `np.vectorize` wrapper
    # around a scalar lambda, which loops in Python and cannot be called on size-0 input.
    steps = np.arange(length, dtype=float)
    values = value_amplitude * np.sin(
        2 * np.pi * value_frequency * steps + value_phase) + value_y_offset

    return TimeSeries.from_times_and_values(times, values)
def linear_timeseries(
    start_value: float = 0,
    end_value: float = 1,
    length: int = 10,
    freq: str = 'D',
    start_ts: pd.Timestamp = pd.Timestamp('2000-01-01')
) -> 'TimeSeries':
    """
    Builds a TimeSeries whose values move in evenly spaced steps from 'start_value' (first entry)
    to 'end_value' (last entry). For 'length' > 1, two adjacent entries therefore differ by
    ('end_value' - 'start_value') / ('length' - 1).

    :param start_value: The value of the first entry in the TimeSeries.
    :param end_value: The value of the last entry in the TimeSeries.
    :param length: The number of entries in the returned TimeSeries.
    :param freq: The time difference between two adjacent entries in the returned TimeSeries. A DateOffset alias is expected.
    :param start_ts: The time index of the first entry in the returned TimeSeries.
    :return: A linear TimeSeries created as indicated above.
    """

    time_index = pd.date_range(start=start_ts, periods=length, freq=freq)
    ramp = np.linspace(start=start_value, stop=end_value, num=length)

    return TimeSeries.from_times_and_values(time_index, ramp)
示例#4
0
def fillna_val(ts: 'TimeSeries', fill: float = 0) -> 'TimeSeries':
    """
    Replaces every missing value of `ts` with the constant `fill` (zero by default).

    :param ts: The TimeSeries whose missing values should be replaced.
    :param fill: The constant written in place of each missing value.
    :return: A TimeSeries built on the time index of `ts`, with all missing values set to `fill`.
    """
    index = ts.time_index()
    filled = ts.pd_series().fillna(value=fill)
    return TimeSeries.from_times_and_values(index, filled)
示例#5
0
def simulate_forecast_ar(series: 'TimeSeries',
                         model: 'AutoRegressiveModel',
                         start: 'pd.Timestamp',
                         fcast_horizon_n: int,
                         trim_to_series: bool = True,
                         verbose=False) -> 'TimeSeries':
    """
    Simulates the forecasts that the auto-regressive model `model` would have produced on `series`.

    Beginning at `start`, the function steps through successive prediction times spaced by the frequency
    of `series`. At each prediction time the model is re-fitted on `series.drop_after(pred_time)` and asked
    for `fcast_horizon_n` values; only the last value of each forecast and its timestamp are kept.

    :param series: The TimeSeries to forecast.
    :param model: The AutoRegressiveModel to use. It is re-fitted at every prediction time.
    :param start: The first time at which a prediction is produced for a future time. It must belong to
                  `series` and must not be its last timestamp.
    :param fcast_horizon_n: The number of future values to predict at each prediction time.
    :param trim_to_series: If True, prediction times are bounded using the time index entry at position
                           `-fcast_horizon_n - 2`; otherwise the entry at position `-2` is used.
    :param verbose: Forwarded to `build_tqdm_iterator` to control progress reporting.
    :return: A TimeSeries made of the last point of each simulated forecast.
    """
    raise_if_not(start in series,
                 'The provided start timestamp is not in the time series.',
                 logger)
    raise_if_not(
        start != series.end_time(),
        'The provided start timestamp is the last timestamp of the time series',
        logger)

    time_index = series.time_index()
    if trim_to_series:
        last_pred_time = time_index[-fcast_horizon_n - 2]
    else:
        last_pred_time = time_index[-2]

    # All prediction times are materialised up front so that the progress iterator knows their count.
    pred_times = [start]
    latest = start
    while latest <= last_pred_time:
        latest = latest + series.freq()
        pred_times.append(latest)

    # (value, timestamp) of the final point of every forecast
    final_points = []

    for pred_time in build_tqdm_iterator(pred_times, verbose):
        model.fit(series.drop_after(pred_time))
        forecast = model.predict(fcast_horizon_n)
        final_points.append((forecast.values()[-1], forecast.end_time()))

    values = [value for value, _ in final_points]
    times = [timestamp for _, timestamp in final_points]

    return TimeSeries.from_times_and_values(pd.DatetimeIndex(times),
                                            np.array(values))
示例#6
0
def coefficient_variation(true_series: TimeSeries,
                          pred_series: TimeSeries,
                          time_diff: bool = False) -> float:
    """
    Compute the coefficient of variation, in percent: the RMSE of `pred_series` with respect to
    `true_series`, divided by the mean of `true_series` and multiplied by 100.

    :param true_series: A TimeSeries; its mean is the normalisation term.
    :param pred_series: A TimeSeries to be compared with `true_series`.
    :param time_diff: Forwarded unchanged to `rmse`.
    :return: A float, 100 * RMSE / mean of `true_series`.
    """
    error = rmse(true_series, pred_series, time_diff)
    return 100 * error / true_series.mean()
示例#7
0
def auto_fillna(ts: 'TimeSeries',
                first: float = None,
                last: float = None,
                interpolate: str = 'linear',
                **kwargs) -> 'TimeSeries':
    """
    Automatically fills the missing values of the TimeSeries `ts`, assuming they are represented by np.nan.

    The filling happens in three steps:

    - If the series starts with a missing value and `first` is given, the leading gap is set to `first`.
    - If the series ends with a missing value and `last` is given, the trailing gap is set to `last`.
    - Every remaining missing value is handed to the pandas interpolation wrapper, called with
      `method=interpolate` and `limit_direction='both'`. Leading/trailing gaps for which no constant
      was given are therefore also left to pandas.

    Extra keyword arguments are forwarded to the pandas interpolation call, e.g. `fill_value='extrapolate'`
    (in which case `first` and `last` must be left to None).

    If `change_of_state` reports no transition for `ts`, the input is returned as is.

    .. todo: be more flexible on the filling methods.

    :param ts: A TimeSeries `ts`.
    :param first: The constant used to fill a leading gap, or None to leave it to the interpolation.
    :param last: The constant used to fill a trailing gap, or None to leave it to the interpolation.
    :param interpolate: The pandas interpolation method used for the remaining gaps. Defaults to 'linear'.
    :param kwargs: Additional keyword arguments forwarded to the pandas interpolation call.
    :return: A TimeSeries with missing values filled according to the rules above.
    """

    # Second element returned by `change_of_state`; used below as the positions where the series
    # switches between missing and numeric entries.
    transitions = change_of_state(ts)[1]

    if len(transitions) == 0:
        return ts

    filled = ts.pd_series()

    # constant fill of the leading gap, only when the caller supplied `first`
    starts_missing = np.isnan(ts.values()[0])
    if starts_missing and first is not None:
        filled[:transitions[0] + 1] = first

    # constant fill of the trailing gap, only when the caller supplied `last`
    ends_missing = np.isnan(ts.values()[-1])
    if ends_missing and last is not None:
        filled[transitions[-1] + 1:] = last

    # everything still missing goes through pandas, in both directions and without a gap-length cap
    filled.interpolate(method=interpolate,
                       limit=len(filled),
                       limit_direction='both',
                       inplace=True,
                       **kwargs)

    return TimeSeries.from_times_and_values(ts.time_index(), filled.values)
示例#8
0
def get_train_val_series(
        series: TimeSeries,
        start: pd.Timestamp,
        nr_points_val: int,
        nr_steps_iter: int = 1) -> List[Tuple[TimeSeries, TimeSeries]]:
    """
    Builds the list of (training_set, validation_set) pairs used for backtesting.

    The split point begins at `start` and moves forward by `nr_steps_iter` time steps after every split.
    At each split point, `series` is cut with `split_after`; the first part is the training set and the
    validation set is made of the first `nr_points_val` points of the second part. Pairs are collected
    until a validation set comes out shorter than `nr_points_val`.

    .. todo: this is expanding training window, implement optional sliding window

    :param series: The full time series that needs to be split
    :param start: the first split point; it must belong to `series` and must not be its last timestamp
    :param nr_points_val: the number of points in the validation sets
    :param nr_steps_iter: the number of time steps to iterate between the successive validation sets
    :return: a list of (training_set, validation_set) pairs
    """

    raise_if_not(start in series,
                 'The provided start timestamp is not in the time series.',
                 logger)
    raise_if_not(
        start != series.end_time(),
        'The provided start timestamp is the last timestamp of the time series',
        logger)
    # TODO: maybe also check that valset_duration >= series frequency

    pairs = []
    split_point = start

    while True:
        train_part, remainder = series.split_after(split_point)
        val_part = remainder.slice_n_points_after(remainder.start_time(),
                                                  nr_points_val)

        # the split point is advanced before the length check, so that the sequence of calls
        # on `series` is the same whether or not this pair ends up being kept
        split_point = split_point + nr_steps_iter * series.freq()

        if len(val_part) < nr_points_val:
            break
        pairs.append((train_part, val_part))

    return pairs
def us_holiday_timeseries(
    length: int = 10,
    start_ts: pd.Timestamp = pd.Timestamp('2000-01-01')
) -> 'TimeSeries':
    """
    Creates a binary TimeSeries that equals 1 at every index that corresponds to a US federal holiday,
    and 0 otherwise. The frequency of the TimeSeries is daily.

    The holiday calendar is queried for the exact span covered by the returned index, so the result does
    not depend on the default date window of the calendar.

    An index entry is flagged only if it is equal to one of the timestamps returned by the calendar.

    :param length: The length of the returned TimeSeries.
    :param start_ts: The time index of the first entry in the returned TimeSeries.
    :return: Binary TimeSeries for US holidays.
    """

    times = pd.date_range(periods=length, freq='D', start=start_ts)
    calendar = USFederalHolidayCalendar()

    if len(times) > 0:
        # Without explicit bounds the calendar only generates holidays inside its built-in default
        # window; any index outside of that window would silently be flagged 0.
        us_holidays = calendar.holidays(start=times[0], end=times[-1])
    else:
        # nothing to bound the query with; the membership test below yields an empty array anyway
        us_holidays = calendar.holidays()

    values = times.isin(us_holidays).astype(int)

    return TimeSeries.from_times_and_values(times, values)
示例#10
0
def simulate_forecast_ar(series: TimeSeries,
                         model: AutoRegressiveModel,
                         start: pd.Timestamp,
                         fcast_horizon_n: int,
                         trim_to_series: bool = True,
                         verbose=False) -> TimeSeries:
    """
    Returns a TimeSeries containing the forecasts that would have been obtained from a given AutoRegressiveModel,
    on a given forecast time horizon.

    At each prediction time the model is re-fitted on `series.drop_end(pred_time)` and asked for
    `fcast_horizon_n` values; only the last value of each forecast and its timestamp are kept.

    :param series: the main series to forecast
    :param model: the AutoRegressiveModel to use
    :param start: when the forecasts start (i.e., the first time at which a prediction is produced for a future time)
    :param fcast_horizon_n: the forecast horizon
    :param trim_to_series: whether the returned predicted series has the end trimmed to match the end of the main series
    :param verbose: whether to print progress; forwarded to `_build_iterator`
    :return: a TimeSeries made of the last point of each simulated forecast
    :raises AssertionError: if `start` is not in `series`, or is its last timestamp
    """
    # Explicit raises instead of `assert` statements, so that the validation is still performed when
    # Python runs with -O. The exception type is unchanged.
    if start not in series:
        raise AssertionError(
            'The provided start timestamp is not in the time series.')
    if start == series.end_time():
        raise AssertionError(
            'The provided start timestamp is the last timestamp of the time series')

    last_pred_time = series.time_index()[
        -fcast_horizon_n - 2] if trim_to_series else series.time_index()[-2]

    # build the prediction times in advance (to be able to use tqdm)
    pred_times = [start]
    while pred_times[-1] <= last_pred_time:
        pred_times.append(pred_times[-1] + series.freq())

    # what we'll return
    values = []
    times = []

    # Progress reporting is left entirely to the iterator, which receives `verbose`. Nothing is
    # printed from the loop itself: the previous '.' output was emitted precisely when
    # `verbose` was False.
    iterator = _build_iterator(pred_times, verbose)

    for pred_time in iterator:
        train = series.drop_end(pred_time)  # build the training series

        model.fit(train)
        pred = model.predict(fcast_horizon_n)
        values.append(pred.values()[-1])  # store the N-th point
        times.append(pred.end_time())  # store the N-th timestamp

    return TimeSeries.from_times_and_values(pd.DatetimeIndex(times),
                                            np.array(values))
def constant_timeseries(
    value: float = 1,
    length: int = 10,
    freq: str = 'D',
    start_ts: pd.Timestamp = pd.Timestamp('2000-01-01')
) -> 'TimeSeries':
    """
    Builds a TimeSeries that holds the same value at every index.

    :param value: The constant value taken by the TimeSeries at every index.
    :param length: The number of entries in the returned TimeSeries.
    :param freq: The time difference between two adjacent entries in the returned TimeSeries. A DateOffset alias is expected;
                   see: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects.
    :param start_ts: The time index of the first entry in the returned TimeSeries.
    :return: A constant TimeSeries with value 'value'.
    """

    time_index = pd.date_range(start=start_ts, periods=length, freq=freq)
    constant_values = np.full(shape=length, fill_value=value)

    return TimeSeries.from_times_and_values(time_index, constant_values)
def gaussian_timeseries(
    length: int = 10,
    freq: str = 'D',
    mean: Union[float, np.ndarray] = 0,
    std: Union[float, np.ndarray] = 1,
    start_ts: pd.Timestamp = pd.Timestamp('2000-01-01')
) -> 'TimeSeries':
    """
    Creates a gaussian noise TimeSeries by sampling a gaussian distribution with mean 'mean' and
    standard deviation 'std'. Each value represents a sample of the distribution.
    When the mean is set to 0, it can be considered a white noise TimeSeries.

    :param mean: The mean of the gaussian distribution that is sampled at each step.
                 If a float value is given, the same mean is used at every step.
                 If a numpy.ndarray of floats with the same length as 'length' is
                 given, a different mean is used at each step.
    :param std: The standard deviation of the gaussian distribution that is sampled at each step.
                If a float value is given, the same standard deviation is used at every step.
                If a 'length' x 'length' numpy.ndarray of floats is given, it will
                be used as covariance matrix for a multivariate gaussian distribution.
    :param length: The length of the returned TimeSeries.
    :param freq: The time difference between two adjacent entries in the returned TimeSeries. A DateOffset alias is expected.
    :param start_ts: The time index of the first entry in the returned TimeSeries.
    :return: A white noise TimeSeries created as indicated above.
    """

    mean_is_array = isinstance(mean, np.ndarray)
    std_is_array = isinstance(std, np.ndarray)

    if mean_is_array:
        raise_if_not(
            mean.shape == (length, ),
            'If a vector of means is provided, it requires the same length as the TimeSeries.',
            logger)
    if std_is_array:
        raise_if_not(std.shape == (length, length), 'If a matrix of standard deviations is provided,' \
                                              ' its shape has to match the length of the TimeSeries.', logger)

    times = pd.date_range(periods=length, freq=freq, start=start_ts)

    if std_is_array:
        # A (length, length) matrix cannot be broadcast by `np.random.normal(..., size=length)`.
        # Draw one sample of the multivariate gaussian instead, using `std` as covariance matrix
        # as documented. A scalar mean is expanded to one entry per step.
        mean_vector = mean if mean_is_array else np.full(length, mean, dtype=float)
        values = np.random.multivariate_normal(mean_vector, std)
    else:
        values = np.random.normal(mean, std, size=length)

    return TimeSeries.from_times_and_values(times, values)
def random_walk_timeseries(
    length: int = 10,
    freq: str = 'D',
    mean: float = 0,
    std: float = 1,
    start_ts: pd.Timestamp = pd.Timestamp('2000-01-01')
) -> 'TimeSeries':
    """
    Builds a random walk TimeSeries: 'length' independent samples are drawn from a gaussian distribution
    with mean 'mean' and standard deviation 'std', and the value at each index is the running sum of the
    samples drawn up to and including that index.

    :param mean: The mean of the gaussian distribution that is sampled at each step.
    :param std: The standard deviation of the gaussian distribution that is sampled at each step.
    :param length: The number of entries in the returned TimeSeries.
    :param freq: The time difference between two adjacent entries in the returned TimeSeries. A DateOffset alias is expected.
    :param start_ts: The time index of the first entry in the returned TimeSeries.
    :return: A random walk TimeSeries created as indicated above.
    """

    time_index = pd.date_range(start=start_ts, periods=length, freq=freq)
    increments = np.random.normal(loc=mean, scale=std, size=length)
    walk = increments.cumsum()

    return TimeSeries.from_times_and_values(time_index, walk)
示例#14
0
def simulate_forecast_regr(feature_series: List[TimeSeries],
                           target_series: TimeSeries,
                           model: RegressiveModel,
                           start: pd.Timestamp,
                           fcast_horizon_n: int,
                           trim_to_series: bool = True,
                           verbose=False) -> TimeSeries:
    """
    Simulates the forecasts that the regressive model `model` would have produced for `target_series`.

    Beginning at `start`, the function steps through successive prediction times spaced by the frequency
    of `target_series`. At each prediction time the model is re-fitted on the features and the target cut
    with `drop_after(pred_time)`, then asked to predict from the `fcast_horizon_n` feature points that start
    one time step after the prediction time. Only the last value of each forecast and its timestamp are kept.

    .. todo: review and add to documentation.
    .. todo: optionally also return weights, when those are available in model
    .. todo: (getattr(model.model, 'coef_', None) is not None)

    :param feature_series: the feature time series of the regressive model; all of them must have the same
                           time index as `target_series`
    :param target_series: the target time series of the regressive model (i.e., the series to predict)
    :param model: the RegressiveModel to use; it is re-fitted at every prediction time
    :param start: when the forecasts start (i.e., the first time at which a prediction is produced for a future time)
    :param fcast_horizon_n: the forecast horizon
    :param trim_to_series: if True, prediction times are bounded using the time index entry at position
                           `-fcast_horizon_n - 2`; otherwise the entry at position `-2` is used
    :param verbose: forwarded to `build_tqdm_iterator` to control progress reporting
    :return: a TimeSeries made of the last point of each simulated forecast
    """
    index_matches = [
        s.has_same_time_as(target_series) for s in feature_series
    ]
    raise_if_not(all(index_matches),
                 'All provided time series must have the same time index',
                 logger)
    raise_if_not(start in target_series,
                 'The provided start timestamp is not in the time series.',
                 logger)
    raise_if_not(
        start != target_series.end_time(),
        'The provided start timestamp is the last timestamp of the time series',
        logger)

    target_index = target_series.time_index()
    if trim_to_series:
        last_pred_time = target_index[-fcast_horizon_n - 2]
    else:
        last_pred_time = target_index[-2]

    # All prediction times are materialised up front so that the progress iterator knows their count.
    pred_times = [start]
    latest = start
    while latest <= last_pred_time:
        latest = latest + target_series.freq()
        pred_times.append(latest)

    # (value, timestamp) of the final point of every forecast
    final_points = []

    for pred_time in build_tqdm_iterator(pred_times, verbose):
        # training data: everything kept by `drop_after` at the prediction time
        train_features = [s.drop_after(pred_time) for s in feature_series]
        train_target = target_series.drop_after(pred_time)

        # validation features: the horizon window starting one step after the prediction time
        val_features = [
            s.slice_n_points_after(pred_time + target_series.freq(),
                                   fcast_horizon_n) for s in feature_series
        ]

        model.fit(train_features, train_target)
        forecast = model.predict(val_features)
        final_points.append((forecast.values()[-1], forecast.end_time()))

    values = [value for value, _ in final_points]
    times = [timestamp for _, timestamp in final_points]

    return TimeSeries.from_times_and_values(pd.DatetimeIndex(times),
                                            np.array(values))