예제 #1
0
 def resample_data(self, data, expected_frequency) -> pd.Series:
     """Resample ``data`` to ``expected_frequency``.

     Without a ``resampling_config`` the data is resampled with pandas
     defaults and aggregated by mean. Otherwise every config entry except
     "aggregation" is forwarded as a keyword argument to ``data.resample``,
     and the "aggregation" entry (if present) names the resampler method
     used to aggregate; mean is used when it is absent.

     Raises IncompatibleModelSpecs if the named aggregation is not a method
     of the resampler.
     """
     target_freq = timedelta_to_pandas_freq_str(expected_frequency)

     # No custom configuration: default resampling, aggregated by mean.
     if self.resampling_config is None:
         return data.resample(target_freq).mean()

     resample_kwargs = {
         key: value
         for key, value in self.resampling_config.items()
         if key != "aggregation"
     }
     data_resampler = data.resample(target_freq, **resample_kwargs)

     if "aggregation" not in self.resampling_config:
         return data_resampler.mean()

     # Look the aggregation up by name among the resampler's bound methods.
     chosen_method = next(
         (method for method_name, method in inspect.getmembers(
             data_resampler, inspect.ismethod)
          if self.resampling_config["aggregation"] == method_name),
         None,
     )
     if chosen_method is None:
         raise IncompatibleModelSpecs(
             "Cannot find resampling aggregation %s on %s" %
             (self.resampling_config["aggregation"], data_resampler))
     return chosen_method()
예제 #2
0
def add_lags(df: pd.DataFrame, column: str, lags: List[int],
             frequency: timedelta) -> Optional[pd.DataFrame]:
    """
    Creates lag columns for a column in the dataframe.

    Each lag is a number of steps of `frequency` (e.g. 15-minute steps if frequency is 15 minutes).
    Positive values are lags (past values), while negative values are future values.
    Each new column is named like the lagged column, plus "_" and the result of lag_to_suffix(lag)
    (presumably "l<lag>" for lags and "f<lag>" for future values - see lag_to_suffix for the exact format).

    If `lags` is empty, the dataframe is returned unchanged (same object). Otherwise a re-indexed copy
    is returned, whose index is in UTC and is extended at `frequency` so that every shifted value has a row.

    TODO: We could also review if using statsmodels.tsa.tsatools.add_lag is of interest, but here self-made is probably
          what we need.
    """

    if not lags:
        return df

    # Make sure the DataFrame has rows to accommodate each lag:
    # positive lags push values past the last row, negative lags pull them before the first row.
    max_lag = frequency * max(lags)
    min_lag = frequency * min(lags)
    df_start = min(df.index[0], df.index[0] + min_lag)
    df_end = max(df.index[-1], df.index[-1] + max_lag)
    df = df.reindex(
        pd.date_range(
            start=df_start.astimezone(pytz.utc),
            end=df_end.astimezone(pytz.utc),
            freq=timedelta_to_pandas_freq_str(frequency),
        ))

    # One shifted copy of the column per requested lag
    for lag in lags:
        df[column + "_" + lag_to_suffix(lag)] = df[column].shift(lag)

    return df
예제 #3
0
def test_timedelta_to_pd_freq_str():
    """Check the pandas frequency string produced for a range of timedeltas."""
    expected_freq_strings = [
        (timedelta(seconds=5), "5S"),
        (timedelta(minutes=2), "2T"),
        (timedelta(minutes=15), "15T"),
        # a single hour or day comes without a leading multiplier
        (timedelta(hours=1), "H"),
        (timedelta(hours=2), "2H"),
        # more than a day that is not a whole number of days is expressed in hours
        (timedelta(hours=26), "26H"),
        (timedelta(days=1, hours=2), "26H"),
        (timedelta(days=1), "D"),
    ]
    for delta, expected in expected_freq_strings:
        assert timedelta_to_pandas_freq_str(delta) == expected
예제 #4
0
def make_rolling_forecasts(
    start: datetime,  # Start of forecast period
    end: datetime,  # End of forecast period
    model_specs: ModelSpecs,
) -> Tuple[pd.Series, ModelState]:
    """
    Repeatedly call make_forecast_for - for all time steps in the desired time window
    (end is excluding).
    The time window of the specs (training + test data) is moved forwards also step by step.
    Will fail if series specs do not allocate enough data.
    May create a model whenever the previous one is outdated (update_model is called on every step).
    Naive start and end datetimes are interpreted as UTC.
    Return Pandas.Series as result, as well as the last ModelState.
    """

    # Prepare time range.
    # datetime objects are immutable: replace() returns a new object, so the names must be re-bound.
    if start.tzinfo is None:
        start = start.replace(tzinfo=pytz.utc)
    if end.tzinfo is None:
        end = end.replace(tzinfo=pytz.utc)

    # First, compute one big feature frame, once.
    feature_frame: pd.DataFrame = construct_features(
        (model_specs.start_of_training, end), model_specs)

    pd_frequency = timedelta_to_pandas_freq_str(model_specs.frequency)
    # NOTE(review): `closed` was renamed to `inclusive` in newer pandas releases - revisit when upgrading.
    values = pd.Series(index=pd.date_range(
        start, end, freq=pd_frequency, closed="left", tz=start.tzinfo))
    time_step = start
    model = None
    logger.info("Forecasting from %s to %s", start, end)
    while time_step < end:
        # Only the model is carried over to the next step; the returned specs are not used here.
        model, _ = update_model(time_step,
                                model,
                                model_specs,
                                feature_frame=feature_frame).split()
        # Single row for this time step, without the first column
        # (presumably the outcome variable - TODO confirm against construct_features).
        features = feature_frame.loc[time_step:time_step].iloc[:, 1:]
        values[time_step] = make_forecast_for(model_specs, features, model)
        time_step = time_step + model_specs.frequency

    return values, ModelState(model, model_specs)
예제 #5
0
    def resample_data(
        self,
        data,
        time_window,
        expected_frequency,
    ) -> pd.Series:
        """Resample data to the expected frequency.

        In some cases the data explicitly defines the resolution of events to which the data pertains
        (e.g. timely-beliefs BeliefsDataFrames).
        Otherwise, the inferred frequency of the data is used as the event resolution.
        If no frequency can be inferred (e.g. fewer than three data points), the resolution is guessed
        from the time window, or else from the span of the data index.

        :param data: time series to resample. Note that its index is modified in place.
        :param time_window: None, a (start, end) tuple, or a duration; only used to guess the event resolution.
        :param expected_frequency: the frequency to resample to.
        :returns: the resampled data, with the index frequency set to expected_frequency.
        :raises IncompatibleModelSpecs: if the configured up-/downsampling method is not a resampler method.
        """

        # Monkeypatch resampler only if needed
        if self.resampling_config["upsampling_method"] == "reverse_sum":

            def reverse_sum(resampler, **kwargs):
                """https://stackoverflow.com/questions/54877205#68019138"""
                s = resampler.asfreq()
                return s.fillna(0).groupby(
                    s.notna().cumsum()).transform("mean")

            from pandas.core.resample import Resampler

            Resampler.reverse_sum = reverse_sum

        # Get or infer event resolution
        if "event_resolution" in self.resampling_config:
            # set through resampling config (read, not popped, so the config survives repeated calls)
            event_resolution = self.resampling_config["event_resolution"]
        elif hasattr(data, "event_resolution"):
            # timely-beliefs compatibility
            event_resolution = data.event_resolution
        else:
            try:
                inferred_frequency = pd.infer_freq(data.index)
            except ValueError:
                # pandas needs at least three dates to infer a frequency
                inferred_frequency = None
            if inferred_frequency is not None:
                # use data frequency
                event_resolution = inferred_frequency
            elif len(data) == 0:
                # assume instantaneous values, which can't be upsampled
                event_resolution = timedelta()
            elif time_window is not None:
                # best guess from time window, which may be a (start, end) tuple or a duration
                if isinstance(time_window, tuple):
                    window_length = time_window[1] - time_window[0]
                else:
                    window_length = time_window
                event_resolution = window_length / len(data)
            else:
                # best guess from data index
                event_resolution = (data.index[-1] -
                                    data.index[0]) / len(data)

        resample_ratio = pd.to_timedelta(
            to_offset(event_resolution)) // pd.Timedelta(expected_frequency)
        if resample_ratio == 1:
            # fill in missing frequency and abort resampling
            data.index.freq = expected_frequency
            return data

        # Convert to PeriodIndex for desired behavior under resampling
        # (specifically, binning the right-most period such that the time series is resampled all
        # the way to the end of the last period instead of just until the start of the last period)
        # - remember timezone before resampling, see https://github.com/pandas-dev/pandas/issues/28039
        # - do not convert if resolution is 0, because PeriodIndex with 0 frequency is not allowed
        tz = data.index.tzinfo
        if event_resolution != timedelta(hours=0):
            data.index = data.index.to_period(event_resolution)

        # Everything in the config that is not consumed here is passed on to pandas
        data_resampler = data.resample(
            timedelta_to_pandas_freq_str(expected_frequency),
            **{
                k: v
                for k, v in self.resampling_config.items()
                if k not in ("downsampling_method", "upsampling_method",
                             "event_resolution")
            },
        )

        # Choose between upsampling or downsampling
        if resample_ratio < 1:
            up_or_down_sampling = "down"
        else:
            up_or_down_sampling = "up"

        # Apply resampling method
        for resampling_method_name, resampling_method in inspect.getmembers(
                data_resampler, inspect.ismethod):
            if (self.resampling_config[f"{up_or_down_sampling}sampling_method"]
                    == resampling_method_name):
                if up_or_down_sampling == "up":
                    # Fill NaN values introduced by upsampling, but no more than that
                    data = resampling_method(limit=resample_ratio - 1)
                else:
                    data = resampling_method()
                break
        else:
            raise IncompatibleModelSpecs(
                f"Cannot find {up_or_down_sampling}sampling method %s on %s" %
                (
                    self.
                    resampling_config[f"{up_or_down_sampling}sampling_method"],
                    data_resampler,
                ))

        # Convert to DatetimeIndex and place back timezone
        if event_resolution != timedelta(hours=0):
            data.index = data.index.to_timestamp().tz_localize(tz)
        data.index.freq = expected_frequency

        return data
예제 #6
0
    def load_series(
        self,
        expected_frequency: timedelta,
        time_window: Tuple[datetime, datetime] = None,
        transform_features: bool = False,
        check_time_window: bool = False,
    ) -> pd.Series:
        """Load the series data, check compatibility of series data with model specs
        and perform feature transformation, if needed.

        The actual implementation how to load is deferred to _load_series. Overwrite that for new subclasses.

        This function resamples data if the frequency is not equal to the expected frequency.
        It is possible to customise this resampling (without that, we aggregate means after default resampling).
        To customize resampling, pass in a `resampling_config` argument when you initialize a SeriesSpecs,
        with downsampling_method and upsampling_method names (e.g. "mean" and "pad", respectively),
        an optional event_resolution, and keyword params which are to be passed into `pandas.Series.resample`.
        For example:

        ```
        resampling_config=dict(
            event_resolution=timedelta(hours=1),
            closed="left",
            downsampling_method="sum",
            upsampling_method="reverse_sum",
        )
        ```

        Here:
        - the event resolution describes that, before resampling,
          each data point denotes a value over a 1-hour period.
          Being explicit about the event resolution is especially important when the data frequency
          is not the same as the event resolution, for example, in the case of
          upsampling a time series containing hourly averages with only one data point per day.
        - closed="left" will become an argument to `pandas.Series.resample`.
          It denotes that each period in the time series is indexed by its start time.
        - When downsampling, `pandas.Series.resample().sum()` will be called.
        - When upsampling, `pandas.Series.resample().reverse_sum()` will be called.
        Acceptable values for downsampling_method and upsampling_method are (string names of) possible
        re-sample methods offered by pandas plus a timetomodel-version of "reverse_sum".
        "mean" and "pad" are the default values for downsampling and upsampling, respectively.

        Similarly, pass in an `interpolation_config` to the class with keyword params to pass into
        `pandas.Series.interpolate`. For example, to fill gaps of at most 1 consecutive NaN value through
        interpolation of the time index (note that interpolation happens after resampling):

        `interpolation_config=dict(method="time", limit=1)`

        To be able to upsample, make sure a time_window is set.
        The time window is a tuple stating the index of the first and the index of the last data point.
        Naive datetimes in the time window are interpreted in the time zone of the data.

        You can check if a time window would be feasible, i.e. if enough data is loaded, and get suggestions.

        Raises IncompatibleModelSpecs if the data has no DatetimeIndex or an unexpected frequency,
        MissingData if the data is empty or does not cover the checked time window,
        and NaNData if the data contains NaN values while no interpolation is configured.
        """
        data = self._load_series().sort_index()

        # check if data has a DateTimeIndex
        if not isinstance(data.index, pd.DatetimeIndex):
            raise IncompatibleModelSpecs(
                "Loaded series has no DatetimeIndex, but %s" %
                type(data.index).__name__)

        # make sure we have a time zone (default to UTC), save original time zone
        if data.index.tzinfo is None:
            self.original_tz = pytz.utc
            data.index = data.index.tz_localize(self.original_tz)
        else:
            self.original_tz = data.index.tzinfo

        def localize(naive_dt: datetime) -> datetime:
            """Attach the time zone of the data to a naive datetime."""
            if hasattr(self.original_tz, "localize"):
                # pytz time zones need localize() to pick the right UTC offset
                return self.original_tz.localize(naive_dt)
            # other tzinfo implementations (e.g. datetime.timezone) have no localize()
            return naive_dt.replace(tzinfo=self.original_tz)

        # interpret naive time_window in timezone of data
        if time_window is not None and time_window[0].tzinfo is None:
            time_window = (localize(time_window[0]), time_window[1])
        if time_window is not None and time_window[1].tzinfo is None:
            time_window = (time_window[0], localize(time_window[1]))

        # check if time series frequency is okay, if not then resample, and check again
        if data.index.freqstr != timedelta_to_pandas_freq_str(
                expected_frequency):
            data = self.resample_data(
                data=data,
                expected_frequency=expected_frequency,
                time_window=time_window,
            )
            assert data.index.freqstr == timedelta_to_pandas_freq_str(
                expected_frequency)

        # Raise error if data is empty or contains nan values
        if data.empty:
            raise MissingData(
                "No values found in requested %s data. It's no use to continue I'm afraid."
                % self.name)
        if data.isnull().values.any() and self.interpolation_config is None:
            raise NaNData(
                "Nan values found in the requested %s data. It's no use to continue I'm afraid."
                % self.name)

        # check if we have enough data for the expected time window
        if check_time_window and time_window is not None:
            error_msg = ""
            if data.index[0] > time_window[0]:
                error_msg += (
                    "Data for %s starts too late (at %s), while we need data from %s "
                    % (
                        self.name,
                        data.index[0],
                        time_window[0].astimezone(data.index[0].tzinfo),
                    ))
            if data.index[-1] < time_window[1]:
                error_msg += (
                    "Data for %s ends too early (at %s), while we need data until %s "
                    % (
                        self.name,
                        data.index[-1],
                        time_window[1].astimezone(data.index[-1].tzinfo),
                    ))
            if error_msg:
                raise MissingData(error_msg)

            # guards against an unexpected frequency when the assert above is disabled (python -O)
            if data.index.freqstr != timedelta_to_pandas_freq_str(
                    expected_frequency):
                raise IncompatibleModelSpecs(
                    "Loaded data for %s has different frequency (%s) than used in model specs expect (%s)."
                    % (
                        self.name,
                        data.index.freqstr,
                        timedelta_to_pandas_freq_str(expected_frequency),
                    ))

        # interpolate after the frequency is set (setting the frequency may have created additional nan values)
        if self.interpolation_config is not None:
            data = self.interpolate_data(data)

        if transform_features and self.feature_transformation is not None:
            data = self.feature_transformation.transform_series(data)

        return data
예제 #7
0
    def load_series(
        self,
        expected_frequency: timedelta,
        transform_features: bool = False,
        check_time_window: Optional[Tuple[datetime, datetime]] = None,
    ) -> pd.Series:
        """Load the series data, check compatibility of series data with model specs
           and perform feature transformation, if needed.

           The actual implementation how to load is deferred to _load_series. Overwrite that for new subclasses.

           This function resamples data if the frequency is not equal to the expected frequency.
           It is possible to customise this resampling (without that, we aggregate means after default resampling).
           To customize resampling, pass in a `resampling_config` argument when you initialize a SeriesSpecs,
           with an aggregation method name (e.g. "mean") and kw params which are to be passed into
           `pandas.Series.resample`. For example:

           `resampling_config={"closed": "left", "aggregation": "sum"}`

           Similarly, pass in an `interpolation_config` to the class with kw params to pass into
           `pandas.Series.interpolate`. For example, to fill gaps of at most 1 consecutive NaN value through
           interpolation of the time index (note that interpolation happens before resampling):

           `interpolation_config={"method": "time", "limit": 1}`

           You can check if a time window would be feasible, i.e. if enough data is loaded, and get suggestions.
           Be sure to pass datetimes with tzinfo compatible to your data.

           Raises IncompatibleModelSpecs if the data has no DatetimeIndex or cannot be brought to the
           expected frequency, MissingData if the data is empty or does not cover the checked time window,
           and NaNData if NaN values remain after the optional interpolation.
        """
        data = self._load_series().sort_index()

        # check if data has a DateTimeIndex
        if not isinstance(data.index, pd.DatetimeIndex):
            raise IncompatibleModelSpecs(
                "Loaded series has no DatetimeIndex, but %s" %
                type(data.index).__name__)

        # make sure we have a time zone (default to UTC), save original time zone
        if data.index.tzinfo is None:
            self.original_tz = pytz.utc
            data.index = data.index.tz_localize(self.original_tz)
        else:
            self.original_tz = data.index.tzinfo

        if self.interpolation_config is not None:
            data = self.interpolate_data(data)

        # Raise error if data is empty or contains nan values
        if data.empty:
            raise MissingData(
                "No values found in requested %s data. It's no use to continue I'm afraid."
                % self.name)
        if data.isnull().values.any():
            raise NaNData(
                "Nan values found in the requested %s data. It's no use to continue I'm afraid."
                % self.name)

        # check if we have enough data for the expected time window
        if check_time_window is not None:
            problems = []
            if data.index[0] > check_time_window[0]:
                problems.append(
                    "Data starts too late (at %s), while we need data from %s"
                    % (data.index[0], check_time_window[0]))
            if data.index[-1] < check_time_window[1]:
                problems.append(
                    "Data ends too early (at %s), while we need data until %s"
                    % (data.index[-1], check_time_window[1]))
            if problems:
                # keep the two complaints apart when both apply
                raise MissingData(" ".join(problems))

        # check if time series frequency is okay, if not then resample, and check again
        if data.index.freqstr != timedelta_to_pandas_freq_str(
                expected_frequency):
            data = self.resample_data(data, expected_frequency)

            if data.index.freqstr != timedelta_to_pandas_freq_str(
                    expected_frequency):
                raise IncompatibleModelSpecs(
                    "Loaded data for %s has different frequency (%s) than used in model specs expect (%s)."
                    % (
                        self.name,
                        data.index.freqstr,
                        timedelta_to_pandas_freq_str(expected_frequency),
                    ))

        if transform_features and self.feature_transformation is not None:
            data = self.feature_transformation.transform_series(data)

        return data
예제 #8
0
def get_time_steps(time_range: Union[str, datetime, Tuple[datetime, datetime]],
                   specs: "ModelSpecs") -> pd.DatetimeIndex:
    """ Get relevant datetime indices to build features for.

        The time_range parameter can be one or two datetime objects, in which case this function builds a DateTimeIndex.
        It can also be one of two strings: "train" or "test". In this situation, this function creates a training or
        testing period from model specs.

        Raises an Exception if time_range is none of the above, or if a tuple of datetimes spans a period
        which is not a multiple of the model frequency.

        TODO: we can check (and complain) if datetime objects are incompatible to specs.frequency
              e.g. if round_datetime(dt, by_seconds=specs.frequency.total_seconds()) != dt:
                       raise Exception("%s is not compatible with frequency %s." % (dt, specs.frequency))
              We have to discuss if we allow to use any time to start intervals or rather 15:00, 15:15, 15:30 etc ...
    """
    # check valid time_range parameter
    is_single_datetime = isinstance(time_range, datetime)
    is_datetime_pair = (isinstance(time_range, tuple)
                        and len(time_range) == 2
                        and isinstance(time_range[0], datetime)
                        and isinstance(time_range[1], datetime))
    is_period_name = isinstance(time_range, str) and time_range in ("train",
                                                                    "test")
    if not (is_single_datetime or is_datetime_pair or is_period_name):
        raise Exception(
            "Goal for DateTimeIndex construction needs to be either a string ('train', 'test'), "
            "a tuple of two datetime objects or one datetime object.")

    pd_frequency = timedelta_to_pandas_freq_str(specs.frequency)

    # easy cases: one or two datetime objects
    if is_single_datetime:
        # NOTE(review): with start == end and closed="left" the end point is excluded,
        # so this presumably yields an empty index - confirm that this is intended.
        return pd.date_range(time_range,
                             time_range,
                             closed="left",
                             freq=pd_frequency)
    elif is_datetime_pair:
        if not timedelta_fits_into(specs.frequency,
                                   time_range[1] - time_range[0]):
            raise Exception(
                "Start & end period (%s to %s) does not cleanly fit a multiple of the model frequency (%s)"
                % (time_range[0], time_range[1], specs.frequency))
        return pd.date_range(time_range[0],
                             time_range[1],
                             closed="left",
                             freq=pd_frequency)

    # special cases: "train" or "test" - we have to calculate from model specs
    length_of_data = specs.end_of_testing - specs.start_of_training
    if time_range == "train":
        end_of_training = (specs.start_of_training +
                           length_of_data * specs.ratio_training_testing_data)
        end_of_training = round_datetime(end_of_training,
                                         specs.frequency.total_seconds())
        logger.debug("Start of training: %s", specs.start_of_training)
        logger.debug("End of training: %s", end_of_training)
        return pd.date_range(specs.start_of_training,
                             end_of_training,
                             freq=pd_frequency)
    elif time_range == "test":
        # testing starts one time step after the (unrounded) end of training
        start_of_testing = (
            specs.start_of_training +
            (length_of_data * specs.ratio_training_testing_data) +
            specs.frequency)
        start_of_testing = round_datetime(start_of_testing,
                                          specs.frequency.total_seconds())
        logger.debug("Start of testing: %s", start_of_testing)
        logger.debug("End of testing: %s", specs.end_of_testing)
        return pd.date_range(start_of_testing,
                             specs.end_of_testing,
                             freq=pd_frequency)