def resample_data(self, data, expected_frequency) -> pd.Series:
    """Resample data to the expected frequency.

    Without a `resampling_config`, each bin is aggregated by its mean.
    With a config, every entry except "aggregation" is forwarded to
    `data.resample` as a keyword argument, and the resampler method named by
    "aggregation" is applied (mean if no aggregation is configured).

    Raises IncompatibleModelSpecs if the resampler offers no method with the
    configured aggregation name.
    """
    target_freq = timedelta_to_pandas_freq_str(expected_frequency)
    config = self.resampling_config

    # No custom configuration: default resampling, aggregated by mean
    if config is None:
        return data.resample(target_freq).mean()

    resample_kwargs = {
        key: value for key, value in config.items() if key != "aggregation"
    }
    data_resampler = data.resample(target_freq, **resample_kwargs)

    if "aggregation" not in config:
        return data_resampler.mean()

    # Look up the configured aggregation among the resampler's bound methods
    for method_name, method in inspect.getmembers(data_resampler,
                                                  inspect.ismethod):
        if method_name == config["aggregation"]:
            return method()

    raise IncompatibleModelSpecs(
        "Cannot find resampling aggregation %s on %s" %
        (config["aggregation"], data_resampler))
def add_lags(df: pd.DataFrame, column: str, lags: List[int],
             frequency: timedelta) -> Optional[pd.DataFrame]:
    """
    Create lag columns for a column in the dataframe.

    Each lag is a number of steps of `frequency`. The column is shifted with
    `Series.shift(lag)`, so positive values are lags (past values), while
    negative values are future values.

    The new columns are named like the lagged column, plus "_" and the suffix
    which `lag_to_suffix` returns for the lag (presumably "_l<lag>" for lags
    and "_f<lag>" for future values - verify against lag_to_suffix).

    Before shifting, the frame is re-indexed onto a regular UTC index which is
    extended by the largest lag at the end and by the most negative lag at the
    start, so that each shifted value has a row to land in.

    If `lags` is empty, df is returned unchanged.

    TODO: We could also review if using statsmodels.tsa.tsatools.add_lag is of interest, but here
          self-made is probably what we need.
    """
    if not lags:
        return df

    # Make sure the DataFrame has rows to accommodate each lag
    max_lag = frequency * max(lags)
    min_lag = frequency * min(lags)
    df_start = min(df.index[0], df.index[0] + min_lag)
    df_end = max(df.index[-1], df.index[-1] + max_lag)
    df = df.reindex(
        pd.date_range(
            start=df_start.astimezone(pytz.utc),
            end=df_end.astimezone(pytz.utc),
            freq=timedelta_to_pandas_freq_str(frequency),
        ))

    # One new column per lag, holding the column shifted by that many rows
    for lag in lags:
        df[column + "_" + lag_to_suffix(lag)] = df[column].shift(lag)

    return df
def test_timedelta_to_pd_freq_str():
    """Timedeltas are translated into the expected pandas frequency strings."""
    expectations = [
        (timedelta(seconds=5), "5S"),
        (timedelta(minutes=2), "2T"),
        (timedelta(minutes=15), "15T"),
        (timedelta(hours=1), "H"),
        (timedelta(hours=2), "2H"),
        (timedelta(hours=26), "26H"),
        # same duration as above, spelled as days plus hours
        (timedelta(days=1, hours=2), "26H"),
        (timedelta(days=1), "D"),
    ]
    for delta, expected_freq_str in expectations:
        assert timedelta_to_pandas_freq_str(delta) == expected_freq_str
def make_rolling_forecasts(
    start: datetime,  # Start of forecast period
    end: datetime,  # End of forecast period
    model_specs: ModelSpecs,
) -> Tuple[pd.Series, ModelState]:
    """
    Repeatedly call make_forecast - for all time steps in the desired time window (end is excluding).
    The time window of the specs (training + test data) is moved forwards also step by step.
    Will fail if series specs do not allocate enough data.
    May create a model whenever the previous one is outdated.
    Naive start and end datetimes are interpreted as UTC.
    Return Pandas.Series as result, as well as the last ModelState.
    """

    # Prepare time range: interpret naive datetimes as UTC.
    # datetime.replace returns a new object, so the result must be assigned back.
    if start.tzinfo is None:
        start = start.replace(tzinfo=pytz.utc)
    if end.tzinfo is None:
        end = end.replace(tzinfo=pytz.utc)

    # First, compute one big feature frame, once.
    feature_frame: pd.DataFrame = construct_features(
        (model_specs.start_of_training, end), model_specs)

    pd_frequency = timedelta_to_pandas_freq_str(model_specs.frequency)
    # NOTE: `closed` was renamed to `inclusive` in pandas 1.4 and removed in 2.0;
    # it is kept here to match the pandas version this file is written against.
    values = pd.Series(index=pd.date_range(
        start, end, freq=pd_frequency, closed="left", tz=start.tzinfo))
    time_step = start
    model = None
    logger.info("Forecasting from %s to %s", start, end)
    while time_step < end:
        # update_model decides whether the current model can be kept;
        # the specs part of the returned state is not needed here.
        model, _ = update_model(time_step,
                                model,
                                model_specs,
                                feature_frame=feature_frame).split()
        # Single-row slice for this time step, without the first column
        # (presumably the outcome variable - TODO confirm against construct_features)
        features = feature_frame.loc[time_step:time_step].iloc[:, 1:]
        values[time_step] = make_forecast_for(model_specs, features, model)
        time_step = time_step + model_specs.frequency

    return values, ModelState(model, model_specs)
def resample_data(
    self,
    data,
    time_window,
    expected_frequency,
) -> pd.Series:
    """Resample data to the expected frequency.

    In some cases the data explicitly defines the resolution of events to which the data pertains
    (e.g. timely-beliefs BeliefsDataFrames). Otherwise, the inferred frequency of the data is used
    as the event resolution.

    :param data: series with a DatetimeIndex
    :param time_window: None, a (start, end) tuple of datetimes or a timedelta; only used as a
                        fallback to guess the event resolution
    :param expected_frequency: timedelta denoting the frequency to resample to
    :raises IncompatibleModelSpecs: if the configured up- or downsampling method
                                    is not a method of the pandas resampler

    The resampling config itself is left untouched, so repeated calls see the same settings.
    """

    # Monkeypatch resampler only if needed
    if self.resampling_config["upsampling_method"] == "reverse_sum":

        def reverse_sum(resampler, **kwargs):
            """https://stackoverflow.com/questions/54877205#68019138"""
            s = resampler.asfreq()
            return s.fillna(0).groupby(s.notna().cumsum()).transform("mean")

        from pandas.core.resample import Resampler

        Resampler.reverse_sum = reverse_sum

    # Get or infer event resolution
    if "event_resolution" in self.resampling_config:
        # set through resampling config (read only, the config is not mutated)
        event_resolution = self.resampling_config["event_resolution"]
    elif hasattr(data, "event_resolution"):
        # timely-beliefs compatibility
        event_resolution = data.event_resolution
    else:
        inferred_freq = pd.infer_freq(data.index)
        if inferred_freq is not None:
            # use data frequency
            event_resolution = inferred_freq
        elif time_window is not None:
            # best guess from time window, which is usually a (start, end) tuple
            if isinstance(time_window, (tuple, list)):
                window_length = time_window[1] - time_window[0]
            else:
                window_length = time_window
            event_resolution = window_length / len(data)
        elif len(data) != 0:
            # best guess from data index
            event_resolution = (data.index[-1] - data.index[0]) / len(data)
        else:
            # assume instantaneous values, which can't be upsampled
            event_resolution = timedelta()

    # Integer ratio: 0 means the events are shorter than the expected frequency
    # (downsampling), 2 or more means they are longer (upsampling)
    resample_ratio = pd.to_timedelta(
        to_offset(event_resolution)) // pd.Timedelta(expected_frequency)
    if resample_ratio == 1:
        # fill in missing frequency and abort resampling
        data.index.freq = expected_frequency
        return data

    # Convert to PeriodIndex for desired behavior under resampling
    # (specifically, binning the right-most period such that the time series is resampled all
    # the way to the end of the last period instead of just until the start of the last period)
    # - remember timezone before resampling, see https://github.com/pandas-dev/pandas/issues/28039
    # - do not convert if resolution is 0, because PeriodIndex with 0 frequency is not allowed
    tz = data.index.tzinfo
    if event_resolution != timedelta(hours=0):
        data.index = data.index.to_period(event_resolution)
    # These config entries steer this method; all others are passed on to pandas
    own_config_keys = ("event_resolution", "downsampling_method",
                       "upsampling_method")
    data_resampler = data.resample(
        timedelta_to_pandas_freq_str(expected_frequency),
        **{
            k: v
            for k, v in self.resampling_config.items()
            if k not in own_config_keys
        },
    )

    # Choose between upsampling or downsampling
    up_or_down_sampling = "down" if resample_ratio < 1 else "up"
    configured_method_name = self.resampling_config[
        f"{up_or_down_sampling}sampling_method"]

    # Apply resampling method
    for resampling_method_name, resampling_method in inspect.getmembers(
            data_resampler, inspect.ismethod):
        if configured_method_name == resampling_method_name:
            if up_or_down_sampling == "up":
                # Fill NaN values introduced by upsampling, but no more than that
                data = resampling_method(limit=resample_ratio - 1)
            else:
                data = resampling_method()
            break
    else:
        raise IncompatibleModelSpecs(
            f"Cannot find {up_or_down_sampling}sampling method %s on %s" % (
                configured_method_name,
                data_resampler,
            ))

    # Convert to DatetimeIndex and place back timezone
    if event_resolution != timedelta(hours=0):
        data.index = data.index.to_timestamp().tz_localize(tz)
    data.index.freq = expected_frequency
    return data
def load_series(
    self,
    expected_frequency: timedelta,
    time_window: Optional[Tuple[datetime, datetime]] = None,
    transform_features: bool = False,
    check_time_window: bool = False,
) -> pd.Series:
    """Load the series data, check compatibility of series data with model specs
    and perform feature transformation, if needed.

    The actual implementation how to load is deferred to _load_series. Overwrite that for new subclasses.

    This function resamples data if the frequency is not equal to the expected frequency.
    It is possible to customise this resampling (without that, we aggregate means after default resampling).
    To customize resampling, pass in a `resampling_config` argument when you initialize a SeriesSpecs,
    with downsampling_method and upsampling_method names (e.g. "mean" and "pad", respectively),
    an optional event_resolution, and keyword params which are to be passed into `pandas.Series.resample`.
    For example:

    ```
    resampling_config=dict(
        event_resolution=timedelta(hours=1),
        closed="left",
        downsampling_method="sum",
        upsampling_method="reverse_sum",
    )
    ```

    Here:
    - the event resolution describes that, before resampling, each data point denotes a value over a 1-hour period.
      Being explicit about the event resolution is especially important when the data frequency is not the same as
      the event resolution, for example, in the case of upsampling a time series containing hourly averages with
      only one data point per day.
    - closed="left" will become an argument to `pandas.Series.resample`.
      It denotes that each period in the time series is indexed by its start time.
    - When downsampling, `pandas.Series.resample().sum()` will be called.
    - When upsampling, `pandas.Series.resample().reverse_sum()` will be called.

    Acceptable values for downsampling_method and upsampling_method are (string names of) possible re-sample
    methods offered by pandas plus a timetomodel-version of "reverse_sum".
    "mean" and "pad" are the default values for downsampling and upsampling, respectively.

    Similarly, pass in an `interpolation_config` to the class with keyword params to pass into
    `pandas.Series.interpolate`. For example, to fill gaps of at most 1 consecutive NaN value through
    interpolation of the time index (note that interpolation happens after resampling):

    `interpolation_config=dict(method="time", limit=1)`

    To be able to upsample, make sure a time_window is set.
    The time window is a tuple stating the index of the first and the index of the last data point.
    Naive datetimes in the time window are interpreted in the time zone of the data.
    You can check if a time window would be feasible, i.e. if enough data is loaded, and get suggestions.

    Raises IncompatibleModelSpecs (no DatetimeIndex, or a frequency which differs from the expected one),
    MissingData (no data, or not enough data for the time window) or
    NaNData (NaN values without an interpolation_config).
    """

    data = self._load_series().sort_index()

    # check if data has a DateTimeIndex
    if not isinstance(data.index, pd.DatetimeIndex):
        raise IncompatibleModelSpecs(
            "Loaded series has no DatetimeIndex, but %s" %
            type(data.index).__name__)

    # make sure we have a time zone (default to UTC), save original time zone
    if data.index.tzinfo is None:
        self.original_tz = pytz.utc
        data.index = data.index.tz_localize(self.original_tz)
    else:
        self.original_tz = data.index.tzinfo

    def _in_data_timezone(dt: datetime) -> datetime:
        """Attach the time zone of the data to a naive datetime."""
        if dt.tzinfo is not None:
            return dt
        # pytz zones need localize(); other tzinfo implementations
        # (e.g. datetime.timezone) do not offer it
        if hasattr(self.original_tz, "localize"):
            return self.original_tz.localize(dt)
        return dt.replace(tzinfo=self.original_tz)

    # interpret naive time_window in timezone of data
    if time_window is not None:
        time_window = (
            _in_data_timezone(time_window[0]),
            _in_data_timezone(time_window[1]),
        )

    # check if time series frequency is okay, if not then resample, and check again
    if data.index.freqstr != timedelta_to_pandas_freq_str(
            expected_frequency):
        data = self.resample_data(
            data=data,
            expected_frequency=expected_frequency,
            time_window=time_window,
        )
        assert data.index.freqstr == timedelta_to_pandas_freq_str(
            expected_frequency)

    # Raise error if data is empty or contains nan values
    if data.empty:
        raise MissingData(
            "No values found in requested %s data. It's no use to continue I'm afraid."
            % self.name)
    if data.isnull().values.any() and self.interpolation_config is None:
        raise NaNData(
            "Nan values found in the requested %s data. It's no use to continue I'm afraid."
            % self.name)

    # check if we have enough data for the expected time window
    if check_time_window and time_window is not None:
        error_msg = ""
        if data.index[0] > time_window[0]:
            error_msg += (
                "Data for %s starts too late (at %s), while we need data from %s "
                % (
                    self.name,
                    data.index[0],
                    time_window[0].astimezone(data.index[0].tzinfo),
                ))
        if data.index[-1] < time_window[1]:
            error_msg += (
                "Data for %s ends too early (at %s), while we need data until %s "
                % (
                    self.name,
                    data.index[-1],
                    time_window[1].astimezone(data.index[-1].tzinfo),
                ))
        if error_msg:
            raise MissingData(error_msg)

    if data.index.freqstr != timedelta_to_pandas_freq_str(
            expected_frequency):
        raise IncompatibleModelSpecs(
            "Loaded data for %s has different frequency (%s) than used in model specs expect (%s)."
            % (
                self.name,
                data.index.freqstr,
                timedelta_to_pandas_freq_str(expected_frequency),
            ))

    # interpolate after the frequency is set (setting the frequency may have created additional nan values)
    if self.interpolation_config is not None:
        data = self.interpolate_data(data)

    if transform_features and self.feature_transformation is not None:
        data = self.feature_transformation.transform_series(data)

    return data
def load_series(
    self,
    expected_frequency: timedelta,
    transform_features: bool = False,
    check_time_window: Optional[Tuple[datetime, datetime]] = None,
) -> pd.Series:
    """Load the series data, check compatibility of series data with model specs
    and perform feature transformation, if needed.

    The actual implementation how to load is deferred to _load_series. Overwrite that for new subclasses.

    This function resamples data if the frequency is not equal to the expected frequency.
    It is possible to customise this resampling (without that, we aggregate means after default resampling).
    To customize resampling, pass in a `resampling_config` argument when you initialize a SeriesSpecs,
    with an aggregation method name (e.g. "mean") and kw params which are to be passed into
    `pandas.Series.resample`. For example:

    `resampling_config={"closed": "left", "aggregation": "sum"}`

    Similarly, pass in an `interpolation_config` to the class with kw params to pass into
    `pandas.Series.interpolate`. For example, to fill gaps of at most 1 consecutive NaN value through
    interpolation of the time index:

    `interpolation_config={"method": "time", "limit": 1}`

    You can check if a time window would be feasible, i.e. if enough data is loaded, and get suggestions.
    Be sure to pass datetimes with tzinfo compatible to your data.

    Raises IncompatibleModelSpecs (no DatetimeIndex, or a frequency which still differs after resampling),
    MissingData (no data, or not enough data for the time window) or
    NaNData (NaN values left after the optional interpolation).
    """

    data = self._load_series().sort_index()

    # check if data has a DateTimeIndex
    if not isinstance(data.index, pd.DatetimeIndex):
        raise IncompatibleModelSpecs(
            "Loaded series has no DatetimeIndex, but %s" %
            type(data.index).__name__)

    # make sure we have a time zone (default to UTC), save original time zone
    if data.index.tzinfo is None:
        self.original_tz = pytz.utc
        data.index = data.index.tz_localize(self.original_tz)
    else:
        self.original_tz = data.index.tzinfo

    if self.interpolation_config is not None:
        data = self.interpolate_data(data)

    # Raise error if data is empty or contains nan values
    if data.empty:
        raise MissingData(
            "No values found in requested %s data. It's no use to continue I'm afraid."
            % self.name)
    if data.isnull().values.any():
        raise NaNData(
            "Nan values found in the requested %s data. It's no use to continue I'm afraid."
            % self.name)

    # check if we have enough data for the expected time window
    if check_time_window is not None:
        problems = []
        if data.index[0] > check_time_window[0]:
            problems.append(
                "Data starts too late (at %s), while we need data from %s" %
                (data.index[0], check_time_window[0]))
        if data.index[-1] < check_time_window[1]:
            problems.append(
                "Data ends too early (at %s), while we need data until %s" %
                (data.index[-1], check_time_window[1]))
        if problems:
            # keep the two complaints apart when both apply
            raise MissingData(" ".join(problems))

    # check if time series frequency is okay, if not then resample, and check again
    if data.index.freqstr != timedelta_to_pandas_freq_str(
            expected_frequency):
        data = self.resample_data(data, expected_frequency)
        if data.index.freqstr != timedelta_to_pandas_freq_str(
                expected_frequency):
            raise IncompatibleModelSpecs(
                "Loaded data for %s has different frequency (%s) than used in model specs expect (%s)."
                % (
                    self.name,
                    data.index.freqstr,
                    timedelta_to_pandas_freq_str(expected_frequency),
                ))

    if transform_features and self.feature_transformation is not None:
        data = self.feature_transformation.transform_series(data)

    return data
def get_time_steps(time_range: Union[str, datetime, Tuple[datetime, datetime]],
                   specs: ModelSpecs) -> pd.DatetimeIndex:
    """
    Get relevant datetime indices to build features for.

    The time_range parameter can be one or two datetime objects, in which case this function builds a
    DateTimeIndex (for two datetimes, the end is excluded).
    It can also be one of two strings: "train" or "test". In this situation, this function creates a
    training or testing period from model specs.

    Raises an Exception if time_range is none of the above, or if the period between two datetimes
    is not a multiple of the model frequency.

    TODO: we can check (and complain) if datetime objects are incompatible to specs.frequency
          e.g. if round_datetime(dt, by_seconds=specs.frequency.total_seconds()) != dt:
                   raise Exception("%s is not compatible with frequency %s." % (dt, specs.frequency))
          We have to discuss if we allow to use any time to start intervals or rather 15:00, 15:15, 15:30 etc ...
    """

    # check valid time_range parameter
    is_single_datetime = isinstance(time_range, datetime)
    # the length is checked first, so that a too short tuple gets the explanation below
    is_datetime_pair = (isinstance(time_range, tuple) and len(time_range) >= 2
                        and isinstance(time_range[0], datetime)
                        and isinstance(time_range[1], datetime))
    is_period_name = isinstance(time_range, str) and time_range in ("train",
                                                                    "test")
    if not (is_single_datetime or is_datetime_pair or is_period_name):
        raise Exception(
            "Goal for DateTimeIndex construction needs to be either a string ('train', 'test'), "
            "a tuple of two datetime objects or one datetime object.")

    pd_frequency = timedelta_to_pandas_freq_str(specs.frequency)

    # easy cases: one or two datetime objects
    if is_single_datetime:
        # NOTE(review): closed="left" excludes the end point, so with start == end
        # this looks like it yields an empty index - confirm that this is intended.
        return pd.date_range(time_range,
                             time_range,
                             closed="left",
                             freq=pd_frequency)
    if is_datetime_pair:
        if not timedelta_fits_into(specs.frequency,
                                   time_range[1] - time_range[0]):
            raise Exception(
                "Start & end period (%s to %s) does not cleanly fit a multiple of the model frequency (%s)"
                % (time_range[0], time_range[1], specs.frequency))
        return pd.date_range(time_range[0],
                             time_range[1],
                             closed="left",
                             freq=pd_frequency)

    # special cases: "train" or "test" - we have to calculate from model specs
    length_of_data = specs.end_of_testing - specs.start_of_training
    length_of_training = length_of_data * specs.ratio_training_testing_data
    if time_range == "train":
        end_of_training = round_datetime(
            specs.start_of_training + length_of_training,
            specs.frequency.total_seconds())
        logger.debug("Start of training: %s", specs.start_of_training)
        logger.debug("End of training: %s", end_of_training)
        return pd.date_range(specs.start_of_training,
                             end_of_training,
                             freq=pd_frequency)
    elif time_range == "test":
        # testing starts one time step after the training period
        start_of_testing = round_datetime(
            specs.start_of_training + length_of_training + specs.frequency,
            specs.frequency.total_seconds())
        logger.debug("Start of testing: %s", start_of_testing)
        logger.debug("End of testing: %s", specs.end_of_testing)
        return pd.date_range(start_of_testing,
                             specs.end_of_testing,
                             freq=pd_frequency)