def from_data_entry(
    cls, item: DataEntry, freq: Optional[str] = None
) -> "TimeSeriesSlice":
    if freq is None:
        freq = item["start"].freq
    index = pd.period_range(
        start=item["start"], freq=freq, periods=len(item["target"])
    )

    feat_dynamic_cat = [
        pd.Series(cat, index=index)
        for cat in list(item.get("feat_dynamic_cat", []))
    ]
    feat_dynamic_real = [
        pd.Series(real, index=index)
        for real in list(item.get("feat_dynamic_real", []))
    ]
    feat_static_cat = list(item.get("feat_static_cat", []))
    feat_static_real = list(item.get("feat_static_real", []))

    return TimeSeriesSlice(
        target=pd.Series(item["target"], index=index),
        item=item[FieldName.ITEM_ID],
        feat_static_cat=feat_static_cat,
        feat_static_real=feat_static_real,
        feat_dynamic_cat=feat_dynamic_cat,
        feat_dynamic_real=feat_dynamic_real,
    )
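# Worked example (hand-written, standard pandas semantics) of the index
# construction above:
#   pd.period_range(start="2021-01-01", freq="D", periods=3)
# yields the daily periods 2021-01-01 .. 2021-01-03, so a target of length 3
# and each dynamic feature series share one calendar-aligned index.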
def _inject_nans_in_target(data_entry: DataEntry, p: float) -> DataEntry:
    """
    Returns a copy of the given `data_entry` where approximately a fraction
    `p` of the target values are NaNs.

    Parameters
    ----------
    data_entry
        The data entry to use as source.
    p
        The fraction of target positions to set to NaN (between 0 and 1).

    Returns
    -------
    A copy of `data_entry` with a modified target field.
    """
    nan_positions = np.sort(
        a=np.random.choice(
            a=np.arange(data_entry["target"].size, dtype=int),
            size=int(p * data_entry["target"].size),
            replace=False,
        )
    )
    nan_target = np.copy(data_entry["target"])
    nan_target[nan_positions] = np.nan

    # If p < 1.0, the value at the last position is kept unchanged;
    # otherwise, for large p, we might end up with NaNs in the last
    # context_length positions.
    if p < 1.0:
        nan_target[-1] = data_entry["target"][-1]

    return {
        key: (nan_target if key == "target" else val)
        for key, val in data_entry.items()
    }
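# A minimal usage sketch for `_inject_nans_in_target`; the dict literal is a
# hypothetical data entry, not a fixture from this codebase.
def _demo_inject_nans_in_target():
    entry = {"target": np.arange(10, dtype=float), "item_id": "demo"}
    corrupted = _inject_nans_in_target(entry, p=0.5)
    # int(0.5 * 10) = 5 positions are drawn; if the last index is among
    # them, it is restored afterwards, so 4 or 5 NaNs remain.
    assert 4 <= np.isnan(corrupted["target"]).sum() <= 5
    assert not np.isnan(corrupted["target"][-1])  # last value preserved
    assert not np.isnan(entry["target"]).any()  # source entry untouched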
def flatmap_transform(
    self, data: DataEntry, is_train: bool
) -> Iterator[DataEntry]:
    ts_fields = self.dynamic_feature_fields + [self.target_field]
    ts_target = data[self.target_field]
    len_target = ts_target.shape[-1]

    if is_train:
        if len_target < self.instance_length:
            # Returning [] for all time series would cause this to loop
            # forever!
            sampling_indices = (
                [len_target] if self.allow_target_padding else []
            )
        else:
            sampling_indices = self.instance_sampler(
                ts_target, self.instance_length, len_target
            )
    else:
        sampling_indices = [len_target]

    for i in sampling_indices:
        d = data.copy()

        pad_length = max(self.instance_length - i, 0)

        # update start field
        d[self.start_field] = shift_timestamp(
            data[self.start_field], i - self.instance_length
        )

        # set is_pad field
        is_pad = np.zeros(self.instance_length)
        if pad_length > 0:
            is_pad[:pad_length] = 1
        d[self.is_pad_field] = is_pad

        # update time series fields
        for ts_field in ts_fields:
            full_ts = data[ts_field]
            if pad_length > 0:
                pad_pre = self.pad_value * np.ones(
                    shape=full_ts.shape[:-1] + (pad_length,)
                )
                past_ts = np.concatenate(
                    [pad_pre, full_ts[..., :i]], axis=-1
                )
            else:
                past_ts = full_ts[..., (i - self.instance_length) : i]

            past_ts = past_ts.transpose() if self.output_NTC else past_ts
            d[self._past(ts_field)] = past_ts

            if self.use_prediction_features and not is_train:
                if not ts_field == self.target_field:
                    future_ts = full_ts[..., i : i + self.prediction_length]
                    future_ts = (
                        future_ts.transpose()
                        if self.output_NTC
                        else future_ts
                    )
                    d[self._future(ts_field)] = future_ts

            del d[ts_field]

        d[self.forecast_start_field] = shift_timestamp(
            d[self.start_field], self.instance_length
        )

        yield d
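# A standalone sketch (names local to the example) of the left-padding
# arithmetic used above: for a sampled index i, the emitted window always
# has instance_length entries, with max(instance_length - i, 0) padded
# values on the left and is_pad marking exactly those positions.
def _demo_left_padding(target, instance_length, i, pad_value=0.0):
    pad_length = max(instance_length - i, 0)
    is_pad = np.zeros(instance_length)
    is_pad[:pad_length] = 1
    if pad_length > 0:
        window = np.concatenate([pad_value * np.ones(pad_length), target[:i]])
    else:
        window = target[i - instance_length : i]
    return window, is_pad

# e.g. _demo_left_padding(np.arange(5.0), instance_length=4, i=2)
# -> (array([0., 0., 0., 1.]), array([1., 1., 0., 0.]))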
def _make_prophet_data_entry(self, entry: DataEntry) -> ProphetDataEntry:
    """
    Construct a :class:`ProphetDataEntry` from a regular
    :class:`DataEntry`.
    """
    train_length = len(entry["target"])
    prediction_length = self.prediction_length
    start = entry["start"]
    target = entry["target"]
    feat_dynamic_real = entry.get("feat_dynamic_real", [])

    # make sure each dynamic feature has the desired length
    for i, feature in enumerate(feat_dynamic_real):
        assert len(feature) == train_length + prediction_length, (
            f"Length mismatch for dynamic real-valued feature #{i}: "
            f"expected {train_length + prediction_length}, "
            f"got {len(feature)}"
        )

    return ProphetDataEntry(
        train_length=train_length,
        prediction_length=prediction_length,
        start=start,
        target=target,
        feat_dynamic_real=feat_dynamic_real,
    )
def predict_item(self, item: DataEntry) -> SampleForecast:
    return SampleForecast(
        samples=self.samples,
        start_date=item["start"],
        freq=self.freq,
        item_id=item.get(FieldName.ITEM_ID),
    )
def map_transform(self, data: DataEntry, is_train: bool) -> DataEntry:
    start = data[self.start_field]
    length = target_transformation_length(
        data[self.target_field], self.pred_length, is_train=is_train
    )
    self._update_cache(start, length)
    i0 = self._date_index[start]
    date_idx = self._date_index.iloc[i0 : i0 + length].index

    # When is_train is False, date_idx has length target_len +
    # prediction_len, which is useful for time feature generation, but
    # here we only need the target length.
    date_idx = date_idx[: len(data[self.target_field])]
    feature = pd.Series(np.ones(len(date_idx)) * np.nan, index=date_idx)

    mask = data[self.target_field] > 0
    feature.loc[mask] = feature.loc[mask].index

    # Fill the NaN in the first row with the corresponding date.
    # Assumption: if the series starts with zero demand, the earliest
    # date in the frame is taken as the start.
    if len(feature) > 0:
        if pd.isnull(feature[0]):
            feature[0] = feature.index[0]

    feature = feature.ffill().to_frame()
    feature["diff"] = feature.index.to_period(
        feature.index.freqstr
    ).astype(int) - pd.DatetimeIndex(feature.iloc[:, 0]).to_period(
        feature.index.freqstr
    ).astype(int)
    feature["diff"] = feature["diff"].shift(1).round() + 1
    feature["diff"] = feature["diff"].fillna(method="bfill")
    feature = feature["diff"].values

    if self.output_field in data.keys():
        data[self.output_field] = np.vstack(
            [data[self.output_field], feature]
        )
    else:
        data[self.output_field] = feature

    return data
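# A simplified, hypothetical illustration of what the "diff" feature above
# encodes: for each step, one plus the number of periods from the previous
# step back to the most recent strictly positive demand. This pure-numpy
# sketch ignores the calendar index, the cache, and the final backfill of
# the leading NaN, and is not a drop-in replacement for map_transform.
def _demo_periods_since_positive(target):
    last_pos = None  # index of the most recent strictly positive value
    diffs = []
    for t, y in enumerate(target):
        if last_pos is None:
            diffs.append(np.nan)
        else:
            diffs.append((t - 1) - last_pos + 1)
        if y > 0:
            last_pos = t
    return np.array(diffs)

# e.g. _demo_periods_since_positive(np.array([1.0, 0.0, 0.0, 2.0, 0.0]))
# -> array([nan, 1., 2., 3., 1.]); map_transform additionally backfills
# the leading NaN.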
def flatmap_transform(
    self, data: DataEntry, is_train: bool
) -> Iterator[DataEntry]:
    ts_fields = self.dynamic_feature_fields + [self.target_field]
    ts_target = data[self.target_field]

    sampling_indices = self.instance_sampler(ts_target)

    for i in sampling_indices:
        d = data.copy()

        pad_length = max(self.instance_length - i, 0)

        # update start field
        d[self.start_field] = (
            data[self.start_field] + i - self.instance_length
        )

        # set is_pad field
        is_pad = np.zeros(self.instance_length, dtype=ts_target.dtype)
        if pad_length > 0:
            is_pad[:pad_length] = 1
        d[self.is_pad_field] = is_pad

        # update time series fields
        for ts_field in ts_fields:
            full_ts = data[ts_field]
            if pad_length > 0:
                pad_pre = self.pad_value * np.ones(
                    shape=full_ts.shape[:-1] + (pad_length,)
                )
                past_ts = np.concatenate(
                    [pad_pre, full_ts[..., :i]], axis=-1
                )
            else:
                past_ts = full_ts[..., (i - self.instance_length) : i]

            past_ts = past_ts.transpose() if self.output_NTC else past_ts
            d[self._past(ts_field)] = past_ts

            if self.use_prediction_features:
                if not ts_field == self.target_field:
                    future_ts = full_ts[..., i : i + self.prediction_length]
                    future_ts = (
                        future_ts.transpose()
                        if self.output_NTC
                        else future_ts
                    )
                    d[self._future(ts_field)] = future_ts

            del d[ts_field]

        d[self.forecast_start_field] = (
            d[self.start_field] + self.instance_length
        )

        yield d
def predict_item(self, item: DataEntry) -> SampleForecast:
    samples_shape = self.num_samples, self.prediction_length
    samples = np.full(samples_shape, self.value)
    return SampleForecast(
        samples=samples,
        start_date=forecast_start(item),
        item_id=item.get("id"),
    )
def flatmap_transform(
    self, data: DataEntry, is_train: bool
) -> Iterator[DataEntry]:
    pl = self.future_length
    lt = self.lead_time
    target = data[self.target_field]

    sampled_indices = self.instance_sampler(target)

    slice_cols = (
        self.ts_fields
        + self.past_ts_fields
        + [self.target_field, self.observed_value_field]
    )

    for i in sampled_indices:
        pad_length = max(self.past_length - i, 0)
        d = data.copy()

        for field in slice_cols:
            if i >= self.past_length:
                past_piece = d[field][..., i - self.past_length : i]
            else:
                pad_block = np.full(
                    shape=d[field].shape[:-1] + (pad_length,),
                    fill_value=self.dummy_value,
                    dtype=d[field].dtype,
                )
                past_piece = np.concatenate(
                    [pad_block, d[field][..., :i]], axis=-1
                )
            future_piece = d[field][..., (i + lt) : (i + lt + pl)]

            if field in self.ts_fields:
                piece = np.concatenate([past_piece, future_piece], axis=-1)
                if self.output_NTC:
                    piece = piece.transpose()
                d[field] = piece
            else:
                if self.output_NTC:
                    past_piece = past_piece.transpose()
                    future_piece = future_piece.transpose()
                if field not in self.past_ts_fields:
                    d[self._past(field)] = past_piece
                    d[self._future(field)] = future_piece
                    del d[field]
                else:
                    d[field] = past_piece

        pad_indicator = np.zeros(self.past_length)
        if pad_length > 0:
            pad_indicator[:pad_length] = 1

        d[self._past(self.is_pad_field)] = pad_indicator
        d[self.forecast_start_field] = shift_timestamp(
            d[self.start_field], i + lt
        )

        yield d
def predict_item(self, item: DataEntry) -> Forecast:
    prediction = item["target"][-self.prediction_length :]
    samples = np.broadcast_to(
        array=np.expand_dims(prediction, 0),
        shape=(self.num_samples, self.prediction_length),
    )

    return SampleForecast(
        samples=samples,
        start_date=forecast_start(item),
        item_id=item.get(FieldName.ITEM_ID),
    )
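# Minimal sketch of the broadcasting trick above: np.broadcast_to returns a
# read-only view, so the num_samples identical sample paths share a single
# buffer instead of being materialized num_samples times. Toy values only.
def _demo_broadcast_samples(prediction, num_samples):
    samples = np.broadcast_to(
        np.expand_dims(prediction, 0), (num_samples, len(prediction))
    )
    assert samples.base is not None  # a view, not a copy
    assert not samples.flags.writeable  # read-only by construction
    return samples

# e.g. _demo_broadcast_samples(np.array([1.0, 2.0]), 3).shape -> (3, 2)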
def flatmap_transform(
    self, data: DataEntry, is_train: bool
) -> Iterator[DataEntry]:
    pl = self.future_length
    lt = self.lead_time
    slice_cols = self.ts_fields + [self.target_field]
    target = data[self.target_field]

    sampled_indices = self.instance_sampler(target)

    for i in sampled_indices:
        pad_length = max(self.past_length - i, 0)
        d = data.copy()

        for ts_field in slice_cols:
            if i > self.past_length:
                # truncate to past_length
                past_piece = d[ts_field][..., i - self.past_length : i]
            elif i < self.past_length:
                pad_block = (
                    np.ones(
                        d[ts_field].shape[:-1] + (pad_length,),
                        dtype=d[ts_field].dtype,
                    )
                    * self.dummy_value
                )
                past_piece = np.concatenate(
                    [pad_block, d[ts_field][..., :i]], axis=-1
                )
            else:
                past_piece = d[ts_field][..., :i]

            d[self._past(ts_field)] = past_piece
            d[self._future(ts_field)] = d[ts_field][
                ..., i + lt : i + lt + pl
            ]
            del d[ts_field]

        pad_indicator = np.zeros(self.past_length, dtype=target.dtype)
        if pad_length > 0:
            pad_indicator[:pad_length] = 1

        if self.output_NTC:
            for ts_field in slice_cols:
                d[self._past(ts_field)] = d[
                    self._past(ts_field)
                ].transpose()
                d[self._future(ts_field)] = d[
                    self._future(ts_field)
                ].transpose()

        d[self._past(self.is_pad_field)] = pad_indicator
        d[self.forecast_start_field] = d[self.start_field] + i + lt

        yield d
def predict_item(self, item: DataEntry) -> SampleForecast:
    if self.context_length is not None:
        target = item["target"][-self.context_length :]
    else:
        target = item["target"]

    mean = np.nanmean(target)
    std = np.nanstd(target)
    normal = np.random.standard_normal(self.shape)

    return SampleForecast(
        samples=std * normal + mean,
        start_date=forecast_start(item),
        item_id=item.get(FieldName.ITEM_ID),
    )
def predict_item(self, item: DataEntry) -> SampleForecast:
    target = item["target"].tolist()

    for _ in range(self.prediction_length):
        if self.context_length is not None:
            window = target[-self.context_length :]
        else:
            window = target
        target.append(np.nanmean(window))

    return SampleForecast(
        samples=np.array([target[-self.prediction_length :]]),
        start_date=forecast_start(item),
        item_id=item.get(FieldName.ITEM_ID),
    )
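# Worked example (hand-computed) of the recursive moving-average forecast
# above, where each step appends the mean of the trailing context window so
# later steps feed on earlier predictions:
# with target = [2.0, 4.0], context_length = 2, prediction_length = 2:
#   step 1: mean([2.0, 4.0]) = 3.0  -> target becomes [2.0, 4.0, 3.0]
#   step 2: mean([4.0, 3.0]) = 3.5  -> target becomes [2.0, 4.0, 3.0, 3.5]
# so the single returned sample path is [3.0, 3.5].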
def predict_item(self, item: DataEntry) -> Forecast:
    past_ts_data = item["target"]
    item_id = item.get("item_id", None)
    forecast_start_time = forecast_start(item)

    assert (
        len(past_ts_data) >= 1
    ), "all time series should have at least one data point"

    prediction = naive_2(past_ts_data, self.prediction_length, self.freq)
    samples = np.array([prediction])

    return SampleForecast(
        samples=samples,
        start_date=forecast_start_time,
        item_id=item_id,
    )
def predict_item(self, item: DataEntry) -> Forecast:
    target = np.asarray(item["target"], np.float32)
    len_ts = len(target)
    forecast_start_time = forecast_start(item)

    assert (
        len_ts >= 1
    ), "all time series should have at least one data point"

    if len_ts >= self.season_length:
        indices = [
            len_ts - self.season_length + k % self.season_length
            for k in range(self.prediction_length)
        ]
        samples = target[indices].reshape((1, self.prediction_length))
    else:
        samples = np.full(
            shape=(1, self.prediction_length), fill_value=target.mean()
        )

    return SampleForecast(
        samples=samples,
        start_date=forecast_start_time,
        item_id=item.get("item_id", None),
    )
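# Worked example (hand-computed) of the seasonal-naive index arithmetic
# above: with len_ts = 10, season_length = 7, prediction_length = 3,
#   indices = [10 - 7 + k % 7 for k in range(3)] = [3, 4, 5],
# i.e. each forecast step copies the observation exactly one season back.
# When prediction_length exceeds season_length, k % season_length wraps
# around so the last observed season is repeated.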
def flatmap_transform(
    self, data: DataEntry, is_train: bool
) -> Iterator[DataEntry]:
    pl = self.future_length
    lt = self.lead_time
    target = data[self.target_field]
    len_target = target.shape[-1]

    minimum_length = (
        self.future_length
        if self.pick_incomplete
        else self.past_length + self.future_length
    ) + self.lead_time

    if is_train:
        sampling_bounds = (
            (
                0,
                len_target - self.future_length - self.lead_time,
            )
            if self.pick_incomplete
            else (
                self.past_length,
                len_target - self.future_length - self.lead_time,
            )
        )

        # We currently cannot handle time series that are
        # too short during training, so we just skip these.
        # If we want to include them we would need to pad and to
        # mask the loss.
        sampled_indices = (
            np.array([], dtype=int)
            if len_target < minimum_length
            else self.train_sampler(target, *sampling_bounds)
        )
    else:
        assert self.pick_incomplete or len_target >= self.past_length
        sampled_indices = np.array([len_target], dtype=int)

    slice_cols = (
        self.ts_fields
        + self.past_ts_fields
        + [self.target_field, self.observed_value_field]
    )

    for i in sampled_indices:
        pad_length = max(self.past_length - i, 0)
        if not self.pick_incomplete and pad_length > 0:
            raise RuntimeError(
                f"pad_length should be zero, got {pad_length}"
            )
        d = data.copy()

        for field in slice_cols:
            if i >= self.past_length:
                past_piece = d[field][..., i - self.past_length : i]
            else:
                pad_block = (
                    np.ones(
                        d[field].shape[:-1] + (pad_length,),
                        dtype=d[field].dtype,
                    )
                    * self.dummy_value
                )
                past_piece = np.concatenate(
                    [pad_block, d[field][..., :i]], axis=-1
                )
            future_piece = d[field][..., (i + lt) : (i + lt + pl)]

            if field in self.ts_fields:
                piece = np.concatenate([past_piece, future_piece], axis=-1)
                if self.output_NTC:
                    piece = piece.transpose()
                d[field] = piece
            else:
                if self.output_NTC:
                    past_piece = past_piece.transpose()
                    future_piece = future_piece.transpose()
                d[self._past(field)] = past_piece
                if field not in self.past_ts_fields:
                    d[self._future(field)] = future_piece
                del d[field]

        pad_indicator = np.zeros(self.past_length)
        if pad_length > 0:
            pad_indicator[:pad_length] = 1

        d[self._past(self.is_pad_field)] = pad_indicator
        d[self.forecast_start_field] = shift_timestamp(
            d[self.start_field], i + lt
        )

        yield d
def flatmap_transform(
    self, data: DataEntry, is_train: bool
) -> Iterator[DataEntry]:
    target = data[self.target_field]

    if is_train:
        # We currently cannot handle time series that are shorter than the
        # prediction length during training, so we just skip these.
        # If we want to include them we would need to pad and to mask
        # the loss.
        if len(target) < self.dec_len:
            return

        sampling_indices = self.train_sampler(
            target, 0, len(target) - self.dec_len
        )
    else:
        sampling_indices = [len(target)]

    # Loop over all encoder and decoder fields, even those that are
    # disabled, to set dummy zero fields in those cases.
    ts_fields_counter = Counter(
        set(self.encoder_series_fields + self.decoder_series_fields)
    )

    for sampling_idx in sampling_indices:
        # ensure start index is not negative
        start_idx = max(0, sampling_idx - self.enc_len)

        # irrelevant data should have been removed by now in the
        # transformation chain, so copying everything is ok
        out = data.copy()

        for ts_field in list(ts_fields_counter.keys()):
            # target is 1d, this ensures ts is always 2d
            ts = np.atleast_2d(out[ts_field]).T

            if ts_fields_counter[ts_field] == 1:
                del out[ts_field]
            else:
                ts_fields_counter[ts_field] -= 1

            # take enc_len values from ts, depending on sampling_idx
            slice = ts[start_idx:sampling_idx, :]

            ts_len = ts.shape[1]
            past_piece = np.zeros(
                shape=(self.enc_len, ts_len), dtype=ts.dtype
            )

            if ts_field not in self.encoder_disabled_fields:
                # if we have less than enc_len values, pad_left with 0
                past_piece = pad_to_size(slice, self.enc_len)
            out[self._past(ts_field)] = past_piece

            # exclude some fields at prediction time
            if (
                not is_train
                and ts_field in self.prediction_time_decoder_exclude
            ):
                continue

            # This is where some of the forking magic happens:
            # For each of the encoder_len time-steps at which the decoder
            # is applied we slice the corresponding inputs called
            # decoder_fields to the appropriate dec_len.
            if ts_field in self.decoder_series_fields:
                forking_dec_field = np.zeros(
                    shape=(self.num_forking, self.dec_len, ts_len),
                    dtype=ts.dtype,
                )

                # in case it's not disabled we copy the actual values
                if ts_field not in self.decoder_disabled_fields:
                    # In case we sample an index too close to the beginning
                    # of the time series, we would run out of bounds (i.e.
                    # try to copy non-existent time series data) when
                    # preparing the input for the decoder. Instead of
                    # copying the partially available data from the time
                    # series and padding it with zeros, we simply skip
                    # copying the partial data. Since copying data would
                    # override the zero pre-initialized 3D array, the end
                    # result of skipping is that the affected 2D decoder
                    # inputs (of which there are `skip` many) remain all 0.
                    skip = max(0, self.num_forking - sampling_idx)
                    start_idx = max(0, sampling_idx - self.num_forking)
                    # For a transposed, column-major (Fortran-ordered) 2D
                    # array, strides = (dtype, dtype * n_rows); for standard
                    # row-major arrays, strides = (dtype * n_cols, dtype).
                    stride = ts.strides
                    forking_dec_field[skip:, :, :] = as_strided(
                        ts[
                            start_idx
                            + 1 : start_idx
                            + 1
                            + self.num_forking
                            - skip,
                            :,
                        ],
                        shape=(
                            self.num_forking - skip,
                            self.dec_len,
                            ts_len,
                        ),
                        # Strides for the 2D array expanded to a 3D array of
                        # shape (dim1, dim2, dim3) = (1, n_rows, n_cols).
                        # Since this array has been transposed, it is stored
                        # in column-major (Fortran) ordering, i.e. for
                        # transposed data of shape (dim1, dim2, dim3),
                        # strides = (dtype, dtype * dim1, dtype * dim1 * dim2)
                        #         = (dtype, dtype, dtype * n_rows).
                        strides=stride[0:1] + stride,
                    )

                # edge case for prediction_length = 1
                if forking_dec_field.shape[-1] == 1:
                    out[self._future(ts_field)] = np.squeeze(
                        forking_dec_field, axis=-1
                    )
                else:
                    out[self._future(ts_field)] = forking_dec_field

        # So far pad indicator not in use
        pad_indicator = np.zeros(self.enc_len)
        pad_length = max(0, self.enc_len - sampling_idx)
        pad_indicator[:pad_length] = True
        out[self._past(self.is_pad_out)] = pad_indicator

        # So far forecast_start not in use
        out[FieldName.FORECAST_START] = shift_timestamp(
            out[self.start_in], sampling_idx
        )

        yield out
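# A minimal, self-contained sketch of the as_strided trick used in the
# forking splitters above (toy shapes, not taken from a test): prepending
# the element stride turns a transposed, column-major 2D array of shape
# (n_rows, n_cols) into num_forking overlapping windows of shape
# (dec_len, n_cols) without copying any data.
def _demo_forking_windows():
    from numpy.lib.stride_tricks import as_strided

    ts = np.atleast_2d(np.arange(6.0)).T  # shape (6, 1), Fortran-ordered view
    num_forking, dec_len = 5, 2
    stride = ts.strides  # (itemsize, itemsize * n_rows)
    windows = as_strided(
        ts,
        shape=(num_forking, dec_len, ts.shape[1]),
        strides=stride[0:1] + stride,
    )
    # windows[j] is the zero-copy view ts[j : j + dec_len]
    assert np.array_equal(windows[0], ts[0:2])
    assert np.array_equal(windows[4], ts[4:6])
    return windows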
def flatmap_transform(
    self, data: DataEntry, is_train: bool
) -> Iterator[DataEntry]:
    target = data[self.target_field]

    if is_train:
        # We currently cannot handle time series that are shorter than the
        # prediction length during training, so we just skip these.
        # If we want to include them we would need to pad and to mask
        # the loss.
        if len(target) < self.dec_len:
            return

        sampling_indices = self.train_sampler(
            target, 0, len(target) - self.dec_len
        )
    else:
        sampling_indices = [len(target)]

    ts_fields_counter = Counter(
        set(self.encoder_series_fields + self.decoder_series_fields)
    )

    for sampling_idx in sampling_indices:
        # ensure start index is not negative
        start_idx = max(0, sampling_idx - self.enc_len)

        # irrelevant data should have been removed by now in the
        # transformation chain, so copying everything is ok
        out = data.copy()

        for ts_field in list(ts_fields_counter.keys()):
            # target is 1d, this ensures ts is always 2d
            ts = np.atleast_2d(out[ts_field])

            if ts_fields_counter[ts_field] == 1:
                del out[ts_field]
            else:
                ts_fields_counter[ts_field] -= 1

            # take enc_len values from ts, depending on sampling_idx
            slice = ts[:, start_idx:sampling_idx]

            # if we have less than enc_len values, pad_left with 0
            past_piece = pad_to_size(slice, self.enc_len)
            out[self._past(ts_field)] = past_piece.transpose()

            # exclude some fields at prediction time
            if (
                not is_train
                and ts_field in self.prediction_time_decoder_exclude
            ):
                continue

            # This is where some of the forking magic happens:
            # For each of the encoder_len time-steps at which the decoder
            # is applied we slice the corresponding inputs called
            # decoder_fields to the appropriate dec_len.
            if (
                ts_field
                in self.decoder_series_fields
                + self.decoder_disabled_fields
            ):
                forking_dec_field = np.zeros(
                    shape=(self.enc_len, self.dec_len, len(ts))
                )

                # in case it's not disabled we copy the actual values
                if ts_field not in self.decoder_disabled_fields:
                    skip = max(0, self.enc_len - sampling_idx)
                    # This section takes by far the longest time
                    # computationally: it scales linearly in self.enc_len
                    # and linearly in self.dec_len.
                    for dec_field, idx in zip(
                        forking_dec_field[skip:],
                        range(start_idx + 1, start_idx + self.enc_len + 1),
                    ):
                        dec_field[:] = ts[:, idx : idx + self.dec_len].T

                if forking_dec_field.shape[-1] == 1:
                    out[self._future(ts_field)] = np.squeeze(
                        forking_dec_field, axis=-1
                    )
                else:
                    out[self._future(ts_field)] = forking_dec_field

        # So far pad indicator not in use
        pad_indicator = np.zeros(self.enc_len)
        pad_length = max(0, self.enc_len - sampling_idx)
        pad_indicator[:pad_length] = True
        out[self._past(self.is_pad_out)] = pad_indicator

        # So far forecast_start not in use
        out[FieldName.FORECAST_START] = shift_timestamp(
            out[self.start_in], sampling_idx
        )

        yield out
def flatmap_transform(
    self, data: DataEntry, is_train: bool
) -> Iterator[DataEntry]:
    target = data[self.target_field]

    sampled_indices = self.instance_sampler(target)

    ts_fields = set(self.encoder_series_fields + self.decoder_series_fields)

    for idx in sampled_indices:
        # irrelevant data should have been removed by now in the
        # transformation chain, so copying everything is ok
        out = data.copy()

        enc_len_diff = idx - self.enc_len
        dec_len_diff = idx - self.num_forking

        # ensure start indices are not negative
        start_idx_enc = max(0, enc_len_diff)
        start_idx_dec = max(0, dec_len_diff)

        # Define pad lengths for shorter time series of variable length
        # being updated in place
        pad_length_enc = max(0, -enc_len_diff)
        pad_length_dec = max(0, -dec_len_diff)

        for ts_field in ts_fields:
            # target is 1d, this ensures ts is always 2d
            ts = np.atleast_2d(out[ts_field]).T
            ts_len = ts.shape[1]

            del out[ts_field]

            out[self._past(ts_field)] = np.zeros(
                shape=(self.enc_len, ts_len), dtype=ts.dtype
            )
            if ts_field not in self.encoder_disabled_fields:
                out[self._past(ts_field)][pad_length_enc:] = ts[
                    start_idx_enc:idx, :
                ]

            if ts_field in self.decoder_series_fields:
                out[self._future(ts_field)] = np.zeros(
                    shape=(self.num_forking, self.dec_len, ts_len),
                    dtype=ts.dtype,
                )
                if ts_field not in self.decoder_disabled_fields:
                    # This is where some of the forking magic happens:
                    # For each of the num_forking time-steps at which the
                    # decoder is applied we slice the corresponding inputs
                    # called decoder_fields to the appropriate dec_len.
                    decoder_fields = ts[start_idx_dec + 1 : idx + 1, :]
                    # For default row-major arrays,
                    # strides = (dtype * n_cols, dtype). Since this array is
                    # transposed, it is stored in column-major (Fortran)
                    # ordering with strides = (dtype, dtype * n_rows).
                    stride = decoder_fields.strides
                    out[self._future(ts_field)][
                        pad_length_dec:
                    ] = as_strided(
                        decoder_fields,
                        shape=(
                            self.num_forking - pad_length_dec,
                            self.dec_len,
                            ts_len,
                        ),
                        # Strides for the 2D array expanded to a 3D array of
                        # shape (dim1, dim2, dim3) = (1, n_rows, n_cols).
                        # For transposed data,
                        # strides = (dtype, dtype * dim1, dtype * dim1 * dim2)
                        #         = (dtype, dtype, dtype * n_rows).
                        strides=stride[0:1] + stride,
                    )

                # edge case for prediction_length = 1
                if out[self._future(ts_field)].shape[-1] == 1:
                    out[self._future(ts_field)] = np.squeeze(
                        out[self._future(ts_field)], axis=-1
                    )

        # So far encoder pad indicator not in use -
        # marks that left padding for the encoder will occur on shorter
        # time series
        pad_indicator = np.zeros(self.enc_len)
        pad_indicator[:pad_length_enc] = True
        out[self._past(self.is_pad_out)] = pad_indicator

        # So far forecast_start not in use
        out[FieldName.FORECAST_START] = shift_timestamp(
            out[self.start_in], idx
        )

        yield out
def transform(self, data: DataEntry) -> DataEntry:
    if self.output_field not in data.keys():
        data[self.output_field] = self.value
    return data
def transform(self, data: DataEntry) -> DataEntry:
    for k in self.field_names:
        if k in data.keys():
            del data[k]
    return data
def transform(self, data: DataEntry) -> DataEntry:
    return self.func(data.copy())
def flatmap_transform(
    self, data: DataEntry, is_train: bool
) -> Iterator[DataEntry]:
    assert data[self.start_field].freq == data[self.end_field].freq

    total_interval_length = (
        data[self.end_field] - data[self.start_field]
    ) / data[self.start_field].freq.delta

    # sample forecast start times in continuous time
    if is_train:
        if total_interval_length < (
            self.future_interval_length + self.past_interval_length
        ):
            sampling_times: np.ndarray = np.array([])
        else:
            sampling_times = self.train_sampler(
                self.past_interval_length,
                total_interval_length - self.future_interval_length,
            )
    else:
        sampling_times = np.array([total_interval_length])

    ia_times = data[self.target_field][0, :]
    marks = data[self.target_field][1:, :]

    ts = np.cumsum(ia_times)
    assert ts[-1] < total_interval_length, (
        "Target interarrival times provided are inconsistent with "
        "start and end timestamps."
    )

    # select field names that will be included in outputs
    keep_cols = {
        k: v
        for k, v in data.items()
        if k not in [self.target_field, self.start_field, self.end_field]
    }

    for future_start in sampling_times:
        r: DataEntry = dict()

        past_start = future_start - self.past_interval_length
        future_end = future_start + self.future_interval_length

        assert past_start >= 0

        past_mask = self._mask_sorted(ts, past_start, future_start)

        past_ia_times = np.diff(np.r_[0, ts[past_mask] - past_start])[
            np.newaxis
        ]

        r[f"past_{self.target_field}"] = np.concatenate(
            [past_ia_times, marks[:, past_mask]], axis=0
        ).transpose()

        r["past_valid_length"] = np.array([len(past_mask)])
        r[self.forecast_start_field] = (
            data[self.start_field]
            + data[self.start_field].freq.delta * future_start
        )

        if is_train:
            # include the future only if is_train
            assert future_end <= total_interval_length

            future_mask = self._mask_sorted(ts, future_start, future_end)

            future_ia_times = np.diff(
                np.r_[0, ts[future_mask] - future_start]
            )[np.newaxis]

            r[f"future_{self.target_field}"] = np.concatenate(
                [future_ia_times, marks[:, future_mask]], axis=0
            ).transpose()

            r["future_valid_length"] = np.array([len(future_mask)])

        # include other fields
        r.update(keep_cols.copy())

        yield r
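# `_mask_sorted` is not shown in this section; the sketch below is an
# assumption about its behavior (indices of sorted arrival times inside a
# half-open interval [a, b)), illustrated with np.searchsorted:
def _demo_mask_sorted(ts, a, b):
    # ts is sorted, so two binary searches delimit the window
    lo = np.searchsorted(ts, a, side="left")
    hi = np.searchsorted(ts, b, side="left")
    return np.arange(lo, hi)

# e.g. _demo_mask_sorted(np.array([0.5, 1.2, 3.0, 4.7]), 1.0, 4.0)
# -> array([1, 2]), so ts[mask] selects the arrivals at 1.2 and 3.0.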
def flatmap_transform(
    self, data: DataEntry, is_train: bool
) -> Iterator[DataEntry]:
    pl = self.future_length
    slice_cols = self.ts_fields + [self.target_field]
    target = data[self.target_field]

    len_target = target.shape[-1]

    if is_train:
        if len_target < self.future_length:
            # We currently cannot handle time series that are shorter than
            # the prediction length during training, so we just skip these.
            # If we want to include them we would need to pad and to mask
            # the loss.
            sampling_indices: List[int] = []
        else:
            if self.pick_incomplete:
                sampling_indices = self.train_sampler(
                    target, 0, len_target - self.future_length
                )
            else:
                sampling_indices = self.train_sampler(
                    target,
                    self.past_length,
                    len_target - self.future_length,
                )
    else:
        sampling_indices = [len_target]

    for i in sampling_indices:
        pad_length = max(self.past_length - i, 0)
        if not self.pick_incomplete:
            assert pad_length == 0
        d = data.copy()

        for ts_field in slice_cols:
            if i > self.past_length:
                # truncate to past_length
                past_piece = d[ts_field][..., i - self.past_length : i]
            elif i < self.past_length:
                pad_block = np.zeros(
                    d[ts_field].shape[:-1] + (pad_length,),
                    dtype=d[ts_field].dtype,
                )
                past_piece = np.concatenate(
                    [pad_block, d[ts_field][..., :i]], axis=-1
                )
            else:
                past_piece = d[ts_field][..., :i]

            d[self._past(ts_field)] = past_piece
            d[self._future(ts_field)] = d[ts_field][..., i : i + pl]
            del d[ts_field]

        pad_indicator = np.zeros(self.past_length)
        if pad_length > 0:
            pad_indicator[:pad_length] = 1

        if self.output_NTC:
            for ts_field in slice_cols:
                d[self._past(ts_field)] = d[
                    self._past(ts_field)
                ].transpose()
                d[self._future(ts_field)] = d[
                    self._future(ts_field)
                ].transpose()

        d[self._past(self.is_pad_field)] = pad_indicator
        d[self.forecast_start_field] = shift_timestamp(
            d[self.start_field], i
        )

        yield d
def flatmap_transform(
    self, data: DataEntry, is_train: bool
) -> Iterator[DataEntry]:
    pl = self.future_length
    lt = self.lead_time
    slice_cols = self.ts_fields + [self.target_field]
    target = data[self.target_field]

    len_target = target.shape[-1]

    minimum_length = (
        self.future_length
        if self.pick_incomplete
        else self.past_length + self.future_length
    ) + self.lead_time

    if is_train:
        sampling_bounds = (
            (
                0,
                len_target - self.future_length - self.lead_time,
            )  # TODO: create parameter lower sampling bound for NBEATS
            if self.pick_incomplete
            else (
                self.past_length,
                len_target - self.future_length - self.lead_time,
            )
        )

        # We currently cannot handle time series that are
        # too short during training, so we just skip these.
        # If we want to include them we would need to pad and to
        # mask the loss.
        sampled_indices = (
            np.array([], dtype=int)
            if len_target < minimum_length
            else self.train_sampler(target, *sampling_bounds)
        )
    else:
        assert self.pick_incomplete or len_target >= self.past_length
        sampled_indices = np.array([len_target], dtype=int)

    for i in sampled_indices:
        pad_length = max(self.past_length - i, 0)
        if not self.pick_incomplete:
            assert (
                pad_length == 0
            ), f"pad_length should be zero, got {pad_length}"
        d = data.copy()

        for ts_field in slice_cols:
            if i > self.past_length:
                # truncate to past_length
                past_piece = d[ts_field][..., i - self.past_length : i]
            elif i < self.past_length:
                pad_block = (
                    np.ones(
                        d[ts_field].shape[:-1] + (pad_length,),
                        dtype=d[ts_field].dtype,
                    )
                    * self.dummy_value
                )
                past_piece = np.concatenate(
                    [pad_block, d[ts_field][..., :i]], axis=-1
                )
            else:
                past_piece = d[ts_field][..., :i]

            d[self._past(ts_field)] = past_piece
            d[self._future(ts_field)] = d[ts_field][
                ..., i + lt : i + lt + pl
            ]
            del d[ts_field]

        pad_indicator = np.zeros(self.past_length)
        if pad_length > 0:
            pad_indicator[:pad_length] = 1

        if self.output_NTC:
            for ts_field in slice_cols:
                d[self._past(ts_field)] = d[
                    self._past(ts_field)
                ].transpose()
                d[self._future(ts_field)] = d[
                    self._future(ts_field)
                ].transpose()

        d[self._past(self.is_pad_field)] = pad_indicator
        d[self.forecast_start_field] = shift_timestamp(
            d[self.start_field], i + lt
        )

        yield d
def flatmap_transform(
    self, data: DataEntry, is_train: bool
) -> Iterator[DataEntry]:
    target = data[self.target_field]

    if is_train:
        # We currently cannot handle time series that are shorter than the
        # prediction length during training, so we just skip these.
        # If we want to include them we would need to pad and to mask
        # the loss.
        if len(target) < self.dec_len:
            return

        sampling_indices = self.train_sampler(
            target, 0, len(target) - self.dec_len
        )
    else:
        sampling_indices = [len(target)]

    # Loop over all encoder and decoder fields, even those that are
    # disabled, to set dummy zero fields in those cases.
    ts_fields_counter = Counter(
        set(self.encoder_series_fields + self.decoder_series_fields)
    )

    for sampling_idx in sampling_indices:
        # irrelevant data should have been removed by now in the
        # transformation chain, so copying everything is ok
        out = data.copy()

        enc_len_diff = sampling_idx - self.enc_len
        dec_len_diff = sampling_idx - self.num_forking

        # ensure start indices are not negative
        start_idx_enc = max(0, enc_len_diff)
        start_idx_dec = max(0, dec_len_diff)

        # Define pad lengths for shorter time series of variable length
        # being updated in place
        pad_length_enc = max(0, -enc_len_diff)
        pad_length_dec = max(0, -dec_len_diff)

        for ts_field in list(ts_fields_counter.keys()):
            # target is 1d, this ensures ts is always 2d
            ts = np.atleast_2d(out[ts_field]).T
            ts_len = ts.shape[1]

            if ts_fields_counter[ts_field] == 1:
                del out[ts_field]
            else:
                ts_fields_counter[ts_field] -= 1

            out[self._past(ts_field)] = np.zeros(
                shape=(self.enc_len, ts_len), dtype=ts.dtype
            )
            if ts_field not in self.encoder_disabled_fields:
                out[self._past(ts_field)][pad_length_enc:] = ts[
                    start_idx_enc:sampling_idx, :
                ]

            # exclude some fields at prediction time
            if (
                not is_train
                and ts_field in self.prediction_time_decoder_exclude
            ):
                continue

            if ts_field in self.decoder_series_fields:
                out[self._future(ts_field)] = np.zeros(
                    shape=(self.num_forking, self.dec_len, ts_len),
                    dtype=ts.dtype,
                )
                if ts_field not in self.decoder_disabled_fields:
                    # This is where some of the forking magic happens:
                    # For each of the num_forking time-steps at which the
                    # decoder is applied we slice the corresponding inputs
                    # called decoder_fields to the appropriate dec_len.
                    decoder_fields = ts[
                        start_idx_dec + 1 : sampling_idx + 1, :
                    ]
                    # For default row-major arrays,
                    # strides = (dtype * n_cols, dtype). Since this array is
                    # transposed, it is stored in column-major (Fortran)
                    # ordering with strides = (dtype, dtype * n_rows).
                    stride = decoder_fields.strides
                    out[self._future(ts_field)][
                        pad_length_dec:
                    ] = as_strided(
                        decoder_fields,
                        shape=(
                            self.num_forking - pad_length_dec,
                            self.dec_len,
                            ts_len,
                        ),
                        # Strides for the 2D array expanded to a 3D array of
                        # shape (dim1, dim2, dim3) = (1, n_rows, n_cols).
                        # For transposed data,
                        # strides = (dtype, dtype * dim1, dtype * dim1 * dim2)
                        #         = (dtype, dtype, dtype * n_rows).
                        strides=stride[0:1] + stride,
                    )

                # edge case for prediction_length = 1
                if out[self._future(ts_field)].shape[-1] == 1:
                    out[self._future(ts_field)] = np.squeeze(
                        out[self._future(ts_field)], axis=-1
                    )

        # So far encoder pad indicator not in use -
        # marks that left padding for the encoder will occur on shorter
        # time series
        pad_indicator = np.zeros(self.enc_len)
        pad_indicator[:pad_length_enc] = True
        out[self._past(self.is_pad_out)] = pad_indicator

        # So far forecast_start not in use
        out[FieldName.FORECAST_START] = shift_timestamp(
            out[self.start_in], sampling_idx
        )

        yield out
def transform(self, data: DataEntry) -> DataEntry:
    for k in self.field_names:
        data.pop(k, None)
    return data
def flatmap_transform(
    self, data: DataEntry, is_train: bool
) -> Iterator[DataEntry]:
    dec_len = self.dec_len
    slice_cols = self.ts_fields + [self.target_in]
    target = data[self.target_in]

    if is_train:
        if len(target) < self.dec_len:
            # We currently cannot handle time series that are shorter than
            # the prediction length during training, so we just skip these.
            # If we want to include them we would need to pad and to mask
            # the loss.
            sampling_indices: List[int] = []
        else:
            sampling_indices = self.train_sampler(
                target, 0, len(target) - self.dec_len
            )
    else:
        sampling_indices = [len(target)]

    for i in sampling_indices:
        pad_length = max(self.enc_len - i, 0)

        d = data.copy()

        for ts_field in slice_cols:
            if i > self.enc_len:
                # truncate to past_length
                past_piece = d[ts_field][..., i - self.enc_len : i]
            elif i < self.enc_len:
                pad_block = np.zeros(
                    d[ts_field].shape[:-1] + (pad_length,)
                )
                past_piece = np.concatenate(
                    [pad_block, d[ts_field][..., :i]], axis=-1
                )
            else:
                past_piece = d[ts_field][..., :i]

            d[self._past(ts_field)] = np.expand_dims(past_piece, -1)

            if is_train and ts_field is self.target_in:
                forking_dec_field = np.zeros(
                    shape=(self.enc_len, self.dec_len)
                )

                for j in range(self.enc_len):
                    start_idx = i - self.enc_len + j + 1
                    if start_idx >= 0:
                        forking_dec_field[j, :] = d[ts_field][
                            ..., start_idx : start_idx + dec_len
                        ]

                d[self._future(ts_field)] = forking_dec_field

            del d[ts_field]

        pad_indicator = np.zeros(self.enc_len)
        if pad_length > 0:
            pad_indicator[:pad_length] = 1
        d[self._past(self.is_pad_out)] = pad_indicator

        d[self.forecast_start_out] = shift_timestamp(d[self.start_in], i)

        yield d