Example #1
    def from_data_entry(cls,
                        item: DataEntry,
                        freq: Optional[str] = None) -> "TimeSeriesSlice":
        if freq is None:
            freq = item["start"].freq

        index = pd.period_range(start=item["start"],
                                freq=freq,
                                periods=len(item["target"]))

        feat_dynamic_cat = [
            pd.Series(cat, index=index)
            for cat in list(item.get("feat_dynamic_cat", []))
        ]

        feat_dynamic_real = [
            pd.Series(real, index=index)
            for real in list(item.get("feat_dynamic_real", []))
        ]

        feat_static_cat = list(item.get("feat_static_cat", []))

        feat_static_real = list(item.get("feat_static_real", []))

        return TimeSeriesSlice(
            target=pd.Series(item["target"], index=index),
            item=item[FieldName.ITEM_ID],
            feat_static_cat=feat_static_cat,
            feat_static_real=feat_static_real,
            feat_dynamic_cat=feat_dynamic_cat,
            feat_dynamic_real=feat_dynamic_real,
        )
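A minimal usage sketch, assuming the gluonts-style imports above are in scope and that FieldName.ITEM_ID resolves to "item_id" (the entry dict below is illustrative):

import numpy as np
import pandas as pd

entry = {
    "start": pd.Period("2021-01-01", freq="D"),
    "target": np.arange(5, dtype=float),
    "item_id": "series-0",  # assumed key behind FieldName.ITEM_ID
}
ts_slice = TimeSeriesSlice.from_data_entry(entry)  # freq inferred from "start"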
Example #2
def _inject_nans_in_target(data_entry: DataEntry, p: float) -> DataEntry:
    """
    Returns a copy of the given `data_entry` where approximately a fraction
    `p` of the target values are NaN.

    Parameters
    ----------
    data_entry
        The data entry to use as source.
    p
        The fraction of target positions to set to NaN (between 0 and 1).

    Returns
    -------
        A copy of `data_entry` with the target field modified.
    """
    nan_positions = np.sort(a=np.random.choice(
        a=np.arange(data_entry["target"].size, dtype=int),
        size=int(p * data_entry["target"].size),
        replace=False,
    ))

    nan_target = np.copy(data_entry["target"])
    nan_target[nan_positions] = np.nan

    # If p < 1.0, keep the value at the last position unchanged; otherwise,
    # for large p we might end up with NaNs in the last context_length
    # positions
    if p < 1.0:
        nan_target[-1] = data_entry["target"][-1]

    return {
        key: (nan_target if key == "target" else val)
        for key, val in data_entry.items()
    }
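A quick self-check of the NaN-injection behavior (a sketch; assumes numpy is imported as np):

import numpy as np

entry = {"target": np.arange(100, dtype=float), "start": "2021-01-01"}
noisy = _inject_nans_in_target(entry, p=0.2)
assert np.isnan(noisy["target"]).sum() <= 20  # roughly a fraction p of the positions
assert not np.isnan(noisy["target"][-1])      # last value restored for p < 1
assert not np.isnan(entry["target"]).any()    # the source entry is untouched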
Example #3
    def flatmap_transform(self, data: DataEntry,
                          is_train: bool) -> Iterator[DataEntry]:
        ts_fields = self.dynamic_feature_fields + [self.target_field]
        ts_target = data[self.target_field]

        len_target = ts_target.shape[-1]

        if is_train:
            if len_target < self.instance_length:
                sampling_indices = (
                    # Returning [] for all time series would make this loop forever!
                    [len_target] if self.allow_target_padding else [])
            else:
                sampling_indices = self.instance_sampler(
                    ts_target, self.instance_length, len_target)
        else:
            sampling_indices = [len_target]

        for i in sampling_indices:
            d = data.copy()

            pad_length = max(self.instance_length - i, 0)

            # update start field
            d[self.start_field] = shift_timestamp(data[self.start_field],
                                                  i - self.instance_length)

            # set is_pad field
            is_pad = np.zeros(self.instance_length)
            if pad_length > 0:
                is_pad[:pad_length] = 1
            d[self.is_pad_field] = is_pad

            # update time series fields
            for ts_field in ts_fields:
                full_ts = data[ts_field]
                if pad_length > 0:
                    pad_pre = self.pad_value * np.ones(
                        shape=full_ts.shape[:-1] + (pad_length, ))
                    past_ts = np.concatenate([pad_pre, full_ts[..., :i]],
                                             axis=-1)
                else:
                    past_ts = full_ts[..., (i - self.instance_length):i]

                past_ts = past_ts.transpose() if self.output_NTC else past_ts
                d[self._past(ts_field)] = past_ts

                if self.use_prediction_features and not is_train:
                    if ts_field != self.target_field:
                        future_ts = full_ts[..., i:i + self.prediction_length]
                        future_ts = (future_ts.transpose()
                                     if self.output_NTC else future_ts)
                        d[self._future(ts_field)] = future_ts

                del d[ts_field]

            d[self.forecast_start_field] = shift_timestamp(
                d[self.start_field], self.instance_length)

            yield d
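`shift_timestamp(ts, offset)` above moves a timestamp by `offset` periods of its frequency. Conceptually it behaves like this sketch (a simplification; the real gluonts helper also handles frequency edge cases):

import pandas as pd

def shift_timestamp_sketch(ts: pd.Timestamp, offset: int, freq: str) -> pd.Timestamp:
    # Shift `ts` by `offset` periods of `freq`; offset may be negative.
    return (ts.to_period(freq) + offset).to_timestamp()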
Example #4
    def _make_prophet_data_entry(self, entry: DataEntry) -> ProphetDataEntry:
        """
        Construct a :class:`ProphetDataEntry` from a regular
        :class:`DataEntry`.
        """

        train_length = len(entry["target"])
        prediction_length = self.prediction_length
        start = entry["start"]
        target = entry["target"]
        feat_dynamic_real = entry.get("feat_dynamic_real", [])

        # make sure each dynamic feature has the desired length
        for i, feature in enumerate(feat_dynamic_real):
            assert len(feature) == train_length + prediction_length, (
                f"Length mismatch for dynamic real-valued feature #{i}: "
                f"expected {train_length + prediction_length}, "
                f"got {len(feature)}")

        return ProphetDataEntry(
            train_length=train_length,
            prediction_length=prediction_length,
            start=start,
            target=target,
            feat_dynamic_real=feat_dynamic_real,
        )
Example #5
 def predict_item(self, item: DataEntry) -> SampleForecast:
     return SampleForecast(
         samples=self.samples,
         start_date=item["start"],
         freq=self.freq,
         item_id=item.get(FieldName.ITEM_ID),
     )
Example #6
 def map_transform(self, data: DataEntry, is_train: bool) -> DataEntry:
     start = data[self.start_field]
     length = target_transformation_length(data[self.target_field],
                                           self.pred_length,
                                           is_train=is_train)
     self._update_cache(start, length)
     i0 = self._date_index[start]
     date_idx = self._date_index.iloc[i0:i0 + length].index
      # When is_train is False, date_idx has length target_len + prediction_len,
      # which is useful for time-feature generation, but here we only need the
      # target length
     date_idx = date_idx[:len(data[self.target_field])]
     feature = pd.Series(np.ones(len(date_idx)) * np.nan, index=date_idx)
     mask = data[self.target_field] > 0
     feature.loc[mask] = feature.loc[mask].index
      # Fill a NaN in the first row with the corresponding date.
      # Assumption: if the frame starts with zero demand, the earliest date in
      # the frame is taken as the start
     if len(feature) > 0:
         if pd.isnull(feature[0]):
             feature[0] = feature.index[0]
     feature = feature.ffill().to_frame()
     feature["diff"] = feature.index.to_period(
         feature.index.freqstr).astype(int) - pd.DatetimeIndex(
             feature.iloc[:, 0]).to_period(
                 feature.index.freqstr).astype(int)
     feature["diff"] = feature["diff"].shift(1).round() + 1
     feature["diff"] = feature["diff"].fillna(method="bfill")
     feature = feature["diff"].values
     if self.output_field in data.keys():
         data[self.output_field] = np.vstack(
             [data[self.output_field], feature])
     else:
         data[self.output_field] = feature
     return data
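A worked trace of the feature this produces (daily data, illustrative): with target [0, 3, 0, 0, 2] on dates D1..D5, the nonzero positions give [NaN, D2, NaN, NaN, D5]; the leading NaN falls back to D1, and forward-filling yields [D1, D2, D2, D2, D5]. The index-minus-date differences are [0, 0, 1, 2, 0], and after the shift by one, the +1, and the backfill, the final feature is [1, 1, 1, 2, 3], i.e. the number of periods since the previous nonzero observation.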
Example #7
    def flatmap_transform(
        self, data: DataEntry, is_train: bool
    ) -> Iterator[DataEntry]:
        ts_fields = self.dynamic_feature_fields + [self.target_field]
        ts_target = data[self.target_field]

        sampling_indices = self.instance_sampler(ts_target)

        for i in sampling_indices:
            d = data.copy()

            pad_length = max(self.instance_length - i, 0)

            # update start field
            d[self.start_field] = (
                data[self.start_field] + i - self.instance_length
            )

            # set is_pad field
            is_pad = np.zeros(self.instance_length, dtype=ts_target.dtype)
            if pad_length > 0:
                is_pad[:pad_length] = 1
            d[self.is_pad_field] = is_pad

            # update time series fields
            for ts_field in ts_fields:
                full_ts = data[ts_field]
                if pad_length > 0:
                    pad_pre = self.pad_value * np.ones(
                        shape=full_ts.shape[:-1] + (pad_length,)
                    )
                    past_ts = np.concatenate(
                        [pad_pre, full_ts[..., :i]], axis=-1
                    )
                else:
                    past_ts = full_ts[..., (i - self.instance_length) : i]

                past_ts = past_ts.transpose() if self.output_NTC else past_ts
                d[self._past(ts_field)] = past_ts

                if self.use_prediction_features:
                    if ts_field != self.target_field:
                        future_ts = full_ts[
                            ..., i : i + self.prediction_length
                        ]
                        future_ts = (
                            future_ts.transpose()
                            if self.output_NTC
                            else future_ts
                        )
                        d[self._future(ts_field)] = future_ts

                del d[ts_field]

            d[self.forecast_start_field] = (
                d[self.start_field] + self.instance_length
            )

            yield d
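A standalone trace of the left-padding logic above (illustrative numbers: instance_length = 5, i = 3):

import numpy as np

instance_length, i, pad_value = 5, 3, 0.0
full_ts = np.arange(10.0)
pad_length = max(instance_length - i, 0)            # 2
pad_pre = pad_value * np.ones(pad_length)
past_ts = np.concatenate([pad_pre, full_ts[:i]])    # [0. 0. 0. 1. 2.]
is_pad = np.zeros(instance_length)
is_pad[:pad_length] = 1                             # [1. 1. 0. 0. 0.]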
Example #8
 def predict_item(self, item: DataEntry) -> SampleForecast:
     samples_shape = self.num_samples, self.prediction_length
     samples = np.full(samples_shape, self.value)
     return SampleForecast(
         samples=samples,
         start_date=forecast_start(item),
         item_id=item.get("id"),
     )
Example #9
    def flatmap_transform(
        self, data: DataEntry, is_train: bool
    ) -> Iterator[DataEntry]:
        pl = self.future_length
        lt = self.lead_time
        target = data[self.target_field]

        sampled_indices = self.instance_sampler(target)

        slice_cols = (
            self.ts_fields
            + self.past_ts_fields
            + [self.target_field, self.observed_value_field]
        )
        for i in sampled_indices:
            pad_length = max(self.past_length - i, 0)
            d = data.copy()

            for field in slice_cols:
                if i >= self.past_length:
                    past_piece = d[field][..., i - self.past_length : i]
                else:
                    pad_block = np.full(
                        shape=d[field].shape[:-1] + (pad_length,),
                        fill_value=self.dummy_value,
                        dtype=d[field].dtype,
                    )
                    past_piece = np.concatenate(
                        [pad_block, d[field][..., :i]], axis=-1
                    )
                future_piece = d[field][..., (i + lt) : (i + lt + pl)]
                if field in self.ts_fields:
                    piece = np.concatenate([past_piece, future_piece], axis=-1)
                    if self.output_NTC:
                        piece = piece.transpose()
                    d[field] = piece
                else:
                    if self.output_NTC:
                        past_piece = past_piece.transpose()
                        future_piece = future_piece.transpose()
                    if field not in self.past_ts_fields:
                        d[self._past(field)] = past_piece
                        d[self._future(field)] = future_piece
                        del d[field]
                    else:
                        d[field] = past_piece
            pad_indicator = np.zeros(self.past_length)
            if pad_length > 0:
                pad_indicator[:pad_length] = 1
            d[self._past(self.is_pad_field)] = pad_indicator
            d[self.forecast_start_field] = shift_timestamp(
                d[self.start_field], i + lt
            )
            yield d
Example #10
    def predict_item(self, item: DataEntry) -> Forecast:
        prediction = item["target"][-self.prediction_length:]
        samples = np.broadcast_to(
            array=np.expand_dims(prediction, 0),
            shape=(self.num_samples, self.prediction_length),
        )

        return SampleForecast(
            samples=samples,
            start_date=forecast_start(item),
            item_id=item.get(FieldName.ITEM_ID),
        )
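np.broadcast_to returns a read-only view whose rows are all the same prediction; a quick standalone check (illustrative sizes):

import numpy as np

target = np.arange(24.0)
prediction = target[-6:]                              # prediction_length = 6
samples = np.broadcast_to(prediction[None, :], (100, 6))
assert samples.shape == (100, 6)                      # (num_samples, prediction_length)
assert (samples == prediction).all()                  # every sample row is identical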
Example #11
    def flatmap_transform(
        self, data: DataEntry, is_train: bool
    ) -> Iterator[DataEntry]:
        pl = self.future_length
        lt = self.lead_time
        slice_cols = self.ts_fields + [self.target_field]
        target = data[self.target_field]

        sampled_indices = self.instance_sampler(target)

        for i in sampled_indices:
            pad_length = max(self.past_length - i, 0)
            d = data.copy()
            for ts_field in slice_cols:
                if i > self.past_length:
                    # truncate to past_length
                    past_piece = d[ts_field][..., i - self.past_length : i]
                elif i < self.past_length:
                    pad_block = (
                        np.ones(
                            d[ts_field].shape[:-1] + (pad_length,),
                            dtype=d[ts_field].dtype,
                        )
                        * self.dummy_value
                    )
                    past_piece = np.concatenate(
                        [pad_block, d[ts_field][..., :i]], axis=-1
                    )
                else:
                    past_piece = d[ts_field][..., :i]
                d[self._past(ts_field)] = past_piece
                d[self._future(ts_field)] = d[ts_field][
                    ..., i + lt : i + lt + pl
                ]
                del d[ts_field]
            pad_indicator = np.zeros(self.past_length, dtype=target.dtype)
            if pad_length > 0:
                pad_indicator[:pad_length] = 1

            if self.output_NTC:
                for ts_field in slice_cols:
                    d[self._past(ts_field)] = d[
                        self._past(ts_field)
                    ].transpose()
                    d[self._future(ts_field)] = d[
                        self._future(ts_field)
                    ].transpose()

            d[self._past(self.is_pad_field)] = pad_indicator
            d[self.forecast_start_field] = d[self.start_field] + i + lt
            yield d
Example #12
File: mean.py Project: RomaKoks/gluon-ts
    def predict_item(self, item: DataEntry) -> SampleForecast:
        if self.context_length is not None:
            target = item["target"][-self.context_length:]
        else:
            target = item["target"]

        mean = np.nanmean(target)
        std = np.nanstd(target)
        normal = np.random.standard_normal(self.shape)

        return SampleForecast(
            samples=std * normal + mean,
            start_date=forecast_start(item),
            item_id=item.get(FieldName.ITEM_ID),
        )
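The sampling step in isolation (illustrative sizes; `self.shape` above is presumably (num_samples, prediction_length)):

import numpy as np

target = np.array([1.0, np.nan, 3.0, 5.0])
mean, std = np.nanmean(target), np.nanstd(target)   # NaNs are ignored
samples = std * np.random.standard_normal((100, 6)) + mean
assert samples.shape == (100, 6)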
Example #13
File: mean.py Project: RomaKoks/gluon-ts
    def predict_item(self, item: DataEntry) -> SampleForecast:
        target = item["target"].tolist()

        for _ in range(self.prediction_length):
            if self.context_length is not None:
                window = target[-self.context_length:]
            else:
                window = target

            target.append(np.nanmean(window))

        return SampleForecast(
            samples=np.array([target[-self.prediction_length:]]),
            start_date=forecast_start(item),
            item_id=item.get(FieldName.ITEM_ID),
        )
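A short trace of the rolling-mean recursion (context_length = 2, prediction_length = 2):

import numpy as np

target = [1.0, 2.0, 3.0]
for _ in range(2):                           # prediction_length steps
    target.append(np.nanmean(target[-2:]))   # context_length window
# target -> [1.0, 2.0, 3.0, 2.5, 2.75]; the forecast is the last two values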
Example #14
    def predict_item(self, item: DataEntry) -> Forecast:
        past_ts_data = item["target"]
        item_id = item.get("item_id", None)
        forecast_start_time = forecast_start(item)

        assert (
            len(past_ts_data) >= 1
        ), "all time series should have at least one data point"

        prediction = naive_2(past_ts_data, self.prediction_length, self.freq)

        samples = np.array([prediction])

        return SampleForecast(
            samples=samples,
            start_date=forecast_start_time,
            item_id=item_id,
        )
Example #15
    def predict_item(self, item: DataEntry) -> Forecast:
        target = np.asarray(item["target"], np.float32)
        len_ts = len(target)
        forecast_start_time = forecast_start(item)

        assert (len_ts >=
                1), "all time series should have at least one data point"

        if len_ts >= self.season_length:
            indices = [
                len_ts - self.season_length + k % self.season_length
                for k in range(self.prediction_length)
            ]
            samples = target[indices].reshape((1, self.prediction_length))
        else:
            samples = np.full(shape=(1, self.prediction_length),
                              fill_value=target.mean())

        return SampleForecast(
            samples=samples,
            start_date=forecast_start_time,
            item_id=item.get("item_id", None),
        )
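The index arithmetic of the seasonal branch, worked through with small numbers (len_ts = 10, season_length = 7, prediction_length = 3):

len_ts, season_length, prediction_length = 10, 7, 3
indices = [len_ts - season_length + k % season_length
           for k in range(prediction_length)]
# repeats the last observed season; k % season_length wraps for longer horizons
assert indices == [3, 4, 5]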
Example #16
    def flatmap_transform(self, data: DataEntry,
                          is_train: bool) -> Iterator[DataEntry]:
        pl = self.future_length
        lt = self.lead_time
        target = data[self.target_field]
        len_target = target.shape[-1]

        minimum_length = (
            self.future_length
            if self.pick_incomplete
            else self.past_length + self.future_length
        ) + self.lead_time

        if is_train:
            sampling_bounds = (
                (0, len_target - self.future_length - self.lead_time)
                if self.pick_incomplete
                else (
                    self.past_length,
                    len_target - self.future_length - self.lead_time,
                )
            )

            # We currently cannot handle time series that are
            # too short during training, so we just skip these.
            # If we want to include them we would need to pad and to
            # mask the loss.
            sampled_indices = (np.array([], dtype=int)
                               if len_target < minimum_length else
                               self.train_sampler(target, *sampling_bounds))
        else:
            assert self.pick_incomplete or len_target >= self.past_length
            sampled_indices = np.array([len_target], dtype=int)

        slice_cols = (self.ts_fields + self.past_ts_fields +
                      [self.target_field, self.observed_value_field])
        for i in sampled_indices:
            pad_length = max(self.past_length - i, 0)
            if not self.pick_incomplete and pad_length > 0:
                raise RuntimeError(
                    f"pad_length should be zero, got {pad_length}")
            d = data.copy()

            for field in slice_cols:
                if i >= self.past_length:
                    past_piece = d[field][..., i - self.past_length:i]
                else:
                    pad_block = (np.ones(
                        d[field].shape[:-1] + (pad_length, ),
                        dtype=d[field].dtype,
                    ) * self.dummy_value)
                    past_piece = np.concatenate([pad_block, d[field][..., :i]],
                                                axis=-1)
                future_piece = d[field][..., (i + lt):(i + lt + pl)]
                if field in self.ts_fields:
                    piece = np.concatenate([past_piece, future_piece], axis=-1)
                    if self.output_NTC:
                        piece = piece.transpose()
                    d[field] = piece
                else:
                    if self.output_NTC:
                        past_piece = past_piece.transpose()
                        future_piece = future_piece.transpose()
                    d[self._past(field)] = past_piece
                    if field not in self.past_ts_fields:
                        d[self._future(field)] = future_piece
                    del d[field]

            pad_indicator = np.zeros(self.past_length)
            if pad_length > 0:
                pad_indicator[:pad_length] = 1
            d[self._past(self.is_pad_field)] = pad_indicator
            d[self.forecast_start_field] = shift_timestamp(
                d[self.start_field], i + lt)
            yield d
Example #17
    def flatmap_transform(
        self, data: DataEntry, is_train: bool
    ) -> Iterator[DataEntry]:
        target = data[self.target_field]

        if is_train:
            # We currently cannot handle time series that are shorter than the
            # prediction length during training, so we just skip these.
            # If we want to include them we would need to pad and to mask
            # the loss.
            if len(target) < self.dec_len:
                return

            sampling_indices = self.train_sampler(
                target, 0, len(target) - self.dec_len
            )
        else:
            sampling_indices = [len(target)]

        # Loop over all encoder and decoder fields, including disabled ones,
        # so that disabled fields are still emitted as dummy zero fields
        ts_fields_counter = Counter(
            set(self.encoder_series_fields + self.decoder_series_fields)
        )

        for sampling_idx in sampling_indices:
            # ensure start index is not negative
            start_idx = max(0, sampling_idx - self.enc_len)

            # irrelevant data should have been removed by now in the
            # transformation chain, so copying everything is ok
            out = data.copy()

            for ts_field in list(ts_fields_counter.keys()):

                # target is 1d, this ensures ts is always 2d
                ts = np.atleast_2d(out[ts_field]).T

                if ts_fields_counter[ts_field] == 1:
                    del out[ts_field]
                else:
                    ts_fields_counter[ts_field] -= 1

                # take enc_len values from ts, depending on sampling_idx
                ts_slice = ts[start_idx:sampling_idx, :]

                ts_len = ts.shape[1]
                past_piece = np.zeros(
                    shape=(self.enc_len, ts_len), dtype=ts.dtype
                )

                if ts_field not in self.encoder_disabled_fields:
                    # if we have fewer than enc_len values, pad on the left with 0
                    past_piece = pad_to_size(ts_slice, self.enc_len)
                out[self._past(ts_field)] = past_piece

                # exclude some fields at prediction time
                if (
                    not is_train
                    and ts_field in self.prediction_time_decoder_exclude
                ):
                    continue

                # This is where some of the forking magic happens:
                # for each of the num_forking time steps at which the decoder is
                # applied, we slice the corresponding inputs (the decoder_fields)
                # to the appropriate dec_len
                if ts_field in self.decoder_series_fields:

                    forking_dec_field = np.zeros(
                        shape=(self.num_forking, self.dec_len, ts_len),
                        dtype=ts.dtype,
                    )
                    # in case it's not disabled we copy the actual values
                    if ts_field not in self.decoder_disabled_fields:
                        # If we sample an index too close to the beginning of the
                        # time series, we would run out of bounds (i.e. try to copy
                        # nonexistent time series data) when preparing the decoder
                        # input. Instead of copying the partially available data and
                        # padding it with zeros, we simply skip copying the partial
                        # data: since copying would overwrite the zero-initialized
                        # 3D array, the affected 2D decoder inputs (the first `skip`
                        # entries of the 3D array) simply remain all zeros.
                        skip = max(0, self.num_forking - sampling_idx)
                        start_idx = max(0, sampling_idx - self.num_forking)
                        # For standard row-major arrays, strides = (dtype*n_cols, dtype).
                        # Since this array is transposed, it is stored in column-major
                        # (Fortran) ordering with strides = (dtype, dtype*n_rows).
                        stride = ts.strides
                        forking_dec_field[skip:, :, :] = as_strided(
                            ts[
                                start_idx
                                + 1 : start_idx
                                + 1
                                + self.num_forking
                                - skip,
                                :,
                            ],
                            shape=(
                                self.num_forking - skip,
                                self.dec_len,
                                ts_len,
                            ),
                            # strides for 2D array expanded to 3D array of shape
                            # (dim1, dim2, dim3) = (1, n_rows, n_cols). Note: since
                            # this array has been transposed, it is stored in
                            # column-major (Fortran) ordering, i.e. for transposed
                            # data of shape (dim1, dim2, dim3), strides =
                            # (dtype, dtype*dim1, dtype*dim1*dim2) = (dtype, dtype, dtype*n_rows).
                            strides=stride[0:1] + stride,
                        )
                    # edge case for prediction_length = 1
                    if forking_dec_field.shape[-1] == 1:
                        out[self._future(ts_field)] = np.squeeze(
                            forking_dec_field, axis=-1
                        )
                    else:
                        out[self._future(ts_field)] = forking_dec_field

            # So far pad indicator not in use
            pad_indicator = np.zeros(self.enc_len)
            pad_length = max(0, self.enc_len - sampling_idx)
            pad_indicator[:pad_length] = True
            out[self._past(self.is_pad_out)] = pad_indicator

            # So far pad forecast_start not in use
            out[FieldName.FORECAST_START] = shift_timestamp(
                out[self.start_in], sampling_idx
            )

            yield out
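The `as_strided` call above materializes overlapping decoder windows without copying. The same windows can be obtained with the safer `numpy.lib.stride_tricks.sliding_window_view`; a simplified 1-D illustration (not the exact 3-D layout used above):

import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

ts = np.arange(8.0)
dec_len = 3
windows = sliding_window_view(ts, dec_len)   # shape (6, 3), zero-copy view
# windows[k] == ts[k : k + dec_len] -- one decoder target slice per fork point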
Example #18
    def flatmap_transform(self, data: DataEntry,
                          is_train: bool) -> Iterator[DataEntry]:
        target = data[self.target_field]

        if is_train:
            # We currently cannot handle time series that are shorter than the
            # prediction length during training, so we just skip these.
            # If we want to include them we would need to pad and to mask
            # the loss.
            if len(target) < self.dec_len:
                return

            sampling_indices = self.train_sampler(target, 0,
                                                  len(target) - self.dec_len)
        else:
            sampling_indices = [len(target)]

        ts_fields_counter = Counter(
            set(self.encoder_series_fields + self.decoder_series_fields))

        for sampling_idx in sampling_indices:
            # ensure start index is not negative
            start_idx = max(0, sampling_idx - self.enc_len)

            # irrelevant data should have been removed by now in the
            # transformation chain, so copying everything is ok
            out = data.copy()

            for ts_field in list(ts_fields_counter.keys()):

                # target is 1d, this ensures ts is always 2d
                ts = np.atleast_2d(out[ts_field])

                if ts_fields_counter[ts_field] == 1:
                    del out[ts_field]
                else:
                    ts_fields_counter[ts_field] -= 1

                # take enc_len values from ts, depending on sampling_idx
                ts_slice = ts[:, start_idx:sampling_idx]

                # if we have fewer than enc_len values, pad on the left with 0
                past_piece = pad_to_size(ts_slice, self.enc_len)

                out[self._past(ts_field)] = past_piece.transpose()

                # exclude some fields at prediction time
                if (not is_train
                        and ts_field in self.prediction_time_decoder_exclude):
                    continue

                # This is where some of the forking magic happens:
                # for each of the enc_len time steps at which the decoder is
                # applied, we slice the corresponding inputs (the decoder_fields)
                # to the appropriate dec_len
                if (ts_field in self.decoder_series_fields +
                        self.decoder_disabled_fields):
                    forking_dec_field = np.zeros(shape=(self.enc_len,
                                                        self.dec_len, len(ts)))

                    # in case it's not disabled we copy the actual values
                    if ts_field not in self.decoder_disabled_fields:
                        skip = max(0, self.enc_len - sampling_idx)
                        # This section takes by far the longest time computationally:
                        # This scales linearly in self.enc_len and linearly in self.dec_len
                        for dec_field, idx in zip(
                                forking_dec_field[skip:],
                                range(start_idx + 1,
                                      start_idx + self.enc_len + 1),
                        ):
                            dec_field[:] = ts[:, idx:idx + self.dec_len].T

                    if forking_dec_field.shape[-1] == 1:
                        out[self._future(ts_field)] = np.squeeze(
                            forking_dec_field, axis=-1)
                    else:
                        out[self._future(ts_field)] = forking_dec_field

            # So far pad indicator not in use
            pad_indicator = np.zeros(self.enc_len)
            pad_length = max(0, self.enc_len - sampling_idx)
            pad_indicator[:pad_length] = True
            out[self._past(self.is_pad_out)] = pad_indicator

            # So far pad forecast_start not in use
            out[FieldName.FORECAST_START] = shift_timestamp(
                out[self.start_in], sampling_idx)

            yield out
Example #19
    def flatmap_transform(self, data: DataEntry,
                          is_train: bool) -> Iterator[DataEntry]:
        target = data[self.target_field]

        sampled_indices = self.instance_sampler(target)

        ts_fields = set(self.encoder_series_fields +
                        self.decoder_series_fields)

        for idx in sampled_indices:
            # irrelevant data should have been removed by now in the
            # transformation chain, so copying everything is ok
            out = data.copy()

            enc_len_diff = idx - self.enc_len
            dec_len_diff = idx - self.num_forking

            # ensure start indices are not negative
            start_idx_enc = max(0, enc_len_diff)
            start_idx_dec = max(0, dec_len_diff)

            # Pad lengths for shorter time series; the zero-initialized
            # past/future blocks below are filled in from these offsets
            pad_length_enc = max(0, -enc_len_diff)
            pad_length_dec = max(0, -dec_len_diff)

            for ts_field in ts_fields:

                # target is 1d, this ensures ts is always 2d
                ts = np.atleast_2d(out[ts_field]).T
                ts_len = ts.shape[1]

                del out[ts_field]

                out[self._past(ts_field)] = np.zeros(shape=(self.enc_len,
                                                            ts_len),
                                                     dtype=ts.dtype)
                if ts_field not in self.encoder_disabled_fields:
                    out[self._past(ts_field)][pad_length_enc:] = ts[
                        start_idx_enc:idx, :]

                if ts_field in self.decoder_series_fields:
                    out[self._future(ts_field)] = np.zeros(
                        shape=(self.num_forking, self.dec_len, ts_len),
                        dtype=ts.dtype,
                    )
                    if ts_field not in self.decoder_disabled_fields:
                        # This is where some of the forking magic happens:
                        # For each of the num_forking time-steps at which the decoder is applied we slice the
                        # corresponding inputs called decoder_fields to the appropriate dec_len
                        decoder_fields = ts[start_idx_dec + 1:idx + 1, :]
                        # For default row-major arrays, strides = (dtype*n_cols, dtype). Since this array is transposed,
                        # it is stored in column-major (Fortran) ordering with strides = (dtype, dtype*n_rows)
                        stride = decoder_fields.strides
                        out[self._future(
                            ts_field
                        )][pad_length_dec:] = as_strided(
                            decoder_fields,
                            shape=(
                                self.num_forking - pad_length_dec,
                                self.dec_len,
                                ts_len,
                            ),
                            # strides for 2D array expanded to 3D array of shape (dim1, dim2, dim3) =
                            # (1, n_rows, n_cols).  For transposed data, strides =
                            # (dtype, dtype * dim1, dtype*dim1*dim2) = (dtype, dtype, dtype*n_rows).
                            strides=stride[0:1] + stride,
                        )

                    # edge case for prediction_length = 1
                    if out[self._future(ts_field)].shape[-1] == 1:
                        out[self._future(ts_field)] = np.squeeze(
                            out[self._future(ts_field)], axis=-1)

            # Encoder pad indicator (not consumed downstream yet): marks where
            # left padding was applied for shorter time series
            pad_indicator = np.zeros(self.enc_len)
            pad_indicator[:pad_length_enc] = True
            out[self._past(self.is_pad_out)] = pad_indicator

            # So far pad forecast_start not in use
            out[FieldName.FORECAST_START] = shift_timestamp(
                out[self.start_in], idx)

            yield out
Example #20
 def transform(self, data: DataEntry) -> DataEntry:
     if self.output_field not in data.keys():
         data[self.output_field] = self.value
     return data
Example #21
 def transform(self, data: DataEntry) -> DataEntry:
     for k in self.field_names:
         if k in data.keys():
             del data[k]
     return data
Example #22
 def transform(self, data: DataEntry) -> DataEntry:
     return self.func(data.copy())
Example #23
    def flatmap_transform(
        self, data: DataEntry, is_train: bool
    ) -> Iterator[DataEntry]:

        assert data[self.start_field].freq == data[self.end_field].freq

        total_interval_length = (
            data[self.end_field] - data[self.start_field]
        ) / data[self.start_field].freq.delta

        # sample forecast start times in continuous time
        if is_train:
            if total_interval_length < (
                self.future_interval_length + self.past_interval_length
            ):
                sampling_times: np.ndarray = np.array([])
            else:
                sampling_times = self.train_sampler(
                    self.past_interval_length,
                    total_interval_length - self.future_interval_length,
                )
        else:
            sampling_times = np.array([total_interval_length])

        ia_times = data[self.target_field][0, :]
        marks = data[self.target_field][1:, :]

        ts = np.cumsum(ia_times)
        assert ts[-1] < total_interval_length, (
            "Target interarrival times provided are inconsistent with "
            "start and end timestamps."
        )

        # select field names that will be included in outputs
        keep_cols = {
            k: v
            for k, v in data.items()
            if k not in [self.target_field, self.start_field, self.end_field]
        }

        for future_start in sampling_times:

            r: DataEntry = dict()

            past_start = future_start - self.past_interval_length
            future_end = future_start + self.future_interval_length

            assert past_start >= 0

            past_mask = self._mask_sorted(ts, past_start, future_start)

            past_ia_times = np.diff(np.r_[0, ts[past_mask] - past_start])[
                np.newaxis
            ]

            r[f"past_{self.target_field}"] = np.concatenate(
                [past_ia_times, marks[:, past_mask]], axis=0
            ).transpose()

            r["past_valid_length"] = np.array([len(past_mask)])

            r[self.forecast_start_field] = (
                data[self.start_field]
                + data[self.start_field].freq.delta * future_start
            )

            if is_train:  # include the future only if is_train
                assert future_end <= total_interval_length

                future_mask = self._mask_sorted(ts, future_start, future_end)

                future_ia_times = np.diff(
                    np.r_[0, ts[future_mask] - future_start]
                )[np.newaxis]

                r[f"future_{self.target_field}"] = np.concatenate(
                    [future_ia_times, marks[:, future_mask]], axis=0
                ).transpose()

                r["future_valid_length"] = np.array([len(future_mask)])

            # include other fields
            r.update(keep_cols.copy())

            yield r
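The `_mask_sorted` helper is not shown; assuming its contract is "indices of the sorted event times falling in [a, b)", it can be sketched with np.searchsorted:

import numpy as np

def mask_sorted_sketch(ts: np.ndarray, a: float, b: float) -> np.ndarray:
    # Indices of events in the half-open window [a, b); `ts` must be sorted.
    lo = np.searchsorted(ts, a, side="left")
    hi = np.searchsorted(ts, b, side="left")
    return np.arange(lo, hi)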
Example #24
    def flatmap_transform(
        self, data: DataEntry, is_train: bool
    ) -> Iterator[DataEntry]:
        pl = self.future_length
        slice_cols = self.ts_fields + [self.target_field]
        target = data[self.target_field]

        len_target = target.shape[-1]

        if is_train:
            if len_target < self.future_length:
                # We currently cannot handle time series that are shorter than
                # the prediction length during training, so we just skip these.
                # If we want to include them we would need to pad and to mask
                # the loss.
                sampling_indices: List[int] = []
            else:
                if self.pick_incomplete:
                    sampling_indices = self.train_sampler(
                        target, 0, len_target - self.future_length
                    )
                else:
                    sampling_indices = self.train_sampler(
                        target,
                        self.past_length,
                        len_target - self.future_length,
                    )
        else:
            sampling_indices = [len_target]
        for i in sampling_indices:
            pad_length = max(self.past_length - i, 0)
            if not self.pick_incomplete:
                assert pad_length == 0
            d = data.copy()
            for ts_field in slice_cols:
                if i > self.past_length:
                    # truncate to past_length
                    past_piece = d[ts_field][..., i - self.past_length : i]
                elif i < self.past_length:
                    pad_block = np.zeros(
                        d[ts_field].shape[:-1] + (pad_length,),
                        dtype=d[ts_field].dtype,
                    )
                    past_piece = np.concatenate(
                        [pad_block, d[ts_field][..., :i]], axis=-1
                    )
                else:
                    past_piece = d[ts_field][..., :i]
                d[self._past(ts_field)] = past_piece
                d[self._future(ts_field)] = d[ts_field][..., i : i + pl]
                del d[ts_field]
            pad_indicator = np.zeros(self.past_length)
            if pad_length > 0:
                pad_indicator[:pad_length] = 1

            if self.output_NTC:
                for ts_field in slice_cols:
                    d[self._past(ts_field)] = d[
                        self._past(ts_field)
                    ].transpose()
                    d[self._future(ts_field)] = d[
                        self._future(ts_field)
                    ].transpose()

            d[self._past(self.is_pad_field)] = pad_indicator
            d[self.forecast_start_field] = shift_timestamp(
                d[self.start_field], i
            )
            yield d
Example #25
    def flatmap_transform(
        self, data: DataEntry, is_train: bool
    ) -> Iterator[DataEntry]:
        pl = self.future_length
        lt = self.lead_time
        slice_cols = self.ts_fields + [self.target_field]
        target = data[self.target_field]

        len_target = target.shape[-1]

        minimum_length = (
            self.future_length
            if self.pick_incomplete
            else self.past_length + self.future_length
        ) + self.lead_time

        if is_train:
            sampling_bounds = (
                (
                    0,
                    len_target - self.future_length - self.lead_time,
                )  # TODO: create parameter lower sampling bound for NBEATS
                if self.pick_incomplete
                else (
                    self.past_length,
                    len_target - self.future_length - self.lead_time,
                )
            )

            # We currently cannot handle time series that are
            # too short during training, so we just skip these.
            # If we want to include them we would need to pad and to
            # mask the loss.
            sampled_indices = (
                np.array([], dtype=int)
                if len_target < minimum_length
                else self.train_sampler(target, *sampling_bounds)
            )
        else:
            assert self.pick_incomplete or len_target >= self.past_length
            sampled_indices = np.array([len_target], dtype=int)
        for i in sampled_indices:
            pad_length = max(self.past_length - i, 0)
            if not self.pick_incomplete:
                assert (
                    pad_length == 0
                ), f"pad_length should be zero, got {pad_length}"
            d = data.copy()
            for ts_field in slice_cols:
                if i > self.past_length:
                    # truncate to past_length
                    past_piece = d[ts_field][..., i - self.past_length : i]
                elif i < self.past_length:
                    pad_block = (
                        np.ones(
                            d[ts_field].shape[:-1] + (pad_length,),
                            dtype=d[ts_field].dtype,
                        )
                        * self.dummy_value
                    )
                    past_piece = np.concatenate(
                        [pad_block, d[ts_field][..., :i]], axis=-1
                    )
                else:
                    past_piece = d[ts_field][..., :i]
                d[self._past(ts_field)] = past_piece
                d[self._future(ts_field)] = d[ts_field][
                    ..., i + lt : i + lt + pl
                ]
                del d[ts_field]
            pad_indicator = np.zeros(self.past_length)
            if pad_length > 0:
                pad_indicator[:pad_length] = 1

            if self.output_NTC:
                for ts_field in slice_cols:
                    d[self._past(ts_field)] = d[
                        self._past(ts_field)
                    ].transpose()
                    d[self._future(ts_field)] = d[
                        self._future(ts_field)
                    ].transpose()

            d[self._past(self.is_pad_field)] = pad_indicator
            d[self.forecast_start_field] = shift_timestamp(
                d[self.start_field], i + lt
            )
            yield d
Example #26
    def flatmap_transform(self, data: DataEntry,
                          is_train: bool) -> Iterator[DataEntry]:
        target = data[self.target_field]

        if is_train:
            # We currently cannot handle time series that are shorter than the
            # prediction length during training, so we just skip these.
            # If we want to include them we would need to pad and to mask
            # the loss.
            if len(target) < self.dec_len:
                return

            sampling_indices = self.train_sampler(target, 0,
                                                  len(target) - self.dec_len)
        else:
            sampling_indices = [len(target)]

        # Loop over all encoder and decoder fields, including disabled ones,
        # so that disabled fields are still emitted as dummy zero fields
        ts_fields_counter = Counter(
            set(self.encoder_series_fields + self.decoder_series_fields))

        for sampling_idx in sampling_indices:
            # irrelevant data should have been removed by now in the
            # transformation chain, so copying everything is ok
            out = data.copy()

            enc_len_diff = sampling_idx - self.enc_len
            dec_len_diff = sampling_idx - self.num_forking

            # ensure start indices are not negative
            start_idx_enc = max(0, enc_len_diff)
            start_idx_dec = max(0, dec_len_diff)

            # Pad lengths for shorter time series; the zero-initialized
            # past/future blocks below are filled in from these offsets
            pad_length_enc = max(0, -enc_len_diff)
            pad_length_dec = max(0, -dec_len_diff)

            for ts_field in list(ts_fields_counter.keys()):

                # target is 1d, this ensures ts is always 2d
                ts = np.atleast_2d(out[ts_field]).T
                ts_len = ts.shape[1]

                if ts_fields_counter[ts_field] == 1:
                    del out[ts_field]
                else:
                    ts_fields_counter[ts_field] -= 1

                out[self._past(ts_field)] = np.zeros(shape=(self.enc_len,
                                                            ts_len),
                                                     dtype=ts.dtype)
                if ts_field not in self.encoder_disabled_fields:
                    out[self._past(ts_field)][pad_length_enc:] = ts[
                        start_idx_enc:sampling_idx, :]

                # exclude some fields at prediction time
                if (not is_train
                        and ts_field in self.prediction_time_decoder_exclude):
                    continue

                if ts_field in self.decoder_series_fields:
                    out[self._future(ts_field)] = np.zeros(
                        shape=(self.num_forking, self.dec_len, ts_len),
                        dtype=ts.dtype,
                    )
                    if ts_field not in self.decoder_disabled_fields:
                        # This is where some of the forking magic happens:
                        # For each of the num_forking time-steps at which the decoder is applied we slice the
                        # corresponding inputs called decoder_fields to the appropriate dec_len
                        decoder_fields = ts[start_idx_dec + 1:sampling_idx +
                                            1, :]
                        # For default row-major arrays, strides = (dtype*n_cols, dtype). Since this array is transposed,
                        # it is stored in column-major (Fortran) ordering with strides = (dtype, dtype*n_rows)
                        stride = decoder_fields.strides
                        out[self._future(
                            ts_field
                        )][pad_length_dec:] = as_strided(
                            decoder_fields,
                            shape=(
                                self.num_forking - pad_length_dec,
                                self.dec_len,
                                ts_len,
                            ),
                            # strides for 2D array expanded to 3D array of shape (dim1, dim2, dim3) =
                            # (1, n_rows, n_cols).  For transposed data, strides =
                            # (dtype, dtype * dim1, dtype*dim1*dim2) = (dtype, dtype, dtype*n_rows).
                            strides=stride[0:1] + stride,
                        )

                    # edge case for prediction_length = 1
                    if out[self._future(ts_field)].shape[-1] == 1:
                        out[self._future(ts_field)] = np.squeeze(
                            out[self._future(ts_field)], axis=-1)

            # Encoder pad indicator (not consumed downstream yet): marks where
            # left padding was applied for shorter time series
            pad_indicator = np.zeros(self.enc_len)
            pad_indicator[:pad_length_enc] = True
            out[self._past(self.is_pad_out)] = pad_indicator

            # So far pad forecast_start not in use
            out[FieldName.FORECAST_START] = shift_timestamp(
                out[self.start_in], sampling_idx)

            yield out
Example #27
 def transform(self, data: DataEntry) -> DataEntry:
     for k in self.field_names:
         data.pop(k, None)
     return data
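For comparison with Example #21: dict.pop(k, None) folds the membership test into a single call:

d = {"a": 1}
d.pop("a", None)   # removes the key
d.pop("b", None)   # absent key: no-op, no KeyError
assert d == {}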
Example #28
    def flatmap_transform(
        self, data: DataEntry, is_train: bool
    ) -> Iterator[DataEntry]:
        dec_len = self.dec_len
        slice_cols = self.ts_fields + [self.target_in]
        target = data[self.target_in]

        if is_train:
            if len(target) < self.dec_len:
                # We currently cannot handle time series that are shorter than the
                # prediction length during training, so we just skip these.
                # If we want to include them we would need to pad and to mask
                # the loss.
                sampling_indices: List[int] = []
            else:
                sampling_indices = self.train_sampler(
                    target, 0, len(target) - self.dec_len
                )
        else:
            sampling_indices = [len(target)]

        for i in sampling_indices:
            pad_length = max(self.enc_len - i, 0)

            d = data.copy()
            for ts_field in slice_cols:
                if i > self.enc_len:
                    # truncate to enc_len
                    past_piece = d[ts_field][..., i - self.enc_len : i]
                elif i < self.enc_len:
                    pad_block = np.zeros(
                        d[ts_field].shape[:-1] + (pad_length,)
                    )
                    past_piece = np.concatenate(
                        [pad_block, d[ts_field][..., :i]], axis=-1
                    )
                else:
                    past_piece = d[ts_field][..., :i]

                d[self._past(ts_field)] = np.expand_dims(past_piece, -1)

                if is_train and ts_field == self.target_in:
                    forking_dec_field = np.zeros(
                        shape=(self.enc_len, self.dec_len)
                    )

                    for j in range(self.enc_len):
                        start_idx = i - self.enc_len + j + 1
                        if start_idx >= 0:
                            forking_dec_field[j, :] = d[ts_field][
                                ..., start_idx : start_idx + dec_len
                            ]

                    d[self._future(ts_field)] = forking_dec_field

                del d[ts_field]

            pad_indicator = np.zeros(self.enc_len)
            if pad_length > 0:
                pad_indicator[:pad_length] = 1
            d[self._past(self.is_pad_out)] = pad_indicator
            d[self.forecast_start_out] = shift_timestamp(d[self.start_in], i)
            yield d