Example #1
    def _update_cache(self, start: pd.Timestamp, length: int) -> None:
        end = shift_timestamp(start, length)
        # nothing to do if the cached range already covers [start, end]
        if self._min_time_point is not None:
            if self._min_time_point <= start and end <= self._max_time_point:
                return
        if self._min_time_point is None:
            self._min_time_point = start
            self._max_time_point = end
        # grow the cached range with a 50-period margin on each side
        self._min_time_point = min(shift_timestamp(start, -50),
                                   self._min_time_point)
        self._max_time_point = max(shift_timestamp(end, 50),
                                   self._max_time_point)
        # rebuild the full date range and the timestamp -> position index
        self.full_date_range = pd.date_range(self._min_time_point,
                                             self._max_time_point,
                                             freq=start.freq)
        self._date_index = pd.Series(
            index=self.full_date_range,
            data=np.arange(len(self.full_date_range)),
        )
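
A minimal standalone sketch of the same caching idea (assuming `shift_timestamp` moves a `pd.Timestamp` by the given number of periods of its frequency; the 50-period margin presumably just reduces how often the cache has to be rebuilt):

import numpy as np
import pandas as pd

# Toy version of the cached index: once the date range is materialized,
# any timestamp inside it maps to an integer position via a plain index lookup.
full_date_range = pd.date_range("2021-01-01", periods=10, freq="D")
date_index = pd.Series(np.arange(len(full_date_range)), index=full_date_range)

print(date_index[pd.Timestamp("2021-01-04")])  # 3
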
Example #2
    def flatmap_transform(
        self, data: DataEntry, is_train: bool
    ) -> Iterator[DataEntry]:
        pl = self.future_length
        lt = self.lead_time
        target = data[self.target_field]

        sampled_indices = self.instance_sampler(target)

        slice_cols = (
            self.ts_fields
            + self.past_ts_fields
            + [self.target_field, self.observed_value_field]
        )
        for i in sampled_indices:
            pad_length = max(self.past_length - i, 0)
            d = data.copy()

            for field in slice_cols:
                if i >= self.past_length:
                    past_piece = d[field][..., i - self.past_length : i]
                else:
                    pad_block = np.full(
                        shape=d[field].shape[:-1] + (pad_length,),
                        fill_value=self.dummy_value,
                        dtype=d[field].dtype,
                    )
                    past_piece = np.concatenate(
                        [pad_block, d[field][..., :i]], axis=-1
                    )
                future_piece = d[field][..., (i + lt) : (i + lt + pl)]
                if field in self.ts_fields:
                    piece = np.concatenate([past_piece, future_piece], axis=-1)
                    if self.output_NTC:
                        piece = piece.transpose()
                    d[field] = piece
                else:
                    if self.output_NTC:
                        past_piece = past_piece.transpose()
                        future_piece = future_piece.transpose()
                    if field not in self.past_ts_fields:
                        d[self._past(field)] = past_piece
                        d[self._future(field)] = future_piece
                        del d[field]
                    else:
                        d[field] = past_piece
            pad_indicator = np.zeros(self.past_length)
            if pad_length > 0:
                pad_indicator[:pad_length] = 1
            d[self._past(self.is_pad_field)] = pad_indicator
            d[self.forecast_start_field] = shift_timestamp(
                d[self.start_field], i + lt
            )
            yield d
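
A small illustration (with made-up sizes) of the per-field past/future slicing above: when the split point `i` leaves fewer than `past_length` observations, the past window is left-padded with `dummy_value`.

import numpy as np

target = np.arange(1.0, 7.0)                 # [1, 2, 3, 4, 5, 6]
past_length, future_length, lead_time = 4, 2, 0
dummy_value = 0.0

i = 3                                        # only 3 past values available
pad_length = max(past_length - i, 0)         # 1
pad_block = np.full((pad_length,), dummy_value)
past_piece = np.concatenate([pad_block, target[:i]])                  # [0. 1. 2. 3.]
future_piece = target[i + lead_time: i + lead_time + future_length]   # [4. 5.]
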
Example #3
    def flatmap_transform(self, data: DataEntry,
                          is_train: bool) -> Iterator[DataEntry]:
        target = data[self.target_field]

        if is_train:
            # We currently cannot handle time series that are shorter than the
            # prediction length during training, so we just skip these.
            # If we want to include them we would need to pad and to mask
            # the loss.
            if len(target) < self.dec_len:
                return

            sampling_indices = self.train_sampler(target, 0,
                                                  len(target) - self.dec_len)
        else:
            sampling_indices = [len(target)]

        # Loop over all encoder and decoder fields, including disabled ones,
        # so that disabled fields are still emitted as dummy all-zero arrays
        ts_fields_counter = Counter(
            set(self.encoder_series_fields + self.decoder_series_fields))

        for sampling_idx in sampling_indices:
            # irrelevant data should have been removed by now in the
            # transformation chain, so copying everything is ok
            out = data.copy()

            enc_len_diff = sampling_idx - self.enc_len
            dec_len_diff = sampling_idx - self.num_forking

            # ensure start indices are not negative
            start_idx_enc = max(0, enc_len_diff)
            start_idx_dec = max(0, dec_len_diff)

            # Pad lengths (in time steps) for series shorter than the encoder/decoder windows
            pad_length_enc = max(0, -enc_len_diff)
            pad_length_dec = max(0, -dec_len_diff)

            for ts_field in list(ts_fields_counter.keys()):

                # target is 1d, this ensures ts is always 2d
                ts = np.atleast_2d(out[ts_field]).T
                ts_len = ts.shape[1]

                if ts_fields_counter[ts_field] == 1:
                    del out[ts_field]
                else:
                    ts_fields_counter[ts_field] -= 1

                out[self._past(ts_field)] = np.zeros(shape=(self.enc_len,
                                                            ts_len),
                                                     dtype=ts.dtype)
                if ts_field not in self.encoder_disabled_fields:
                    out[self._past(ts_field)][pad_length_enc:] = ts[
                        start_idx_enc:sampling_idx, :]

                # exclude some fields at prediction time
                if (not is_train
                        and ts_field in self.prediction_time_decoder_exclude):
                    continue

                if ts_field in self.decoder_series_fields:
                    out[self._future(ts_field)] = np.zeros(
                        shape=(self.num_forking, self.dec_len, ts_len),
                        dtype=ts.dtype,
                    )
                    if ts_field not in self.decoder_disabled_fields:
                        # This is where some of the forking magic happens:
                        # For each of the num_forking time-steps at which the decoder is applied we slice the
                        # corresponding inputs called decoder_fields to the appropriate dec_len
                        decoder_fields = ts[start_idx_dec + 1:sampling_idx +
                                            1, :]
                        # For default row-major arrays, strides = (dtype*n_cols, dtype). Since this array is transposed,
                        # it is stored in column-major (Fortran) ordering with strides = (dtype, dtype*n_rows)
                        stride = decoder_fields.strides
                        out[self._future(
                            ts_field
                        )][pad_length_dec:] = as_strided(
                            decoder_fields,
                            shape=(
                                self.num_forking - pad_length_dec,
                                self.dec_len,
                                ts_len,
                            ),
                            # strides for 2D array expanded to 3D array of shape (dim1, dim2, dim3) =
                            # (1, n_rows, n_cols).  For transposed data, strides =
                            # (dtype, dtype * dim1, dtype*dim1*dim2) = (dtype, dtype, dtype*n_rows).
                            strides=stride[0:1] + stride,
                        )

                    # edge case for prediction_length = 1
                    if out[self._future(ts_field)].shape[-1] == 1:
                        out[self._future(ts_field)] = np.squeeze(
                            out[self._future(ts_field)], axis=-1)

            # Encoder pad indicator (not yet used downstream):
            # marks where left padding was applied for shorter time series
            pad_indicator = np.zeros(self.enc_len)
            pad_indicator[:pad_length_enc] = True
            out[self._past(self.is_pad_out)] = pad_indicator

            # So far pad forecast_start not in use
            out[FieldName.FORECAST_START] = shift_timestamp(
                out[self.start_in], sampling_idx)

            yield out
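
The `as_strided` call above builds the `(num_forking, dec_len, ts_len)` block of overlapping decoder windows without copying data. A self-contained sketch of the same construction on a toy array (sizes are made up; the real transform additionally handles padding and offsets):

import numpy as np
from numpy.lib.stride_tricks import as_strided

target = np.arange(8, dtype=float)           # toy 1-D target
num_forking, dec_len = 4, 3

ts = np.atleast_2d(target).T                 # shape (8, 1), a transposed view
stride = ts.strides                          # (itemsize, 8 * itemsize) here

windows = as_strided(
    ts,
    shape=(num_forking, dec_len, ts.shape[1]),
    strides=stride[0:1] + stride,            # windows[f, d] == ts[f + d]
)

# Each windows[f] is the dec_len-long slice of ts starting at offset f,
# i.e. exactly the overlapping windows an explicit copy loop would produce.
expected = np.stack([ts[f:f + dec_len] for f in range(num_forking)])
assert np.array_equal(windows, expected)
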
Example #4
    def flatmap_transform(self, data: DataEntry,
                          is_train: bool) -> Iterator[DataEntry]:
        target = data[self.target_field]

        sampled_indices = self.instance_sampler(target)

        ts_fields = set(self.encoder_series_fields +
                        self.decoder_series_fields)

        for idx in sampled_indices:
            # irrelevant data should have been removed by now in the
            # transformation chain, so copying everything is ok
            out = data.copy()

            enc_len_diff = idx - self.enc_len
            dec_len_diff = idx - self.num_forking

            # ensure start indices are not negative
            start_idx_enc = max(0, enc_len_diff)
            start_idx_dec = max(0, dec_len_diff)

            # Pad lengths (in time steps) for series shorter than the encoder/decoder windows
            pad_length_enc = max(0, -enc_len_diff)
            pad_length_dec = max(0, -dec_len_diff)

            for ts_field in ts_fields:

                # target is 1d, this ensures ts is always 2d
                ts = np.atleast_2d(out[ts_field]).T
                ts_len = ts.shape[1]

                del out[ts_field]

                out[self._past(ts_field)] = np.zeros(shape=(self.enc_len,
                                                            ts_len),
                                                     dtype=ts.dtype)
                if ts_field not in self.encoder_disabled_fields:
                    out[self._past(ts_field)][pad_length_enc:] = ts[
                        start_idx_enc:idx, :]

                if ts_field in self.decoder_series_fields:
                    out[self._future(ts_field)] = np.zeros(
                        shape=(self.num_forking, self.dec_len, ts_len),
                        dtype=ts.dtype,
                    )
                    if ts_field not in self.decoder_disabled_fields:
                        # This is where some of the forking magic happens:
                        # For each of the num_forking time-steps at which the decoder is applied we slice the
                        # corresponding inputs called decoder_fields to the appropriate dec_len
                        decoder_fields = ts[start_idx_dec + 1:idx + 1, :]
                        # For default row-major arrays, strides = (dtype*n_cols, dtype). Since this array is transposed,
                        # it is stored in column-major (Fortran) ordering with strides = (dtype, dtype*n_rows)
                        stride = decoder_fields.strides
                        out[self._future(
                            ts_field
                        )][pad_length_dec:] = as_strided(
                            decoder_fields,
                            shape=(
                                self.num_forking - pad_length_dec,
                                self.dec_len,
                                ts_len,
                            ),
                            # strides for 2D array expanded to 3D array of shape (dim1, dim2, dim3) =
                            # (1, n_rows, n_cols).  For transposed data, strides =
                            # (dtype, dtype * dim1, dtype*dim1*dim2) = (dtype, dtype, dtype*n_rows).
                            strides=stride[0:1] + stride,
                        )

                    # edge case for prediction_length = 1
                    if out[self._future(ts_field)].shape[-1] == 1:
                        out[self._future(ts_field)] = np.squeeze(
                            out[self._future(ts_field)], axis=-1)

            # Encoder pad indicator (not yet used downstream):
            # marks where left padding was applied for shorter time series
            pad_indicator = np.zeros(self.enc_len)
            pad_indicator[:pad_length_enc] = True
            out[self._past(self.is_pad_out)] = pad_indicator

            # So far pad forecast_start not in use
            out[FieldName.FORECAST_START] = shift_timestamp(
                out[self.start_in], idx)

            yield out
Example #5
    def flatmap_transform(self, data: DataEntry,
                          is_train: bool) -> Iterator[DataEntry]:
        target = data[self.target_field]

        if is_train:
            # We currently cannot handle time series that are shorter than the
            # prediction length during training, so we just skip these.
            # If we want to include them we would need to pad and to mask
            # the loss.
            if len(target) < self.dec_len:
                return

            sampling_indices = self.train_sampler(target, 0,
                                                  len(target) - self.dec_len)
        else:
            sampling_indices = [len(target)]

        ts_fields_counter = Counter(
            set(self.encoder_series_fields + self.decoder_series_fields))

        for sampling_idx in sampling_indices:
            # ensure start index is not negative
            start_idx = max(0, sampling_idx - self.enc_len)

            # irrelevant data should have been removed by now in the
            # transformation chain, so copying everything is ok
            out = data.copy()

            for ts_field in list(ts_fields_counter.keys()):

                # target is 1d, this ensures ts is always 2d
                ts = np.atleast_2d(out[ts_field])

                if ts_fields_counter[ts_field] == 1:
                    del out[ts_field]
                else:
                    ts_fields_counter[ts_field] -= 1

                # take enc_len values from ts, depending on sampling_idx
                slice = ts[:, start_idx:sampling_idx]

                # if we have less than enc_len values, pad_left with 0
                past_piece = pad_to_size(slice, self.enc_len)

                out[self._past(ts_field)] = past_piece.transpose()

                # exclude some fields at prediction time
                if (not is_train
                        and ts_field in self.prediction_time_decoder_exclude):
                    continue

                # This is where some of the forking magic happens:
                # For each of the enc_len time-steps at which the decoder is applied we slice the
                # corresponding inputs called decoder_fields to the appropriate dec_len
                if (ts_field in self.decoder_series_fields +
                        self.decoder_disabled_fields):
                    forking_dec_field = np.zeros(shape=(self.enc_len,
                                                        self.dec_len, len(ts)))

                    # in case it's not disabled we copy the actual values
                    if ts_field not in self.decoder_disabled_fields:
                        skip = max(0, self.enc_len - sampling_idx)
                        # This section takes by far the longest time computationally:
                        # This scales linearly in self.enc_len and linearly in self.dec_len
                        for dec_field, idx in zip(
                                forking_dec_field[skip:],
                                range(start_idx + 1,
                                      start_idx + self.enc_len + 1),
                        ):
                            dec_field[:] = ts[:, idx:idx + self.dec_len].T

                    if forking_dec_field.shape[-1] == 1:
                        out[self._future(ts_field)] = np.squeeze(
                            forking_dec_field, axis=-1)
                    else:
                        out[self._future(ts_field)] = forking_dec_field

            # So far pad indicator not in use
            pad_indicator = np.zeros(self.enc_len)
            pad_length = max(0, self.enc_len - sampling_idx)
            pad_indicator[:pad_length] = True
            out[self._past(self.is_pad_out)] = pad_indicator

            # So far pad forecast_start not in use
            out[FieldName.FORECAST_START] = shift_timestamp(
                out[self.start_in], sampling_idx)

            yield out
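
`pad_to_size` is not defined in these excerpts. From the way it is called here (a `(features, time)` slice padded up to `enc_len` and then transposed), it presumably left-pads the last axis with zeros; a hypothetical sketch, not the library's actual helper:

import numpy as np

def pad_to_size(x: np.ndarray, size: int) -> np.ndarray:
    """Hypothetical helper: left-pad the last (time) axis with zeros up to `size`."""
    pad_length = size - x.shape[-1]
    if pad_length <= 0:
        return x
    pad = np.zeros(x.shape[:-1] + (pad_length,), dtype=x.dtype)
    return np.concatenate([pad, x], axis=-1)

print(pad_to_size(np.array([[1.0, 2.0, 3.0]]), 5))  # [[0. 0. 1. 2. 3.]]

Note that the variant in the next example slices time along the first axis instead, so its version of the helper would presumably pad axis 0.
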
Example #6
    def flatmap_transform(
        self, data: DataEntry, is_train: bool
    ) -> Iterator[DataEntry]:
        target = data[self.target_field]

        if is_train:
            # We currently cannot handle time series that are shorter than the
            # prediction length during training, so we just skip these.
            # If we want to include them we would need to pad and to mask
            # the loss.
            if len(target) < self.dec_len:
                return

            sampling_indices = self.train_sampler(
                target, 0, len(target) - self.dec_len
            )
        else:
            sampling_indices = [len(target)]

        # Loop over all encoder and decoder fields, including disabled ones,
        # so that disabled fields are still emitted as dummy all-zero arrays
        ts_fields_counter = Counter(
            set(self.encoder_series_fields + self.decoder_series_fields)
        )

        for sampling_idx in sampling_indices:
            # ensure start index is not negative
            start_idx = max(0, sampling_idx - self.enc_len)

            # irrelevant data should have been removed by now in the
            # transformation chain, so copying everything is ok
            out = data.copy()

            for ts_field in list(ts_fields_counter.keys()):

                # target is 1d, this ensures ts is always 2d
                ts = np.atleast_2d(out[ts_field]).T

                if ts_fields_counter[ts_field] == 1:
                    del out[ts_field]
                else:
                    ts_fields_counter[ts_field] -= 1

                # take enc_len values from ts, depending on sampling_idx
                slice = ts[start_idx:sampling_idx, :]

                ts_len = ts.shape[1]
                past_piece = np.zeros(
                    shape=(self.enc_len, ts_len), dtype=ts.dtype
                )

                if ts_field not in self.encoder_disabled_fields:
                    # if we have less than enc_len values, pad_left with 0
                    past_piece = pad_to_size(slice, self.enc_len)
                out[self._past(ts_field)] = past_piece

                # exclude some fields at prediction time
                if (
                    not is_train
                    and ts_field in self.prediction_time_decoder_exclude
                ):
                    continue

                # This is where some of the forking magic happens:
                # For each of the num_forking time-steps at which the decoder is applied we slice the
                # corresponding inputs called decoder_fields to the appropriate dec_len
                if ts_field in self.decoder_series_fields:

                    forking_dec_field = np.zeros(
                        shape=(self.num_forking, self.dec_len, ts_len),
                        dtype=ts.dtype,
                    )
                    # in case it's not disabled we copy the actual values
                    if ts_field not in self.decoder_disabled_fields:
                        # In case we sample an index too close to the beginning of the time series we would run out of
                        # bounds (i.e. try to copy non-existent time series data) when preparing the input for the decoder.
                        # Instead of copying the partially available data from the time series and padding it with
                        # zeros, we simply skip copying the partial data. Since copying would overwrite the
                        # zero-initialized 3D array, the end result of skipping is that the affected 2D decoder
                        # inputs (entries of the 3D array, of which there are `skip` many) remain all 0.
                        skip = max(0, self.num_forking - sampling_idx)
                        start_idx = max(0, sampling_idx - self.num_forking)
                        # For 2D column-major (Fortran) ordering transposed array strides = (dtype, dtype*n_rows)
                        # For standard row-major arrays, strides = (dtype*n_cols, dtype)
                        stride = ts.strides
                        forking_dec_field[skip:, :, :] = as_strided(
                            ts[
                                start_idx
                                + 1 : start_idx
                                + 1
                                + self.num_forking
                                - skip,
                                :,
                            ],
                            shape=(
                                self.num_forking - skip,
                                self.dec_len,
                                ts_len,
                            ),
                            # strides for 2D array expanded to 3D array of shape (dim1, dim2, dim3) =
                            # (1, n_rows, n_cols).  Note since this array has been transposed, it is stored in
                            # column-major (Fortran) ordering, i.e. for transposed data of shape (dim1, dim2, dim3),
                            # strides = (dtype, dtype * dim1, dtype*dim1*dim2) = (dtype, dtype, dtype*n_rows).
                            strides=stride[0:1] + stride,
                        )
                    # edge case for prediction_length = 1
                    if forking_dec_field.shape[-1] == 1:
                        out[self._future(ts_field)] = np.squeeze(
                            forking_dec_field, axis=-1
                        )
                    else:
                        out[self._future(ts_field)] = forking_dec_field

            # So far pad indicator not in use
            pad_indicator = np.zeros(self.enc_len)
            pad_length = max(0, self.enc_len - sampling_idx)
            pad_indicator[:pad_length] = True
            out[self._past(self.is_pad_out)] = pad_indicator

            # So far pad forecast_start not in use
            out[FieldName.FORECAST_START] = shift_timestamp(
                out[self.start_in], sampling_idx
            )

            yield out
Example #7
    def flatmap_transform(self, data: DataEntry,
                          is_train: bool) -> Iterator[DataEntry]:
        pl = self.future_length
        lt = self.lead_time
        target = data[self.target_field]
        len_target = target.shape[-1]

        minimum_length = (self.future_length
                          if self.pick_incomplete else self.past_length +
                          self.future_length) + self.lead_time

        if is_train:
            sampling_bounds = ((
                0,
                len_target - self.future_length - self.lead_time,
            ) if self.pick_incomplete else (
                self.past_length,
                len_target - self.future_length - self.lead_time,
            ))

            # We currently cannot handle time series that are
            # too short during training, so we just skip these.
            # If we want to include them we would need to pad and to
            # mask the loss.
            sampled_indices = (np.array([], dtype=int)
                               if len_target < minimum_length else
                               self.train_sampler(target, *sampling_bounds))
        else:
            assert self.pick_incomplete or len_target >= self.past_length
            sampled_indices = np.array([len_target], dtype=int)

        slice_cols = (self.ts_fields + self.past_ts_fields +
                      [self.target_field, self.observed_value_field])
        for i in sampled_indices:
            pad_length = max(self.past_length - i, 0)
            if not self.pick_incomplete and pad_length > 0:
                raise RuntimeError(
                    f"pad_length should be zero, got {pad_length}")
            d = data.copy()

            for field in slice_cols:
                if i >= self.past_length:
                    past_piece = d[field][..., i - self.past_length:i]
                else:
                    pad_block = (np.ones(
                        d[field].shape[:-1] + (pad_length, ),
                        dtype=d[field].dtype,
                    ) * self.dummy_value)
                    past_piece = np.concatenate([pad_block, d[field][..., :i]],
                                                axis=-1)
                future_piece = d[field][..., (i + lt):(i + lt + pl)]
                if field in self.ts_fields:
                    piece = np.concatenate([past_piece, future_piece], axis=-1)
                    if self.output_NTC:
                        piece = piece.transpose()
                    d[field] = piece
                else:
                    if self.output_NTC:
                        past_piece = past_piece.transpose()
                        future_piece = future_piece.transpose()
                    d[self._past(field)] = past_piece
                    if field not in self.past_ts_fields:
                        d[self._future(field)] = future_piece
                    del d[field]

            pad_indicator = np.zeros(self.past_length)
            if pad_length > 0:
                pad_indicator[:pad_length] = 1
            d[self._past(self.is_pad_field)] = pad_indicator
            d[self.forecast_start_field] = shift_timestamp(
                d[self.start_field], i + lt)
            yield d
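
A quick check of the `pick_incomplete` bounds logic above with made-up numbers (past_length = 10, future_length = 5, lead_time = 0, target of length 12):

past_length, future_length, lead_time, len_target = 10, 5, 0, 12

for pick_incomplete in (True, False):
    minimum_length = (
        future_length if pick_incomplete else past_length + future_length
    ) + lead_time
    lower = 0 if pick_incomplete else past_length
    upper = len_target - future_length - lead_time
    if len_target < minimum_length:
        print(pick_incomplete, "series skipped")
    else:
        print(pick_incomplete, "sampling bounds", (lower, upper))

# True  -> sampling bounds (0, 7)
# False -> series skipped (12 < minimum_length of 15)
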
Example #8
    def flatmap_transform(
        self, data: DataEntry, is_train: bool
    ) -> Iterator[DataEntry]:
        dec_len = self.dec_len
        slice_cols = self.ts_fields + [self.target_in]
        target = data[self.target_in]

        if is_train:
            if len(target) < self.dec_len:
                # We currently cannot handle time series that are shorter than the
                # prediction length during training, so we just skip these.
                # If we want to include them we would need to pad and to mask
                # the loss.
                sampling_indices: List[int] = []
            else:
                sampling_indices = self.train_sampler(
                    target, 0, len(target) - self.dec_len
                )
        else:
            sampling_indices = [len(target)]

        for i in sampling_indices:
            pad_length = max(self.enc_len - i, 0)

            d = data.copy()
            for ts_field in slice_cols:
                if i > self.enc_len:
                    # truncate to past_length
                    past_piece = d[ts_field][..., i - self.enc_len : i]
                elif i < self.enc_len:
                    pad_block = np.zeros(
                        d[ts_field].shape[:-1] + (pad_length,)
                    )
                    past_piece = np.concatenate(
                        [pad_block, d[ts_field][..., :i]], axis=-1
                    )
                else:
                    past_piece = d[ts_field][..., :i]

                d[self._past(ts_field)] = np.expand_dims(past_piece, -1)

                if is_train and ts_field == self.target_in:
                    forking_dec_field = np.zeros(
                        shape=(self.enc_len, self.dec_len)
                    )

                    for j in range(self.enc_len):
                        start_idx = i - self.enc_len + j + 1
                        if start_idx >= 0:
                            forking_dec_field[j, :] = d[ts_field][
                                ..., start_idx : start_idx + dec_len
                            ]

                    d[self._future(ts_field)] = forking_dec_field

                del d[ts_field]

            pad_indicator = np.zeros(self.enc_len)
            if pad_length > 0:
                pad_indicator[:pad_length] = 1
            d[self._past(self.is_pad_out)] = pad_indicator
            d[self.forecast_start_out] = shift_timestamp(d[self.start_in], i)
            yield d