Example #1
0
def _inject_nans_in_target(data_entry: DataEntry, p: float) -> DataEntry:
    """
    Return a copy of `data_entry` whose target has roughly a fraction `p`
    of its values replaced by NaN.

    Parameters
    ----------
    data_entry
        Source data entry; only the "target" field is modified.
    p
        Fraction of target positions to set to NaN (between 0 and 1).

    Returns
    -------
        A shallow copy of `data_entry` with the NaN-injected target.
    """
    target = data_entry["target"]

    # Draw distinct positions to blank out; sort them for readability of
    # the resulting index array (choice returns them in random order).
    nan_positions = np.sort(a=np.random.choice(
        a=np.arange(target.size, dtype=int),
        size=int(p * target.size),
        replace=False,
    ))

    corrupted = np.copy(target)
    corrupted[nan_positions] = np.nan

    # Unless p == 1.0, restore the final value: for large p we might
    # otherwise end up with NaNs in the last context_length positions.
    if p < 1.0:
        corrupted[-1] = target[-1]

    return {**data_entry, "target": corrupted}
Example #2
0
    def flatmap_transform(
        self, data: DataEntry, is_train: bool
    ) -> Iterator[DataEntry]:
        """
        Slice a continuous-time point-process series into training (or
        inference) instances.

        The target field is expected to be a 2-D array whose first row
        holds interarrival times and whose remaining rows hold marks
        (one column per event). For each sampled forecast start time,
        a past window (and, when `is_train`, a future window) of events
        is extracted and re-expressed as interarrival times relative to
        the window start.

        Parameters
        ----------
        data
            Input data entry; must contain the target, start and end
            fields configured on this transformation.
        is_train
            If True, sample multiple forecast start times and include
            future windows; otherwise yield a single instance anchored
            at the end of the interval, without future fields.
        """

        # Start and end timestamps must share the same frequency for the
        # interval arithmetic below to be meaningful.
        assert data[self.start_field].freq == data[self.end_field].freq

        # Interval length in units of the frequency's time delta
        # (start/end are presumably pandas Timestamps — TODO confirm).
        total_interval_length = (
            data[self.end_field] - data[self.start_field]
        ) / data[self.start_field].freq.delta

        # sample forecast start times in continuous time
        if is_train:
            if total_interval_length < (
                self.future_interval_length + self.past_interval_length
            ):
                # Series too short to fit one past+future window: no samples.
                sampling_times: np.ndarray = np.array([])
            else:
                # Sample start times so that a full past window fits before
                # and a full future window fits after each sampled time.
                sampling_times = self.train_sampler(
                    self.past_interval_length,
                    total_interval_length - self.future_interval_length,
                )
        else:
            # Inference: single instance anchored at the interval end.
            sampling_times = np.array([total_interval_length])

        # Row 0: interarrival times; rows 1..: mark features per event.
        ia_times = data[self.target_field][0, :]
        marks = data[self.target_field][1:, :]

        # Absolute event times (offsets from the interval start).
        ts = np.cumsum(ia_times)
        assert ts[-1] < total_interval_length, (
            "Target interarrival times provided are inconsistent with "
            "start and end timestamps."
        )

        # select field names that will be included in outputs
        keep_cols = {
            k: v
            for k, v in data.items()
            if k not in [self.target_field, self.start_field, self.end_field]
        }

        for future_start in sampling_times:

            r: DataEntry = dict()

            past_start = future_start - self.past_interval_length
            future_end = future_start + self.future_interval_length

            # Guaranteed by the sampler's lower bound above.
            assert past_start >= 0

            # Events falling in the past window [past_start, future_start)
            # (_mask_sorted presumably returns integer indices into ts,
            # since len(past_mask) below is used as an event count — TODO
            # confirm).
            past_mask = self._mask_sorted(ts, past_start, future_start)

            # Interarrival times recomputed relative to the window start:
            # prepend 0 so the first event's time is its offset into the
            # window.
            past_ia_times = np.diff(np.r_[0, ts[past_mask] - past_start])[
                np.newaxis
            ]

            # Shape after transpose: (num_events, 1 + num_mark_rows).
            r[f"past_{self.target_field}"] = np.concatenate(
                [past_ia_times, marks[:, past_mask]], axis=0
            ).transpose()

            r["past_valid_length"] = np.array([len(past_mask)])

            # Forecast start as an absolute timestamp: interval start plus
            # the continuous-time offset scaled by the frequency delta.
            r[self.forecast_start_field] = (
                data[self.start_field]
                + data[self.start_field].freq.delta * future_start
            )

            if is_train:  # include the future only if is_train
                assert future_end <= total_interval_length

                # Same construction for the future window
                # [future_start, future_end).
                future_mask = self._mask_sorted(ts, future_start, future_end)

                future_ia_times = np.diff(
                    np.r_[0, ts[future_mask] - future_start]
                )[np.newaxis]

                r[f"future_{self.target_field}"] = np.concatenate(
                    [future_ia_times, marks[:, future_mask]], axis=0
                ).transpose()

                r["future_valid_length"] = np.array([len(future_mask)])

            # include other fields
            # NOTE(review): .copy() is shallow — the pass-through values
            # themselves are shared across all yielded entries; confirm
            # downstream consumers do not mutate them in place.
            r.update(keep_cols.copy())

            yield r