def _inject_nans_in_target(data_entry: DataEntry, p: float) -> DataEntry: """ Returns a copy of the given `data_entry` where approximately `p` percent of the target values are NaNs. Parameters ---------- data_entry The data entry to use as source. p The fraction of target positions to set to NaN (between 0 and 1). Returns ------- A copy of `data_entry` with modified target field. """ nan_positions = np.sort(a=np.random.choice( a=np.arange(data_entry["target"].size, dtype=int), size=int(p * data_entry["target"].size), replace=False, )) nan_target = np.copy(data_entry["target"]) nan_target[nan_positions] = np.nan # if p < 1.0 at the last position should be kept unchanged # otherwise for large p we might end up with NaNs in the last # context_length positions if p < 1.0: nan_target[-1] = data_entry["target"][-1] return { key: (nan_target if key == "target" else val) for key, val in data_entry.items() }
def flatmap_transform(
    self, data: DataEntry, is_train: bool
) -> Iterator[DataEntry]:
    # Start and end timestamps must share a frequency, otherwise the
    # interval arithmetic below is meaningless.
    assert data[self.start_field].freq == data[self.end_field].freq

    # Length of the whole observation interval, expressed in continuous
    # units of the series frequency.
    total_interval_length = (
        data[self.end_field] - data[self.start_field]
    ) / data[self.start_field].freq.delta

    # sample forecast start times in continuous time
    if is_train:
        if total_interval_length < (
            self.future_interval_length + self.past_interval_length
        ):
            # Interval is too short to fit one (past, future) window:
            # emit no training instances for this entry.
            sampling_times: np.ndarray = np.array([])
        else:
            sampling_times = self.train_sampler(
                self.past_interval_length,
                total_interval_length - self.future_interval_length,
            )
    else:
        # At prediction time the single forecast start is the end of the
        # observed interval.
        sampling_times = np.array([total_interval_length])

    # Row 0 of the target carries interarrival times; the remaining rows
    # carry the marks associated with each event.
    ia_times = data[self.target_field][0, :]
    marks = data[self.target_field][1:, :]

    # Absolute event times, obtained by accumulating interarrival times.
    ts = np.cumsum(ia_times)
    assert ts[-1] < total_interval_length, (
        "Target interarrival times provided are inconsistent with "
        "start and end timestamps."
    )

    # select field names that will be included in outputs
    keep_cols = {
        k: v
        for k, v in data.items()
        if k not in [self.target_field, self.start_field, self.end_field]
    }

    for future_start in sampling_times:
        r: DataEntry = dict()

        past_start = future_start - self.past_interval_length
        future_end = future_start + self.future_interval_length

        # Sampler is expected to only propose starts with a full past
        # window inside the interval.
        assert past_start >= 0

        # Event indices falling in the past window.
        # NOTE(review): exact open/closed semantics of the window bounds
        # depend on self._mask_sorted, which is not visible here — confirm.
        past_mask = self._mask_sorted(ts, past_start, future_start)

        # Re-derive interarrival times relative to the window start; the
        # leading 0 makes the first entry the gap from the window start
        # to the first event. Shape (1, k) via np.newaxis.
        past_ia_times = np.diff(np.r_[0, ts[past_mask] - past_start])[
            np.newaxis
        ]

        # Stack interarrival row on top of mark rows, then transpose to
        # (num_events_in_window, 1 + num_mark_rows).
        r[f"past_{self.target_field}"] = np.concatenate(
            [past_ia_times, marks[:, past_mask]], axis=0
        ).transpose()
        r["past_valid_length"] = np.array([len(past_mask)])
        # Convert the continuous offset back to a timestamp.
        r[self.forecast_start_field] = (
            data[self.start_field]
            + data[self.start_field].freq.delta * future_start
        )

        if is_train:
            # include the future only if is_train
            assert future_end <= total_interval_length

            # Same windowing as above, applied to the future interval.
            future_mask = self._mask_sorted(ts, future_start, future_end)
            future_ia_times = np.diff(
                np.r_[0, ts[future_mask] - future_start]
            )[np.newaxis]

            r[f"future_{self.target_field}"] = np.concatenate(
                [future_ia_times, marks[:, future_mask]], axis=0
            ).transpose()
            r["future_valid_length"] = np.array([len(future_mask)])

        # include other fields
        r.update(keep_cols.copy())

        yield r