def with_new_start_times(self, start_times: Union[np.ndarray, Sequence]) -> 'TimeSeriesDataset':
    """
    Subset a TimeSeriesDataset so that some/all of the groups have later start times.

    :param start_times: An array/list of new datetimes, one per group (in the same order as `group_names`).
    :return: A new TimeSeriesDataset.
    """
    new_tensors = []
    for i, tens in enumerate(self.tensors):
        times = self.times(i)
        new_tens = []
        for g, (new_time, old_times) in enumerate(zip(start_times, times)):
            if (old_times <= new_time).all():
                raise ValueError(
                    f"{new_time} is at or after the last time for group {self.group_names[g]}"
                )
            elif (old_times > new_time).all():
                raise ValueError(
                    f"{new_time} is earlier than all the times for group {self.group_names[g]}"
                )
            # drop timesteps before the new start time:
            g_tens = tens[g, true1d_idx(old_times >= new_time), :]
            # drop trailing timesteps where every measure is nan:
            all_nan = torch.isnan(g_tens).all(1)
            end_idx = true1d_idx(~all_nan).max() + 1
            new_tens.append(g_tens[:end_idx].unsqueeze(0))
        new_tens = ragged_cat(new_tens, ragged_dim=1, cat_dim=0)
        new_tensors.append(new_tens)
    return type(self)(
        *new_tensors,
        group_names=self.group_names,
        start_times=start_times,
        measures=self.measures,
        dt_unit=self.dt_unit
    )
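
# A minimal usage sketch for `with_new_start_times` (the `dataset` instance and
# dates below are hypothetical):
#
#     new_starts = np.array(['2020-03-01', '2020-06-01'], dtype='datetime64[D]')
#     trimmed = dataset.with_new_start_times(new_starts)
#
# Each group's series then begins at its new start time, with trailing all-nan
# timesteps dropped; `ragged_cat` re-pads the groups to a common length.
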
def tensor_to_dataframe(tensor: Tensor,
                        times: np.ndarray,
                        group_names: Sequence,
                        group_colname: str,
                        time_colname: str,
                        measures: Sequence[str]) -> 'DataFrame':
    """
    Convert a (group, time, measure) tensor into a long-format DataFrame.

    :param tensor: A 3D tensor of shape (num_groups, num_timesteps, num_measures).
    :param times: An array of datetimes with one row per group.
    :param group_names: Group names, one per row of `tensor`.
    :param group_colname: Name for the group column in the output.
    :param time_colname: Name for the time column in the output.
    :param measures: Column names for the measures.
    :return: A DataFrame with one row per (group, time).
    """
    from pandas import DataFrame, concat

    tensor = tensor.data.numpy()
    assert tensor.shape[0] == len(group_names)
    assert tensor.shape[0] == len(times)
    assert tensor.shape[1] <= times.shape[1]
    assert tensor.shape[2] == len(measures)

    dfs = []
    for g, group_name in enumerate(group_names):
        # get values, dropping trailing all-nan rows:
        values = tensor[g]
        all_nan_per_row = np.isnan(values).all(axis=1)
        if all_nan_per_row.all():
            warn(f"Group {group_name} has only missing values.")
            continue
        end_idx = true1d_idx(~all_nan_per_row).max() + 1

        # convert to dataframe:
        df = DataFrame(data=values[:end_idx, :], columns=measures)
        df[group_colname] = group_name
        df[time_colname] = times[g, 0:len(df.index)]
        dfs.append(df)

    return concat(dfs)
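
# Sketch of converting a dataset's first tensor to a long-format DataFrame (the
# `dataset` instance and column names are hypothetical; the `times`/`measures`
# access mirrors attributes used elsewhere in this class):
#
#     df = tensor_to_dataframe(
#         tensor=dataset.tensors[0],
#         times=dataset.times(0),
#         group_names=dataset.group_names,
#         group_colname='group',
#         time_colname='time',
#         measures=dataset.measures[0],
#     )
#
# The result has one row per (group, time) and one column per measure, with each
# group's trailing all-nan timesteps omitted.
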
def get_groups(self, groups: Sequence[Any]) -> 'TimeSeriesDataset':
    """
    Get the subset of the batch corresponding to `groups`. Note that the ordering in the output will
    match the original ordering (not that of `groups`), and that duplicates will be dropped.
    """
    group_idx = true1d_idx(np.isin(self.group_names, groups))
    return self[group_idx]
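
# For example (group names are hypothetical):
#
#     subset = dataset.get_groups(['store_01', 'store_07', 'store_01'])
#
# `subset` contains each requested group exactly once, ordered as in
# `dataset.group_names` rather than as passed.
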
def train_val_split(self,
                    train_frac: float = None,
                    dt: Union[np.datetime64, dict] = None
                    ) -> Tuple['TimeSeriesDataset', 'TimeSeriesDataset']:
    """
    :param train_frac: The proportion of the data to keep for training. This is calculated on a
      per-group basis, by taking the last measured timestep for each group (i.e., the last timestep
      that has a non-nan value on any measure). If neither `train_frac` nor `dt` is passed,
      `train_frac=.75` is used.
    :param dt: A datetime to use in dividing train/validation (the first datetime for validation),
      or a dictionary mapping group names to such datetimes.
    :return: Two TimeSeriesDatasets, one with data before the split, the other with data at/after
      the split.
    """
    # get split times:
    if dt is None:
        if train_frac is None:
            train_frac = .75
        assert 0 < train_frac < 1
        # for each group, find the last non-nan timestep, and take `train_frac` of that to find the
        # train/val split point:
        split_idx = np.array(
            [int(idx * train_frac) for idx in self._last_measured_idx()], dtype='int')
        _times = self.times(0)
        split_times = np.array([_times[i, t] for i, t in enumerate(split_idx)])
    else:
        if train_frac is not None:
            raise TypeError("Can pass only one of `train_frac`, `dt`.")
        if isinstance(dt, dict):
            split_times = np.array(
                [dt[group_name] for group_name in self.group_names], dtype='datetime64[ns]')
        else:
            if not isinstance(dt, np.datetime64):
                dt = np.datetime64(dt, self.dt_unit)
            split_times = np.full(shape=len(self.group_names), fill_value=dt)

    # val:
    val_dataset = self.with_new_start_times(split_times)

    # train:
    train_tensors = []
    for i, tens in enumerate(self.tensors):
        train = tens.clone()
        # nan-out any timesteps at/after each group's split time:
        train[np.where(self.times(i) >= split_times[:, None])] = float('nan')
        if i == 0:
            not_all_nan = (~torch.isnan(train)).sum((0, 2))
            last_good_idx = true1d_idx(not_all_nan).max()
            train = train[:, :(last_good_idx + 1), :]
        train_tensors.append(train)
    # TODO: replace padding nans for all but the first tensor?
    # TODO: reduce the width of tensors after the first based on the first tensor's width?
    train_dataset = self.with_new_tensors(*train_tensors)

    return train_dataset, val_dataset
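
# Usage sketches for the three ways of splitting (dates and group names are
# hypothetical):
#
#     # per-group fractional split:
#     train, val = dataset.train_val_split(train_frac=.80)
#
#     # one fixed split date for every group:
#     train, val = dataset.train_val_split(dt=np.datetime64('2020-06-01'))
#
#     # per-group split dates:
#     train, val = dataset.train_val_split(
#         dt={'store_01': np.datetime64('2020-05-01'),
#             'store_07': np.datetime64('2020-07-01')}
#     )
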
def _last_measured_idx(self) -> np.ndarray:
    """
    :return: For each group, the index of the last measurement in the first tensor, where a
      measurement is any non-nan value in at least one dimension.
    """
    tens, *_ = self.tensors
    any_measured_bool = ~np.isnan(tens.numpy()).all(2)
    last_measured_idx = np.array(
        [np.max(true1d_idx(any_measured_bool[g]), initial=0) for g in range(len(self.group_names))],
        dtype='int'
    )
    return last_measured_idx
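
# For instance, if a group's slice of the first tensor (times x measures) were
#     [[1.0, nan],
#      [nan, 2.0],
#      [nan, nan]]
# then timesteps 0 and 1 each have at least one measured value, so this helper
# would return index 1 for that group.
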