def fourier_model_mat(datetimes: np.ndarray,
                      K: int,
                      period: Union[np.timedelta64, str],
                      output_fmt: str = 'float64') -> np.ndarray:
    """
    :param datetimes: An array of datetimes.
    :param K: The expansion integer.
    :param period: Either a np.timedelta64, or one of {'weekly','yearly','daily'}.
    :param output_fmt: A numpy dtype, or 'dataframe' to output a dataframe.
    :return: A numpy array (or dataframe) with the expanded fourier series.
    """
    # parse period:
    name = 'fourier'
    if isinstance(period, str):
        name = period
        if period == 'weekly':
            period = np.timedelta64(7, 'D')
        elif period == 'yearly':
            period = np.timedelta64(int(365.25 * 24), 'h')
        elif period == 'daily':
            period = np.timedelta64(24, 'h')
        else:
            raise ValueError("Unrecognized `period`.")

    period_int = period.view('int64')
    dt_helper = DateTimeHelper(dt_unit=np.datetime_data(period)[0])
    time = dt_helper.validate_datetimes(datetimes).view('int64')

    output_dataframe = (output_fmt.lower() == 'dataframe')
    if output_dataframe:
        output_fmt = 'float64'

    # fourier matrix:
    out_shape = tuple(datetimes.shape) + (K * 2,)
    out = np.empty(out_shape, dtype=output_fmt)
    columns = []
    for idx in range(K):
        k = idx + 1
        for is_cos in range(2):
            val = 2. * np.pi * k * time / period_int
            out[..., idx * 2 + is_cos] = np.sin(val) if is_cos == 0 else np.cos(val)
            columns.append(f"{name}_K{k}_{'cos' if is_cos else 'sin'}")

    if output_dataframe:
        if len(out_shape) > 2:
            raise ValueError("Cannot output dataframe when input is 2+D array.")
        from pandas import DataFrame
        out = DataFrame(out, columns=columns)

    return out

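# Illustrative usage (hedged sketch, not part of the original source). Shows the column
# layout produced by `fourier_model_mat`: one sin and one cos column per harmonic k,
# named f"{period}_K{k}_{'sin'|'cos'}". The date range and K below are arbitrary examples.
def _example_fourier_design() -> 'DataFrame':
    dts = np.arange('2020-01-01', '2021-01-01', dtype='datetime64[D]')
    # K=2 harmonics of a yearly period -> columns:
    # yearly_K1_sin, yearly_K1_cos, yearly_K2_sin, yearly_K2_cos
    return fourier_model_mat(dts, K=2, period='yearly', output_fmt='dataframe')
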
def __init__(self,
             id: str,
             seasonal_period: int,
             season_duration: int = 1,
             decay: Union[bool, Tuple[float, float]] = False,
             dt_unit: Optional[str] = None,
             fixed: bool = False):
    """
    :param id: Unique name for this process.
    :param seasonal_period: The number of seasons (e.g. 7 for day_in_week).
    :param season_duration: The length of each season, default 1 time-step.
    :param decay: Optional (float,float) boundaries for decay (between 0 and 1). Analogous to dampening a trend --
      the state will revert to zero as we get further from the last observation. This can be useful if two
      processes are capturing the same seasonal pattern: one can be more flexible, but with decay have a tendency
      to revert to zero, while the other is less variable but extrapolates into the future.
    :param dt_unit: Currently supports {'Y', 'D', 'h', 'm', 's'}. 'W' is experimentally supported.
    :param fixed: If True, then the seasonality does not vary over time, and this amounts to one-hot-encoding the
      seasons. Default False.
    """
    self.seasonal_period = seasonal_period
    self.season_duration = season_duration
    self.fixed = fixed

    if dt_unit is None:
        # optional for some seasonal processes, but not for this one
        raise TypeError(f"Must pass `dt_unit` to {type(self).__name__}")
    self._dt_helper = DateTimeHelper(dt_unit=dt_unit)

    # state-elements:
    pad_n = len(str(seasonal_period))
    super().__init__(
        id=id,
        state_elements=[self.measured_name] + [zpad(i, pad_n) for i in range(1, seasonal_period)]
    )

    # transitions are placeholders, filled in w/batch
    for i, current in enumerate(self.state_elements):
        self._set_transition(from_element=current, to_element=current, value=0.)
        if i > 0:
            prev = self.state_elements[i - 1]
            self._set_transition(from_element=prev, to_element=current, value=0.)
            if i > 1:
                self._set_transition(from_element=prev, to_element=self.measured_name, value=0.)

    if decay:
        assert not isinstance(decay, bool), "decay should be floats of bounds (or False for no decay)"
        assert decay[0] > 0. and decay[1] <= 1.0
        self.decay = Bounded(*decay)
    else:
        self.decay = None

def __init__(self,
             id: str,
             seasonal_period: Union[int, float],
             K: Union[int, float],
             decay: Union[bool, Tuple[float, float]] = False,
             season_start: Optional[str] = None,
             dt_unit: Optional[str] = None):
    # season structure:
    self.seasonal_period = seasonal_period
    if isinstance(K, float):
        assert K.is_integer()
    self.K = int(K)

    self.decay = None
    if decay:
        assert decay[0] > 0. and decay[1] <= 1.0
        self.decay = Bounded(*decay)

    state_elements, list_of_trans_kwargs = self._setup(decay=decay)

    super().__init__(id=id, state_elements=state_elements)

    self._dt_helper = DateTimeHelper(dt_unit=dt_unit, start_datetime=season_start)

    for trans_kwargs in list_of_trans_kwargs:
        self._set_transition(**trans_kwargs)

def __init__(self,
             K: int,
             period: Union[np.timedelta64, str],
             dt_unit: str,
             num_outputs: int,
             start_datetime: Optional[np.datetime64] = None,
             bias: bool = False):
    self.K = K
    self.period = period
    self._dt_helper = DateTimeHelper(dt_unit=dt_unit, start_datetime=start_datetime)
    super().__init__(in_features=K * 2, out_features=num_outputs, bias=bias)

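# Illustrative sketch (not from the original source): the `in_features=K * 2` above
# suggests the surrounding class extends `torch.nn.Linear`, with inputs that line up
# with the sin/cos columns produced by `fourier_model_mat` for the same K. The helper
# below is hypothetical and assumes `fourier_model_mat` (defined earlier) is in scope.
def _example_fourier_linear_sketch(K: int = 2, num_outputs: int = 1) -> Tensor:
    dts = np.arange('2020-01-01', '2020-02-01', dtype='datetime64[D]')
    # design matrix with K*2 columns (one sin and one cos per harmonic):
    Xf = torch.as_tensor(fourier_model_mat(dts, K=K, period='yearly'), dtype=torch.float32)
    layer = torch.nn.Linear(in_features=K * 2, out_features=num_outputs, bias=False)
    return layer(Xf)  # shape: (len(dts), num_outputs)
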
class TimeSeriesDataset(NiceRepr, TensorDataset):
    """
    TimeSeriesDataset includes additional information about each of the Tensors' dimensions: the name for each group
    in the first dimension, the start (date)time (and optionally datetime-unit) for the second dimension, and the
    name of the measures for the third dimension.

    Note that unlike TensorDataset, indexing a TimeSeriesDataset returns another TimeSeriesDataset, not a tuple of
    tensors. So when using TimeSeriesDataset, use `TimeSeriesDataLoader` (or just use
    `DataLoader(collate_fn=TimeSeriesDataset.collate)`).
    """
    supported_dt_units = {'Y', 'D', 'h', 'm', 's'}
    _repr_attrs = ('sizes', 'measures')

    def __init__(self,
                 *tensors: Tensor,
                 group_names: Sequence[Any],
                 start_times: Union[np.ndarray, Sequence],
                 measures: Sequence[Sequence[str]],
                 dt_unit: Optional[str]):
        if not isinstance(group_names, np.ndarray):
            group_names = np.array(group_names)

        assert len(group_names) == len(start_times)
        assert len(tensors) == len(measures)

        for i, (tensor, tensor_measures) in enumerate(zip(tensors, measures)):
            if len(tensor.shape) < 3:
                raise ValueError(f"Tensor {i} has < 3 dimensions")
            if tensor.shape[0] != len(group_names):
                raise ValueError(f"Tensor {i}'s first dimension has length != {len(group_names)}.")
            if tensor.shape[2] != len(tensor_measures):
                raise ValueError(f"Tensor {i}'s 3rd dimension has length != {len(tensor_measures)}.")

        self.measures = tuple(tuple(m) for m in measures)
        self.all_measures = tuple(itertools.chain.from_iterable(self.measures))
        self.group_names = group_names
        self._dt_helper = DateTimeHelper(dt_unit=dt_unit, start_datetime=None)
        self.start_times = self._dt_helper.validate_datetimes(start_times)
        self.dt_unit = dt_unit

        super().__init__(*tensors)

    @property
    def sizes(self) -> Sequence:
        return [t.size() for t in self.tensors]

    # Subsetting ------------------------:
    def train_val_split(self,
                        train_frac: float = None,
                        dt: np.datetime64 = None) -> Tuple['TimeSeriesDataset', 'TimeSeriesDataset']:
        """
        :param train_frac: The proportion of the data to keep for training. This is calculated on a per-group basis,
          by taking the last observation for each group (i.e., the last observation that has a non-nan value on any
          measure). If neither `train_frac` nor `dt` are passed, `train_frac=.75` is used.
        :param dt: A datetime to use in dividing train/validation (first datetime for validation).
        :return: Two TimeSeriesDatasets, one with data before the split, the other with data >= the split.
        """
        # get split times:
        if dt is None:
            if train_frac is None:
                train_frac = .75
            assert 0 < train_frac < 1
            # for each group, find the last non-nan, take `frac` of that to find the train/val split point:
            split_idx = np.array([int(idx * train_frac) for idx in self._last_measured_idx()], dtype='int')
            _times = self.times(0)
            split_times = np.array([_times[g, idx] for g, idx in enumerate(split_idx)])
        else:
            if train_frac is not None:
                raise TypeError("Can pass only one of `train_frac`, `dt`.")
            if not isinstance(dt, np.datetime64):
                dt = np.datetime64(dt, self.dt_unit)
            split_times = np.full(shape=len(self.group_names), fill_value=dt)

        # val:
        val_dataset = self.with_new_start_times(split_times)

        # train:
        train_tensors = []
        for i, tens in enumerate(self.tensors):
            train = tens.data.clone()
            train[np.where(self.times(i) >= split_times[:, None])] = float('nan')
            not_all_nan = (~torch.isnan(train)).sum((0, 2))
            last_good_idx = true1d_idx(not_all_nan).max()
            train = train[:, :(last_good_idx + 1), :]
            train_tensors.append(train)
        train_dataset = self.with_new_tensors(*train_tensors)

        return train_dataset, val_dataset

    def with_new_start_times(self, start_times: Union[np.ndarray, Sequence]) -> 'TimeSeriesDataset':
        """
        Subset a TimeSeriesDataset so that some/all of the groups have later start times.

        :param start_times: An array/list of new datetimes.
        :return: A new TimeSeriesDataset.
        """
        new_tensors = []
        for i, tens in enumerate(self.tensors):
            times = self.times(i)
            new_tens = []
            for g, (new_time, old_times) in enumerate(zip(start_times, times)):
                if (old_times <= new_time).all():
                    raise ValueError(f"{new_time} is later than all the times for group {self.group_names[g]}")
                elif (old_times > new_time).all():
                    raise ValueError(f"{new_time} is earlier than all the times for group {self.group_names[g]}")
                new_tens.append(tens[g, true1d_idx(old_times >= new_time), :].unsqueeze(0))
            new_tens = ragged_cat(new_tens, ragged_dim=1, cat_dim=0)
            new_tensors.append(new_tens)
        return type(self)(
            *new_tensors,
            group_names=self.group_names,
            start_times=start_times,
            measures=self.measures,
            dt_unit=self.dt_unit
        )

    def get_groups(self, groups: Sequence[Any]) -> 'TimeSeriesDataset':
        """
        Get the subset of the batch corresponding to groups. Note that the ordering in the output will match the
        original ordering (not that of `groups`), and that duplicates will be dropped.
        """
        group_idx = true1d_idx(np.isin(self.group_names, groups))
        return self[group_idx]

    def split_measures(self, *measure_groups) -> 'TimeSeriesDataset':
        """
        Take a dataset with one tensor, split it into a dataset with multiple tensors.

        :param measure_groups: Each argument should be a list of measure-names, or an indexer (i.e. list of ints or
          a slice).
        :return: A TimeSeriesDataset, now with multiple tensors for the measure-groups.
        """
        if len(self.measures) > 1:
            raise RuntimeError(
                f"Can only split measures if there's only one measure-group, but instead:\n{self.measures}"
            )
        self_tensor = self.tensors[0]
        self_measures = self.measures[0]

        idxs = []
        for measure_group in measure_groups:
            if isinstance(measure_group, slice) or isinstance(measure_group[0], int):
                idxs.append(measure_group)
            else:
                idxs.append([self_measures.index(m) for m in measure_group])

        self_measures = np.array(self_measures)
        return type(self)(
            *(self_tensor[:, :, idx] for idx in idxs),
            start_times=self.start_times,
            group_names=self.group_names,
            measures=[tuple(self_measures[idx]) for idx in idxs],
            dt_unit=self.dt_unit
        )

    def __getitem__(self, item: Union[int, Sequence, slice]) -> 'TimeSeriesDataset':
        if isinstance(item, int):
            item = [item]
        return type(self)(
            *super(TimeSeriesDataset, self).__getitem__(item),
            group_names=self.group_names[item],
            start_times=self.start_times[item],
            measures=self.measures,
            dt_unit=self.dt_unit
        )

    # Creation/Transformation ------------------------:
    @classmethod
    def collate(cls, batch: Sequence['TimeSeriesDataset']) -> 'TimeSeriesDataset':
        to_concat = {
            'tensors': [batch[0].tensors],
            'group_names': [batch[0].group_names],
            'start_times': [batch[0].start_times]
        }
        fixed = {'dt_unit': batch[0].dt_unit, 'measures': batch[0].measures}
        for i, ts_dataset in enumerate(batch[1:], 1):
            for attr, appendlist in to_concat.items():
                to_concat[attr].append(getattr(ts_dataset, attr))
            for attr, required_val in fixed.items():
                new_val = getattr(ts_dataset, attr)
                if new_val != required_val:
                    raise ValueError(f"Element {i} has `{attr}` = {new_val}, but for element 0 it's {required_val}.")

        tensors = tuple(ragged_cat(t, ragged_dim=1) for t in zip(*to_concat['tensors']))

        return cls(
            *tensors,
            group_names=np.concatenate(to_concat['group_names']),
            start_times=np.concatenate(to_concat['start_times']),
            measures=fixed['measures'],
            dt_unit=fixed['dt_unit']
        )

    def to_dataframe(self,
                     group_colname: str = 'group',
                     time_colname: str = 'time') -> 'DataFrame':
        return self.tensor_to_dataframe(
            tensor=torch.cat(self.tensors, 2),
            times=self.times(),
            group_names=self.group_names,
            group_colname=group_colname,
            time_colname=time_colname,
            measures=self.all_measures
        )

    @staticmethod
    def tensor_to_dataframe(tensor: Tensor,
                            times: np.ndarray,
                            group_names: Sequence,
                            group_colname: str,
                            time_colname: str,
                            measures: Sequence[str]) -> 'DataFrame':
        from pandas import DataFrame, concat

        tensor = tensor.data.numpy()
        assert tensor.shape[0] == len(group_names)
        assert tensor.shape[0] == len(times)
        assert tensor.shape[1] <= times.shape[1]
        assert tensor.shape[2] == len(measures)

        dfs = []
        for g, group_name in enumerate(group_names):
            # get values, don't store trailing nans:
            values = tensor[g]
            all_nan_per_row = np.min(np.isnan(values), axis=1)
            if all_nan_per_row.all():
                warn(f"Group {group_name} has only missing values.")
                continue
            end_idx = true1d_idx(~all_nan_per_row).max() + 1

            # convert to dataframe:
            df = DataFrame(data=values[:end_idx, :], columns=measures)
            df[group_colname] = group_name
            df[time_colname] = np.nan
            df[time_colname] = times[g, 0:len(df.index)]
            dfs.append(df)

        return concat(dfs)

    @classmethod
    def from_dataframe(cls,
                       dataframe: 'DataFrame',
                       group_colname: str,
                       time_colname: str,
                       dt_unit: Optional[str],
                       measure_colnames: Optional[Sequence[str]] = None,
                       X_colnames: Optional[Sequence[str]] = None,
                       y_colnames: Optional[Sequence[str]] = None) -> 'TimeSeriesDataset':
        if measure_colnames is None:
            if X_colnames is None or y_colnames is None:
                raise ValueError("Must pass either `measure_colnames` or `X_colnames` & `y_colnames`")
            measure_colnames = list(y_colnames) + list(X_colnames)
        else:
            if X_colnames is not None or y_colnames is not None:
                raise ValueError("If passing `measure_colnames` do not pass `X_colnames` or `y_colnames`.")

        assert isinstance(group_colname, str)
        assert isinstance(time_colname, str)
        assert len(measure_colnames) == len(set(measure_colnames))

        # sort by time:
        dataframe = dataframe.sort_values(time_colname)

        for measure_colname in measure_colnames:
            if measure_colname not in dataframe.columns:
                raise ValueError(f"'{measure_colname}' not in dataframe.columns:\n{dataframe.columns}")

        # first pass for info:
        arrays, time_idxs, group_names, start_times = [], [], [], []
        for g, df in dataframe.groupby(group_colname, sort=True):
            # group-names:
            group_names.append(g)

            # times:
            times = df[time_colname].values
            assert len(times) == len(set(times)), f"Group {g} has duplicate times"
            min_time = times[0]
            start_times.append(min_time)
            if dt_unit is None:
                time_idx = (times - min_time).astype('int64')
            else:
                time_idx = (times - min_time).astype(f'timedelta64[{dt_unit}]').view('int64')
            time_idxs.append(time_idx)

            # values:
            arrays.append(df.loc[:, measure_colnames].values)

        # second pass organizes into tensor
        time_len = max(time_idx[-1] + 1 for time_idx in time_idxs)
        tens = torch.empty((len(arrays), time_len, len(measure_colnames)))
        tens[:] = np.nan
        for i, (array, time_idx) in enumerate(zip(arrays, time_idxs)):
            tens[i, time_idx, :] = Tensor(array)

        dataset = cls(
            tens,
            group_names=group_names,
            start_times=start_times,
            measures=[measure_colnames],
            dt_unit=dt_unit
        )

        if X_colnames is not None:
            dataset = dataset.split_measures(y_colnames, X_colnames)
            y, X = dataset.tensors
            # don't use nan-padding on the predictor tensor:
            for i, time_idx in enumerate(time_idxs):
                X[i, time_idx.max():, :] = 0.0

        return dataset

    def with_new_tensors(self, *tensors: Tensor) -> 'TimeSeriesDataset':
        """
        Create a new Batch with a different Tensor, but all other attributes the same.
        """
        return type(self)(
            *tensors,
            group_names=self.group_names,
            start_times=self.start_times,
            measures=self.measures,
            dt_unit=self.dt_unit
        )

    # Util/Private ------------------------
    def times(self, which: Optional[int] = None) -> np.ndarray:
        """
        A 2D array of datetimes (or integers if dt_unit is None) for this dataset.

        :param which: If this dataset has multiple tensors with different numbers of timesteps, which tensor should
          be used for constructing the `times` array? Defaults to the one with the most timesteps.
        :return: A 2D numpy array of datetimes (or integers if dt_unit is None).
        """
        if which is None:
            num_timesteps = max(tensor.shape[1] for tensor in self.tensors)
        else:
            num_timesteps = self.tensors[which].shape[1]
        return self._dt_helper.make_grid(self.start_times, num_timesteps)

    def datetimes(self) -> np.ndarray:
        return self.times()

    @property
    def start_datetimes(self) -> np.ndarray:
        return self.start_times

    def last_measured_times(self) -> np.ndarray:
        """
        :return: The datetimes (or integers if dt_unit is None) for the last measurement in the first tensor, where
          a measurement is any non-nan value in at least one dimension.
        """
        times = self.times(which=0)
        last_measured_idx = self._last_measured_idx()
        return np.array(
            [t[idx] for t, idx in zip(times, last_measured_idx)],
            dtype=f'datetime64[{self.dt_unit}]'
        )

    def _last_measured_idx(self) -> np.ndarray:
        """
        :return: The indices of the last measurement in the first tensor, where a measurement is any non-nan value
          in at least one dimension.
        """
        tens, *_ = self.tensors
        any_measured_bool = ~np.isnan(tens.numpy()).all(2)
        last_measured_idx = np.array(
            [np.max(true1d_idx(any_measured_bool[g]), initial=0) for g in range(len(self.group_names))],
            dtype='int'
        )
        return last_measured_idx

class Season(Process):
    """
    Process representing discrete seasons.
    """
    measured_name = 'measured'

    def __init__(self,
                 id: str,
                 seasonal_period: int,
                 season_duration: int = 1,
                 decay: Union[bool, Tuple[float, float]] = False,
                 season_start: Optional[str] = None,
                 dt_unit: Optional[str] = None,
                 fixed: bool = False):
        """
        :param id: Unique name for this process.
        :param seasonal_period: The number of seasons (e.g. 7 for day_in_week).
        :param season_duration: The length of each season, default 1 time-step.
        :param decay: Optional (float,float) boundaries for decay (between 0 and 1). Analogous to dampening a trend --
          the state will revert to zero as we get further from the last observation. This can be useful if two
          processes are capturing the same seasonal pattern: one can be more flexible, but with decay have a tendency
          to revert to zero, while the other is less variable but extrapolates into the future.
        :param season_start: A string that can be parsed into a datetime by `numpy.datetime64`. This is when the
          season starts, which is useful to specify if season boundaries are meaningful. It is important to specify
          if different groups in your dataset start on different dates; when calling the kalman-filter you'll pass an
          array of `start_datetimes` for each group in the input, and this will be used to align the seasons for each
          group.
        :param dt_unit: Currently supports {'Y', 'D', 'h', 'm', 's'}. 'W' is experimentally supported.
        :param fixed: If True, then the seasonality does not vary over time, and this amounts to one-hot-encoding the
          seasons. Default False.
        """
        self.seasonal_period = seasonal_period
        self.season_duration = season_duration
        self.fixed = fixed

        if dt_unit is None:
            # optional for some seasonal processes, but not for this one
            raise TypeError(f"Must pass `dt_unit` to {type(self).__name__}")
        self._dt_helper = DateTimeHelper(dt_unit=dt_unit, start_datetime=season_start)

        # state-elements:
        pad_n = len(str(seasonal_period))
        super().__init__(
            id=id,
            state_elements=[self.measured_name] + [zpad(i, pad_n) for i in range(1, seasonal_period)]
        )

        # transitions are placeholders, filled in w/batch
        for i, current in enumerate(self.state_elements):
            self._set_transition(from_element=current, to_element=current, value=0.)
            if i > 0:
                prev = self.state_elements[i - 1]
                self._set_transition(from_element=prev, to_element=current, value=0.)
                if i > 1:
                    self._set_transition(from_element=prev, to_element=self.measured_name, value=0.)

        if decay:
            assert not isinstance(decay, bool), "decay should be floats of bounds (or False for no decay)"
            assert decay[0] > 0. and decay[1] <= 1.0
            self.decay = Bounded(*decay)
        else:
            self.decay = None

    def add_measure(self, measure: str) -> 'Season':
        self._set_measure(measure=measure, state_element='measured', value=1.0)
        return self

    def param_dict(self) -> ParameterDict:
        p = ParameterDict()
        if self.decay is not None:
            p['decay'] = self.decay.parameter
        return p

    @property
    def dynamic_state_elements(self) -> Sequence[str]:
        return [] if self.fixed else [self.measured_name]

    def for_batch(self,
                  num_groups: int,
                  num_timesteps: int,
                  start_datetimes: Optional[np.ndarray] = None):
        if start_datetimes is not None:
            if len(start_datetimes) != num_groups or len(start_datetimes.shape) != 1:
                raise ValueError(f"Expected `start_datetimes` to be 1D array of length {num_groups}.")

        for_batch = super().for_batch(num_groups=num_groups, num_timesteps=num_timesteps)

        if start_datetimes is None:
            if self._dt_helper.start_datetime:
                raise TypeError("Missing argument `start_datetimes`.")
            start_datetimes = np.zeros(num_groups)
        delta = self._dt_helper.make_delta_grid(start_datetimes, num_timesteps)

        in_transition = (delta % self.season_duration) == (self.season_duration - 1)

        transitions = {
            'to_next_state': torch.from_numpy(in_transition.astype('float32')),
            'from_measured_to_measured': torch.from_numpy(np.where(in_transition, -1., 1.).astype('float32'))
        }
        transitions['to_self'] = 1 - transitions['to_next_state']
        transitions['to_measured'] = -transitions['to_next_state']

        for k in transitions.keys():
            transitions[k] = split_flat(transitions[k], dim=1, clone=True)
            if self.decay is not None:
                decay_value = self.decay.get_value()
                transitions[k] = [x * decay_value for x in transitions[k]]

        # this is convoluted, but the idea is to manipulate the transitions so that we use one less degree of freedom
        # than the number of seasons, by having the 'measured' state be equal to -sum(all others)
        for i in range(1, len(self.state_elements)):
            current = self.state_elements[i]
            prev = self.state_elements[i - 1]

            if prev == self.measured_name:  # measured requires special-case
                to_measured = transitions['from_measured_to_measured']
            else:
                to_measured = transitions['to_measured']

            for_batch._adjust_transition(
                from_element=prev,
                to_element=current,
                adjustment=transitions['to_next_state']
            )
            for_batch._adjust_transition(
                from_element=prev,
                to_element=self.measured_name,
                adjustment=to_measured
            )

            # from state to itself:
            for_batch._adjust_transition(
                from_element=current,
                to_element=current,
                adjustment=transitions['to_self']
            )

        return for_batch

    def initial_state_means_for_batch(self,
                                      parameters: Parameter,
                                      num_groups: int,
                                      start_datetimes: Optional[np.ndarray] = None) -> Tensor:
        if start_datetimes is None:
            start_datetimes = np.zeros(num_groups)
        delta = self._dt_helper.make_delta_grid(start_datetimes, num_timesteps=1).squeeze(1)
        season_shift = (np.floor(delta / self.season_duration) % self.seasonal_period).astype('int')
        means = [torch.cat([parameters[-shift:], parameters[:-shift]]) for shift in season_shift]
        return torch.stack(means, 0)

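# Illustrative usage (hedged sketch, not part of the original source). A day-in-week
# seasonal process on a daily series, attached to a measure named 'y'; all argument
# values are arbitrary examples. `season_start` pins the season boundaries to a Monday
# so that groups whose data start on different dates stay aligned.
def _example_day_in_week_season() -> 'Season':
    return Season(
        id='day_in_week',
        seasonal_period=7,
        season_duration=1,
        decay=(.95, 1.00),
        season_start='2007-01-01',  # a Monday
        dt_unit='D'
    ).add_measure('y')
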
def to_dataframe(self,
                 dataset: Union[TimeSeriesDataset, dict],
                 type: str = 'predictions',
                 group_colname: str = 'group',
                 time_colname: str = 'time',
                 multi: float = 1.96) -> 'DataFrame':
    """
    :param dataset: Either a TimeSeriesDataset, or a dictionary with 'start_times', 'group_names', & 'dt_unit'.
    :param type: Either 'predictions' or 'components'.
    :param group_colname: Column-name for 'group'.
    :param time_colname: Column-name for 'time'.
    :param multi: Multiplier on std-dev for lower/upper CIs. Default 1.96.
    :return: A pandas DataFrame with group, 'time', 'measure', 'mean', 'lower', 'upper'. For type='components'
      additionally includes: 'process' and 'state_element'.
    """
    from pandas import concat

    if isinstance(dataset, TimeSeriesDataset):
        batch_info = {
            'start_times': dataset.start_times,
            'group_names': dataset.group_names,
            'named_tensors': {},
            'dt_unit': dataset.dt_unit
        }
        for measure_group, tensor in zip(dataset.measures, dataset.tensors):
            for i, measure in enumerate(measure_group):
                if measure in self.design.measures:
                    batch_info['named_tensors'][measure] = tensor[..., [i]]
        missing = set(self.design.measures) - set(dataset.all_measures)
        if missing:
            raise ValueError(
                f"Some measures in the design aren't in the dataset.\n"
                f"Design: {missing}\nDataset: {dataset.all_measures}"
            )
    elif isinstance(dataset, dict):
        batch_info = dataset
    else:
        raise TypeError(
            "Expected `dataset` to be a TimeSeriesDataset, or a dictionary with 'start_times' and 'group_names'."
        )

    dt_helper = DateTimeHelper(dt_unit=batch_info['dt_unit'])

    def _tensor_to_df(tens, measures):
        times = dt_helper.make_grid(batch_info['start_times'], tens.shape[1])
        return TimeSeriesDataset.tensor_to_dataframe(
            tensor=tens,
            times=times,
            group_names=batch_info['group_names'],
            group_colname=group_colname,
            time_colname=time_colname,
            measures=measures
        )

    assert group_colname not in {'mean', 'lower', 'upper'}
    assert time_colname not in {'mean', 'lower', 'upper'}

    out = []
    if type == 'predictions':
        stds = torch.diagonal(self.prediction_uncertainty, dim1=-1, dim2=-2).sqrt()
        for i, measure in enumerate(self.design.measures):
            # predicted:
            df_pred = _tensor_to_df(self.predictions[..., [i]], measures=['mean'])
            df_std = _tensor_to_df(stds[..., [i]], measures=['_std'])
            df = df_pred.merge(df_std, on=[group_colname, time_colname])
            df['lower'] = df['mean'] - multi * df['_std']
            df['upper'] = df['mean'] + multi * df.pop('_std')

            # actual:
            orig_tensor = batch_info.get('named_tensors', {}).get(measure, None)
            if orig_tensor is not None:
                df_actual = _tensor_to_df(orig_tensor, measures=['actual'])
                df = df.merge(df_actual, on=[group_colname, time_colname], how='left')

            out.append(df.assign(measure=measure))

    elif type == 'components':
        # components:
        for (measure, process, state_element), (m, std) in self._components().items():
            df = _tensor_to_df(torch.stack([m, std], 2), measures=['mean', '_std'])
            df['lower'] = df['mean'] - multi * df['_std']
            df['upper'] = df['mean'] + multi * df.pop('_std')
            df['process'], df['state_element'], df['measure'] = process, state_element, measure
            out.append(df)

        # residuals:
        named_tensors = batch_info.get('named_tensors', {})
        for i, measure in enumerate(self.design.measures):
            orig_tensor = named_tensors.get(measure)
            predictions = self.predictions[..., [i]]
            if orig_tensor.shape[1] < predictions.shape[1]:
                orig_aligned = predictions.data.clone()
                orig_aligned[:] = float('nan')
                orig_aligned[:, 0:orig_tensor.shape[1], :] = orig_tensor
            else:
                orig_aligned = orig_tensor[:, 0:predictions.shape[1], :]

            df = _tensor_to_df(predictions - orig_aligned, ['mean'])
            df['process'], df['state_element'], df['measure'] = 'residuals', 'residuals', measure
            out.append(df)

    else:
        raise ValueError("Expected `type` to be 'predictions' or 'components'.")

    return concat(out, sort=True)

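# Illustrative usage (hedged sketch). `pred` stands in for an instance of the class this
# method belongs to -- e.g. the output of a kalman-filter forward pass over `dataset`;
# that name does not come from this file.
def _example_predictions_to_dataframe(pred, dataset: TimeSeriesDataset) -> 'DataFrame':
    # type='predictions' yields group/time/measure with mean, lower, upper (and 'actual'
    # when the measure is present in `dataset`); type='components' breaks the mean out
    # further by process and state_element.
    return pred.to_dataframe(dataset, type='predictions', multi=1.96)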