Example #1
    def __init__(self,
                 *tensors: Tensor,
                 group_names: Sequence[Any],
                 start_times: Union[np.ndarray, Sequence],
                 measures: Sequence[Sequence[str]],
                 dt_unit: Optional[str]):

        if not isinstance(group_names, np.ndarray):
            group_names = np.array(group_names)

        assert len(group_names) == len(start_times)
        assert len(tensors) == len(measures)

        for i, (tensor, tensor_measures) in enumerate(zip(tensors, measures)):
            if len(tensor.shape) < 3:
                raise ValueError(f"Tensor {i} has < 3 dimensions")
            if tensor.shape[0] != len(group_names):
                raise ValueError(f"Tensor {i}'s first dimension has length != {len(group_names)}.")
            if tensor.shape[2] != len(tensor_measures):
                raise ValueError(f"Tensor {i}'s 3rd dimension has length != len({tensor_measures}).")

        self.measures = tuple(tuple(m) for m in measures)
        self.all_measures = tuple(itertools.chain.from_iterable(self.measures))
        self.group_names = group_names
        self._dt_helper = DateTimeHelper(dt_unit=dt_unit, start_datetime=None)
        self.start_times = self._dt_helper.validate_datetimes(start_times)
        self.dt_unit = dt_unit
        super().__init__(*tensors)
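
A minimal usage sketch for this constructor (this `__init__` belongs to `TimeSeriesDataset`, shown in full in Example #6; the data below is illustrative, and it assumes `DateTimeHelper` accepts daily datetimes with `dt_unit='D'`):

import numpy as np
import torch

# Hypothetical data: 5 groups, 20 daily timesteps, 2 measures -> one (5, 20, 2) tensor.
dataset = TimeSeriesDataset(
    torch.randn(5, 20, 2),
    group_names=['a', 'b', 'c', 'd', 'e'],
    start_times=np.array(['2020-01-06'] * 5, dtype='datetime64[D]'),
    measures=[('y1', 'y2')],
    dt_unit='D'
)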
Example #2
def fourier_model_mat(datetimes: np.ndarray,
                      K: int,
                      period: Union[np.timedelta64, str],
                      output_fmt: str = 'float64') -> np.ndarray:
    """
    :param datetimes: An array of datetimes.
    :param K: The expansion integer.
    :param period: Either a np.timedelta64, or one of {'weekly','yearly','daily'}
    :param output_fmt: A numpy dtype, or 'dataframe' to output a dataframe.
    :return: A numpy array (or dataframe) with the expanded fourier series.
    """
    # parse period:
    name = 'fourier'
    if isinstance(period, str):
        name = period
        if period == 'weekly':
            period = np.timedelta64(7, 'D')
        elif period == 'yearly':
            period = np.timedelta64(int(365.25 * 24), 'h')
        elif period == 'daily':
            period = np.timedelta64(24, 'h')
        else:
            raise ValueError("Unrecognized `period`.")

    period_int = period.view('int64')
    dt_helper = DateTimeHelper(dt_unit=np.datetime_data(period)[0])
    time = dt_helper.validate_datetimes(datetimes).view('int64')

    output_dataframe = (output_fmt.lower() == 'dataframe')
    if output_dataframe:
        output_fmt = 'float64'

    # fourier matrix:
    out_shape = tuple(datetimes.shape) + (K * 2, )
    out = np.empty(out_shape, dtype=output_fmt)
    columns = []
    for idx in range(K):
        k = idx + 1
        for is_cos in range(2):
            val = 2. * np.pi * k * time / period_int
            out[...,
                idx * 2 + is_cos] = np.sin(val) if is_cos == 0 else np.cos(val)
            columns.append(f"{name}_K{k}_{'cos' if is_cos else 'sin'}")

    if output_dataframe:
        if len(out_shape) > 2:
            raise ValueError(
                "Cannot output dataframe when input is 2+D array.")
        from pandas import DataFrame
        out = DataFrame(out, columns=columns)

    return out
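
A quick sketch of calling this function (illustrative values; it assumes `DateTimeHelper.validate_datetimes` accepts daily datetimes): with `K=2` and `period='weekly'`, the output has four columns named weekly_K1_sin, weekly_K1_cos, weekly_K2_sin, and weekly_K2_cos.

import numpy as np

# Two weeks of daily datetimes -> a (14, 4) fourier design matrix.
dts = np.arange('2020-01-06', '2020-01-20', dtype='datetime64[D]')
Xf = fourier_model_mat(dts, K=2, period='weekly', output_fmt='dataframe')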
Example #3
    def __init__(self,
                 id: str,
                 seasonal_period: int,
                 season_duration: int = 1,
                 decay: Union[bool, Tuple[float, float]] = False,
                 dt_unit: Optional[str] = None,
                 fixed: bool = False):
        """
        :param id: Unique name for this process
        :param seasonal_period: The number of seasons (e.g. 7 for day_in_week).
        :param season_duration: The length of each season, default 1 time-step.
        :param decay: Optional (float,float) boundaries for decay (between 0 and 1). Analogous to dampening a trend --
        the state will revert to zero as we get further from the last observation. This can be useful if two processes
        are capturing the same seasonal pattern: one can be more flexible, but with decay have a tendency to revert to
        zero, while the other is less variable but extrapolates into the future.
        :param dt_unit: Currently supports {'Y', 'D', 'h', 'm', 's'}. 'W' is experimentally supported.
        :param fixed: If True, then the seasonality does not vary over time, and this amounts to one-hot-encoding the
        seasons. Default False.
        """

        #
        self.seasonal_period = seasonal_period
        self.season_duration = season_duration
        self.fixed = fixed

        if dt_unit is None:
            # optional for some seasonal processes, but not for this one
            raise TypeError(f"Must pass `dt_unit` to {type(self).__name__}")
        self._dt_helper = DateTimeHelper(dt_unit=dt_unit)

        # state-elements:
        pad_n = len(str(seasonal_period))
        super().__init__(
            id=id,
            state_elements=[self.measured_name] + [zpad(i, pad_n) for i in range(1, seasonal_period)]
        )

        # transitions are placeholders, filled in w/batch
        for i, current in enumerate(self.state_elements):
            self._set_transition(from_element=current, to_element=current, value=0.)
            if i > 0:
                prev = self.state_elements[i - 1]
                self._set_transition(from_element=prev, to_element=current, value=0.)
                if i > 1:
                    self._set_transition(from_element=prev, to_element=self.measured_name, value=0.)

        if decay:
            assert not isinstance(decay, bool), "decay should be a tuple of float bounds (or False for no decay)"
            assert decay[0] > 0. and decay[1] <= 1.0
            self.decay = Bounded(*decay)
        else:
            self.decay = None
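
A hedged usage sketch, assuming this `__init__` belongs to the discrete `Season` process (Example #7 shows a fuller variant with `season_start`):

# Hypothetical: 7 seasons of 1 day each; with decay, the seasonal state reverts
# toward zero as forecasts extend past the last observation.
season = Season(id='day_in_week', seasonal_period=7, season_duration=1,
                decay=(0.95, 1.0), dt_unit='D')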
Example #4
    def __init__(self,
                 id: str,
                 seasonal_period: Union[int, float],
                 K: Union[int, float],
                 decay: Union[bool, Tuple[float, float]] = False,
                 season_start: Optional[str] = None,
                 dt_unit: Optional[str] = None):

        # season structure:
        self.seasonal_period = seasonal_period
        if isinstance(K, float):
            assert K.is_integer()
        self.K = int(K)

        self.decay = None
        if decay:
            assert not isinstance(decay, bool), "decay should be a tuple of float bounds (or False for no decay)"
            assert decay[0] > 0. and decay[1] <= 1.0
            self.decay = Bounded(*decay)

        state_elements, list_of_trans_kwargs = self._setup(decay=decay)

        super().__init__(id=id, state_elements=state_elements)

        self._dt_helper = DateTimeHelper(dt_unit=dt_unit,
                                         start_datetime=season_start)

        for trans_kwargs in list_of_trans_kwargs:
            self._set_transition(**trans_kwargs)
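
The snippet omits the class statement, so the name below is hypothetical; a fourier-based season process with this signature might be configured for a yearly pattern on daily data like so:

# `FourierSeason` is a hypothetical name for the class this __init__ belongs to.
# K=2 yields 2 sin/cos pairs; seasonal_period is in units of dt_unit (days here).
fourier_season = FourierSeason(id='yearly', seasonal_period=365.25, K=2,
                               dt_unit='D', season_start='2019-01-01')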
Example #5
    def __init__(self,
                 K: int,
                 period: Union[np.timedelta64, str],
                 dt_unit: str,
                 num_outputs: int,
                 start_datetime: Optional[np.datetime64] = None,
                 bias: bool = False):
        self.K = K
        self.period = period
        self._dt_helper = DateTimeHelper(dt_unit=dt_unit, start_datetime=start_datetime)
        super().__init__(in_features=K * 2, out_features=num_outputs, bias=bias)
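
The `in_features=K * 2` ties this layer to the 2K sin/cos columns produced by `fourier_model_mat` (Example #2). A sketch, using the hypothetical class name `FourierSeasonNN` since the snippet omits the class statement, and assuming the inherited `nn.Linear.forward` is used:

import numpy as np
import torch

dts = np.arange('2020-01-06', '2020-01-13', dtype='datetime64[D]')
feats = torch.as_tensor(fourier_model_mat(dts, K=3, period='weekly', output_fmt='float32'))
layer = FourierSeasonNN(K=3, period='weekly', dt_unit='D', num_outputs=1)
out = layer(feats)  # (7, 6) fourier features -> (7, 1) output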
Example #6
class TimeSeriesDataset(NiceRepr, TensorDataset):
    """
    TimeSeriesDataset includes additional information about each of the Tensors' dimensions: the name for each group in
    the first dimension, the start (date)time (and optionally datetime-unit) for the second dimension, and the name of
    the measures for the third dimension.

    Note that unlike TensorDataset, indexing a TimeSeriesDataset returns another TimeSeriesDataset, not a tuple of
    tensors. So when using TimeSeriesDataset, use `TimeSeriesDataLoader` (or just use
    `DataLoader(collate_fn=TimeSeriesDataset.collate)`).
    """
    supported_dt_units = {'Y', 'D', 'h', 'm', 's'}
    _repr_attrs = ('sizes', 'measures')

    def __init__(self,
                 *tensors: Tensor,
                 group_names: Sequence[Any],
                 start_times: Union[np.ndarray, Sequence],
                 measures: Sequence[Sequence[str]],
                 dt_unit: Optional[str]):

        if not isinstance(group_names, np.ndarray):
            group_names = np.array(group_names)

        assert len(group_names) == len(start_times)
        assert len(tensors) == len(measures)

        for i, (tensor, tensor_measures) in enumerate(zip(tensors, measures)):
            if len(tensor.shape) < 3:
                raise ValueError(f"Tensor {i} has < 3 dimensions")
            if tensor.shape[0] != len(group_names):
                raise ValueError(f"Tensor {i}'s first dimension has length != {len(group_names)}.")
            if tensor.shape[2] != len(tensor_measures):
                raise ValueError(f"Tensor {i}'s 3rd dimension has length != len({tensor_measures}).")

        self.measures = tuple(tuple(m) for m in measures)
        self.all_measures = tuple(itertools.chain.from_iterable(self.measures))
        self.group_names = group_names
        self._dt_helper = DateTimeHelper(dt_unit=dt_unit, start_datetime=None)
        self.start_times = self._dt_helper.validate_datetimes(start_times)
        self.dt_unit = dt_unit
        super().__init__(*tensors)

    @property
    def sizes(self) -> Sequence:
        return [t.size() for t in self.tensors]

    # Subsetting ------------------------:
    def train_val_split(self,
                        train_frac: Optional[float] = None,
                        dt: Optional[np.datetime64] = None) -> Tuple['TimeSeriesDataset', 'TimeSeriesDataset']:
        """
        :param train_frac: The proportion of the data to keep for training. This is calculated on a per-group basis, by
        taking the last observation for each group (i.e., the last observation that has a non-nan value on any
        measure). If neither `train_frac` nor `dt` are passed, `train_frac=.75` is used.
        :param dt: A datetime to use in dividing train/validation (first datetime for validation).
        :return: Two TimeSeriesDatasets, one with data before the split, the other with data at or after the split.
        """

        # get split times:
        if dt is None:
            if train_frac is None:
                train_frac = .75
            assert 0 < train_frac < 1
            # for each group, find the last non-nan, take `frac` of that to find the train/val split point:
            split_idx = np.array([int(idx * train_frac) for idx in self._last_measured_idx()], dtype='int')
            split_times = np.take(self.times(0), split_idx)
        else:
            if train_frac is not None:
                raise TypeError("Can pass only one of `train_frac`, `dt`.")
            if not isinstance(dt, np.datetime64):
                dt = np.datetime64(dt, self.dt_unit)
            split_times = np.full(shape=len(self.group_names), fill_value=dt)

        # val:
        val_dataset = self.with_new_start_times(split_times)

        # train:
        train_tensors = []
        for i, tens in enumerate(self.tensors):
            train = tens.data.clone()
            train[np.where(self.times(i) >= split_times[:, None])] = float('nan')
            not_all_nan = (~torch.isnan(train)).sum((0, 2))
            last_good_idx = true1d_idx(not_all_nan).max()
            train = train[:, :(last_good_idx + 1), :]
            train_tensors.append(train)
        train_dataset = self.with_new_tensors(*train_tensors)

        return train_dataset, val_dataset

    def with_new_start_times(self, start_times: Union[np.ndarray, Sequence]) -> 'TimeSeriesDataset':
        """
        Subset a TimeSeriesDataset so that some/all of the groups have later start times.

        :param start_times: An array/list of new datetimes.
        :return: A new TimeSeriesDataset.
        """
        new_tensors = []
        for i, tens in enumerate(self.tensors):
            times = self.times(i)
            new_tens = []
            for g, (new_time, old_times) in enumerate(zip(start_times, times)):
                if (old_times <= new_time).all():
                    raise ValueError(f"{new_time} is later than all the times for group {self.group_names[g]}")
                elif (old_times > new_time).all():
                    raise ValueError(f"{new_time} is earlier than all the times for group {self.group_names[g]}")
                new_tens.append(tens[g, true1d_idx(old_times >= new_time), :].unsqueeze(0))
            new_tens = ragged_cat(new_tens, ragged_dim=1, cat_dim=0)
            new_tensors.append(new_tens)
        return type(self)(
            *new_tensors,
            group_names=self.group_names,
            start_times=start_times,
            measures=self.measures,
            dt_unit=self.dt_unit
        )

    def get_groups(self, groups: Sequence[Any]) -> 'TimeSeriesDataset':
        """
        Get the subset of the batch corresponding to groups. Note that the ordering in the output will match the
        original ordering (not that of `groups`), and that duplicates will be dropped.
        """
        group_idx = true1d_idx(np.isin(self.group_names, groups))
        return self[group_idx]

    def split_measures(self, *measure_groups) -> 'TimeSeriesDataset':
        """
        Take a dataset with one tensor, split it into a dataset with multiple tensors.

        :param measure_groups: Each argument should be a list of measure-names, or an indexer (i.e. a list of ints or
        a slice).
        :return: A TimeSeriesDataset, now with multiple tensors for the measure-groups
        """
        if len(self.measures) > 1:
            raise RuntimeError(f"Can only split measures if there's a single measure-group, but instead:\n{self.measures}")
        self_tensor = self.tensors[0]
        self_measures = self.measures[0]

        idxs = []
        for measure_group in measure_groups:
            if isinstance(measure_group, slice) or isinstance(measure_group[0], int):
                idxs.append(measure_group)
            else:
                idxs.append([self_measures.index(m) for m in measure_group])

        self_measures = np.array(self_measures)
        return type(self)(
            *(self_tensor[:, :, idx] for idx in idxs),
            start_times=self.start_times,
            group_names=self.group_names,
            measures=[tuple(self_measures[idx]) for idx in idxs],
            dt_unit=self.dt_unit
        )

    def __getitem__(self, item: Union[int, Sequence, slice]) -> 'TimeSeriesDataset':
        if isinstance(item, int):
            item = [item]
        return type(self)(
            *super(TimeSeriesDataset, self).__getitem__(item),
            group_names=self.group_names[item],
            start_times=self.start_times[item],
            measures=self.measures,
            dt_unit=self.dt_unit
        )

    # Creation/Transformation ------------------------:
    @classmethod
    def collate(cls, batch: Sequence['TimeSeriesDataset']) -> 'TimeSeriesDataset':

        to_concat = {
            'tensors': [batch[0].tensors],
            'group_names': [batch[0].group_names],
            'start_times': [batch[0].start_times]
        }
        fixed = {'dt_unit': batch[0].dt_unit, 'measures': batch[0].measures}
        for i, ts_dataset in enumerate(batch[1:], 1):
            for attr, appendlist in to_concat.items():
                to_concat[attr].append(getattr(ts_dataset, attr))
            for attr, required_val in fixed.items():
                new_val = getattr(ts_dataset, attr)
                if new_val != required_val:
                    raise ValueError(f"Element {i} has `{attr}` = {new_val}, but for element 0 it's {required_val}.")

        tensors = tuple(ragged_cat(t, ragged_dim=1) for t in zip(*to_concat['tensors']))

        return cls(
            *tensors,
            group_names=np.concatenate(to_concat['group_names']),
            start_times=np.concatenate(to_concat['start_times']),
            measures=fixed['measures'],
            dt_unit=fixed['dt_unit']
        )

    def to_dataframe(self,
                     group_colname: str = 'group',
                     time_colname: str = 'time'
                     ) -> 'DataFrame':

        return self.tensor_to_dataframe(
            tensor=torch.cat(self.tensors, 2),
            times=self.times(),
            group_names=self.group_names,
            group_colname=group_colname,
            time_colname=time_colname,
            measures=self.all_measures
        )

    @staticmethod
    def tensor_to_dataframe(tensor: Tensor,
                            times: np.ndarray,
                            group_names: Sequence,
                            group_colname: str,
                            time_colname: str,
                            measures: Sequence[str]) -> 'DataFrame':
        from pandas import DataFrame, concat

        tensor = tensor.data.numpy()
        assert tensor.shape[0] == len(group_names)
        assert tensor.shape[0] == len(times)
        assert tensor.shape[1] <= times.shape[1]
        assert tensor.shape[2] == len(measures)

        dfs = []
        for g, group_name in enumerate(group_names):
            # get values, don't store trailing nans:
            values = tensor[g]
            all_nan_per_row = np.min(np.isnan(values), axis=1)
            if all_nan_per_row.all():
                warn(f"Group {group_name} has only missing values.")
                continue
            end_idx = true1d_idx(~all_nan_per_row).max() + 1
            # convert to dataframe:
            df = DataFrame(data=values[:end_idx, :], columns=measures)
            df[group_colname] = group_name
            df[time_colname] = times[g, 0:len(df.index)]
            dfs.append(df)

        return concat(dfs)

    @classmethod
    def from_dataframe(cls,
                       dataframe: 'DataFrame',
                       group_colname: str,
                       time_colname: str,
                       dt_unit: Optional[str],
                       measure_colnames: Optional[Sequence[str]] = None,
                       X_colnames: Optional[Sequence[str]] = None,
                       y_colnames: Optional[Sequence[str]] = None) -> 'TimeSeriesDataset':

        if measure_colnames is None:
            if X_colnames is None or y_colnames is None:
                raise ValueError("Must pass either `measure_colnames` or `X_colnames` & `y_colnames`")
            measure_colnames = list(y_colnames) + list(X_colnames)
        else:
            if X_colnames is not None or y_colnames is not None:
                raise ValueError("If passing `measure_colnames` do not pass `X_colnames` or `y_colnames`.")

        assert isinstance(group_colname, str)
        assert isinstance(time_colname, str)
        assert len(measure_colnames) == len(set(measure_colnames))

        # sort by time:
        dataframe = dataframe.sort_values(time_colname)

        for measure_colname in measure_colnames:
            if measure_colname not in dataframe.columns:
                raise ValueError(f"'{measure_colname}' not in dataframe.columns:\n{dataframe.columns}'")

        # first pass for info:
        arrays, time_idxs, group_names, start_times = [], [], [], []
        for g, df in dataframe.groupby(group_colname, sort=True):
            # group-names:
            group_names.append(g)

            # times:
            times = df[time_colname].values
            assert len(times) == len(set(times)), f"Group {g} has duplicate times"
            min_time = times[0]
            start_times.append(min_time)
            if dt_unit is None:
                time_idx = (times - min_time).astype('int64')
            else:
                time_idx = (times - min_time).astype(f'timedelta64[{dt_unit}]').view('int64')
            time_idxs.append(time_idx)

            # values:
            arrays.append(df.loc[:, measure_colnames].values)

        # second pass organizes into tensor
        time_len = max(time_idx[-1] + 1 for time_idx in time_idxs)
        tens = torch.empty((len(arrays), time_len, len(measure_colnames)))
        tens[:] = np.nan
        for i, (array, time_idx) in enumerate(zip(arrays, time_idxs)):
            tens[i, time_idx, :] = Tensor(array)

        dataset = cls(
            tens,
            group_names=group_names,
            start_times=start_times,
            measures=[measure_colnames],
            dt_unit=dt_unit
        )

        if X_colnames is not None:
            dataset = dataset.split_measures(y_colnames, X_colnames)
            y, X = dataset.tensors
            # don't use nan-padding on the predictor tensor:
            for i, time_idx in enumerate(time_idxs):
                X[i, time_idx.max():, :] = 0.0

        return dataset

    def with_new_tensors(self, *tensors: Tensor) -> 'TimeSeriesDataset':
        """
        Create a new dataset with different Tensors, but all other attributes the same.
        """
        return type(self)(
            *tensors,
            group_names=self.group_names,
            start_times=self.start_times,
            measures=self.measures,
            dt_unit=self.dt_unit
        )

    # Util/Private ------------------------
    def times(self, which: Optional[int] = None) -> np.ndarray:
        """
        A 2D array of datetimes (or integers if dt_unit is None) for this dataset.

        :param which: If this dataset has multiple tensors with different numbers of timesteps, which tensor should be
        used for constructing the `times` array? Defaults to the one with the most timesteps.
        :return: A 2D numpy array of datetimes (or integers if dt_unit is None).
        """
        if which is None:
            num_timesteps = max(tensor.shape[1] for tensor in self.tensors)
        else:
            num_timesteps = self.tensors[which].shape[1]
        return self._dt_helper.make_grid(self.start_times, num_timesteps)

    def datetimes(self) -> np.ndarray:
        return self.times()

    @property
    def start_datetimes(self) -> np.ndarray:
        return self.start_times

    def last_measured_times(self) -> np.ndarray:
        """
        :return: The datetimes (or integers if dt_unit is None) for the last measurement in the first tensor, where a
        measurement is any non-nan value in at least one dimension.
        """
        times = self.times(which=0)
        last_measured_idx = self._last_measured_idx()
        return np.array([t[idx] for t, idx in zip(times, last_measured_idx)], dtype=f'datetime64[{self.dt_unit}]')

    def _last_measured_idx(self) -> np.ndarray:
        """
        :return: The indices of the last measurement in the first tensor, where a measurement is any non-nan value in
         at least one dimension.
        """
        tens, *_ = self.tensors
        any_measured_bool = ~np.isnan(tens.numpy()).all(2)
        last_measured_idx = np.array(
            [np.max(true1d_idx(any_measured_bool[g]), initial=0) for g in range(len(self.group_names))],
            dtype='int'
        )
        return last_measured_idx
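
A hedged end-to-end sketch of the workflow this class supports (column names and data are illustrative; it assumes `DateTimeHelper` handles the pandas datetime values):

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader

# Hypothetical long-format data: one row per (group, day).
df = pd.DataFrame({
    'group': np.repeat(['a', 'b'], 30),
    'time': np.tile(np.arange('2020-01-06', '2020-02-05', dtype='datetime64[D]'), 2),
    'y': np.random.randn(60)
})
dataset = TimeSeriesDataset.from_dataframe(
    df, group_colname='group', time_colname='time', dt_unit='D', measure_colnames=['y']
)
train, val = dataset.train_val_split(train_frac=0.75)
# Per the class docstring, batching goes through the custom collate:
dl = DataLoader(dataset, batch_size=2, collate_fn=TimeSeriesDataset.collate)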
Example #7
class Season(Process):
    """
    Process representing discrete seasons.
    """
    measured_name = 'measured'

    def __init__(self,
                 id: str,
                 seasonal_period: int,
                 season_duration: int = 1,
                 decay: Union[bool, Tuple[float, float]] = False,
                 season_start: Optional[str] = None,
                 dt_unit: Optional[str] = None,
                 fixed: bool = False):
        """
        :param id: Unique name for this process
        :param seasonal_period: The number of seasons (e.g. 7 for day_in_week).
        :param season_duration: The length of each season, default 1 time-step.
        :param decay: Optional (float,float) boundaries for decay (between 0 and 1). Analogous to dampening a trend --
        the state will revert to zero as we get further from the last observation. This can be useful if two processes
        are capturing the same seasonal pattern: one can be more flexible, but with decay have a tendency to revert to
        zero, while the other is less variable but extrapolates into the future.
        :param season_start: A string that can be parsed into a datetime by `numpy.datetime64`. This is when the season
        starts, which is useful to specify if season boundaries are meaningful. It is important to specify this if
        different groups in your dataset start on different dates; when calling the kalman-filter you'll pass an array
        of `start_datetimes` for the groups in the input, and this will be used to align the seasons for each group.
        :param dt_unit: Currently supports {'Y', 'D', 'h', 'm', 's'}. 'W' is experimentally supported.
        :param fixed: If True, then the seasonality does not vary over time, and this amounts to one-hot-encoding the
        seasons. Default False.
        """

        #
        self.seasonal_period = seasonal_period
        self.season_duration = season_duration
        self.fixed = fixed

        if dt_unit is None:
            # optional for some seasonal processes, but not for this one
            raise TypeError(f"Must pass `dt_unit` to {type(self).__name__}")
        self._dt_helper = DateTimeHelper(dt_unit=dt_unit,
                                         start_datetime=season_start)

        # state-elements:
        pad_n = len(str(seasonal_period))
        super().__init__(id=id,
                         state_elements=[self.measured_name] +
                         [zpad(i, pad_n) for i in range(1, seasonal_period)])

        # transitions are placeholders, filled in w/batch
        for i, current in enumerate(self.state_elements):
            self._set_transition(from_element=current,
                                 to_element=current,
                                 value=0.)
            if i > 0:
                prev = self.state_elements[i - 1]
                self._set_transition(from_element=prev,
                                     to_element=current,
                                     value=0.)
                if i > 1:
                    self._set_transition(from_element=prev,
                                         to_element=self.measured_name,
                                         value=0.)

        if decay:
            assert not isinstance(decay, bool), \
                "decay should be a tuple of float bounds (or False for no decay)"
            assert decay[0] > 0. and decay[1] <= 1.0
            self.decay = Bounded(*decay)
        else:
            self.decay = None

    def add_measure(self, measure: str) -> 'Season':
        self._set_measure(measure=measure, state_element='measured', value=1.0)
        return self

    def param_dict(self) -> ParameterDict:
        p = ParameterDict()
        if self.decay is not None:
            p['decay'] = self.decay.parameter
        return p

    @property
    def dynamic_state_elements(self) -> Sequence[str]:
        return [] if self.fixed else [self.measured_name]

    def for_batch(self,
                  num_groups: int,
                  num_timesteps: int,
                  start_datetimes: Optional[np.ndarray] = None):

        if start_datetimes is not None:
            if len(start_datetimes) != num_groups or len(start_datetimes.shape) != 1:
                raise ValueError(f"Expected `start_datetimes` to be a 1D array of length {num_groups}.")

        for_batch = super().for_batch(num_groups=num_groups,
                                      num_timesteps=num_timesteps)

        if start_datetimes is None:
            if self._dt_helper.start_datetime:
                raise TypeError("Missing argument `start_datetimes`.")
            start_datetimes = np.zeros(num_groups)
        delta = self._dt_helper.make_delta_grid(start_datetimes, num_timesteps)

        in_transition = (delta % self.season_duration) == (self.season_duration - 1)

        transitions = {
            'to_next_state': torch.from_numpy(in_transition.astype('float32')),
            'from_measured_to_measured': torch.from_numpy(np.where(in_transition, -1., 1.).astype('float32'))
        }
        transitions['to_self'] = 1 - transitions['to_next_state']
        transitions['to_measured'] = -transitions['to_next_state']

        for k in transitions.keys():
            transitions[k] = split_flat(transitions[k], dim=1, clone=True)
            if self.decay is not None:
                decay_value = self.decay.get_value()
                transitions[k] = [x * decay_value for x in transitions[k]]

        # this is convoluted, but the idea is to manipulate the transitions so that we use one less degree of freedom
        # than the number of seasons, by having the 'measured' state be equal to -sum(all others)
        for i in range(1, len(self.state_elements)):
            current = self.state_elements[i]
            prev = self.state_elements[i - 1]

            if prev == self.measured_name:  # measured requires special-case
                to_measured = transitions['from_measured_to_measured']
            else:
                to_measured = transitions['to_measured']

            for_batch._adjust_transition(
                from_element=prev,
                to_element=current,
                adjustment=transitions['to_next_state'])
            for_batch._adjust_transition(from_element=prev,
                                         to_element=self.measured_name,
                                         adjustment=to_measured)

            # from state to itself:
            for_batch._adjust_transition(from_element=current,
                                         to_element=current,
                                         adjustment=transitions['to_self'])

        return for_batch

    def initial_state_means_for_batch(
            self,
            parameters: Parameter,
            num_groups: int,
            start_datetimes: Optional[np.ndarray] = None) -> Tensor:
        if start_datetimes is None:
            start_datetimes = np.zeros(num_groups)
        delta = self._dt_helper.make_delta_grid(start_datetimes,
                                                num_timesteps=1).squeeze(1)
        season_shift = (np.floor(delta / self.season_duration) %
                        self.seasonal_period).astype('int')
        means = [
            torch.cat([parameters[-shift:], parameters[:-shift]])
            for shift in season_shift
        ]
        return torch.stack(means, 0)
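
Usage sketch (illustrative; `for_batch` is normally invoked by the kalman-filter machinery rather than called directly):

import numpy as np

season = Season(id='day_in_week', seasonal_period=7, season_duration=1,
                dt_unit='D', season_start='2019-01-07')  # align seasons to a Monday
season.add_measure('y')
# Per-batch transitions, aligned to each group's start date:
batch_season = season.for_batch(
    num_groups=3,
    num_timesteps=14,
    start_datetimes=np.array(['2020-01-06', '2020-01-07', '2020-01-08'], dtype='datetime64[D]')
)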
Example #8
    def to_dataframe(self,
                     dataset: Union[TimeSeriesDataset, dict],
                     type: str = 'predictions',
                     group_colname: str = 'group',
                     time_colname: str = 'time',
                     multi: float = 1.96) -> 'DataFrame':
        """
        :param dataset: Either a TimeSeriesDataset, or a dictionary with 'start_times', 'group_names', & 'dt_unit'
        :param type: Either 'predictions' or 'components'.
        :param group_colname: Column-name for 'group'
        :param time_colname: Column-name for 'time'
        :param multi: Multiplier on std-dev for lower/upper CIs. Default 1.96.
        :return: A pandas DataFrame with group, 'time', 'measure', 'mean', 'lower', 'upper'. For type='components'
        additionally includes: 'process' and 'state_element'.
        """

        from pandas import concat

        if isinstance(dataset, TimeSeriesDataset):
            batch_info = {
                'start_times': dataset.start_times,
                'group_names': dataset.group_names,
                'named_tensors': {},
                'dt_unit': dataset.dt_unit
            }
            for measure_group, tensor in zip(dataset.measures,
                                             dataset.tensors):
                for i, measure in enumerate(measure_group):
                    if measure in self.design.measures:
                        batch_info['named_tensors'][measure] = tensor[..., [i]]
            missing = set(self.design.measures) - set(dataset.all_measures)
            if missing:
                raise ValueError(
                    f"Some measures in the design aren't in the dataset.\n"
                    f"Design: {missing}\nDataset: {dataset.all_measures}")
        elif isinstance(dataset, dict):
            batch_info = dataset
        else:
            raise TypeError(
                "Expected `batch` to be a TimeSeriesDataset, or a dictionary with 'start_times' and 'group_names'."
            )

        dt_helper = DateTimeHelper(dt_unit=batch_info['dt_unit'])

        def _tensor_to_df(tens, measures):
            times = dt_helper.make_grid(batch_info['start_times'],
                                        tens.shape[1])
            return TimeSeriesDataset.tensor_to_dataframe(
                tensor=tens,
                times=times,
                group_names=batch_info['group_names'],
                group_colname=group_colname,
                time_colname=time_colname,
                measures=measures)

        assert group_colname not in {'mean', 'lower', 'upper'}
        assert time_colname not in {'mean', 'lower', 'upper'}

        out = []
        if type == 'predictions':

            stds = torch.diagonal(self.prediction_uncertainty,
                                  dim1=-1,
                                  dim2=-2).sqrt()
            for i, measure in enumerate(self.design.measures):
                # predicted:
                df_pred = _tensor_to_df(self.predictions[..., [i]],
                                        measures=['mean'])
                df_std = _tensor_to_df(stds[..., [i]], measures=['_std'])
                df = df_pred.merge(df_std, on=[group_colname, time_colname])
                df['lower'] = df['mean'] - multi * df['_std']
                df['upper'] = df['mean'] + multi * df.pop('_std')

                # actual:
                orig_tensor = batch_info.get('named_tensors',
                                             {}).get(measure, None)
                if orig_tensor is not None:
                    df_actual = _tensor_to_df(orig_tensor, measures=['actual'])
                    df = df.merge(df_actual,
                                  on=[group_colname, time_colname],
                                  how='left')

                out.append(df.assign(measure=measure))

        elif type == 'components':
            # components:
            for (measure, process,
                 state_element), (m, std) in self._components().items():
                df = _tensor_to_df(torch.stack([m, std], 2),
                                   measures=['mean', '_std'])
                df['lower'] = df['mean'] - multi * df['_std']
                df['upper'] = df['mean'] + multi * df.pop('_std')
                df['process'], df['state_element'], df['measure'] = process, state_element, measure
                out.append(df)

            # residuals:
            named_tensors = batch_info.get('named_tensors', {})
            for i, measure in enumerate(self.design.measures):
                orig_tensor = named_tensors.get(measure)
                if orig_tensor is None:
                    continue
                predictions = self.predictions[..., [i]]
                if orig_tensor.shape[1] < predictions.shape[1]:
                    orig_aligned = predictions.data.clone()
                    orig_aligned[:] = float('nan')
                    orig_aligned[:, 0:orig_tensor.shape[1], :] = orig_tensor
                else:
                    orig_aligned = orig_tensor[:, 0:predictions.shape[1], :]

                df = _tensor_to_df(predictions - orig_aligned, ['mean'])
                df['process'], df['state_element'], df['measure'] = 'residuals', 'residuals', measure
                out.append(df)

        else:
            raise ValueError(
                "Expected `type` to be 'predictions' or 'components'.")

        return concat(out, sort=True)
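
A sketch of calling this method, where `pred` stands for whatever prediction object exposes it (hypothetical here, e.g. the filter's output), with a fitted design and a matching `dataset`:

# 95% intervals by default (multi=1.96); one row per group/time/measure.
df_pred = pred.to_dataframe(dataset, type='predictions')
df_comp = pred.to_dataframe(dataset, type='components')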