Example #1
def simulate_daily_series(num_groups: int, num_timesteps: int, noise: float = 1.0) -> 'DataFrame':
    # create realistic series:
    tensor = _simulate(num_groups, num_timesteps, noise=noise, dt_unit='D')

    # convert to dataset:
    dataset = TimeSeriesDataset(
        tensor,
        group_names=range(num_groups),
        start_times=[DEFAULT_START_DT] * num_groups,
        measures=[['y']],
        dt_unit='D'
    )
    # convert to dataframe:
    df = dataset.to_dataframe()

    # add predictors:
    # TODO: meaningful predictors
    df['X1'] = np.random.normal(size=len(df.index))
    df['X2'] = np.random.normal(size=len(df.index))

    # make number of timesteps per group non-uniform:
    max_timestep_per_group = dict(zip(
        range(num_groups),
        np.random.choice(range(int(num_timesteps * .80), num_timesteps), size=num_groups, replace=True)
    ))
    df['_max_time'] = DEFAULT_START_DT + df['group'].map(max_timestep_per_group)
    df = df.loc[df['time'] <= df.pop('_max_time'), :].reset_index(drop=True)

    return df
Example #2
def simulate_daily_series(num_groups: int, num_timesteps: int, noise: float = 1.0) -> 'DataFrame':
    season_spec = {
        'season_start': np.datetime64('2007-01-01'),  # an arbitrary Monday at midnight
        'dt_unit': 'D'
    }
    # create realistic series:
    tensor = _simulate(num_groups, num_timesteps, season_spec, noise=noise)

    # convert to dataset:
    dataset = TimeSeriesDataset(
        tensor,
        group_names=range(num_groups),
        start_times=[season_spec['season_start']] * num_groups,
        measures=[['y']],
        dt_unit=season_spec['dt_unit']
    )
    # convert to dataframe:
    df = dataset.to_dataframe()

    # add predictors:
    # TODO: meaningful predictors
    df['X1'] = np.random.normal(size=len(df.index))
    df['X2'] = np.random.normal(size=len(df.index))

    # make number of timesteps per group non-uniform:
    max_timestep_per_group = dict(zip(
        range(num_groups),
        np.random.choice(range(int(num_timesteps * .80), num_timesteps), size=num_groups, replace=True)
    ))
    df['_max_time'] = season_spec['season_start'] + df['group'].map(max_timestep_per_group)
    df = df.loc[df['time'] <= df.pop('_max_time'), :].reset_index(drop=True)

    return df
Example #3
    def test_last_measured_idx(self):
        tens = torch.zeros((3, 10, 2))

        # first group: all measures nan from idx 5 on, so last measured idx is 4
        tens[0, 5:, :] = float('nan')

        # second group: 'x' is nan from idx 5, 'y' from idx 8, so last measured idx is 7:
        tens[1, 5:, 0] = float('nan')
        tens[1, 8:, 1] = float('nan')

        # third group: nan only at idx 8, so last measured idx is the final timestep (9):
        tens[2, 8, :] = float('nan')

        d = TimeSeriesDataset(tens, group_names=range(3), start_times=[0] * 3, measures=[['x', 'y']], dt_unit=None)
        last_measured = d._last_measured_idx()

        self.assertEqual(last_measured[0], 4)
        self.assertEqual(last_measured[1], 7)
        self.assertEqual(last_measured[2], 9)
Example #4
def _tensor_to_df(tens, measures):
    # `batch_info`, `group_colname`, and `time_colname` come from the enclosing scope
    times = batch_info.get('times', batch_info['start_times'][:, None] + np.arange(0, tens.shape[1]))
    return TimeSeriesDataset.tensor_to_dataframe(
        tensor=tens,
        times=times,
        group_names=batch_info['group_names'],
        group_colname=group_colname,
        time_colname=time_colname,
        measures=measures
    )
Example #5
def _tensor_to_df(tens, measures):
    # `dt_helper`, `batch_info`, `group_colname`, and `time_colname` come from the enclosing scope
    times = dt_helper.make_grid(batch_info['start_times'], tens.shape[1])
    return TimeSeriesDataset.tensor_to_dataframe(
        tensor=tens,
        times=times,
        group_names=batch_info['group_names'],
        group_colname=group_colname,
        time_colname=time_colname,
        measures=measures
    )
Example #6
        def _tensor_to_df(tens, measures):
            # offset each group's start time by the timestep index
            # (`dt_unit` may be a timedelta, or None for unitless integer times)
            offsets = np.arange(0, tens.shape[1]) * (batch_info['dt_unit'] if batch_info['dt_unit'] else 1)
            times = batch_info['start_times'][:, None] + offsets

            return TimeSeriesDataset.tensor_to_dataframe(
                tensor=tens,
                times=times,
                group_names=batch_info['group_names'],
                group_colname=group_colname,
                time_colname=time_colname,
                measures=measures)
Example #7
    def test_time_series_dataset(self):
        values = torch.randn((3, 39, 2))

        batch = TimeSeriesDataset(
            values,
            group_names=['one', 'two', 'three'],
            start_times=[0, 0, 0],
            measures=[['y1', 'y2']],
            dt_unit=None
        )
        try:
            import pandas as pd
        except ImportError:
            warn("Not testing TimeSeriesDataset.to_dataframe, pandas not installed.")
            return
        df1 = batch.to_dataframe()

        df2 = pd.concat([
            pd.DataFrame(values[i].numpy(), columns=batch.all_measures).assign(group=group, time=batch.times()[0])
            for i, group in enumerate(batch.group_names)
        ], ignore_index=True)  # ignore_index so both frames are identically labeled for `==`
        self.assertTrue((df1 == df2).all().all())
Example #8
    def test_equations_preds(self, n_step: int):
        from torch_kalman.utils.data import TimeSeriesDataset
        from pandas import DataFrame, Series

        class LinearModelFixed(LinearModel):
            def __init__(self, *args, **kwargs):
                super().__init__(*args, **kwargs)
                self.no_icov_state_elements = self.state_elements

        kf = KalmanFilter(
            processes=[LinearModelFixed(id='lm', predictors=['x1', 'x2'])],
            measures=['y'],
            compiled=False
        )
        kf.script_module._scale_by_measure_var = False
        # pin the initial state-mean and the measurement noise to known values:
        kf.state_dict()['script_module.processes.lm.init_mean'][:] = torch.tensor([1.5, -0.5])
        kf.state_dict()['script_module.measure_covariance.cholesky_log_diag'][0] = np.log(.1 ** .5)

        num_times = 100
        df = DataFrame({'x1': np.random.randn(num_times), 'x2': np.random.randn(num_times)})
        # generate `y` from the same coefficients the model was initialized with:
        df['y'] = 1.5 * df['x1'] - .5 * df['x2'] + .1 * np.random.randn(num_times)
        df['time'] = df.index.values
        df['group'] = '1'
        dataset = TimeSeriesDataset.from_dataframe(dataframe=df,
                                                   group_colname='group',
                                                   time_colname='time',
                                                   dt_unit=None,
                                                   X_colnames=['x1', 'x2'],
                                                   y_colnames=['y'])
        y, X = dataset.tensors

        pred = kf(y, X=X, out_timesteps=X.shape[1], n_step=n_step)
        y_series = Series(y.squeeze().numpy())
        pred_series = Series(pred.means.squeeze().numpy())
        for shift in range(-2, 3):
            resid = y_series.shift(shift) - pred_series
            if shift:
                # check there's no misalignment in internal n_step logic (i.e., realigning the input makes things worse)
                self.assertGreater((resid**2).mean(), 1.)
            else:
                self.assertLess((resid**2).mean(), .02)
Example #9
# + {"hidePrompt": true, "cell_type": "markdown"}
# #### Prepare our Dataset
#
# One of the key advantages of `torch-kalman` is the ability to train on a batch of time series, instead of training a separate model for each one individually. The `TimeSeriesDataset` is similar to PyTorch's native `TensorDataset`, but carries useful metadata about the batch (the station names, the dates for each series); we take a quick look at this metadata after building the dataset below.

# +
# preprocess our measures of interest:
measures = ['SO2', 'PM10']
measures_pp = [m + '_log10_scaled' for m in measures]
df_aq_weekly[measures_pp] = np.log10(df_aq_weekly[measures] /
                                     col_means[measures])

# create a dataset:
dataset_all = TimeSeriesDataset.from_dataframe(dataframe=df_aq_weekly,
                                               dt_unit='W',
                                               measure_colnames=measures_pp,
                                               group_colname='station',
                                               time_colname='date')

# Train/Val split:
dataset_train, dataset_val = dataset_all.train_val_split(dt=SPLIT_DT)
dataset_train, dataset_val
# -
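
# A quick look at that metadata (a minimal sketch: `group_names` and `times()`
# are the same dataset attributes/methods exercised elsewhere in these examples):
dataset_train.group_names, dataset_train.times().shape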

# #### Specify our Model
#
# The `KalmanFilter` subclasses `torch.nn.Module`. We specify the model by passing `processes` that capture the behaviors of our `measures`.

processes = []
for measure in measures_pp:
    processes.extend([
        # NOTE: illustrative completion; the exact process classes and arguments
        # here are assumptions (a local trend plus a yearly seasonality per measure):
        LocalTrend(id=f'{measure}_trend', measure=measure),
        TBATS(id=f'{measure}_annual', period=365.25 / 7., dt_unit='W', K=2, measure=measure)
    ])
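
# With `processes` built, the model is constructed the same way as in the
# tests above (a minimal sketch under the assumptions noted in the loop):
kf = KalmanFilter(processes=processes, measures=measures_pp)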
Example #10
    def test_training3(self):
        """
        Test TBATS and TimeSeriesDataset integration
        """
        try:
            import pandas as pd
        except ImportError:  # not a package requirement
            return
        torch.manual_seed(123)
        df = pd.DataFrame({
            'sin': np.sin(2. * 3.1415 * np.arange(0., 5 * 7.) / 7.),
            'cos': np.cos(2. * 3.1415 * np.arange(0., 5 * 7.) / 7.)
        })
        df['y'] = df['cos'].where(df.index < 12, other=df['sin'])

        # simulate 10 groups: each observes `y` plus noise, with a randomly shifted start date:
        df = pd.concat([
            df.assign(
                observed=lambda df: df['y'] + np.random.normal(scale=.2, size=len(df.index)),
                group=str(i + 1),
                time=lambda df: np.array(df.index.tolist(), dtype='datetime64[D]') + np.random.randint(low=0, high=4)
            )
            for i in range(10)
        ])
        dataset = TimeSeriesDataset.from_dataframe(df,
                                                   group_colname='group',
                                                   time_colname='time',
                                                   dt_unit='D',
                                                   measure_colnames=['y'])

        def _train(num_epochs: int = 15):
            kf = KalmanFilter(
                processes=[
                    TBATS(id='day_of_week', period=7, dt_unit='D', K=1, process_variance=True, decay=(.85, 1.))
                ],
                measures=['y']
            )

            # train:
            optimizer = torch.optim.LBFGS(kf.parameters(), lr=.15, max_iter=10)

            def closure():
                optimizer.zero_grad()
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    pred = kf(dataset.tensors[0],
                              start_datetimes=dataset.start_datetimes)
                loss = -pred.log_prob(dataset.tensors[0]).mean()
                loss.backward()
                return loss

            print(f"\nTraining for {num_epochs} epochs...")
            for i in range(num_epochs):
                loss = optimizer.step(closure)
                print("loss:", loss.item())

            return kf

        kf = None
        for i in range(MAX_TRIES):
            try:
                kf = _train()
            except RuntimeError as e:
                # occasional cholesky failures during LBFGS are retried; anything else is re-raised
                if 'cholesky' not in str(e):
                    raise
            if kf is not None:
                break
        if kf is None:
            raise RuntimeError("MAX_TRIES exceeded")

        with torch.no_grad():
            pred = kf(dataset.tensors[0],
                      start_datetimes=dataset.start_datetimes)
        df_pred = pred.to_dataframe(dataset)
        self.assertLess(np.mean((df_pred['actual'] - df_pred['mean'])**2), .05)