def simulate_daily_series(num_groups: int, num_timesteps: int, noise: float = 1.0) -> 'DataFrame':
    """Simulate daily series for several groups and return them as one long dataframe."""
    # simulate the underlying tensor of values:
    tensor = _simulate(num_groups, num_timesteps, noise=noise, dt_unit='D')

    # wrap in a TimeSeriesDataset, then flatten to a long dataframe:
    dataset = TimeSeriesDataset(
        tensor,
        group_names=range(num_groups),
        start_times=[DEFAULT_START_DT] * num_groups,
        measures=[['y']],
        dt_unit='D'
    )
    df = dataset.to_dataframe()

    # attach placeholder predictors:
    # TODO: meaningful predictors
    num_rows = len(df.index)
    df['X1'] = np.random.normal(size=num_rows)
    df['X2'] = np.random.normal(size=num_rows)

    # truncate each group at a randomly drawn final timestep (in [.8*T, T)) so
    # the number of timesteps per group is non-uniform:
    last_steps = np.random.choice(
        range(int(num_timesteps * .80), num_timesteps), size=num_groups, replace=True
    )
    max_timestep_per_group = dict(zip(range(num_groups), last_steps))
    df['_max_time'] = DEFAULT_START_DT + df['group'].map(max_timestep_per_group)
    keep = df['time'] <= df.pop('_max_time')
    return df.loc[keep, :].reset_index(drop=True)
def simulate_daily_series(num_groups: int, num_timesteps: int, noise: float = 1.0) -> 'DataFrame':
    """Simulate daily seasonal series for several groups and return them as one long dataframe."""
    season_spec = {
        'season_start': np.datetime64('2007-01-01'),  # arbitrary monday at midnight
        'dt_unit': 'D'
    }
    start = season_spec['season_start']

    # simulate the underlying tensor of values:
    tensor = _simulate(num_groups, num_timesteps, season_spec, noise=noise)

    # wrap in a TimeSeriesDataset, then flatten to a long dataframe:
    dataset = TimeSeriesDataset(
        tensor,
        group_names=range(num_groups),
        start_times=[start] * num_groups,
        measures=[['y']],
        dt_unit=season_spec['dt_unit']
    )
    df = dataset.to_dataframe()

    # attach placeholder predictors:
    # TODO: meaningful predictors
    num_rows = len(df.index)
    df['X1'] = np.random.normal(size=num_rows)
    df['X2'] = np.random.normal(size=num_rows)

    # truncate each group at a randomly drawn final timestep (in [.8*T, T)) so
    # the number of timesteps per group is non-uniform:
    last_steps = np.random.choice(
        range(int(num_timesteps * .80), num_timesteps), size=num_groups, replace=True
    )
    max_timestep_per_group = dict(zip(range(num_groups), last_steps))
    df['_max_time'] = start + df['group'].map(max_timestep_per_group)
    keep = df['time'] <= df.pop('_max_time')
    return df.loc[keep, :].reset_index(drop=True)
def test_last_measured_idx(self):
    """_last_measured_idx should report, per group, the last timestep with any observed value."""
    tens = torch.zeros((3, 10, 2))
    # group 0: both measures missing from t=5 on -> last measured at 4
    tens[0, 5:, :] = float('nan')
    # group 1: measure 0 missing from t=5, measure 1 from t=8 -> last measured at 7
    tens[1, 5:, 0] = float('nan')
    tens[1, 8:, 1] = float('nan')
    # group 2: only t=8 is missing; t=9 is observed -> last measured at 9
    tens[2, 8, :] = float('nan')
    d = TimeSeriesDataset(
        tens,
        group_names=range(3),
        start_times=[0] * 3,
        measures=[['x', 'y']],
        dt_unit=None
    )
    last_measured = d._last_measured_idx()
    for group, expected in enumerate([4, 7, 9]):
        self.assertEqual(last_measured[group], expected)
def _tensor_to_df(tens, measures):
    # use explicit per-group times when present in batch_info; otherwise build
    # a grid of consecutive offsets from each group's start time
    default_times = batch_info['start_times'][:, None] + np.arange(0, tens.shape[1])
    return TimeSeriesDataset.tensor_to_dataframe(
        tensor=tens,
        times=batch_info.get('times', default_times),
        group_names=batch_info['group_names'],
        group_colname=group_colname,
        time_colname=time_colname,
        measures=measures
    )
def _tensor_to_df(tens, measures):
    # one row of times per group, spanning all timesteps in the tensor
    grid = dt_helper.make_grid(batch_info['start_times'], tens.shape[1])
    return TimeSeriesDataset.tensor_to_dataframe(
        tensor=tens,
        times=grid,
        group_names=batch_info['group_names'],
        group_colname=group_colname,
        time_colname=time_colname,
        measures=measures
    )
def _tensor_to_df(tens, measures):
    # step size is the batch's dt_unit when present, else a unitless step of 1
    step = batch_info['dt_unit'] if batch_info['dt_unit'] else 1
    offsets = np.arange(0, tens.shape[1]) * step
    return TimeSeriesDataset.tensor_to_dataframe(
        tensor=tens,
        times=batch_info['start_times'][:, None] + offsets,
        group_names=batch_info['group_names'],
        group_colname=group_colname,
        time_colname=time_colname,
        measures=measures
    )
def test_time_series_dataset(self):
    """to_dataframe should match a manually assembled long dataframe."""
    values = torch.randn((3, 39, 2))
    batch = TimeSeriesDataset(
        values,
        group_names=['one', 'two', 'three'],
        start_times=[0, 0, 0],
        measures=[['y1', 'y2']],
        dt_unit=None
    )
    try:
        import pandas as pd
    except ImportError:
        warn("Not testing TimeSeriesDataset.to_dataframe, pandas not installed.")
        return
    df1 = batch.to_dataframe()
    # build the expected dataframe by hand, one group at a time (all groups
    # share start_time 0, so the first group's times apply to each):
    frames = []
    for i, group in enumerate(batch.group_names):
        frame = pd.DataFrame(values[i].numpy(), columns=batch.all_measures)
        frames.append(frame.assign(group=group, time=batch.times()[0]))
    df2 = pd.concat(frames)
    self.assertTrue((df1 == df2).all().all())
def test_equations_preds(self, n_step: int):
    """Check that n-step-ahead predictions are correctly aligned with the input.

    Builds a linear-model KalmanFilter whose coefficients are fixed to the
    true generating values, then verifies alignment by shifting the target
    series: only the unshifted residuals should be small.
    """
    from torch_kalman.utils.data import TimeSeriesDataset
    from pandas import DataFrame

    class LinearModelFixed(LinearModel):
        # a LinearModel whose state elements have no initial covariance, so
        # the init_mean assigned below fully determines the initial state
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.no_icov_state_elements = self.state_elements

    kf = KalmanFilter(
        processes=[LinearModelFixed(id='lm', predictors=['x1', 'x2'])],
        measures=['y'],
        compiled=False)
    kf.script_module._scale_by_measure_var = False
    # pin the regression coefficients to the true generating values (1.5, -.5):
    kf.state_dict()['script_module.processes.lm.init_mean'][:] = torch.tensor([1.5, -0.5])
    # set the measure-covariance cholesky log-diag to log(sqrt(.1)), i.e. measure variance .1:
    kf.state_dict()['script_module.measure_covariance.cholesky_log_diag'][0] = np.log(.1 ** .5)

    # simulate y = 1.5*x1 - .5*x2 + noise(std=.1):
    num_times = 100
    df = DataFrame({'x1': np.random.randn(num_times), 'x2': np.random.randn(num_times)})
    df['y'] = 1.5 * df['x1'] + -.5 * df['x2'] + .1 * np.random.randn(num_times)
    df['time'] = df.index.values
    df['group'] = '1'

    dataset = TimeSeriesDataset.from_dataframe(
        dataframe=df,
        group_colname='group',
        time_colname='time',
        dt_unit=None,
        X_colnames=['x1', 'x2'],
        y_colnames=['y'])
    y, X = dataset.tensors

    from pandas import Series
    pred = kf(y, X=X, out_timesteps=X.shape[1], n_step=n_step)
    y_series = Series(y.squeeze().numpy())
    for shift in range(-2, 3):
        resid = y_series.shift(shift) - Series(pred.means.squeeze().numpy())
        if shift:
            # check there's no misalignment in internal n_step logic (i.e., realigning the input makes things worse)
            self.assertGreater((resid ** 2).mean(), 1.)
        else:
            self.assertLess((resid ** 2).mean(), .02)
# + {"hidePrompt": true, "cell_type": "markdown"} # #### Prepare our Dataset # # One of the key advantages of `torch-kalman` is the ability to train on a batch of time-serieses, instead of training a separate model for each individually. The `TimeSeriesDataset` is similar to PyTorch's native `TensorDataset`, with some useful metadata on the batch of time-serieses (the station names, the dates for each). # + # preprocess our measures of interest: measures = ['SO2', 'PM10'] measures_pp = [m + '_log10_scaled' for m in measures] df_aq_weekly[measures_pp] = np.log10(df_aq_weekly[measures] / col_means[measures]) # create a dataset: dataset_all = TimeSeriesDataset.from_dataframe(dataframe=df_aq_weekly, dt_unit='W', measure_colnames=measures_pp, group_colname='station', time_colname='date') # Train/Val split: dataset_train, dataset_val = dataset_all.train_val_split(dt=SPLIT_DT) dataset_train, dataset_val # - # #### Specify our Model # # The `KalmanFilter` subclasses `torch.nn.Module`. We specify the model by passing `processes` that capture the behaviors of our `measures`. processes = [] for measure in measures_pp: processes.extend([
def test_training3(self):
    """ Test TBATS and TimeSeriesDataset integration """
    try:
        import pandas as pd
    except ImportError:
        # not a package requirement
        return
    torch.manual_seed(123)

    # weekly sin/cos waves over 5 weeks of daily data:
    df = pd.DataFrame({
        'sin': np.sin(2. * 3.1415 * np.arange(0., 5 * 7.) / 7.),
        'cos': np.cos(2. * 3.1415 * np.arange(0., 5 * 7.) / 7.)
    })
    # target follows the cos wave until t=12, then switches to the sin wave:
    df['y'] = df['cos'].where(df.index < 12, other=df['sin'])
    # 10 groups, each with its own observation noise and a random (0-3 day) start offset:
    df = pd.concat([
        df.assign(
            observed=lambda df: df['y'] + np.random.normal(scale=.2, size=len(df.index)),
            group=str(i + 1),
            time=lambda df: np.array(df.index.tolist(), dtype='datetime64[D]') + np.random.randint(low=0, high=4)
        )
        for i in range(10)
    ])
    # NOTE(review): the dataset is built from the noiseless 'y', not 'observed' — confirm intended
    dataset = TimeSeriesDataset.from_dataframe(
        df, group_colname='group', time_colname='time', dt_unit='D', measure_colnames=['y'])

    def _train(num_epochs: int = 15):
        # single damped TBATS day-of-week seasonal process:
        kf = KalmanFilter(
            processes=[TBATS(id='day_of_week', period=7, dt_unit='D', K=1, process_variance=True, decay=(.85, 1.))],
            measures=['y'])
        # train:
        optimizer = torch.optim.LBFGS(kf.parameters(), lr=.15, max_iter=10)

        def closure():
            # standard LBFGS closure: negative mean log-prob as the loss
            optimizer.zero_grad()
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                pred = kf(dataset.tensors[0], start_datetimes=dataset.start_datetimes)
                loss = -pred.log_prob(dataset.tensors[0]).mean()
                loss.backward()
            return loss

        print(f"\nTraining for {num_epochs} epochs...")
        for i in range(num_epochs):
            loss = optimizer.step(closure)
            print("loss:", loss.item())
        return kf

    # retry training up to MAX_TRIES times, but only when the RuntimeError
    # message mentions 'cholesky'; any other error is re-raised immediately:
    kf = None
    for i in range(MAX_TRIES):
        try:
            kf = _train()
        except RuntimeError as e:
            if 'cholesky' not in str(e):
                raise e
        if kf is not None:
            break
    if kf is None:
        raise RuntimeError("MAX_TRIES exceeded")

    # in-sample fit should be close to the target:
    with torch.no_grad():
        pred = kf(dataset.tensors[0], start_datetimes=dataset.start_datetimes)
        df_pred = pred.to_dataframe(dataset)
    self.assertLess(np.mean((df_pred['actual'] - df_pred['mean']) ** 2), .05)