def fit(self, inputs, targets, num_epochs, test_dataset=None):
    streaming_state = self.gp.streaming
    self.gp.set_streaming(False)
    dataset = torch.utils.data.TensorDataset(inputs, targets)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=1024, shuffle=True)
    num_batches = len(dataloader)
    self.mll = VariationalELBO(self.gp.likelihood, self.gp, num_data=len(dataset), beta=1.0)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, num_epochs, 1e-4)
    records = []
    for epoch in range(num_epochs):
        self.train()
        avg_loss = 0
        for input_batch, target_batch in dataloader:
            self.optimizer.zero_grad()
            train_dist = self(input_batch)
            target_batch = self._reshape_targets(target_batch)
            loss = -self.mll(train_dist, target_batch).mean()
            loss.backward()
            self.optimizer.step()
            avg_loss += loss.item() / num_batches
        lr_scheduler.step()
        rmse = nll = float('NaN')
        if test_dataset is not None:
            test_x, test_y = test_dataset[:]
            with torch.no_grad():
                rmse, nll = self.evaluate(test_x, test_y)
        records.append(dict(train_loss=avg_loss, test_rmse=rmse, test_nll=nll,
                            noise=self.gp.likelihood.noise.mean().item(), epoch=epoch + 1))
    self.gp.set_streaming(streaming_state)
    self.eval()
    return records
def _update_hyperparameters(self):
    # Find optimal model hyperparameters
    self.train()
    self.likelihood.train()

    # Use the adam optimizer
    optimizer = Adam(self.parameters(), lr=0.1)

    # "Loss" for GPs - the marginal log likelihood
    # num_data refers to the number of training datapoints
    mll = VariationalELBO(self.likelihood, self, self.train_targets.numel())

    training_iterations = 50
    for i in range(training_iterations):
        # Zero backpropped gradients from previous iteration
        optimizer.zero_grad()
        # Get predictive output
        output = self(self.train_inputs[0])
        # Calc loss and backprop gradients
        loss = -mll(output, self.train_targets)
        loss.backward()
        # print('Iter %d/%d - Loss: %.3f' % (i + 1, training_iterations, loss.item()))
        optimizer.step()
def __init__(self, likelihood, model, num_data, base_loss):
    super().__init__()
    self.mll = DeepApproximateMLL(
        VariationalELBO(likelihood=likelihood, model=model, num_data=num_data))
    self.base_loss = base_loss
def _set_mll(self, num_data, mll_conf):
    """Instantiate the marginal log likelihood specified by self._mll."""
    # Build the mll instance
    if self._mll in variationalelbo:
        return VariationalELBO(
            self.likelihood, self.model, num_data=num_data, **mll_conf
        )
    elif self._mll in predictiveloglikelihood:
        return PredictiveLogLikelihood(
            self.likelihood, self.model, num_data=num_data, **mll_conf
        )
    elif self._mll in gammarobustvariationalelbo:
        return GammaRobustVariationalELBO(
            self.likelihood, self.model, num_data=num_data, **mll_conf
        )
    else:
        raise ValueError(f'mll={self._mll} is not supported')
def setUp(self):
    super().setUp()
    train_X = torch.rand(10, 1, device=self.device)
    train_y = torch.sin(train_X) + torch.randn_like(train_X) * 0.2
    self.model = SingleTaskVariationalGP(
        train_X=train_X, likelihood=GaussianLikelihood()
    ).to(self.device)
    mll = VariationalELBO(self.model.likelihood, self.model.model, num_data=10)
    loss = -mll(self.model.likelihood(self.model(train_X)), train_y).sum()
    loss.backward()
def train(model, train_loader, n_iter=50):
    num_data = train_loader.dataset.X.shape[0]
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
    elbo = DeepApproximateMLL(
        VariationalELBO(model.likelihood, model, num_data=num_data))
    for i in range(n_iter):
        for x, y in train_loader:
            if torch.cuda.is_available():
                x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()
            output = model(x)
            loss = -elbo(output, y)
            loss.backward()
            optimizer.step()
        print("Iter={}\t Loss:{:.3f}".format(i, loss.item()))
def optimize(self, X, y, epochs=500, lr=0.01, samples=10):
    opt = torch.optim.Adam([{'params': self.parameters()}], lr=lr)
    mll = DeepApproximateMLL(
        VariationalELBO(self.likelihood, self, X.shape[-2]))
    self.train()
    self.likelihood.train()
    lls = []
    gpytorch.settings.skip_posterior_variances(state=False)
    for i in range(epochs):
        with gpytorch.settings.num_likelihood_samples(samples):
            opt.zero_grad()
            output = self(X)
            loss = -mll(output, y)
            loss.backward()
            noise = self.likelihood.noise.item()
            print('Iter: %d/%d, Loss: %.5f, Likelihood noise: %.3f' %
                  (i + 1, epochs, loss.item(), noise))
            lls.append(loss.item())
            opt.step()
    return lls
def initialize_model(X, Y, old_model=None, **kwargs):
    if old_model is None:
        if args.dim == 3:
            wiski_grid_size = 10
        elif args.dim == 2:
            wiski_grid_size = 30
        grid_list = create_grid([wiski_grid_size] * args.dim, grid_bounds=bounds)
        inducing_points = (
            torch.stack([x.reshape(-1) for x in torch.meshgrid(grid_list)])
            .t()
            .contiguous()
            .clone()
        )
        likelihood = GaussianLikelihood()
        model_base = VariationalGPModel(
            inducing_points,
            likelihood=likelihood,
            beta=1.0,
            learn_inducing_locations=True,
        )
        model_obj = ApproximateGPyTorchModel(model_base, likelihood, num_outputs=1)
        model_base.train_inputs = [X]
        model_base.train_targets = Y.view(-1)

        # we don't implement fixed-noise Gaussian likelihoods for the streaming setting
        if args.fixed_noise:
            model_obj.likelihood.noise = args.noise ** 2
            model_obj.likelihood.requires_grad = False
    else:
        model_obj = old_model
        model_obj.train_inputs = [X]
        model_obj.train_targets = Y.view(-1)

    mll = VariationalELBO(
        model_obj.likelihood, model_obj.model, num_data=X.shape[-2]
    )
    return model_obj, mll
def _initialize_models(self, ds_train):
    """
    Initialize the feature extractor, the GP, and the optimizer before training.
    """
    # Initialize feature extractor (residual net)
    self.feature_extractor = FCResNet(
        input_dim=self.input_dim,
        features=self.features,
        depth=self.depth,
        spectral_normalization=True,
        coeff=self.coeff,
        n_power_iterations=FC_N_POWER_ITERATIONS,
        dropout_rate=FC_DROPOUT_RATE,
    )
    initial_inducing_points, initial_lengthscale = initial_values_for_GP(
        ds_train, self.feature_extractor, self.n_inducing_points
    )

    # Initialize Gaussian process
    gp = GP(
        num_outputs=self.num_outputs,
        initial_lengthscale=initial_lengthscale,
        initial_inducing_points=initial_inducing_points,
        kernel=self.kernel,
    )

    # Initialize the overall deep kernel learning GP model
    self.model = DKL_GP(self.feature_extractor, gp)

    # Classification task with two classes
    self.likelihood = SoftmaxLikelihood(num_classes=NUM_OUTPUTS, mixing_weights=False)
    self.loss_fn = VariationalELBO(self.likelihood, gp, num_data=len(ds_train))

    # Initialize the model's optimizer
    parameters = [
        {"params": self.model.feature_extractor.parameters(), "lr": self.lr},
        {"params": self.model.gp.parameters(), "lr": self.lr},
        {"params": self.likelihood.parameters(), "lr": self.lr},
    ]
    self.optimizer = torch.optim.Adam(parameters, weight_decay=OPTIMIZER_WEIGHT_DECAY)
def train(epoch, train_loader, optimizer, likelihood, model, device):
    model.train()
    likelihood.train()
    mll = VariationalELBO(likelihood, model.gp_layer, num_data=len(train_loader.dataset))
    train_loss = 0.
    for idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = -mll(output, target)
        loss.backward()
        optimizer.step()
        if (idx + 1) % 25 == 0:
            current_loss = loss.item()
            print(
                f'Epoch: {epoch} [{idx+1}/{len(train_loader)}], Loss: {current_loss:.6f}'
            )
def fit(self, inputs, targets, num_epochs, test_dataset=None):
    streaming_state = self.gp.streaming
    self.gp.set_streaming(False)
    dataset = torch.utils.data.TensorDataset(inputs, targets)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=256, shuffle=True)
    self.mll = VariationalELBO(self.gp.likelihood, self.gp, num_data=len(dataset), beta=1.0)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        self.optimizer, num_epochs, 1e-4)
    records = []
    num_batches = len(dataloader)
    for epoch in range(num_epochs):
        self.train()
        avg_loss = 0
        for input_batch, target_batch in dataloader:
            self.optimizer.zero_grad()
            features = self.stem(input_batch)
            train_dist = self.gp(features)
            loss = -self.mll(train_dist, target_batch)
            loss.backward()
            self.optimizer.step()
            avg_loss += loss.item() / num_batches
        lr_scheduler.step()
        test_acc = float('NaN')
        if test_dataset is not None:
            test_x, test_y = test_dataset[:]
            with torch.no_grad():
                test_pred = self.predict(test_x)
                test_acc = test_pred.eq(test_y).float().mean().item()
        records.append(
            dict(train_loss=avg_loss, test_acc=test_acc, epoch=epoch + 1))
    self.gp.set_streaming(streaming_state)
    self.eval()
    return records
def test_variational_setUp(self):
    for dtype in [torch.float, torch.double]:
        train_X = torch.rand(10, 1, device=self.device, dtype=dtype)
        train_y = torch.randn(10, 3, device=self.device, dtype=dtype)
        for ty, num_out in [[train_y, 3], [train_y, 1], [None, 3]]:
            batched_model = SingleTaskVariationalGP(
                train_X,
                train_Y=ty,
                num_outputs=num_out,
                learn_inducing_points=False,
            ).to(self.device)
            mll = VariationalELBO(
                batched_model.likelihood, batched_model.model, num_data=10
            )

            with torch.enable_grad():
                loss = -mll(
                    batched_model.likelihood(batched_model(train_X)), train_y
                ).sum()
                loss.backward()

            # ensure that inducing points do not require grad
            model_var_strat = batched_model.model.variational_strategy
            self.assertEqual(
                model_var_strat.base_variational_strategy.inducing_points.grad,
                None,
            )

            # but that the covariance does have a gradient
            self.assertIsNotNone(
                batched_model.model.covar_module.raw_outputscale.grad
            )

            # check that we always have three outputs
            self.assertEqual(batched_model._num_outputs, 3)
            self.assertIsInstance(
                batched_model.likelihood, MultitaskGaussianLikelihood
            )
def update(self, inputs, targets, update_stem=True):
    if self.gp.streaming:
        self.gp.register_streaming_loss()
    inputs = inputs.view(-1, self.stem.input_dim)
    targets = targets.view(-1, self.target_dim)
    self.mll = VariationalELBO(self.gp.likelihood, self.gp, num_data=inputs.size(0),
                               beta=self._prior_beta)
    self.train()
    for _ in range(self.num_update_steps):
        self.optimizer.zero_grad()
        features = self._get_features(inputs)
        features = features if update_stem else features.detach()
        train_dist = self.gp(features)
        targets = self._reshape_targets(targets)
        loss = -self.mll(train_dist, targets).mean()
        loss.backward()
        self.optimizer.step()
    self.eval()
    self._raw_inputs = [torch.cat([*self._raw_inputs, inputs])]
    stem_loss = gp_loss = loss.item()
    return stem_loss, gp_loss
def fit(self, train_data, holdout_data, objective='elbo', max_epochs=None, normalize=True,
        early_stopping=False, pretrain=False, reinit_inducing_loc=False, verbose=False,
        max_steps=None, **kwargs):
    """
    Train the model on `dataset` by maximizing either the `VariationalELBO`
    or `PredictiveLogLikelihood` objective.

    Args:
        train_data (tuple of np.array objects)
        holdout_data (tuple of np.array objects)
        objective (str): "pll" or "elbo"
        max_epochs (int): max number of epochs to train
        normalize (bool): if True, z-score inputs and targets
        early_stopping (bool): If True, use holdout loss as convergence criterion.
            Requires holdout_ratio > 0.
        pretrain (bool): If True, pretrain the feature extractor with the MSE objective.
            Requires self.feature_dim == self.label_dim.
        reinit_inducing_loc (bool): If True, initialize inducing points with k-means.
        max_steps (int)
        verbose (bool)
    Return:
        metrics (dict)
    """
    train_data = torch.utils.data.TensorDataset(
        torch.tensor(train_data[0], dtype=torch.get_default_dtype()),
        torch.tensor(train_data[1], dtype=torch.get_default_dtype()))
    holdout_data = torch.utils.data.TensorDataset(
        torch.tensor(holdout_data[0], dtype=torch.get_default_dtype()),
        torch.tensor(holdout_data[1], dtype=torch.get_default_dtype()))

    if objective == 'elbo':
        obj_fn = VariationalELBO(self.likelihood, self, num_data=len(train_data))
    elif objective == 'pll':
        obj_fn = PredictiveLogLikelihood(self.likelihood, self, num_data=len(train_data),
                                         beta=1e-3)
    else:
        raise RuntimeError("unrecognized model objective")

    if holdout_data and early_stopping:
        val_x, val_y = holdout_data[:]
        eval_loss, eval_mse = self._get_val_metrics(obj_fn, torch.nn.MSELoss(), val_x, val_y)
        if eval_loss != eval_loss or not early_stopping:
            snapshot_loss = 1e6
        else:
            snapshot_loss = eval_loss
        snapshot = (0, snapshot_loss)
        if verbose:
            print(f"[ SVGP ] initial holdout loss: {eval_loss:.4f}, MSE: {eval_mse:.4f}")

    self.load_state_dict(self._train_ckpt)
    if normalize:
        train_inputs, train_labels = train_data[:]
        self.input_mean, self.input_std = train_inputs.mean(0), train_inputs.std(0)
        self.label_mean, self.label_std = train_labels.mean(0), train_labels.std(0)
        train_data = TensorDataset(train_inputs,
                                   (train_labels - self.label_mean) / self.label_std)

    if pretrain:
        if self.feature_dim == self.label_dim:
            if verbose:
                print("[ SVGP ] pretraining feature extractor")
            self.nn.fit(
                dataset=train_data,
                holdout_ratio=0.,
                early_stopping=False,
            )
        else:
            raise RuntimeError("features and labels must be the same size to pretrain")

    if reinit_inducing_loc:
        if verbose:
            print("[ SVGP ] initializing inducing point locations w/ k-means")
        train_inputs, _ = train_data[:]
        self.set_inducing_loc(train_inputs)

    if verbose:
        print(f"[ SVGP ] training w/ objective {objective} on {len(train_data)} examples")

    optimizer = Adam(self.optim_param_groups)
    if reinit_inducing_loc:
        temp = self.max_epochs_since_update
        self.max_epochs_since_update = 8
        loop_metrics, snapshot = self._training_loop(
            train_data,
            holdout_data,
            optimizer,
            obj_fn,
            snapshot,
            max_epochs,
            early_stopping,
            max_steps,
        )
        metrics = loop_metrics
        self.max_epochs_since_update = temp
        if verbose:
            print("[ SVGP ] dropping learning rate")
        for group in optimizer.param_groups:
            group['lr'] /= 10

    loop_metrics, snapshot = self._training_loop(train_data, holdout_data, optimizer,
                                                 obj_fn, snapshot, max_epochs,
                                                 early_stopping, max_steps)
    if reinit_inducing_loc:
        for key in metrics.keys():
            metrics[key] += loop_metrics[key]
    else:
        metrics = loop_metrics

    self._train_ckpt = deepcopy(self.state_dict())
    self.load_state_dict(self._eval_ckpt)
    self.train()  # TODO investigate GPyTorch load_state_dict bug
    eval_loss, eval_mse = self._get_val_metrics(obj_fn, torch.nn.MSELoss(), val_x, val_y)
    metrics['holdout_mse'] = eval_mse
    metrics['holdout_loss'] = eval_loss
    if verbose:
        print(f"[ SVGP ] holdout loss: {metrics['val_loss'][-1]:.4f}, "
              f"MSE: {metrics['val_mse'][-1]:.4f}")
        print(f"[ SVGP ] loading snapshot from epoch {snapshot[0]}")
        print(f"[ SVGP ] final holdout loss: {eval_loss:.4f}, MSE: {eval_mse:.4f}")
    self.eval()
    return metrics
# Setting shapes
N = len(Y)
data_dim = Y.shape[1]
latent_dim = 12
n_inducing = 25
pca = True

# Model
model = My_GPLVM_Model(N, data_dim, latent_dim, n_inducing, pca=pca)

# Likelihood
likelihood = GaussianLikelihood(batch_shape=model.batch_shape)

# Declaring objective to be optimised along with optimiser
mll = VariationalELBO(likelihood, model, num_data=len(Y))
optimizer = torch.optim.Adam([
    {'params': model.parameters()},
    {'params': likelihood.parameters()},
], lr=0.01)

# Training loop - optimises the objective wrt kernel hypers, variational params and
# inducing inputs using the optimizer provided.
loss_list = []
iterator = trange(10000, leave=True)
batch_size = 100
for i in iterator:
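    # A minimal sketch of one training step, under the assumption that My_GPLVM_Model follows
    # the GPyTorch Bayesian GPLVM tutorial interface: `sample_latent_variable()` draws X from
    # q(X), and `_get_batch_idx()` is a hypothetical mini-batching helper; Y is assumed to be
    # an (N, data_dim) tensor. Adapt to the actual model interface.
    batch_index = model._get_batch_idx(batch_size)
    optimizer.zero_grad()
    sample = model.sample_latent_variable()
    output_batch = model(sample[batch_index])
    loss = -mll(output_batch, Y[batch_index].T).sum()
    loss_list.append(loss.item())
    iterator.set_description(f'Loss: {loss.item():.2f}, iter no: {i}')
    loss.backward()
    optimizer.step()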
model = DeepGaussianProcess(x_train_shape=x_train.shape)
if torch.cuda.is_available():
    model = model.cuda()

# Because deep GPs use some amount of internal sampling (even in the stochastic variational
# setting), we need to handle the objective function (e.g. the ELBO) in a slightly different way.
num_samples = 10
optimizer = torch.optim.Adam([{"params": model.parameters()}], lr=0.1)

# DeepApproximateMLL simply sums the ELBO losses of each layer.
marginal_loglikelihood = DeepApproximateMLL(
    VariationalELBO(model.likelihood, model, x_train.shape[-2]))

n_epochs = 100
for i in range(n_epochs):
    for x_batch, y_batch in train_loader:
        with num_likelihood_samples(num_samples):
            optimizer.zero_grad()
            output = model(x_batch)
            loss = -marginal_loglikelihood(output, y_batch)
            loss.backward()
            optimizer.step()
    print(f"epochs {i}, loss {loss.item()}")

## test and evaluate the model
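# A minimal evaluation sketch, assuming a `test_loader` analogous to `train_loader`
# (the loader name is an assumption, not defined above). Predictions average over the
# likelihood samples drawn internally by the deep GP.
model.eval()
model.likelihood.eval()
squared_errors = []
with torch.no_grad(), num_likelihood_samples(num_samples):
    for x_batch, y_batch in test_loader:
        if torch.cuda.is_available():
            x_batch, y_batch = x_batch.cuda(), y_batch.cuda()
        preds = model.likelihood(model(x_batch))  # predictive distribution per likelihood sample
        mean = preds.mean.mean(0)                 # average out the sample dimension
        squared_errors.append((mean - y_batch) ** 2)
rmse = torch.cat(squared_errors).mean().sqrt()
print(f"test RMSE: {rmse.item():.4f}")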
def main():
    # Seed the random number generators
    np.random.seed(0)
    torch.manual_seed(0)

    # Create some toy data
    n = 500
    x = np.sort(np.random.uniform(0, 1, n))
    f = true_f(x)
    y = scipy.stats.bernoulli.rvs(scipy.special.expit(f))

    ## Uncomment to show raw data
    # plt.scatter(x, y, alpha=0.5)
    # plt.xlabel('$x$')
    # plt.ylabel('$y$')
    # plt.yticks([0, 1])
    # plt.show()

    ## Uncomment to show logits ("f")
    # fig, ax = plt.subplots()
    # x_plot = np.linspace(0, 1, 100)
    # ax.plot(x_plot, true_f(x_plot), alpha=0.5)
    # ax.scatter(x, f, alpha=0.5)
    # plt.show()

    train_x = torch.from_numpy(x.astype(np.float32))
    train_y = torch.from_numpy(y.astype(np.float32))

    # Set initial inducing points
    inducing_points = torch.rand(50)

    # Initialize model and likelihood
    model = GPClassificationModel(inducing_points=inducing_points)
    likelihood = BernoulliLikelihood()

    # Set number of epochs
    training_iter = 1000

    # Use the adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    # num_data refers to the number of training datapoints
    mll = VariationalELBO(likelihood, model, train_y.numel())

    iterator = tqdm(range(training_iter))
    for _ in iterator:
        # Zero backpropped gradients from previous iteration
        optimizer.zero_grad()
        # Get predictive output
        output = model(train_x)
        # Calc loss and backprop gradients
        loss = -mll(output, train_y)
        loss.backward()
        optimizer.step()
        iterator.set_postfix(loss=loss.item())

    # Show results
    test_x = torch.linspace(0, 1, 101)
    f_preds = model(test_x)
    pred = f_preds.sample(torch.Size((1000,))).numpy()
    fig, ax = plt.subplots()
    plot_gp_dist(ax, pred, test_x)
    ax.plot(test_x, true_f(test_x), alpha=0.5)
    plt.show()
def fit(self, inputs, targets, covariances=None, n_samples=5000, max_iter=10000,
        learning_rate=1e-3, rtol=1e-4, ntol=100, auto=True, verbose=True):
    '''
    Optimises the hyperparameters of the GP kernel and likelihood.

    inputs: (nx2) numpy array
    targets: (n,) numpy array
    n_samples: number of samples to take from the inputs/targets at every optimisation epoch
    max_iter: maximum number of optimisation epochs
    learning_rate: optimiser step size
    rtol: change in -MLL values over ntol epochs that determines termination if auto==True
    ntol: number of epochs required to maintain rtol in order to terminate if auto==True
    auto: if True terminate based on rtol and ntol, else terminate at max_iter
    verbose: if True show progress bar, else nothing
    '''
    # sanity
    assert inputs.shape[0] == targets.shape[0]
    assert inputs.shape[1] == 2
    if covariances is not None:
        assert covariances.shape[0] == inputs.shape[0]
        assert covariances.shape[1] == covariances.shape[2] == 2

    # inducing points randomly distributed over data
    indpts = np.random.choice(inputs.shape[0], self.m, replace=True)
    self.variational_strategy.inducing_points.data = torch.from_numpy(
        inputs[indpts]).to(self.device).float()

    # number of random samples
    n = inputs.shape[0]
    n = n_samples if n >= n_samples else n

    # objective
    mll = VariationalELBO(self.likelihood, self, n, combine_terms=True)

    # stochastic optimiser
    opt = torch.optim.Adam(self.parameters(), lr=learning_rate)

    # convergence criterion
    if auto:
        criterion = ExpMAStoppingCriterion(rel_tol=rtol, n_window=ntol)

    # epoch iterator
    epochs = range(max_iter)
    epochs = tqdm.tqdm(epochs) if verbose else epochs

    # train
    self.train()
    self.likelihood.train()
    for _ in epochs:
        # randomly sample from the dataset
        idx = np.random.choice(inputs.shape[0], n, replace=False)
        input = torch.from_numpy(inputs[idx]).to(self.device).float()
        # if the inputs are distributional, sample them
        if covariances is not None:
            covariance = torch.from_numpy(covariances[idx]).to(self.device).float()
            input = MultivariateNormal(input, covariance).rsample()
        # training targets
        target = torch.from_numpy(targets[idx]).to(self.device).float()
        # compute loss, compute gradient, and update
        loss = -mll(self(input), target)
        opt.zero_grad()
        loss.backward()
        opt.step()
        # verbosity and convergence check
        if verbose:
            epochs.set_description('Loss {:.4f}'.format(loss.item()))
        if auto and criterion.evaluate(loss.detach()):
            break
if torch.cuda.is_available():
    model = model.cuda()
    likelihood = likelihood.cuda()

n_epochs = 5
lr = 0.1
optimizer = SGD([
    {'params': model.feature_extractor.parameters(), 'weight_decay': 1e-4},
    {'params': model.gp_layer.hyperparameters(), 'lr': lr * 0.01},
    {'params': model.variational_parameters()},
    {'params': likelihood.parameters()},
], lr=lr, momentum=0.9, nesterov=True, weight_decay=0)
scheduler = MultiStepLR(optimizer, milestones=[0.5 * n_epochs, 0.75 * n_epochs], gamma=0.1)
mll = VariationalELBO(likelihood, model.gp_layer, num_data=len(train_loader.dataset))


def train(epoch):
    model.train()
    likelihood.train()

    minibatch_iter = tqdm(train_loader, desc=f"(Epoch {epoch}) Minibatch")
    with num_likelihood_samples(8):
        for data, target in minibatch_iter:
            if torch.cuda.is_available():
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = -mll(output, target)
            loss.backward()
            optimizer.step()
def main(args):
    if args.cuda and torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    init_dict, train_dict, test_dict = prepare_data(args.data_loc, args.num_init,
                                                    args.num_total, test_is_year=False,
                                                    seed=args.seed)
    init_x, init_y, init_y_var = (
        init_dict["x"].to(device),
        init_dict["y"].to(device),
        init_dict["y_var"].to(device),
    )
    train_x, train_y, train_y_var = (
        train_dict["x"].to(device),
        train_dict["y"].to(device),
        train_dict["y_var"].to(device),
    )
    test_x, test_y, test_y_var = (
        test_dict["x"].to(device),
        test_dict["y"].to(device),
        test_dict["y_var"].to(device),
    )

    likelihood = FixedNoiseGaussianLikelihood(noise=init_y_var)
    grid_pts = create_grid(grid_sizes=[30, 30],
                           grid_bounds=torch.tensor([[0., 1.], [0., 1.]]))
    induc_points = torch.cat([x.reshape(-1, 1) for x in torch.meshgrid(grid_pts)], dim=-1)
    model = VariationalGPModel(
        inducing_points=induc_points,
        mean_module=gpytorch.means.ZeroMean(),
        covar_module=ScaleKernel(
            MaternKernel(
                ard_num_dims=2,
                nu=0.5,
                lengthscale_prior=GammaPrior(3.0, 6.0),
            ),
            outputscale_prior=GammaPrior(2.0, 0.15),
        ),
        streaming=True,
        likelihood=likelihood,
        beta=args.beta,
        learn_inducing_locations=args.learn_inducing,
    ).to(device)
    mll = VariationalELBO(model.likelihood, model, beta=args.beta, num_data=args.num_init)

    print("---- Fitting initial model ----")
    start = time.time()
    model.train()
    model.zero_grad()
    optimizer = torch.optim.Adam(model.parameters(), lr=10 * args.lr_init)
    model, loss = fit_variational_model(mll, model, optimizer, init_x, init_y, maxiter=1000)
    end = time.time()
    print("Elapsed fitting time: ", end - start)

    print("--- Now computing initial RMSE")
    model.eval()
    with gpytorch.settings.skip_posterior_variances(True):
        test_pred = model(test_x)
        pred_rmse = ((test_pred.mean - test_y) ** 2).mean().sqrt()
    print("---- Initial RMSE: ", pred_rmse.item())

    all_outputs = []
    start_ind = init_x.shape[0]
    end_ind = int(start_ind + args.batch_size)
    current_x = init_x
    current_y = init_y
    current_y_var = init_y_var

    for step in range(args.num_steps):
        if step > 0 and step % 25 == 0:
            print("Beginning step ", step)

        total_time_step_start = time.time()

        if step > 0:
            print("---- Fitting model ----")
            start = time.time()
            model.train()
            model.zero_grad()
            model.likelihood = FixedNoiseGaussianLikelihood(current_y_var)
            mll = VariationalELBO(model.likelihood, model, beta=args.beta,
                                  num_data=args.num_init)
            optimizer = torch.optim.Adam(model.parameters(), lr=args.lr_init * 0.99 ** step)
            model, loss = fit_variational_model(mll, model, optimizer, current_x, current_y,
                                                maxiter=300)
            model.zero_grad()
            end = time.time()
            print("Elapsed fitting time: ", end - start)
            # print("Named parameters: ", list(model.named_parameters()))

        if args.acqf == "max_post_var" and not args.random:
            candidates, acq_value = generate_candidates(model, args.batch_size, device,
                                                        maxiter=300)
        elif args.acqf == "max_test_var" and not args.random:
            model.eval()
            vals, inds = model(test_x).variance.sort()
            acq_value = vals[-args.batch_size:].mean().detach()
            candidates = test_x[inds[-args.batch_size:]]
        else:
            candidates = torch.rand(args.batch_size, train_x.shape[-1], device=device,
                                    dtype=train_x.dtype)
            acq_value = torch.zeros(1)

        model.eval()
        _ = model(test_x[:10])  # to init caches

        print("---- Finished optimizing; now querying dataset ---- ")
        with torch.no_grad():
            covar_dists = model.covar_module(candidates, train_x)
            nearest_points = covar_dists.evaluate().argmax(dim=-1)
            new_x = train_x[nearest_points]
            new_y = train_y[nearest_points]
            new_y_var = train_y_var[nearest_points]
            todrop = torch.tensor([x in nearest_points for x in range(train_x.shape[0])])
            train_x, train_y, train_y_var = (
                train_x[~todrop], train_y[~todrop], train_y_var[~todrop])
            print("New train_x shape", train_x.shape)

        print("--- Now updating model with simulator ----")
        current_x = torch.cat((current_x, new_x), dim=0)
        current_y = torch.cat((current_y, new_y), dim=0)
        current_y_var = torch.cat((current_y_var, new_y_var), dim=0)

        print("--- Now computing updated RMSE")
        model.eval()
        test_pred = model(test_x)
        pred_rmse = ((test_pred.mean.view(-1) - test_y.view(-1)) ** 2).mean().sqrt()
        pred_avg_variance = test_pred.variance.mean()

        total_time_step_elapsed_time = time.time() - total_time_step_start
        step_output_list = [
            total_time_step_elapsed_time,
            acq_value.item(),
            pred_rmse.item(),
            pred_avg_variance.item(),
            loss.item(),
        ]
        print("Step RMSE: ", pred_rmse)
        all_outputs.append(step_output_list)

        start_ind = end_ind
        end_ind = int(end_ind + args.batch_size)

    output_dict = {
        "model_state_dict": model.cpu().state_dict(),
        "queried_points": {
            'x': current_x,
            'y': current_y,
        },
        "results": DataFrame(all_outputs),
    }
    torch.save(output_dict, args.output)
NUM_OUTPUT_DIMS = 1  # the output/input dimension between the hidden layer and the last layer

# Control shapes so the DGP sees correct input dimensions.
# x should have shape [num_training_points, dimension]
print("x.shape:", x.shape)          # [52, 1]
print("x.shape[0]:", x.shape[0])    # 52, num_training_points
print("x.shape[-1]:", x.shape[-1])  # 1, input dimension
print("x.shape[-2]:", x.shape[-2])  # 52, num_training_points

# Initialize model with x.shape; requires data of the form [num_training_points, dimension]
model = DGP(x.shape)

# Set up the optimizer; Adam is generally the best choice, with an adaptive learning rate etc.
opt = torch.optim.Adam([{'params': model.parameters()}], lr=LEARNING_RATE)

# Set up the marginal likelihood approximation, using the DeepApproximateMLL wrapper.
# The VariationalELBO needs to know the number of training points, i.e. 52.
mll = DeepApproximateMLL(VariationalELBO(model.likelihood, model, x.shape[-2]))

# Set model and likelihood to training mode. For some implementations the 'grad()'
# feature must be turned on and off depending on training or evaluation.
model.train()
model.likelihood.train()

lls = []  # to save the loss

# Train by iterating and updating the "weights" and parameters.
gpytorch.settings.skip_posterior_variances(state=False)  # this is the default (False), but just to be sure
for i in range(EPOCHS):
    # Train with a chosen number of likelihood samples by specifying SAMPLES; otherwise the default of 10 is used.
    with gpytorch.settings.num_likelihood_samples(SAMPLES):
        opt.zero_grad()
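        # A minimal sketch of the remainder of the step, mirroring the equivalent loop in the
        # `optimize` method shown earlier; `y` is assumed to hold the training targets for `x`.
        output = model(x)
        loss = -mll(output, y)
        loss.backward()
        opt.step()
        lls.append(loss.item())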
# Declaring model with initial inducing inputs and latent prior
latent_prior = NormalPrior(X_prior_mean, torch.ones_like(X_prior_mean))
model = GPLVM(Y=Y.T, latent_dim=n_latent_dims, n_inducing=n_inducing, X_init=None,
              pca=True, latent_prior=None, kernel=None, likelihood=None)

# Declaring objective to be optimised along with optimiser
mll = VariationalELBO(model.likelihood, model, num_data=len(Y.T))
optimizer = torch.optim.Adam([
    {'params': model.parameters()},
], lr=0.01)

# Training loop
losses = model.run(mll, optimizer, steps=2000)

# Plot result
plt.figure(figsize=(8, 6))
colors = plt.get_cmap("tab10").colors[::]
def main(hparams):
    results_dir = get_results_directory(hparams.output_dir)
    writer = SummaryWriter(log_dir=str(results_dir))

    ds = get_dataset(hparams.dataset, root=hparams.data_root)
    input_size, num_classes, train_dataset, test_dataset = ds

    hparams.seed = set_seed(hparams.seed)

    if hparams.n_inducing_points is None:
        hparams.n_inducing_points = num_classes

    print(f"Training with {hparams}")
    hparams.save(results_dir / "hparams.json")

    if hparams.ard:
        # Hardcoded to WRN output size
        ard = 640
    else:
        ard = None

    feature_extractor = WideResNet(
        spectral_normalization=hparams.spectral_normalization,
        dropout_rate=hparams.dropout_rate,
        coeff=hparams.coeff,
        n_power_iterations=hparams.n_power_iterations,
        batchnorm_momentum=hparams.batchnorm_momentum,
    )

    initial_inducing_points, initial_lengthscale = initial_values_for_GP(
        train_dataset, feature_extractor, hparams.n_inducing_points
    )

    gp = GP(
        num_outputs=num_classes,
        initial_lengthscale=initial_lengthscale,
        initial_inducing_points=initial_inducing_points,
        separate_inducing_points=hparams.separate_inducing_points,
        kernel=hparams.kernel,
        ard=ard,
        lengthscale_prior=hparams.lengthscale_prior,
    )

    model = DKL_GP(feature_extractor, gp)
    model = model.cuda()

    likelihood = SoftmaxLikelihood(num_classes=num_classes, mixing_weights=False)
    likelihood = likelihood.cuda()

    elbo_fn = VariationalELBO(likelihood, gp, num_data=len(train_dataset))

    parameters = [
        {"params": feature_extractor.parameters(), "lr": hparams.learning_rate},
        {"params": gp.parameters(), "lr": hparams.learning_rate},
        {"params": likelihood.parameters(), "lr": hparams.learning_rate},
    ]
    optimizer = torch.optim.SGD(
        parameters, momentum=0.9, weight_decay=hparams.weight_decay
    )
    milestones = [60, 120, 160]
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=milestones, gamma=0.2
    )

    def step(engine, batch):
        model.train()
        likelihood.train()

        optimizer.zero_grad()

        x, y = batch
        x, y = x.cuda(), y.cuda()

        y_pred = model(x)
        elbo = -elbo_fn(y_pred, y)

        elbo.backward()
        optimizer.step()

        return elbo.item()

    def eval_step(engine, batch):
        model.eval()
        likelihood.eval()

        x, y = batch
        x, y = x.cuda(), y.cuda()

        with torch.no_grad():
            y_pred = model(x)

        return y_pred, y

    trainer = Engine(step)
    evaluator = Engine(eval_step)

    metric = Average()
    metric.attach(trainer, "elbo")

    def output_transform(output):
        y_pred, y = output
        # Sample softmax values independently for classification at test time
        y_pred = y_pred.to_data_independent_dist()
        # The mean here is over likelihood samples
        y_pred = likelihood(y_pred).probs.mean(0)
        return y_pred, y

    metric = Accuracy(output_transform=output_transform)
    metric.attach(evaluator, "accuracy")

    metric = Loss(lambda y_pred, y: -elbo_fn(y_pred, y))
    metric.attach(evaluator, "elbo")

    kwargs = {"num_workers": 4, "pin_memory": True}
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=hparams.batch_size,
        shuffle=True,
        drop_last=True,
        **kwargs,
    )
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=512, shuffle=False, **kwargs
    )

    @trainer.on(Events.EPOCH_COMPLETED)
    def log_results(trainer):
        metrics = trainer.state.metrics
        elbo = metrics["elbo"]
        print(f"Train - Epoch: {trainer.state.epoch} ELBO: {elbo:.2f} ")
        writer.add_scalar("Likelihood/train", elbo, trainer.state.epoch)

        if hparams.spectral_normalization:
            for name, layer in model.feature_extractor.named_modules():
                if isinstance(layer, torch.nn.Conv2d):
                    writer.add_scalar(
                        f"sigma/{name}", layer.weight_sigma, trainer.state.epoch
                    )

        if not hparams.ard:
            # Otherwise it's too much to submit to tensorboard
            length_scales = model.gp.covar_module.base_kernel.lengthscale.squeeze()
            for i in range(length_scales.shape[0]):
                writer.add_scalar(
                    f"length_scale/{i}", length_scales[i], trainer.state.epoch
                )

        if trainer.state.epoch > 150 and trainer.state.epoch % 5 == 0:
            _, auroc, aupr = get_ood_metrics(
                hparams.dataset, "SVHN", model, likelihood, hparams.data_root
            )
            print(f"OoD Metrics - AUROC: {auroc}, AUPR: {aupr}")
            writer.add_scalar("OoD/auroc", auroc, trainer.state.epoch)
            writer.add_scalar("OoD/auprc", aupr, trainer.state.epoch)

        evaluator.run(test_loader)
        metrics = evaluator.state.metrics
        acc = metrics["accuracy"]
        elbo = metrics["elbo"]

        print(
            f"Test - Epoch: {trainer.state.epoch} "
            f"Acc: {acc:.4f} "
            f"ELBO: {elbo:.2f} "
        )

        writer.add_scalar("Likelihood/test", elbo, trainer.state.epoch)
        writer.add_scalar("Accuracy/test", acc, trainer.state.epoch)

        scheduler.step()

    pbar = ProgressBar(dynamic_ncols=True)
    pbar.attach(trainer)

    trainer.run(train_loader, max_epochs=200)

    # Done training - time to evaluate
    results = {}

    evaluator.run(train_loader)
    train_acc = evaluator.state.metrics["accuracy"]
    train_elbo = evaluator.state.metrics["elbo"]
    results["train_accuracy"] = train_acc
    results["train_elbo"] = train_elbo

    evaluator.run(test_loader)
    test_acc = evaluator.state.metrics["accuracy"]
    test_elbo = evaluator.state.metrics["elbo"]
    results["test_accuracy"] = test_acc
    results["test_elbo"] = test_elbo

    _, auroc, aupr = get_ood_metrics(
        hparams.dataset, "SVHN", model, likelihood, hparams.data_root
    )
    results["auroc_ood_svhn"] = auroc
    results["aupr_ood_svhn"] = aupr

    print(f"Test - Accuracy {results['test_accuracy']:.4f}")

    results_json = json.dumps(results, indent=4, sort_keys=True)
    (results_dir / "results.json").write_text(results_json)

    torch.save(model.state_dict(), results_dir / "model.pt")
    torch.save(likelihood.state_dict(), results_dir / "likelihood.pt")

    writer.close()
def fit(self, epochs=75, train_loader=None, save_path=None, val_loader=None):
    initial_inducing_points, initial_lengthscale = initial_values_for_GP(
        train_loader.dataset, self.feature_extractor, self.n_inducing_points)

    self.gp = GP(
        num_outputs=self.num_classes,
        initial_lengthscale=initial_lengthscale,
        initial_inducing_points=initial_inducing_points,
        separate_inducing_points=self.separate_inducing_points,
        kernel=self.kernel,
        ard=self.ard,
        lengthscale_prior=self.lengthscale_prior,
    )
    self.model = DKL_GP(self.feature_extractor, self.gp)
    self.model.to(self.device)

    self.likelihood = SoftmaxLikelihood(num_classes=10, mixing_weights=False)
    self.likelihood = self.likelihood.to(self.device)
    self.elbo_fn = VariationalELBO(self.likelihood, self.gp,
                                   num_data=len(train_loader.dataset))

    parameters = [
        {"params": self.feature_extractor.parameters(), "lr": self.learning_rate},
        {"params": self.gp.parameters(), "lr": self.learning_rate},
        {"params": self.likelihood.parameters(), "lr": self.learning_rate},
    ]
    self.optimizer = torch.optim.SGD(parameters, momentum=0.9,
                                     weight_decay=self.weight_decay)
    self.scheduler = torch.optim.lr_scheduler.MultiStepLR(
        self.optimizer, milestones=[25, 50, 75], gamma=0.2)

    self.model.train()
    for epoch in tqdm(range(epochs)):
        running_loss = 0
        for i, (x, y) in enumerate(train_loader):
            self.model.train()
            self.optimizer.zero_grad()
            x, y = x.to(self.device), y.to(self.device)
            y_pred = self.model(x)
            elbo = -self.elbo_fn(y_pred, y)
            running_loss += elbo.item()
            elbo.backward()
            self.optimizer.step()
            if i % 50 == 0:
                print("Iteration: {}, Loss = {}".format(i, running_loss / (i + 1)))

        if epoch % 1 == 0 and val_loader is not None:
            self.model.eval()
            test_loss = 0
            correct = 0
            total = 0
            with torch.no_grad():
                for batch_idx, (inputs, targets) in enumerate(val_loader):
                    inputs, y = inputs.to(self.device), F.one_hot(
                        targets, self.num_classes).float().to(self.device)
                    y_pred = self.model(inputs).to_data_independent_dist()
                    output = self.likelihood(y_pred).probs.mean(0)
                    predicted = torch.argmax(output, dim=1)
                    loss = -self.likelihood.expected_log_prob(y, y_pred).mean()
                    test_loss += loss.item()
                    targets = targets.to(self.device)
                    total += targets.size(0)
                    correct += predicted.eq(targets.to(self.device)).sum().item()
            acc = 100. * correct / total
            print("Epoch: {}, test acc: {}, test loss {}".format(
                epoch, acc, test_loss / total))
        self.scheduler.step()

    if save_path is not None:
        self.save(save_path)