def kl_divergence(self):
    """Compute the KL divergence between the variational distribution q(u) and the prior p(u)."""
    variational_dist_u = self.variational_distribution.variational_distribution
    prior_dist = self.prior_distribution
    with settings.max_preconditioner_size(0):
        kl_divergence = torch.distributions.kl.kl_divergence(
            variational_dist_u, prior_dist)
    return kl_divergence
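# A minimal sketch (not part of the original class) of how kl_divergence()
# typically enters a variational ELBO during training. It assumes a
# gpytorch-style likelihood exposing expected_log_prob; `model`, `likelihood`,
# and `num_data` are illustrative names, and the batch rescaling is the usual
# minibatch-ELBO convention rather than anything shown above.
def negative_elbo_sketch(model, likelihood, x_batch, y_batch, num_data):
    output = model(x_batch)  # variational posterior q(f|x)
    # Expected log-likelihood E_q[log p(y|f)], rescaled from batch to dataset
    log_lik = likelihood.expected_log_prob(y_batch, output).sum()
    log_lik = log_lik * (num_data / y_batch.size(0))
    kl = model.kl_divergence().sum()  # KL(q(u) || p(u)) from the method above
    return -(log_lik - kl)  # minimize this with an optimizer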
def predict(self, input):
    input = transform(input.reshape((-1, self.input_size)), self.input_trans)

    with max_preconditioner_size(10), torch.no_grad():
        with max_root_decomposition_size(30), fast_pred_var():
            output = self.likelihood(self.model(input)).mean

    output = inverse_transform(output, self.target_trans)

    if self.incremental:
        # In incremental mode the model predicts an offset relative to the
        # first target_size dimensions of the input
        return input[..., :self.target_size] + output
    else:
        return output
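# Hedged sketch, not part of the original class: the `incremental` branch
# above returns input-slice + predicted offset, the usual setup for one-step
# dynamics models, which suggests multi-step rollouts by feeding predictions
# back in. `predictor`, `init_state`, and `actions` are illustrative names
# only, under the assumption that the input is a state/action concatenation.
import torch

def rollout_sketch(predictor, init_state, actions):
    """Roll a delta-predicting model forward by chaining its one-step outputs."""
    states = [init_state]
    for action in actions:
        # Condition on the current state concatenated with the applied action
        inp = torch.cat([states[-1], action], dim=-1)
        states.append(predictor.predict(inp))
    return torch.stack(states)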
def predict(self, input):
    self.device = torch.device('cpu')
    self.model.eval().to(self.device)
    self.likelihood.eval().to(self.device)

    input = transform(torch.reshape(input, (-1, self.input_size)),
                      self.input_trans)

    with max_preconditioner_size(10), torch.no_grad():
        with max_root_decomposition_size(30), fast_pred_var():
            output = self.likelihood(self.model(input)).mean

    # inverse_transform operates on 2-D arrays: add a column dim for the
    # scalar target, then squeeze it back out
    output = inverse_transform(output[:, None], self.target_trans).squeeze()
    return output
def predict(self, input):
    self.device = torch.device('cpu')
    self.model.eval().to(self.device)
    self.likelihood.eval().to(self.device)

    input = transform(input.reshape((-1, self.input_size)), self.input_trans)

    with max_preconditioner_size(10), torch.no_grad():
        with max_root_decomposition_size(30), fast_pred_var():
            # One copy of the input per output dimension of the model list
            _input = [input for _ in range(self.target_size)]
            predictions = self.likelihood(*self.model(*_input))
            output = torch.stack([_pred.mean for _pred in predictions]).T

    output = inverse_transform(output, self.target_trans).squeeze()
    return output
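# Hedged sketch of the model structure the predict() above appears to assume:
# one independent GP per output dimension, wrapped so that model(*inputs) and
# likelihood(*outputs) fan out across the list. In gpytorch this pattern is
# typically built with IndependentModelList / LikelihoodList; the
# single-output `ExactGPModel` class is an assumption, not shown here.
from gpytorch.models import IndependentModelList
from gpytorch.likelihoods import GaussianLikelihood, LikelihoodList

def build_multioutput_model_sketch(train_x, train_y, target_size):
    models, likelihoods = [], []
    for i in range(target_size):
        lik = GaussianLikelihood()
        # One GP per target column; ExactGPModel is a hypothetical class
        models.append(ExactGPModel(train_x, train_y[:, i], lik))
        likelihoods.append(lik)
    return IndependentModelList(*models), LikelihoodList(*likelihoods)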
def forward(self, x):
    """Forward propagate the module.

    This method determines how to marginalize out the inducing function
    values. Specifically, forward defines how to transform a variational
    distribution over the inducing point values, q(u), into a variational
    distribution over the function values at specified locations x, q(f|x),
    by integrating p(f|x, u) q(u) du.

    Parameters
    ----------
    x (torch.tensor):
        Locations x to get the variational posterior of the function
        values at.

    Returns
    -------
    The distribution q(f|x)
    """
    variational_dist = self.variational_distribution.approx_variational_distribution
    inducing_points = self.inducing_points
    inducing_batch_shape = inducing_points.shape[:-2]
    if inducing_batch_shape < x.shape[:-2] or len(
            inducing_batch_shape) < len(x.shape[:-2]):
        batch_shape = _mul_broadcast_shape(inducing_points.shape[:-2],
                                           x.shape[:-2])
        inducing_points = inducing_points.expand(
            *batch_shape, *inducing_points.shape[-2:])
        x = x.expand(*batch_shape, *x.shape[-2:])
        variational_dist = variational_dist.expand(batch_shape)

    # If our points equal the inducing points, we're done
    if torch.equal(x, inducing_points):
        return variational_dist

    # Otherwise, we have to marginalize
    else:
        num_induc = inducing_points.size(-2)
        full_inputs = torch.cat([inducing_points, x], dim=-2)
        full_output = self.model.forward(full_inputs)
        full_mean, full_covar = full_output.mean, full_output.lazy_covariance_matrix

        # Mean terms
        test_mean = full_mean[..., num_induc:]
        induc_mean = full_mean[..., :num_induc]
        mean_diff = (variational_dist.mean - induc_mean).unsqueeze(-1)

        # Covariance terms
        induc_induc_covar = full_covar[
            ..., :num_induc, :num_induc].add_jitter()
        induc_data_covar = full_covar[..., :num_induc, num_induc:].evaluate()
        data_data_covar = full_covar[..., num_induc:, num_induc:]
        aux = variational_dist.lazy_covariance_matrix.root_decomposition()
        root_variational_covar = aux.root.evaluate()

        # If we had to expand the inducing points, shrink the inducing mean
        # and induc_induc_covar dimension.
        # This makes everything more computationally efficient
        if len(inducing_batch_shape) < len(induc_induc_covar.batch_shape):
            index = tuple(0 for _ in range(
                len(induc_induc_covar.batch_shape) - len(inducing_batch_shape)))
            repeat_size = torch.Size(
                tuple(induc_induc_covar.batch_shape[:len(index)])
                + tuple(1 for _ in induc_induc_covar.batch_shape[len(index):]))
            induc_induc_covar = BatchRepeatLazyTensor(
                induc_induc_covar.__getitem__(index), repeat_size)

        # If we're less than a certain size, we'll compute the Cholesky
        # decomposition of induc_induc_covar
        cholesky = False
        if settings.fast_computations.log_prob.off() or (
                num_induc <= settings.max_cholesky_size.value()):
            induc_induc_covar = CholLazyTensor(induc_induc_covar.cholesky())
            cholesky = True

        # If we are making predictions and don't need variances,
        # we can do things very quickly.
        if not self.training and settings.skip_posterior_variances.on():
            if not hasattr(self, "_mean_cache"):
                self._mean_cache = induc_induc_covar.inv_matmul(
                    mean_diff).detach()
            predictive_mean = torch.add(
                test_mean,
                induc_data_covar.transpose(-2, -1).matmul(
                    self._mean_cache).squeeze(-1))
            predictive_covar = ZeroLazyTensor(test_mean.size(-1),
                                              test_mean.size(-1))
            return MultivariateNormal(predictive_mean, predictive_covar)

        # Cache the CG results.
        # For now: run variational inference without a preconditioner;
        # the preconditioner screws things up for some reason.
        with settings.max_preconditioner_size(0):
            left_tensors = torch.cat([mean_diff, root_variational_covar], -1)
            with torch.no_grad():
                eager_rhs = torch.cat([left_tensors, induc_data_covar], -1)
                solve, probe_vecs, probe_vec_norms, probe_vec_solves, tmats = \
                    CachedCGLazyTensor.precompute_terms(
                        induc_induc_covar, eager_rhs.detach(),
                        logdet_terms=(not cholesky),
                        include_tmats=(not settings.skip_logdet_forward.on()
                                       and not cholesky))
                eager_rhss = [
                    eager_rhs.detach(),
                    eager_rhs[..., left_tensors.size(-1):].detach(),
                    eager_rhs[..., :left_tensors.size(-1)].detach()
                ]
                solves = [
                    solve.detach(),
                    solve[..., left_tensors.size(-1):].detach(),
                    solve[..., :left_tensors.size(-1)].detach()
                ]
                if settings.skip_logdet_forward.on():
                    eager_rhss.append(
                        torch.cat([probe_vecs, left_tensors], -1))
                    solves.append(
                        torch.cat([
                            probe_vec_solves,
                            solve[..., :left_tensors.size(-1)]
                        ], -1))

            induc_induc_covar = CachedCGLazyTensor(
                induc_induc_covar,
                eager_rhss=eager_rhss,
                solves=solves,
                probe_vectors=probe_vecs,
                probe_vector_norms=probe_vec_norms,
                probe_vector_solves=probe_vec_solves,
                probe_vector_tmats=tmats,
            )

        if self.training:
            self._memoize_cache["prior_distribution_memo"] = \
                MultivariateNormal(induc_mean, induc_induc_covar)

        # Compute predictive mean/covariance
        inv_products = induc_induc_covar.inv_matmul(
            induc_data_covar, left_tensors.transpose(-1, -2))
        predictive_mean = torch.add(test_mean, inv_products[..., 0, :])
        predictive_covar = RootLazyTensor(
            inv_products[..., 1:, :].transpose(-1, -2))

        if self.training:
            interp_data_data_var, _ = induc_induc_covar.inv_quad_logdet(
                induc_data_covar, logdet=False, reduce_inv_quad=False)
            data_covariance = DiagLazyTensor(
                (data_data_covar.diag() - interp_data_data_var).clamp(
                    0, math.inf))
        else:
            neg_induc_data_data_covar = torch.matmul(
                induc_data_covar.transpose(-1, -2).mul(-1),
                induc_induc_covar.inv_matmul(induc_data_covar))
            data_covariance = data_data_covar + neg_induc_data_data_covar

        predictive_covar = PsdSumLazyTensor(predictive_covar, data_covariance)
        return MultivariateNormal(predictive_mean, predictive_covar)
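# Hedged sketch of what the lazy-tensor machinery above computes, written with
# dense tensors for readability. Assuming a Gaussian q(u) = N(m, S) at
# inducing points Z and prior kernel blocks K_zz, K_zx, K_xx with means mu_z,
# mu_x, the predictive q(f|x) follows the standard sparse-GP equations. All
# names here are illustrative, not part of the class above.
import torch

def sparse_gp_predict_sketch(K_zz, K_zx, K_xx, mu_z, mu_x, m, S, jitter=1e-4):
    L = torch.linalg.cholesky(K_zz + jitter * torch.eye(K_zz.size(-1)))
    # alpha = K_zz^{-1} (m - mu_z),   A = K_zz^{-1} K_zx
    alpha = torch.cholesky_solve((m - mu_z).unsqueeze(-1), L)
    A = torch.cholesky_solve(K_zx, L)
    mean = mu_x + (K_zx.transpose(-1, -2) @ alpha).squeeze(-1)
    # cov = K_xx - K_xz K_zz^{-1} K_zx + K_xz K_zz^{-1} S K_zz^{-1} K_zx
    cov = K_xx - K_zx.transpose(-1, -2) @ A + A.transpose(-1, -2) @ S @ A
    return mean, cov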
def main():
    parser = argparse.ArgumentParser(
        description='Deep Kernel Learning with synthetic data.')
    parser.add_argument('--datapath', type=str,
                        help='Path to data directory.')
    parser.add_argument('--batchsize', type=int, default=10,
                        help='Batch size.')
    parser.add_argument('--n_epochs', type=int, default=10,
                        help='Number of epochs.')
    parser.add_argument('--lr', type=float, default=0.1,
                        help='Learning rate.')
    parser.add_argument('--seed', type=int, default=42,
                        help='Random seed.')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='Disables CUDA training.')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    traindata = Synthetic(args.datapath, 'train', download=True)
    train_loader = DataLoader(traindata, batch_size=args.batchsize)
    num_classes = len(np.unique(traindata.targets))

    testdata = Synthetic(args.datapath, 'test')
    test_loader = DataLoader(testdata, batch_size=args.batchsize)

    feature_extractor = ConvFeatureExtractor().to(device)
    num_features = feature_extractor._filter_sum

    model = DKLModel(feature_extractor, num_dim=5).to(device)
    likelihood = SoftmaxLikelihood(num_features=model.num_dim,
                                   n_classes=num_classes).to(device)

    # Use a reduced learning rate for the GP hyperparameters
    optimizer = SGD([
        {'params': model.feature_extractor.parameters()},
        {'params': model.gp_layer.hyperparameters(), 'lr': args.lr * 0.01},
        {'params': model.gp_layer.variational_parameters()},
        {'params': likelihood.parameters()},
    ], lr=args.lr, momentum=0.9, nesterov=True, weight_decay=0)

    # MultiStepLR expects integer epoch milestones
    scheduler = MultiStepLR(
        optimizer,
        milestones=[int(0.5 * args.n_epochs), int(0.75 * args.n_epochs)],
        gamma=0.1)

    for epoch in range(1, args.n_epochs + 1):
        with settings.use_toeplitz(False), settings.max_preconditioner_size(0):
            train(epoch, train_loader, optimizer, likelihood, model, device)
            test(test_loader, likelihood, model, device)
        # Step the scheduler after the epoch's optimizer updates
        scheduler.step()

    state_dict = model.state_dict()
    likelihood_state_dict = likelihood.state_dict()
    torch.save({'model': state_dict, 'likelihood': likelihood_state_dict},
               'dkl_synthetic_checkpoint.dat')
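# `train` and `test` are called above but not defined here. A minimal sketch
# of what they might look like, assuming a gpytorch VariationalELBO over the
# DKL model's GP layer; the bodies below are assumptions, not the original
# implementation.
import gpytorch

def train(epoch, train_loader, optimizer, likelihood, model, device):
    model.train()
    likelihood.train()
    mll = gpytorch.mlls.VariationalELBO(
        likelihood, model.gp_layer, num_data=len(train_loader.dataset))
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        loss = -mll(model(data), target)  # negative ELBO
        loss.backward()
        optimizer.step()

def test(test_loader, likelihood, model, device):
    model.eval()
    likelihood.eval()
    correct = 0
    with torch.no_grad(), gpytorch.settings.num_likelihood_samples(16):
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            # Average class probabilities over likelihood samples, then argmax
            pred = likelihood(model(data)).probs.mean(0).argmax(-1)
            correct += pred.eq(target).sum().item()
    print('Test accuracy: {:.1f}%'.format(
        100. * correct / len(test_loader.dataset)))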
for i in range(training_iterations):
    # Zero backprop gradients
    optimizer.zero_grad()
    # Get output from model
    output = model(x_train)
    # Calc loss and backprop derivatives
    loss = -mll(output, y_train)
    loss.backward()
    print('Iter %d/%d - Loss: %.3f' % (i + 1, training_iterations,
                                       loss.item()))
    optimizer.step()
    torch.cuda.empty_cache()

model.eval()
likelihood.eval()

# Cast to float32 to match the model's parameters; np.linspace yields float64
x_test = torch.from_numpy(np.linspace(1870, 2030, 200)[:, np.newaxis]).float()
x_test = x_test.cuda()

with settings.max_preconditioner_size(10), torch.no_grad():
    with settings.max_root_decomposition_size(30), settings.fast_pred_var():
        f_preds = model(x_test)
        y_pred = likelihood(f_preds)

# plot
with torch.no_grad():
    mean = y_pred.mean.cpu().numpy()
    var = y_pred.variance.cpu().numpy()
    samples = y_pred.sample().cpu().numpy()
    plot_gp(mean, var, x_test.cpu().numpy(),
            X_train=x_train.cpu().numpy(),
            Y_train=y_train.cpu().numpy(),
            samples=samples)
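# `plot_gp` is called above but not defined here. A minimal matplotlib sketch
# with the same call signature; the styling choices (2-sigma band, sample
# traces) are assumptions, not the original helper.
import numpy as np
import matplotlib.pyplot as plt

def plot_gp(mean, var, x, X_train=None, Y_train=None, samples=None):
    x = x.squeeze()
    std = np.sqrt(var)
    # Shade a two-standard-deviation predictive band around the mean
    plt.fill_between(x, mean - 2 * std, mean + 2 * std, alpha=0.2,
                     label=r'2$\sigma$ band')
    plt.plot(x, mean, label='Predictive mean')
    if samples is not None:
        plt.plot(x, np.atleast_2d(samples).T, lw=0.5, alpha=0.5)
    if X_train is not None:
        plt.plot(X_train.squeeze(), Y_train.squeeze(), 'kx',
                 label='Train data')
    plt.legend()
    plt.show()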