def test_matmul(self):
    # Forward
    res = CholLazyTensor(self.chol).matmul(self.vecs)
    actual = self.actual_mat.matmul(self.vecs_copy)
    self.assertTrue(approx_equal(res, actual))

    # Backward
    grad_output = torch.randn(*self.vecs.size())
    res.backward(gradient=grad_output)
    actual.backward(gradient=grad_output)
    self.assertTrue(approx_equal(self.chol.grad, self.chol_copy.grad))
    self.assertTrue(approx_equal(self.vecs.grad, self.vecs_copy.grad))
def test_diag(self):
    res = CholLazyTensor(self.chol).diag()
    actual = torch.cat(
        [self.actual_mat[0].diag().unsqueeze(0), self.actual_mat[1].diag().unsqueeze(0)], 0
    )
    self.assertTrue(approx_equal(res, actual))
def untransform_posterior(self, posterior: Posterior) -> Posterior:
    r"""Un-standardize the posterior.

    Args:
        posterior: A posterior in the standardized space.

    Returns:
        The un-standardized posterior. If the input posterior is an MVN,
        the transformed posterior is again an MVN.
    """
    if self._outputs is not None:
        raise NotImplementedError(
            "Standardize does not yet support output selection for untransform_posterior"
        )
    if not self._m == posterior.event_shape[-1]:
        raise RuntimeError(
            "Incompatible output dimensions encountered for transform "
            f"{self._m} and posterior {posterior.event_shape[-1]}"
        )
    if not isinstance(posterior, GPyTorchPosterior):
        # fall back to TransformedPosterior
        return TransformedPosterior(
            posterior=posterior,
            sample_transform=lambda s: self.means + self.stdvs * s,
            mean_transform=lambda m, v: self.means + self.stdvs * m,
            variance_transform=lambda m, v: self._stdvs_sq * v,
        )
    # GPyTorchPosterior (TODO: Should we lazy-evaluate the mean here as well?)
    mvn = posterior.mvn
    offset = self.means
    scale_fac = self.stdvs
    if not posterior._is_mt:
        mean_tf = offset.squeeze(-1) + scale_fac.squeeze(-1) * mvn.mean
        scale_fac = scale_fac.squeeze(-1).expand_as(mean_tf)
    else:
        mean_tf = offset + scale_fac * mvn.mean
        reps = mean_tf.shape[-2:].numel() // scale_fac.size(-1)
        scale_fac = scale_fac.squeeze(-2)
        if mvn._interleaved:
            scale_fac = scale_fac.repeat(*[1 for _ in scale_fac.shape[:-1]], reps)
        else:
            scale_fac = torch.repeat_interleave(scale_fac, reps, dim=-1)

    if (
        not mvn.islazy
        # TODO: Figure out attribute naming weirdness here
        or mvn._MultivariateNormal__unbroadcasted_scale_tril is not None
    ):
        # if already computed, we can save a lot of time using scale_tril
        covar_tf = CholLazyTensor(mvn.scale_tril * scale_fac.unsqueeze(-1))
    else:
        lcv = mvn.lazy_covariance_matrix
        # allow batch-evaluation of the model
        scale_mat = DiagLazyTensor(scale_fac.expand(lcv.shape[:-1]))
        covar_tf = scale_mat @ lcv @ scale_mat

    kwargs = {"interleaved": mvn._interleaved} if posterior._is_mt else {}
    mvn_tf = mvn.__class__(mean=mean_tf, covariance_matrix=covar_tf, **kwargs)
    return GPyTorchPosterior(mvn_tf)
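A hedged usage sketch of where untransform_posterior is exercised in practice: BoTorch models given a Standardize outcome transform standardize the training targets at construction time and un-standardize the posterior inside posterior(). The model and data below are illustrative assumptions and are not taken from the method above.

import torch
from botorch.models import SingleTaskGP
from botorch.models.transforms.outcome import Standardize

train_X = torch.rand(20, 2, dtype=torch.double)
train_Y = torch.sin(train_X).sum(-1, keepdim=True)

# SingleTaskGP standardizes train_Y internally; posterior() then routes the
# standardized GPyTorch posterior through Standardize.untransform_posterior.
model = SingleTaskGP(train_X, train_Y, outcome_transform=Standardize(m=1))
posterior = model.posterior(torch.rand(5, 2, dtype=torch.double))
print(posterior.mean.shape, posterior.variance.shape)  # torch.Size([5, 1]) each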
def create_lazy_tensor(self):
    chol = torch.tensor(
        [
            [3, 0, 0, 0, 0],
            [-1, 2, 0, 0, 0],
            [1, 4, 1, 0, 0],
            [0, 2, 3, 2, 0],
            [-4, -2, 1, 3, 4],
        ],
        dtype=torch.float,
        requires_grad=True,
    )
    return CholLazyTensor(TriangularLazyTensor(chol))
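An illustrative aside (not part of the test fixture above): CholLazyTensor(L) lazily represents the positive-definite matrix L @ L.T, which is presumably how the actual_mat fixtures these tests compare against are built. A minimal sketch, assuming a GPyTorch version that still exposes gpytorch.lazy (the API used throughout these snippets):

import torch
from gpytorch.lazy import CholLazyTensor, TriangularLazyTensor

L = torch.tensor([[3.0, 0.0, 0.0], [-1.0, 2.0, 0.0], [1.0, 4.0, 1.0]])
lazy = CholLazyTensor(TriangularLazyTensor(L))
dense = L @ L.t()  # the matrix the lazy tensor stands for

assert torch.allclose(lazy.evaluate(), dense)
assert torch.allclose(lazy.diag(), dense.diag())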
def test_inv_quad_log_det(self):
    # Forward
    res_inv_quad, res_log_det = CholLazyTensor(self.chol).inv_quad_log_det(
        inv_quad_rhs=self.vecs, log_det=True
    )
    res = res_inv_quad + res_log_det
    actual_inv_quad = self.actual_mat.inverse().matmul(self.vecs_copy).mul(self.vecs_copy).sum()
    actual = actual_inv_quad + torch.log(torch.det(self.actual_mat))
    self.assertLess(((res - actual) / actual).abs().item(), 1e-2)
def test_natgrad(self, D=5):
    mu = torch.randn(D)
    cov = torch.randn(D, D).tril_()
    dist = MultivariateNormal(mu, CholLazyTensor(TriangularLazyTensor(cov)))
    sample = dist.sample()

    v_dist = NaturalVariationalDistribution(D)
    v_dist.initialize_variational_distribution(dist)
    mu = v_dist().mean.detach()

    v_dist().log_prob(sample).squeeze().backward()

    eta1 = mu.clone().requires_grad_(True)
    eta2 = (mu[:, None] * mu + cov @ cov.t()).requires_grad_(True)
    L = torch.cholesky(eta2 - eta1[:, None] * eta1)
    dist2 = MultivariateNormal(eta1, CholLazyTensor(TriangularLazyTensor(L)))
    dist2.log_prob(sample).squeeze().backward()

    assert torch.allclose(v_dist.natural_vec.grad, eta1.grad)
    assert torch.allclose(v_dist.natural_mat.grad, eta2.grad)
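For context on why the reference gradients above are taken with respect to eta1 = mu and eta2 = mu mu^T + Sigma: those are the Gaussian's expectation parameters, and the asserts check that the gradients GPyTorch reports for the stored natural parameters coincide with ordinary gradients in the expectation parameterization. The standard exponential-family duality this relies on is sketched below as background; it is not taken from the code above.

$$
\theta_1 = \Sigma^{-1}\mu, \quad \theta_2 = -\tfrac{1}{2}\Sigma^{-1}, \qquad
\eta_1 = \mu, \quad \eta_2 = \Sigma + \mu\mu^\top,
$$
$$
\nabla_{\theta}\mathcal{L} = F(\theta)\,\nabla_{\eta}\mathcal{L}
\quad\Longrightarrow\quad
\nabla_{\eta}\mathcal{L} = F(\theta)^{-1}\nabla_{\theta}\mathcal{L},
$$

where F(theta) is the Fisher information of the exponential family, so the Euclidean gradient in the expectation parameters is the natural gradient in the natural parameters; this is the property natural-gradient variational inference exploits.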
def test_invertible_init(self, D=5):
    mu = torch.randn(D)
    cov = torch.randn(D, D).tril_()
    dist = MultivariateNormal(mu, CholLazyTensor(TriangularLazyTensor(cov)))

    v_dist = TrilNaturalVariationalDistribution(D, mean_init_std=0.0)
    v_dist.initialize_variational_distribution(dist)
    out_dist = v_dist()

    assert torch.allclose(out_dist.mean, dist.mean)
    assert torch.allclose(out_dist.covariance_matrix, dist.covariance_matrix)
def _get_bound_for_point_samples(self, X, Y, samples):
    """Helper function for getting the ELBO when only a subset of inducing points
    is present in each layer, as given by the argument `samples`.
    """

    def _update_inducing_points(gp, Z, m, S=None, L=None):
        # Set the inducing points for a given layer
        assert (S is None) ^ (L is None)
        strat = gp.variational_strategy
        dist = strat._variational_distribution
        strat.inducing_points.data = Z
        dist.variational_mean.data = m
        if S is not None:
            L = psd_safe_cholesky(S)
        dist.chol_variational_covar.data = L
        gp.clear_caches()

    # Save current parameters
    initial_params = {}
    for n, gp in self.named_gps:
        var_dist = gp.variational_strategy.variational_distribution
        initial_params[n] = {
            "Z": gp.inducing_inputs.clone(),
            "m": var_dist.mean.clone(),
            "L": var_dist.scale_tril,
        }

    # Remove points according to the provided point samples
    for (n, gp), sample in zip(self.named_gps, samples):
        # Include only the selected inducing points for the given sample
        Z_ = initial_params[n]["Z"][sample]
        m_ = initial_params[n]["m"][..., sample]
        L = initial_params[n]["L"]
        S = CholLazyTensor(L).add_jitter().evaluate()
        S = S[..., sample, :][..., :, sample]
        _update_inducing_points(gp, Z_, m_, S=S)

    # Evaluate the bound
    X_scaled = self.X_scaler.transform(X)
    Y_scaled = self.Y_scaler.transform(Y)
    bound = self._log_lik(X=X_scaled, Y=Y_scaled) - self._KL()

    # Restore parameters
    for n, gp in self.named_gps:
        Z = initial_params[n]["Z"]
        m = initial_params[n]["m"]
        L = initial_params[n]["L"]
        _update_inducing_points(gp, Z, m, L=L)

    return bound
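A minimal standalone sketch of the subsetting mechanic above (tensor names and sizes are made up for illustration): expand the stored Cholesky factor to a dense covariance, slice out the retained inducing indices, and re-factorize the sub-block for the smaller variational distribution.

import torch
from gpytorch.lazy import CholLazyTensor
from gpytorch.utils.cholesky import psd_safe_cholesky

L = torch.randn(6, 6).tril() + 3 * torch.eye(6)   # full variational scale_tril
m = torch.randn(6)                                # full variational mean
sample = torch.tensor([0, 2, 5])                  # indices of retained inducing points

S = CholLazyTensor(L).add_jitter().evaluate()     # dense covariance L L^T (plus jitter)
S_sub = S[..., sample, :][..., :, sample]         # covariance over retained points only
m_sub = m[..., sample]
L_sub = psd_safe_cholesky(S_sub)                  # new chol_variational_covar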
def create_lazy_tensor(self):
    chol = torch.tensor(
        [
            [[3, 0, 0, 0, 0], [-1, 2, 0, 0, 0], [1, 4, 1, 0, 0], [0, 2, 3, 2, 0], [-4, -2, 1, 3, 4]],
            [[2, 0, 0, 0, 0], [3, 1, 0, 0, 0], [-2, 3, 2, 0, 0], [-2, 1, -1, 3, 0], [-4, -4, 5, 2, 3]],
        ],
        dtype=torch.float,
    )
    chol.add_(torch.eye(5).unsqueeze(0))
    chol.requires_grad_(True)
    return CholLazyTensor(TriangularLazyTensor(chol))
def test_inv_quad_log_det(self):
    # Forward
    res_inv_quad, res_log_det = CholLazyTensor(self.chol).inv_quad_log_det(
        inv_quad_rhs=self.vecs, log_det=True
    )
    res = res_inv_quad + res_log_det
    actual_inv_quad = (
        self.actual_mat_inv.matmul(self.vecs_copy).mul(self.vecs_copy).sum(-1).sum(-1)
    )
    actual_log_det = torch.tensor(
        [torch.log(torch.det(self.actual_mat[0])), torch.log(torch.det(self.actual_mat[1]))]
    )
    actual = actual_inv_quad + actual_log_det
    self.assertLess(torch.max((res - actual).abs() / actual.norm()), 1e-2)
def variational_output(self):
    chol_variational_covar = self.chol_variational_covar

    # Negate each row with a negative diagonal (the Cholesky decomposition
    # of a matrix requires that the diagonal elements be positive).
    if chol_variational_covar.ndimension() == 2:
        chol_variational_covar = chol_variational_covar.triu()
        inside = chol_variational_covar.diag().sign().unsqueeze(1).expand_as(chol_variational_covar).triu()
    elif chol_variational_covar.ndimension() == 3:
        batch_size, diag_size, _ = chol_variational_covar.size()

        # Batch mode
        chol_variational_covar_size = list(chol_variational_covar.size())[-2:]
        mask = torch.ones(
            *chol_variational_covar_size,
            dtype=chol_variational_covar.dtype,
            device=chol_variational_covar.device,
        ).triu_()
        mask = mask.unsqueeze(0).expand(*([chol_variational_covar.size(0)] + chol_variational_covar_size))

        batch_index = torch.arange(0, batch_size, dtype=torch.long, device=mask.device)
        batch_index = batch_index.unsqueeze(1).repeat(1, diag_size).view(-1)
        diag_index = torch.arange(0, diag_size, dtype=torch.long, device=mask.device)
        diag_index = diag_index.unsqueeze(1).repeat(batch_size, 1).view(-1)
        diag = chol_variational_covar[batch_index, diag_index, diag_index].view(batch_size, diag_size)

        chol_variational_covar = chol_variational_covar.mul(mask)
        inside = diag.sign().unsqueeze(-1).expand_as(chol_variational_covar).mul(mask)
    else:
        raise RuntimeError("Invalid number of variational covar dimensions")

    chol_variational_covar = inside.mul(chol_variational_covar)
    variational_covar = CholLazyTensor(chol_variational_covar.transpose(-1, -2))
    return GaussianRandomVariable(self.variational_mean, variational_covar)
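A standalone toy check of the sign-flip trick above (illustrative only, plain torch): multiplying each row of an upper-triangular factor R by the sign of its diagonal entry leaves R^T R, and hence the represented covariance, unchanged while making the diagonal positive.

import torch

R = torch.tensor([[-2.0, 1.0, 3.0], [0.0, 4.0, -1.0], [0.0, 0.0, -5.0]])
signs = R.diag().sign().unsqueeze(-1)   # +/-1 per row, taken from the diagonal
R_fixed = (signs * R).triu()            # flip rows with a negative diagonal entry

assert torch.allclose(R_fixed.t() @ R_fixed, R.t() @ R)  # same covariance
assert bool((R_fixed.diag() > 0).all())                  # valid Cholesky-style diagonal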
def create_lazy_tensor(self):
    chol = torch.tensor(
        [
            [[3, 0, 0, 0, 0], [-1, 2, 0, 0, 0], [1, 4, 1, 0, 0], [0, 2, 3, 2, 0], [-4, -2, 1, 3, 4]],
            [[2, 0, 0, 0, 0], [3, 1, 0, 0, 0], [-2, 3, 2, 0, 0], [-2, 1, -1, 3, 0], [-4, -4, 5, 2, 3]],
        ],
        dtype=torch.float,
    )
    chol = chol.repeat(3, 1, 1, 1)
    chol[1].mul_(2)
    chol[2].mul_(0.5)
    chol.add_(torch.eye(5).unsqueeze_(0).unsqueeze_(0))
    chol.requires_grad_(True)
    return CholLazyTensor(chol)
def test_natgrad(self, D=5):
    mu = torch.randn(D)
    cov = torch.randn(D, D)
    cov = cov @ cov.t()
    dist = MultivariateNormal(mu, CholLazyTensor(TriangularLazyTensor(torch.linalg.cholesky(cov))))
    sample = dist.sample()

    v_dist = TrilNaturalVariationalDistribution(D, mean_init_std=0.0)
    v_dist.initialize_variational_distribution(dist)
    v_dist().log_prob(sample).squeeze().backward()
    dout_dnat1 = v_dist.natural_vec.grad
    dout_dnat2 = v_dist.natural_tril_mat.grad

    # mean_init_std=0.0 because we need to ensure both have the same distribution
    v_dist_ref = NaturalVariationalDistribution(D, mean_init_std=0.0)
    v_dist_ref.initialize_variational_distribution(dist)
    v_dist_ref().log_prob(sample).squeeze().backward()
    dout_dnat1_noforward_ref = v_dist_ref.natural_vec.grad
    dout_dnat2_noforward_ref = v_dist_ref.natural_mat.grad

    def f(natural_vec, natural_tril_mat):
        "Transform natural_tril_mat to L"
        Sigma = torch.inverse(-2 * natural_tril_mat)
        mu = natural_vec
        return mu, torch.linalg.cholesky(Sigma).inverse().tril()

    (mu_ref, natural_tril_mat_ref), (dout_dmu_ref, dout_dnat2_ref) = jvp(
        f,
        (v_dist_ref.natural_vec.detach(), v_dist_ref.natural_mat.detach()),
        (dout_dnat1_noforward_ref, dout_dnat2_noforward_ref),
    )

    assert torch.allclose(natural_tril_mat_ref, v_dist.natural_tril_mat), "Sigma transformation"
    assert torch.allclose(dout_dnat2_ref, dout_dnat2), "Sigma gradient"
    assert torch.allclose(mu_ref, v_dist.natural_vec), "mu transformation"
    assert torch.allclose(dout_dmu_ref, dout_dnat1), "mu gradient"
def lazy_covariance_matrix(self):
    """Get the lazy covariance matrix."""
    return CholLazyTensor(torch.diag_embed(self.variance))
def test_inv_matmul(self):
    # Forward
    res = CholLazyTensor(self.chol).inv_matmul(self.vecs)
    actual = self.actual_mat.inverse().matmul(self.vecs_copy)
    self.assertLess(torch.max((res - actual).abs() / actual.norm()), 1e-2)
def forward(self):
    m = self.variational_mean
    L = self.chol_variational_covar
    return MultivariateNormal(m, CholLazyTensor(L))
def forward(self, x):
    """Forward propagate the module.

    This method determines how to marginalize out the inducing function values.
    Specifically, forward defines how to transform a variational distribution over
    the inducing point values, q(u), into a variational distribution over the
    function values at specified locations x, q(f|x), by integrating p(f|x, u)q(u)du.

    Parameters
    ----------
    x (torch.tensor):
        Locations x to get the variational posterior of the function values at.

    Returns
    -------
    The distribution q(f|x)
    """
    variational_dist = self.variational_distribution.approx_variational_distribution
    inducing_points = self.inducing_points
    inducing_batch_shape = inducing_points.shape[:-2]
    if inducing_batch_shape < x.shape[:-2] or len(inducing_batch_shape) < len(x.shape[:-2]):
        batch_shape = _mul_broadcast_shape(inducing_points.shape[:-2], x.shape[:-2])
        inducing_points = inducing_points.expand(*batch_shape, *inducing_points.shape[-2:])
        x = x.expand(*batch_shape, *x.shape[-2:])
        variational_dist = variational_dist.expand(batch_shape)

    # If our points equal the inducing points, we're done
    if torch.equal(x, inducing_points):
        return variational_dist

    # Otherwise, we have to marginalize
    else:
        num_induc = inducing_points.size(-2)
        full_inputs = torch.cat([inducing_points, x], dim=-2)
        full_output = self.model.forward(full_inputs)
        full_mean, full_covar = full_output.mean, full_output.lazy_covariance_matrix

        # Mean terms
        test_mean = full_mean[..., num_induc:]
        induc_mean = full_mean[..., :num_induc]
        mean_diff = (variational_dist.mean - induc_mean).unsqueeze(-1)

        # Covariance terms
        induc_induc_covar = full_covar[..., :num_induc, :num_induc].add_jitter()
        induc_data_covar = full_covar[..., :num_induc, num_induc:].evaluate()
        data_data_covar = full_covar[..., num_induc:, num_induc:]
        aux = variational_dist.lazy_covariance_matrix.root_decomposition()
        root_variational_covar = aux.root.evaluate()

        # If we had to expand the inducing points,
        # shrink the inducing mean and induc_induc_covar dimension.
        # This makes everything more computationally efficient
        if len(inducing_batch_shape) < len(induc_induc_covar.batch_shape):
            index = tuple(
                0 for _ in range(len(induc_induc_covar.batch_shape) - len(inducing_batch_shape))
            )
            repeat_size = torch.Size(
                tuple(induc_induc_covar.batch_shape[:len(index)])
                + tuple(1 for _ in induc_induc_covar.batch_shape[len(index):])
            )
            induc_induc_covar = BatchRepeatLazyTensor(
                induc_induc_covar.__getitem__(index), repeat_size
            )

        # If we're less than a certain size, we'll compute the Cholesky
        # decomposition of induc_induc_covar
        cholesky = False
        if settings.fast_computations.log_prob.off() or (
            num_induc <= settings.max_cholesky_size.value()
        ):
            induc_induc_covar = CholLazyTensor(induc_induc_covar.cholesky())
            cholesky = True

        # If we are making predictions and don't need variances,
        # we can do things very quickly.
        if not self.training and settings.skip_posterior_variances.on():
            if not hasattr(self, "_mean_cache"):
                self._mean_cache = induc_induc_covar.inv_matmul(mean_diff).detach()
            predictive_mean = torch.add(
                test_mean,
                induc_data_covar.transpose(-2, -1).matmul(self._mean_cache).squeeze(-1),
            )
            predictive_covar = ZeroLazyTensor(test_mean.size(-1), test_mean.size(-1))
            return MultivariateNormal(predictive_mean, predictive_covar)

        # Cache the CG results
        # For now: run variational inference without a preconditioner
        # The preconditioner screws things up for some reason
        with settings.max_preconditioner_size(0):
            # Cache the CG results
            left_tensors = torch.cat([mean_diff, root_variational_covar], -1)
            with torch.no_grad():
                eager_rhs = torch.cat([left_tensors, induc_data_covar], -1)
                solve, probe_vecs, probe_vec_norms, probe_vec_solves, tmats = (
                    CachedCGLazyTensor.precompute_terms(
                        induc_induc_covar,
                        eager_rhs.detach(),
                        logdet_terms=(not cholesky),
                        include_tmats=(not settings.skip_logdet_forward.on() and not cholesky),
                    )
                )
                eager_rhss = [
                    eager_rhs.detach(),
                    eager_rhs[..., left_tensors.size(-1):].detach(),
                    eager_rhs[..., :left_tensors.size(-1)].detach(),
                ]
                solves = [
                    solve.detach(),
                    solve[..., left_tensors.size(-1):].detach(),
                    solve[..., :left_tensors.size(-1)].detach(),
                ]
                if settings.skip_logdet_forward.on():
                    eager_rhss.append(torch.cat([probe_vecs, left_tensors], -1))
                    solves.append(
                        torch.cat([probe_vec_solves, solve[..., :left_tensors.size(-1)]], -1)
                    )
            induc_induc_covar = CachedCGLazyTensor(
                induc_induc_covar,
                eager_rhss=eager_rhss,
                solves=solves,
                probe_vectors=probe_vecs,
                probe_vector_norms=probe_vec_norms,
                probe_vector_solves=probe_vec_solves,
                probe_vector_tmats=tmats,
            )

        if self.training:
            self._memoize_cache["prior_distribution_memo"] = MultivariateNormal(
                induc_mean, induc_induc_covar
            )

        # Compute predictive mean/covariance
        inv_products = induc_induc_covar.inv_matmul(induc_data_covar, left_tensors.transpose(-1, -2))
        predictive_mean = torch.add(test_mean, inv_products[..., 0, :])
        predictive_covar = RootLazyTensor(inv_products[..., 1:, :].transpose(-1, -2))

        if self.training:
            interp_data_data_var, _ = induc_induc_covar.inv_quad_logdet(
                induc_data_covar, logdet=False, reduce_inv_quad=False
            )
            data_covariance = DiagLazyTensor(
                (data_data_covar.diag() - interp_data_data_var).clamp(0, math.inf)
            )
        else:
            neg_induc_data_data_covar = torch.matmul(
                induc_data_covar.transpose(-1, -2).mul(-1),
                induc_induc_covar.inv_matmul(induc_data_covar),
            )
            data_covariance = data_data_covar + neg_induc_data_data_covar
        predictive_covar = PsdSumLazyTensor(predictive_covar, data_covariance)

        return MultivariateNormal(predictive_mean, predictive_covar)
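A hedged usage sketch of the kind of approximate GP whose variational strategy forward() is shown above. The class below uses the standard GPyTorch building blocks (ApproximateGP, CholeskyVariationalDistribution, VariationalStrategy) purely for illustration; it is not the model the code above belongs to. Calling model(x) dispatches to the strategy's forward(), which marginalizes q(u) against p(f|x, u) as described in the docstring.

import torch
import gpytorch


class SVGPModel(gpytorch.models.ApproximateGP):
    def __init__(self, inducing_points):
        variational_distribution = gpytorch.variational.CholeskyVariationalDistribution(
            inducing_points.size(-2)
        )
        variational_strategy = gpytorch.variational.VariationalStrategy(
            self, inducing_points, variational_distribution, learn_inducing_locations=True
        )
        super().__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        # Prior over function values at x (and at the inducing locations)
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x)
        )


model = SVGPModel(inducing_points=torch.randn(16, 1))
q_f = model(torch.randn(50, 1))  # routes through the variational strategy to get q(f|x)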
def test_diag(self):
    res = CholLazyTensor(self.chol).diag()
    actual = self.actual_mat.diag()
    self.assertTrue(approx_equal(res, actual))
def test_evaluate(self):
    res = CholLazyTensor(self.chol).evaluate()
    actual = self.actual_mat
    self.assertTrue(approx_equal(res, actual))
def test_getitem(self):
    res = CholLazyTensor(self.chol)[2:4, -2]
    actual = self.actual_mat[2:4, -2]
    self.assertTrue(approx_equal(res, actual))