def test_matmul_vec_random_rectangular(self):
    ax = torch.randn(4, 2, 3, requires_grad=True)
    bx = torch.randn(4, 5, 2, requires_grad=True)
    cx = torch.randn(4, 6, 4, requires_grad=True)
    rhsx = torch.randn(4, 3 * 2 * 4, 1)
    rhsx = (rhsx / torch.norm(rhsx)).requires_grad_(True)
    ax_copy = ax.clone().detach().requires_grad_(True)
    bx_copy = bx.clone().detach().requires_grad_(True)
    cx_copy = cx.clone().detach().requires_grad_(True)
    rhsx_copy = rhsx.clone().detach().requires_grad_(True)
    kp_lazy_var = KroneckerProductLazyTensor(
        NonLazyTensor(ax), NonLazyTensor(bx), NonLazyTensor(cx)
    )
    res = kp_lazy_var.matmul(rhsx)
    actual_mat = kron(kron(ax_copy, bx_copy), cx_copy)
    actual = actual_mat.matmul(rhsx_copy)

    self.assertTrue(approx_equal(res, actual))

    actual.sum().backward()
    res.sum().backward()
    self.assertTrue(approx_equal(ax_copy.grad, ax.grad))
    self.assertTrue(approx_equal(bx_copy.grad, bx.grad))
    self.assertTrue(approx_equal(cx_copy.grad, cx.grad))
    self.assertTrue(approx_equal(rhsx_copy.grad, rhsx.grad))
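# A minimal standalone sketch (plain torch; the names A, B, x are made up for
# illustration) of the identity the lazy Kronecker matmul exploits: with row-major
# vectorization, (A \kron B) vec(X) = vec(A X B^T), so the product can be computed by
# reshaping the right-hand side instead of materializing the dense Kronecker matrix.
# This is why the tests above only need the dense `kron` on the reference side.
import torch

A = torch.randn(3, 4)
B = torch.randn(5, 6)
x = torch.randn(4 * 6)

dense = torch.kron(A, B) @ x                         # materializes a 15 x 24 matrix
shortcut = (A @ x.reshape(4, 6) @ B.T).reshape(-1)   # never forms A kron B
assert torch.allclose(dense, shortcut, atol=1e-4)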
def make_posterior_variances(self, joint_covariance_matrix: LazyTensor) -> Tensor:
    r"""
    Computes the posterior variances given the data points X. As currently
    implemented, it computes another forward call with the stacked data to get out
    the joint covariance across all data points.
    """
    # TODO: use the exposed joint covariances from the prediction strategy
    data_joint_covariance = joint_covariance_matrix.lazy_tensors[0].evaluate_kernel()
    num_train = self.train_inputs[0].shape[-2]
    test_train_covar = data_joint_covariance[..., num_train:, :num_train]
    train_train_covar = data_joint_covariance[..., :num_train, :num_train]
    test_test_covar = data_joint_covariance[..., num_train:, num_train:]

    full_train_train_covar = KroneckerProductLazyTensor(
        train_train_covar, *joint_covariance_matrix.lazy_tensors[1:]
    )
    full_test_test_covar = KroneckerProductLazyTensor(
        test_test_covar, *joint_covariance_matrix.lazy_tensors[1:]
    )
    full_test_train_covar_list = [test_train_covar] + [
        *joint_covariance_matrix.lazy_tensors[1:]
    ]

    train_evals, train_evecs = full_train_train_covar.symeig(eigenvectors=True)
    # (\kron \Lambda_i + \sigma^2 I)^{-1}
    train_inv_evals = DiagLazyTensor(1.0 / (train_evals + self.likelihood.noise))

    # compute K_i S_i \hadamard K_i S_i
    test_train_hadamard = KroneckerProductLazyTensor(
        *[
            lt1.matmul(lt2).evaluate() ** 2
            for lt1, lt2 in zip(full_test_train_covar_list, train_evecs.lazy_tensors)
        ]
    )

    # and compute the column sums of
    # (\kron K_i S_i * K_i S_i) \tilde{\Lambda}^{-1}
    test_train_pred_covar = test_train_hadamard.matmul(train_inv_evals).sum(dim=-1)

    pred_variances = full_test_test_covar.diag() - test_train_pred_covar
    return pred_variances
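# Hedged numeric check (plain torch, standalone; K, Kst, s are made-up stand-ins) of
# the identity make_posterior_variances relies on: with K = Q diag(lam) Q^T, the
# variance reduction diag(Kst (K + s I)^{-1} Kst^T) equals the column sums
# ((Kst Q) ** 2) @ (1 / (lam + s)), i.e. the squared (Hadamard) projection against the
# inverse eigenvalues, which is what the Kronecker-factored code above computes.
import torch

n, m, s = 6, 4, 0.25
K = torch.randn(n, n)
K = K @ K.T + n * torch.eye(n)   # well-conditioned SPD stand-in for the train covariance
Kst = torch.randn(m, n)          # stand-in for the test-train covariance
lam, Q = torch.linalg.eigh(K)
direct = torch.diag(Kst @ torch.linalg.inv(K + s * torch.eye(n)) @ Kst.T)
hadamard = ((Kst @ Q) ** 2) @ (1.0 / (lam + s))
assert torch.allclose(direct, hadamard, atol=1e-4)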
def test_matmul_batch_mat(self):
    avar = a.repeat(3, 1, 1).requires_grad_(True)
    bvar = b.repeat(3, 1, 1).requires_grad_(True)
    cvar = c.repeat(3, 1, 1).requires_grad_(True)
    mat = torch.randn(3, 24, 5, requires_grad=True)
    kp_lazy_var = KroneckerProductLazyTensor(
        NonLazyTensor(avar), NonLazyTensor(bvar), NonLazyTensor(cvar)
    )
    res = kp_lazy_var.matmul(mat)

    avar_copy = avar.clone().detach().requires_grad_(True)
    bvar_copy = bvar.clone().detach().requires_grad_(True)
    cvar_copy = cvar.clone().detach().requires_grad_(True)
    mat_copy = mat.clone().detach().requires_grad_(True)
    actual = kron(kron(avar_copy, bvar_copy), cvar_copy).matmul(mat_copy)
    self.assertTrue(approx_equal(res, actual))

    actual.sum().backward()
    res.sum().backward()
    self.assertTrue(approx_equal(avar_copy.grad, avar.grad))
    self.assertTrue(approx_equal(bvar_copy.grad, bvar.grad))
    self.assertTrue(approx_equal(cvar_copy.grad, cvar.grad))
    self.assertTrue(approx_equal(mat_copy.grad, mat.grad))
def test_matmul_vec(self):
    avar = a.clone().requires_grad_(True)
    bvar = b.clone().requires_grad_(True)
    cvar = c.clone().requires_grad_(True)
    vec = torch.randn(24, requires_grad=True)
    kp_lazy_var = KroneckerProductLazyTensor(
        NonLazyTensor(avar), NonLazyTensor(bvar), NonLazyTensor(cvar)
    )
    res = kp_lazy_var.matmul(vec)

    avar_copy = a.clone().requires_grad_(True)
    bvar_copy = b.clone().requires_grad_(True)
    cvar_copy = c.clone().requires_grad_(True)
    vec_copy = vec.clone().detach().requires_grad_(True)
    actual = kron(kron(avar_copy, bvar_copy), cvar_copy).matmul(vec_copy)
    self.assertTrue(approx_equal(res, actual))

    actual.sum().backward()
    res.sum().backward()
    self.assertTrue(approx_equal(avar_copy.grad, avar.grad))
    self.assertTrue(approx_equal(bvar_copy.grad, bvar.grad))
    self.assertTrue(approx_equal(cvar_copy.grad, cvar.grad))
    self.assertTrue(approx_equal(vec_copy.grad, vec.grad))
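# The tests in this excerpt refer to a module-level `kron` helper and fixtures a, b, c
# that are not shown here. A minimal batched Kronecker product consistent with how the
# tests use it might look like the sketch below (an illustrative assumption, not
# necessarily the repository's actual helper).
import torch

def kron(a, b):
    """Illustrative batched Kronecker product: (*batch, m, n) x (*batch, p, q) -> (*batch, m*p, n*q)."""
    m, n = a.shape[-2:]
    p, q = b.shape[-2:]
    # outer product over the matrix dimensions, broadcast over batch dimensions
    res = a.unsqueeze(-1).unsqueeze(-3) * b.unsqueeze(-2).unsqueeze(-4)
    return res.reshape(*a.shape[:-2], m * p, n * q)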
def test_matmul_mat_random_rectangular(self):
    a = torch.randn(4, 2, 3, requires_grad=True)
    b = torch.randn(4, 5, 2, requires_grad=True)
    c = torch.randn(4, 6, 4, requires_grad=True)
    rhs = torch.randn(4, 3 * 2 * 4, 2, requires_grad=True)
    a_copy = a.clone().detach().requires_grad_(True)
    b_copy = b.clone().detach().requires_grad_(True)
    c_copy = c.clone().detach().requires_grad_(True)
    rhs_copy = rhs.clone().detach().requires_grad_(True)

    actual = kron(kron(a_copy, b_copy), c_copy).matmul(rhs_copy)
    kp_lazy_var = KroneckerProductLazyTensor(
        NonLazyTensor(a), NonLazyTensor(b), NonLazyTensor(c)
    )
    res = kp_lazy_var.matmul(rhs)
    self.assertTrue(approx_equal(res, actual))

    actual.sum().backward()
    res.sum().backward()
    self.assertTrue(approx_equal(a_copy.grad, a.grad))
    self.assertTrue(approx_equal(b_copy.grad, b.grad))
    self.assertTrue(approx_equal(c_copy.grad, c.grad))
    self.assertTrue(approx_equal(rhs_copy.grad, rhs.grad))
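# Hedged numeric sketch (plain torch, standalone; A, B, s are made up) of the structure
# the diagonalization-based solves in the posterior code below rely on: the
# eigendecomposition of a Kronecker product factors as
#   A \kron B = (Qa \kron Qb) diag(kron(la, lb)) (Qa \kron Qb)^T,
# so (A \kron B + s I)^{-1} only requires inverting the scalar eigenvalues la_i * lb_j + s.
import torch

def _spd(n):
    m = torch.randn(n, n)
    return m @ m.T + n * torch.eye(n)

A, B, s = _spd(3), _spd(4), 0.1
la, qa = torch.linalg.eigh(A)
lb, qb = torch.linalg.eigh(B)
Q = torch.kron(qa, qb)
recon = Q @ torch.diag(torch.kron(la, lb)) @ Q.T
assert torch.allclose(recon, torch.kron(A, B), atol=1e-4)
inv = Q @ torch.diag(1.0 / (torch.kron(la, lb) + s)) @ Q.T
assert torch.allclose(inv, torch.linalg.inv(torch.kron(A, B) + s * torch.eye(12)), atol=1e-4)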
def posterior(
    self,
    X: Tensor,
    output_indices: Optional[List[int]] = None,
    observation_noise: Union[bool, Tensor] = False,
    posterior_transform: Optional[PosteriorTransform] = None,
    **kwargs: Any,
) -> MultitaskGPPosterior:
    self.eval()

    if posterior_transform is not None:
        # this could be very costly, disallow for now
        raise NotImplementedError(
            "Posterior transforms currently not supported for "
            f"{self.__class__.__name__}"
        )

    X = self.transform_inputs(X)
    train_x = self.transform_inputs(self.train_inputs[0])

    # construct Ktt
    task_covar = self._task_covar_matrix
    task_rootlt = self._task_covar_matrix.root_decomposition(
        method="diagonalization"
    )
    task_root = task_rootlt.root
    if task_covar.batch_shape != X.shape[:-2]:
        task_covar = BatchRepeatLazyTensor(task_covar, batch_repeat=X.shape[:-2])
        task_root = BatchRepeatLazyTensor(
            lazify(task_root), batch_repeat=X.shape[:-2]
        )
    task_covar_rootlt = RootLazyTensor(task_root)

    # construct RR' \approx Kxx
    data_data_covar = self.train_full_covar.lazy_tensors[0]
    # populate the diagonalization caches for the root and inverse root
    # decomposition
    data_data_evals, data_data_evecs = data_data_covar.diagonalization()
    # pad the eigenvalue and eigenvectors with zeros if we are using lanczos
    if data_data_evecs.shape[-1] < data_data_evecs.shape[-2]:
        cols_to_add = data_data_evecs.shape[-2] - data_data_evecs.shape[-1]
        zero_evecs = torch.zeros(
            *data_data_evecs.shape[:-1],
            cols_to_add,
            dtype=data_data_evals.dtype,
            device=data_data_evals.device,
        )
        zero_evals = torch.zeros(
            *data_data_evecs.shape[:-2],
            cols_to_add,
            dtype=data_data_evals.dtype,
            device=data_data_evals.device,
        )
        data_data_evecs = CatLazyTensor(
            data_data_evecs,
            lazify(zero_evecs),
            dim=-1,
            output_device=data_data_evals.device,
        )
        data_data_evals = torch.cat((data_data_evals, zero_evals), dim=-1)

    # construct K_{xt, x}
    test_data_covar = self.covar_module.data_covar_module(X, train_x)
    # construct K_{xt, xt}
    test_test_covar = self.covar_module.data_covar_module(X)

    # now update root so that \tilde{R}\tilde{R}' \approx K_{(x,xt), (x,xt)}
    # cloning preserves the gradient history
    updated_lazy_tensor = data_data_covar.cat_rows(
        cross_mat=test_data_covar.clone(),
        new_mat=test_test_covar,
        method="diagonalization",
    )
    updated_root = updated_lazy_tensor.root_decomposition().root
    # occasionally, there's device errors so enforce this comes out right
    updated_root = updated_root.to(data_data_covar.device)

    # build a root decomposition of the joint train/test covariance matrix
    # construct (\tilde{R} \otimes M)(\tilde{R} \otimes M)' \approx
    # (K_{(x,xt), (x,xt)} \otimes Ktt)
    joint_covar = RootLazyTensor(
        KroneckerProductLazyTensor(updated_root, task_covar_rootlt.root.detach())
    )

    # construct K_{xt, x} \otimes Ktt
    test_obs_kernel = KroneckerProductLazyTensor(test_data_covar, task_covar)

    # collect y - \mu(x) and \mu(X)
    train_diff = self.train_targets - self.mean_module(train_x)
    if detach_test_caches.on():
        train_diff = train_diff.detach()
    test_mean = self.mean_module(X)

    train_noise = self.likelihood._shaped_noise_covar(train_x.shape)
    diagonal_noise = isinstance(train_noise, DiagLazyTensor)
    if detach_test_caches.on():
        train_noise = train_noise.detach()
    test_noise = (
        self.likelihood._shaped_noise_covar(X.shape) if observation_noise else None
    )

    # predictive mean and variance for the mvn
    # first the predictive mean
    pred_mean = (
        test_obs_kernel.matmul(self.predictive_mean_cache).reshape_as(test_mean)
        + test_mean
    )
    # next the predictive variance, assume diagonal noise
    test_var_term = KroneckerProductLazyTensor(test_test_covar, task_covar).diag()

    if diagonal_noise:
        task_evals, task_evecs = self._task_covar_matrix.diagonalization()
        # TODO: make this be the default KPMatmulLT diagonal method in gpytorch
        full_data_inv_evals = (
            KroneckerProductDiagLazyTensor(
                DiagLazyTensor(data_data_evals), DiagLazyTensor(task_evals)
            )
            + train_noise
        ).inverse()

        test_train_hadamard = KroneckerProductLazyTensor(
            test_data_covar.matmul(data_data_evecs).evaluate() ** 2,
            task_covar.matmul(task_evecs).evaluate() ** 2,
        )
        data_var_term = test_train_hadamard.matmul(full_data_inv_evals).sum(dim=-1)
    else:
        # if non-diagonal noise (but still kronecker structured), we have to pull
        # across the noise because the inverse is not closed form
        # should be a kronecker lt, R = \Sigma_X^{-1/2} \kron \Sigma_T^{-1/2}
        # TODO: enforce the diagonalization to return a KPLT for all shapes in
        # gpytorch or dense linear algebra for small shapes
        data_noise, task_noise = train_noise.lazy_tensors
        data_noise_root = data_noise.root_inv_decomposition(
            method="diagonalization"
        )
        task_noise_root = task_noise.root_inv_decomposition(
            method="diagonalization"
        )

        # ultimately we need to compute the diagonal of
        # (K_{x* X} \kron K_T)(K_{XX} \kron K_T + \Sigma_X \kron \Sigma_T)^{-1}
        #                           (K_{x* X} \kron K_T)^T
        # = (K_{x* X} \Sigma_X^{-1/2} Q_R)(\Lambda_R + I)^{-1}
        #                           (K_{x* X} \Sigma_X^{-1/2} Q_R)^T
        # where R = (\Sigma_X^{-1/2T}K_{XX}\Sigma_X^{-1/2} \kron
        #               \Sigma_T^{-1/2T}K_{T}\Sigma_T^{-1/2})
        # first we construct the components of R's eigen-decomposition
        # TODO: make this be the default KPMatmulLT diagonal method in gpytorch
        whitened_data_covar = (
            data_noise_root.transpose(-1, -2)
            .matmul(data_data_covar)
            .matmul(data_noise_root)
        )
        w_data_evals, w_data_evecs = whitened_data_covar.diagonalization()
        whitened_task_covar = (
            task_noise_root.transpose(-1, -2)
            .matmul(self._task_covar_matrix)
            .matmul(task_noise_root)
        )
        w_task_evals, w_task_evecs = whitened_task_covar.diagonalization()

        # we add one to the eigenvalues as above (not just for stability)
        full_data_inv_evals = (
            KroneckerProductDiagLazyTensor(
                DiagLazyTensor(w_data_evals), DiagLazyTensor(w_task_evals)
            )
            .add_jitter(1.0)
            .inverse()
        )

        test_data_comp = (
            test_data_covar.matmul(data_noise_root).matmul(w_data_evecs).evaluate()
            ** 2
        )
        task_comp = (
            task_covar.matmul(task_noise_root).matmul(w_task_evecs).evaluate() ** 2
        )

        test_train_hadamard = KroneckerProductLazyTensor(test_data_comp, task_comp)
        data_var_term = test_train_hadamard.matmul(full_data_inv_evals).sum(dim=-1)

    pred_variance = test_var_term - data_var_term
    specialized_mvn = MultitaskMultivariateNormal(
        pred_mean, DiagLazyTensor(pred_variance)
    )
    if observation_noise:
        specialized_mvn = self.likelihood(specialized_mvn)

    posterior = MultitaskGPPosterior(
        mvn=specialized_mvn,
        joint_covariance_matrix=joint_covar,
        test_train_covar=test_obs_kernel,
        train_diff=train_diff,
        test_mean=test_mean,
        train_train_covar=self.train_full_covar,
        train_noise=train_noise,
        test_noise=test_noise,
    )
    if hasattr(self, "outcome_transform"):
        posterior = self.outcome_transform.untransform_posterior(posterior)
    return posterior
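# A hedged usage sketch of the posterior method above, assuming botorch's
# KroneckerMultiTaskGP; the tensor names and sizes are invented for illustration, and
# hyperparameters are left at their defaults (one would normally fit the model first).
import torch
from botorch.models import KroneckerMultiTaskGP

train_X = torch.rand(10, 3)
train_Y = torch.randn(10, 2)             # two correlated outputs
model = KroneckerMultiTaskGP(train_X, train_Y)
test_X = torch.rand(5, 3)
post = model.posterior(test_X)           # MultitaskGPPosterior
mean, var = post.mean, post.variance     # each of shape (5, 2)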