def _predict(self, input_new, diag=True):
    # following GPflow implementation
    # integrating the inducing variables out
    if isinstance(input_new, np.ndarray):
        # set input_new to be volatile for inference mode
        input_new = Variable(th.Tensor(input_new).type(float_type),
                             volatile=True)
    self.X.volatile = True
    self.Y.volatile = True
    self.Z.volatile = True

    num_inducing = self.Z.size(0)
    dim_output = self.Y.size(1)
    # err = self.Y - self.mean_function(self.X)
    err = self.Y
    # Kff_diag = self.kernel.Kdiag(self.X)
    Kuf = self.kernel.K(self.Z, self.X)
    # add jitter
    # Kuu = self.kernel.K(self.Z) + Variable(th.eye(num_inducing).float() * 1e-5)
    Kuu = self.kernel.K(self.Z) \
        + self.jitter.transform().expand(num_inducing).diag()
    Kus = self.kernel.K(self.Z, input_new)

    L = cholesky(Kuu)
    A = trtrs(L, Kuf)
    AAT = A.mm(A.t()) / self.likelihood.variance.transform().expand_as(Kuu)
    B = AAT + Variable(th.eye(num_inducing).type(float_type))
    LB = cholesky(B)
    # divide variance at the end
    c = trtrs(LB, A.mm(err)) \
        / self.likelihood.variance.transform().expand(num_inducing, dim_output)
    tmp1 = trtrs(L, Kus)
    tmp2 = trtrs(LB, tmp1)
    mean = tmp2.t().mm(c)
    if diag:
        var = self.kernel.Kdiag(input_new) - tmp1.pow(2).sum(0).squeeze() \
            + tmp2.pow(2).sum(0).squeeze()
        # add kronecker product later for multi-output case
    else:
        var = self.kernel.K(input_new) + tmp2.t().mm(tmp2) \
            - tmp1.t().mm(tmp1)
    # return mean + self.mean_function(input_new), var
    return mean, var
def compute_loss(self):
    """
    Computes the variational lower bound of the true log marginal likelihood.

    Eqn (9) in Titsias, Michalis K. "Variational Learning of Inducing
    Variables in Sparse Gaussian Processes." AISTATS. Vol. 5. 2009.
    """
    num_inducing = self.Z.size(0)
    num_training = self.X.size(0)
    dim_output = self.Y.size(1)
    # TODO: add mean_functions
    # err = self.Y - self.mean_function(self.X)
    err = self.Y
    Kff_diag = self.kernel.Kdiag(self.X)
    Kuf = self.kernel.K(self.Z, self.X)
    # add jitter
    Kuu = self.kernel.K(self.Z) \
        + self.jitter.transform().expand(num_inducing).diag()
    L = cholesky(Kuu)

    A = trtrs(L, Kuf)
    AAT = A.mm(A.t()) / self.likelihood.variance.transform().expand_as(Kuu)
    B = AAT + Variable(th.eye(num_inducing).type(float_type))
    LB = cholesky(B)
    # divide variance at the end
    c = trtrs(LB, A.mm(err)) \
        / self.likelihood.variance.transform().expand(num_inducing, dim_output)

    # Evidence lower bound
    elbo = Variable(th.Tensor(
        [-0.5 * dim_output * num_training * np.log(2 * np.pi)]
    ).type(float_type))
    elbo -= dim_output * LB.diag().log().sum()
    elbo -= 0.5 * dim_output * num_training \
        * self.likelihood.variance.transform().log()
    elbo -= 0.5 * (err.pow(2).sum() + dim_output * Kff_diag.sum()) \
        / self.likelihood.variance.transform()
    elbo += 0.5 * c.pow(2).sum()
    elbo += 0.5 * dim_output * AAT.diag().sum()
    return -elbo
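# A minimal training-loop sketch for optimizing the negative ELBO returned by
# `compute_loss` above. It assumes the model object exposes `parameters()` in
# the usual torch sense; the helper name `train_sparse_gp` and its defaults
# are illustrative only, not part of this code base.
import torch as th


def train_sparse_gp(model, num_iter=500, lr=1e-2):
    optimizer = th.optim.Adam(model.parameters(), lr=lr)
    for _ in range(num_iter):
        optimizer.zero_grad()
        loss = model.compute_loss()  # negative variational lower bound
        loss.backward()
        optimizer.step()
    return model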
def _predict(self, input_new, diag, full_cov_size_limit=10000):
    """
    Computes

    .. math::

        p(F^* | Y)

    where F^* are the latent function values at ``input_new`` and Y are the
    observations at the training inputs X.

    :param input_new: assumed to be a two-dimensional numpy array
    """
    if isinstance(input_new, np.ndarray):
        # the output is a data matrix; rows correspond to the rows of
        # input_new, columns are treated independently
        input_new = Variable(th.Tensor(input_new).type(tensor_type),
                             requires_grad=False, volatile=True)

    k_ys = self.kernel.K(self.X, input_new)
    kyy = self._compute_kyy()
    L = cholesky(kyy)
    A = trtrs(L, k_ys)
    V = trtrs(L, self.Y)
    mean_f = th.mm(th.transpose(A, 0, 1), V)
    if self.mean_function is not None:
        mean_f += self.mean_function(input_new)

    var_f_1 = self.kernel.Kdiag(input_new) if diag else \
        self.kernel.K(input_new)  # Kss
    if diag:
        var_f_2 = th.sum(A * A, 0)
    else:
        var_f_2 = th.mm(A.t(), A)
    var_f = var_f_1 - var_f_2
    return mean_f, var_f
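# A standalone cross-check of the prediction above, following Rasmussen &
# Williams, GPML (2006), Algorithm 2.1, written in NumPy/SciPy. The isotropic
# RBF kernel `rbf_np`, the noise level, and all names here are illustrative
# assumptions, independent of the kernel/likelihood classes in this code base.
import numpy as np
from scipy.linalg import cholesky as np_cholesky, solve_triangular


def rbf_np(X1, X2, variance=1.0, length_scale=1.0):
    sq_dist = ((X1[:, None, :] - X2[None, :, :]) / length_scale) ** 2
    return variance * np.exp(-0.5 * sq_dist.sum(-1))


def gp_predict_np(X, Y, Xs, noise_var=0.1):
    Kyy = rbf_np(X, X) + noise_var * np.eye(X.shape[0])
    L = np_cholesky(Kyy, lower=True)
    A = solve_triangular(L, rbf_np(X, Xs), lower=True)   # L^{-1} K(X, X*)
    V = solve_triangular(L, Y, lower=True)               # L^{-1} Y
    mean = A.T @ V                                       # K(X*, X) Kyy^{-1} Y
    var = rbf_np(Xs, Xs).diagonal() - (A * A).sum(0)     # marginal variances
    return mean, var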
def compute_loss(self):
    """
    Loss is equal to the negative of the log likelihood.

    Adapted from Rasmussen & Williams, GPML (2006), p. 19, Algorithm 2.1.
    """
    num_input = self.Y.size(0)
    dim_output = self.Y.size(1)
    L = cholesky(self._compute_kyy())
    alpha = trtrs(L, self.Y)
    const = Variable(th.Tensor([-0.5 * dim_output * num_input
                                * np.log(2 * np.pi)]).type(tensor_type))
    loss = 0.5 * alpha.pow(2).sum() + dim_output * lt_log_determinant(L) \
        - const
    return loss
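# A matching NumPy/SciPy sketch of the negative log marginal likelihood from
# GPML Algorithm 2.1, intended only as a cross-check of `compute_loss`. It
# reuses the illustrative `rbf_np` helper from the prediction sketch above;
# names and defaults are assumptions, not part of the model classes.
def gp_neg_log_marginal_likelihood_np(X, Y, noise_var=0.1):
    n, d = Y.shape
    Kyy = rbf_np(X, X) + noise_var * np.eye(n)
    L = np_cholesky(Kyy, lower=True)
    alpha = solve_triangular(L, Y, lower=True)           # L^{-1} Y
    # 0.5 * tr(Y^T Kyy^{-1} Y) + d * log|L| + 0.5 * n * d * log(2 * pi)
    return (0.5 * (alpha ** 2).sum()
            + d * np.log(np.diag(L)).sum()
            + 0.5 * n * d * np.log(2 * np.pi))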
def eKxz_parallel(self, Z, Xmean, Xcov):  # TODO: add test
    """Parallel implementation (needs more space, but less time).

    Refer to the GPflow implementation.

    Args:
        Z (Variable): m x q inducing inputs
        Xmean (Variable): n x q mean of the input X
        Xcov (Variable): posterior covariance of X; two sizes are accepted:
            n x q x q: each q(x_i) has a full covariance
            n x q: each q(x_i) has a diagonal covariance (uncorrelated),
                stored in each row

    Returns:
        (Variable): n x m
    """
    # Revisit later; check backward support for n-D tensors
    n = Xmean.size(0)
    q = Xmean.size(1)
    m = Z.size(0)
    if Xcov.dim() == 2:
        # from flattened diagonals to full covariance matrices
        cov = Variable(th.Tensor(n, q, q).type(float_type))
        for i in range(Xmean.size(0)):
            cov[i] = Xcov[i].diag()
        Xcov = cov
        del cov
    length_scales = self.length_scales.transform()
    Lambda = length_scales.pow(2).diag().unsqueeze(0).expand_as(Xcov)
    L = cholesky(Lambda + Xcov)
    # pairwise differences (Xmean_i - z_k) per latent dimension: n x q x m
    xz = Xmean.unsqueeze(2).expand(n, q, m) \
        - Z.t().unsqueeze(0).expand(n, q, m)
    Lxz = trtrs(L, xz)
    half_log_dets = L.diag().log().sum(1) \
        - length_scales.log().sum().expand(n)
    return self.variance.transform().expand(n, m) \
        * th.exp(-0.5 * Lxz.pow(2).sum(1) - half_log_dets.expand(n, m))
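# A Monte Carlo sanity check for the kernel expectation E_{x ~ N(mu, S)}[k(x, z)]
# with an ARD RBF kernel and diagonal S (the "n x q" case above), useful for
# the TODO'd test. A standalone NumPy sketch; all names are illustrative and
# independent of the kernel classes in this code base.
import numpy as np


def ard_rbf_np(x, z, variance, length_scales):
    d = (x - z) / length_scales
    return variance * np.exp(-0.5 * np.sum(d ** 2, axis=-1))


def eKxz_monte_carlo(z, x_mean, x_var_diag, variance, length_scales,
                     num_samples=100000, seed=0):
    # estimate E[k(x, z)] by sampling x ~ N(x_mean, diag(x_var_diag))
    rng = np.random.default_rng(seed)
    samples = rng.normal(x_mean, np.sqrt(x_var_diag),
                         size=(num_samples, x_mean.size))
    return ard_rbf_np(samples, z, variance, length_scales).mean()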
def _predict(self, Xnew_mean, Xnew_var=None, diag=True):
    """Computes the mean and variance of the latent function output
    corresponding to the new (possibly uncertain) input.

    The new input can be deterministic or uncertain (only Gaussian: mean and
    variance). Returns the predictions over all dimensions (for the
    imputation case, extract the needed dimensions after getting the
    returns).

    Args:
        Xnew_mean (np.ndarray): new latent input; it is the deterministic
            input if ``Xnew_var`` is None, otherwise it is the mean of the
            latent posterior, size n_* x q
        Xnew_var (np.ndarray): variance (covariance) of the latent
            posterior, iid case, still n_* x q (each row stores the
            diagonal of the covariance)

    Returns:
        (Variable): n_* x p, mean of the predicted latent output
        (Variable): covariance of the predicted latent output; for the
            deterministic case the n_* marginal variances (or the full
            n_* x n_* covariance if ``diag`` is False), shared across output
            dimensions, or a list of n_* p x p covariances for the uncertain
            Gaussian input, iid.
    """
    assert isinstance(Xnew_mean, np.ndarray) and \
        Xnew_mean.shape[1] == self.Xmean.size(1), (
            "Xnew_mean should be a numpy.ndarray, and its number of columns "
            "should equal the number of latent dimensions")
    Xnew_mean = Variable(th.Tensor(Xnew_mean).type(float_type), volatile=True)
    num_inducing = self.Z.size(0)
    beta = 1.0 / self.likelihood.variance.transform()

    # Psi1, Psi2
    eKxz = self.kernel.eKxz(self.Z, self.Xmean, self.Xcov)
    eKzxKxz = self.kernel.eKzxKxz(self.Z, self.Xmean, self.Xcov)
    Kzs = self.kernel.K(self.Z, Xnew_mean)
    Kzz = self.kernel.K(self.Z) + self.jitter.expand(self.Z.size(0)).diag()
    L = cholesky(Kzz, flag="Lkz")
    A = trtrs(L, trtrs(L, eKzxKxz).t()) * beta.expand_as(L)
    B = A + Variable(th.eye(num_inducing).type(float_type))
    Lb = cholesky(B, flag="Lb")
    C = trtrs(L, Kzs)
    D = trtrs(Lb, C)

    if Xnew_var is None:
        # broadcast update
        mean = D.t().mm(trtrs(Lb, trtrs(L, eKxz.t().mm(self.Y)))) \
            * beta.expand(Xnew_mean.size(0), self.Y.size(1))
        # return the full covariance or only the diagonal
        if diag:
            # 1d tensor
            var = (self.kernel.Kdiag(Xnew_mean) - C.pow(2).sum(0).squeeze()
                   + D.pow(2).sum(0).squeeze())
        else:
            var = self.kernel.K(Xnew_mean) - C.t().mm(C) + D.t().mm(D)
    else:
        # uncertain input, assumed Gaussian
        assert (isinstance(Xnew_var, np.ndarray)
                and Xnew_var.shape == Xnew_mean.shape), (
            "Uncertain input: inconsistent variance size, should be a "
            "numpy ndarray of the same shape as Xnew_mean")
        Xnew_var = Param(th.Tensor(Xnew_var).type(float_type))
        Xnew_var.requires_transform = True
        Xnew_var.volatile = True
        # s for star (new input), z for inducing input
        eKsz = self.kernel.eKxz(self.Z, Xnew_mean, Xnew_var)
        # list of n_* expectations w.r.t. each test datum
        eKzsKsz = self.kernel.eKzxKxz(self.Z, Xnew_mean, Xnew_var, sum=False)
        Im = Variable(th.eye(self.Z.size(0)).type(float_type))
        E = trtrs(Lb, trtrs(L, Im))
        EtE = E.t().mm(E)
        F = EtE.mm(eKxz.t().mm(self.Y)) * beta.expand(self.Z.size(0),
                                                      self.Y.size(1))
        mean = eKsz.mm(F)
        Linv = trtrs(L, Im)
        Sigma = Linv.t().mm(Linv) - EtE
        # n x m x m
        # eKzsKsz = eKzsKsz.cat(0).view(Xnew_mean.size(0), *self.Z.size())
        var = []
        if diag:
            ns = Xnew_mean.size(0)
            p = self.Y.size(1)
            # vectorization?
            for i in range(ns):
                cov = (self.kernel.variance.transform()
                       - Sigma.mm(eKzsKsz[i]).trace()).expand(p, p) \
                    + F.t().mm(eKzsKsz[i]
                               - eKsz[i, :].unsqueeze(0).t().mm(
                                   eKsz[i, :].unsqueeze(0))).mm(F)
                var.append(cov)
        else:
            # full covariance case, left for a future feature
            print("multi-output case, future feature")
            var = None
    return mean, var
def log_likelihood_inference(self):
    """Computes the loss in the inference mode, e.g. for projection.

    Handles both fully observed and partially observed data.
    Only the iid latent case is implemented.
    """
    num_data_train = self.Y.size(0)
    # dim_output_train = self.Y.size(1)
    dim_latent = self.Z.size(1)
    num_inducing = self.Z.size(0)
    num_data_test = self.Y_test.size(0)
    # total number of data for inference
    num_data = num_data_train + num_data_test
    # dimension of the output at test time
    dim_output = self.Y_test.size(1)
    # whole data for inference
    if self.observed_dims is None:
        Y = th.cat((self.Y, self.Y_test), 0)
    else:
        Y = th.cat(
            (self.Y.index_select(1, self.observed_dims), self.Y_test), 0)

    var_kernel = self.kernel.variance.transform()
    var_noise = self.likelihood.variance.transform()

    # computes kernel expectations
    # eKxx = num_data * self.kernel.eKxx(self.Xmean).sum()
    eKxx = num_data * var_kernel
    if self.data_type == "iid":
        eKxz_test = self.kernel.eKxz(self.Z, self.Xmean_test, self.Xcov_test)
        eKzxKxz_test = self.kernel.eKzxKxz(self.Z, self.Xmean_test,
                                           self.Xcov_test)
        eKxz = th.cat((self.saved_terms["eKxz"], eKxz_test), 0)
        eKzxKxz = self.saved_terms["eKzxKxz"] + eKzxKxz_test
    else:
        print("regressive case not implemented")

    # compute ELBO
    L = self.saved_terms["L"]
    A = trtrs(L, trtrs(L, eKzxKxz).t()) / var_noise.expand_as(L)
    B = A + Variable(th.eye(num_inducing).type(float_type))
    LB = cholesky(B, flag="LB")
    log_2pi = Variable(th.Tensor([np.log(2 * np.pi)]).type(float_type))
    elbo = -dim_output * (LB.diag().log().sum()
                          + 0.5 * num_data * (var_noise.log() + log_2pi))
    elbo -= 0.5 * dim_output * (eKxx / var_noise - A.diag().sum())
    if not self.is_large_p:
        # distributed
        # C = Variable(th.zeros(num_inducing, dim_output))
        # for i in xrange(num_data):
        #     C += Psi[i, :].unsqueeze(1).mm(self.Y[i, :].unsqueeze(0))
        C = eKxz.t().mm(Y)
        D = trtrs(LB, trtrs(L, C))
        elbo -= (0.5 * (Y.t().mm(Y) / var_noise.expand(dim_output, dim_output)
                        - D.t().mm(D)
                        / var_noise.pow(2).expand(dim_output,
                                                  dim_output)).trace())
    else:
        # small n, pre-compute YY'
        # YYT = self.Y.mm(self.Y.t())
        D = trtrs(LB, trtrs(L, eKxz.t()))
        W = Variable(th.eye(num_data).type(float_type)) \
            / var_noise.expand(num_data, num_data) \
            - D.t().mm(D) / var_noise.pow(2).expand(num_data, num_data)
        elbo -= 0.5 * (W.mm(self.saved_terms["YYT"])).trace()

    # KL divergence (KLD) between the posterior and the prior
    if self.data_type == "iid":
        const_nq = Variable(
            th.Tensor([num_data * dim_latent]).type(float_type))
        # eqn (3.28) below p57, Damianou's dissertation
        KLD = 0.5 * (self.Xmean.pow(2).sum() + self.Xcov.transform().sum()
                     - self.Xcov.transform().log().sum() - const_nq)
        elbo -= KLD
    return elbo
def log_likelihood(self):
    """
    Computation graph for the ELBO (Evidence Lower Bound) of the
    variational GPLVM.

    For the implementation details, please see ``notes/impl_gplvm``.
    """
    num_data = self.Y.size(0)
    dim_output = self.Y.size(1)
    dim_latent = self.Z.size(1)
    num_inducing = self.Z.size(0)
    var_kernel = self.kernel.variance.transform()
    var_noise = self.likelihood.variance.transform()

    # computes kernel expectations
    eKxx = num_data * var_kernel
    if self.data_type == "iid":
        eKxz = self.kernel.eKxz(self.Z, self.Xmean, self.Xcov)
        eKzxKxz = self.kernel.eKzxKxz(self.Z, self.Xmean, self.Xcov)
    else:
        # seq data
        # compute S_j's and mu_bar_j's (reparameterization: forward)
        # self.Xmean, self.Xcov = self._reparam_vargp(self.Xmean_bar, self.Lambda)
        Kx = self.kernel_x.K(np.array(xrange(self.Y.size(0)))[:, None])
        # print(Kx.data.eig())
        Lkx = cholesky(Kx, flag="Lkx")
        # Kx_inverse = inverse(Kx)
        self.Xmean = Kx.mm(self.Xmean_bar)
        Xcov = []
        # S = []
        Le = []
        In = Variable(th.eye(num_data).type(float_type))
        for j in xrange(dim_latent):
            Ej = Lkx.t().mm(self.Lambda.transform()[:, j].diag()).mm(Lkx) + In
            # print(Ej.data.eig())
            Lej = cholesky(Ej, flag="Lej")
            Lsj = trtrs(Lej, Lkx.t()).t()
            Sj = Lsj.mm(Lsj.t())
            Xcov.append(Sj.diag().unsqueeze(1))
            # S.append(Sj)
            Le.append(Lej)
        self.Xcov = th.cat(Xcov, 1)
        eKxz = self.kernel.eKxz(self.Z, self.Xmean, self.Xcov, False)
        eKzxKxz = self.kernel.eKzxKxz(self.Z, self.Xmean, self.Xcov, False)

    # compute ELBO
    # add jitter
    # broadcast update
    Kzz = self.kernel.K(self.Z) + self.jitter.expand(self.Z.size(0)).diag()
    L = cholesky(Kzz, flag="Lkz")
    A = trtrs(L, trtrs(L, eKzxKxz).t()) / var_noise.expand_as(L)
    B = A + Variable(th.eye(num_inducing).type(float_type))
    LB = cholesky(B, flag="LB")
    # log|B|
    # log_det_b = LB.diag().log().sum()
    log_2pi = Variable(th.Tensor([np.log(2 * np.pi)]).type(float_type))
    elbo = -dim_output * (LB.diag().log().sum()
                          + 0.5 * num_data * (var_noise.log() + log_2pi))
    elbo -= 0.5 * dim_output * (eKxx / var_noise - A.trace())
    if not self.is_large_p:
        # distributed
        # C = Variable(th.zeros(num_inducing, dim_output))
        # for i in xrange(num_data):
        #     C += Psi[i, :].unsqueeze(1).mm(self.Y[i, :].unsqueeze(0))
        C = eKxz.t().mm(self.Y)
        D = trtrs(LB, trtrs(L, C))
        elbo -= (0.5 * (self.Y.t().mm(self.Y)
                        / var_noise.expand(dim_output, dim_output)
                        - D.t().mm(D)
                        / var_noise.pow(2).expand(dim_output,
                                                  dim_output)).trace())
    else:
        # small n, pre-compute YY'
        # YYT = self.Y.mm(self.Y.t())
        D = trtrs(LB, trtrs(L, eKxz.t()))
        W = Variable(th.eye(num_data).type(float_type)) \
            / var_noise.expand(num_data, num_data) \
            - D.t().mm(D) / var_noise.pow(2).expand(num_data, num_data)
        elbo -= 0.5 * (W.mm(self.saved_terms["YYT"])).trace()

    # KL Divergence (KLD) btw the posterior and the prior
    if self.data_type == "iid":
        const_nq = Variable(
            th.Tensor([num_data * dim_latent]).type(float_type))
        # eqn (3.28) below p57 Damianou's Diss.
        KLD = 0.5 * (self.Xmean.pow(2).sum() + self.Xcov.transform().sum()
                     - self.Xcov.transform().log().sum() - const_nq)
    else:
        # seq data (3.29) p58
        # Xmean: n x q
        # S: q x n x n
        # Kx, Kx_inverse
        KLD = Variable(
            th.Tensor([-0.5 * num_data * dim_latent]).type(float_type))
        KLD += 0.5 * self.Xmean_bar.mm(self.Xmean_bar.t()).mm(Kx.t()).trace()
        for j in xrange(dim_latent):
            Lej_inv = trtrs(Le[j], In)
            KLD += 0.5 * Lej_inv.t().mm(Lej_inv).trace() \
                + Le[j].diag().log().sum()
    elbo -= KLD
    return elbo
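# A standalone sketch of the iid KL term used above, eqn (3.28) in Damianou's
# dissertation: KL( prod_i N(mu_i, diag(s_i)) || N(0, I) ), written in NumPy
# for testing purposes; the function name and arguments are illustrative only.
import numpy as np


def kl_diag_gaussian_vs_standard_normal(x_mean, x_var):
    # x_mean, x_var: n x q arrays of posterior means and (diagonal) variances
    return 0.5 * np.sum(x_mean ** 2 + x_var - np.log(x_var) - 1.0)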