def project(self, observ_test, observed_dims=None):
    """Prepare the model for inferring the latent inputs of new observations.

    The test data can be partially observed.

    This method only *prepares* the model: it stores the test data, freezes
    the trained parameters, initializes ``Xmean_test`` / ``Xcov_test`` from
    the nearest training neighbour and runs the pre-computation. The caller
    must optimize the model again afterwards; the projected latent variables
    are then available as ``self.Xmean_test`` and ``self.Xcov_test``.

    # TODO: Currently only the Gaussian approximations, and iid case.
    With recognition model, inference of latent would be faster.

    Args:
        observ_test (numpy.ndarray): Test observed data. A 1D array is
            treated as a single test point (one row).
        observed_dims (list, tuple or np.array, 1D): Observed dimensions of
            the partially observed test data. Must be provided for the
            partially observed test case.

    Returns:
        None. The model is modified in place.

    Raises:
        AssertionError: if the test data is not an ``np.ndarray``, or its
            number of columns does not match the (observed) output
            dimensions, or ``observed_dims`` has an unsupported type.
    """
    # Validate the type and promote 1D input to a single row *before* any
    # code reads ``shape[1]``; otherwise a 1D array fails with IndexError.
    assert isinstance(observ_test, np.ndarray), "Test data should be np.ndarray"
    if observ_test.ndim == 1:
        observ_test = observ_test[None, :]

    if observed_dims is None:
        # Fully observable data
        assert observ_test.shape[1] == self.Y.size(1), (
            "Test data dimension must equal to that of the training "
            "data for the fully observed case, otherwise please "
            "specify the observed dimensions using ``observed_dims``")
        # Drop dimensions left over from an earlier partially observed call,
        # since the inference loss branches on ``self.observed_dims``.
        self.observed_dims = None
    else:
        assert isinstance(observed_dims, (list, tuple, np.ndarray)), (
            "Type of the list of observed dimensions should be list "
            "or 1d np.array")
        assert observ_test.shape[1] == len(observed_dims), (
            "Number of columns of the test data must equal the number of "
            "observed dimensions")
        self.observed_dims = Variable(th.LongTensor(observed_dims))

    # Set the modes to be the inference mode (only once the inputs are valid)
    self.inference = True

    # Design choice: do not create a tiny inference model, but reuse the
    # trained model using another function for compute the loss.
    # Add new observation variables to the original class
    self.Y_test = Variable(th.Tensor(observ_test).type(float_type))

    # Freeze the trained parameters
    for param in self.parameters():
        param.requires_grad = False

    # initialize Xmean_test, Xcov_test by searching for the nearest
    # neighbour in the data space
    if observed_dims is None:
        Y_observed = self.Y
    else:
        Y_observed = self.Y.index_select(1, self.observed_dims)

    # Squared Euclidean distance: |a|^2 - 2 a.b + |b|^2
    YYT = self.Y_test.mm(Y_observed.t())
    dist_matrix = (-2 * YYT + self.Y_test.pow(2).sum(1).expand_as(YYT) +
                   Y_observed.t().pow(2).sum(0).expand_as(YYT))
    _, argmin = dist_matrix.min(1)
    argmin = argmin.view(self.Y_test.size(0)).data

    # These are created after the freeze above, so they stay trainable.
    self.Xmean_test = Param(self.Xmean.data[argmin])
    self.Xcov_test = Param(self.Xcov.transform().data[argmin],
                           requires_transform=True)
    print("GPLVM: Finish preparing the model for projection")

    self._pre_compute()
    print("GPLVM: Done with pre-computation. \nPlease optimize the model"
          " again to obtain the projected latent variables\n")
def test_init(self):
    """Smoke-test that every permitted combination of ``Param`` flags builds."""
    permitted_kwargs = (
        {},
        {"requires_grad": False},
        {"requires_transform": False},
        {"requires_grad": False, "requires_transform": False},
    )
    for kwargs in permitted_kwargs:
        Param(th.DoubleTensor([1.0]), **kwargs)
def test_transform(self):
    """
    Test that parameters requiring a transform return the correct value.

    Currently, we obtain the untransformed variable by default.
    Perhaps we should switch this in the future.
    """
    # Without a transform, both the raw data and ``transform()`` give the
    # value the parameter was built from.
    p = Param(th.DoubleTensor([1.0]))
    assert p.data.numpy()[0] == 1.0
    assert p.transform().data.numpy()[0] == 1.0

    # With a transform, ``transform()`` must recover the value passed to the
    # constructor. The original asserted on ``p`` again, leaving ``pt``
    # untested. A tolerance is used because the value presumably round-trips
    # through the transform and its inverse -- TODO confirm against Param.
    pt = Param(th.DoubleTensor([1.0]), requires_transform=True)
    assert abs(pt.transform().data.numpy()[0] - 1.0) < 1e-6
def test_access(self):
    """Check the types seen when reading a parameter's value."""
    param = Param(th.DoubleTensor([1.0]))
    # The raw storage is a torch tensor ...
    assert isinstance(param.data, th.DoubleTensor)
    # ... and it converts to a numpy array.
    as_numpy = param.data.numpy()
    assert isinstance(as_numpy, np.ndarray)
def __init__(self, observations, input, kernel, inducing_points=None,
             num_inducing=None, mean_function=None, name='variational_gp'):
    """
    Build a sparse GP with a Gaussian likelihood.

    Args:
        observations (np.ndarray): Y, n x p
        input (np.ndarray): X, n x q
        kernel (gptorch.Kernel):
        inducing_points (np.ndarray, optional): Z, m x q
        num_inducing (int, optional): number of inducing inputs

    Input, observations, and kernel must be specified. When
    ``inducing_points`` is not given, ``num_inducing`` rows of ``input`` are
    drawn at random as the inducing points; if ``num_inducing`` is not given
    either, one tenth of the rows (at least one) is used.
    """
    gaussian_noise = Gaussian()
    super(VFE, self).__init__(observations, input, kernel, gaussian_noise,
                              mean_function, name)

    if inducing_points is None:
        n_rows = len(input)
        if num_inducing is None:
            num_inducing = np.max([n_rows // 10, 1])
        # Random subset of the inputs: shuffle the row indices, keep a prefix.
        shuffled = np.arange(n_rows)
        np.random.shuffle(shuffled)
        chosen_rows = input[shuffled[:num_inducing]]
        inducing_points = Param(th.Tensor(chosen_rows).type(float_type))
    elif isinstance(inducing_points, np.ndarray):
        # Inducing points are free variational parameters (unconstrained), so
        # they may be optimized to regions outside the data.
        inducing_points = Param(th.Tensor(inducing_points).type(float_type))
    # Any other non-None value is stored as given.

    jitter_init = th.Tensor([1e-4]).type(float_type)
    self.jitter = Param(jitter_init, requires_transform=True)
    # Z stands for inducing points as standard in the literature
    self.Z = inducing_points
def _embed_posterior(self, x, epsilon):
    """Return one posterior sample per entry of ``x``, stacked into a tensor.

    Entries not yet present in ``self.loc`` first get a location and a scale
    parameter, initialized according to ``self.unseen_policy`` (``"random"``
    or ``"prior"``). Each sample is
    ``loc[xi] + scale[xi].transform() * epsilon[xi]``.
    """

    def _small_ones(size):
        return 0.1 * torch.ones(size)

    # Random initialization breaks symmetry and prevents posteriors from
    # starting in the same spot.
    loc_initializers = {"random": torch.randn, "prior": torch.zeros}
    scale_initializers = {"random": _small_ones, "prior": torch.ones}

    for key in x:
        if key in self.loc:
            continue
        make_loc = loc_initializers[self.unseen_policy]
        self._loc[key] = torch.nn.Parameter(make_loc(self.d_out))
        make_scale = scale_initializers[self.unseen_policy]
        self._scale[key] = Param(make_scale(self.d_out),
                                 transform=ExpTransform())

    # NB "epsilon" takes care of whether self.random or not.
    samples = []
    for key in x:
        samples.append(
            self.loc[key] + self.scale[key].transform() * epsilon[key])
    return torch.stack(samples)
class VFE(GPModel):
    """
    Variational Free Energy approximation for GP

    Sparse GP regression with a Gaussian likelihood and a set of inducing
    points ``Z``. ``compute_loss`` returns the negative variational lower
    bound and ``_predict`` returns the predictive mean and (co)variance.

    Reference:
        Titsias, Michalis K. "Variational Learning of Inducing Variables
        in Sparse Gaussian Processes." AISTATS. Vol. 5. 2009.
    """

    def __init__(self, observations, input, kernel, inducing_points=None,
                 num_inducing=None, mean_function=None, name='variational_gp'):
        """
        Assume Gaussian likelihood

        Args:
            observations (np.ndarray): Y, n x p
            input (np.ndarray): X, n x q
            kernel (gptorch.Kernel):
            inducing_points (np.ndarray, optional): Z, m x q
            num_inducing (int, optional): number of inducing inputs

        Input, observations, and kernel must be specified. If
        ``inducing_points`` is not set, ``num_inducing`` points are drawn
        randomly from the input as the inducing points; if ``num_inducing``
        is not set either, 1/10 th of the total points (at least one) is
        used.
        """
        likelihood = Gaussian()
        super(VFE, self).__init__(observations, input, kernel, likelihood,
                                  mean_function, name)
        if inducing_points is not None:
            if isinstance(inducing_points, np.ndarray):
                # inducing points are free variational parameters, no constraints
                # However, it is possible those points are optimized to outer regions
                inducing_points = Param(
                    th.Tensor(inducing_points).type(float_type))
            # NOTE(review): a non-None, non-ndarray value is stored as given.
        else:
            if num_inducing is None:
                num_inducing = np.max([len(input) // 10, 1])
            # randomly select num_inducing points from input
            indices = np.arange(len(input))
            np.random.shuffle(indices)
            inducing_points = Param(th.Tensor(input[indices[:num_inducing]]).\
                                    type(float_type))

        # Jitter added to the diagonal of Kuu before its Cholesky
        # factorization; stored as a transformed parameter.
        self.jitter = Param(th.Tensor([1e-4]).type(float_type),
                            requires_transform=True)
        # Z stands for inducing points as standard in the literature
        self.Z = inducing_points

    def compute_loss(self):
        """
        Computes the variational lower bound of the true log marginal likelihood
        Eqn (9) in Titsias, Michalis K. "Variational Learning of Inducing Variables
        in Sparse Gaussian Processes." AISTATS. Vol. 5. 2009.

        Returns:
            The negated lower bound (``-elbo``), i.e. a quantity to minimize.
        """
        num_inducing = self.Z.size(0)
        num_training = self.X.size(0)
        dim_output = self.Y.size(1)
        # TODO: add mean_functions
        # err = self.Y - self.mean_function(self.X)
        # The mean function is currently not subtracted from the observations.
        err = self.Y
        Kff_diag = self.kernel.Kdiag(self.X)
        Kuf = self.kernel.K(self.Z, self.X)
        # add jitter
        Kuu = self.kernel.K(self.Z) + \
            self.jitter.transform().expand(num_inducing).diag()
        L = cholesky(Kuu)

        # A = L^{-1} Kuf (presumably a triangular solve -- confirm trtrs)
        A = trtrs(L, Kuf)
        AAT = A.mm(A.t()) / self.likelihood.variance.transform().expand_as(Kuu)
        B = AAT + Variable(th.eye(num_inducing).type(float_type))
        LB = cholesky(B)
        # divide variance at the end
        c = trtrs(LB, A.mm(err)) \
            / self.likelihood.variance.transform().expand(num_inducing, dim_output)

        # Evidence lower bound
        elbo = Variable(
            th.Tensor([-0.5 * dim_output * num_training * np.log(2 * np.pi)
                       ]).type(float_type))
        elbo -= dim_output * LB.diag().log().sum()
        elbo -= 0.5 * dim_output * num_training * self.likelihood.variance.transform(
        ).log()
        elbo -= 0.5 * (err.pow(2).sum() + dim_output * Kff_diag.sum()) \
            / self.likelihood.variance.transform()
        elbo += 0.5 * c.pow(2).sum()
        elbo += 0.5 * dim_output * AAT.diag().sum()
        return -elbo

    def _predict(self, input_new, diag=True):
        """Predict the mean and (co)variance at ``input_new``.

        Args:
            input_new: New inputs. An ``np.ndarray`` is wrapped into a
                volatile ``Variable``; other types are used as given.
            diag (bool): If True, ``var`` is built from
                ``kernel.Kdiag(input_new)`` (per-point variances); otherwise
                from the full ``kernel.K(input_new)``.

        Returns:
            Tuple ``(mean, var)``. The mean function is not added to ``mean``.

        Side effects:
            Marks ``self.X``, ``self.Y`` and ``self.Z`` as volatile.
        """
        # following GPflow implementation
        # integrating the inducing variables out
        if isinstance(input_new, np.ndarray):
            # set input_new to be volatile for inference mode
            input_new = Variable(th.Tensor(input_new).type(float_type),
                                 volatile=True)

        self.X.volatile = True
        self.Y.volatile = True
        self.Z.volatile = True

        num_inducing = self.Z.size(0)
        dim_output = self.Y.size(1)
        # err = self.Y - self.mean_function(self.X)
        err = self.Y
        # Kff_diag = self.kernel.Kdiag(self.X)
        Kuf = self.kernel.K(self.Z, self.X)
        # add jitter
        # Kuu = self.kernel.K(self.Z) + Variable(th.eye(num_inducing).float() * 1e-5)
        Kuu = self.kernel.K(
            self.Z) + self.jitter.transform().expand(num_inducing).diag()
        Kus = self.kernel.K(self.Z, input_new)
        L = cholesky(Kuu)
        A = trtrs(L, Kuf)
        AAT = A.mm(A.t()) / self.likelihood.variance.transform().expand_as(Kuu)
        B = AAT + Variable(th.eye(num_inducing).type(float_type))
        LB = cholesky(B)
        # divide variance at the end
        c = trtrs(LB, A.mm(err)) \
            / self.likelihood.variance.transform().expand(num_inducing, dim_output)
        tmp1 = trtrs(L, Kus)
        tmp2 = trtrs(LB, tmp1)
        mean = tmp2.t().mm(c)
        if diag:
            var = self.kernel.Kdiag(input_new) - tmp1.pow(2).sum(0).squeeze() \
                + tmp2.pow(2).sum(0).squeeze()
            # add kronecker product later for multi-output case
        else:
            var = self.kernel.K(input_new) + tmp2.t().mm(tmp2) \
                - tmp1.t().mm(tmp1)
        # return mean + self.mean_function(input_new), var
        return mean, var
def _predict(self, Xnew_mean, Xnew_var=None, diag=True):
    """Computes the mean and variance of latent function output
    corresponding to the new (uncertain) input

    The new input can be deterministic or uncertain (only Gaussian: mean and
    variance). Returns the predictions over all dimensions (extract the
    needed dimensions for imputation case after getting the returns)

    Args:
        Xnew_mean (np.ndarray): new latent input, it is the deterministic
            input if ``Xnew_var`` is None, otherwise it is the mean of the
            latent posterior, size n_* x q
        Xnew_var (np.ndarray): variance (covariance) of latent posterior,
            iid case, same shape as ``Xnew_mean`` (each row stores the
            diagonal of cov)
        diag (bool): deterministic input: return only the diagonal of the
            predictive covariance instead of the full matrix. Uncertain
            input: must be True to obtain a covariance at all.

    Returns:
        (Variable): mean of the predicted latent output, n_* x p
        covariance of the predicted latent output:
            - deterministic input, ``diag=True``: 1d tensor of variances;
            - deterministic input, ``diag=False``: full covariance matrix
              over the new inputs;
            - uncertain input, ``diag=True``: list of n_* matrices, each
              p x p;
            - uncertain input, ``diag=False``: ``None`` (not implemented).

    Raises:
        AssertionError: if ``Xnew_mean`` is not an ndarray with q columns,
            or ``Xnew_var`` is not an ndarray of the same shape as
            ``Xnew_mean``.
    """
    assert isinstance(
        Xnew_mean, np.ndarray) and Xnew_mean.shape[1] == self.Xmean.size(1), (
            "Input_mean should be numpy.ndarary, and its column dims "
            "should be same as the latent dimensions")
    # Keep the numpy shape: the variance is validated against it below,
    # after ``Xnew_mean`` has been wrapped into a Variable.
    mean_shape = Xnew_mean.shape
    Xnew_mean = Variable(th.Tensor(Xnew_mean).type(float_type),
                         volatile=True)

    num_inducing = self.Z.size(0)
    beta = 1.0 / self.likelihood.variance.transform()
    # Psi1, Psi2
    eKxz = self.kernel.eKxz(self.Z, self.Xmean, self.Xcov)
    eKzxKxz = self.kernel.eKzxKxz(self.Z, self.Xmean, self.Xcov)
    Kzs = self.kernel.K(self.Z, Xnew_mean)
    Kzz = self.kernel.K(self.Z) + self.jitter.expand(self.Z.size(0)).diag()
    L = cholesky(Kzz, flag="Lkz")
    A = trtrs(L, trtrs(L, eKzxKxz).t()) * beta.expand_as(L)
    B = A + Variable(th.eye(num_inducing).type(float_type))
    Lb = cholesky(B, flag="Lb")
    C = trtrs(L, Kzs)
    D = trtrs(Lb, C)

    if Xnew_var is None:
        # broadcast updated
        mean = D.t().mm(trtrs(Lb, trtrs(
            L,
            eKxz.t().mm(self.Y)))) * beta.expand(Xnew_mean.size(0),
                                                 self.Y.size(1))
        # return full covariance or only the diagonal
        if diag:
            # 1d tensor
            var = (self.kernel.Kdiag(Xnew_mean) - C.pow(2).sum(0).squeeze() +
                   D.pow(2).sum(0).squeeze())
        else:
            var = self.kernel.K(Xnew_mean) - C.t().mm(C) + D.t().mm(D)
    else:
        # uncertain input, assume Gaussian.
        # The original compared ``Xnew_var.shape`` with itself, so the size
        # check could never fail.
        assert (isinstance(Xnew_var, np.ndarray)
                and Xnew_var.shape == mean_shape), (
                    "Uncertain input, inconsistent variance size, "
                    "should be numpy ndarray")
        Xnew_var = Param(th.Tensor(Xnew_var).type(float_type))
        # NOTE(review): the flag is set after construction, so the data was
        # presumably not inverse-transformed -- confirm this is intended.
        Xnew_var.requires_transform = True
        Xnew_var.volatile = True
        # s for star (new input), z for inducing input
        eKsz = self.kernel.eKxz(self.Z, Xnew_mean, Xnew_var)
        # list of n_* expectations w.r.t. each test datum
        eKzsKsz = self.kernel.eKzxKxz(self.Z, Xnew_mean, Xnew_var, sum=False)
        Im = Variable(th.eye(self.Z.size(0)).type(float_type))
        # Solve against the identity once and reuse it for E and Sigma.
        Linv = trtrs(L, Im)
        E = trtrs(Lb, Linv)
        EtE = E.t().mm(E)
        F = EtE.mm(eKxz.t().mm(self.Y)) * beta.expand(
            self.Z.size(0), self.Y.size(1))
        mean = eKsz.mm(F)
        Sigma = Linv.t().mm(Linv) - EtE
        # n x m x m
        # eKzsKsz = eKzsKsz.cat(0).view(Xnew_mean.size(0), *self.Z.size())
        if diag:
            var = []
            ns = Xnew_mean.size(0)
            p = self.Y.size(1)
            # vectorization?
            for i in range(ns):
                eKsz_i = eKsz[i, :].unsqueeze(0)
                cov = (self.kernel.variance.transform() - Sigma.mm(
                    eKzsKsz[i]).trace()).expand(
                        p, p) + F.t().mm(eKzsKsz[i] - eKsz_i.t().mm(
                            eKsz_i)).mm(F)
                var.append(cov)
        else:
            # full covariance case, leave for future
            print("multi-output case, future feature")
            var = None
    return mean, var
def __init__(
        self,
        observations,
        dim_latent,
        num_inducing,
        Xmean=None,
        inducing_points=None,
        kernel=None,
        kernel_x=None,
        data_type="iid",
        collapsed_bound=True,
        large_p=False,
):
    """
    Initialization for the variational GPLVM

    Args:
        observations (np.ndarray): Observed data for unsupervised learning
        dim_latent (int): Dimensionality of the latent variables
        num_inducing (int): Number of inducing points
        Xmean (np.ndarray): Latent variable means (if None, will be init by
            PCA)
        inducing_points (np.ndarray): Inducing points, Z, of shape
            ``num_inducing x dim_latent``. If None, a random subset of the
            rows of ``Xmean`` is used.
        kernel (gptorch.Kernel): Kernel on the latent space; only
            ``ekernels.Rbf`` is supported (default: ARD Rbf).
        kernel_x (kernels.Kernel): Temporal kernel with one-dimensional
            input, used only when ``data_type`` is not ``iid``.
        data_type (string): ``iid`` or ``seq`` (sequential)
        collapsed_bound (bool, optional): True for computing the ELBO when
            inducing variables are collapsed (second bound), False for the
            uncollapsed bound
        large_p (bool, optional): True for the case of small n, large p
            (HD video), False for the case of large n, small p. This option
            affects the computation of KL(q(X) || p(X))

    Raises:
        AssertionError: on wrongly typed ``observations`` / ``Xmean``, an
            unsupported or mis-dimensioned ``kernel`` / ``kernel_x``, or
            mis-shaped ``inducing_points``.
    """
    warn("GPLVM is unstable and not recommended for use!")
    assert isinstance(
        observations,
        np.ndarray), "Observation matrix should be a np.ndarray."

    # Every source of ``Xmean`` is converted with ``util.as_variable``: the
    # code below reads ``Xmean.data`` and ``Xmean.size(...)``, which are not
    # tensor accessors on a raw ``np.ndarray``. The original converted only
    # the ``util.PCA`` result.
    if Xmean is None:
        print("GPLVM: Initialize the Xmean using PCA")
        if large_p:
            pca_sklean = PCA(n_components=dim_latent)
            Xmean = util.as_variable(pca_sklean.fit_transform(observations))
        else:
            Xmean = util.as_variable(util.PCA(observations, dim_latent))
    else:
        assert isinstance(Xmean, np.ndarray), (
            "Initialization of posterior mean of latent variables should"
            " be np.ndarray.")
        Xmean = util.as_variable(Xmean)

    if kernel is None:
        kernel = ekernels.Rbf(dim_latent, ARD=True)
    else:
        assert (
            dim_latent == kernel.input_dim
        ), "Input dimensionality of kernel must be equal to dim_latent."
        assert isinstance(
            kernel, ekernels.Rbf), "Supports only ekernel.Rbf currently."

    super(GPLVM, self).__init__(Xmean,
                                observations,
                                kernel,
                                Gaussian(),
                                Zero(),
                                name="GPLVM")
    # The latent inputs are modelled by Xmean / Xcov below, not by X.
    del self.X

    # flag to distinguish training and testing mode
    self.inference = False
    self.data_type = data_type
    self.is_collapsed = collapsed_bound
    self.is_large_p = large_p

    # Setup for test time inference
    # test data will be assigned in the projection method
    self.Y_test = None
    # latent variable mean and covariance for the test data
    self.Xmean_test = None
    self.Xcov_test = None
    # observed dimensions of the test data
    self.observed_dims = None

    self.saved_terms = {}
    if self.is_large_p:
        # saved for faster computation in the lower bound, n x n
        self.saved_terms["YYT"] = self.Y.mm(self.Y.t())

    if self.data_type == "iid":
        # posterior mean of X initialized by PCA of Y: n x q
        self.Xmean = Param(Xmean.data)
        # posterior covariance of X: n x q
        self.Xcov = Param(
            0.5 * th.ones(self.Xmean.size()).type(float_type) +
            0.001 * th.randn(self.Xmean.size()).type(float_type),
            requires_transform=True,
        )
    else:
        # sequential data
        # temporal kernel for the GP from time t to latent variables X
        if isinstance(kernel_x, kernels.Kernel):
            assert kernel_x.input_dim == 1, (
                "Currently only supports time input, i.e. kernel with "
                "one dimension input")
            self.kernel_x = kernel_x
        else:
            # TODO: kernel_x, better initialization needed!
            self.kernel_x = kernels.Rbf(1, variance=0.5, length_scales=0.5)
            # NOTE(review): assumed to freeze only the default kernel's
            # variance -- confirm it should not also apply to a
            # user-supplied ``kernel_x``.
            self.kernel_x.variance.requires_grad = False

        # 1) vanilla, O(n^2*q) parameters for q(X) (not scalable)
        # ----- 2) Reparameterization (3.30) p58 in Damianou Diss. ------ current impl
        # 3) recognition model (will be the useful one) (RNN)
        # TODO: add the ability to handle multiple sequences (regressive)

        # 2) Reparameterization (3.30) p58 in Damianou Diss.
        # posterior mean of X initialized by PCA of Y: n x q
        # intermediate variables, useful for inference purpose, or other queries
        self.Xmean = Variable(Xmean)
        # init the cov matrix by using the kernel
        # timestamp is not required for stationary kernels
        # ``np.arange`` replaces ``np.array(xrange(...))``: same integer
        # column vector, but ``xrange`` does not exist on Python 3.
        Kx = self.kernel_x.K(np.arange(self.Y.size(0))[:, None])
        # optimization parameters are mu_bar as in (3.30)
        self.Xmean_bar = Param(Kx.data.inverse().mm(Xmean))
        # assume the posterior S is close to the prior Kx
        # Constrain the Lambda to be positive, to ensure the S is PSD
        self.Lambda = Param(th.rand(Xmean.size()).type(float_type) * 0.25,
                            requires_transform=True)
        # dummy initialization, n x q
        self.Xcov = Variable(th.ones(Xmean.size()).type(float_type) * 0.5)

    if inducing_points is not None:
        if isinstance(inducing_points, np.ndarray):
            assert (inducing_points.shape[0] == num_inducing
                    and inducing_points.shape[1] == dim_latent
                    ), "Dimensionality of inducing points does not match"
            self.Z = Param(th.from_numpy(inducing_points).type(float_type))
    else:
        # inducing points Z, init with subset of posterior mean of X
        z_np = Xmean.data.numpy()[np.random.choice(Xmean.size(0),
                                                   num_inducing,
                                                   replace=False)]
        self.Z = Param(float_type(z_np))

    # Uncollapsed case, the number of parameters associated with inducing points
    # variance is O(m^2)
    # TODO: stochastic optimization with the uncollapsed bound
    # MNIST data set 60k, 28 x 28 digits
    if not self.is_collapsed:
        # posterior mean of inducing variables U, init with subset of observations
        # NOTE(review): ``th.from_numpy`` is applied to an indexed ``self.Y``,
        # which is used as a torch Variable elsewhere -- confirm this
        # branch is exercised.
        self.Umean = Param(
            th.from_numpy(
                self.Y[np.random.choice(self.Y.size(0),
                                        num_inducing,
                                        replace=False)]).type(float_type))
        # posterior variance of inducing variables U: m x m
        # needs parameterization of cov matrix, e.g. Chol decomposition
        self.Ucov = Param(0.5 * th.ones(num_inducing, num_inducing),
                          requires_transform=True)

    # Fixed (non-optimized) jitter added to the diagonal of Kzz.
    self.jitter = Variable(th.Tensor([1e-6]).type(float_type))

    # computes the total number of parameters to optimize over
    num_parameters = 0
    for param in self.parameters():
        num_parameters += param.data.numpy().size
    print("GPLVM: Number of optimization parameters is %d" % num_parameters)
class GPLVM(GPModel): """ Variational GPLVM Reference: Damianou, Andreas. Deep Gaussian processes and variational propagation of uncertainty. Diss. University of Sheffield, 2015. """ def __init__( self, observations, dim_latent, num_inducing, Xmean=None, inducing_points=None, kernel=None, kernel_x=None, data_type="iid", collapsed_bound=True, large_p=False, ): """ Initialization for the variational GPLVM Args: observations (np.ndarray): Observed data for unsupervised learning dim_latent (int): Dimensionality of the latent variables num_inducing (int): Number of inducing points Xmean (np.ndarray): Latent variable means (if None, will be init by PCA) inducing_points (np.ndarray): Inducing points, Z kernel (gptorch.Kernel): data_type (string): ``iid`` or ``seq`` (sequential) collapsed_bound (bool, optional): True for computing the ELBO when inducing variables are collapsed (second bound), False for the uncollapsed bound large_p (bool, optional): True for the case of small n, large p (HD video), False for the case of large p, small n. This option affects the computation of KL(q(X) || p(X)) """ warn("GPLVM is unstable and not recommended for use!") assert isinstance( observations, np.ndarray), "Observation matrix should be a np.ndarray." if Xmean is None: print("GPLVM: Initialize the Xmean using PCA") if large_p: pca_sklean = PCA(n_components=dim_latent) Xmean = pca_sklean.fit_transform(observations) else: Xmean = util.as_variable(util.PCA(observations, dim_latent)) else: assert isinstance(Xmean, np.ndarray), ( "Initialization of posterior mean of latent variables should" " be np.ndarray.") if kernel is None: kernel = ekernels.Rbf(dim_latent, ARD=True) else: assert ( dim_latent == kernel.input_dim ), "Input dimensionality of kernel must be equal to dim_latent." assert isinstance( kernel, ekernels.Rbf), "Supports only ekernel.Rbf currently." 
super(GPLVM, self).__init__(Xmean, observations, kernel, Gaussian(), Zero(), name="GPLVM") del self.X # flag to distinguish training and testing mode self.inference = False self.data_type = data_type self.is_collapsed = collapsed_bound self.is_large_p = large_p # Setup for test time inference # test data will be assigned in the projection method self.Y_test = None # latent variable mean and covariance for the test data self.Xmean_test = None self.Xcov_test = None # observed dimensions of the test data self.observed_dims = None self.saved_terms = {} if self.is_large_p: # saved for faster computation in the lower bound, n x n self.saved_terms["YYT"] = self.Y.mm(self.Y.t()) if self.data_type == "iid": # posterior mean of X initialized by PCA of Y: n x q # self.Xmean = Param(th.from_numpy(Xmean).type(float_type)) self.Xmean = Param(Xmean.data) # posterior covariance of X: n x q self.Xcov = Param( 0.5 * th.ones(self.Xmean.size()).type(float_type) + 0.001 * th.randn(self.Xmean.size()).type(float_type), requires_transform=True, ) else: # sequential data # temporal kernel for the GP from time t to latent variables X if isinstance(kernel_x, kernels.Kernel): assert kernel_x.input_dim == 1, ( "Currently only supports time input, i.e. kernel with " "one dimension input") self.kernel_x = kernel_x else: # TODO: kernel_x, better initialization needed! self.kernel_x = kernels.Rbf(1, variance=0.5, length_scales=0.5) self.kernel_x.variance.requires_grad = False # 1) vanilla, O(n^2*q) parameters for q(X) (not scalable) # ----- 2) Reparameterization (3.30) p58 in Damianou Diss. ------ current impl # 3) recognition model (will be the useful one) (RNN) # TODO: add the ability to handle multiple sequences (regressive) # 2) Reparameterization (3.30) p58 in Damianou Diss. 
# posterior mean of X initialized by PCA of Y: n x q # Xmean = th.Tensor(x_post_mean).type(float_type) # intermediate variables, useful for inference purpose, or other queries self.Xmean = Variable(Xmean) # init the cov matrix by using the kernel # timestamp is not required for stationary kernels Kx = self.kernel_x.K(np.array(xrange(self.Y.size(0)))[:, None]) # optimization parameters are mu_bar as in (3.30) self.Xmean_bar = Param(Kx.data.inverse().mm(Xmean)) # assume the posterior S is the same as the prior Kx # self.lambda_ = Param(th.zeros(Xmean.size()).type(float_type)) # assume the posterior S is close to the prior Kx # Constrain the Lambda to be positive, to ensure the S is PSD self.Lambda = Param(th.rand(Xmean.size()).type(float_type) * 0.25, requires_transform=True) # dummy initialization, n x q self.Xcov = Variable(th.ones(Xmean.size()).type(float_type) * 0.5) if inducing_points is not None: if isinstance(inducing_points, np.ndarray): assert (inducing_points.shape[0] == num_inducing and inducing_points.shape[1] == dim_latent ), "Dimensionality of inducing points does not match" self.Z = Param(th.from_numpy(inducing_points).type(float_type)) else: # inducing points Z, init with subset of posterior mean of X z_np = Xmean.data.numpy()[np.random.choice(Xmean.size(0), num_inducing, replace=False)] self.Z = Param(float_type(z_np)) # Uncollapsed case, the number of parameters associated with inducing points # variance is O(m^2) # TODO: stochasitic optimization with the uncollpased bound # MNIST data set 60k, 28 x 28 digits if not self.is_collapsed: # posterior mean of inducing variables U, init with subset of observations self.Umean = Param( th.from_numpy( self.Y[np.random.choice(self.Y.size(0), num_inducing, replace=False)]).type(float_type)) # posterior variance of inducing variables U: m x m # needs parameterization of cov matrix, e.g. 
Chol decomposition self.Ucov = Param(0.5 * th.ones(num_inducing, num_inducing), requires_transform=True) # self.jitter = Param(th.FloatTensor([1e-4]), requires_transform=True) self.jitter = Variable(th.Tensor([1e-6]).type(float_type)) # computes the total number of parameters to optimize over num_parameters = 0 for param in self.parameters(): num_parameters += param.data.numpy().size print("GPLVM: Number of optimization parameters is %d" % num_parameters) def log_likelihood(self): """ Computation graph for the ELBO (Evidence Lower Bound) of the variational GPLVM For the implementation details, please see ``notes/impl_gplvm``. """ num_data = self.Y.size(0) dim_output = self.Y.size(1) dim_latent = self.Z.size(1) num_inducing = self.Z.size(0) var_kernel = self.kernel.variance.transform() var_noise = self.likelihood.variance.transform() # computes kernel expectations eKxx = num_data * var_kernel if self.data_type == "iid": eKxz = self.kernel.eKxz(self.Z, self.Xmean, self.Xcov) eKzxKxz = self.kernel.eKzxKxz(self.Z, self.Xmean, self.Xcov) else: # seq data # compute S_j's and mu_bar_j's (reparameterization: forward) # self.Xmean, self.Xcov = self._reparam_vargp(self.Xmean_bar, self.Lambda) Kx = self.kernel_x.K(np.array(xrange(self.Y.size(0)))[:, None]) # print(Kx.data.eig()) Lkx = cholesky(Kx, flag="Lkx") # Kx_inverse = inverse(Kx) self.Xmean = Kx.mm(self.Xmean_bar) Xcov = [] # S = [] Le = [] In = Variable(th.eye(num_data).type(float_type)) for j in xrange(dim_latent): Ej = Lkx.t().mm(self.Lambda.transform()[:, j].diag()).mm(Lkx) + In # print(Ej.data.eig()) Lej = cholesky(Ej, flag="Lej") Lsj = trtrs(Lej, Lkx.t()).t() Sj = Lsj.mm(Lsj.t()) Xcov.append(Sj.diag().unsqueeze(1)) # S.append(Sj) Le.append(Lej) self.Xcov = th.cat(Xcov, 1) eKxz = self.kernel.eKxz(self.Z, self.Xmean, self.Xcov, False) eKzxKxz = self.kernel.eKzxKxz(self.Z, self.Xmean, self.Xcov, False) # compute ELBO # add jitter # broadcast update Kzz = self.kernel.K(self.Z) + 
self.jitter.expand(self.Z.size(0)).diag() L = cholesky(Kzz, flag="Lkz") A = trtrs(L, trtrs(L, eKzxKxz).t()) / var_noise.expand_as(L) B = A + Variable(th.eye(num_inducing).type(float_type)) LB = cholesky(B, flag="LB") # log|B| # log_det_b = LB.diag().log().sum() log_2pi = Variable(th.Tensor([np.log(2 * np.pi)]).type(float_type)) elbo = -dim_output * (LB.diag().log().sum() + 0.5 * num_data * (var_noise.log() + log_2pi)) elbo -= 0.5 * dim_output * (eKxx / var_noise - A.trace()) if not self.is_large_p: # distributed # C = Variable(th.zeros(num_inducing, dim_output)) # for i in xrange(num_data): # C += Psi[i, :].unsqueeze(1).mm(self.Y[i, :].unsqueeze(0)) C = eKxz.t().mm(self.Y) D = trtrs(LB, trtrs(L, C)) elbo -= (0.5 * (self.Y.t().mm(self.Y) / var_noise.expand(dim_output, dim_output) - D.t().mm(D) / var_noise.pow(2).expand(dim_output, dim_output)).trace()) else: # small n, pre-compute YY' # YYT = self.Y.mm(self.Y.t()) D = trtrs(LB, trtrs(L, eKxz.t())) W = Variable(th.eye(num_data).type(float_type)) / var_noise.expand( num_data, num_data) - D.t().mm(D) / var_noise.pow(2).expand( num_data, num_data) elbo -= 0.5 * (W.mm(self.saved_terms["YYT"])).trace() # KL Divergence (KLD) btw the posterior and the prior if self.data_type == "iid": const_nq = Variable( th.Tensor([num_data * dim_latent]).type(float_type)) # eqn (3.28) below p57 Damianou's Diss. KLD = 0.5 * (self.Xmean.pow(2).sum() + self.Xcov.transform().sum() - self.Xcov.transform().log().sum() - const_nq) else: # seq data (3.29) p58 # Xmean n x q # S: q x n x n # Kx, Kx_inverse KLD = Variable( th.Tensor([-0.5 * num_data * dim_latent]).type(float_type)) KLD += 0.5 * self.Xmean_bar.mm(self.Xmean_bar.t()).mm( Kx.t()).trace() for j in xrange(dim_latent): Lej_inv = trtrs(Le[j], In) KLD += 0.5 * Lej_inv.t().mm(Lej_inv).trace() + Le[j].diag( ).log().sum() elbo -= KLD return elbo def log_likelihood_inference(self): """Computes the loss in the inference mode, e.g. for projection. 
Handles both fully observed and partially observed data. Only iid latent is implemented. """ num_data_train = self.Y.size(0) # dim_output_train = self.Y.size(1) dim_latent = self.Z.size(1) num_inducing = self.Z.size(0) num_data_test = self.Y_test.size(0) # total number of data for inference num_data = num_data_train + num_data_test # dimension of output in the test time dim_output = self.Y_test.size(1) # whole data for inference if self.observed_dims is None: Y = th.cat((self.Y, self.Y_test), 0) else: Y = th.cat( (self.Y.index_select(1, self.observed_dims), self.Y_test), 0) var_kernel = self.kernel.variance.transform() var_noise = self.likelihood.variance.transform() # computes kernel expectations # eKxx = num_data * self.kernel.eKxx(self.Xmean).sum() eKxx = num_data * var_kernel if self.data_type == "iid": eKxz_test = self.kernel.eKxz(self.Z, self.Xmean_test, self.Xcov_test) eKzxKxz_test = self.kernel.eKzxKxz(self.Z, self.Xmean_test, self.Xcov_test) eKxz = th.cat((self.saved_terms["eKxz"], eKxz_test), 0) eKzxKxz = self.saved_terms["eKzxKxz"] + eKzxKxz_test else: print("regressive case not implemented") # compute ELBO L = self.saved_terms["L"] A = trtrs(L, trtrs(L, eKzxKxz).t()) / var_noise.expand_as(L) B = A + Variable(th.eye(num_inducing).type(float_type)) LB = cholesky(B, flag="LB") log_2pi = Variable(th.Tensor([np.log(2 * np.pi)]).type(float_type)) elbo = -dim_output * (LB.diag().log().sum() + 0.5 * num_data * (var_noise.log() + log_2pi)) elbo -= 0.5 * dim_output * (eKxx / var_noise - A.diag().sum()) if not self.is_large_p: # distributed # C = Variable(th.zeros(num_inducing, dim_output)) # for i in xrange(num_data): # C += Psi[i, :].unsqueeze(1).mm(self.Y[i, :].unsqueeze(0)) C = eKxz.t().mm(Y) D = trtrs(LB, trtrs(L, C)) elbo -= (0.5 * (Y.t().mm(Y) / var_noise.expand(dim_output, dim_output) - D.t().mm(D) / var_noise.pow(2).expand(dim_output, dim_output)).trace()) else: # small n, pre-compute YY' # YYT = self.Y.mm(self.Y.t()) D = trtrs(LB, trtrs(L, eKxz.t())) W = 
Variable(th.eye(num_data).type(float_type)) / var_noise.expand( num_data, num_data) - D.t().mm(D) / var_noise.pow(2).expand( num_data, num_data) elbo -= 0.5 * (W.mm(self.saved_terms["YYT"])).trace() # KL Divergence (KLD) btw the posterior and the prior if self.data_type == "iid": const_nq = Variable( th.Tensor([num_data * dim_latent]).type(float_type)) # eqn (3.28) below p57 Damianou's Diss. KLD = 0.5 * (self.Xmean.pow(2).sum() + self.Xcov.transform().sum() - self.Xcov.transform().log().sum() - const_nq) elbo -= KLD return elbo def loss(self): if not self.inference: return super().loss() else: return -(self.log_likelihood_inference() + self.log_prior()) def _pre_compute(self): """Pre-computation for the projection Fixed terms in test time are manually identified, Only iid latent is implemented. """ # Save the fixed terms here # self.saved_terms = {} if self.observed_dims is not None: # select observed dims to compute Y = th.cat( (self.Y.index_select(1, self.observed_dims), self.Y_test), 0) self.saved_terms["YYT"] = Y.mm(Y.t()) # computes kernel expectations if self.data_type == "iid": eKxz = self.kernel.eKxz(self.Z, self.Xmean, self.Xcov) eKzxKxz = self.kernel.eKzxKxz(self.Z, self.Xmean, self.Xcov) self.saved_terms["eKxz"] = eKxz self.saved_terms["eKzxKxz"] = eKzxKxz else: print("regressive case, not implemented") Kzz = self.kernel.K(self.Z) + self.jitter.expand(self.Z.size(0)).diag() L = cholesky(Kzz, flag="L") self.saved_terms["L"] = L def project(self, observ_test, observed_dims=None): """Infers the latent input corresponding to the new observed data The test data can be partially observed. # TODO: Currently only the Gaussian approximations, and iid case. With recognition model, inference of latent would be faster. Args: observ_test (numpy.ndarray): Test observed data observed_dims (list or np.array, 1D): Observed dimensions of the partially observed test data. Must be provided for partially observed test case. 
Returns: mean and variance of the posterior of Gaussian approximations """ # Set the modes to be the inference mode self.inference = True if observed_dims is None: # Fully observable data assert observ_test.shape[1] == self.Y.size(1), ( "Test data dimension must equal to that of the training " "data for the fully observed case, otherwise please " "specify the observed dimensions using ``observed_dims``") else: assert isinstance(observed_dims, (basestring, np.ndarray)), ( "Type of the list of observed dimensions should be list " "or 1d np.array") self.observed_dims = Variable(th.LongTensor(observed_dims)) assert isinstance(observ_test, np.ndarray), "Test data should be " "np.ndarray" if observ_test.ndim == 1: observ_test = observ_test[None, :] # Design choice: do not create a tiny inference model, but reuse the # trained model using another function for compute the loss. # Add new observation variables to the original class self.Y_test = Variable(th.Tensor(observ_test).type(float_type)) # Freeze the trained parameters for param in self.parameters(): param.requires_grad = False # initialize Xmean_test, Xcov_test by searching for the nearest # neighbour in the data space if observed_dims is None: Y_observed = self.Y else: Y_observed = self.Y.index_select(1, self.observed_dims) YYT = self.Y_test.mm(Y_observed.t()) dist_matrix = (-2 * YYT + self.Y_test.pow(2).sum(1).expand_as(YYT) + Y_observed.t().pow(2).sum(0).expand_as(YYT)) _, argmin = dist_matrix.min(1) argmin = argmin.view(self.Y_test.size(0)).data self.Xmean_test = Param(self.Xmean.data[argmin]) self.Xcov_test = Param(self.Xcov.transform().data[argmin], requires_transform=True) print("GPLVM: Finish preparing the model for projection") self._pre_compute() print("GPLVM: Done with pre-computation. \nPlease optimize the model" " again to obtain the projected latent variables\n") # optimize the latent variables # Q: how to know the optimization converges? 
this is slow and painful # Thus the model is returned to user for optimization # model_project.optimize(method='LBFGS', max_iter=100, verbose=False) # return model_project.Xmean, model_project.Xcov # self.optimize(method='LBFGS', max_iter=100, verbose=True) # use the ``compute_loss_inference`` method during the optimization # return self.Xmean_test, self.Xcov_test def _predict(self, Xnew_mean, Xnew_var=None, diag=True): """Computes the mean and variance of latent function output corresponding to the new (uncertain) input The new input can be deterministic or uncertain (only Gaussian: mean and variance). Returns the predictions over all dimensions (extract the needed dimensions for imputation case after getting the returns) Args: Xnew_mean (np.ndarray): new latent input, it is the deterministic input if ``input_var`` is None, otherwise it is the mean of the latent posterior, size n_* x q Xnew_var (np.ndarray): variance (covariance) of latent posterior, iid case, still n_* x q (each row stores the diagonal of cov) Returns: (Variables): n_* x p, mean of the predicted latent output (Variables): covariance of the predicted latent output, n_* x p for the deterministic case (share the same covariance), or n_* x q x q for the uncertain Gaussian input, iid. 
""" assert isinstance( Xnew_mean, np.ndarray) and Xnew_mean.shape[1] == self.Xmean.size(1), ( "Input_mean should be numpy.ndarary, and its column dims " "should be same as the latent dimensions") Xnew_mean = Variable(th.Tensor(Xnew_mean).type(float_type), volatile=True) num_inducing = self.Z.size(0) beta = 1.0 / self.likelihood.variance.transform() # Psi1, Psi2 eKxz = self.kernel.eKxz(self.Z, self.Xmean, self.Xcov) eKzxKxz = self.kernel.eKzxKxz(self.Z, self.Xmean, self.Xcov) Kzs = self.kernel.K(self.Z, Xnew_mean) Kzz = self.kernel.K(self.Z) + self.jitter.expand(self.Z.size(0)).diag() L = cholesky(Kzz, flag="Lkz") A = trtrs(L, trtrs(L, eKzxKxz).t()) * beta.expand_as(L) B = A + Variable(th.eye(num_inducing).type(float_type)) Lb = cholesky(B, flag="Lb") C = trtrs(L, Kzs) D = trtrs(Lb, C) if Xnew_var is None: # broadcast udpated mean = D.t().mm(trtrs(Lb, trtrs( L, eKxz.t().mm(self.Y)))) * beta.expand(Xnew_mean.size(0), self.Y.size(1)) # return full covariance or only the diagonal if diag: # 1d tensor var = (self.kernel.Kdiag(Xnew_mean) - C.pow(2).sum(0).squeeze() + D.pow(2).sum(0).squeeze()) else: var = self.kernel.K(Xnew_mean) - C.t().mm(C) + D.t().mm(D) else: # uncertain input, assume Gaussian. assert (isinstance(Xnew_var, np.ndarray) and Xnew_var.shape == Xnew_var.shape), ( "Uncertain input, inconsistent variance size, " "should be numpy ndarray") Xnew_var = Param(th.Tensor(Xnew_var).type(float_type)) Xnew_var.requires_transform = True Xnew_var.volatile = True # s for star (new input), z for inducing input eKsz = self.kernel.eKxz(self.Z, Xnew_mean, Xnew_var) # list of n_* expectations w.r.t. 
each test datum eKzsKsz = self.kernel.eKzxKxz(self.Z, Xnew_mean, Xnew_var, sum=False) Im = Variable(th.eye(self.Z.size(0)).type(float_type)) E = trtrs(Lb, trtrs(L, Im)) EtE = E.t().mm(E) F = EtE.mm(eKxz.t().mm(self.Y)) * beta.expand( self.Z.size(0), self.Y.size(1)) mean = eKsz.mm(F) Linv = trtrs(L, Im) Sigma = Linv.t().mm(Linv) - EtE # n x m x m # eKzsKsz = eKzsKsz.cat(0).view(Xnew_mean.size(0), *self.Z.size()) var = [] if diag: ns = Xnew_mean.size(0) p = self.Y.size(1) # vectorization? for i in range(ns): cov = (self.kernel.variance.transform() - Sigma.mm( eKzsKsz[i]).trace()).expand( p, p) + F.t().mm(eKzsKsz[i] - eKsz[i, :].unsqueeze( 0).t().mm(eKsz[i, :].unsqueeze(0))).mm(F) var.append(cov) else: # full covariance case, leave for future print("multi-output case, future feature") var = None pass return mean, var def generate(self, num_samples): """Generate new samples from the generative model Gaussian mixture model is a good choice for the iid latent. .. Note:: Enforce the posterior of latents to approach the specified prior of latents, then samples from the prior, propagates through the model.This is the method used in VAEs. But the samples are not that good, visually (MNIST). .. Note:: Two ways of drawing samples are different: 1. Drawing one sample at a time and repeat multiple times ('random') 2. Drawing multiple samples at a time (smooth) """ # generate new samples from the posterior distributions def reconstruct(self, observed_part, observed_dims): """Reconstruct the missing dimensions in the test data Args: observed_part (np.ndarray): Partially observed test data observed_dims (slice): indices for the observed dimensions Returns: missing means and variances of test data """ # 1. optimize q(X_*) - similar to projection self.project(observed_part, observed_dims) self.optimize(method="LBFGS", max_iter=100) # 2. 
generation / predict mean, var = self._predict(self.Xmean_test, self.Xcov_test) missing_dims = th.LongTensor( np.setdiff1d(range(self.Y.size(1)), self.observed_dims)) return mean[:, missing_dims], var[:, missing_dims] def _forecast(self, time_interval): pass
class GPLVM(GPModel):
    """Variational (Bayesian) Gaussian process latent variable model.

    The model keeps a factorised Gaussian posterior over the latent inputs
    (``Xmean``, ``Xcov``), a set of inducing inputs ``Z`` and optimises the
    evidence lower bound (ELBO).  After training, :meth:`project` switches
    the model to *inference* mode, in which only the latent posterior of
    new (possibly partially observed) test data is optimised.

    Reference:
        Damianou, Andreas. Deep Gaussian processes and variational
        propagation of uncertainty. Diss. University of Sheffield, 2015.
    """

    def __init__(self, observations, dim_latent, num_inducing, Xmean=None,
                 inducing_points=None, kernel=None, kernel_x=None,
                 data_type='iid', collapsed_bound=True, large_p=False):
        """Initialization for the variational GPLVM

        Args:
            observations (np.ndarray): Observed data for unsupervised learning
            dim_latent (int): Dimensionality of the latent variables
            num_inducing (int): Number of inducing points
            Xmean (np.ndarray): Latent variable means (if None, will be
                initialized by PCA of the observations)
            inducing_points (np.ndarray): Inducing points, Z, of shape
                ``num_inducing x dim_latent`` (if None, a random subset of
                the latent means is used)
            kernel (gptorch.Kernel): Kernel on the latent space; only
                ``ekernels.Rbf`` is supported (default: ARD Rbf)
            kernel_x (kernels.Kernel): Temporal kernel with one-dimensional
                input, used only when ``data_type`` is not ``'iid'``
            data_type (string): ``iid`` or ``seq`` (sequential)
            collapsed_bound (bool, optional): True for computing the ELBO
                when inducing variables are collapsed (second bound), False
                for the uncollapsed bound
            large_p (bool, optional): True for the case of small n, large p
                (e.g. HD video), False for the case of large n, small p.
                When True the n x n Gram matrix ``Y Y^T`` is cached and used
                in the data-fit term of the bound.
        """
        assert isinstance(observations, np.ndarray), \
            "Observation matrix should be a np.ndarray."

        if Xmean is None:
            print("GPLVM: Initialize the Xmean using PCA")
            if large_p:
                pca_sklean = PCA(n_components=dim_latent)
                Xmean = pca_sklean.fit_transform(observations)
            else:
                Xmean = util.as_variable(util.PCA(observations, dim_latent))
        else:
            assert isinstance(Xmean, np.ndarray), \
                "Initialization of posterior mean of latent variables should" \
                " be np.ndarray."

        if kernel is None:
            kernel = ekernels.Rbf(dim_latent, ARD=True)
        else:
            assert dim_latent == kernel.input_dim, \
                "Input dimensionality of kernel must be equal to dim_latent."
        assert isinstance(kernel, ekernels.Rbf), \
            "Supports only ekernel.Rbf currently."

        super(GPLVM, self).__init__(observations, Xmean, kernel, Gaussian(),
                                    Zero(), name='GPLVM')
        # The latent inputs are represented by Xmean / Xcov instead.
        del self.X

        # Everything below reads ``Xmean.data`` / ``Xmean.size(...)``, which
        # only exist on a Variable.  A user-supplied or sklearn-produced
        # ndarray is therefore converted here (after the base constructor,
        # so the argument passed to it is unchanged).
        if isinstance(Xmean, np.ndarray):
            Xmean = Variable(th.from_numpy(Xmean).type(float_type))

        # flag to distinguish training and testing mode
        self.inference = False
        self.data_type = data_type
        self.is_collapsed = collapsed_bound
        self.is_large_p = large_p

        # Setup for test time inference
        # test data will be assigned in the projection method
        self.Y_test = None
        # latent variable mean and covariance for the test data
        self.Xmean_test = None
        self.Xcov_test = None
        # observed dimensions of the test data
        self.observed_dims = None

        self.saved_terms = {}
        if self.is_large_p:
            # saved for faster computation in the lower bound, n x n
            self.saved_terms['YYT'] = self.Y.mm(self.Y.t())

        if self.data_type == 'iid':
            # posterior mean of X: n x q
            self.Xmean = Param(Xmean.data)
            # posterior (diagonal) covariance of X: n x q
            self.Xcov = Param(
                0.5 * th.ones(self.Xmean.size()).type(float_type) +
                0.001 * th.randn(self.Xmean.size()).type(float_type),
                requires_transform=True)
        else:
            # sequential data
            # temporal kernel for the GP from time t to latent variables X
            if isinstance(kernel_x, kernels.Kernel):
                assert kernel_x.input_dim == 1, \
                    "Currently only supports time input, i.e. kernel with " \
                    "one dimension input"
                self.kernel_x = kernel_x
            else:
                # TODO: kernel_x, better initialization needed!
                self.kernel_x = kernels.Rbf(1, variance=0.5,
                                            length_scales=0.5)
                # NOTE(review): the flattened original does not show whether
                # this freeze also applied to a user-supplied kernel_x; it is
                # applied to the default kernel only here -- confirm.
                self.kernel_x.variance.requires_grad = False

            # Parameterisation options for q(X):
            # 1) vanilla, O(n^2*q) parameters (not scalable)
            # 2) reparameterization (3.30) p58 in Damianou Diss.  <- current
            # 3) recognition model (RNN)
            # TODO: add the ability to handle multiple sequences (regressive)
            xmean_tensor = Xmean.data
            # intermediate variable, recomputed in ``compute_loss``
            self.Xmean = Variable(xmean_tensor)
            # init the cov matrix by using the kernel
            # timestamp is not required for stationary kernels
            Kx = self.kernel_x.K(np.arange(self.Y.size(0))[:, None])
            # optimization parameters are mu_bar as in (3.30)
            self.Xmean_bar = Param(Kx.data.inverse().mm(xmean_tensor))
            # Lambda is constrained to be positive to keep S PSD
            self.Lambda = Param(
                th.rand(Xmean.size()).type(float_type) * 0.25,
                requires_transform=True)
            # dummy initialization, n x q; recomputed in ``compute_loss``
            self.Xcov = Variable(
                th.ones(Xmean.size()).type(float_type) * 0.5)

        if inducing_points is not None:
            assert isinstance(inducing_points, np.ndarray), \
                "Inducing points should be a np.ndarray."
            assert inducing_points.shape[0] == num_inducing and \
                inducing_points.shape[1] == dim_latent, \
                "Dimensionality of inducing points does not match"
            self.Z = Param(th.from_numpy(inducing_points).type(float_type))
        else:
            # inducing points Z, init with subset of posterior mean of X
            z_np = Xmean.data.numpy()[np.random.choice(
                Xmean.size(0), num_inducing, replace=False)]
            self.Z = Param(float_type(z_np))

        # Uncollapsed case, the number of parameters associated with the
        # inducing variables' covariance is O(m^2)
        # TODO: stochastic optimization with the uncollapsed bound
        if not self.is_collapsed:
            # posterior mean of inducing variables U, init with a random
            # subset of the observations
            rows = th.LongTensor(np.random.choice(
                self.Y.size(0), num_inducing, replace=False).tolist())
            self.Umean = Param(
                self.Y.data.index_select(0, rows).type(float_type))
            # posterior variance of inducing variables U: m x m
            # needs parameterization of cov matrix, e.g. Chol decomposition
            self.Ucov = Param(
                0.5 * th.ones(num_inducing, num_inducing).type(float_type),
                requires_transform=True)

        self.jitter = Variable(th.Tensor([1e-6]).type(float_type))

        # computes the total number of parameters to optimize over
        num_parameters = sum(
            param.data.numpy().size for param in self.parameters())
        print('GPLVM: Number of optimization parameters is %d'
              % num_parameters)

    def compute_loss(self):
        """Computation graph for the negative ELBO of the variational GPLVM.

        For the implementation details, please see ``notes/impl_gplvm``.

        In the sequential case ``self.Xmean`` and ``self.Xcov`` are
        recomputed from ``Xmean_bar`` and ``Lambda`` on every call.

        Returns:
            (Variable): negative evidence lower bound (the training loss)
        """
        num_data = self.Y.size(0)
        dim_output = self.Y.size(1)
        dim_latent = self.Z.size(1)
        num_inducing = self.Z.size(0)

        var_kernel = self.kernel.variance.transform()
        var_noise = self.likelihood.variance.transform()

        # computes kernel expectations
        eKxx = num_data * var_kernel
        if self.data_type == "iid":
            eKxz = self.kernel.eKxz(self.Z, self.Xmean, self.Xcov)
            eKzxKxz = self.kernel.eKzxKxz(self.Z, self.Xmean, self.Xcov)
        else:
            # seq data
            # compute S_j's and mu_j's (reparameterization: forward)
            Kx = self.kernel_x.K(np.arange(num_data)[:, None])
            Lkx = cholesky(Kx, flag='Lkx')
            self.Xmean = Kx.mm(self.Xmean_bar)
            Xcov = []
            Le = []
            In = Variable(th.eye(num_data).type(float_type))
            # constrained Lambda is the same for every latent dimension loop
            Lambda = self.Lambda.transform()
            for j in range(dim_latent):
                Ej = Lkx.t().mm(Lambda[:, j].diag()).mm(Lkx) + In
                Lej = cholesky(Ej, flag='Lej')
                Lsj = trtrs(Lej, Lkx.t()).t()
                Sj = Lsj.mm(Lsj.t())
                # only the marginal variances are passed to the kernel
                Xcov.append(Sj.diag().unsqueeze(1))
                Le.append(Lej)
            self.Xcov = th.cat(Xcov, 1)
            eKxz = self.kernel.eKxz(self.Z, self.Xmean, self.Xcov, False)
            eKzxKxz = self.kernel.eKzxKxz(self.Z, self.Xmean, self.Xcov,
                                          False)

        # compute ELBO
        # add jitter to the diagonal of Kzz
        Kzz = self.kernel.K(self.Z) + \
            self.jitter.expand(self.Z.size(0)).diag()
        L = cholesky(Kzz, flag='Lkz')
        A = trtrs(L, trtrs(L, eKzxKxz).t()) / var_noise.expand_as(L)
        B = A + Variable(th.eye(num_inducing).type(float_type))
        LB = cholesky(B, flag='LB')

        log_2pi = Variable(th.Tensor([np.log(2 * np.pi)]).type(float_type))
        elbo = -dim_output * (LB.diag().log().sum() + 0.5 * num_data *
                              (var_noise.log() + log_2pi))
        elbo -= 0.5 * dim_output * (eKxx / var_noise - A.trace())

        if not self.is_large_p:
            # large n, small p: work with p x p matrices
            C = eKxz.t().mm(self.Y)
            D = trtrs(LB, trtrs(L, C))
            elbo -= 0.5 * (
                self.Y.t().mm(self.Y) /
                var_noise.expand(dim_output, dim_output) -
                D.t().mm(D) /
                var_noise.pow(2).expand(dim_output, dim_output)).trace()
        else:
            # small n, large p: use the pre-computed n x n matrix YY'
            D = trtrs(LB, trtrs(L, eKxz.t()))
            W = Variable(th.eye(num_data).type(float_type)) \
                / var_noise.expand(num_data, num_data) \
                - D.t().mm(D) / var_noise.pow(2).expand(num_data, num_data)
            elbo -= 0.5 * (W.mm(self.saved_terms['YYT'])).trace()

        # KL Divergence (KLD) btw the posterior and the prior
        if self.data_type == 'iid':
            const_nq = Variable(
                th.Tensor([num_data * dim_latent]).type(float_type))
            # eqn (3.28) below p57 Damianou's Diss.
            KLD = 0.5 * (self.Xmean.pow(2).sum() +
                         self.Xcov.transform().sum() -
                         self.Xcov.transform().log().sum() - const_nq)
        else:
            # seq data (3.29) p58
            KLD = Variable(
                th.Tensor([-0.5 * num_data * dim_latent]).type(float_type))
            KLD += 0.5 * self.Xmean_bar.mm(self.Xmean_bar.t()).mm(
                Kx.t()).trace()
            for j in range(dim_latent):
                Lej_inv = trtrs(Le[j], In)
                KLD += 0.5 * Lej_inv.t().mm(Lej_inv).trace() + \
                    Le[j].diag().log().sum()

        elbo -= KLD
        return -elbo

    def _inference_targets(self):
        """Stacks training and test observations over the observed dims."""
        if self.observed_dims is None:
            Y_train = self.Y
        else:
            Y_train = self.Y.index_select(1, self.observed_dims)
        return th.cat((Y_train, self.Y_test), 0)

    def _pre_compute(self):
        """Pre-computation for the projection

        Terms that stay fixed at test time (the trained parameters are
        frozen) are computed once and cached in ``self.saved_terms``.
        Only iid latent is implemented.
        """
        if self.is_large_p:
            # Joint (train + test) Gram matrix, (n + n*) x (n + n*).  Stored
            # under its own key so the training-time 'YYT' stays valid.
            Y = self._inference_targets()
            self.saved_terms['YYT_inference'] = Y.mm(Y.t())

        # computes kernel expectations of the training latents
        if self.data_type == "iid":
            eKxz = self.kernel.eKxz(self.Z, self.Xmean, self.Xcov)
            eKzxKxz = self.kernel.eKzxKxz(self.Z, self.Xmean, self.Xcov)
            self.saved_terms['eKxz'] = eKxz
            self.saved_terms['eKzxKxz'] = eKzxKxz
        else:
            print("regressive case, not implemented")

        Kzz = self.kernel.K(self.Z) + \
            self.jitter.expand(self.Z.size(0)).diag()
        L = cholesky(Kzz, flag='L')
        self.saved_terms['L'] = L

    def _compute_loss_inference(self):
        """Computes the loss in the inference mode, e.g. for projection.

        Handles both fully observed and partially observed data.
        Requires :meth:`project` to have been called first (it provides
        ``Y_test``, ``Xmean_test``, ``Xcov_test`` and the cached terms).
        Only iid latent is implemented.

        Returns:
            (Variable): negative ELBO of training and test data jointly

        Raises:
            NotImplementedError: if ``data_type`` is not ``'iid'``
        """
        if self.data_type != "iid":
            raise NotImplementedError("regressive case not implemented")

        dim_latent = self.Z.size(1)
        num_inducing = self.Z.size(0)
        # total number of data for inference
        num_data = self.Y.size(0) + self.Y_test.size(0)
        # dimension of output in the test time
        dim_output = self.Y_test.size(1)

        var_kernel = self.kernel.variance.transform()
        var_noise = self.likelihood.variance.transform()

        # computes kernel expectations; the training part is cached
        eKxx = num_data * var_kernel
        eKxz_test = self.kernel.eKxz(self.Z, self.Xmean_test, self.Xcov_test)
        eKzxKxz_test = self.kernel.eKzxKxz(self.Z, self.Xmean_test,
                                           self.Xcov_test)
        eKxz = th.cat((self.saved_terms['eKxz'], eKxz_test), 0)
        eKzxKxz = self.saved_terms['eKzxKxz'] + eKzxKxz_test

        # compute ELBO
        L = self.saved_terms['L']
        A = trtrs(L, trtrs(L, eKzxKxz).t()) / var_noise.expand_as(L)
        B = A + Variable(th.eye(num_inducing).type(float_type))
        LB = cholesky(B, flag='LB')

        log_2pi = Variable(th.Tensor([np.log(2 * np.pi)]).type(float_type))
        elbo = -dim_output * (LB.diag().log().sum() + 0.5 * num_data *
                              (var_noise.log() + log_2pi))
        elbo -= 0.5 * dim_output * (eKxx / var_noise - A.diag().sum())

        if not self.is_large_p:
            # whole data (observed dims only) for inference
            Y = self._inference_targets()
            C = eKxz.t().mm(Y)
            D = trtrs(LB, trtrs(L, C))
            elbo -= 0.5 * (
                Y.t().mm(Y) / var_noise.expand(dim_output, dim_output) -
                D.t().mm(D) /
                var_noise.pow(2).expand(dim_output, dim_output)).trace()
        else:
            # small n: use the joint Gram matrix cached by ``_pre_compute``
            D = trtrs(LB, trtrs(L, eKxz.t()))
            W = Variable(th.eye(num_data).type(float_type)) \
                / var_noise.expand(num_data, num_data) \
                - D.t().mm(D) / var_noise.pow(2).expand(num_data, num_data)
            elbo -= 0.5 * (W.mm(self.saved_terms['YYT_inference'])).trace()

        # KL Divergence (KLD) btw the posterior and the prior, eqn (3.28)
        # below p57 Damianou's Diss.  The constant counts train + test
        # points, so the test latents contribute their terms as well;
        # without them the optimised test posterior has no prior term.
        const_nq = Variable(
            th.Tensor([num_data * dim_latent]).type(float_type))
        Xcov_train = self.Xcov.transform()
        Xcov_test = self.Xcov_test.transform()
        KLD = 0.5 * (self.Xmean.pow(2).sum() + Xcov_train.sum() -
                     Xcov_train.log().sum() +
                     self.Xmean_test.pow(2).sum() + Xcov_test.sum() -
                     Xcov_test.log().sum() - const_nq)

        elbo -= KLD
        return -elbo

    def optimize(self, method='LBFGS', max_iter=2000, verbose=True, lr=0.01):
        """Optimizes the model by minimizing the loss w.r.t. the model
        parameters that have ``requires_grad`` set.

        In training mode the loss is :meth:`compute_loss`, in inference mode
        (after :meth:`project`) it is :meth:`_compute_loss_inference`.

        Args:
            method (str, optional): Name of a PyTorch optimizer (``SGD``,
                ``Adam``, ``LBFGS``, ``Adadelta``, ``Adagrad``, ``Adamax``,
                ``ASGD``, ``RMSprop``, ``Rprop``) or of a
                ``scipy.optimize.minimize`` method; default is ``LBFGS``.
            max_iter (int): Max iterations, default 2000.
            verbose (bool, optional): Prints the loss at every iteration if
                True, otherwise at every 10th iteration.
            lr (float, optional): Learning rate; only used by ``Adam`` and
                ``RMSprop``, the other optimizers use fixed settings.

        Todo:
            Add stochastic optimization, such as mini-batch.

        Returns:
            (np.array, value):

                losses: losses over optimization steps, (max_iter, )

                time: time taken approximately

            For the scipy methods the result of ``_optimize_scipy`` is
            returned instead.

        Raises:
            ValueError: if ``method`` is not a known optimizer
        """
        parameters = [p for p in self.parameters() if p.requires_grad]

        if method == 'SGD':
            self.optimizer = th.optim.SGD(parameters, lr=0.05, momentum=0.9)
        elif method == 'Adam':
            self.optimizer = th.optim.Adam(
                parameters, lr=lr, betas=(0.9, 0.999), eps=1e-06,
                weight_decay=0.00001)
        elif method == 'LBFGS':
            self.optimizer = th.optim.LBFGS(
                parameters, lr=1, max_iter=5, max_eval=None,
                tolerance_grad=1e-05, tolerance_change=1e-09,
                history_size=50, line_search_fn=None)
        elif method == 'Adadelta':
            self.optimizer = th.optim.Adadelta(
                parameters, lr=1.0, rho=0.9, eps=1e-06, weight_decay=0.00001)
        elif method == 'Adagrad':
            self.optimizer = th.optim.Adagrad(
                parameters, lr=0.01, lr_decay=0, weight_decay=0)
        elif method == 'Adamax':
            self.optimizer = th.optim.Adamax(
                parameters, lr=0.002, betas=(0.9, 0.999), eps=1e-08,
                weight_decay=0)
        elif method == 'ASGD':
            self.optimizer = th.optim.ASGD(
                parameters, lr=0.01, lambd=0.0001, alpha=0.75,
                t0=1000000.0, weight_decay=0)
        elif method == 'RMSprop':
            self.optimizer = th.optim.RMSprop(
                parameters, lr=lr, alpha=0.99, eps=1e-08, weight_decay=0.00,
                momentum=0.01, centered=False)
        elif method == 'Rprop':
            self.optimizer = th.optim.Rprop(
                parameters, lr=0.01, etas=(0.5, 1.2),
                step_sizes=(1e-06, 50))
        # scipy.optimize.minimize
        # suggest to use L-BFGS-B, BFGS
        elif method in ['CG', 'BFGS', 'Newton-CG', 'Nelder-Mead', 'Powell',
                        'L-BFGS-B', 'TNC', 'COBYLA', 'SLSQP', 'dogleg',
                        'trust-ncg']:
            print('Scipy.optimize.minimize...')
            return self._optimize_scipy(method=method, maxiter=max_iter,
                                        disp=verbose)
        else:
            raise ValueError(
                'Optimizer %s is not found. Please choose one of the '
                'following optimizers supported in PyTorch: '
                'Adadelta, Adagrad, Adam, Adamax, ASGD, LBFGS, '
                'RMSprop, Rprop, SGD. Or the optimizers '
                'supported by scipy.optimize.minimize: BFGS, L-BFGS-B, '
                'CG, Newton-CG, Nelder-Mead, Powell, TNC, COBYLA, '
                'SLSQP, dogleg, trust-ncg, etc.' % method)

        losses = np.zeros(max_iter)
        tic = time()

        print('{}: Start optimization via {} in the {} mode'.format(
            self.__class__.__name__, method,
            'inference' if self.inference else 'training'))

        if not self.inference:
            compute_loss = self.compute_loss
        else:
            compute_loss = self._compute_loss_inference

        def closure():
            # one forward / backward pass; LBFGS may call this repeatedly
            self.optimizer.zero_grad()
            loss = compute_loss()
            loss.backward()
            return loss

        use_closure = method == 'LBFGS'
        print_every = 1 if verbose else 10
        for step in range(max_iter):
            if use_closure:
                loss = self.optimizer.step(closure)
            else:
                loss = closure()
                self.optimizer.step()
            losses[step] = loss.data.numpy()
            if step % print_every == 0:
                print('Iter: %d\tLoss: %s' % (step, loss.data.numpy()))

        t = time() - tic
        print('Optimization time taken: %s s' % t)
        print('Optimization method: %s' % str(self.optimizer))
        # There is no early-stopping criterion: the loop always runs for
        # exactly ``max_iter`` iterations.
        print('Optimization terminated by reaching the maximum iterations\n')

        return losses, t

    def project(self, observ_test, observed_dims=None):
        """Prepares the model to infer the latent input of new observed data

        The test data can be partially observed.  The method switches the
        model to inference mode, freezes all current parameters, and creates
        the new parameters ``Xmean_test`` / ``Xcov_test``, initialised from
        the latent posterior of the nearest training point (in the observed
        data space).  It does *not* run the optimisation: call
        :meth:`optimize` afterwards and then read ``self.Xmean_test`` and
        ``self.Xcov_test``.

        # TODO: Currently only the Gaussian approximations, and iid case.
        With recognition model, inference of latent would be faster.

        Args:
            observ_test (numpy.ndarray): Test observed data, ``n_* x p_obs``
                (a 1D array is treated as a single test point)
            observed_dims (list, tuple or np.ndarray, 1D): Observed
                dimensions of the partially observed test data.  Must be
                provided for partially observed test case.

        Returns:
            None
        """
        # Validate before touching any model state so that a rejected call
        # leaves the model in its previous mode.
        assert isinstance(observ_test, np.ndarray), \
            "Test data should be np.ndarray"
        if observ_test.ndim == 1:
            observ_test = observ_test[None, :]

        if observed_dims is None:
            # Fully observable data
            assert observ_test.shape[1] == self.Y.size(1), \
                "Test data dimension must equal to that of the training " \
                "data for the fully observed case, otherwise please " \
                "specify the observed dimensions using ``observed_dims``"
            dims = None
        else:
            assert isinstance(observed_dims, (list, tuple, np.ndarray)), \
                "Type of the list of observed dimensions should be list " \
                "or 1d np.array"
            dims = np.asarray(observed_dims).ravel().tolist()
            assert observ_test.shape[1] == len(dims), \
                "Number of columns of the test data must equal the number " \
                "of observed dimensions"

        # Set the mode to be the inference mode
        self.inference = True
        # Reset explicitly so a previous partially observed projection does
        # not leak into a fully observed one.
        if dims is None:
            self.observed_dims = None
        else:
            self.observed_dims = Variable(th.LongTensor(dims))

        # Design choice: do not create a tiny inference model, but reuse the
        # trained model using another function for compute the loss.
        # Add new observation variables to the original class
        self.Y_test = Variable(th.Tensor(observ_test).type(float_type))

        # Freeze the trained parameters
        for param in self.parameters():
            param.requires_grad = False

        # initialize Xmean_test, Xcov_test by searching for the nearest
        # neighbour in the data space
        if self.observed_dims is None:
            Y_observed = self.Y
        else:
            Y_observed = self.Y.index_select(1, self.observed_dims)
        YYT = self.Y_test.mm(Y_observed.t())
        dist_matrix = -2 * YYT + \
            self.Y_test.pow(2).sum(1).expand_as(YYT) + \
            Y_observed.t().pow(2).sum(0).expand_as(YYT)
        _, argmin = dist_matrix.min(1)
        argmin = argmin.view(self.Y_test.size(0)).data

        # created after the freeze, hence the only trainable parameters
        self.Xmean_test = Param(self.Xmean.data[argmin])
        self.Xcov_test = Param(self.Xcov.transform().data[argmin],
                               requires_transform=True)

        print("GPLVM: Finish preparing the model for projection")
        self._pre_compute()
        print("GPLVM: Done with pre-computation. \nPlease optimize the model"
              " again to obtain the projected latent variables\n")
        # The optimisation is left to the caller because convergence is
        # hard to detect automatically and the optimisation is slow.

    def _predict(self, Xnew_mean, Xnew_var=None, diag=True):
        """Computes the mean and variance of latent function output
        corresponding to the new (uncertain) input

        The new input can be deterministic or uncertain (only Gaussian: mean
        and variance). Returns the predictions over all dimensions (extract
        the needed dimensions for imputation case after getting the
        returns)

        Args:
            Xnew_mean (np.ndarray): new latent input, it is the deterministic
                input if ``Xnew_var`` is None, otherwise it is the mean of
                the latent posterior, size n_* x q
            Xnew_var (np.ndarray): variance (covariance) of latent posterior,
                iid case, still n_* x q (each row stores the diagonal of cov)
            diag (bool): for deterministic input, return only the predictive
                variances (True) or the full n_* x n_* covariance (False).
                For uncertain input only ``True`` is implemented.

        Returns:
            (Variable): n_* x p, mean of the predicted latent output

            (Variable or list): for deterministic input the predictive
            variance shared by all output dimensions (n_* values if
            ``diag``, else n_* x n_*); for uncertain input with ``diag`` a
            list of n_* matrices of size p x p (one output covariance per
            test point), or None if ``diag`` is False.
        """
        assert isinstance(Xnew_mean, np.ndarray) and \
            Xnew_mean.shape[1] == self.Xmean.size(1), \
            "Input_mean should be numpy.ndarary, and its column dims " \
            "should be same as the latent dimensions"
        if Xnew_var is not None:
            # checked here, while Xnew_mean is still an ndarray
            assert isinstance(Xnew_var, np.ndarray) and \
                Xnew_var.shape == Xnew_mean.shape, \
                "Uncertain input, inconsistent variance size, " \
                "should be numpy ndarray"

        Xnew_mean = Variable(th.Tensor(Xnew_mean).type(float_type),
                             volatile=True)

        num_inducing = self.Z.size(0)
        beta = 1. / self.likelihood.variance.transform()
        # Psi1, Psi2
        eKxz = self.kernel.eKxz(self.Z, self.Xmean, self.Xcov)
        eKzxKxz = self.kernel.eKzxKxz(self.Z, self.Xmean, self.Xcov)
        Kzs = self.kernel.K(self.Z, Xnew_mean)
        Kzz = self.kernel.K(self.Z) + \
            self.jitter.expand(self.Z.size(0)).diag()
        L = cholesky(Kzz, flag='Lkz')
        A = trtrs(L, trtrs(L, eKzxKxz).t()) * beta.expand_as(L)
        B = A + Variable(th.eye(num_inducing).type(float_type))
        Lb = cholesky(B, flag='Lb')
        C = trtrs(L, Kzs)
        D = trtrs(Lb, C)

        if Xnew_var is None:
            # deterministic input
            mean = D.t().mm(trtrs(Lb, trtrs(L, eKxz.t().mm(self.Y)))) \
                * beta.expand(Xnew_mean.size(0), self.Y.size(1))
            # return full covariance or only the diagonal
            if diag:
                # 1d tensor
                var = self.kernel.Kdiag(Xnew_mean) \
                    - C.pow(2).sum(0).squeeze() \
                    + D.pow(2).sum(0).squeeze()
            else:
                var = self.kernel.K(Xnew_mean) - C.t().mm(C) + D.t().mm(D)
        else:
            # uncertain input, assume Gaussian.
            Xnew_var = Param(th.Tensor(Xnew_var).type(float_type))
            Xnew_var.requires_transform = True
            Xnew_var.volatile = True
            # s for star (new input), z for inducing input
            eKsz = self.kernel.eKxz(self.Z, Xnew_mean, Xnew_var)
            # list of n_* expectations w.r.t. each test datum
            eKzsKsz = self.kernel.eKzxKxz(self.Z, Xnew_mean, Xnew_var,
                                          sum=False)
            Im = Variable(th.eye(self.Z.size(0)).type(float_type))
            E = trtrs(Lb, trtrs(L, Im))
            EtE = E.t().mm(E)
            F = EtE.mm(eKxz.t().mm(self.Y)) \
                * beta.expand(self.Z.size(0), self.Y.size(1))
            mean = eKsz.mm(F)
            Linv = trtrs(L, Im)
            Sigma = Linv.t().mm(Linv) - EtE
            if diag:
                var = []
                ns = Xnew_mean.size(0)
                p = self.Y.size(1)
                Ip = Variable(th.eye(p).type(float_type))
                var_kernel = self.kernel.variance.transform()
                for i in range(ns):
                    psi1_i = eKsz[i, :].unsqueeze(0)
                    # The scalar term is shared by all outputs and belongs
                    # on the diagonal only, hence the identity mask.
                    shared = (var_kernel -
                              Sigma.mm(eKzsKsz[i]).trace()).expand(p, p) * Ip
                    cov = shared + F.t().mm(
                        eKzsKsz[i] - psi1_i.t().mm(psi1_i)).mm(F)
                    var.append(cov)
            else:
                # full covariance case, leave for future
                print("multi-output case, future feature")
                var = None

        return mean, var

    def generate(self, num_samples):
        """Generate new samples from the generative model

        Not implemented yet: the method currently does nothing and returns
        None.  Gaussian mixture model is a good choice for the iid latent.

        .. Note:: Enforce the posterior of latents to approach the specified
          prior of latents, then samples from the prior, propagates through
          the model. This is the method used in VAEs. But the samples are
          not that good, visually (MNIST).

        .. Note:: Two ways of drawing samples are different:
          1. Drawing one sample at a time and repeat multiple times
          ('random')
          2. Drawing multiple samples at a time (smooth)

        Args:
            num_samples (int): number of samples to draw (unused for now)
        """
        # TODO: generate new samples from the posterior distributions

    def reconstruct(self, observed_part, observed_dims):
        """Reconstruct the missing dimensions in the test data

        Args:
            observed_part (np.ndarray): Partially observed test data
            observed_dims (list, tuple or np.ndarray, 1D): indices for the
                observed dimensions

        Returns:
            (Variable): n_* x p_missing, predictive means of the missing
            dimensions

            (list): n_* predictive covariances of the missing dimensions,
            each p_missing x p_missing
        """
        # 1. optimize q(X_*) - similar to projection
        self.project(observed_part, observed_dims)
        self.optimize(method='LBFGS', max_iter=100)

        # 2. generation / predict; ``_predict`` takes numpy arrays
        mean, var = self._predict(
            self.Xmean_test.data.numpy(),
            self.Xcov_test.transform().data.numpy())

        missing = np.setdiff1d(np.arange(self.Y.size(1)),
                               self.observed_dims.data.numpy())
        missing_dims = Variable(th.LongTensor(missing.tolist()))
        mean_missing = mean.index_select(1, missing_dims)
        var_missing = [
            cov.index_select(0, missing_dims).index_select(1, missing_dims)
            for cov in var
        ]
        return mean_missing, var_missing

    def _forecast(self, time_interval):
        """Placeholder for forecasting with sequential data (no-op)."""
        pass