def gauss_kl(q_mu, q_sqrt, K, num_latent):
    """
    Compute the KL divergence from

        q(x) = N(q_mu, q_sqrt^2)
    to
        p(x) = N(0, K)

    We assume num_latent independent distributions, given by the columns of
    q_mu and the last dimension of q_sqrt.

    q_mu is a matrix, each column contains a mean.

    q_sqrt is a 3D tensor, each matrix within is a lower triangular
        square-root matrix of the covariance of q.

    K is a positive definite matrix: the covariance of p.

    num_latent is an integer: the number of independent distributions (equal
        to the columns of q_mu and the last dim of q_sqrt).
    """
    L = tf.cholesky(K)
    alpha = tf.matrix_triangular_solve(L, q_mu, lower=True)
    KL = 0.5 * tf.reduce_sum(tf.square(alpha))  # Mahalanobis term.
    KL += num_latent * 0.5 * tf.reduce_sum(
        tf.log(tf.square(tf.diag_part(L))))  # Prior log-det term.
    KL += -0.5 * tf.cast(tf.shape(q_sqrt)[0] * num_latent, tf.float64)
    for d in range(num_latent):
        Lq = tf.batch_matrix_band_part(q_sqrt[:, :, d], -1, 0)
        # Log determinant of q covariance:
        KL += -0.5 * tf.reduce_sum(tf.log(tf.square(tf.diag_part(Lq))))
        LiLq = tf.matrix_triangular_solve(L, Lq, lower=True)
        KL += 0.5 * tf.reduce_sum(tf.square(LiLq))  # Trace term
    return KL
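# --- Sanity-check sketch (not part of the original source) --------------------
# The Mahalanobis term above relies on the identity that, with L = chol(K) and
# alpha = L^{-1} q_mu, sum(alpha^2) equals q_mu^T K^{-1} q_mu. A minimal NumPy
# sketch with made-up toy values:
import numpy as np
from scipy.linalg import solve_triangular

rng = np.random.RandomState(0)
A = rng.randn(4, 4)
K = A @ A.T + 4 * np.eye(4)   # toy positive definite covariance of p
q_mu = rng.randn(4, 1)        # toy variational mean

L = np.linalg.cholesky(K)
alpha = solve_triangular(L, q_mu, lower=True)
mahalanobis = 0.5 * np.sum(alpha ** 2)

# Same quantity via an explicit inverse (for checking only).
assert np.allclose(mahalanobis, 0.5 * (q_mu.T @ np.linalg.inv(K) @ q_mu)[0, 0])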
def build_predict(self, Xnew, full_cov=False):
    """
    Xnew is a data matrix, point at which we want to predict

    This method computes

        p(F* | Y)

    where F* are points on the GP at Xnew, Y are noisy observations at X.
    """
    Kx = self.kern.K(self.X, Xnew)
    K = self.kern.K(self.X) + eye(self.num_data) * self.likelihood.variance
    L = tf.cholesky(K)
    A = tf.matrix_triangular_solve(L, Kx, lower=True)
    V = tf.matrix_triangular_solve(L, self.Y - self.mean_function(self.X))
    fmean = tf.matmul(tf.transpose(A), V) + self.mean_function(Xnew)
    if full_cov:
        fvar = self.kern.K(Xnew) - tf.matmul(tf.transpose(A), A)
        shape = tf.pack([1, 1, tf.shape(self.Y)[1]])
        fvar = tf.tile(tf.expand_dims(fvar, 2), shape)
    else:
        fvar = self.kern.Kdiag(Xnew) - tf.reduce_sum(tf.square(A), 0)
        fvar = tf.tile(tf.reshape(fvar, (-1, 1)), [1, self.Y.shape[1]])
    return fmean, fvar
def compute_upper_bound(self):
    num_data = tf.cast(tf.shape(self.Y)[0], settings.float_type)

    Kdiag = self.kern.Kdiag(self.X)
    Kuu = self.feature.Kuu(self.kern, jitter=settings.numerics.jitter_level)
    Kuf = self.feature.Kuf(self.kern, self.X)

    L = tf.cholesky(Kuu)
    LB = tf.cholesky(Kuu + self.likelihood.variance ** -1.0 * tf.matmul(Kuf, Kuf, transpose_b=True))

    LinvKuf = tf.matrix_triangular_solve(L, Kuf, lower=True)
    # Using the Trace bound, from Titsias' presentation
    c = tf.reduce_sum(Kdiag) - tf.reduce_sum(LinvKuf ** 2.0)
    # Kff = self.kern.K(self.X)
    # Qff = tf.matmul(Kuf, LinvKuf, transpose_a=True)

    # Alternative bound on max eigenval:
    # c = tf.reduce_max(tf.reduce_sum(tf.abs(Kff - Qff), 0))
    corrected_noise = self.likelihood.variance + c

    const = -0.5 * num_data * tf.log(2 * np.pi * self.likelihood.variance)
    logdet = tf.reduce_sum(tf.log(tf.diag_part(L))) - tf.reduce_sum(tf.log(tf.diag_part(LB)))

    LC = tf.cholesky(Kuu + corrected_noise ** -1.0 * tf.matmul(Kuf, Kuf, transpose_b=True))
    v = tf.matrix_triangular_solve(LC, corrected_noise ** -1.0 * tf.matmul(Kuf, self.Y), lower=True)
    quad = -0.5 * corrected_noise ** -1.0 * tf.reduce_sum(self.Y ** 2.0) + 0.5 * tf.reduce_sum(v ** 2.0)

    return const + logdet + quad
def build_predict(self, Xnew, full_cov=False):
    """
    Compute the mean and variance of the latent function at some new points Xnew.
    """
    _, _, Luu, L, _, _, gamma = self.build_common_terms()
    Kus = self.kern.K(self.Z, Xnew)  # size M x Xnew

    w = tf.matrix_triangular_solve(Luu, Kus, lower=True)  # size M x Xnew

    tmp = tf.matrix_triangular_solve(tf.transpose(L), gamma, lower=False)
    mean = tf.matmul(tf.transpose(w), tmp) + self.mean_function(Xnew)
    intermediateA = tf.matrix_triangular_solve(L, w, lower=True)

    if full_cov:
        var = (self.kern.K(Xnew)
               - tf.matmul(tf.transpose(w), w)
               + tf.matmul(tf.transpose(intermediateA), intermediateA))
        var = tf.tile(tf.expand_dims(var, 2), tf.pack([1, 1, tf.shape(self.Y)[1]]))
    else:
        var = (self.kern.Kdiag(Xnew)
               - tf.reduce_sum(tf.square(w), 0)
               + tf.reduce_sum(tf.square(intermediateA), 0))  # size Xnew,
        var = tf.tile(tf.expand_dims(var, 1), tf.pack([1, tf.shape(self.Y)[1]]))

    return mean, var
def test_whiten(self):
    """
    Make sure that predicting using the whitened representation is the
    same as the non-whitened one.
    """
    with self.test_context() as sess:
        rng = np.random.RandomState(0)
        Xs, X, F, k, num_data, feed_dict = self.prepare()
        k.compile(session=sess)

        F_sqrt = tf.placeholder(settings.float_type, [num_data, 1])
        F_sqrt_data = rng.rand(num_data, 1)
        feed_dict[F_sqrt] = F_sqrt_data

        K = k.K(X)
        L = tf.cholesky(K)
        V = tf.matrix_triangular_solve(L, F, lower=True)
        V_sqrt = tf.matrix_triangular_solve(L, tf.diag(F_sqrt[:, 0]), lower=True)[None, :, :]

        Fstar_mean, Fstar_var = gpflow.conditionals.conditional(
            Xs, X, k, F, q_sqrt=F_sqrt)
        Fstar_w_mean, Fstar_w_var = gpflow.conditionals.conditional(
            Xs, X, k, V, q_sqrt=V_sqrt, white=True)

        mean_difference = sess.run(Fstar_w_mean - Fstar_mean, feed_dict=feed_dict)
        var_difference = sess.run(Fstar_w_var - Fstar_var, feed_dict=feed_dict)

        assert_allclose(mean_difference, 0, atol=4)
        assert_allclose(var_difference, 0, atol=4)
def build_predict(self, Xnew, full_cov=False):
    err = self.Y
    Kuf = self.RBF(self.Z, self.X)
    Kuu = self.RBF(self.Z, self.Z) + eye(self.num_inducing) * 1e-6
    Kus = self.RBF(self.Z, Xnew)
    sigma = tf.sqrt(self.likelihood_variance)
    L = tf.cholesky(Kuu)
    A = tf.matrix_triangular_solve(L, Kuf, lower=True) / sigma
    B = tf.matmul(A, tf.transpose(A)) + eye(self.num_inducing)
    LB = tf.cholesky(B)
    Aerr = tf.matmul(A, err)
    c = tf.matrix_triangular_solve(LB, Aerr, lower=True) / sigma
    tmp1 = tf.matrix_triangular_solve(L, Kus, lower=True)
    tmp2 = tf.matrix_triangular_solve(LB, tmp1, lower=True)
    mean = tf.matmul(tf.transpose(tmp2), c)
    if full_cov:
        var = self.RBF(Xnew, Xnew) + tf.matmul(tf.transpose(tmp2), tmp2) \
            - tf.matmul(tf.transpose(tmp1), tmp1)
        shape = tf.pack([1, 1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 2), shape)
    else:
        var = self.RBF(Xnew, Xnew) + tf.reduce_sum(tf.square(tmp2), 0) \
            - tf.reduce_sum(tf.square(tmp1), 0)
        shape = tf.pack([1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 1), shape)
    return mean, var
def build_likelihood(self):
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood. For a derivation of the terms in here, see the associated
    SGPR notebook.
    """
    num_inducing = tf.shape(self.Z)[0]
    num_data = tf.shape(self.Y)[0]
    output_dim = tf.shape(self.Y)[1]

    err = self.Y - self.mean_function(self.X)
    Kdiag = self.kern.Kdiag(self.X)
    Kuf = self.kern.K(self.Z, self.X)
    Kuu = self.kern.K(self.Z) + eye(num_inducing) * 1e-6
    L = tf.cholesky(Kuu)

    # Compute intermediate matrices
    A = tf.matrix_triangular_solve(L, Kuf, lower=True) * tf.sqrt(1. / self.likelihood.variance)
    AAT = tf.matmul(A, tf.transpose(A))
    B = AAT + eye(num_inducing)
    LB = tf.cholesky(B)
    c = tf.matrix_triangular_solve(LB, tf.matmul(A, err), lower=True) * tf.sqrt(1. / self.likelihood.variance)

    # compute log marginal bound
    bound = -0.5 * tf.cast(num_data * output_dim, tf.float64) * np.log(2 * np.pi)
    bound += -tf.cast(output_dim, tf.float64) * tf.reduce_sum(tf.log(tf.user_ops.get_diag(LB)))
    bound += -0.5 * tf.cast(num_data * output_dim, tf.float64) * tf.log(self.likelihood.variance)
    bound += -0.5 * tf.reduce_sum(tf.square(err)) / self.likelihood.variance
    bound += 0.5 * tf.reduce_sum(tf.square(c))
    bound += -0.5 * (tf.reduce_sum(Kdiag) / self.likelihood.variance - tf.reduce_sum(tf.user_ops.get_diag(AAT)))
    return bound
def gauss_kl_diag(q_mu, q_sqrt, K, num_latent):
    """
    Compute the KL divergence from

        q(x) = N(q_mu, q_sqrt^2)
    to
        p(x) = N(0, K)

    We assume num_latent independent distributions, given by the columns of
    q_mu and q_sqrt.

    q_mu is a matrix, each column contains a mean

    q_sqrt is a matrix, each column represents the diagonal of a square-root
        matrix of the covariance of q.

    K is a positive definite matrix: the covariance of p.

    num_latent is an integer: the number of independent distributions (equal
        to the columns of q_mu and q_sqrt).
    """
    L = tf.cholesky(K)
    alpha = tf.matrix_triangular_solve(L, q_mu, lower=True)
    KL = 0.5 * tf.reduce_sum(tf.square(alpha))  # Mahalanobis term.
    KL += num_latent * 0.5 * tf.reduce_sum(
        tf.log(tf.square(tf.diag_part(L))))  # Prior log-det term.
    KL += -0.5 * tf.cast(tf.shape(q_sqrt)[0] * num_latent, tf.float64)
    KL += -0.5 * tf.reduce_sum(tf.log(tf.square(q_sqrt)))  # Log-det of q-cov
    L_inv = tf.matrix_triangular_solve(L, eye(tf.shape(L)[0]), lower=True)
    K_inv = tf.matrix_triangular_solve(tf.transpose(L), L_inv, lower=False)
    KL += 0.5 * tf.reduce_sum(tf.expand_dims(tf.diag_part(K_inv), 1)
                              * tf.square(q_sqrt))  # Trace term.
    return KL
def gauss_kl(q_mu, q_sqrt, K):
    """
    Compute the KL divergence from

        q(x) = N(q_mu, q_sqrt^2)
    to
        p(x) = N(0, K)

    We assume multiple independent distributions, given by the columns of
    q_mu and the last dimension of q_sqrt.

    q_mu is a matrix, each column contains a mean.

    q_sqrt is a 3D tensor, each matrix within is a lower triangular
        square-root matrix of the covariance of q.

    K is a positive definite matrix: the covariance of p.
    """
    L = tf.cholesky(K)
    alpha = tf.matrix_triangular_solve(L, q_mu, lower=True)
    KL = 0.5 * tf.reduce_sum(tf.square(alpha))  # Mahalanobis term.
    num_latent = tf.cast(tf.shape(q_sqrt)[2], float_type)
    KL += num_latent * 0.5 * tf.reduce_sum(tf.log(tf.square(tf.diag_part(L))))  # Prior log-det term.
    KL += -0.5 * tf.cast(tf.reduce_prod(tf.shape(q_sqrt)[1:]), float_type)  # constant term
    Lq = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1, 0)  # force lower triangle
    KL += -0.5 * tf.reduce_sum(tf.log(tf.square(tf.matrix_diag_part(Lq))))  # logdet
    L_tiled = tf.tile(tf.expand_dims(L, 0), tf.pack([tf.shape(Lq)[0], 1, 1]))
    LiLq = tf.matrix_triangular_solve(L_tiled, Lq, lower=True)
    KL += 0.5 * tf.reduce_sum(tf.square(LiLq))  # Trace term
    return KL
def _build_predict(self, Xnew, full_cov=False):
    """
    Compute the mean and variance of the latent function at some new points
    Xnew. For a derivation of the terms in here, see the associated SGPR
    notebook.
    """
    num_inducing = len(self.feature)
    err = self.Y - self.mean_function(self.X)
    Kuf = self.feature.Kuf(self.kern, self.X)
    Kuu = self.feature.Kuu(self.kern, jitter=settings.numerics.jitter_level)
    Kus = self.feature.Kuf(self.kern, Xnew)
    sigma = tf.sqrt(self.likelihood.variance)
    L = tf.cholesky(Kuu)
    A = tf.matrix_triangular_solve(L, Kuf, lower=True) / sigma
    B = tf.matmul(A, A, transpose_b=True) + tf.eye(num_inducing, dtype=settings.float_type)
    LB = tf.cholesky(B)
    Aerr = tf.matmul(A, err)
    c = tf.matrix_triangular_solve(LB, Aerr, lower=True) / sigma
    tmp1 = tf.matrix_triangular_solve(L, Kus, lower=True)
    tmp2 = tf.matrix_triangular_solve(LB, tmp1, lower=True)
    mean = tf.matmul(tmp2, c, transpose_a=True)
    if full_cov:
        var = self.kern.K(Xnew) + tf.matmul(tmp2, tmp2, transpose_a=True) \
            - tf.matmul(tmp1, tmp1, transpose_a=True)
        shape = tf.stack([1, 1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 2), shape)
    else:
        var = self.kern.Kdiag(Xnew) + tf.reduce_sum(tf.square(tmp2), 0) \
            - tf.reduce_sum(tf.square(tmp1), 0)
        shape = tf.stack([1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 1), shape)
    return mean + self.mean_function(Xnew), var
def testNotInvertible(self):
    # The input should be invertible.
    with self.test_session():
        with self.assertRaisesOpError("Input matrix is not invertible."):
            # The matrix has a zero on the diagonal.
            matrix = tf.constant([[1., 0., -1.], [-1., 0., 1.], [0., -1., 1.]])
            tf.matrix_triangular_solve(matrix, matrix).eval()
def testNonSquareMatrix(self):
    # When the solve of a non-square matrix is attempted we should return
    # an error
    with self.test_session():
        with self.assertRaises(ValueError):
            matrix = tf.constant([[1., 2., 3.], [3., 4., 5.]])
            tf.matrix_triangular_solve(matrix, matrix)
def build_predict(self, Xnew, full_cov=False):
    """
    Compute the mean and variance of the latent function at some new points
    Xnew. For a derivation of the terms in here, see the associated SGPR
    notebook.
    """
    num_inducing = tf.shape(self.Z)[0]
    err = self.Y - self.mean_function(self.X)
    Kuf = self.kern.K(self.Z, self.X)
    Kuu = self.kern.K(self.Z) + eye(num_inducing) * 1e-6
    Kus = self.kern.K(self.Z, Xnew)
    sigma = tf.sqrt(self.likelihood.variance)
    L = tf.cholesky(Kuu)
    A = tf.matrix_triangular_solve(L, Kuf, lower=True) / sigma
    B = tf.matmul(A, tf.transpose(A)) + eye(num_inducing)
    LB = tf.cholesky(B)
    Aerr = tf.matmul(A, err)
    c = tf.matrix_triangular_solve(LB, Aerr, lower=True) / sigma
    tmp1 = tf.matrix_triangular_solve(L, Kus, lower=True)
    tmp2 = tf.matrix_triangular_solve(LB, tmp1, lower=True)
    mean = tf.matmul(tf.transpose(tmp2), c)
    if full_cov:
        var = self.kern.K(Xnew) + tf.matmul(tf.transpose(tmp2), tmp2) \
            - tf.matmul(tf.transpose(tmp1), tmp1)
        shape = tf.pack([1, 1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 2), shape)
    else:
        var = self.kern.Kdiag(Xnew) + tf.reduce_sum(tf.square(tmp2), 0) \
            - tf.reduce_sum(tf.square(tmp1), 0)
        shape = tf.pack([1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 1), shape)
    return mean + self.mean_function(Xnew), var
def testWrongDimensions(self):
    # The matrix and right-hand sides should have the same number of rows.
    with self.test_session():
        matrix = tf.constant([[1., 0.], [0., 1.]])
        rhs = tf.constant([[1., 0.]])
        with self.assertRaises(ValueError):
            tf.matrix_triangular_solve(matrix, rhs)
def _expectation(p, kern1, feat1, kern2, feat2, nghp=None):
    """
    Compute the expectation:
    expectation[n] = <Ka_{Z1, x_n} Kb_{x_n, Z2}>_p(x_n)
        - Ka_{.,.}, Kb_{.,.} :: RBF kernels

    Ka and Kb as well as Z1 and Z2 can differ from each other, but this is
    supported only if the Gaussian p is Diagonal (p.cov NxD) and Ka, Kb have
    disjoint active_dims, in which case the joint expectations simplify into
    a product of expectations

    :return: NxMxM
    """
    if kern1.on_separate_dims(kern2) and isinstance(p, DiagonalGaussian):  # no joint expectations required
        eKxz1 = expectation(p, (kern1, feat1))
        eKxz2 = expectation(p, (kern2, feat2))
        return eKxz1[:, :, None] * eKxz2[:, None, :]

    if feat1 != feat2 or kern1 != kern2:
        raise NotImplementedError("The expectation over two kernels has only an "
                                  "analytical implementation if both kernels are equal.")

    kern = kern1
    feat = feat1

    with params_as_tensors_for(kern), params_as_tensors_for(feat):
        # use only active dimensions
        Xcov = kern._slice_cov(tf.matrix_diag(p.cov) if isinstance(p, DiagonalGaussian) else p.cov)
        Z, Xmu = kern._slice(feat.Z, p.mu)

        N = tf.shape(Xmu)[0]
        D = tf.shape(Xmu)[1]

        squared_lengthscales = kern.lengthscales ** 2. if kern.ARD \
            else tf.zeros((D,), dtype=settings.tf_float) + kern.lengthscales ** 2.

        sqrt_det_L = tf.reduce_prod(0.5 * squared_lengthscales) ** 0.5
        C = tf.cholesky(0.5 * tf.matrix_diag(squared_lengthscales) + Xcov)  # NxDxD
        dets = sqrt_det_L / tf.exp(tf.reduce_sum(tf.log(tf.matrix_diag_part(C)), axis=1))  # N

        C_inv_mu = tf.matrix_triangular_solve(C, tf.expand_dims(Xmu, 2), lower=True)  # NxDx1
        C_inv_z = tf.matrix_triangular_solve(
            C, tf.tile(tf.expand_dims(tf.transpose(Z) / 2., 0), [N, 1, 1]), lower=True)  # NxDxM
        mu_CC_inv_mu = tf.expand_dims(tf.reduce_sum(tf.square(C_inv_mu), 1), 2)  # Nx1x1
        z_CC_inv_z = tf.reduce_sum(tf.square(C_inv_z), 1)  # NxM
        zm_CC_inv_zn = tf.matmul(C_inv_z, C_inv_z, transpose_a=True)  # NxMxM
        two_z_CC_inv_mu = 2 * tf.matmul(C_inv_z, C_inv_mu, transpose_a=True)[:, :, 0]  # NxM

        exponent_mahalanobis = mu_CC_inv_mu + tf.expand_dims(z_CC_inv_z, 1) + \
                               tf.expand_dims(z_CC_inv_z, 2) + 2 * zm_CC_inv_zn - \
                               tf.expand_dims(two_z_CC_inv_mu, 2) - tf.expand_dims(two_z_CC_inv_mu, 1)  # NxMxM
        exponent_mahalanobis = tf.exp(-0.5 * exponent_mahalanobis)  # NxMxM

        # Compute sqrt(self.K(Z)) explicitly to prevent automatic gradient from
        # being NaN sometimes, see pull request #615
        kernel_sqrt = tf.exp(-0.25 * kern.square_dist(Z, None))
        return kern.variance ** 2 * kernel_sqrt * \
               tf.reshape(dets, [N, 1, 1]) * exponent_mahalanobis
def build_predict(self, X_new, full_cov=False):
    Kx = self.RBF(self.X_train, X_new)
    # Kuu = self.RBF(self.X_train, self.X_train)
    L = tf.cholesky(self.condition(self.Kuu))
    A = tf.matrix_triangular_solve(L, Kx, lower=True)
    V = tf.matrix_triangular_solve(L, self.Y_train)
    fmean = tf.matmul(A, V, transpose_a=True) + self.age_mean
    if full_cov:
        fvar = self.RBF(X_new, X_new) - tf.matmul(A, A, transpose_a=True) \
            + tf.exp(self.variance_output) * self.eye(X_new.shape[0])
    else:
        fvar = tf.diag_part(self.RBF(X_new, X_new) - tf.matmul(A, A, transpose_a=True)) \
            + tf.exp(self.variance_output)
    return fmean, fvar
def _expectation(p, kern, feat, none1, none2, nghp=None):
    """
    Compute the expectation:
    <K_{X, Z}>_p(X)
        - K_{.,.} :: RBF kernel

    :return: NxM
    """
    with params_as_tensors_for(kern), params_as_tensors_for(feat):
        # use only active dimensions
        Xcov = kern._slice_cov(p.cov)
        Z, Xmu = kern._slice(feat.Z, p.mu)
        D = tf.shape(Xmu)[1]
        if kern.ARD:
            lengthscales = kern.lengthscales
        else:
            lengthscales = tf.zeros((D,), dtype=settings.tf_float) + kern.lengthscales

        chol_L_plus_Xcov = tf.cholesky(tf.matrix_diag(lengthscales ** 2) + Xcov)  # NxDxD

        all_diffs = tf.transpose(Z) - tf.expand_dims(Xmu, 2)  # NxDxM
        exponent_mahalanobis = tf.matrix_triangular_solve(chol_L_plus_Xcov, all_diffs, lower=True)  # NxDxM
        exponent_mahalanobis = tf.reduce_sum(tf.square(exponent_mahalanobis), 1)  # NxM
        exponent_mahalanobis = tf.exp(-0.5 * exponent_mahalanobis)  # NxM

        sqrt_det_L = tf.reduce_prod(lengthscales)
        sqrt_det_L_plus_Xcov = tf.exp(tf.reduce_sum(tf.log(tf.matrix_diag_part(chol_L_plus_Xcov)), axis=1))
        determinants = sqrt_det_L / sqrt_det_L_plus_Xcov  # N

        return kern.variance * (determinants[:, None] * exponent_mahalanobis)
def _build_predict(self, Xnew, full_cov=False):
    """
    The posterior variance of F is given by

        q(f) = N(f | K alpha + mean, [K^-1 + diag(lambda**2)]^-1)

    Here we project this to F*, the values of the GP at Xnew which is given by

        q(F*) = N ( F* | K_{*F} alpha + mean,
                    K_{**} - K_{*f}[K_{ff} + diag(lambda**-2)]^-1 K_{f*} )
    """
    # compute kernel things
    Kx = self.kern.K(self.X, Xnew)
    K = self.kern.K(self.X)

    # predictive mean
    f_mean = tf.matmul(Kx, self.q_alpha, transpose_a=True) + self.mean_function(Xnew)

    # predictive var
    A = K + tf.matrix_diag(tf.transpose(1. / tf.square(self.q_lambda)))
    L = tf.cholesky(A)
    Kx_tiled = tf.tile(tf.expand_dims(Kx, 0), [self.num_latent, 1, 1])
    LiKx = tf.matrix_triangular_solve(L, Kx_tiled)
    if full_cov:
        f_var = self.kern.K(Xnew) - tf.matmul(LiKx, LiKx, transpose_a=True)
    else:
        f_var = self.kern.Kdiag(Xnew) - tf.reduce_sum(tf.square(LiKx), 1)
    return f_mean, tf.transpose(f_var)
def _build_likelihood(self):
    """
    q_alpha, q_lambda are variational parameters, size N x R

    This method computes the variational lower bound on the likelihood,
    which is:

        E_{q(F)} [ \log p(Y|F) ] - KL[ q(F) || p(F)]

    with

        q(f) = N(f | K alpha + mean, [K^-1 + diag(square(lambda))]^-1) .
    """
    K = self.kern.K(self.X)
    K_alpha = tf.matmul(K, self.q_alpha)
    f_mean = K_alpha + self.mean_function(self.X)

    # compute the variance for each of the outputs
    I = tf.tile(tf.expand_dims(tf.eye(self.num_data, dtype=settings.float_type), 0),
                [self.num_latent, 1, 1])
    A = I + tf.expand_dims(tf.transpose(self.q_lambda), 1) * \
        tf.expand_dims(tf.transpose(self.q_lambda), 2) * K
    L = tf.cholesky(A)
    Li = tf.matrix_triangular_solve(L, I)
    tmp = Li / tf.expand_dims(tf.transpose(self.q_lambda), 1)
    f_var = 1. / tf.square(self.q_lambda) - tf.transpose(tf.reduce_sum(tf.square(tmp), 1))

    # some statistics about A are used in the KL
    A_logdet = 2.0 * tf.reduce_sum(tf.log(tf.matrix_diag_part(L)))
    trAi = tf.reduce_sum(tf.square(Li))

    KL = 0.5 * (A_logdet + trAi - self.num_data * self.num_latent +
                tf.reduce_sum(K_alpha * self.q_alpha))

    v_exp = self.likelihood.variational_expectations(f_mean, f_var, self.Y)
    return tf.reduce_sum(v_exp) - KL
def eKzxKxz(self, Z, Xmu, Xcov):
    """
    Also known as Phi_2.
    :param Z: MxD
    :param Xmu: X mean (NxD)
    :param Xcov: X covariance matrices (NxDxD)
    :return: NxMxM
    """
    # use only active dimensions
    Xcov = self._slice_cov(Xcov)
    Z, Xmu = self._slice(Z, Xmu)
    M = tf.shape(Z)[0]
    N = tf.shape(Xmu)[0]
    D = tf.shape(Xmu)[1]
    lengthscales = self.lengthscales if self.ARD else tf.zeros((D,), dtype=float_type) + self.lengthscales

    Kmms = tf.sqrt(self.K(Z, presliced=True)) / self.variance ** 0.5
    scalemat = tf.expand_dims(eye(D), 0) + 2 * Xcov * tf.reshape(lengthscales ** -2.0, [1, 1, -1])  # NxDxD
    det = tf.matrix_determinant(scalemat)

    mat = Xcov + 0.5 * tf.expand_dims(tf.diag(lengthscales ** 2.0), 0)  # NxDxD
    cm = tf.cholesky(mat)  # NxDxD
    vec = 0.5 * (tf.reshape(Z, [1, M, 1, D]) +
                 tf.reshape(Z, [1, 1, M, D])) - tf.reshape(Xmu, [N, 1, 1, D])  # NxMxMxD
    cmr = tf.tile(tf.reshape(cm, [N, 1, 1, D, D]), [1, M, M, 1, 1])  # NxMxMxDxD
    smI_z = tf.matrix_triangular_solve(cmr, tf.expand_dims(vec, 4))  # NxMxMxDx1
    fs = tf.reduce_sum(tf.square(smI_z), [3, 4])
    return self.variance ** 2.0 * tf.expand_dims(Kmms, 0) * tf.exp(-0.5 * fs) * tf.reshape(det ** -0.5, [N, 1, 1])
def build_likelihood_terms(self):
    Kdiag = reduce(tf.multiply,
                   [k.Kdiag(self.X[:, i:i + 1]) for i, k in enumerate(self.kerns)])
    Kuu = [make_Kuu(k, a, b, self.ms)
           for k, a, b in zip(self.kerns, self.a, self.b)]
    Kuu_solid = kron([Kuu_d.get() for Kuu_d in Kuu])
    Kuu_inv_solid = kron([Kuu_d.inv().get() for Kuu_d in Kuu])
    sigma2 = self.likelihood.variance

    # Compute intermediate matrices
    P = self.KufKfu / sigma2 + Kuu_solid
    L = tf.cholesky(P)
    log_det_P = tf.reduce_sum(tf.log(tf.square(tf.diag_part(L))))
    c = tf.matrix_triangular_solve(L, self.KufY) / sigma2

    Kuu_logdets = [K.logdet() for K in Kuu]
    N_others = [float(np.prod(self.Ms)) / M for M in self.Ms]
    Kuu_logdet = reduce(tf.add, [N * logdet for N, logdet in zip(N_others, Kuu_logdets)])

    # compute log marginal bound
    ND = tf.cast(tf.size(self.Y), float_type)
    D = tf.cast(tf.shape(self.Y)[1], float_type)

    return (-0.5 * ND * tf.log(2 * np.pi * sigma2),
            -0.5 * D * log_det_P,
            0.5 * D * Kuu_logdet,
            -0.5 * self.tr_YTY / sigma2,
            0.5 * tf.reduce_sum(tf.square(c)),
            -0.5 * tf.reduce_sum(Kdiag) / sigma2,
            0.5 * tf.reduce_sum(Kuu_inv_solid * self.KufKfu) / sigma2)
def multivariate_normal(x, mu, L):
    """
    Computes the log-density of a multivariate normal.
    :param x  : Dx1 or DxN sample(s) for which we want the density
    :param mu : Dx1 or DxN mean(s) of the normal distribution
    :param L  : DxD Cholesky decomposition of the covariance matrix
    :return p : (1,) or (N,) vector of log densities for each of the N x's and/or mu's

    x and mu are either vectors or matrices. If both are vectors (N,1):
    p[0] = log pdf(x) where x ~ N(mu, LL^T)
    If at least one is a matrix, we assume independence over the *columns*:
    the number of rows must match the size of L. Broadcasting behaviour:
    p[n] = log pdf of:
    x[n] ~ N(mu, LL^T) or x ~ N(mu[n], LL^T) or x[n] ~ N(mu[n], LL^T)
    """
    if x.shape.ndims is None:
        warnings.warn('Shape of x must be 2D at computation.')
    elif x.shape.ndims != 2:
        raise ValueError('Shape of x must be 2D.')
    if mu.shape.ndims is None:
        warnings.warn('Shape of mu may be unknown or not 2D.')
    elif mu.shape.ndims != 2:
        raise ValueError('Shape of mu must be 2D.')

    d = x - mu
    alpha = tf.matrix_triangular_solve(L, d, lower=True)
    num_dims = tf.cast(tf.shape(d)[0], L.dtype)
    p = - 0.5 * tf.reduce_sum(tf.square(alpha), 0)
    p -= 0.5 * num_dims * np.log(2 * np.pi)
    p -= tf.reduce_sum(tf.log(tf.matrix_diag_part(L)))
    return p
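# --- Sanity-check sketch (not part of the original source) --------------------
# The Cholesky-based density above can be checked against scipy.stats on a
# made-up toy example; the data below is invented purely for illustration.
import numpy as np
from scipy.stats import multivariate_normal as scipy_mvn

rng = np.random.RandomState(0)
A = rng.randn(3, 3)
cov = A @ A.T + np.eye(3)
mu = rng.randn(3)
x = rng.randn(3)

L = np.linalg.cholesky(cov)
alpha = np.linalg.solve(L, x - mu)      # same role as matrix_triangular_solve above
logp = (-0.5 * alpha @ alpha
        - 0.5 * len(x) * np.log(2 * np.pi)
        - np.sum(np.log(np.diag(L))))

assert np.allclose(logp, scipy_mvn.logpdf(x, mean=mu, cov=cov))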
def test_whiten(self):
    """
    Make sure that predicting using the whitened representation is the
    same as the non-whitened one.
    """
    with self.test_context() as sess:
        Xs, X, F, k, num_data, feed_dict = self.prepare()
        k.compile(session=sess)

        K = k.K(X) + tf.eye(num_data, dtype=settings.float_type) * 1e-6
        L = tf.cholesky(K)
        V = tf.matrix_triangular_solve(L, F, lower=True)
        Fstar_mean, Fstar_var = gpflow.conditionals.conditional(Xs, X, k, F)
        Fstar_w_mean, Fstar_w_var = gpflow.conditionals.conditional(Xs, X, k, V, white=True)

        mean1, var1 = sess.run([Fstar_w_mean, Fstar_w_var], feed_dict=feed_dict)
        mean2, var2 = sess.run([Fstar_mean, Fstar_var], feed_dict=feed_dict)

        # TODO: should tolerance be type dependent?
        assert_allclose(mean1, mean2)
        assert_allclose(var1, var2)
def build_likelihood(self):
    num_data = tf.shape(self.Y)[0]
    output_dim = tf.shape(self.Y)[1]

    total_variance = reduce(tf.add, [k.variance for k in self.kerns])
    Kuu = [make_Kuu(k, ai, bi, self.ms)
           for k, ai, bi in zip(self.kerns, self.a, self.b)]
    Kuu = BlockDiagMat_many([mat for k in Kuu for mat in [k.A, k.B]])
    sigma2 = self.likelihood.variance

    # Compute intermediate matrices
    P = self.KufKfu / sigma2 + Kuu.get()
    L = tf.cholesky(P)
    log_det_P = tf.reduce_sum(tf.log(tf.square(tf.diag_part(L))))
    c = tf.matrix_triangular_solve(L, self.KufY) / sigma2

    # compute log marginal bound
    ND = tf.cast(num_data * output_dim, float_type)
    D = tf.cast(output_dim, float_type)

    bound = -0.5 * ND * tf.log(2 * np.pi * sigma2)
    bound += -0.5 * D * log_det_P
    bound += 0.5 * D * Kuu.logdet()
    bound += -0.5 * self.tr_YTY / sigma2
    bound += 0.5 * tf.reduce_sum(tf.square(c))
    bound += -0.5 * ND * total_variance / sigma2
    bound += 0.5 * D * Kuu.trace_KiX(self.KufKfu) / sigma2

    return bound
def get_cholesky_solve_terms(Z, C=C):
    C_inv_z = tf.matrix_triangular_solve(
        C,
        tf.tile(tf.expand_dims(tf.transpose(Z), 0), [N, 1, 1]),
        lower=True)  # [N, D, M]
    z_CC_inv_z = tf.reduce_sum(tf.square(C_inv_z), 1)  # [N, M]
    return C_inv_z, z_CC_inv_z
def update_precond_type2(Q1, Q2, q3, dx, dg, step=0.01):
    """
    update type II limited-memory preconditioner P = Q'*Q, where the Cholesky
    factor is a block matrix,

        Q = [Q1, Q2; 0, diag(q3)]

    This preconditioner requires limited memory if Q1 (Q2) only has a few rows
    """
    r = Q1.shape.as_list()[0]

    # max_diag = tf.maximum(tf.reduce_max(tf.diag_part(Q1)), tf.reduce_max(q3))
    # Q1 = Q1 + tf.diag(tf.clip_by_value(_diag_loading*max_diag - tf.diag_part(Q1), 0.0, max_diag))
    # q3 = q3 + tf.clip_by_value(_diag_loading*max_diag - q3, 0.0, max_diag)

    a1 = tf.matmul(Q1, dg[:r]) + tf.matmul(Q2, dg[r:])
    a2 = tf.multiply(q3, dg[r:])
    b1 = tf.matrix_triangular_solve(tf.transpose(Q1), dx[:r], lower=True)
    b2 = tf.divide(dx[r:] - tf.matmul(Q2, b1, transpose_a=True), q3)

    grad1 = tf.matrix_band_part(
        tf.matmul(a1, a1, transpose_b=True) - tf.matmul(b1, b1, transpose_b=True), 0, -1)
    grad2 = tf.matmul(a1, a2, transpose_b=True) - tf.matmul(b1, b2, transpose_b=True)
    grad3 = tf.multiply(a2, a2) - tf.multiply(b2, b2)

    max_abs_grad = tf.reduce_max(tf.abs(grad1))
    max_abs_grad = tf.maximum(max_abs_grad, tf.reduce_max(tf.abs(grad2)))
    max_abs_grad = tf.maximum(max_abs_grad, tf.reduce_max(tf.abs(grad3)))
    step0 = step / (max_abs_grad + _tiny)
    return Q1 - tf.matmul(step0 * grad1, Q1), \
           Q2 - tf.matmul(step0 * grad1, Q2) - tf.multiply(step0 * grad2, tf.tile(tf.transpose(q3), [r, 1])), \
           q3 - tf.multiply(step0 * grad3, q3)
def build_predict(self, Xnew, full_cov=False):
    """
    The posterior variance of F is given by

        q(f) = N(f | K alpha, [K^-1 + diag(lambda**2)]^-1)

    Here we project this to F*, the values of the GP at Xnew which is given by

        q(F*) = N ( F* | K_{*F} alpha,
                    K_{**} - K_{*f}[K_{ff} + diag(lambda**-2)]^-1 K_{f*} )
    """
    # compute kernel things
    Kx = self.kern.K(Xnew, self.X)
    K = self.kern.K(self.X)

    # predictive mean
    f_mean = tf.matmul(Kx, self.q_alpha) + self.mean_function(Xnew)

    # predictive var
    f_var = []
    for d in range(self.num_latent):
        b = self.q_lambda[:, d]
        A = K + tf.diag(1. / tf.square(b))
        L = tf.cholesky(A)
        LiKx = tf.matrix_triangular_solve(L, tf.transpose(Kx), lower=True)
        if full_cov:
            f_var.append(self.kern.K(Xnew) - tf.matmul(tf.transpose(LiKx), LiKx))
        else:
            f_var.append(self.kern.Kdiag(Xnew) - tf.reduce_sum(tf.square(LiKx), 0))
    f_var = tf.pack(f_var)
    return f_mean, tf.transpose(f_var)
def _verifySolve(self, x, y, lower=True, adjoint=False, batch_dims=None, use_gpu=False):
    for np_type in [np.float32, np.float64]:
        a = x.astype(np_type)
        b = y.astype(np_type)
        # For numpy.solve we have to explicitly zero out the strictly
        # upper or lower triangle.
        if lower and a.size > 0:
            a_np = np.tril(a)
        elif a.size > 0:
            a_np = np.triu(a)
        else:
            a_np = a
        if adjoint:
            a_np = np.conj(np.transpose(a_np))

        if batch_dims is not None:
            a = np.tile(a, batch_dims + [1, 1])
            a_np = np.tile(a_np, batch_dims + [1, 1])
            b = np.tile(b, batch_dims + [1, 1])

        with self.test_session(use_gpu=use_gpu):
            tf_ans = tf.matrix_triangular_solve(a, b, lower=lower, adjoint=adjoint)
            out = tf_ans.eval()
            np_ans = np.linalg.solve(a_np, b)
            self.assertEqual(np_ans.shape, tf_ans.get_shape())
            self.assertEqual(np_ans.shape, out.shape)
            self.assertAllClose(np_ans, out)
def build_likelihood(self):
    """
    q_alpha, q_lambda are variational parameters, size N x R

    This method computes the variational lower bound on the likelihood, which is:

        E_{q(F)} [ \log p(Y|F) ] - KL[ q(F) || p(F)]

    with

        q(f) = N(f | K alpha, [K^-1 + diag(square(lambda))]^-1) .
    """
    K = self.kern.K(self.X)
    f_mean = tf.matmul(K, self.q_alpha) + self.mean_function(self.X)

    # for each of the data-dimensions (columns of Y), find the diagonal of the
    # variance, and also relevant parts of the KL.
    f_var, A_logdet, trAi = [], tf.zeros((1,), tf.float64), tf.zeros((1,), tf.float64)
    for d in range(self.num_latent):
        b = self.q_lambda[:, d]
        B = tf.expand_dims(b, 1)
        A = eye(self.num_data) + K * B * tf.transpose(B)
        L = tf.cholesky(A)
        Li = tf.matrix_triangular_solve(L, eye(self.num_data), lower=True)
        LiBi = Li / b
        # full_sigma: return tf.diag(b**-2) - LiBi.T.dot(LiBi)
        f_var.append(1. / tf.square(b) - tf.reduce_sum(tf.square(LiBi), 0))
        A_logdet += 2 * tf.reduce_sum(tf.log(tf.user_ops.get_diag(L)))
        trAi += tf.reduce_sum(tf.square(Li))

    f_var = tf.transpose(tf.pack(f_var))

    KL = 0.5 * (A_logdet + trAi - self.num_data * self.num_latent +
                tf.reduce_sum(f_mean * self.q_alpha))

    return tf.reduce_sum(self.likelihood.variational_expectations(f_mean, f_var, self.Y)) - KL
def build_likelihood(self):
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood.
    """
    num_inducing = tf.shape(self.Z)[0]
    psi0 = tf.reduce_sum(self.kern.eKdiag(self.X_mean, self.X_var), 0)
    psi1 = self.kern.eKxz(self.Z, self.X_mean, self.X_var)
    psi2 = tf.reduce_sum(self.kern.eKzxKxz(self.Z, self.X_mean, self.X_var), 0)
    Kuu = self.kern.K(self.Z) + eye(num_inducing) * 1e-6
    L = tf.cholesky(Kuu)
    sigma2 = self.likelihood.variance
    sigma = tf.sqrt(sigma2)

    # Compute intermediate matrices
    A = tf.matrix_triangular_solve(L, tf.transpose(psi1), lower=True) / sigma
    tmp = tf.matrix_triangular_solve(L, psi2, lower=True)
    AAT = tf.matrix_triangular_solve(L, tf.transpose(tmp), lower=True) / sigma2
    B = AAT + eye(num_inducing)
    LB = tf.cholesky(B)
    log_det_B = 2. * tf.reduce_sum(tf.log(tf.diag_part(LB)))
    c = tf.matrix_triangular_solve(LB, tf.matmul(A, self.Y), lower=True) / sigma

    # KL[q(x) || p(x)]
    dX_var = self.X_var if len(self.X_var.get_shape()) == 2 else tf.matrix_diag_part(self.X_var)
    NQ = tf.cast(tf.size(self.X_mean), float_type)
    D = tf.cast(tf.shape(self.Y)[1], float_type)
    KL = -0.5 * tf.reduce_sum(tf.log(dX_var)) \
         + 0.5 * tf.reduce_sum(tf.log(self.X_prior_var)) \
         - 0.5 * NQ \
         + 0.5 * tf.reduce_sum((tf.square(self.X_mean - self.X_prior_mean) + dX_var) / self.X_prior_var)

    # compute log marginal bound
    ND = tf.cast(tf.size(self.Y), float_type)
    bound = -0.5 * ND * tf.log(2 * np.pi * sigma2)
    bound += -0.5 * D * log_det_B
    bound += -0.5 * tf.reduce_sum(tf.square(self.Y)) / sigma2
    bound += 0.5 * tf.reduce_sum(tf.square(c))
    bound += -0.5 * D * (tf.reduce_sum(psi0) / sigma2 - tf.reduce_sum(tf.diag_part(AAT)))
    bound -= KL

    return bound
def build_likelihood(self):
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood. For a derivation of the terms in here, see the associated
    SGPR notebook.
    """
    num_inducing = tf.shape(self.Z)[0]
    num_data = tf.cast(tf.shape(self.Y)[0], settings.dtypes.float_type)
    output_dim = tf.cast(tf.shape(self.Y)[1], settings.dtypes.float_type)

    err = self.Y - self.mean_function(self.X)
    Kdiag = self.kern.Kdiag(self.X)
    Kuf = self.kern.K(self.Z, self.X)
    Kuu = self.kern.K(self.Z) + tf.eye(num_inducing, dtype=float_type) * settings.numerics.jitter_level
    L = tf.cholesky(Kuu)
    sigma = tf.sqrt(self.likelihood.variance)

    # Compute intermediate matrices
    A = tf.matrix_triangular_solve(L, Kuf, lower=True) / sigma
    AAT = tf.matmul(A, A, transpose_b=True)
    B = AAT + tf.eye(num_inducing, dtype=float_type)
    LB = tf.cholesky(B)
    Aerr = tf.matmul(A, err)
    c = tf.matrix_triangular_solve(LB, Aerr, lower=True) / sigma

    # compute log marginal bound
    bound = -0.5 * num_data * output_dim * np.log(2 * np.pi)
    bound += -output_dim * tf.reduce_sum(tf.log(tf.matrix_diag_part(LB)))
    bound -= 0.5 * num_data * output_dim * tf.log(self.likelihood.variance)
    bound += -0.5 * tf.reduce_sum(tf.square(err)) / self.likelihood.variance
    bound += 0.5 * tf.reduce_sum(tf.square(c))
    bound += -0.5 * output_dim * tf.reduce_sum(Kdiag) / self.likelihood.variance
    bound += 0.5 * output_dim * tf.reduce_sum(tf.matrix_diag_part(AAT))

    if self.reg:
        # add regularization
        beta = 1000.
        regularization = -beta * reduce(tf.add, map(tf.abs, self.kern.var_vector))  # L-1 norm
        return bound + regularization
    else:
        return bound
def build_predict(self, Xnew, full_cov=False):
    # w = w./repmat(ell',[m,1]); % scaled model angular frequencies
    w = self.omega / self.kern.lengthscales
    m = tf.shape(self.omega)[0]
    m_float = tf.cast(m, tf.float64)
    # phi = x_tr*w';
    phi = tf.matmul(self.X, tf.transpose(w))
    # phi = [cos(phi) sin(phi)]; % design matrix
    phi = tf.concat([tf.cos(phi), tf.sin(phi)], axis=1)
    # R = chol((sf2/m)*(phi'*phi) + sn2*eye(2*m)); % calculate some often-used constants
    A = (self.kern.variance / m_float) * tf.matmul(tf.transpose(phi), phi) \
        + self.likelihood.variance * gpflow.tf_wraps.eye(2 * m)
    RT = tf.cholesky(A)
    R = tf.transpose(RT)
    # RtiPhit = PhiRi';
    RtiPhit = tf.matrix_triangular_solve(RT, tf.transpose(phi))
    # Rtiphity=RtiPhit*y_tr;
    Rtiphity = tf.matmul(RtiPhit, self.Y)
    # alfa=sf2/m*(R\Rtiphity); % cosines/sines coefficients
    alpha = self.kern.variance / m_float * tf.matrix_triangular_solve(R, Rtiphity, lower=False)
    # phistar = x_tst*w';
    phistar = tf.matmul(Xnew, tf.transpose(w))
    # phistar = [cos(phistar) sin(phistar)]; % test design matrix
    phistar = tf.concat([tf.cos(phistar), tf.sin(phistar)], axis=1)
    # out1(beg_chunk:end_chunk) = phistar*alfa; % Predictive mean
    mean = tf.matmul(phistar, alpha)
    # % also output predictive variance
    # out2(beg_chunk:end_chunk) = sn2*(1+sf2/m*sum((phistar/R).^2,2)); % Predictive variance
    RtiPhistart = tf.matrix_triangular_solve(RT, tf.transpose(phistar))
    PhiRistar = tf.transpose(RtiPhistart)
    # NB: do not add in noise variance to the predictive var: gpflow does that for us.
    if full_cov:
        var = self.likelihood.variance * self.kern.variance / m_float * \
              tf.matmul(PhiRistar, tf.transpose(PhiRistar)) + \
              gpflow.tf_wraps.eye(tf.shape(Xnew)[0]) * 1e-6
        var = tf.expand_dims(var, 2)
    else:
        var = self.likelihood.variance * self.kern.variance / m_float * \
              tf.reduce_sum(tf.square(PhiRistar), 1)
        var = tf.expand_dims(var, 1)
    return mean, var
def __init__(self, prec_mean, prec, d=None):
    prec_mean = tf.convert_to_tensor(prec_mean)
    prec = tf.convert_to_tensor(prec)

    try:
        d1, = util.extract_shape(prec_mean)
        prec_mean = tf.reshape(prec_mean, (d1, 1))
    except:
        d1, k = util.extract_shape(prec_mean)
        assert (k == 1)

    d2, _ = util.extract_shape(prec)
    assert (d1 == d2)

    if d is None:
        d = d1
    else:
        assert (d == d1)

    super(MVGaussianNatural, self).__init__(d=d)

    self._prec_mean = prec_mean
    self._prec = prec

    self._L_prec = tf.cholesky(prec)
    self._entropy = util.dists.multivariate_gaussian_entropy(L_prec=self._L_prec)

    # want to solve prec * mean = prec_mean for mean.
    # this is equiv to (LL') * mean = prec_mean.
    # since tf doesn't have a cholSolve shortcut, just do it directly:
    #   solve L y = prec_mean to get y = (L' * mean), then
    #   solve L' mean = y
    y = tf.matrix_triangular_solve(self._L_prec, self._prec_mean, lower=True, adjoint=False)
    self._mean = tf.matrix_triangular_solve(self._L_prec, y, lower=True, adjoint=True)

    L_cov_transpose = util.triangular_inv(self._L_prec)
    self._L_cov = tf.transpose(L_cov_transpose)
    self._cov = tf.matmul(self._L_cov, L_cov_transpose)
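# --- Sanity-check sketch (not part of the original source) --------------------
# The constructor above recovers the mean from natural parameters with two
# triangular solves against the Cholesky factor of the precision. A minimal
# NumPy sketch with made-up toy values, checked against a direct solve:
import numpy as np
from scipy.linalg import solve_triangular

rng = np.random.RandomState(1)
B = rng.randn(4, 4)
prec = B @ B.T + 4 * np.eye(4)   # toy precision matrix
prec_mean = rng.randn(4, 1)      # toy natural mean, prec @ mean

L_prec = np.linalg.cholesky(prec)
y = solve_triangular(L_prec, prec_mean, lower=True)        # L y = prec_mean
mean = solve_triangular(L_prec, y, lower=True, trans='T')  # L^T mean = y

assert np.allclose(mean, np.linalg.solve(prec, prec_mean))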
def _build_likelihood(self):
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood. For a derivation of the terms in here, see the associated
    SGPR notebook.
    """
    ND = tf.cast(tf.size(self.Y), settings.float_type)
    D = tf.cast(tf.shape(self.Y)[1], settings.float_type)

    Kxu = self.kern.K(self.X, self.feature.Z)
    psi0 = self._psi0()
    psi1 = self._psi1(Kxu)
    psi2 = self._psi2(Kxu)

    # Copy this into blocks for each dimension
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
    L = tf.cholesky(Kuu)
    L = block_diagonal([L for _ in range(self.W.shape[1])])

    sigma2 = self.likelihood.variance
    sigma = tf.sqrt(sigma2)

    # Compute intermediate matrices
    A = tf.matrix_triangular_solve(L, tf.transpose(psi1), lower=True) / sigma
    tmp = tf.matrix_triangular_solve(L, psi2, lower=True)
    AAT = tf.matrix_triangular_solve(L, tf.transpose(tmp), lower=True) / sigma2
    B = AAT + tf.eye(self.num_inducing, dtype=settings.float_type)
    LB = tf.cholesky(B)
    log_det_B = 2. * tf.reduce_sum(tf.log(tf.matrix_diag_part(LB)))
    c = tf.matrix_triangular_solve(LB, tf.matmul(A, self.Y), lower=True) / sigma

    # KL[q(W) || p(W)]
    KL = tf.reduce_sum(self.Wnorm() * (tf.log(self.Wnorm()) - tf.log(self.W_prior)))

    # compute log marginal bound
    bound = -0.5 * ND * tf.log(2 * np.pi * sigma2)
    bound += -0.5 * D * log_det_B
    bound += -0.5 * tf.reduce_sum(tf.square(self.Y)) / sigma2
    bound += 0.5 * tf.reduce_sum(tf.square(c))
    bound += -0.5 * D * (tf.reduce_sum(psi0) / sigma2 - tf.reduce_sum(tf.matrix_diag_part(AAT)))
    bound -= KL

    return bound
def inv(self):
    di = tf.reciprocal(self.d)
    d_col = tf.expand_dims(self.d, 1)
    DiW = self.W / d_col
    M = tf.eye(tf.shape(self.W)[1], dtype=float_type) + tf.matmul(tf.transpose(DiW), self.W)
    L = tf.cholesky(M)
    v = tf.transpose(tf.matrix_triangular_solve(L, tf.transpose(DiW), lower=True))
    return LowRankMatNeg(di, v)
def predict_components(self, Xnew):
    """
    Here, Xnew should be a Nnew x 1 array of points at which to test each function
    """
    Kuu = [make_Kuu(k, ai, bi, self.ms)
           for k, ai, bi in zip(self.kerns, self.a, self.b)]
    Kuu = BlockDiagMat_many([mat for k in Kuu for mat in [k.A, k.B]])
    sigma2 = self.likelihood.variance

    # Compute intermediate matrices
    P = self.KufKfu / sigma2 + Kuu.get()
    L = tf.cholesky(P)
    c = tf.matrix_triangular_solve(L, self.KufY) / sigma2

    Kus_blocks = [make_Kuf(k, Xnew, a, b, self.ms)
                  for i, (k, a, b) in enumerate(zip(self.kerns, self.a, self.b))]
    Kus = []
    start = tf.constant(0, tf.int32)
    for i, b in enumerate(Kus_blocks):
        zeros_above = tf.zeros(tf.pack([start, tf.shape(b)[1]]), float_type)
        zeros_below = tf.zeros(tf.pack([tf.shape(L)[0] - start - tf.shape(b)[0], tf.shape(b)[1]]), float_type)
        Kus.append(tf.concat(0, [zeros_above, b, zeros_below]))
        start = start + tf.shape(b)[0]

    tmp = [tf.matrix_triangular_solve(L, Kus_i) for Kus_i in Kus]
    mean = [tf.matmul(tf.transpose(tmp_i), c) for tmp_i in tmp]
    KiKus = [Kuu.solve(Kus_i) for Kus_i in Kus]

    var = [k.Kdiag(Xnew[:, i:i + 1]) for i, k in enumerate(self.kerns)]
    var = [v + tf.reduce_sum(tf.square(tmp_i), 0) for v, tmp_i in zip(var, tmp)]
    var = [v - tf.reduce_sum(KiKus_i * Kus_i, 0) for v, KiKus_i, Kus_i in zip(var, KiKus, Kus)]
    var = [tf.expand_dims(v, 1) for v in var]

    return tf.concat(1, mean), tf.concat(1, var)
def dmvnorm(y, mean, sigma):
    L = tf.cholesky(sigma)
    kern_sqr = tf.matrix_triangular_solve(L, y - mean, lower=True)
    n = tf.cast(tf.shape(sigma)[1], tf.float32)

    loglike = -0.5 * n * tf.log(2.0 * np.pi)
    loglike += -tf.reduce_sum(tf.log(tf.matrix_diag_part(L)))
    loglike += -0.5 * tf.reduce_sum(tf.square(kern_sqr))
    return loglike
def neglogp(self, x):
    delta = tf.expand_dims(x - self.mean, axis=-1)
    stds = 0 * delta + self.std
    half_quadratic = tf.matrix_triangular_solve(stds, delta, lower=False)
    quadratic = tf.matmul(half_quadratic, half_quadratic, transpose_a=True)
    return 0.5 * (self.log_det_cov + quadratic + self.size * tf.log(2 * tf.constant(np.pi)))
def _expectation(p, rbf_kern, feat1, lin_kern, feat2, nghp=None):
    """
    Compute the expectation:
    expectation[n] = <Ka_{Z1, x_n} Kb_{x_n, Z2}>_p(x_n)
        - Ka_{.,.} :: RBF kernel
        - Kb_{.,.} :: Linear kernel

    Different Z1 and Z2 are handled if p is diagonal and K_lin and K_rbf have
    disjoint active_dims, in which case the joint expectations simplify into
    a product of expectations

    :return: NxM1xM2
    """
    if rbf_kern.on_separate_dims(lin_kern) and isinstance(p, DiagonalGaussian):  # no joint expectations required
        eKxz1 = expectation(p, (rbf_kern, feat1))
        eKxz2 = expectation(p, (lin_kern, feat2))
        return eKxz1[:, :, None] * eKxz2[:, None, :]

    if feat1 != feat2:
        raise NotImplementedError("Features have to be the same for both kernels.")

    if rbf_kern.active_dims != lin_kern.active_dims:
        raise NotImplementedError("active_dims have to be the same for both kernels.")

    with params_as_tensors_for(rbf_kern), params_as_tensors_for(lin_kern), \
         params_as_tensors_for(feat1), params_as_tensors_for(feat2):
        # use only active dimensions
        Xcov = rbf_kern._slice_cov(tf.matrix_diag(p.cov) if isinstance(p, DiagonalGaussian) else p.cov)
        Z, Xmu = rbf_kern._slice(feat1.Z, p.mu)

        N = tf.shape(Xmu)[0]
        D = tf.shape(Xmu)[1]

        lin_kern_variances = lin_kern.variance if lin_kern.ARD \
            else tf.zeros((D,), dtype=settings.tf_float) + lin_kern.variance

        rbf_kern_lengthscales = rbf_kern.lengthscales if rbf_kern.ARD \
            else tf.zeros((D,), dtype=settings.tf_float) + rbf_kern.lengthscales

        ## Begin RBF eKxz code:
        chol_L_plus_Xcov = tf.cholesky(tf.matrix_diag(rbf_kern_lengthscales ** 2) + Xcov)  # NxDxD

        Z_transpose = tf.transpose(Z)
        all_diffs = Z_transpose - tf.expand_dims(Xmu, 2)  # NxDxM
        exponent_mahalanobis = tf.matrix_triangular_solve(chol_L_plus_Xcov, all_diffs, lower=True)  # NxDxM
        exponent_mahalanobis = tf.reduce_sum(tf.square(exponent_mahalanobis), 1)  # NxM
        exponent_mahalanobis = tf.exp(-0.5 * exponent_mahalanobis)  # NxM

        sqrt_det_L = tf.reduce_prod(rbf_kern_lengthscales)
        sqrt_det_L_plus_Xcov = tf.exp(tf.reduce_sum(tf.log(tf.matrix_diag_part(chol_L_plus_Xcov)), axis=1))
        determinants = sqrt_det_L / sqrt_det_L_plus_Xcov  # N

        eKxz_rbf = rbf_kern.variance * (determinants[:, None] * exponent_mahalanobis)  ## NxM <- End RBF eKxz code

        tiled_Z = tf.tile(tf.expand_dims(Z_transpose, 0), (N, 1, 1))  # NxDxM
        z_L_inv_Xcov = tf.matmul(tiled_Z, Xcov / rbf_kern_lengthscales[:, None] ** 2., transpose_a=True)  # NxMxD

        cross_eKzxKxz = tf.cholesky_solve(
            chol_L_plus_Xcov, (lin_kern_variances * rbf_kern_lengthscales ** 2.)[..., None] * tiled_Z)  # NxDxM

        cross_eKzxKxz = tf.matmul((z_L_inv_Xcov + Xmu[:, None, :]) * eKxz_rbf[..., None], cross_eKzxKxz)  # NxMxM
        return cross_eKzxKxz
def _matrix_triangular_solve_tensor(self, other, lower):
    """
    Solve self @ x = other for x when other is a full tensor

    Matrix sizes:
    A    : n x m
    A[i] : n_i x m_i
    X    : m x p
    rhs  : n x p

    Recursive algorithm based on Bilionis et al., "Multi-output separable
    Gaussian process: Towards an efficient, fully Bayesian paradigm for
    uncertainty quantification" (2013)

    :param other: the right-hand side of the system of equations
    :type other: tf.Tensor
    :param lower: whether self is a lower (True) or upper (False) triangular matrix
    :type lower: bool
    :return: (KroneckerProduct)
    """
    assert lower, "upper triangular not implemented"
    if self.k == 1:
        return tf.matrix_triangular_solve(self.x[0], other, lower)
    else:
        n = self.shape[0]
        p = other.shape[1]
        n_0 = int(self.x[0].shape[0])
        n_prime = n // n_0
        a_prime = KroneckerProduct(self.x[1:])
        a_0 = self.x[0]
        x_cols = []
        for i in range(p):
            # See KP times matrix for notes about Fortran-style reshaping...
            x1i = a_prime.matrix_triangular_solve(
                tf.transpose(tf.reshape(other[:, i], (n_0, n_prime))), lower)
            # Note: The formula has a transpose before vectorizing.
            # However, F-style reshape needs a transpose as well.
            # So, they cancel and no transpose is carried out after trtrs.
            x_cols.append(tf.reshape(
                tf.matrix_triangular_solve(a_0, tf.transpose(x1i), lower), [-1]))
        return tf.stack(x_cols, 1)
def conditional_ND_not_share_Z(self, X, full_cov=False):
    mean_lst, var_lst, A_tiled_lst = [], [], []
    for nd in range(self.num_nodes):
        pa_nd = self.pa_idx(nd)
        X_tmp = tf.gather(X, pa_nd, axis=1)
        Kuf_nd = self.feature[nd].Kuf(self.kern[nd], X_tmp)
        A_nd = tf.matrix_triangular_solve(self.Lu[nd], Kuf_nd, lower=True)
        A_nd = tf.matrix_triangular_solve(tf.transpose(self.Lu[nd]), A_nd, lower=False)

        mean_tmp = tf.matmul(A_nd,
                             self.q_mu[:, nd * self.dim_per_out:(nd + 1) * self.dim_per_out],
                             transpose_a=True)
        if self.nb_init:
            mean_tmp += self.mean_function[nd](X_tmp)
        else:
            mean_tmp += self.mean_function[nd](X[:, nd * self.dim_per_in:(nd + 1) * self.dim_per_in])
        mean_lst.append(mean_tmp)

        A_tiled_lst.append(tf.tile(A_nd[None, :, :], [self.dim_per_out, 1, 1]))
        SK_nd = -self.Ku_tiled_lst[nd]
        q_sqrt_nd = self.q_sqrt_lst[nd]
        with params_as_tensors_for(q_sqrt_nd, convert=True):
            SK_nd += tf.matmul(q_sqrt_nd, q_sqrt_nd, transpose_b=True)
        B_nd = tf.matmul(SK_nd, A_tiled_lst[nd])

        # (num_latent, num_X)
        delta_cov_nd = tf.reduce_sum(A_tiled_lst[nd] * B_nd, 1)
        Kff_nd = self.kern[nd].Kdiag(X_tmp)

        # (1, num_X) + (num_latent, num_X)
        var_nd = tf.expand_dims(Kff_nd, 0) + delta_cov_nd
        var_nd = tf.transpose(var_nd)
        var_lst.append(var_nd)

    mean = tf.concat(mean_lst, axis=1)
    var = tf.concat(var_lst, axis=1)
    return mean, var
def _compute_cache(self):
    K = self.kern.K(self.X) + tf.eye(tf.shape(self.X)[0], dtype=settings.float_type) * self.likelihood.variance
    L = tf.cholesky(K, name='gp_cholesky')
    V = tf.matrix_triangular_solve(L, self.Y - self.mean_function(self.X), name='gp_alpha')
    return L, V
def _forward(self, x):
    with tf.control_dependencies(self._assertions(x)):
        x_shape = tf.shape(x)
        identity_matrix = tf.eye(x_shape[-1], batch_shape=x_shape[:-2], dtype=x.dtype.base_dtype)
        # Note `matrix_triangular_solve` implicitly zeros upper triangular of `x`.
        y = tf.matrix_triangular_solve(x, identity_matrix)
        y = tf.matmul(y, y, adjoint_a=True)
        return tf.cholesky(y)
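# --- Sanity-check sketch (not part of the original source) --------------------
# The _forward above maps a Cholesky factor x = chol(M) to chol(M^{-1}):
# a triangular solve against the identity gives x^{-1}, y^T y then equals
# (x x^T)^{-1} = M^{-1}, and a final Cholesky factorizes that inverse.
# A NumPy sketch of the same three steps on a made-up SPD matrix:
import numpy as np
from scipy.linalg import solve_triangular

rng = np.random.RandomState(2)
A = rng.randn(3, 3)
M = A @ A.T + 3 * np.eye(3)
x = np.linalg.cholesky(M)                       # input: chol(M)

y = solve_triangular(x, np.eye(3), lower=True)  # x^{-1}
y = y.T @ y                                     # (x x^T)^{-1} = M^{-1}
out = np.linalg.cholesky(y)                     # chol(M^{-1})

assert np.allclose(out @ out.T, np.linalg.inv(M))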
def multivariate_gaussian_log_density(x, mu,
                                      Sigma=None, L=None,
                                      prec=None, L_prec=None):
    """
    Assume X is a single vector described by a multivariate Gaussian
    distribution with x ~ N(mu, Sigma).

    We accept parameterization in terms of the covariance matrix or
    its cholesky decomposition L (more efficient if available), or the
    precision matrix or its cholesky decomposition L_prec.
    The latter is useful when representing a Gaussian in its natural
    parameterization. Note that we still require the explicit mean mu
    (not the natural parameter prec*mu) since I'm too lazy to cover all
    the permutations of possible arguments (though this should be
    straightforward).
    """
    s = extract_shape(x)
    try:
        n, = s
    except:
        n, m = s
        assert (m == 1)

    if L is None and Sigma is not None:
        L = tf.cholesky(Sigma)
    if L_prec is None and prec is not None:
        L_prec = tf.cholesky(prec)

    if L is not None:
        neg_half_logdet = -tf.reduce_sum(tf.log(tf.diag_part(L)))
    else:
        assert (L_prec is not None)
        neg_half_logdet = tf.reduce_sum(tf.log(tf.diag_part(L_prec)))

    d = tf.reshape(x - mu, (n, 1))
    if L is not None:
        alpha = tf.matrix_triangular_solve(L, d, lower=True)
        exponential_part = tf.reduce_sum(tf.square(alpha))
    elif prec is not None:
        d = tf.reshape(d, (n, 1))
        exponential_part = tf.reduce_sum(d * tf.matmul(prec, d))
    else:
        assert (L_prec is not None)
        d = tf.reshape(d, (1, n))
        alpha = tf.matmul(d, L_prec)
        exponential_part = tf.reduce_sum(tf.square(alpha))

    n_log2pi = n * 1.83787706641

    logp = -0.5 * n_log2pi
    logp += neg_half_logdet
    logp += -0.5 * exponential_part

    return logp
def test_whitening(self):
    with self.test_context() as sess:
        mu = tf.placeholder(FLOAT_TYPE, shape=(self.D, self.M))
        Q_chol = tf.placeholder(FLOAT_TYPE, shape=(self.D, self.M, self.M))
        P_chol = tf.placeholder(FLOAT_TYPE, shape=(self.D, self.M, self.M))
        feed_dict = self.get_feed_dict([mu], [Q_chol], [P_chol])

        KL_black = sess.run(KL(mu, Q_chol, P_chol=P_chol), feed_dict)
        KL_white = sess.run(
            KL(
                tf.matrix_triangular_solve(P_chol, mu[:, :, None], lower=True)[..., 0],
                tf.matrix_triangular_solve(P_chol, Q_chol, lower=True)),
            feed_dict)

        assert_allclose(KL_black, KL_white)
def build_posterior_mean_var(self, X, Y, test_points, full_cov=False):
    noise_var = self.likelihood.variance.tensor
    Kx = self.kernel.K(X, test_points)
    K = self.kernel.K(X)
    K += tfhacks.eye(tf.shape(X)[0], X.dtype) * noise_var
    L = tf.cholesky(K)
    A = tf.matrix_triangular_solve(L, Kx, lower=True)
    V = tf.matrix_triangular_solve(L, Y - self.meanfunction(X))
    fmean = tf.matmul(A, V, transpose_a=True)
    fmean += self.meanfunction(test_points)
    if full_cov:
        fvar = self.kernel.K(test_points) - tf.matmul(A, A, transpose_a=1)
        fvar = tf.tile(tf.expand_dims(fvar, 2), (1, 1, tf.shape(Y)[1]))
    else:
        fvar = self.kernel.Kdiag(test_points)
        fvar -= tf.reduce_sum(tf.square(A), 0)
        fvar = tf.tile(tf.expand_dims(fvar, 1), (1, tf.shape(Y)[1]))
    return fmean, fvar
def _build_likelihood(self):
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood.
    """
    pX = DiagonalGaussian(self.X_mean, self.X_var)
    num_inducing = len(self.feature)
    psi0 = tf.reduce_sum(expectation(pX, self.kern))
    psi1 = expectation(pX, (self.feature, self.kern))
    psi2 = tf.reduce_sum(expectation(pX, (self.feature, self.kern), (self.feature, self.kern)), axis=0)
    Kuu = self.feature.Kuu(self.kern, jitter=settings.numerics.jitter_level)
    L = tf.cholesky(Kuu)
    sigma2 = self.likelihood.variance
    sigma = tf.sqrt(sigma2)

    # Compute intermediate matrices
    A = tf.matrix_triangular_solve(L, tf.transpose(psi1), lower=True) / sigma
    tmp = tf.matrix_triangular_solve(L, psi2, lower=True)
    AAT = tf.matrix_triangular_solve(L, tf.transpose(tmp), lower=True) / sigma2
    B = AAT + tf.eye(num_inducing, dtype=settings.float_type)
    LB = tf.cholesky(B)
    log_det_B = 2. * tf.reduce_sum(tf.log(tf.matrix_diag_part(LB)))
    c = tf.matrix_triangular_solve(LB, tf.matmul(A, self.Y), lower=True) / sigma

    # KL[q(x) || p(x)]
    dX_var = self.X_var if len(self.X_var.get_shape()) == 2 else tf.matrix_diag_part(self.X_var)
    NQ = tf.cast(tf.size(self.X_mean), settings.float_type)
    D = tf.cast(tf.shape(self.Y)[1], settings.float_type)
    KL = -0.5 * tf.reduce_sum(tf.log(dX_var)) \
         + 0.5 * tf.reduce_sum(tf.log(self.X_prior_var)) \
         - 0.5 * NQ \
         + 0.5 * tf.reduce_sum((tf.square(self.X_mean - self.X_prior_mean) + dX_var) / self.X_prior_var)

    # compute log marginal bound
    ND = tf.cast(tf.size(self.Y), settings.float_type)
    bound = -0.5 * ND * tf.log(2 * np.pi * sigma2)
    bound += -0.5 * D * log_det_B
    bound += -0.5 * tf.reduce_sum(tf.square(self.Y)) / sigma2
    bound += 0.5 * tf.reduce_sum(tf.square(c))
    bound += -0.5 * D * (tf.reduce_sum(psi0) / sigma2 - tf.reduce_sum(tf.matrix_diag_part(AAT)))
    bound -= KL

    return bound
def __init__(self, D_IN, D_OUT, M=50):
    """
    Initialize GP layer
    D_IN  : dimension of input
    D_OUT : dimension of output
    M     : the number of inducing points
    """
    self.sig_offset = 1e-5
    self.process_sig_offset = 1e-5
    self.lambda_offset = 1e-5

    self.D_IN = D_IN
    self.D_OUT = D_OUT
    self.M = M

    # Define mean function
    self._init_mean_function(D_IN, D_OUT)

    # Kernel Parameters (SE ARD kernel)
    self.ARD_loglambda = tf.Variable(np.zeros([1, D_IN]), dtype=tf.float32)  # (1,Din)
    self.ARD_lambda = tf.exp(self.ARD_loglambda) + self.lambda_offset
    self.ARD_logsig0 = tf.Variable(0.0, dtype=tf.float32)
    self.ARD_var0 = (tf.exp(self.ARD_logsig0) + self.sig_offset) ** 2

    # Inducing Points (Z -> U)
    self.Z = tf.Variable(np.random.uniform(-3, 3, [M, D_IN]), dtype=tf.float32)  # (M,Din)
    self.GPmean = self.mean(self.Z)
    self.U_mean = tf.Variable(np.random.uniform(-3, 3, [M, D_OUT]),
                              dtype=tf.float32)  # np.random.uniform(-2,2,[M,D_OUT])  # (M,Dout)
    self.U_logL_diag = tf.Variable(np.zeros([D_OUT, M]), dtype=tf.float32)  # (Dout,M)
    self.U_L_diag = tf.exp(self.U_logL_diag)
    self.U_L_nondiag = tf.Variable(np.zeros([D_OUT, int(M * (M - 1) / 2)]), dtype=tf.float32)  # (Dout,M(M-1)/2)
    self.U_L = tf.matrix_set_diag(vecs_to_tri(self.U_L_nondiag, M), self.U_L_diag)  # (Dout,M,M)
    self.U_cov = (self.sig_offset ** 2) * tf.eye(M, batch_shape=[D_OUT]) \
        + self.U_L @ tf.transpose(self.U_L, perm=(0, 2, 1))  # (Dout,M,M)

    # Covariance among inducing points
    self.Kzz = SEARD(self.Z, self.Z, self.ARD_lambda, self.ARD_var0, M, M, D_IN) \
        + (self.sig_offset ** 2) * tf.eye(M)  # (M,M)
    self.Kzz_L = tf.cholesky(self.Kzz)
    self.Kzz_L_inv = tf.matrix_triangular_solve(self.Kzz_L, tf.eye(M), lower=True)  # tf.matrix_inverse(self.Kzz_L)
    self.Kzz_inv = tf.transpose(self.Kzz_L_inv) @ self.Kzz_L_inv

    # Processing Noise
    self.logbeta = tf.Variable(np.zeros([1, D_OUT]), dtype=tf.float32)  # set as 0.0
    self.beta = tf.exp(self.logbeta) + self.process_sig_offset  # (1,Dout)
    self.beta_expand = tf.expand_dims(self.beta, axis=0)  # (1,1,Dout)
def conditional_ND(self, X, full_cov=False):
    self.build_cholesky_if_needed()

    # mmean, vvar = conditional(X, self.feature.Z, self.kern,
    #                           self.q_mu, q_sqrt=self.q_sqrt,
    #                           full_cov=full_cov, white=self.white)
    Kuf = self.feature.Kuf(self.kern, X)

    A = tf.matrix_triangular_solve(self.Lu, Kuf, lower=True)
    if not self.white:
        A = tf.matrix_triangular_solve(tf.transpose(self.Lu), A, lower=False)

    mean = tf.matmul(A, self.q_mu, transpose_a=True)

    A_tiled = tf.tile(A[None, :, :], [self.num_outputs, 1, 1])
    I = tf.eye(self.num_inducing, dtype=settings.float_type)[None, :, :]

    if self.white:
        SK = -I
    else:
        SK = -self.Ku_tiled

    if self.q_sqrt is not None:
        SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True)

    B = tf.matmul(SK, A_tiled)

    if full_cov:
        # (num_latent, num_X, num_X)
        delta_cov = tf.matmul(A_tiled, B, transpose_a=True)
        Kff = self.kern.K(X)
    else:
        # (num_latent, num_X)
        delta_cov = tf.reduce_sum(A_tiled * B, 1)
        Kff = self.kern.Kdiag(X)

    # either (1, num_X) + (num_latent, num_X) or (1, num_X, num_X) + (num_latent, num_X, num_X)
    var = tf.expand_dims(Kff, 0) + delta_cov
    var = tf.transpose(var)

    return mean + self.mean_function(X), var
def _build_likelihood(self):
    if self.fDebug:
        print('assignegp_denseSparse compiling model (build_likelihood)')
    N = tf.cast(tf.shape(self.Y)[0], dtype=settings.float_type)
    M = tf.shape(self.ZExpanded)[0]
    D = tf.cast(tf.shape(self.Y)[1], dtype=settings.float_type)

    Phi = tf.nn.softmax(self.logPhi)
    # try squashing Phi to avoid numerical errors
    Phi = (1 - 2e-6) * Phi + 1e-6

    sigma2 = self.likelihood.variance
    sigma = tf.sqrt(self.likelihood.variance)

    Kuu = self.kern.K(self.ZExpanded) + tf.eye(M, dtype=settings.float_type) * settings.numerics.jitter_level
    Kuf = self.kern.K(self.ZExpanded, self.X)
    Kdiag = self.kern.Kdiag(self.X)

    L = tf.cholesky(Kuu)
    A = tf.reduce_sum(Phi, 0)
    LiKuf = tf.matrix_triangular_solve(L, Kuf)
    W = LiKuf * tf.sqrt(A) / sigma
    P = tf.matmul(W, tf.transpose(W)) + tf.eye(M, dtype=settings.float_type)
    traceTerm = -0.5 * tf.reduce_sum(Kdiag * A) / sigma2 + 0.5 * tf.reduce_sum(tf.square(W))
    R = tf.cholesky(P)
    tmp = tf.matmul(LiKuf, tf.matmul(tf.transpose(Phi), self.Y))
    c = tf.matrix_triangular_solve(R, tmp, lower=True) / sigma2

    if self.fDebug:
        # trace term should be 0 for Z=X (full data)
        traceTerm = tf.Print(traceTerm, [traceTerm], message='traceTerm=', name='traceTerm', summarize=10)

    self.bound = traceTerm - 0.5 * N * D * tf.log(2 * np.pi * sigma2) \
        - 0.5 * D * tf.reduce_sum(tf.log(tf.square(tf.diag_part(R)))) \
        - 0.5 * tf.reduce_sum(tf.square(self.Y)) / sigma2 \
        + 0.5 * tf.reduce_sum(tf.square(c)) \
        - self.build_KL(Phi)

    return self.bound
def base_conditional(Kmn, Kmm, Knn, f, *, full_cov=False, q_sqrt=None, white=False):
    # compute kernel stuff
    num_func = tf.shape(f)[1]  # K
    Lm = tf.cholesky(Kmm)

    # Compute the projection matrix A
    A = tf.matrix_triangular_solve(Lm, Kmn, lower=True)

    # compute the covariance due to the conditioning
    if full_cov:
        fvar = Knn - tf.matmul(A, A, transpose_a=True)
        shape = tf.stack([num_func, 1, 1])
    else:
        fvar = Knn - tf.reduce_sum(tf.square(A), 0)
        shape = tf.stack([num_func, 1])
    fvar = tf.tile(tf.expand_dims(fvar, 0), shape)  # K x N x N or K x N

    # another backsubstitution in the unwhitened case
    if not white:
        A = tf.matrix_triangular_solve(tf.transpose(Lm), A, lower=False)

    # construct the conditional mean
    fmean = tf.matmul(A, f, transpose_a=True)

    if q_sqrt is not None:
        if q_sqrt.get_shape().ndims == 2:
            LTA = A * tf.expand_dims(tf.transpose(q_sqrt), 2)  # K x M x N
        elif q_sqrt.get_shape().ndims == 3:
            L = tf.matrix_band_part(q_sqrt, -1, 0)  # K x M x M
            A_tiled = tf.tile(tf.expand_dims(A, 0), tf.stack([num_func, 1, 1]))
            LTA = tf.matmul(L, A_tiled, transpose_a=True)  # K x M x N
        else:  # pragma: no cover
            raise ValueError("Bad dimension for q_sqrt: %s" % str(q_sqrt.get_shape().ndims))
        if full_cov:
            fvar = fvar + tf.matmul(LTA, LTA, transpose_a=True)  # K x N x N
        else:
            fvar = fvar + tf.reduce_sum(tf.square(LTA), 1)  # K x N
    fvar = tf.transpose(fvar)  # N x K or N x N x K

    return fmean, fvar
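# --- Sanity-check sketch (not part of the original source) --------------------
# base_conditional avoids explicit inverses by projecting through two
# triangular solves against Lm = chol(Kmm). The toy kernel and data below are
# invented for illustration only; the check compares the solve-based route to
# the textbook formulas Kmn^T Kmm^{-1} f and Knn - Kmn^T Kmm^{-1} Kmn.
import numpy as np

def rbf(a, b):
    # one-dimensional squared-exponential kernel with unit lengthscale
    return np.exp(-0.5 * (a[:, None] - b[None, :]) ** 2)

X = np.linspace(-2, 2, 5)           # training inputs
Xs = np.linspace(-1.5, 1.5, 3)      # test inputs
f = np.sin(X)[:, None]              # function values at X

Kmm = rbf(X, X) + 1e-6 * np.eye(len(X))
Kmn = rbf(X, Xs)
Knn = rbf(Xs, Xs)

Lm = np.linalg.cholesky(Kmm)
A = np.linalg.solve(Lm, Kmn)        # Lm^{-1} Kmn
fvar = Knn - A.T @ A                # Knn - Kmn^T Kmm^{-1} Kmn
A = np.linalg.solve(Lm.T, A)        # Kmm^{-1} Kmn (unwhitened case)
fmean = A.T @ f                     # Kmn^T Kmm^{-1} f

Kmm_inv = np.linalg.inv(Kmm)        # explicit inverse, for checking only
assert np.allclose(fmean, Kmn.T @ Kmm_inv @ f)
assert np.allclose(fvar, Knn - Kmn.T @ Kmm_inv @ Kmn)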
def build_likelihood(self):
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood.
    """
    num_inducing = tf.shape(self.Z)[0]
    psi0, psi1, psi2 = ke.build_psi_stats(self.Z, self.kern, self.X_mean, self.X_var)
    Kuu = self.kern.K(self.Z) + eye(num_inducing) * 1e-6
    L = tf.cholesky(Kuu)
    sigma2 = self.likelihood.variance
    sigma = tf.sqrt(sigma2)

    # Compute intermediate matrices
    A = tf.matrix_triangular_solve(L, tf.transpose(psi1), lower=True) / sigma
    tmp = tf.matrix_triangular_solve(L, psi2, lower=True)
    AAT = tf.matrix_triangular_solve(L, tf.transpose(tmp), lower=True) / sigma2
    B = AAT + eye(num_inducing)
    LB = tf.cholesky(B)
    log_det_B = 2. * tf.reduce_sum(tf.log(tf.diag_part(LB)))
    c = tf.matrix_triangular_solve(LB, tf.matmul(A, self.Y), lower=True) / sigma

    # KL[q(x) || p(x)]
    NQ = tf.cast(tf.size(self.X_mean), tf.float64)
    D = tf.cast(tf.shape(self.Y)[1], tf.float64)
    KL = -0.5 * tf.reduce_sum(tf.log(self.X_var)) \
         + 0.5 * tf.reduce_sum(tf.log(self.X_prior_var)) \
         - 0.5 * NQ \
         + 0.5 * tf.reduce_sum((tf.square(self.X_mean - self.X_prior_mean) + self.X_var) / self.X_prior_var)

    # compute log marginal bound
    ND = tf.cast(tf.size(self.Y), tf.float64)
    bound = -0.5 * ND * tf.log(2 * np.pi * sigma2)
    bound += -0.5 * D * log_det_B
    bound += -0.5 * tf.reduce_sum(tf.square(self.Y)) / sigma2
    bound += 0.5 * tf.reduce_sum(tf.square(c))
    bound += -0.5 * D * (tf.reduce_sum(psi0) / sigma2 - tf.reduce_sum(tf.diag_part(AAT)))
    bound -= KL

    return bound
def __init__(self, prec_mean, prec, d=None):
    prec_mean = tf.convert_to_tensor(prec_mean)
    prec = tf.convert_to_tensor(prec)

    try:
        d1, = util.extract_shape(prec_mean)
        prec_mean = tf.reshape(prec_mean, (d1, 1))
    except:
        d1, k = util.extract_shape(prec_mean)
        assert (k == 1)

    d2, _ = util.extract_shape(prec)
    assert (d1 == d2)

    if d is None:
        d = d1
    else:
        assert (d == d1)

    super(MVGaussianNatural, self).__init__(d=d)

    self._prec_mean = prec_mean
    self._prec = prec

    self._L_prec = tf.cholesky(prec)
    self._entropy = bf.dists.multivariate_gaussian_entropy(L_prec=self._L_prec)

    # want to solve prec * mean = prec_mean for mean.
    # this is equiv to (LL') * mean = prec_mean.
    # since tf doesn't have a cholSolve shortcut, just do it directly:
    #   solve L y = prec_mean to get y = (L' * mean), then
    #   solve L' mean = y
    y = tf.matrix_triangular_solve(self._L_prec, self._prec_mean, lower=True, adjoint=False)
    self._mean = tf.matrix_triangular_solve(self._L_prec, y, lower=True, adjoint=True)

    L_cov_transpose = util.triangular_inv(self._L_prec)
    self._L_cov = tf.transpose(L_cov_transpose)
    self._cov = tf.matmul(self._L_cov, L_cov_transpose)
def _build_common_terms(self):
    num_inducing = len(self.feature)
    err = self.Y - self.mean_function(self.X)  # size N x R
    Kdiag = self.kern.Kdiag(self.X)
    Kuf = self.feature.Kuf(self.kern, self.X)
    Kuu = self.feature.Kuu(self.kern, jitter=settings.numerics.jitter_level)

    Luu = tf.cholesky(Kuu)  # => Luu Luu^T = Kuu
    V = tf.matrix_triangular_solve(Luu, Kuf)  # => V^T V = Qff = Kuf^T Kuu^-1 Kuf

    diagQff = tf.reduce_sum(tf.square(V), 0)
    nu = Kdiag - diagQff + self.likelihood.variance

    B = tf.eye(num_inducing, dtype=settings.float_type) + tf.matmul(V / nu, V, transpose_b=True)
    L = tf.cholesky(B)
    beta = err / tf.expand_dims(nu, 1)  # size N x R
    alpha = tf.matmul(V, beta)  # size N x R

    gamma = tf.matrix_triangular_solve(L, alpha, lower=True)  # size N x R

    return err, nu, Luu, L, alpha, beta, gamma