def build_predict(self, Xnew, full_cov=False):
    """
    The posterior approximation to F is given by

        q(f) = N(f | K alpha + mean, [K^-1 + diag(lambda**2)]^-1)

    Here we project this to F*, the values of the GP at Xnew, which are
    given by

        q(F*) = N(F* | K_{*f} alpha + mean,
                  K_{**} - K_{*f} [K_{ff} + diag(lambda**-2)]^-1 K_{f*})
    """
    # compute kernel things
    Kx = self.kern.K(self.X, Xnew)
    K = self.kern.K(self.X)

    # predictive mean
    f_mean = tf.matmul(tf.transpose(Kx), self.q_alpha) + self.mean_function(Xnew)

    # predictive var
    A = K + tf.batch_matrix_diag(tf.transpose(1. / tf.square(self.q_lambda)))
    L = tf.batch_cholesky(A)
    Kx_tiled = tf.tile(tf.expand_dims(Kx, 0), [self.num_latent, 1, 1])
    LiKx = tf.batch_matrix_triangular_solve(L, Kx_tiled)
    if full_cov:
        f_var = self.kern.K(Xnew) - tf.batch_matmul(LiKx, LiKx, adj_x=True)
    else:
        f_var = self.kern.Kdiag(Xnew) - tf.reduce_sum(tf.square(LiKx), 1)
    return f_mean, tf.transpose(f_var)
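# The projection above is standard Gaussian conditioning. As a sanity check on
# the algebra, here is a minimal NumPy sketch of the same predictive equations
# for a single latent function; K, Kx, Kss_diag, alpha and lam are illustrative
# stand-ins for the kernel matrices and variational parameters, not part of
# the class above.
import numpy as np

def predict_f_numpy(K, Kx, Kss_diag, alpha, lam):
    # predictive mean: K_{*f} alpha (zero mean function assumed here)
    f_mean = Kx.T.dot(alpha)
    # A = K_{ff} + diag(lambda**-2), factorized once
    A = K + np.diag(1.0 / lam**2)
    L = np.linalg.cholesky(A)
    # L^-1 K_{f*}, so the subtracted term is K_{*f} A^-1 K_{f*}
    LiKx = np.linalg.solve(L, Kx)
    # marginal predictive variances, one per test point
    f_var = Kss_diag - np.sum(LiKx**2, axis=0)
    return f_mean, f_var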
def gauss_kl(q_mu, q_sqrt, K):
    """
    Compute the KL divergence from

        q(x) = N(q_mu, q_sqrt^2)

    to

        p(x) = N(0, K)

    We assume multiple independent distributions, given by the columns of
    q_mu and the last dimension of q_sqrt.

    q_mu is a matrix: each column contains a mean.

    q_sqrt is a 3D tensor: each matrix within is a lower-triangular
    square-root matrix of the covariance of q.

    K is a positive definite matrix: the covariance of p.
    """
    L = tf.cholesky(K)
    alpha = tf.matrix_triangular_solve(L, q_mu, lower=True)
    KL = 0.5 * tf.reduce_sum(tf.square(alpha))  # Mahalanobis term.
    num_latent = tf.cast(tf.shape(q_sqrt)[2], tf.float64)
    KL += num_latent * 0.5 * tf.reduce_sum(
        tf.log(tf.square(tf.diag_part(L))))  # Prior log-det term.
    KL += -0.5 * tf.cast(tf.reduce_prod(tf.shape(q_sqrt)[1:]),
                         tf.float64)  # Constant term.
    Lq = tf.batch_matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)),
                                   -1, 0)  # Force lower triangle.
    KL += -0.5 * tf.reduce_sum(tf.log(tf.square(
        tf.batch_matrix_diag_part(Lq))))  # Log-det of q.
    L_tiled = tf.tile(tf.expand_dims(L, 0), tf.pack([tf.shape(Lq)[0], 1, 1]))
    LiLq = tf.batch_matrix_triangular_solve(L_tiled, Lq, lower=True)
    KL += 0.5 * tf.reduce_sum(tf.square(LiLq))  # Trace term.
    return KL
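# For reference, the closed form assembled term by term above is
#   KL[N(mu, Lq Lq^T) || N(0, K)]
#     = 0.5 * (mu^T K^-1 mu + log|K| - log|Lq Lq^T| - d + tr(K^-1 Lq Lq^T)).
# A dense NumPy check for a single distribution (function and argument names
# here are illustrative, not part of the module):
import numpy as np

def gauss_kl_numpy(mu, Lq, K):
    """KL[N(mu, Lq Lq^T) || N(0, K)] for one distribution, dense algebra."""
    d = mu.shape[0]
    Kinv = np.linalg.inv(K)
    Sq = Lq.dot(Lq.T)  # covariance of q
    mahalanobis = mu.dot(Kinv).dot(mu)
    logdet_p = np.linalg.slogdet(K)[1]
    logdet_q = np.linalg.slogdet(Sq)[1]
    trace = np.trace(Kinv.dot(Sq))
    return 0.5 * (mahalanobis + logdet_p - logdet_q - d + trace)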
def _verifySolve(self, x, y, lower=True, adjoint=False, batch_dims=None):
    for np_type in [np.float32, np.float64]:
        a = x.astype(np_type)
        b = y.astype(np_type)
        # For numpy.linalg.solve we have to explicitly zero out the
        # strictly upper or lower triangle.
        if lower and a.size > 0:
            a_np = np.tril(a)
        elif a.size > 0:
            a_np = np.triu(a)
        else:
            a_np = a
        if adjoint:
            a_np = np.conj(np.transpose(a_np))
        if batch_dims is not None:
            a = np.tile(a, batch_dims + [1, 1])
            a_np = np.tile(a_np, batch_dims + [1, 1])
            b = np.tile(b, batch_dims + [1, 1])
        with self.test_session():
            if a.ndim == 2:
                tf_ans = tf.matrix_triangular_solve(
                    a, b, lower=lower, adjoint=adjoint).eval()
            else:
                tf_ans = tf.batch_matrix_triangular_solve(
                    a, b, lower=lower, adjoint=adjoint).eval()
            np_ans = np.linalg.solve(a_np, b)
            self.assertEqual(np_ans.shape, tf_ans.shape)
            self.assertAllClose(np_ans, tf_ans)
def _verifySolve(self, x, y, lower=True):
    for np_type in [np.float32, np.float64]:
        a = x.astype(np_type)
        b = y.astype(np_type)
        with self.test_session():
            if a.ndim == 2:
                tf_ans = tf.matrix_triangular_solve(a, b, lower=lower)
            else:
                tf_ans = tf.batch_matrix_triangular_solve(a, b, lower=lower)
            out = tf_ans.eval()
            if lower:
                np_ans = np.linalg.solve(np.tril(a), b)
            else:
                np_ans = np.linalg.solve(np.triu(a), b)
            self.assertEqual(np_ans.shape, out.shape)
            self.assertAllClose(np_ans, out)
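# A hypothetical test case exercising the helper above; shapes and values are
# illustrative only. Tiling `a` and `b` to three dimensions routes the call
# through tf.batch_matrix_triangular_solve rather than the 2-D op.
import numpy as np

def testSolveLowerBatch(self):
    a = np.tile(np.array([[2.0, 0.0], [1.0, 3.0]]), [4, 1, 1])  # batch of 4
    b = np.tile(np.array([[4.0], [6.0]]), [4, 1, 1])
    self._verifySolve(a, b, lower=True)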
def _define_full_covariance_probs(self, shard_id, shard):
    """Defines the full covariance probabilities per example in a class.

    Updates a matrix with dimension num_examples X num_classes.

    Args:
      shard_id: id of the current shard.
      shard: current data shard, 1 X num_examples X dimensions.
    """
    diff = shard - self._means
    cholesky = tf.batch_cholesky(self._covs + self._min_var)
    log_det_covs = 2.0 * tf.reduce_sum(
        tf.log(tf.batch_matrix_diag_part(cholesky)), 1)
    x_mu_cov = tf.square(tf.batch_matrix_triangular_solve(
        cholesky, tf.transpose(diff, perm=[0, 2, 1]), lower=True))
    diag_m = tf.transpose(tf.reduce_sum(x_mu_cov, 1))
    self._probs[shard_id] = -0.5 * (
        diag_m + tf.to_float(self._dimensions) * tf.log(2 * np.pi) +
        log_det_covs)
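# The method above evaluates the multivariate normal log-density
#   log N(x | mu, Sigma) = -0.5 * (d log(2 pi) + log|Sigma|
#                                  + (x - mu)^T Sigma^-1 (x - mu))
# via a Cholesky factor, so neither |Sigma| nor Sigma^-1 is formed explicitly.
# A minimal NumPy sketch of the same computation for one example and one class
# (names are illustrative):
import numpy as np

def gaussian_log_density(x, mu, cov):
    """log N(x | mu, cov) via a Cholesky factor."""
    d = x.shape[0]
    L = np.linalg.cholesky(cov)
    # log|cov| = 2 * sum(log(diag(L)))
    log_det = 2.0 * np.sum(np.log(np.diag(L)))
    # Mahalanobis term via a triangular solve: ||L^-1 (x - mu)||^2
    z = np.linalg.solve(L, x - mu)
    return -0.5 * (d * np.log(2 * np.pi) + log_det + z.dot(z))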
def build_likelihood(self):
    """
    q_alpha, q_lambda are variational parameters, size N x R.

    This method computes the variational lower bound on the likelihood,
    which is:

        E_{q(F)} [ \log p(Y|F) ] - KL[ q(F) || p(F) ]

    with

        q(f) = N(f | K alpha + mean, [K^-1 + diag(square(lambda))]^-1) .
    """
    K = self.kern.K(self.X)
    K_alpha = tf.matmul(K, self.q_alpha)
    f_mean = K_alpha + self.mean_function(self.X)

    # compute the variance for each of the outputs
    I = tf.tile(tf.expand_dims(eye(self.num_data), 0),
                [self.num_latent, 1, 1])
    A = I + tf.expand_dims(tf.transpose(self.q_lambda), 1) * \
        tf.expand_dims(tf.transpose(self.q_lambda), 2) * K
    L = tf.batch_cholesky(A)
    Li = tf.batch_matrix_triangular_solve(L, I)
    # scale the columns of Li by 1/lambda
    tmp = Li / tf.expand_dims(tf.transpose(self.q_lambda), 1)
    f_var = 1. / tf.square(self.q_lambda) - \
        tf.transpose(tf.reduce_sum(tf.square(tmp), 1))

    # some statistics about A are used in the KL
    A_logdet = 2.0 * tf.reduce_sum(tf.log(tf.batch_matrix_diag_part(L)))
    trAi = tf.reduce_sum(tf.square(Li))

    KL = 0.5 * (A_logdet + trAi - self.num_data * self.num_latent +
                tf.reduce_sum(K_alpha * self.q_alpha))

    v_exp = self.likelihood.variational_expectations(f_mean, f_var, self.Y)
    return tf.reduce_sum(v_exp) - KL
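# The KL term above uses the identity, per latent function,
#   KL[q(f) || p(f)] = 0.5 * (log|A| + tr(A^-1) - N + alpha^T K alpha),
# where A = I + Lambda K Lambda and Lambda = diag(lambda). A dense NumPy check
# for a single latent function (illustrative names, not the class API; the
# shared mean function cancels, so p is taken as N(0, K) here):
import numpy as np

def vgp_kl_numpy(K, alpha, lam):
    """KL[q || p] with q = N(K alpha, (K^-1 + diag(lam^2))^-1), p = N(0, K)."""
    N = K.shape[0]
    A = np.eye(N) + np.outer(lam, lam) * K  # I + Lambda K Lambda
    A_logdet = np.linalg.slogdet(A)[1]
    tr_Ainv = np.trace(np.linalg.inv(A))
    mahal = alpha.dot(K).dot(alpha)  # (K alpha)^T K^-1 (K alpha)
    return 0.5 * (A_logdet + tr_Ainv - N + mahal)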
def _batch_sqrt_solve(self, rhs):
    return tf.batch_matrix_triangular_solve(self._chol, rhs, lower=True)
def conditional(Xnew, X, kern, f, full_cov=False, q_sqrt=None, whiten=False):
    """
    Given F, representing the GP at the points X, produce the mean and
    (co-)variance of the GP at the points Xnew.

    Additionally, there may be Gaussian uncertainty about F as represented
    by q_sqrt. In this case `f` represents the mean of the distribution and
    q_sqrt the square-root of the covariance.

    Additionally, the GP may have been centered (whitened) so that

        p(v) = N(0, I)
        f = L v

    thus

        p(f) = N(0, L L^T) = N(0, K).

    In this case `f` represents the values taken by v.

    The method can return either the diagonals of the covariance matrix for
    each output (full_cov=False) or the full covariance matrix (full_cov=True).

    We assume R independent GPs, represented by the columns of f (and the
    last dimension of q_sqrt).

     - Xnew is a data matrix, size n x D.
     - X are data points, size m x D.
     - kern is a GPinv kernel.
     - f is a data matrix, m x R, representing the function values at X,
       for R functions.
     - q_sqrt (optional) is a matrix of standard-deviations or Cholesky
       matrices, size m x R or m x m x R.
     - whiten (optional) is a boolean: whether to whiten the representation
       as described above.

    These functions are now considered deprecated, subsumed into this one:
        gp_predict
        gaussian_gp_predict
        gp_predict_whitened
        gaussian_gp_predict_whitened
    """
    # compute kernel stuff
    num_data = tf.shape(X)[0]
    Kmn = tf.transpose(kern.K(X, Xnew), [2, 0, 1])  # [R, m, n]
    Lm = tf.transpose(kern.Cholesky(X), [2, 0, 1])  # [R, m, m]

    # Compute the projection matrix A
    A = tf.batch_matrix_triangular_solve(Lm, Kmn, lower=True)

    # compute the covariance due to the conditioning
    if full_cov:
        # shape [R, n, n]
        fvar = tf.transpose(kern.K(Xnew), [2, 0, 1]) - tf.batch_matmul(
            A, A, adj_x=True)
    else:
        # shape [R, n]
        fvar = tf.transpose(kern.Kdiag(Xnew)) - tf.reduce_sum(tf.square(A), 1)

    # another backsubstitution in the unwhitened case
    if not whiten:
        A = tf.batch_matrix_triangular_solve(tf.transpose(Lm, [0, 2, 1]),
                                             A, lower=False)

    # change shape of f: [m, R] -> [R, m, 1]
    f = tf.expand_dims(tf.transpose(f), -1)

    # construct the conditional mean, sized [n, R]
    fmean = tf.transpose(
        tf.squeeze(tf.batch_matmul(tf.transpose(A, [0, 2, 1]), f), [-1]))

    if q_sqrt is not None:
        if q_sqrt.get_shape().ndims == 2:
            # diagonal case
            LTA = A * tf.expand_dims(tf.transpose(q_sqrt), 2)  # [R, m, n]
        elif q_sqrt.get_shape().ndims == 3:
            # full covariance case
            L = tf.batch_matrix_band_part(
                tf.transpose(q_sqrt, (2, 0, 1)), -1, 0)  # [R, m, m]
            LTA = tf.batch_matmul(L, A, adj_x=True)  # [R, m, n]
        else:  # pragma: no cover
            raise ValueError("Bad dimension for q_sqrt: %s" %
                             str(q_sqrt.get_shape().ndims))
        if full_cov:
            fvar = fvar + tf.batch_matmul(LTA, LTA, adj_x=True)  # [R, n, n]
        else:
            fvar = fvar + tf.reduce_sum(tf.square(LTA), 1)  # [R, n]

    fvar = tf.transpose(fvar)  # [n, R] or [n, n, R]
    return fmean, fvar
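# Underneath the tensor bookkeeping is the standard Gaussian conditional:
# with A = Lm^-1 Kmn, the predictive covariance is Knn - A^T A, and in the
# unwhitened case the mean is Kmn^T Kmm^-1 f. A dense NumPy sketch for a
# single output with q_sqrt=None (illustrative, not the module's API):
import numpy as np

def gp_conditional_numpy(Knn, Kmn, Kmm, f):
    """Mean and covariance of p(F* | F = f) for one output."""
    Lm = np.linalg.cholesky(Kmm)
    A = np.linalg.solve(Lm, Kmn)   # Lm^-1 Kmn, shape [m, n]
    cov = Knn - A.T.dot(A)         # Knn - Kmn^T Kmm^-1 Kmn
    A = np.linalg.solve(Lm.T, A)   # back-substitution: Kmm^-1 Kmn
    mean = A.T.dot(f)              # Kmn^T Kmm^-1 f
    return mean, cov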