def test_multi_scale_inducing_equivalence_inducing_points(N, M, D):
    # Multiscale must be equivalent to inducing points when the scales are zero
    Xnew, Z = np.random.randn(N, D), np.random.randn(M, D)
    rbf = gpflow.kernels.SquaredExponential(1.3441, lengthscales=np.random.uniform(0.5, 3.0, D))
    inducing_variable_zero_lengthscale = Multiscale(Z, scales=np.zeros(Z.shape))
    inducing_variable_inducing_point = InducingPoints(Z)

    multi_scale_Kuf = Kuf(inducing_variable_zero_lengthscale, rbf, Xnew)
    inducing_point_Kuf = Kuf(inducing_variable_inducing_point, rbf, Xnew)
    deviation_percent_Kuf = np.max(
        np.abs(multi_scale_Kuf - inducing_point_Kuf) / inducing_point_Kuf * 100)
    assert deviation_percent_Kuf < 0.1

    multi_scale_Kuu = Kuu(inducing_variable_zero_lengthscale, rbf)
    inducing_point_Kuu = Kuu(inducing_variable_inducing_point, rbf)
    deviation_percent_Kuu = np.max(
        np.abs(multi_scale_Kuu - inducing_point_Kuu) / inducing_point_Kuu * 100)
    assert deviation_percent_Kuu < 0.1
def upper_bound(self) -> tf.Tensor:
    """
    Upper bound for the sparse GP regression marginal likelihood.  Note that
    the same inducing points are used for calculating the upper bound, as are
    used for computing the likelihood approximation. This may not lead to the
    best upper bound. The upper bound can be tightened by optimising Z, just
    like the lower bound. This is especially important in FITC, as FITC is
    known to produce poor inducing point locations. An optimisable upper bound
    can be found in https://github.com/markvdw/gp_upper.

    The key reference is

    ::

      @misc{titsias_2014,
        title={Variational Inference for Gaussian and Determinantal Point Processes},
        url={http://www2.aueb.gr/users/mtitsias/papers/titsiasNipsVar14.pdf},
        publisher={Workshop on Advances in Variational Inference (NIPS 2014)},
        author={Titsias, Michalis K.},
        year={2014},
        month={Dec}
      }

    The key quantity, the trace term, can be computed via

    >>> _, v = conditionals.conditional(X, model.inducing_variable.Z, model.kernel,
    ...                                 np.zeros((len(model.inducing_variable), 1)))

    which computes each individual element of the trace term.
    """
    X_data, Y_data = self.data
    num_data = to_default_float(tf.shape(Y_data)[0])

    Kdiag = self.kernel(X_data, full_cov=False)
    kuu = Kuu(self.inducing_variable, self.kernel, jitter=self.jitter_variance)
    kuf = Kuf(self.inducing_variable, self.kernel, X_data)

    I = tf.eye(tf.shape(kuu)[0], dtype=default_float())

    L = tf.linalg.cholesky(kuu)
    A = tf.linalg.triangular_solve(L, kuf, lower=True)
    AAT = tf.linalg.matmul(A, A, transpose_b=True)
    B = I + AAT / self.likelihood.variance
    LB = tf.linalg.cholesky(B)

    # Using the Trace bound, from Titsias' presentation
    c = tf.maximum(tf.reduce_sum(Kdiag) - tf.reduce_sum(tf.square(A)), 0)

    # Alternative bound on max eigenval:
    corrected_noise = self.likelihood.variance + c

    const = -0.5 * num_data * tf.math.log(2 * np.pi * self.likelihood.variance)
    logdet = -tf.reduce_sum(tf.math.log(tf.linalg.diag_part(LB)))

    LC = tf.linalg.cholesky(I + AAT / corrected_noise)
    v = tf.linalg.triangular_solve(LC, tf.linalg.matmul(A, Y_data) / corrected_noise, lower=True)
    quad = -0.5 * tf.reduce_sum(tf.square(Y_data)) / corrected_noise + 0.5 * tf.reduce_sum(tf.square(v))

    return const + logdet + quad
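# Hedged usage sketch (not part of the class above): GPflow's stock
# gpflow.models.SGPR exposes an `upper_bound()` method with the same structure,
# so the bound can be compared against the ELBO as below. The data, kernel
# choice and inducing-point selection here are illustrative assumptions only.
import numpy as np
import gpflow

X = np.random.rand(100, 1)
Y = np.sin(10 * X) + 0.1 * np.random.randn(100, 1)
Z = X[::10].copy()  # 10 inducing points taken from the data

sgpr = gpflow.models.SGPR(
    (X, Y), kernel=gpflow.kernels.SquaredExponential(), inducing_variable=Z
)

# The log marginal likelihood is sandwiched between these two values;
# the gap shrinks as the inducing points improve.
print("ELBO (lower bound):  ", sgpr.elbo().numpy())
print("Titsias upper bound: ", sgpr.upper_bound().numpy())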
def test_inducing_variables_psd_schur(input_dim, inducing_variable, kernel):
    # Conditional variance must be PSD.
    X = np.random.randn(5, input_dim)
    Kuf_values = Kuf(inducing_variable, kernel, X)
    Kuu_values = Kuu(inducing_variable, kernel, jitter=default_jitter())
    Kff_values = kernel(X)
    Qff_values = Kuf_values.numpy().T @ np.linalg.solve(Kuu_values, Kuf_values)
    assert np.all(np.linalg.eig(Kff_values - Qff_values)[0] > 0.0)
def conditional_ND(self, X, full_cov=False):
    # X is [S,N,D]
    Kmm = Kuu(self.inducing_points, self.kernel, jitter=default_jitter())
    Lmm = tf.linalg.cholesky(Kmm)
    Kmm_tiled = tf.tile(tf.expand_dims(Kmm, 0), (self.num_outputs, 1, 1))
    Lmm_tiled = tf.tile(tf.expand_dims(Lmm, 0), (self.num_outputs, 1, 1))

    Kmn = Kuf(self.inducing_points, self.kernel, X)  # K(Z,X)

    # alpha(X) = k(Z,Z)^{-1}k(Z,X) = L^{-T}L^{-1}k(Z,X)
    A = tf.linalg.triangular_solve(Lmm, Kmn, lower=True)  # L^{-1}k(Z,X)
    if not self.white:
        # L^{-T}L^{-1}K(Z,X) is [M,N]
        A = tf.linalg.triangular_solve(tf.transpose(Lmm), A, lower=False)

    # m = alpha(X)^T(q_mu - m(Z)) = alpha(X)^T(q_mu) if zero mean function.
    mean = tf.matmul(A, self.q_mu, transpose_a=True)  # [N]

    # [D_out,M,N]
    A_tiled = tf.tile(A[None, :, :], [self.num_outputs, 1, 1])
    I = tf.eye(self.num_inducing, dtype=default_float())[None, :, :]

    # var = k(X,X) - alpha(X)^T(k(Z,Z)-q_sqrtq_sqrt^T)alpha(X)
    if self.white:
        SK = -I
    else:
        # -k(Z,Z)
        SK = -Kmm_tiled  # [D_out,M,M]

    if self.q_sqrt is not None:
        # SK = -k(Z,Z) + q_sqrtq_sqrt^T
        # [D_out,M,M]
        SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True)

    # B = -(k(Z,Z) - q_sqrtq_sqrt^T)alpha(X)
    B = tf.matmul(SK, A_tiled)  # [D_out,M,N]

    if full_cov:
        # delta_cov = -alpha(X)^T(k(Z,Z) - q_sqrtq_sqrt^T)alpha(X)
        delta_cov = tf.matmul(A_tiled, B, transpose_a=True)  # [D_out,N,N]
        # Knn = k(X,X)
        Knn = self.kernel.K(X)
    else:
        # Summing over dimension 1 --> sum variances due to other.
        # Is this legit?
        delta_cov = tf.reduce_sum(A_tiled * B, 1)
        # delta_cov = tf.linalg.diag_part(tf.matmul(A_tiled, B, transpose_a=True))  # [D_out,N]
        Knn = self.kernel.K_diag(X)  # [N]

    var = tf.expand_dims(Knn, 0) + delta_cov  # [D_out,N]
    var = tf.transpose(var)

    return mean + self.mean_function(X), var
def _conditional_train(
    Xnew: tf.Tensor,
    inducing_variable: InducingVariables,
    kernel: Kernel,
    f: tf.Tensor,
    *,
    full_cov=False,
    full_output_cov=False,
    q_sqrt=None,
    white=False,
):
    """
    Single-output GP conditional.

    The covariance matrices used to calculate the conditional have the following shape:
    - Kuu: [M, M]
    - Kuf: [M, N]
    - Kff: [N, N]

    Further reference
    -----------------
    - See `gpflow.conditionals._conditional` (below) for a detailed explanation of
      conditional in the single-output case.
    - See the multioutput notebook for more information about the multioutput framework.

    Parameters
    ----------
    :param Xnew: data matrix, size [N, D].
    :param f: data matrix, [M, R]
    :param full_cov: return the covariance between the datapoints
    :param full_output_cov: return the covariance between the outputs.
        NOTE: as we are using a single-output kernel with repetitions
        these covariances will be zero.
    :param q_sqrt: matrix of standard-deviations or Cholesky matrices,
        size [M, R] or [R, M, M].
    :param white: boolean of whether to use the whitened representation
    :return:
        - mean:     [N, R]
        - variance: [N, R], [R, N, N], [N, R, R] or [N, R, N, R]
        Please see `gpflow.conditional._expand_independent_outputs` for more information
        about the shape of the variance, depending on `full_cov` and `full_output_cov`.
    """
    Kmm = Kuu(inducing_variable, kernel, jitter=default_jitter())  # [M, M]
    Kmn = Kuf(inducing_variable, kernel, Xnew)  # [M, N]
    Knn = kernel.diag_tr()  # uses an optimized function to calculate the covariances
    fmean, fvar = base_conditional(
        Kmn, Kmm, Knn, f, full_cov=full_cov, q_sqrt=q_sqrt, white=white
    )  # [N, R], [R, N, N] or [N, R]
    return fmean, expand_independent_outputs(fvar, full_cov, full_output_cov)
def test_multi_scale_inducing_equivalence_inducing_points(N, M, D):
    # Multiscale must be equivalent to inducing points when the scales are zero
    Xnew, Z = np.random.randn(N, D), np.random.randn(M, D)
    rbf = gpflow.kernels.SquaredExponential(1.3441, lengthscales=np.random.uniform(0.5, 3.0, D))
    inducing_variable_zero_lengthscales = Multiscale(Z, scales=np.zeros(Z.shape) + 1e-10)
    inducing_variable_inducing_point = InducingPoints(Z)

    multi_scale_Kuf = Kuf(inducing_variable_zero_lengthscales, rbf, Xnew)
    inducing_point_Kuf = Kuf(inducing_variable_inducing_point, rbf, Xnew)
    relative_error_Kuf = np.abs(multi_scale_Kuf - inducing_point_Kuf) / inducing_point_Kuf
    assert np.max(relative_error_Kuf) < 0.1e-2  # 0.1 %

    multi_scale_Kuu = Kuu(inducing_variable_zero_lengthscales, rbf)
    inducing_point_Kuu = Kuu(inducing_variable_inducing_point, rbf)
    relative_error_Kuu = np.abs(multi_scale_Kuu - inducing_point_Kuu) / inducing_point_Kuu
    assert np.max(relative_error_Kuu) < 0.1e-2  # 0.1 %
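# Why the equivalence tested above should hold (sketch; the exact normalisation
# of GPflow's Multiscale feature is an assumption recalled from its implementation):
#
#   Kuf[m, n] = sigma^2 * prod_d( ell_d / (ell_d + s_{m,d}) )
#               * exp( -0.5 * sum_d (x_{n,d} - z_{m,d})^2 / (ell_d + s_{m,d})^2 )
#
# With all scales s_{m,d} = 0 this collapses to the plain squared-exponential
# covariance k(z_m, x_n), i.e. exactly the InducingPoints cross-covariance,
# which is why the tiny 1e-10 offset and a 0.1 % tolerance suffice.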
def conditional(self, X, full_cov=False):
    # X is [N,D] or [S*N,D]
    Kmm = Kuu(self.inducing_points, self.kernel, jitter=default_jitter())  # [M,M]
    Lmm = tf.linalg.cholesky(Kmm)
    Kmn = Kuf(self.inducing_points, self.kernel, X)  # [M,N]

    # alpha(X) = k(Z,Z)^{-1}k(Z,X) = L^{-T}L^{-1}k(Z,X)
    A = tf.linalg.triangular_solve(Lmm, Kmn, lower=True)  # L^{-1}k(Z,X)
    if not self.white:
        # L^{-T}L^{-1}K(Z,X) is [M,N]
        A = tf.linalg.triangular_solve(tf.transpose(Lmm), A, lower=False)

    # m = alpha(X)^T(q_mu - m(Z))
    mean = tf.matmul(A, self.q_mu - self.mean_function(self.inducing_points.Z),
                     transpose_a=True)  # [N,1]

    I = tf.eye(self.num_inducing, dtype=default_float())

    # var = k(X,X) - alpha(X)^T(k(Z,Z)-q_sqrtq_sqrt^T)alpha(X)
    if self.white:
        SK = -I
    else:
        SK = -Kmm

    if self.q_sqrt is not None:
        # SK = -k(Z,Z) + q_sqrtq_sqrt^T
        SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True)

    # B = -(k(Z,Z) - q_sqrtq_sqrt^T)alpha(X)
    B = tf.matmul(SK, A)  # [M,N]

    if full_cov:
        # delta_cov = -alpha(X)^T(k(Z,Z) - q_sqrtq_sqrt^T)alpha(X)
        delta_cov = tf.matmul(A, B, transpose_a=True)  # [N,N]
        Knn = self.kernel(X, full_cov=True, presliced=False)
    else:
        delta_cov = tf.reduce_sum(A * B, 0)
        Knn = self.kernel(X, full_cov=False, presliced=False)

    var = Knn + delta_cov
    var = tf.transpose(var)

    return mean + self.mean_function(X), var
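# The conditional assembled above, written out (the standard sparse variational
# GP result, with S = q_sqrt q_sqrt^T and m(.) the mean function):
#
#   alpha(X) = K(Z,Z)^{-1} k(Z,X)                                (unwhitened)
#   mean(X)  = m(X) + alpha(X)^T (q_mu - m(Z))
#   var(X)   = k(X,X) - alpha(X)^T (K(Z,Z) - S) alpha(X)
#
# In the whitened case the code instead uses A = L^{-1} k(Z,X) with
# K(Z,Z) = L L^T, and K(Z,Z) is replaced by the identity in the variance term,
# exactly as in the `self.white` branches above.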
def _compute_robust_maximum_log_likelihood_objective(self) -> tf.Tensor:
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood. For a derivation of the terms in here, see the associated SGPR
    notebook.
    """
    X_data, Y_data = self.data

    num_inducing = len(self.inducing_variable)
    num_data = to_default_float(tf.shape(Y_data)[0])
    output_dim = to_default_float(tf.shape(Y_data)[1])

    err = Y_data - self.mean_function(X_data)
    Kdiag = self.kernel(X_data, full_cov=False)
    kuf = Kuf(self.inducing_variable, self.kernel, X_data)
    kuu = Kuu(self.inducing_variable, self.kernel, jitter=self.jitter_variance)
    L = tf.linalg.cholesky(kuu)
    sigma = tf.sqrt(self.likelihood.variance)

    # Compute intermediate matrices
    A = tf.linalg.triangular_solve(L, kuf, lower=True) / sigma
    AAT = tf.linalg.matmul(A, A, transpose_b=True)
    B = AAT + tf.eye(num_inducing, dtype=default_float())
    LB = tf.linalg.cholesky(B)
    Aerr = tf.linalg.matmul(A, err)
    c = tf.linalg.triangular_solve(LB, Aerr, lower=True) / sigma

    trace_term = 0.5 * output_dim * tf.reduce_sum(Kdiag) / self.likelihood.variance
    trace_term -= 0.5 * output_dim * tf.reduce_sum(tf.linalg.diag_part(AAT))

    # tr(Kff - Qff) should be positive, numerical issues can arise here
    assert trace_term > 0.0, f"Trace term negative, should be positive ({trace_term:.4e})."

    # compute log marginal bound
    bound = -0.5 * num_data * output_dim * np.log(2 * np.pi)
    bound += tf.negative(output_dim) * tf.reduce_sum(tf.math.log(tf.linalg.diag_part(LB)))
    bound -= 0.5 * num_data * output_dim * tf.math.log(self.likelihood.variance)
    bound += -0.5 * tf.reduce_sum(tf.square(err)) / self.likelihood.variance
    bound += 0.5 * tf.reduce_sum(tf.square(c))
    bound -= trace_term

    return bound
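# The quantity assembled above is the collapsed Titsias (2009) SGPR objective,
#
#   L = log N(Y | m(X), Q_ff + sigma^2 I) - 1/(2 sigma^2) * tr(K_ff - Q_ff),
#   Q_ff = K_fu K_uu^{-1} K_uf,
#
# evaluated through L = chol(K_uu) and LB = chol(I + A A^T) via the usual
# Woodbury / matrix-determinant identities; `trace_term` is the final trace
# correction, which is why it must be non-negative up to numerical error.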
def predict_f(self, Xnew: InputData, full_cov=False, full_output_cov=False) -> MeanAndVariance:
    """
    Compute the mean and variance of the latent function at some new points Xnew.
    For a derivation of the terms in here, see the associated SGPR notebook.
    """
    X_data, Y_data = self.data
    num_inducing = len(self.inducing_variable)
    err = Y_data - self.mean_function(X_data)
    kuf = Kuf(self.inducing_variable, self.kernel, X_data)
    kuu = Kuu(self.inducing_variable, self.kernel, jitter=self.jitter_variance)
    Kus = Kuf(self.inducing_variable, self.kernel, Xnew)
    sigma = tf.sqrt(self.likelihood.variance)
    L = tf.linalg.cholesky(kuu)
    A = tf.linalg.triangular_solve(L, kuf, lower=True) / sigma
    B = tf.linalg.matmul(A, A, transpose_b=True) + tf.eye(num_inducing, dtype=default_float())
    LB = tf.linalg.cholesky(B)
    Aerr = tf.linalg.matmul(A, err)
    c = tf.linalg.triangular_solve(LB, Aerr, lower=True) / sigma
    tmp1 = tf.linalg.triangular_solve(L, Kus, lower=True)
    tmp2 = tf.linalg.triangular_solve(LB, tmp1, lower=True)
    mean = tf.linalg.matmul(tmp2, c, transpose_a=True)
    if full_cov:
        var = (
            self.kernel(Xnew)
            + tf.linalg.matmul(tmp2, tmp2, transpose_a=True)
            - tf.linalg.matmul(tmp1, tmp1, transpose_a=True)
        )
        var = tf.tile(var[None, ...], [self.num_latent_gps, 1, 1])  # [P, N, N]
    else:
        var = (
            self.kernel(Xnew, full_cov=False)
            + tf.reduce_sum(tf.square(tmp2), 0)
            - tf.reduce_sum(tf.square(tmp1), 0)
        )
        var = tf.tile(var[:, None], [1, self.num_latent_gps])

    return mean + self.mean_function(Xnew), var
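# Hedged usage sketch: prediction with the `sgpr` model built in the earlier
# example follows the same pattern as this method (the query grid below is an
# illustrative assumption, not part of the class above).
import numpy as np

Xnew = np.linspace(0.0, 1.0, 200)[:, None]
mean, var = sgpr.predict_f(Xnew)                      # marginal variances, shape [200, 1]
mean_fc, cov = sgpr.predict_f(Xnew, full_cov=True)    # full covariance, shape [num_latent, 200, 200]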
def test_inducing_equivalence(N, kernel):
    # Kuu for inducing points must be the same as the kernel evaluations at Z
    Z = np.random.randn(N, 5)
    inducing_variable = InducingPoints(Z)
    assert_allclose(Kuu(inducing_variable, kernel), kernel(Z))
def conditional_ND(self, X, full_cov=False):
    # X is [S,N,D]
    Kmm_tiled = tf.convert_to_tensor([
        Kuu(self.inducing_points[i], self.kernels[i], jitter=default_jitter())
        for i in range(self.num_outputs)
    ])
    Lmm_tiled = tf.convert_to_tensor([
        tf.linalg.cholesky(Kmm_tiled[i]) for i in range(self.num_outputs)
    ])

    A_tiled = []
    mean_tiled = []
    for i in range(self.num_outputs):
        Kmn = Kuf(self.inducing_points[i], self.kernels[i], X)
        Lmm = Lmm_tiled[i]
        A = tf.linalg.triangular_solve(Lmm, Kmn, lower=True)  # L^{-1}k(Z,X)
        if not self.white:
            # L^{-T}L^{-1}K(Z,X) is [M,N]
            A = tf.linalg.triangular_solve(tf.transpose(Lmm), A, lower=False)
        # m = alpha(X)^T(q_mu - m(Z)) = alpha(X)^T(q_mu) if zero mean function.
        mean = tf.linalg.matvec(A, self.q_mu[:, i], transpose_a=True)  # [N]
        A_tiled.append(A)
        mean_tiled.append(mean)

    A_tiled = tf.convert_to_tensor(A_tiled)
    mean_tiled = tf.transpose(tf.convert_to_tensor(mean_tiled))

    I = tf.eye(self.num_inducing, dtype=default_float())[None, :, :]

    # var = k(X,X) - alpha(X)^T(k(Z,Z)-q_sqrtq_sqrt^T)alpha(X)
    if self.white:
        SK = -I
    else:
        # -k(Z,Z)
        SK = -Kmm_tiled  # [D_out,M,M]

    if self.q_sqrt is not None:
        # SK = -k(Z,Z) + q_sqrtq_sqrt^T
        # [D_out,M,M]
        SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True)

    # B = -(k(Z,Z) - q_sqrtq_sqrt^T)alpha(X)
    B = tf.matmul(SK, A_tiled)  # [D_out,M,N]

    if full_cov:
        # delta_cov = -alpha(X)^T(k(Z,Z) - q_sqrtq_sqrt^T)alpha(X)
        delta_cov = tf.matmul(A_tiled, B, transpose_a=True)  # [D_out,N,N]
        # Knn = k(X,X)
        Knn = tf.convert_to_tensor(
            [self.kernels[i].K(X) for i in range(self.num_outputs)])
    else:
        # Summing over dimension 1 --> sum variances due to other.
        # Is this legit?
        delta_cov = tf.reduce_sum(A_tiled * B, 1)
        # delta_cov = tf.linalg.diag_part(tf.matmul(A_tiled, B, transpose_a=True))  # [D_out,N]
        Knn = tf.convert_to_tensor(
            [self.kernels[i].K_diag(X) for i in range(self.num_outputs)])

    var = Knn + delta_cov  # [D_out,N]
    var = tf.transpose(var)

    return mean_tiled + self.mean_function(X), var
def _efficient_sample_matheron_rule(
    inducing_variable: InducingVariables,
    kernel: KernelWithFeatureDecomposition,
    q_mu: tf.Tensor,
    *,
    q_sqrt: Optional[TensorType] = None,
    whiten: bool = False,
) -> Sample:
    """
    Implements the efficient sampling rule from :cite:t:`wilson2020efficiently` using
    the Matheron rule. To use this sampling scheme, the GP has to have a ``kernel``
    of the :class:`KernelWithFeatureDecomposition` type.

    :param kernel: A kernel of the :class:`KernelWithFeatureDecomposition` type, which
        holds the covariance function and the kernel's features and coefficients.
    :param q_mu: A tensor with the shape ``[M, P]``.
    :param q_sqrt: A tensor with the shape ``[P, M, M]``.
    :param whiten: Determines the parameterisation of the inducing variables. If True,
        ``p(u) = N(0, I)``, otherwise ``p(u) = N(0, Kuu)``.

    .. note:: Currently, only *whiten* equals ``False`` is supported.
    """
    # TODO(VD): allow for both whiten=True and False, currently only support False.
    # Remember u = Luu v, with Kuu = Luu Luu^T and p(v) = N(0, I)
    # so that p(u) = N(0, Luu Luu^T) = N(0, Kuu).
    assert not whiten, "Currently only whiten=False is supported"
    L = tf.shape(kernel.feature_coefficients)[0]  # num eigenfunctions  # noqa: F841

    prior_weights = tf.sqrt(kernel.feature_coefficients) * tf.random.normal(
        tf.shape(kernel.feature_coefficients), dtype=default_float())  # [L, 1]

    M, P = tf.shape(q_mu)[0], tf.shape(q_mu)[1]  # num inducing, num output heads
    u_sample_noise = tf.matmul(
        q_sqrt,  # [P, M, M]
        tf.random.normal((P, M, 1), dtype=default_float()),  # [P, M, 1]
    )  # [P, M, 1]
    u_sample = q_mu + tf.linalg.matrix_transpose(u_sample_noise[..., 0])  # [M, P]

    Kmm = Kuu(inducing_variable, kernel, jitter=default_jitter())  # [M, M]
    tf.debugging.assert_equal(tf.shape(Kmm), [M, M])
    phi_Z = kernel.feature_functions(inducing_variable.Z)  # [M, L]
    weight_space_prior_Z = phi_Z @ prior_weights  # [M, 1]

    diff = u_sample - weight_space_prior_Z  # [M, P] -- using implicit broadcasting
    v = compute_A_inv_b(Kmm, diff)  # [M, P]
    tf.debugging.assert_equal(tf.shape(v), [M, P])

    class WilsonSample(Sample):
        def __call__(self, X: TensorType) -> tf.Tensor:
            """
            :param X: evaluation points [N, D]
            :return: function value of sample [N, P]
            """
            N = tf.shape(X)[0]
            phi_X = kernel.feature_functions(X)  # [N, L]
            weight_space_prior_X = phi_X @ prior_weights  # [N, 1]
            Knm = tf.linalg.matrix_transpose(Kuf(inducing_variable, kernel, X))  # [N, M]
            function_space_update_X = Knm @ v  # [N, P]

            tf.debugging.assert_equal(tf.shape(weight_space_prior_X), [N, 1])
            tf.debugging.assert_equal(tf.shape(function_space_update_X), [N, P])

            return weight_space_prior_X + function_space_update_X  # [N, P]

    return WilsonSample()
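# The decomposition implemented above is the pathwise update ("Matheron's rule")
# of Wilson et al. (2020):
#
#   f(.)  ~  sum_l w_l phi_l(.)                 (weight-space prior sample)
#          + k(., Z) Kuu^{-1} (u - Phi(Z) w)    (function-space update)
#
# with w_l ~ N(0, lambda_l) (here `prior_weights`: standard normals scaled by
# sqrt(feature_coefficients)), u ~ q(u) (here `u_sample`), the solve
# Kuu^{-1}(u - Phi(Z) w) precomputed once as `v`, and `WilsonSample.__call__`
# evaluating both terms at the query points X.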
def local_spectrum_approx_conditional_ldf(
    Xnew,
    inducing_variable,
    kernel,
    f,
    *,
    alpha=None,
    full_cov=False,
    full_output_cov=False,
    q_sqrt=None,
    white=True,
):
    """
    - Xnew are the points of the data or minibatch, size N x D (tf.array, 2d)
    - inducing_variable is an instance of inducing_variables.InducingVariable that
      provides `Kuu` and `Kuf` methods for Laplacian Dirichlet features; it contains
      the limits of the bounding box and the frequencies of the high frequency
      components not selected in inducing_variable.
    - f is the value (or mean value) of the features (i.e. the weights)
    - q_sqrt (default None) is the Cholesky factor of the uncertainty about f
      (to be propagated through the conditional as per the GPflow inducing-point
      implementation)
    - white (defaults to True) specifies whether the whitening has been applied.
      LDF works a lot better, when using vanilla gradients, if whitening has been
      applied, so it's the default option.

    Given the GP represented by the inducing points specified in `inducing_variable`,
    produce the mean and (co-)variance of the GP at the points Xnew.

       Xnew   :: N x D
       Kuu    :: M x M
       Kuf    :: M x N
       f      :: M x K, K = 1
       q_sqrt :: K x M x M, with K = 1
    """
    if full_output_cov:
        raise NotImplementedError

    # num_data = tf.shape(Xnew)[0]  # M
    num_func = tf.shape(f)[1]  # K

    Λ = Kuu(inducing_variable, kernel)  # this is now a LinearOperator
    Φ = Kuf(inducing_variable, kernel, Xnew, alpha=alpha)  # a complex Tensor
    Λr = Kuu(inducing_variable.remainder, kernel)
    Φr = Kuf(inducing_variable.remainder, kernel, Xnew, alpha=alpha)
    Φrm = Kuf(inducing_variable.remainder, kernel, -Xnew, alpha=alpha)

    # compute the covariance due to the conditioning
    if full_cov:
        Λr_inv_Φr = tf.expand_dims(
            tf.complex(real=1 / Λr.diag_part(),
                       imag=tf.zeros_like(Λr.diag_part())), -1) * Φr
        Λr_inv_Φrm = tf.expand_dims(
            tf.complex(real=1 / Λr.diag_part(),
                       imag=tf.zeros_like(Λr.diag_part())), -1) * Φrm
        a = tf.matmul(Φr, Λr_inv_Φr, adjoint_a=True) + tf.matmul(
            Φr, Λr_inv_Φrm, adjoint_a=True)
        b = tf.matmul(Φr, Λr_inv_Φr, adjoint_a=True) - tf.matmul(
            Φr, Λr_inv_Φrm, adjoint_a=True)
        fvar_rr = tf.math.real(a)
        fvar_ii = tf.math.real(b)
        fvar_ir = tf.math.imag(a)
        fvar_ri = tf.math.imag(-b)
        fvar = tf.concat([
            tf.concat([fvar_rr, fvar_ir], -2),
            tf.concat([fvar_ri, fvar_ii], -2)
        ], -1)  # K x 2N x 2N
        shape = (num_func, 1, 1)
    else:
        # ... x M x N -> ... x N
        Λr_inv_Φr = tf.expand_dims(
            tf.complex(real=1 / Λr.diag_part(),
                       imag=tf.zeros_like(Λr.diag_part())), -1) * Φr
        Λr_inv_Φrm = tf.expand_dims(
            tf.complex(real=1 / Λr.diag_part(),
                       imag=tf.zeros_like(Λr.diag_part())), -1) * Φrm
        fvar_rr = tf.reduce_sum(Φr * Λr_inv_Φr, -2) + tf.reduce_sum(
            Φr * Λr_inv_Φrm, -2)
        fvar_ii = tf.reduce_sum(Φr * Λr_inv_Φr, -2) - tf.reduce_sum(
            Φr * Λr_inv_Φrm, -2)
        fvar = tf.concat([tf.math.real(fvar_rr), tf.math.real(fvar_ii)], -1)  # K x 2N x D
        shape = (num_func, 1)
    fvar = tf.expand_dims(fvar, 0) * tf.ones(
        shape, dtype=gpflow.default_float())  # K x N x N or K x N

    # another backsubstitution in the unwhitened case
    if white:
        A = Λ.cholesky().solve(tf.math.real(Φ))
        B = Λ.cholesky().solve(tf.math.imag(Φ))
    else:
        A = Λ.solve(tf.math.real(Φ))
        B = Λ.solve(tf.math.imag(Φ))

    # construct the conditional mean
    fmean = tf.concat(
        [tf.matmul(A, f, transpose_a=True),
         tf.matmul(B, f, transpose_a=True)], -2)

    if q_sqrt is not None:
        if q_sqrt.shape.ndims == 2:
            # case for q_diag = True
            LTA1 = Diag(tf.linalg.matrix_transpose(q_sqrt)) @ A  # K x M x N
            LTA2 = Diag(tf.linalg.matrix_transpose(q_sqrt)) @ B
        elif q_sqrt.shape.ndims == 3:
            LTA1 = tf.matmul(q_sqrt, A, transpose_a=True)
            LTA2 = tf.matmul(q_sqrt, B, transpose_a=True)
        else:
            raise ValueError("Bad dimension for q_sqrt: %s" %
                             str(q_sqrt.get_shape().ndims))
        if full_cov:
            LTA = tf.concat([LTA1, LTA2], -1)
            fvar = fvar + tf.matmul(LTA, LTA, transpose_a=True)  # K x 2N x 2N
        else:
            LTA = tf.concat([LTA1, LTA2], -1)  # K x M x 2N
            fvar = fvar + tf.reduce_sum(tf.square(LTA), -2)  # K x 2N

    fvar = tf.transpose(fvar)  # 2N x K or 2N x 2N x K

    return fmean, fvar