Example #1
def test_multi_scale_inducing_equivalence_inducing_points(N, M, D):
    # Multiscale must be equivalent to inducing points when the scales are zero
    Xnew, Z = np.random.randn(N, D), np.random.randn(M, D)
    rbf = gpflow.kernels.SquaredExponential(1.3441,
                                            lengthscale=np.random.uniform(
                                                0.5, 3., D))
    inducing_variable_zero_lengthscale = Multiscale(Z,
                                                    scales=np.zeros(Z.shape))
    inducing_variable_inducing_point = InducingPoints(Z)

    multi_scale_Kuf = Kuf(inducing_variable_zero_lengthscale, rbf, Xnew)
    inducing_point_Kuf = Kuf(inducing_variable_inducing_point, rbf, Xnew)

    deviation_percent_Kuf = np.max(
        np.abs(multi_scale_Kuf - inducing_point_Kuf) / inducing_point_Kuf *
        100)
    assert deviation_percent_Kuf < 0.1

    multi_scale_Kuu = Kuu(inducing_variable_zero_lengthscale, rbf)
    inducing_point_Kuu = Kuu(inducing_variable_inducing_point, rbf)

    deviation_percent_Kuu = np.max(
        np.abs(multi_scale_Kuu - inducing_point_Kuu) / inducing_point_Kuu *
        100)
    assert deviation_percent_Kuu < 0.1
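
A sketch of how this test might be driven with pytest, placed in the same module as the function above; the parameter grid and the GPflow 2.x import paths are assumptions, not part of the original test module.

import numpy as np
import pytest
import gpflow
from gpflow.covariances import Kuf, Kuu
from gpflow.inducing_variables import InducingPoints, Multiscale


# Hypothetical parameter grid; runs the equivalence check over several sizes.
@pytest.mark.parametrize("N", [10, 50])
@pytest.mark.parametrize("M", [5, 20])
@pytest.mark.parametrize("D", [1, 3])
def test_multi_scale_equivalence_grid(N, M, D):
    test_multi_scale_inducing_equivalence_inducing_points(N, M, D)
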
Example #2
    def upper_bound(self) -> tf.Tensor:
        """
        Upper bound for the sparse GP regression marginal likelihood.  Note that
        the same inducing points are used for calculating the upper bound, as are
        used for computing the likelihood approximation. This may not lead to the
        best upper bound. The upper bound can be tightened by optimising Z, just
        like the lower bound. This is especially important in FITC, as FITC is
        known to produce poor inducing point locations. An optimisable upper bound
        can be found in https://github.com/markvdw/gp_upper.

        The key reference is

        ::

          @misc{titsias_2014,
            title={Variational Inference for Gaussian and Determinantal Point Processes},
            url={http://www2.aueb.gr/users/mtitsias/papers/titsiasNipsVar14.pdf},
            publisher={Workshop on Advances in Variational Inference (NIPS 2014)},
            author={Titsias, Michalis K.},
            year={2014},
            month={Dec}
          }

        The key quantity, the trace term, can be computed via

        >>> _, v = conditionals.conditional(X, model.inducing_variable.Z, model.kernel,
        ...                                 np.zeros((len(model.inducing_variable), 1)))

        which computes each individual element of the trace term.
        """
        X_data, Y_data = self.data
        num_data = to_default_float(tf.shape(Y_data)[0])

        Kdiag = self.kernel(X_data, full_cov=False)
        kuu = Kuu(self.inducing_variable, self.kernel, jitter=self.jitter_variance)
        kuf = Kuf(self.inducing_variable, self.kernel, X_data)

        I = tf.eye(tf.shape(kuu)[0], dtype=default_float())

        L = tf.linalg.cholesky(kuu)
        A = tf.linalg.triangular_solve(L, kuf, lower=True)
        AAT = tf.linalg.matmul(A, A, transpose_b=True)
        B = I + AAT / self.likelihood.variance
        LB = tf.linalg.cholesky(B)

        # Using the Trace bound, from Titsias' presentation
        c = tf.maximum(tf.reduce_sum(Kdiag) - tf.reduce_sum(tf.square(A)), 0)

        # Alternative bound on max eigenval:
        corrected_noise = self.likelihood.variance + c

        const = -0.5 * num_data * tf.math.log(2 * np.pi * self.likelihood.variance)
        logdet = -tf.reduce_sum(tf.math.log(tf.linalg.diag_part(LB)))

        LC = tf.linalg.cholesky(I + AAT / corrected_noise)
        v = tf.linalg.triangular_solve(LC, tf.linalg.matmul(A, Y_data) / corrected_noise, lower=True)
        quad = -0.5 * tf.reduce_sum(tf.square(Y_data)) / corrected_noise + 0.5 * tf.reduce_sum(tf.square(v))

        return const + logdet + quad
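
A usage sketch of this bound, assuming the stock gpflow.models.SGPR (the class above is a variant that reads self.jitter_variance, but the standard SGPR exposes the same elbo() and upper_bound() pair); the data and inducing point locations below are synthetic.

import numpy as np
import gpflow

# Toy 1-D regression problem.
X = np.random.rand(100, 1)
Y = np.sin(10 * X) + 0.1 * np.random.randn(100, 1)
Z = np.linspace(0.0, 1.0, 10).reshape(-1, 1)

model = gpflow.models.SGPR(
    (X, Y),
    kernel=gpflow.kernels.SquaredExponential(),
    inducing_variable=gpflow.inducing_variables.InducingPoints(Z),
)
gpflow.optimizers.Scipy().minimize(model.training_loss, model.trainable_variables)

# The log marginal likelihood is sandwiched between the two bounds.
assert model.elbo() <= model.upper_bound()
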
Example #3
def test_inducing_variables_psd_schur(input_dim, inducing_variable, kernel):
    # Conditional variance must be PSD.
    X = np.random.randn(5, input_dim)
    Kuf_values = Kuf(inducing_variable, kernel, X)
    Kuu_values = Kuu(inducing_variable, kernel, jitter=default_jitter())
    Kff_values = kernel(X)
    Qff_values = Kuf_values.numpy().T @ np.linalg.solve(Kuu_values, Kuf_values)
    assert np.all(np.linalg.eig(Kff_values - Qff_values)[0] > 0.0)
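
A concrete instantiation of the same check, under the assumption of GPflow 2.x import paths; the kernel choice and sizes are arbitrary (note that the lengthscales keyword is spelled lengthscale in some older GPflow versions).

import numpy as np
import gpflow
from gpflow.inducing_variables import InducingPoints

input_dim = 2
kernel = gpflow.kernels.SquaredExponential(lengthscales=np.ones(input_dim))
inducing_variable = InducingPoints(np.random.randn(7, input_dim))

# The Schur complement Kff - Qff of the joint covariance must stay positive definite.
test_inducing_variables_psd_schur(input_dim, inducing_variable, kernel)
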
Example #4
    def conditional_ND(self, X, full_cov=False):
        # X is [S,N,D]
        Kmm = Kuu(self.inducing_points, self.kernel, jitter=default_jitter())
        Lmm = tf.linalg.cholesky(Kmm)
        Kmm_tiled = tf.tile(tf.expand_dims(Kmm, 0), (self.num_outputs, 1, 1))
        Lmm_tiled = tf.tile(tf.expand_dims(Lmm, 0), (self.num_outputs, 1, 1))

        Kmn = Kuf(self.inducing_points, self.kernel, X)  # K(Z,X)
        # alpha(X) = k(Z,Z)^{-1}k(Z,X), = L^{-T}L^{-1}k(Z,X)
        A = tf.linalg.triangular_solve(Lmm, Kmn, lower=True)  # L^{-1}k(Z,X)
        if not self.white:
            # L^{-T}L^{-1}K(Z,X) is [M,N]
            A = tf.linalg.triangular_solve(tf.transpose(Lmm), A, lower=False)

        # m = alpha(X)^T(q_mu - m(Z)) = alpha(X)^T(q_mu) if zero mean function.
        mean = tf.matmul(A, self.q_mu, transpose_a=True)  # [N, D_out]

        # [D_out,M,N]
        A_tiled = tf.tile(A[None, :, :], [self.num_outputs, 1, 1])
        I = tf.eye(self.num_inducing, dtype=default_float())[None, :, :]

        # var = k(X,X) - alpha(X)^T(k(Z,Z)-q_sqrtq_sqrt^T)alpha(X)
        if self.white:
            SK = -I
        else:
            # -k(Z,Z)
            SK = -Kmm_tiled  # [D_out,M,M]

        if self.q_sqrt is not None:
            # SK = -k(Z,Z) + q_sqrtq_sqrt^T
            # [D_out,M,M]
            SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True)

        # B = -(k(Z,Z) - q_sqrtq_sqrt^T)alpha(X)
        B = tf.matmul(SK, A_tiled)  # [D_out,M,N]

        if full_cov:
            # delta_cov = -alpha(X)^T(k(Z,Z) - q_sqrtq_sqrt^T)alpha(X)
            delta_cov = tf.matmul(A_tiled, B, transpose_a=True)  # [D_out,N,N]
            # Knn = k(X,X)
            Knn = self.kernel.K(X)
        else:
            # Summing over dimension 1 --> sum variances due to other.
            # Is this legit?
            delta_cov = tf.reduce_sum(A_tiled * B, 1)
            #delta_cov = tf.linalg.diag_part(tf.matmul(A_tiled, B,
            #    transpose_a=True)) # [D_out,N]
            Knn = self.kernel.K_diag(X)  # [N]

        var = tf.expand_dims(Knn, 0) + delta_cov  # [D_out,N]
        var = tf.transpose(var)

        return mean + self.mean_function(X), var
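
For a single output, the variance bookkeeping above reduces to var = k(X,X) - alpha(X)^T (k(Z,Z) - q_sqrt q_sqrt^T) alpha(X) with alpha(X) = k(Z,Z)^{-1} k(Z,X). A small NumPy sketch of that identity (toy matrices, non-whitened case, unrelated to the surrounding class) also shows why the reduce_sum(A * B, 1) shortcut recovers the marginal variances.

import numpy as np

rng = np.random.default_rng(0)
M, N = 4, 6
Z, X = rng.normal(size=(M, 1)), rng.normal(size=(N, 1))

def k(a, b):
    # Toy squared-exponential kernel on 1-D inputs.
    return np.exp(-0.5 * (a - b.T) ** 2)

Kmm = k(Z, Z) + 1e-8 * np.eye(M)
Kmn = k(Z, X)
Knn = k(X, X)
q_sqrt = np.tril(rng.normal(size=(M, M)))
S = q_sqrt @ q_sqrt.T

alpha = np.linalg.solve(Kmm, Kmn)                 # [M, N]
var_full = Knn - alpha.T @ (Kmm - S) @ alpha      # full conditional covariance

# Marginal variances via the same SK/B decomposition as in conditional_ND.
B = (S - Kmm) @ alpha                             # SK @ A with SK = -(Kmm - S)
var_diag = np.diag(Knn) + np.sum(alpha * B, axis=0)
assert np.allclose(var_diag, np.diag(var_full))
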
Example #5
def _conditional_train(
    Xnew: tf.Tensor,
    inducing_variable: InducingVariables,
    kernel: Kernel,
    f: tf.Tensor,
    *,
    full_cov=False,
    full_output_cov=False,
    q_sqrt=None,
    white=False,
):
    """
    Single-output GP conditional.

    The covariance matrices used to calculate the conditional have the following shape:
    - Kuu: [M, M]
    - Kuf: [M, N]
    - Kff: [N, N]

    Further reference
    -----------------
    - See `gpflow.conditionals._conditional` (below) for a detailed explanation of
      the conditional in the single-output case.
    - See the multioutput notebook for more information about the multioutput framework.

    Parameters
    ----------
    :param Xnew: data matrix, size [N, D].
    :param f: data matrix, [M, R]
    :param full_cov: return the covariance between the datapoints
    :param full_output_cov: return the covariance between the outputs.
           NOTE: as we are using a single-output kernel with repetitions
                 these covariances will be zero.
    :param q_sqrt: matrix of standard-deviations or Cholesky matrices,
        size [M, R] or [R, M, M].
    :param white: boolean of whether to use the whitened representation
    :return:
        - mean:     [N, R]
        - variance: [N, R], [R, N, N], [N, R, R] or [N, R, N, R]
        Please see `gpflow.conditional._expand_independent_outputs` for more information
        about the shape of the variance, depending on `full_cov` and `full_output_cov`.
    """
    Kmm = Kuu(inducing_variable, kernel, jitter=default_jitter())  # [M, M]
    Kmn = Kuf(inducing_variable, kernel, Xnew)  # [M, N]
    Knn = kernel.diag_tr()  # uses an optimized function to calculate the covariances
    fmean, fvar = base_conditional(
        Kmn, Kmm, Knn, f, full_cov=full_cov, q_sqrt=q_sqrt, white=white
    )  # [N, R],  [R, N, N] or [N, R]
    return fmean, expand_independent_outputs(fvar, full_cov, full_output_cov)
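
The heavy lifting above is done by base_conditional. Below is a minimal sketch of calling it directly with toy tensors, assuming it is exposed as gpflow.conditionals.util.base_conditional (kernel.diag_tr() above is a custom method of the surrounding project; a stock kernel would use kernel(Xnew, full_cov=False)).

import numpy as np
import tensorflow as tf
import gpflow
from gpflow.conditionals.util import base_conditional
from gpflow.config import default_float, default_jitter

rng = np.random.default_rng(0)
M, N, R = 5, 8, 1
kernel = gpflow.kernels.SquaredExponential()
Z = tf.constant(rng.normal(size=(M, 2)), dtype=default_float())
Xnew = tf.constant(rng.normal(size=(N, 2)), dtype=default_float())

Kmm = kernel(Z) + default_jitter() * tf.eye(M, dtype=default_float())  # [M, M]
Kmn = kernel(Z, Xnew)                                                  # [M, N]
Knn = kernel(Xnew, full_cov=False)                                     # [N]
f = tf.zeros((M, R), dtype=default_float())                            # inducing outputs

fmean, fvar = base_conditional(Kmn, Kmm, Knn, f, full_cov=False, q_sqrt=None, white=False)
print(fmean.shape, fvar.shape)  # (N, R), (N, R)
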
Example #6
def test_multi_scale_inducing_equivalence_inducing_points(N, M, D):
    # Multiscale must be equivalent to inducing points when the scales are (close to) zero
    Xnew, Z = np.random.randn(N, D), np.random.randn(M, D)
    rbf = gpflow.kernels.SquaredExponential(1.3441,
                                            lengthscales=np.random.uniform(
                                                0.5, 3.0, D))
    inducing_variable_zero_lengthscales = Multiscale(Z,
                                                     scales=np.zeros(Z.shape) +
                                                     1e-10)
    inducing_variable_inducing_point = InducingPoints(Z)

    multi_scale_Kuf = Kuf(inducing_variable_zero_lengthscales, rbf, Xnew)
    inducing_point_Kuf = Kuf(inducing_variable_inducing_point, rbf, Xnew)

    relative_error_Kuf = np.abs(multi_scale_Kuf -
                                inducing_point_Kuf) / inducing_point_Kuf
    assert np.max(relative_error_Kuf) < 0.1e-2  # 0.1 %

    multi_scale_Kuu = Kuu(inducing_variable_zero_lengthscales, rbf)
    inducing_point_Kuu = Kuu(inducing_variable_inducing_point, rbf)

    relative_error_Kuu = np.abs(multi_scale_Kuu -
                                inducing_point_Kuu) / inducing_point_Kuu
    assert np.max(relative_error_Kuu) < 0.1e-2  # 0.1 %
Example #7
    def conditional(self, X, full_cov=False):
        # X is [N,D] or [S*N,D]
        
        Kmm = Kuu(self.inducing_points, self.kernel, jitter=default_jitter()) #[M,M]
        Lmm = tf.linalg.cholesky(Kmm)
        Kmn = Kuf(self.inducing_points, self.kernel, X) #[M,N]
        
        # alpha(X) = k(Z,Z)^{-1}k(Z,X), = L^{-T}L^{-1}k(Z,X)
        A = tf.linalg.triangular_solve(Lmm, Kmn, lower=True) # L^{-1}k(Z,X)
        if not self.white:
            # L^{-T}L^{-1}K(Z,X) is [M,N]
            A = tf.linalg.triangular_solve(tf.transpose(Lmm), A, lower=False)
        
        # m = alpha(X)^T(q_mu - m(Z))
        mean = tf.matmul(A, self.q_mu-self.mean_function(self.inducing_points.Z), 
                         transpose_a=True) # [N,1]
        
        I = tf.eye(self.num_inducing, dtype=default_float())
       
        # var = k(X,X) - alpha(X)^T(k(Z,Z)-q_sqrtq_sqrt^T)alpha(X)
        if self.white: SK = -I
        else: SK = -Kmm 

        if self.q_sqrt is not None: # SK = -k(Z,Z) + q_sqrtq_sqrt^T
            SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True) 
        
        # B = -(k(Z,Z) - q_sqrtq_sqrt^T)alpha(X)
        B = tf.matmul(SK, A) #[M,N]

        if full_cov:
            # delta_cov = -alpha(X)^T(k(Z,Z) - q_sqrtq_sqrt^T)alpha(X)
            delta_cov = tf.matmul(A, B, transpose_a=True) # [N,N]
            Knn = self.kernel(X, full_cov=True, presliced=False)
        else:
            delta_cov = tf.reduce_sum(A * B, 0)
            Knn = self.kernel(X, full_cov=False, presliced=False)
       
        var = Knn + delta_cov
        var = tf.transpose(var)
        
        return mean + self.mean_function(X), var
Example #8
    def _compute_robust_maximum_log_likelihood_objective(self) -> tf.Tensor:
        """
        Construct a tensorflow function to compute the bound on the marginal
        likelihood. For a derivation of the terms in here, see the associated
        SGPR notebook.
        """
        X_data, Y_data = self.data

        num_inducing = len(self.inducing_variable)
        num_data = to_default_float(tf.shape(Y_data)[0])
        output_dim = to_default_float(tf.shape(Y_data)[1])

        err = Y_data - self.mean_function(X_data)
        Kdiag = self.kernel(X_data, full_cov=False)
        kuf = Kuf(self.inducing_variable, self.kernel, X_data)
        kuu = Kuu(self.inducing_variable, self.kernel, jitter=self.jitter_variance)
        L = tf.linalg.cholesky(kuu)
        sigma = tf.sqrt(self.likelihood.variance)

        # Compute intermediate matrices
        A = tf.linalg.triangular_solve(L, kuf, lower=True) / sigma
        AAT = tf.linalg.matmul(A, A, transpose_b=True)
        B = AAT + tf.eye(num_inducing, dtype=default_float())
        LB = tf.linalg.cholesky(B)
        Aerr = tf.linalg.matmul(A, err)
        c = tf.linalg.triangular_solve(LB, Aerr, lower=True) / sigma
        trace_term = 0.5 * output_dim * tf.reduce_sum(Kdiag) / self.likelihood.variance
        trace_term -= 0.5 * output_dim * tf.reduce_sum(tf.linalg.diag_part(AAT))

        # tr(Kff - Qff) should be positive, numerical issues can arise here
        assert trace_term > 0.0, f"Trace term negative, should be positive ({trace_term:.4e})."

        # compute log marginal bound
        bound = -0.5 * num_data * output_dim * np.log(2 * np.pi)
        bound += tf.negative(output_dim) * tf.reduce_sum(tf.math.log(tf.linalg.diag_part(LB)))
        bound -= 0.5 * num_data * output_dim * tf.math.log(self.likelihood.variance)
        bound += -0.5 * tf.reduce_sum(tf.square(err)) / self.likelihood.variance
        bound += 0.5 * tf.reduce_sum(tf.square(c))
        bound -= trace_term

        return bound
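
With output_dim = 1, the trace term above equals 0.5 * tr(Kff - Qff) / noise_variance. A standalone NumPy check of that identity with toy matrices (independent of the class; the kernel is chosen so that the diagonal of Kff is one):

import numpy as np

rng = np.random.default_rng(1)
M, N, noise_variance = 4, 9, 0.3
Z, X = rng.normal(size=(M, 1)), rng.normal(size=(N, 1))
k = lambda a, b: np.exp(-0.5 * (a - b.T) ** 2)  # unit-variance SE kernel

Kuu_mat = k(Z, Z) + 1e-8 * np.eye(M)
Kuf_mat = k(Z, X)
Kdiag = np.ones(N)  # diag of k(X, X) for this kernel

L = np.linalg.cholesky(Kuu_mat)
A = np.linalg.solve(L, Kuf_mat) / np.sqrt(noise_variance)
AAT = A @ A.T

trace_term = 0.5 * (np.sum(Kdiag) / noise_variance - np.trace(AAT))
Qff = Kuf_mat.T @ np.linalg.solve(Kuu_mat, Kuf_mat)
assert np.isclose(trace_term, 0.5 * (np.sum(Kdiag) - np.trace(Qff)) / noise_variance)
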
Example #9
    def predict_f(self, Xnew: InputData, full_cov=False, full_output_cov=False) -> MeanAndVariance:
        """
        Compute the mean and variance of the latent function at some new points
        Xnew. For a derivation of the terms in here, see the associated SGPR
        notebook.
        """
        X_data, Y_data = self.data
        num_inducing = len(self.inducing_variable)
        err = Y_data - self.mean_function(X_data)
        kuf = Kuf(self.inducing_variable, self.kernel, X_data)
        kuu = Kuu(self.inducing_variable, self.kernel, jitter=self.jitter_variance)
        Kus = Kuf(self.inducing_variable, self.kernel, Xnew)
        sigma = tf.sqrt(self.likelihood.variance)
        L = tf.linalg.cholesky(kuu)
        A = tf.linalg.triangular_solve(L, kuf, lower=True) / sigma
        B = tf.linalg.matmul(A, A, transpose_b=True) + tf.eye(num_inducing, dtype=default_float())
        LB = tf.linalg.cholesky(B)
        Aerr = tf.linalg.matmul(A, err)
        c = tf.linalg.triangular_solve(LB, Aerr, lower=True) / sigma
        tmp1 = tf.linalg.triangular_solve(L, Kus, lower=True)
        tmp2 = tf.linalg.triangular_solve(LB, tmp1, lower=True)
        mean = tf.linalg.matmul(tmp2, c, transpose_a=True)
        if full_cov:
            var = (
                self.kernel(Xnew)
                + tf.linalg.matmul(tmp2, tmp2, transpose_a=True)
                - tf.linalg.matmul(tmp1, tmp1, transpose_a=True)
            )
            var = tf.tile(var[None, ...], [self.num_latent_gps, 1, 1])  # [P, N, N]
        else:
            var = (
                self.kernel(Xnew, full_cov=False)
                + tf.reduce_sum(tf.square(tmp2), 0)
                - tf.reduce_sum(tf.square(tmp1), 0)
            )
            var = tf.tile(var[:, None], [1, self.num_latent_gps])
        return mean + self.mean_function(Xnew), var
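
A quick shape check for the two branches, assuming the stock gpflow.models.SGPR (the snippet above reads self.jitter_variance, but the predict_f output shapes are the same); inputs are random placeholders.

import numpy as np
import gpflow

X = np.random.rand(50, 2)
Y = np.random.rand(50, 1)
Z = X[:7].copy()
model = gpflow.models.SGPR(
    (X, Y),
    kernel=gpflow.kernels.SquaredExponential(),
    inducing_variable=gpflow.inducing_variables.InducingPoints(Z),
)

Xnew = np.random.rand(11, 2)
mean, var = model.predict_f(Xnew, full_cov=False)
print(mean.shape, var.shape)  # (11, 1), (11, 1)
mean, var = model.predict_f(Xnew, full_cov=True)
print(mean.shape, var.shape)  # (11, 1), (1, 11, 11)
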
Example #10
def test_inducing_equivalence(N, kernel):
    # Kuu of InducingPoints must be the same as the kernel evaluations at Z
    Z = np.random.randn(N, 5)
    inducing_variable = InducingPoints(Z)
    assert_allclose(Kuu(inducing_variable, kernel), kernel(Z))
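
A minimal invocation, assuming any standard GPflow kernel; Matern52 is an arbitrary choice.

import gpflow

# Kuu of InducingPoints carries no extra structure, so it must match kernel(Z) exactly.
test_inducing_equivalence(N=10, kernel=gpflow.kernels.Matern52())
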
Example #11
    def conditional_ND(self, X, full_cov=False):
        # X is [S,N,D]
        Kmm_tiled = tf.convert_to_tensor([
            Kuu(self.inducing_points[i],
                self.kernels[i],
                jitter=default_jitter()) for i in range(self.num_outputs)
        ])
        Lmm_tiled = tf.convert_to_tensor([
            tf.linalg.cholesky(Kmm_tiled[i]) for i in range(self.num_outputs)
        ])

        A_tiled = []
        mean_tiled = []
        for i in range(self.num_outputs):
            Kmn = Kuf(self.inducing_points[i], self.kernels[i], X)
            Lmm = Lmm_tiled[i]

            A = tf.linalg.triangular_solve(Lmm, Kmn,
                                           lower=True)  # L^{-1}k(Z,X)
            if not self.white:
                # L^{-T}L^{-1}K(Z,X) is [M,N]
                A = tf.linalg.triangular_solve(tf.transpose(Lmm),
                                               A,
                                               lower=False)

            # m = alpha(X)^T(q_mu - m(Z)) = alpha(X)^T(q_mu) if zero mean function.
            mean = tf.linalg.matvec(A, self.q_mu[:, i],
                                    transpose_a=True)  # [N]

            A_tiled.append(A)
            mean_tiled.append(mean)

        A_tiled = tf.convert_to_tensor(A_tiled)
        mean_tiled = tf.transpose(tf.convert_to_tensor(mean_tiled))

        I = tf.eye(self.num_inducing, dtype=default_float())[None, :, :]

        # var = k(X,X) - alpha(X)^T(k(Z,Z)-q_sqrtq_sqrt^T)alpha(X)
        if self.white:
            SK = -I
        else:
            # -k(Z,Z)
            SK = -Kmm_tiled  # [D_out,M,M]

        if self.q_sqrt is not None:
            # SK = -k(Z,Z) + q_sqrtq_sqrt^T
            # [D_out,M,M]
            SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True)

        # B = -(k(Z,Z) - q_sqrtq_sqrt^T)alpha(X)
        B = tf.matmul(SK, A_tiled)  # [D_out,M,N]

        if full_cov:
            # delta_cov = -alpha(X)^T(k(Z,Z) - q_sqrtq_sqrt^T)alpha(X)
            delta_cov = tf.matmul(A_tiled, B, transpose_a=True)  # [D_out,N,N]
            # Knn = k(X,X)
            Knn = tf.convert_to_tensor(
                [self.kernels[i].K(X) for i in range(self.num_outputs)])
        else:
            # Summing over dimension 1 --> sum variances due to other.
            # Is this legit?
            delta_cov = tf.reduce_sum(A_tiled * B, 1)
            #delta_cov = tf.linalg.diag_part(tf.matmul(A_tiled, B,
            #    transpose_a=True)) # [D_out,N]
            Knn = tf.convert_to_tensor(
                [self.kernels[i].K_diag(X) for i in range(self.num_outputs)])

        var = Knn + delta_cov  # [D_out,N]
        var = tf.transpose(var)

        return mean_tiled + self.mean_function(X), var
Example #12
def _efficient_sample_matheron_rule(
    inducing_variable: InducingVariables,
    kernel: KernelWithFeatureDecomposition,
    q_mu: tf.Tensor,
    *,
    q_sqrt: Optional[TensorType] = None,
    whiten: bool = False,
) -> Sample:
    """
    Implements the efficient sampling rule from :cite:t:`wilson2020efficiently` using
    the Matheron rule. To use this sampling scheme, the GP has to have a
    ``kernel`` of the :class:`KernelWithFeatureDecomposition` type.

    :param kernel: A kernel of the :class:`KernelWithFeatureDecomposition` type, which
        holds the covariance function and the kernel's features and
        coefficients.
    :param q_mu: A tensor with the shape ``[M, P]``.
    :param q_sqrt: A tensor with the shape ``[P, M, M]``.
    :param whiten: Determines the parameterisation of the inducing variables.
        If True, ``p(u) = N(0, I)``, otherwise ``p(u) = N(0, Kuu)``.
        .. note:: Currently, only ``whiten=False`` is supported.
    """
    # TODO(VD): allow for both whiten=True and False, currently only support False.
    # Remember u = Luu v, with Kuu = Luu Luu^T and p(v) = N(0, I)
    # so that p(u) = N(0, Luu Luu^T) = N(0, Kuu).
    assert not whiten, "Currently only whiten=False is supported"
    L = tf.shape(
        kernel.feature_coefficients)[0]  # num eigenfunctions  # noqa: F841

    prior_weights = tf.sqrt(kernel.feature_coefficients) * tf.random.normal(
        tf.shape(kernel.feature_coefficients), dtype=default_float())  # [L, 1]

    M, P = tf.shape(q_mu)[0], tf.shape(q_mu)[
        1]  # num inducing, num output heads
    u_sample_noise = tf.matmul(
        q_sqrt,
        tf.random.normal((P, M, 1),
                         dtype=default_float()),  # [P, M, M]  # [P, M, 1]
    )  # [P, M, 1]
    u_sample = q_mu + tf.linalg.matrix_transpose(u_sample_noise[...,
                                                                0])  # [M, P]
    Kmm = Kuu(inducing_variable, kernel, jitter=default_jitter())  # [M, M]
    tf.debugging.assert_equal(tf.shape(Kmm), [M, M])
    phi_Z = kernel.feature_functions(inducing_variable.Z)  # [M, L]
    weight_space_prior_Z = phi_Z @ prior_weights  # [M, 1]
    diff = u_sample - weight_space_prior_Z  # [M, P] -- using implicit broadcasting
    v = compute_A_inv_b(Kmm, diff)  # [M, P]
    tf.debugging.assert_equal(tf.shape(v), [M, P])

    class WilsonSample(Sample):
        def __call__(self, X: TensorType) -> tf.Tensor:
            """
            :param X: evaluation points [N, D]
            :return: function value of sample [N, P]
            """
            N = tf.shape(X)[0]
            phi_X = kernel.feature_functions(X)  # [N, L]
            weight_space_prior_X = phi_X @ prior_weights  # [N, 1]
            Knm = tf.linalg.matrix_transpose(Kuf(inducing_variable, kernel,
                                                 X))  # [N, M]
            function_space_update_X = Knm @ v  # [N, P]

            tf.debugging.assert_equal(tf.shape(weight_space_prior_X), [N, 1])
            tf.debugging.assert_equal(tf.shape(function_space_update_X),
                                      [N, P])

            return weight_space_prior_X + function_space_update_X  # [N, P]

    return WilsonSample()
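
In equations, WilsonSample evaluates f(X) = phi(X) w + Knm Kmm^{-1} (u - phi(Z) w), i.e. a weight-space prior sample plus a function-space update. A self-contained NumPy sketch of that Matheron-style decomposition with toy random Fourier features follows; the names and the feature construction are illustrative, not the gpflux API.

import numpy as np

rng = np.random.default_rng(2)
M, N, L = 6, 20, 100
Z = rng.uniform(-3.0, 3.0, size=(M, 1))
X = np.linspace(-3.0, 3.0, N).reshape(-1, 1)

# Toy random Fourier features approximating a squared-exponential kernel.
omega = rng.normal(size=(1, L))
tau = rng.uniform(0.0, 2.0 * np.pi, size=L)
phi = lambda A: np.sqrt(2.0 / L) * np.cos(A @ omega + tau)

w = rng.normal(size=(L, 1))        # prior weights, one sample
prior = lambda A: phi(A) @ w       # weight-space prior sample

Kmm = phi(Z) @ phi(Z).T + 1e-8 * np.eye(M)   # plays the role of Kuu
Knm = phi(X) @ phi(Z).T                      # plays the role of Kuf transposed
u = rng.normal(size=(M, 1))                  # a draw of the inducing outputs

# Pathwise (Matheron) update: prior sample plus a correction through the inducing points.
f_X = prior(X) + Knm @ np.linalg.solve(Kmm, u - prior(Z))
print(f_X.shape)  # (N, 1)
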
def local_spectrum_approx_conditional_ldf(
    Xnew,
    inducing_variable,
    kernel,
    f,
    *,
    alpha=None,
    full_cov=False,
    full_output_cov=False,
    q_sqrt=None,
    white=True,
):
    """
     - Xnew are the points of the data or minibatch, size N x D (tf.array, 2d)
     - inducing_variable is an instance of inducing_variables.InducingVariable that provides
       `Kuu` and `Kuf` methods for Laplacian Dirichlet features; it contains the limits of
       the bounding box and the selected frequencies, while inducing_variable.remainder holds
       the high frequency components not selected in inducing_variable.
     - f is the value (or mean value) of the features (i.e. the weights)
     - q_sqrt (default None) is the Cholesky factor of the uncertainty about f
       (to be propagated through the conditional as per the GPflow inducing-point implementation)
     - white (defaults to True) specifies whether the whitened representation is used. LDF works a lot
       better with vanilla gradients when whitening has been applied, so it is the default option.

    Given the GP represented by the inducing points specified in `inducing_variable`, produce the mean
    and (co-)variance of the GP at the points Xnew.

       Xnew :: N x D
       Kuu :: M x M
       Kuf :: M x N
       f :: M x K, K = 1
       q_sqrt :: K x M x M, with K = 1
    """
    if full_output_cov:
        raise NotImplementedError

    # num_data = tf.shape(Xnew)[0]  # M
    num_func = tf.shape(f)[1]  # K

    Λ = Kuu(inducing_variable, kernel)  # this is now a LinearOperator
    Φ = Kuf(inducing_variable, kernel, Xnew, alpha=alpha)  # a complex Tensor
    Λr = Kuu(inducing_variable.remainder, kernel)
    Φr = Kuf(inducing_variable.remainder, kernel, Xnew, alpha=alpha)
    Φrm = Kuf(inducing_variable.remainder, kernel, -Xnew, alpha=alpha)

    # compute the covariance due to the conditioning
    if full_cov:
        Λr_inv_Φr = tf.expand_dims(
            tf.complex(real=1 / Λr.diag_part(),
                       imag=tf.zeros_like(Λr.diag_part())), -1) * Φr
        Λr_inv_Φrm = tf.expand_dims(
            tf.complex(real=1 / Λr.diag_part(),
                       imag=tf.zeros_like(Λr.diag_part())), -1) * Φrm
        a = tf.matmul(Φr, Λr_inv_Φr, adjoint_a=True) + tf.matmul(
            Φr, Λr_inv_Φrm, adjoint_a=True)
        b = tf.matmul(Φr, Λr_inv_Φr, adjoint_a=True) - tf.matmul(
            Φr, Λr_inv_Φrm, adjoint_a=True)
        fvar_rr = tf.math.real(a)
        fvar_ii = tf.math.real(b)
        fvar_ir = tf.math.imag(a)
        fvar_ri = tf.math.imag(-b)
        fvar = tf.concat([
            tf.concat([fvar_rr, fvar_ir], -2),
            tf.concat([fvar_ri, fvar_ii], -2)
        ], -1)  # K x 2N x 2N
        shape = (num_func, 1, 1)
    else:
        # ... x M x N -> ... x N
        Λr_inv_Φr = tf.expand_dims(
            tf.complex(real=1 / Λr.diag_part(),
                       imag=tf.zeros_like(Λr.diag_part())), -1) * Φr
        Λr_inv_Φrm = tf.expand_dims(
            tf.complex(real=1 / Λr.diag_part(),
                       imag=tf.zeros_like(Λr.diag_part())), -1) * Φrm
        fvar_rr = tf.reduce_sum(Φr * Λr_inv_Φr, -2) + tf.reduce_sum(
            Φr * Λr_inv_Φrm, -2)
        fvar_ii = tf.reduce_sum(Φr * Λr_inv_Φr, -2) - tf.reduce_sum(
            Φr * Λr_inv_Φrm, -2)
        fvar = tf.concat([tf.math.real(fvar_rr),
                          tf.math.real(fvar_ii)], -1)  # K x 2N
        shape = (num_func, 1)
    fvar = tf.expand_dims(fvar, 0) * tf.ones(
        shape, dtype=gpflow.default_float())  # K x 2N x 2N or K x 2N
    # another backsubstitution in the unwhitened case
    if white:
        A = Λ.cholesky().solve(tf.math.real(Φ))
        B = Λ.cholesky().solve(tf.math.imag(Φ))
    else:
        A = Λ.solve(tf.math.real(Φ))
        B = Λ.solve(tf.math.imag(Φ))

    # construct the conditional mean
    fmean = tf.concat(
        [tf.matmul(A, f, transpose_a=True),
         tf.matmul(B, f, transpose_a=True)], -2)

    if q_sqrt is not None:
        if q_sqrt.shape.ndims == 2:
            # case for q_diag = True
            LTA1 = Diag(tf.linalg.matrix_transpose(q_sqrt)) @ A  # K x M x N
            LTA2 = Diag(tf.linalg.matrix_transpose(q_sqrt)) @ B
        elif q_sqrt.shape.ndims == 3:
            LTA1 = tf.matmul(q_sqrt, A, transpose_a=True)
            LTA2 = tf.matmul(q_sqrt, B, transpose_a=True)
        else:
            raise ValueError("Bad dimension for q_sqrt: %s" %
                             str(q_sqrt.get_shape().ndims))
        if full_cov:
            LTA = tf.concat([LTA1, LTA2], -1)
            fvar = fvar + tf.matmul(LTA, LTA, transpose_a=True)  # K x 2N x 2N
        else:
            LTA = tf.concat([LTA1, LTA2], -1)  # K x M x 2N
            fvar = fvar + tf.reduce_sum(tf.square(LTA), -2)  # K x 2N
    fvar = tf.transpose(fvar)  # 2N x K or 2N x 2N x K

    return fmean, fvar