Example #1
    def predict_f(self,
                  Xnew: InputData,
                  full_cov: bool = False,
                  full_output_cov: bool = False) -> MeanAndVariance:
        """
        Compute the mean and variance of the latent function at some new points.
        Note that this is very similar to the SGPR prediction, for which
        there are notes in the SGPR notebook.

        Note: This model does not allow full output covariances.

        :param Xnew: points at which to predict
        """
        if full_output_cov:
            raise NotImplementedError

        pX = DiagonalGaussian(self.X_data_mean, self.X_data_var)

        Y_data = self.data
        num_inducing = self.inducing_variable.num_inducing
        psi1 = expectation(pX, (self.kernel, self.inducing_variable))
        psi2 = tf.reduce_sum(
            expectation(pX, (self.kernel, self.inducing_variable),
                        (self.kernel, self.inducing_variable)),
            axis=0,
        )
        jitter = default_jitter()
        Kus = covariances.Kuf(self.inducing_variable, self.kernel, Xnew)
        sigma2 = self.likelihood.variance
        sigma = tf.sqrt(sigma2)
        L = tf.linalg.cholesky(
            covariances.Kuu(self.inducing_variable, self.kernel,
                            jitter=jitter))

        A = tf.linalg.triangular_solve(L, tf.transpose(psi1),
                                       lower=True) / sigma
        tmp = tf.linalg.triangular_solve(L, psi2, lower=True)
        AAT = tf.linalg.triangular_solve(L, tf.transpose(tmp),
                                         lower=True) / sigma2
        B = AAT + tf.eye(num_inducing, dtype=default_float())
        LB = tf.linalg.cholesky(B)
        c = tf.linalg.triangular_solve(
            LB, tf.linalg.matmul(A, Y_data), lower=True) / sigma
        tmp1 = tf.linalg.triangular_solve(L, Kus, lower=True)
        tmp2 = tf.linalg.triangular_solve(LB, tmp1, lower=True)
        mean = tf.linalg.matmul(tmp2, c, transpose_a=True)
        if full_cov:
            var = (self.kernel(Xnew) +
                   tf.linalg.matmul(tmp2, tmp2, transpose_a=True) -
                   tf.linalg.matmul(tmp1, tmp1, transpose_a=True))
            shape = tf.stack([1, 1, tf.shape(Y_data)[1]])
            var = tf.tile(tf.expand_dims(var, 2), shape)
        else:
            var = (self.kernel(Xnew, full_cov=False) +
                   tf.reduce_sum(tf.square(tmp2), axis=0) -
                   tf.reduce_sum(tf.square(tmp1), axis=0))
            shape = tf.stack([1, tf.shape(Y_data)[1]])
            var = tf.tile(tf.expand_dims(var, 1), shape)
        return mean + self.mean_function(Xnew), var
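A minimal usage sketch for the method above, assuming it is the predict_f of gpflow.models.BayesianGPLVM in GPflow 2.x (which its body suggests); the data sizes and the PCA initialisation are illustrative only:

import numpy as np
import tensorflow as tf
import gpflow
from gpflow.utilities import ops

rng = np.random.RandomState(0)
Y = rng.randn(40, 5)                                    # observed data, [N, D]
latent_dim, num_inducing = 2, 10

# initialise the variational posterior over the latents q(X)
X_mean_init = ops.pca_reduce(tf.constant(Y), latent_dim)            # [N, Q]
X_var_init = tf.ones((Y.shape[0], latent_dim), dtype=gpflow.default_float())

model = gpflow.models.BayesianGPLVM(
    tf.constant(Y),
    X_data_mean=X_mean_init,
    X_data_var=X_var_init,
    kernel=gpflow.kernels.SquaredExponential(lengthscales=np.ones(latent_dim)),
    num_inducing_variables=num_inducing,
)

Xnew = rng.randn(3, latent_dim)        # new latent locations
mean, var = model.predict_f(Xnew)      # mean: [3, D], var: [3, D] (full_cov=False)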
Example #2
    def compute_qu(self, full_cov: bool = True) -> Tuple[tf.Tensor, tf.Tensor]:
        """
        Computes the mean and variance of q(u) = N(mu, cov), the variational distribution on
        inducing outputs. SVGP with this q(u) should predict identically to
        SGPR.
        The derivation is as follows:
        q(u) = N(u | m, S)
        with:
        S^{-1} = Kuu^{-1} + beta * Kuu^{-1} Kuf Kfu Kuu^{-1}
        m = beta * S Kuu^{-1} Kuf y

        where beta = sigma^{-2}

        :return: mu, cov
        """

        Y_data = self.data

        X_data_mean, X_data_var = self.encoder(Y_data)

        pX = DiagonalGaussian(X_data_mean, X_data_var)

        # num_inducing = self.inducing_variable.num_inducing

        #E_qx[Kfu]
        psi1 = expectation(pX, (self.kernel, self.inducing_variable))
        #E_qx[Kuf@Kfu]
        psi2 = tf.reduce_sum(
            expectation(pX, (self.kernel, self.inducing_variable),
                        (self.kernel, self.inducing_variable)),
            axis=0)

        kuu = covariances.Kuu(self.inducing_variable,
                              self.kernel,
                              jitter=default_jitter())
        kuf = tf.transpose(psi1)

        sig = kuu + psi2 * (self.likelihood.variance**-1)
        sig_sqrt = tf.linalg.cholesky(sig)

        sig_sqrt_kuu = tf.linalg.triangular_solve(sig_sqrt, kuu)
        # [M,M] -> [M(M +1)//2] =/= [M,D]

        cov = tf.linalg.matmul(sig_sqrt_kuu, sig_sqrt_kuu, transpose_a=True)

        err = Y_data - self.mean_function(X_data_mean)

        mu = (tf.linalg.matmul(sig_sqrt_kuu,
                               tf.linalg.triangular_solve(
                                   sig_sqrt, tf.linalg.matmul(kuf, err)),
                               transpose_a=True) / self.likelihood.variance)
        if not full_cov:
            return mu, cov
        else:
            return mu, tf.tile(cov[None, :, :], [mu.shape[-1], 1, 1])
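The closed form in the docstring can be checked directly with NumPy. In this sketch the deterministic Kuf and Kuf Kfu play the role of the psi statistics (which is what they reduce to when the input distribution collapses to point masses); all names and sizes are illustrative assumptions:

import numpy as np

rng = np.random.RandomState(0)
M, N, D = 3, 7, 2
Z, X = rng.randn(M, 1), rng.randn(N, 1)
k = lambda a, b: np.exp(-0.5 * (a - b.T) ** 2)       # toy squared-exponential kernel
Kuu = k(Z, Z) + 1e-8 * np.eye(M)
Kuf = k(Z, X)
y = rng.randn(N, D)
beta = 1.0 / 0.1                                     # beta = sigma^{-2}

sig = Kuu + beta * Kuf @ Kuf.T                       # Kuu + beta * Kuf Kfu
S = Kuu @ np.linalg.solve(sig, Kuu)                  # = (Kuu^{-1} + beta Kuu^{-1} Kuf Kfu Kuu^{-1})^{-1}
m = beta * S @ np.linalg.solve(Kuu, Kuf @ y)         # = beta * S Kuu^{-1} Kuf y

# the Cholesky route used by compute_qu gives the same q(u)
sig_sqrt = np.linalg.cholesky(sig)
sig_sqrt_kuu = np.linalg.solve(sig_sqrt, Kuu)
assert np.allclose(S, sig_sqrt_kuu.T @ sig_sqrt_kuu)
assert np.allclose(m, beta * sig_sqrt_kuu.T @ np.linalg.solve(sig_sqrt, Kuf @ y))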
Example #3
    def custom_predict_f(self,
                         Xnew: InputData,
                         full_cov: bool = False,
                         full_output_cov: bool = False) -> MeanAndVariance:
        """
        Compute the mean and variance of the latent function at some new points.
        Note that this is very similar to the SGPR prediction, for which
        there are notes in the SGPR notebook.

        Note: This model does not allow full output covariances.

        :param Xnew: points at which to predict
        """
        if full_output_cov:
            raise NotImplementedError

        Y_data = self.data

        X_data_mean, X_data_var = self.encoder(Y_data)

        pX = DiagonalGaussian(X_data_mean, X_data_var)

        mu, cov = self.compute_qu()

        jitter = default_jitter()
        Kus = covariances.Kuf(self.inducing_variable, self.kernel, Xnew)
        L = tf.linalg.cholesky(
            covariances.Kuu(self.inducing_variable, self.kernel,
                            jitter=jitter))

        var = cov

        tmp1 = tf.linalg.triangular_solve(L, Kus, lower=True)  #L^{-1} K_{us}
        tmp2 = tf.linalg.triangular_solve(L, mu, lower=True)  # L^{-1} m

        mean = tf.linalg.matmul(
            tmp1, tmp2, transpose_a=True
        )  # K_{su} L^{-T} L^{-1} m = K_{su} K_{uu}^{-1} m
        return mean + self.mean_function(Xnew), var
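The two triangular solves above compute K_{su} K_{uu}^{-1} m without ever forming the inverse. A quick NumPy check of that identity (toy SPD matrix, all names illustrative):

import numpy as np

rng = np.random.RandomState(0)
M, S, D = 4, 3, 2
A = rng.randn(M, M)
Kuu = A @ A.T + 1e-6 * np.eye(M)      # SPD stand-in for Kuu
Kus = rng.randn(M, S)
m = rng.randn(M, D)

L = np.linalg.cholesky(Kuu)
tmp1 = np.linalg.solve(L, Kus)        # L^{-1} K_{us}
tmp2 = np.linalg.solve(L, m)          # L^{-1} m
mean = tmp1.T @ tmp2                  # K_{su} L^{-T} L^{-1} m

assert np.allclose(mean, Kus.T @ np.linalg.solve(Kuu, m))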
Example #4
def dirac_diag():
    return DiagonalGaussian(
        tf.convert_to_tensor(Data.Xmu),
        tf.convert_to_tensor(np.zeros((Data.num_data, Data.D_in))))
Example #5
def gauss_diag():
    return DiagonalGaussian(
        tf.convert_to_tensor(Data.Xmu),
        tf.convert_to_tensor(rng.rand(Data.num_data, Data.D_in)))
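These two fixtures pair a "dirac" (zero-variance) and a proper diagonal Gaussian. The dirac case is useful as a reference in tests because with zero variances the kernel expectations collapse to plain kernel evaluations. A hedged sketch of that property, assuming GPflow 2.x analytic expectations (the shapes are illustrative):

import numpy as np
import tensorflow as tf
import gpflow
from gpflow.expectations import expectation
from gpflow.inducing_variables import InducingPoints
from gpflow.probability_distributions import DiagonalGaussian

rng = np.random.RandomState(0)
Xmu = rng.randn(5, 2)
Z = rng.randn(4, 2)
kernel = gpflow.kernels.SquaredExponential()
feature = InducingPoints(Z)

# point masses at Xmu: zero variance in every latent dimension
dirac = DiagonalGaussian(tf.convert_to_tensor(Xmu),
                         tf.convert_to_tensor(np.zeros_like(Xmu)))
psi1 = expectation(dirac, (kernel, feature))    # E_{p(x)}[K(x, Z)], shape [5, 4]
np.testing.assert_allclose(psi1, kernel(Xmu, Z), atol=1e-9)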
Example #6
def uncertain_conditional_diag(
    Xnew_mu: tf.Tensor,
    Xnew_var: tf.Tensor,
    inducing_variable: InducingVariables,
    kernel: Kernel,
    q_mu,
    q_sqrt,
    *,
    mean_function=None,
    full_output_cov=False,
    full_cov=False,
    white=False,
):
    """
    Calculates the conditional for uncertain inputs Xnew, p(Xnew) = N(Xnew_mu, Xnew_var).
    See ``conditional`` documentation for further reference.
    :param Xnew_mu: mean of the inputs, size [N, D_in]
    :param Xnew_var: diagonal covariance of the inputs (per-dimension variances), size [N, D_in]
    :param inducing_variable: gpflow.InducingVariable object, only InducingPoints is supported
    :param kernel: gpflow kernel object.
    :param q_mu: mean of the inducing variables, size [M, D_out]
    :param q_sqrt: cholesky of the covariance matrix of the inducing variables, size [D_out, M, M]
    :param full_output_cov: boolean whether to compute covariance between output dimensions.
                            Influences the shape of return value ``fvar``. Default is False
    :param white: boolean whether to use whitened representation. Default is False.
    :return fmean, fvar: mean and covariance of the conditional, size ``fmean`` is [N, D_out],
            size ``fvar`` depends on ``full_output_cov``: if True ``fvar`` is [N, D_out, D_out],
            if False then ``fvar`` is [N, D_out]
    """

    if not isinstance(inducing_variable, InducingPoints):
        raise NotImplementedError

    if full_cov:
        raise NotImplementedError(
            "uncertain_conditional() currently does not support full_cov=True")

    pXnew = DiagonalGaussian(Xnew_mu, Xnew_var)

    num_data = tf.shape(Xnew_mu)[0]  # number of new inputs (N)
    num_ind, num_func = tf.unstack(
        tf.shape(q_mu), num=2,
        axis=0)  # number of inducing points (M), output dimension (D)
    q_sqrt_r = tf.linalg.band_part(
        q_sqrt, -1, 0)  # [D, M, M] #taking the lower triangular part

    eKuf = tf.transpose(expectation(
        pXnew, (kernel, inducing_variable)))  # [M, N] (psi1)
    Kuu = covariances.Kuu(inducing_variable, kernel,
                          jitter=default_jitter())  # [M, M]
    Luu = tf.linalg.cholesky(Kuu)  # [M, M]

    if not white:
        q_mu = tf.linalg.triangular_solve(Luu, q_mu, lower=True)
        Luu_tiled = tf.tile(
            Luu[None, :, :],
            [num_func, 1, 1])  # remove line once issue 216 is fixed
        q_sqrt_r = tf.linalg.triangular_solve(Luu_tiled, q_sqrt_r, lower=True)

    Li_eKuf = tf.linalg.triangular_solve(Luu, eKuf, lower=True)  # [M, N]
    fmean = tf.linalg.matmul(Li_eKuf, q_mu, transpose_a=True)

    eKff = expectation(pXnew, kernel)  # N (psi0)
    eKuffu = expectation(pXnew, (kernel, inducing_variable),
                         (kernel, inducing_variable))  # [N, M, M] (psi2)
    Luu_tiled = tf.tile(
        Luu[None, :, :],
        [num_data, 1, 1])  # remove this line, once issue 216 is fixed
    Li_eKuffu = tf.linalg.triangular_solve(Luu_tiled, eKuffu, lower=True)
    Li_eKuffu_Lit = tf.linalg.triangular_solve(Luu_tiled,
                                               tf.linalg.adjoint(Li_eKuffu),
                                               lower=True)  # [N, M, M]
    cov = tf.linalg.matmul(q_sqrt_r, q_sqrt_r, transpose_b=True)  # [D, M, M]

    if mean_function is None or isinstance(mean_function, mean_functions.Zero):
        e_related_to_mean = tf.zeros((num_data, num_func, num_func),
                                     dtype=default_float())
    else:
        # Update mean: \mu(x) + m(x)
        fmean = fmean + expectation(pXnew, mean_function)

        # Calculate: m(x) m(x)^T + m(x) \mu(x)^T + \mu(x) m(x)^T,
        # where m(x) is the mean_function and \mu(x) is fmean
        e_mean_mean = expectation(pXnew, mean_function,
                                  mean_function)  # [N, D, D]
        Lit_q_mu = tf.linalg.triangular_solve(Luu, q_mu, adjoint=True)
        e_mean_Kuf = expectation(pXnew, mean_function,
                                 (kernel, inducing_variable))  # [N, D, M]
        # einsum isn't able to infer the rank of e_mean_Kuf, hence we explicitly set the rank of the tensor:
        e_mean_Kuf = tf.reshape(e_mean_Kuf, [num_data, num_func, num_ind])
        e_fmean_mean = tf.einsum("nqm,mz->nqz", e_mean_Kuf,
                                 Lit_q_mu)  # [N, D, D]
        e_related_to_mean = e_fmean_mean + tf.linalg.adjoint(
            e_fmean_mean) + e_mean_mean

    if full_output_cov:
        fvar = (
            tf.linalg.diag(
                tf.tile((eKff - tf.linalg.trace(Li_eKuffu_Lit))[:, None],
                        [1, num_func])) +
            tf.linalg.diag(tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov)) +
            # tf.linalg.diag(tf.linalg.trace(tf.linalg.matmul(Li_eKuffu_Lit, cov))) +
            tf.einsum("ig,nij,jh->ngh", q_mu, Li_eKuffu_Lit, q_mu) -
            # tf.linalg.matmul(q_mu, tf.linalg.matmul(Li_eKuffu_Lit, q_mu), transpose_a=True) -
            fmean[:, :, None] * fmean[:, None, :] + e_related_to_mean)
    else:
        fvar = (
            (eKff - tf.linalg.trace(Li_eKuffu_Lit))[:, None] +
            tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov)
            # tf.linalg.diag(tf.linalg.trace(tf.linalg.matmul(Li_eKuffu_Lit, cov))) +
            + tf.einsum("ig,nij,jg->ng", q_mu, Li_eKuffu_Lit, q_mu)

            # tf.linalg.matmul(q_mu, tf.linalg.matmul(Li_eKuffu_Lit, q_mu), transpose_a=True) -
            - fmean**2 + tf.linalg.diag_part(e_related_to_mean))

    return fmean, fvar
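A hedged call sketch for the function above; the shapes follow the docstring, while the concrete numbers, the white=True choice and the SquaredExponential kernel are illustrative assumptions:

import numpy as np
import gpflow
from gpflow.inducing_variables import InducingPoints

rng = np.random.RandomState(0)
N, D_in, M, D_out = 6, 2, 4, 3
Xnew_mu = rng.randn(N, D_in)
Xnew_var = rng.rand(N, D_in)                   # per-dimension input variances
Z = rng.randn(M, D_in)
q_mu = rng.randn(M, D_out)
q_sqrt = np.stack([np.tril(rng.randn(M, M)) for _ in range(D_out)])   # [D_out, M, M]

fmean, fvar = uncertain_conditional_diag(
    Xnew_mu, Xnew_var,
    InducingPoints(Z),
    gpflow.kernels.SquaredExponential(),
    q_mu, q_sqrt,
    white=True,
)
# fmean: [N, D_out]; fvar: [N, D_out] because full_output_cov defaults to False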
Example #7

_means = {
    "lin": mf.Linear(A=rng.randn(D_in, D_out), b=rng.randn(D_out)),
    "identity": mf.Identity(input_dim=D_in),
    "const": mf.Constant(c=rng.randn(D_out)),
    "zero": mf.Zero(output_dim=D_out),
}

_distrs = {
    "gauss":
    Gaussian(Xmu, Xcov),
    "dirac_gauss":
    Gaussian(Xmu, np.zeros((num_data, D_in, D_in))),
    "gauss_diag":
    DiagonalGaussian(Xmu, rng.rand(num_data, D_in)),
    "dirac_diag":
    DiagonalGaussian(Xmu, np.zeros((num_data, D_in))),
    "dirac_markov_gauss":
    MarkovGaussian(Xmu_markov, np.zeros((2, num_data + 1, D_in, D_in))),
    "markov_gauss":
    markov_gauss(),
}

_kerns = {
    "rbf":
    kernels.SquaredExponential(variance=rng.rand(),
                               lengthscales=rng.rand() + 1.0),
    "lin":
    kernels.Linear(variance=rng.rand()),
    "matern":
Example #8
def gplvm_build_predict(self, Xnew, X_mean, X_var, Y, variance, full_cov=False):
    if X_var is None:
        # SGPR
        num_inducing = len(self.feature)
        err = Y - self.mean_function(X_mean)
        Kuf = self.feature.Kuf(self.kern, X_mean)
        Kuu = self.feature.Kuu(self.kern, jitter=settings.numerics.jitter_level)
        Kus = self.feature.Kuf(self.kern, Xnew)
        sigma = tf.sqrt(variance)
        L = tf.cholesky(Kuu)
        A = tf.matrix_triangular_solve(L, Kuf, lower=True) / sigma
        B = tf.matmul(A, A, transpose_b=True) + tf.eye(num_inducing, dtype=settings.float_type)
        LB = tf.cholesky(B)
        Aerr = tf.matmul(A, err)
        c = tf.matrix_triangular_solve(LB, Aerr, lower=True) / sigma
        tmp1 = tf.matrix_triangular_solve(L, Kus, lower=True)
        tmp2 = tf.matrix_triangular_solve(LB, tmp1, lower=True)
        mean = tf.matmul(tmp2, c, transpose_a=True)
        if full_cov:
            var = self.kern.K(Xnew) + tf.matmul(tmp2, tmp2, transpose_a=True) \
                  - tf.matmul(tmp1, tmp1, transpose_a=True)
            shape = tf.stack([1, 1, tf.shape(Y)[1]])
            var = tf.tile(tf.expand_dims(var, 2), shape)
        else:
            var = self.kern.Kdiag(Xnew) + tf.reduce_sum(tf.square(tmp2), 0) \
                  - tf.reduce_sum(tf.square(tmp1), 0)
            shape = tf.stack([1, tf.shape(Y)[1]])
            var = tf.tile(tf.expand_dims(var, 1), shape)
        return mean + self.mean_function(Xnew), var

    else:
        # gplvm
        pX = DiagonalGaussian(X_mean, X_var)
        num_inducing = len(self.feature)

        X_cov = tf.matrix_diag(X_var)

        if hasattr(self.kern, 'X_input_dim'):
            psi1 = self.kern.eKxz(self.feature.Z, X_mean, X_cov)
            psi2 = tf.reduce_sum(self.kern.eKzxKxz(self.feature.Z, X_mean, X_cov), 0)
        else:
            psi1 = expectation(pX, (self.kern, self.feature))
            psi2 = tf.reduce_sum(expectation(pX, (self.kern, self.feature), (self.kern, self.feature)), axis=0)

        # psi1 = expectation(pX, (self.kern, self.feature))
        # psi2 = tf.reduce_sum(expectation(pX, (self.kern, self.feature), (self.kern, self.feature)), axis=0)

        Kuu = self.feature.Kuu(self.kern, jitter=settings.numerics.jitter_level)
        Kus = self.feature.Kuf(self.kern, Xnew)
        sigma2 = variance
        sigma = tf.sqrt(sigma2)
        L = tf.cholesky(Kuu)

        A = tf.matrix_triangular_solve(L, tf.transpose(psi1), lower=True) / sigma
        tmp = tf.matrix_triangular_solve(L, psi2, lower=True)
        AAT = tf.matrix_triangular_solve(L, tf.transpose(tmp), lower=True) / sigma2
        B = AAT + tf.eye(num_inducing, dtype=settings.float_type)
        LB = tf.cholesky(B)
        c = tf.matrix_triangular_solve(LB, tf.matmul(A, Y), lower=True) / sigma
        tmp1 = tf.matrix_triangular_solve(L, Kus, lower=True)
        tmp2 = tf.matrix_triangular_solve(LB, tmp1, lower=True)
        mean = tf.matmul(tmp2, c, transpose_a=True)
        if full_cov:
            var = self.kern.K(Xnew) + tf.matmul(tmp2, tmp2, transpose_a=True) \
                  - tf.matmul(tmp1, tmp1, transpose_a=True)
            shape = tf.stack([1, 1, tf.shape(Y)[1]])
            var = tf.tile(tf.expand_dims(var, 2), shape)
        else:
            var = self.kern.Kdiag(Xnew) + tf.reduce_sum(tf.square(tmp2), 0) \
                  - tf.reduce_sum(tf.square(tmp1), 0)
            shape = tf.stack([1, tf.shape(Y)[1]])
            var = tf.tile(tf.expand_dims(var, 1), shape)
        return mean + self.mean_function(Xnew), var
Example #9
def gplvm_build_likelihood(self, X_mean, X_var, Y, variance):
    if X_var is None:
        # SGPR
        num_inducing = len(self.feature)
        num_data = tf.cast(tf.shape(Y)[0], settings.float_type)
        output_dim = tf.cast(tf.shape(Y)[1], settings.float_type)

        err = Y - self.mean_function(X_mean)
        Kdiag = self.kern.Kdiag(X_mean)
        Kuf = self.feature.Kuf(self.kern, X_mean)
        Kuu = self.feature.Kuu(self.kern, jitter=settings.numerics.jitter_level)
        L = tf.cholesky(Kuu)
        sigma = tf.sqrt(variance)

        # Compute intermediate matrices
        A = tf.matrix_triangular_solve(L, Kuf, lower=True) / sigma
        AAT = tf.matmul(A, A, transpose_b=True)
        B = AAT + tf.eye(num_inducing, dtype=settings.float_type)
        LB = tf.cholesky(B)
        Aerr = tf.matmul(A, err)
        c = tf.matrix_triangular_solve(LB, Aerr, lower=True) / sigma

        # compute log marginal bound
        bound = -0.5 * num_data * output_dim * np.log(2 * np.pi)
        bound += tf.negative(output_dim) * tf.reduce_sum(tf.log(tf.matrix_diag_part(LB)))
        bound -= 0.5 * num_data * output_dim * tf.log(variance)
        bound += -0.5 * tf.reduce_sum(tf.square(err)) / variance
        bound += 0.5 * tf.reduce_sum(tf.square(c))
        bound += -0.5 * output_dim * tf.reduce_sum(Kdiag) / variance
        bound += 0.5 * output_dim * tf.reduce_sum(tf.matrix_diag_part(AAT))

        return bound


    else:

        X_cov = tf.matrix_diag(X_var)
        pX = DiagonalGaussian(X_mean, X_var)
        num_inducing = len(self.feature)
        if hasattr(self.kern, 'X_input_dim'):
            psi0 = tf.reduce_sum(self.kern.eKdiag(X_mean, X_cov))
            psi1 = self.kern.eKxz(self.feature.Z, X_mean, X_cov)
            psi2 = tf.reduce_sum(self.kern.eKzxKxz(self.feature.Z, X_mean, X_cov), 0)
        else:
            psi0 = tf.reduce_sum(expectation(pX, self.kern))
            psi1 = expectation(pX, (self.kern, self.feature))
            psi2 = tf.reduce_sum(expectation(pX, (self.kern, self.feature), (self.kern, self.feature)), axis=0)
        Kuu = self.feature.Kuu(self.kern, jitter=settings.numerics.jitter_level)
        L = tf.cholesky(Kuu)
        sigma2 = variance
        sigma = tf.sqrt(sigma2)

        # Compute intermediate matrices
        A = tf.matrix_triangular_solve(L, tf.transpose(psi1), lower=True) / sigma
        tmp = tf.matrix_triangular_solve(L, psi2, lower=True)
        AAT = tf.matrix_triangular_solve(L, tf.transpose(tmp), lower=True) / sigma2
        B = AAT + tf.eye(num_inducing, dtype=settings.float_type)
        LB = tf.cholesky(B)
        log_det_B = 2. * tf.reduce_sum(tf.log(tf.matrix_diag_part(LB)))
        c = tf.matrix_triangular_solve(LB, tf.matmul(A, Y), lower=True) / sigma

        # KL[q(x) || p(x)]
        # dX_var = self.X_var if len(self.X_var.get_shape()) == 2 else tf.matrix_diag_part(self.X_var)
        # NQ = tf.cast(tf.size(self.X_mean), settings.float_type)
        D = tf.cast(tf.shape(Y)[1], settings.float_type)
        # KL = -0.5 * tf.reduce_sum(tf.log(dX_var)) \
        #      + 0.5 * tf.reduce_sum(tf.log(self.X_prior_var)) \
        #      - 0.5 * NQ \
        #      + 0.5 * tf.reduce_sum((tf.square(self.X_mean - self.X_prior_mean) + dX_var) / self.X_prior_var)

        # compute log marginal bound
        ND = tf.cast(tf.size(Y), settings.float_type)
        bound = -0.5 * ND * tf.log(2 * np.pi * sigma2)
        bound += -0.5 * D * log_det_B
        bound += -0.5 * tf.reduce_sum(tf.square(Y)) / sigma2
        bound += 0.5 * tf.reduce_sum(tf.square(c))
        bound += -0.5 * D * (tf.reduce_sum(psi0) / sigma2 -
                             tf.reduce_sum(tf.matrix_diag_part(AAT)))
        # bound -= KL # don't need this term
        return bound
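The bound assembly above leans on the Cholesky identity log|B| = 2 * sum(log(diag(chol(B)))) (the log_det_B line). A quick NumPy check of that identity on a toy SPD matrix:

import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(5, 5)
B = A @ A.T + np.eye(5)                          # SPD toy matrix
LB = np.linalg.cholesky(B)
log_det_B = 2.0 * np.sum(np.log(np.diag(LB)))
assert np.allclose(log_det_B, np.linalg.slogdet(B)[1])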
Example #10
    def elbo(self) -> tf.Tensor:
        """
        Construct a tensorflow function to compute the bound on the marginal
        likelihood.
        """  

        # define a set of vectorized helper functions for use in `tf.vectorized_map`

        # take the outer product of a pair of rows
        @tf.function
        def row_outer_product(args):
            a, b = args
            a = tf.expand_dims(a, -1)
            b = tf.expand_dims(b, -1)
            return a @ tf.transpose(b)

        # repeat matrix A N times on a newly created first axis 
        # so the new shape is [N, A.shape] 
        @tf.function
        def repeat_N(A):
            return tf.repeat(tf.expand_dims(A, 0), self.N, axis=0)

        @tf.function
        def triang_solve(args):
            L, rhs = args
            return tf.linalg.triangular_solve(L, rhs)

        @tf.function
        def triang_solve_transpose(args):
            L, rhs = args
            return tf.linalg.triangular_solve(tf.transpose(L), rhs, lower=False)

        @tf.function
        def matmul_vectorized(args):
            A, B = args
            return tf.matmul(A, B)

        # [N, D, M, M] --> [N]
        # each term is sum_{d=1}^D Tr[M, M]
        # arg: [D, M, M], needs to be squared
        @tf.function
        def sum_d_trace(arg):
            trace_D = tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), arg)
            return tf.reduce_sum(trace_D)

        # trace of a matrix
        @tf.function
        def trace_tf(A):
            return tf.reduce_sum(tf.linalg.diag_part(A))

        Y = self.data

        qXs = DiagonalGaussian(self.Xs_mean, self.Xs_var)
        psi0s = expectation(qXs, self.kernel_s)
        psi1s = expectation(qXs, (self.kernel_s, self.Zs))
        psi2s = expectation(qXs, (self.kernel_s, self.Zs), (self.kernel_s, self.Zs))
        cov_uu_s = covariances.Kuu(self.Zs, self.kernel_s, jitter=default_jitter())
        Ls = tf.linalg.cholesky(cov_uu_s)
        Ls = repeat_N(Ls) # [N x M x M]

        # loop over k, for each k use kernel_K[k] and qXp, compute psi0k, psi1k, psi2k, then store the psi statistics for all k together
        # for each k: psi0[:, k] = psi0k, psi1[:, :, k] = psi1k, psi2[:, :, :, k] = psi2k
        # psi0 is [N, K] so psi0[n, k] gives a real value
        # psi1 is [N, M, K], so psi1[n, :, k] gives us a M-vector
        # psi2 is [N, M, M, K], so psi2[n, :, :, k] gives us a [M x M] matrix
        qXp = DiagonalGaussian(self.Xp_mean, self.Xp_var)
        psi0k = []
        psi1k = []
        psi2k = []
        psi2ks = []
        psi2sk = []
        for k, kernel_k in enumerate(self.kernel_K):
            psi0 = expectation(qXp, kernel_k)
            psi1 = expectation(qXp, (kernel_k, self.Zp))
            psi2 = expectation(qXp, (kernel_k, self.Zp), (kernel_k, self.Zp))
            psi0k.append(psi0)            
            psi1k.append(psi1)
            psi2k.append(psi2)
            # add the cross-covariance terms, require computation separately for each n
            psi2sk.append(tf.vectorized_map(row_outer_product, (psi1s, psi1)))
            #psi2ks.append(tf.vectorized_map(row_outer_product, (psi1, psi1s)))
        psi0k = tf.stack(psi0k, axis=-1)
        psi1k = tf.stack(psi1k, axis=-1)
        psi2k = tf.stack(psi2k, axis=-1)
        psi2sk = tf.stack(psi2sk, axis=-1)
        #psi2ks = tf.stack(psi2ks, axis=-1)  

        # make K cov_uu_k using Zp and kernel_k
        # K cholesky, repeat N times for later use
        # L is [N x M x M x K]
        # these are the Kuu matrices
        Lk = []
        for k, kernel_k in enumerate(self.kernel_K):
            cov_uu_k = covariances.Kuu(self.Zp, kernel_k, jitter=default_jitter())
            Lk.append(tf.linalg.cholesky(cov_uu_k))
        Lk = tf.stack(Lk, axis=-1)
        Lk = repeat_N(Lk)
        
        sigma2 = self.likelihood.variance
        jitter_mtx = 1e-10 * tf.eye(self.M, dtype=default_float())

        tmp = tf.vectorized_map(triang_solve, (Ls, psi2s))
        As = tf.vectorized_map(triang_solve_transpose, (Ls, tmp)) # \inv{Kuu^s} * Psi2s: [N, M, M]

        LBs = tf.vectorized_map(lambda x: tf.linalg.cholesky(x + jitter_mtx), psi2s) # [N, M, M]  
        tmp1 = tf.vectorized_map(triang_solve, (Ls, LBs)) # [N, M, M]
        Cs = tf.vectorized_map(triang_solve_transpose, (Ls, tmp1)) # sqrt(\inv{Kuu^s} * Psi2s * \inv{Kuu^s}): [N, M, M]
        Ds = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_mu_s)), Cs)) # sqrt(Ms^T * \inv{Kuu^s} * Psi2s * \inv{Kuu^s} * Ms): [N, D, M]

        Fs = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_sqrt_s, perm=[0, 2, 1])), Cs)) # sqrt(Ss * \inv{Kuu^s} * Psi2s * \inv{Kuu^s}): [N, D, M, M]

        tmp2 = tf.vectorized_map(triang_solve, (Ls, repeat_N(self.q_mu_s)))
        Es = tf.vectorized_map(triang_solve_transpose, (Ls, tmp2)) # \inv{Kuu^s} * Ms: [N, M, D]
        tmp3 = tf.vectorized_map(row_outer_product, (Y, psi1s)) # Y^T * Psi1: [N, D, M]
        Gs = tf.vectorized_map(matmul_vectorized, (tmp3, Es)) # Y^T * Psi1s * \inv{Kuu^s} * Ms: [N, D, D]

        Fq = []
        Yn2 = tf.reduce_sum(tf.square(Y), axis=1)
        for k in range(self.K):
            tmp = tf.vectorized_map(triang_solve, (Lk[..., k], psi2k[..., k])) # [N, M, M]
            Ak = tf.vectorized_map(triang_solve_transpose, (Lk[..., k], tmp)) # \inv{Kuu^k} * Psi2k: [N, M, M]

            LBk = tf.vectorized_map(lambda x: tf.linalg.cholesky(x + jitter_mtx), psi2k[..., k]) # [N, M, M]  
            tmp1k = tf.vectorized_map(triang_solve, (Lk[..., k], LBk)) # [N, M, M]
            Ck = tf.vectorized_map(triang_solve_transpose, (Lk[..., k], tmp1k)) # sqrt(\inv{Kuu^k} * Psi2k * \inv{Kuu^k}): [N, M, M]
            Dk = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_mu[k])), Ck)) # sqrt(Mk^T * \inv{Kuu^k} * Psi2k * \inv{Kuu^k} * Mk): [N, D, M]

            # q_sqrt is already the cholesky
            Fk = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_sqrt[k], perm=[0, 2, 1])), Ck)) # sqrt(Sk * \inv{Kuu^k} * Psi2k * \inv{Kuu^k}): [N, D, M, M]

            tmp2 = tf.vectorized_map(triang_solve, (Lk[..., k], repeat_N(self.q_mu[k])))
            Ek = tf.vectorized_map(triang_solve_transpose, (Lk[..., k], tmp2)) # \inv{Kuu^k} * Mk: [N, M, D]
            tmp3 = tf.vectorized_map(row_outer_product, (Y, psi1k[..., k])) # Y^T * Psi1k: [N, D, M]
            Gk = tf.vectorized_map(matmul_vectorized, (tmp3, Ek)) # Y^T * Psi1k * \inv{Kuu^k} * Mk: [N, D, D]

            # compute the cross terms 
            tmp1sk = tf.vectorized_map(triang_solve, (Ls, psi2sk[..., k]))
            tmp2sk = tf.vectorized_map(triang_solve_transpose, (Ls, tmp1sk)) # \inv{Kuu^s} * Psi2sk: [N, M, M]
            tmp3sk = tf.vectorized_map(matmul_vectorized, (tmp2sk, Ek)) # \inv{Kuu^s} * Psi2sk * \inv{Kuu^k} * Mk: [N, M, D]
            Dsk = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_mu_s)), tmp3sk)) # Ms^T * \inv{Kuu^s} * Psi2sk * \inv{Kuu^k} * Mk: [N, D, D]

            # compute the lower bound
            # each term added here is length-N vector, each entry representing \sum_{d=1}^D Fdnk for a particular n, k
            Fnk = -0.5 * Yn2 / sigma2
            Fnk += tf.vectorized_map(trace_tf, Gs + Gk) / sigma2
            Fnk += -0.5 * tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), Ds) / sigma2
            Fnk += -0.5 * tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), Dk) / sigma2
            # the sum of the traces of the two cross terms is twice the trace of one, since they are transposes of one another
            Fnk += - tf.vectorized_map(trace_tf, Dsk) / sigma2 
            Fnk += 0.5 * self.D * tf.vectorized_map(trace_tf, As + Ak)  / sigma2 
            Fnk += -0.5 * tf.vectorized_map(sum_d_trace, Fs) / sigma2
            Fnk += -0.5 * tf.vectorized_map(sum_d_trace, Fk) / sigma2

            Fq.append(Fnk)

        Fq = tf.stack(Fq, axis=-1) # [N, K]
        # psi0 is already [N, K]
        Fq += -0.5 * self.D * (tf.repeat(tf.expand_dims(psi0s, -1), self.K, axis=1) + psi0k) / sigma2
        Fq += -0.5 * self.D * tf.math.log(2 * np.pi * sigma2)

        # weight each entry by the mixture responsibility, then sum over N, K
        bound = tf.reduce_sum(Fq * self.pi)

        # compute KL 
        KL_p = self.kl_mvn(self.Xp_mean, self.Xp_var, self.Xp_prior_mean, self.Xp_prior_var)
        KL_c = self.kl_categorical(self.pi, self.pi_prior)
        KL_s = self.kl_mvn(self.Xs_mean, self.Xs_var, self.Xs_prior_mean, self.Xs_prior_var)
        
        prior_Kuu_s = covariances.Kuu(self.Zs, self.kernel_s, jitter=default_jitter())
        KL_us = kullback_leiblers.gauss_kl(q_mu=self.q_mu_s, q_sqrt=self.q_sqrt_s, K=prior_Kuu_s)
        KL_uk = 0
        for k in range(self.K):
            prior_Kuu_k = covariances.Kuu(self.Zp, self.kernel_K[k], jitter=default_jitter())
            KL_uk += kullback_leiblers.gauss_kl(q_mu=self.q_mu[k], q_sqrt=self.q_sqrt[k], K=prior_Kuu_k)
        bound += - KL_s - KL_p - KL_us - KL_uk - KL_c

        return bound
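The helper functions above exist because tf.vectorized_map requires every operand to carry the batch dimension N on axis 0, hence repeat_N for per-dataset matrices such as the Cholesky factors. A small self-contained sketch of that pattern (toy shapes, illustrative only):

import numpy as np
import tensorflow as tf

rng = np.random.RandomState(0)
N, M = 4, 3
L = tf.constant(np.tril(rng.rand(M, M)) + np.eye(M))   # one shared [M, M] Cholesky-like factor
rhs = tf.constant(rng.rand(N, M, M))                    # one [M, M] right-hand side per point

L_rep = tf.repeat(tf.expand_dims(L, 0), N, axis=0)      # [N, M, M], cf. repeat_N
solved = tf.vectorized_map(
    lambda args: tf.linalg.triangular_solve(args[0], args[1]),
    (L_rep, rhs),
)                                                       # [N, M, M]

# same result as solving one point at a time
expected = tf.stack([tf.linalg.triangular_solve(L, rhs[n]) for n in range(N)])
np.testing.assert_allclose(solved.numpy(), expected.numpy())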
Example #11
    def elbo(self) -> tf.Tensor:
        """
        Construct a tensorflow function to compute the bound on the marginal
        likelihood.
        """  

        # define a set of vectorized helper functions for use in `tf.vectorized_map`

        # take the outer product of a pair of rows
        @tf.function
        def row_outer_product(args):
            a, b = args
            a = tf.expand_dims(a, -1)
            b = tf.expand_dims(b, -1)
            return a @ tf.transpose(b)

        # repeat matrix A N times on a newly created first axis 
        # so the new shape is [N, A.shape] 
        @tf.function
        def repeat_N(A):
            return tf.repeat(tf.expand_dims(A, 0), self.N, axis=0)

        @tf.function
        def triang_solve(args):
            L, rhs = args
            return tf.linalg.triangular_solve(L, rhs)

        @tf.function
        def triang_solve_transpose(args):
            L, rhs = args
            return tf.linalg.triangular_solve(tf.transpose(L), rhs, lower=False)

        @tf.function
        def matmul_vectorized(args):
            A, B = args
            return tf.matmul(A, B)

        # [N, D, M, M] --> [N]
        # each term is sum_{d=1}^D Tr[M, M]
        # arg: [D, M, M], needs to be squared
        @tf.function
        def sum_d_trace(arg):
            trace_D = tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), arg)
            return tf.reduce_sum(trace_D)

        # trace of a matrix
        @tf.function
        def trace_tf(A):
            return tf.reduce_sum(tf.linalg.diag_part(A))


        Y = self.data

        # specify qXp, the variational distribution q(X): each x_n is independent w/ N(x_n | \mu_n, S_n)
        # \mu_n \in R^q given by each row of `X_data_mean`
        # S_n \in R^qxq diagonal, so equivalently given by each row of `X_data_var`
        qXp = DiagonalGaussian(self.Xp_mean, self.Xp_var)

        # if split space, specify qXs
        # compute psi statistics for the shared space, keep the original shape of psi statistics, use qXs and kernel_s
        # psi0s is N-vector
        # psi1s is [N, M]
        # psi2s is [N, M, M]
        # also compute the covariance matrix Kuu for the shared space
        if self.split_space:
            qXs = DiagonalGaussian(self.Xs_mean, self.Xs_var)
            psi0s = expectation(qXs, self.kernel_s)
            psi1s = expectation(qXs, (self.kernel_s, self.Zs))
            psi2s = expectation(qXs, (self.kernel_s, self.Zs), (self.kernel_s, self.Zs))
            cov_uu_s = covariances.Kuu(self.Zs, self.kernel_s, jitter=default_jitter())


        # loop over k, for each k use kernel_K[k] and qXp, compute psi0k, psi1k, psi2k, then store the psi statistics for all k together
        # for each k: if no shared space, then psi0[:, k] = psi0k, psi1[:, :, k] = psi1k, psi2[:, :, :, k] = psi2k
        # if have shared space, then psi0[:, k] = psi0s + psi0k, psi1[:, :, k] = psi1s + psi1k
        # psi2[:, :, :, k] = psi2s + psi2k (the cross terms are added later)
        # then, for each n, psi2[n, :, :, k] = psi1s[n, :]^T dot psi1k[n, :] + psi1k[n, :]^T dot psi1s[n, :] (both are [M, M])
        # psi0 is [N, K] so psi0[n, k] gives a real value
        # psi1 is [N, M, K], so psi1[n, :, k] gives us a M-vector
        # psi2 is [N, M, M, K], so psi2[n, :, :, k] gives us a [M x M] matrix
        psi0 = []
        psi1 = []
        psi2 = []
        for k, kernel_k in enumerate(self.kernel_K):
            psi0k = expectation(qXp, kernel_k)
            psi1k = expectation(qXp, (kernel_k, self.Zp))
            psi2k = expectation(qXp, (kernel_k, self.Zp), (kernel_k, self.Zp))
            if self.split_space:
                psi0.append(psi0s + psi0k)            
                psi1.append(psi1s + psi1k)
                # add the cross-covariance terms, require computation separately for each n
                sxk = tf.vectorized_map(row_outer_product, (psi1s, psi1k))
                kxs = tf.vectorized_map(row_outer_product, (psi1k, psi1s))
                psi2.append(psi2s + psi2k + sxk + kxs)
            else:
                psi0.append(psi0k)
                psi1.append(psi1k)
                psi2.append(psi2k)
        psi0 = tf.stack(psi0, axis=-1)
        psi1 = tf.stack(psi1, axis=-1)
        psi2 = tf.stack(psi2, axis=-1)

        # make K cov_uu_k using Zp and kernel_k
        # K cholesky, repeat N times for later use
        # L is [N x M x M x K]
        # these are the Kuu matrices
        L = []
        for k, kernel_k in enumerate(self.kernel_K):
            cov_uu_k = covariances.Kuu(self.Zp, kernel_k, jitter=default_jitter())
            if self.split_space:
                L.append(tf.linalg.cholesky(cov_uu_s + cov_uu_k))
            else:
                L.append(tf.linalg.cholesky(cov_uu_k))
        L = tf.stack(L, axis=-1)
        L = repeat_N(L)
        sigma2 = self.likelihood.variance


        # self.pred_Y = []

        # use `tf.vectorized_map` to avoid writing a loop over N, but it requires every matrix to have N on axis 0
        # so we need to repeat certain matrices that are the same for all N (e.g. L)
        # note we can use `tf.vectorized_map` because the computations are decomposable for each n,
        # i.e. they can be computed in any order over n
        Fq = []
        Yn2 = tf.reduce_sum(tf.square(Y), axis=1)
        for k in range(self.K):
            # compute intermediate matrices for easier computation involving \inv{Kuu}
            # A is the same as AAT in gplvm, transposing L is the correct thing to do
            # but the two end up being the same since we only care about the trace
            tmp = tf.vectorized_map(triang_solve, (L[..., k], psi2[..., k])) # [N, M, M]
            A = tf.vectorized_map(triang_solve_transpose, (L[..., k], tmp)) # \inv{Kuu} * Psi2: [N, M, M]

            #pos_def = tf.vectorized_map(lambda x: is_pos_def(x), psi2[..., k])
            #print(np.all(pos_def))
            # psi2 is not produced with `covariances.Kuu`, but it should still be PD
            # we should add jitter before doing cholesky
            #jitter_mtx = default_jitter() * tf.eye(self.M, dtype=default_float())
            jitter_mtx = 1e-10 * tf.eye(self.M, dtype=default_float())
            LB = tf.vectorized_map(lambda x: tf.linalg.cholesky(x + jitter_mtx), psi2[..., k]) # [N, M, M]  
            tmp1 = tf.vectorized_map(triang_solve, (L[..., k], LB)) # [N, M, M]
            C = tf.vectorized_map(triang_solve_transpose, (L[..., k], tmp1)) # sqrt(\inv{Kuu} * Psi2 * \inv{Kuu}): [N, M, M]

            D = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_mu[k])), C)) # sqrt(M^T * \inv{Kuu} * Psi2 * \inv{Kuu} * M): [N, D, M]

            tmp2 = tf.vectorized_map(triang_solve, (L[..., k], repeat_N(self.q_mu[k])))
            E = tf.vectorized_map(triang_solve_transpose, (L[..., k], tmp2)) # \inv{Kuu} * M: [N, M, D]

            # q_sqrt is already the cholesky
            F = tf.vectorized_map(matmul_vectorized, (repeat_N(tf.transpose(self.q_sqrt[k], perm=[0, 2, 1])), C)) # sqrt(S * \inv{Kuu} * Psi2 * \inv{Kuu}): [N, D, M, M]

            tmp3 = tf.vectorized_map(row_outer_product, (Y, psi1[..., k])) # Y^T * Psi1: [N, D, M]
            G = tf.vectorized_map(matmul_vectorized, (tmp3, E)) # Y^T * Psi1 * \inv{Kuu} * M: [N, D, D]

            # for debugging 
            # self.pred_Y.append(tf.reshape(tf.vectorized_map(matmul_vectorized, (tf.expand_dims(psi1[..., k], 1), E)), (self.N, self.D))) # Psi1 * \inv{Kuu} * M: [N, D]

            # compute the lower bound
            # each term added here is length-N vector, each entry representing \sum_{d=1}^D Fdnk for a particular n, k
            Fnk = -0.5 * Yn2 / sigma2
            Fnk += tf.vectorized_map(lambda x: trace_tf(x), G) / sigma2
            Fnk += -0.5 * tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)), D) / sigma2
            Fnk += 0.5 * self.D * tf.vectorized_map(lambda x: trace_tf(x), A)  / sigma2 
            Fnk += -0.5 * tf.vectorized_map(lambda x: sum_d_trace(x), F) / sigma2

            Fq.append(Fnk)

        Fq = tf.stack(Fq, axis=-1) # [N, K]
        # psi0 is already [N, K]
        Fq += -0.5 * self.D * psi0 / sigma2
        Fq += -0.5 * self.D * tf.math.log(2 * np.pi * sigma2)

        # for debugging 
        #self.Fq = Fq
        # self.pred_Y = tf.stack(self.pred_Y, axis=-1) # [N, D, K]

        # weight each entry by the mixture responsibility, then sum over N, K
        bound = tf.reduce_sum(Fq * self.pi)

        # compute KL 
        KL_p = self.kl_mvn(self.Xp_mean, self.Xp_var, self.Xp_prior_mean, self.Xp_prior_var)
        KL_c = self.kl_categorical(self.pi, self.pi_prior)
        KL_u = 0
        prior_Kuu = np.zeros((self.M, self.M))
        if self.split_space:
            KL_s = self.kl_mvn(self.Xs_mean, self.Xs_var, self.Xs_prior_mean, self.Xs_prior_var)
            bound += - KL_s
            prior_Kuu += covariances.Kuu(self.Zs, self.kernel_s, jitter=default_jitter())
        for k in range(self.K):
            prior_Kuu_k = covariances.Kuu(self.Zp, self.kernel_K[k], jitter=default_jitter())
            KL_u += kullback_leiblers.gauss_kl(q_mu=self.q_mu[k], q_sqrt=self.q_sqrt[k], K=prior_Kuu+prior_Kuu_k)
        bound += - KL_p - KL_u - KL_c

        return bound
Example #12
    def elbo(self) -> tf.Tensor:
        """
        Construct a tensorflow function to compute the bound on the marginal
        likelihood.
        """
        Y_data = self.data

        X_data_mean, X_data_var = self.encoder(Y_data)

        pX = DiagonalGaussian(X_data_mean, X_data_var)

        num_inducing = self.inducing_variable.num_inducing
        psi0 = tf.reduce_sum(expectation(pX, self.kernel))
        psi1 = expectation(pX, (self.kernel, self.inducing_variable))
        psi2 = tf.reduce_sum(
            expectation(pX, (self.kernel, self.inducing_variable),
                        (self.kernel, self.inducing_variable)),
            axis=0)

        cov_uu = covariances.Kuu(self.inducing_variable,
                                 self.kernel,
                                 jitter=default_jitter())
        L = tf.linalg.cholesky(cov_uu)
        sigma2 = self.likelihood.variance
        sigma = tf.sqrt(sigma2)

        # Compute intermediate matrices
        A = tf.linalg.triangular_solve(L, tf.transpose(psi1),
                                       lower=True) / sigma
        tmp = tf.linalg.triangular_solve(L, psi2, lower=True)
        AAT = tf.linalg.triangular_solve(L, tf.transpose(tmp),
                                         lower=True) / sigma2
        B = AAT + tf.eye(num_inducing, dtype=default_float())
        LB = tf.linalg.cholesky(B)
        log_det_B = 2.0 * tf.reduce_sum(tf.math.log(tf.linalg.diag_part(LB)))
        c = tf.linalg.triangular_solve(
            LB, tf.linalg.matmul(A, Y_data), lower=True) / sigma

        # KL[q(x) || p(x)]
        dX_data_var = (X_data_var if X_data_var.shape.ndims == 2 else
                       tf.linalg.diag_part(X_data_var))
        NQ = to_default_float(tf.size(X_data_mean))
        D = to_default_float(tf.shape(Y_data)[1])
        KL = -0.5 * tf.reduce_sum(tf.math.log(dX_data_var))
        KL += 0.5 * tf.reduce_sum(tf.math.log(self.X_prior_var))
        KL -= 0.5 * NQ
        KL += 0.5 * tf.reduce_sum(
            (tf.square(X_data_mean - self.X_prior_mean) + dX_data_var) /
            self.X_prior_var)

        self.loss_placeholder["KL_x"].append(KL.numpy())

        # compute log marginal bound
        ND = to_default_float(tf.size(Y_data))
        bound = -0.5 * ND * tf.math.log(2 * np.pi * sigma2)
        bound += -0.5 * D * log_det_B
        bound += -0.5 * tf.reduce_sum(tf.square(Y_data)) / sigma2
        bound += 0.5 * tf.reduce_sum(tf.square(c))
        bound += -0.5 * D * (tf.reduce_sum(psi0) / sigma2 -
                             tf.reduce_sum(tf.linalg.diag_part(AAT)))
        bound -= KL

        self.loss_placeholder["ELBO"].append(bound.numpy())

        return bound
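The KL[q(x) || p(x)] block above is the closed form for a factorised Gaussian q(x) against a diagonal Gaussian prior. A small NumPy check of that identity (toy values, illustrative only):

import numpy as np

rng = np.random.RandomState(0)
N, Q = 4, 2
m, s = rng.randn(N, Q), rng.rand(N, Q)        # q(x): means and variances
m0, s0 = np.zeros((N, Q)), np.ones((N, Q))    # p(x): prior means and variances

kl = (-0.5 * np.sum(np.log(s))
      + 0.5 * np.sum(np.log(s0))
      - 0.5 * m.size
      + 0.5 * np.sum(((m - m0) ** 2 + s) / s0))

# textbook per-dimension form: 0.5 * (log(s0 / s) + (s + (m - m0)^2) / s0 - 1)
kl_ref = 0.5 * np.sum(np.log(s0 / s) + (s + (m - m0) ** 2) / s0 - 1.0)
assert np.allclose(kl, kl_ref)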
Example #13
class Data:
    rng = np.random.RandomState(1)
    num_data = 5
    num_ind = 4
    D_in = 2
    D_out = 2

    Xmu = rng.randn(num_data, D_in)
    L = gen_L(rng, num_data, D_in, D_in)
    Xvar = np.array([l @ l.T for l in L])
    Z = rng.randn(num_ind, D_in)

    # distributions don't need to be compiled (No Parameter objects)
    # but the members should be Tensors created in the same graph
    graph = tf.Graph()
    with test_util.session_context(graph) as sess:
        gauss = Gaussian(tf.constant(Xmu), tf.constant(Xvar))
        dirac = Gaussian(tf.constant(Xmu),
                         tf.constant(np.zeros((num_data, D_in, D_in))))
        gauss_diag = DiagonalGaussian(tf.constant(Xmu),
                                      tf.constant(rng.rand(num_data, D_in)))
        dirac_diag = DiagonalGaussian(tf.constant(Xmu),
                                      tf.constant(np.zeros((num_data, D_in))))
        dirac_markov_gauss = MarkovGaussian(
            tf.constant(Xmu), tf.constant(np.zeros((2, num_data, D_in, D_in))))

        # create the covariance for the pairwise markov-gaussian
        dummy_gen = lambda rng, n, *shape: np.array(
            [rng.randn(*shape) for _ in range(n)])
        L_mg = dummy_gen(rng, num_data, D_in, 2 * D_in)  # N+1 x D x 2D
        LL = np.concatenate((L_mg[:-1], L_mg[1:]), 1)  # N x 2D x 2D
        Xcov = LL @ np.transpose(LL, (0, 2, 1))
        Xc = np.concatenate((Xcov[:, :D_in, :D_in], Xcov[-1:, D_in:, D_in:]),
                            0)  # N+1 x D x D
        Xcross = np.concatenate(
            (Xcov[:, :D_in, D_in:], np.zeros(
                (1, D_in, D_in))), 0)  # N+1 x D x D
        Xcc = np.stack([Xc, Xcross])  # 2 x N+1 x D x D

        markov_gauss = MarkovGaussian(Xmu, Xcc)

    with gpflow.decors.defer_build():
        # features
        ip = features.InducingPoints(Z)
        # kernels
        rbf_prod_seperate_dims = kernels.Product([
            kernels.RBF(1,
                        variance=rng.rand(),
                        lengthscales=rng.rand(),
                        active_dims=[0]),
            kernels.RBF(1,
                        variance=rng.rand(),
                        lengthscales=rng.rand(),
                        active_dims=[1])
        ])

        rbf_lin_sum = kernels.Sum([
            kernels.RBF(D_in, variance=rng.rand(), lengthscales=rng.rand()),
            kernels.RBF(D_in, variance=rng.rand(), lengthscales=rng.rand()),
            kernels.Linear(D_in, variance=rng.rand())
        ])

        rbf = kernels.RBF(D_in, variance=rng.rand(), lengthscales=rng.rand())

        lin_kern = kernels.Linear(D_in, variance=rng.rand())

        # mean functions
        lin = mean_functions.Linear(rng.rand(D_in, D_out), rng.rand(D_out))
        iden = mean_functions.Identity(
            D_in)  # Note: Identity can only be used if Din == Dout
        zero = mean_functions.Zero(output_dim=D_out)
        const = mean_functions.Constant(rng.rand(D_out))