Example #1
# Imports assumed by this snippet (GPflow 1.x style); the Layer base class
# comes from the surrounding project and is not shown here.
import numpy as np
import tensorflow as tf

from gpflow import settings, transforms
from gpflow.decors import params_as_tensors
from gpflow.features import InducingPoints
from gpflow.params import Parameter


class SVGP_Layer(Layer):
    def __init__(self,
                 layer_id,
                 kern,
                 U,
                 Z,
                 num_outputs,
                 mean_function,
                 white=False,
                 **kwargs):
        """
        A sparse variational GP layer in whitened representation. This layer holds the kernel,
        variational parameters, inducing points and mean function.
        The underlying model at inputs X is
        f = Lv + mean_function(X), where v \sim N(0, I) and LL^T = kern.K(X)
        The variational distribution over the inducing points is
        q(v) = N(q_mu, q_sqrt q_sqrt^T)
        The layer holds D_out independent GPs with the same kernel and inducing points.
        :param kern: The kernel for the layer (input_dim = D_in)
        :param Z: Inducing points (M, D_in)
        :param num_outputs: The number of GP outputs (q_mu is shape (M, num_outputs))
        :param mean_function: The mean function
        :return:
        """
        Layer.__init__(self, layer_id, U, num_outputs, **kwargs)

        # Fall back to a small random initialization when no inducing points are provided

        self.dim_in = U[0].shape[1] if layer_id == 0 else num_outputs
        self.Z = Z if Z is not None else np.random.normal(
            0, 0.01, (100, self.dim_in))

        self.num_inducing = self.Z.shape[0]

        q_mu = np.zeros((self.num_inducing, num_outputs))
        self.q_mu = Parameter(q_mu)

        q_sqrt = np.tile(
            np.eye(self.num_inducing)[None, :, :], [num_outputs, 1, 1])
        transform = transforms.LowerTriangular(self.num_inducing,
                                               num_matrices=num_outputs)
        self.q_sqrt = Parameter(q_sqrt, transform=transform)

        self.feature = InducingPoints(self.Z)
        self.kern = kern
        self.mean_function = mean_function

        self.num_outputs = num_outputs
        self.white = white

        if not self.white:  # initialize to prior
            Ku = self.kern.compute_K_symm(self.Z)
            Lu = np.linalg.cholesky(Ku +
                                    np.eye(self.Z.shape[0]) * settings.jitter)
            self.q_sqrt = np.tile(Lu[None, :, :], [num_outputs, 1, 1])

        self.needs_build_cholesky = True

    @params_as_tensors
    def build_cholesky_if_needed(self):
        # make sure we only compute this once
        if self.needs_build_cholesky:
            self.Ku = self.feature.Kuu(self.kern, jitter=settings.jitter)
            self.Lu = tf.cholesky(self.Ku)
            self.Ku_tiled = tf.tile(self.Ku[None, :, :],
                                    [self.num_outputs, 1, 1])
            self.Lu_tiled = tf.tile(self.Lu[None, :, :],
                                    [self.num_outputs, 1, 1])
            self.needs_build_cholesky = False

    def conditional_ND(self, X, full_cov=False):
        self.build_cholesky_if_needed()

        # mmean, vvar = conditional(X, self.feature.Z, self.kern,
        #             self.q_mu, q_sqrt=self.q_sqrt,
        #             full_cov=full_cov, white=self.white)
        Kuf = self.feature.Kuf(self.kern, X)

        A = tf.matrix_triangular_solve(self.Lu, Kuf, lower=True)
        if not self.white:
            A = tf.matrix_triangular_solve(tf.transpose(self.Lu),
                                           A,
                                           lower=False)

        mean = tf.matmul(A, self.q_mu, transpose_a=True)

        A_tiled = tf.tile(A[None, :, :], [self.num_outputs, 1, 1])
        I = tf.eye(self.num_inducing, dtype=settings.float_type)[None, :, :]

        if self.white:
            SK = -I
        else:
            SK = -self.Ku_tiled

        if self.q_sqrt is not None:
            SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True)

        B = tf.matmul(SK, A_tiled)
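        # At this point A = Kuu^{-1} Kuf (non-whitened) or Lu^{-1} Kuf (whitened),
        # and SK = S - Kuu (resp. S - I) with S = q_sqrt q_sqrt^T, so delta_cov
        # below is A^T (S - Kuu) A (resp. A^T (S - I) A), the correction added to
        # the prior covariance Kff.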

        if full_cov:
            # (num_latent, num_X, num_X)
            delta_cov = tf.matmul(A_tiled, B, transpose_a=True)
            Kff = self.kern.K(X)
        else:
            # (num_latent, num_X)
            delta_cov = tf.reduce_sum(A_tiled * B, 1)
            Kff = self.kern.Kdiag(X)

        # either (1, num_X) + (num_latent, num_X) or (1, num_X, num_X) + (num_latent, num_X, num_X)
        var = tf.expand_dims(Kff, 0) + delta_cov
        var = tf.transpose(var)

        return mean + self.mean_function(X), var

    def KL(self):
        """
        The KL divergence from the variational distribution to the prior.
        :return: KL divergence from N(q_mu, q_sqrt q_sqrt^T) to N(0, I) if whitened,
                 and to N(0, Ku) otherwise, independently for each GP
        """
        # if self.white:
        #     return gauss_kl(self.q_mu, self.q_sqrt)
        # else:
        #     return gauss_kl(self.q_mu, self.q_sqrt, self.Ku)

        self.build_cholesky_if_needed()

        KL = -0.5 * self.num_outputs * self.num_inducing
        KL -= 0.5 * tf.reduce_sum(tf.log(tf.matrix_diag_part(self.q_sqrt)**2))

        if not self.white:
            KL += tf.reduce_sum(tf.log(tf.matrix_diag_part(
                self.Lu))) * self.num_outputs
            KL += 0.5 * tf.reduce_sum(
                tf.square(
                    tf.matrix_triangular_solve(
                        self.Lu_tiled, self.q_sqrt, lower=True)))
            Kinv_m = tf.cholesky_solve(self.Lu, self.q_mu)
            KL += 0.5 * tf.reduce_sum(self.q_mu * Kinv_m)
        else:
            KL += 0.5 * tf.reduce_sum(tf.square(self.q_sqrt))
            KL += 0.5 * tf.reduce_sum(self.q_mu**2)

        return KL
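
As a quick sanity check (not part of the example above), the following NumPy sketch reproduces the non-whitened branch of SVGP_Layer.KL() term by term for a single GP output and compares it with the textbook Gaussian KL between N(m, S) and N(0, K); all names and numbers here are purely illustrative.

import numpy as np

# Illustrative check: with num_outputs = 1, the terms accumulated in KL() should
# equal the closed form
#   KL[N(m, S) || N(0, K)] = 0.5 * (tr(K^-1 S) + m^T K^-1 m - M + log|K| - log|S|).
rng = np.random.RandomState(0)
M = 5                                        # number of inducing points
L_s = np.tril(rng.randn(M, M))
np.fill_diagonal(L_s, np.abs(np.diag(L_s)) + 1.0)
S = L_s @ L_s.T                              # variational covariance q_sqrt q_sqrt^T
m = rng.randn(M, 1)                          # variational mean q_mu
A = rng.randn(M, M)
K = A @ A.T + M * np.eye(M)                  # prior covariance Kuu
Lu = np.linalg.cholesky(K)

# Closed-form reference.
K_inv = np.linalg.inv(K)
kl_ref = 0.5 * (np.trace(K_inv @ S) + (m.T @ K_inv @ m).item() - M
                + np.linalg.slogdet(K)[1] - np.linalg.slogdet(S)[1])

# Same terms as in KL(): constant, -0.5*log|S|, +0.5*log|K|, 0.5*tr(K^-1 S),
# and the quadratic term 0.5 * m^T K^-1 m.
kl = -0.5 * M
kl -= 0.5 * np.sum(np.log(np.diag(L_s) ** 2))
kl += np.sum(np.log(np.diag(Lu)))
kl += 0.5 * np.sum(np.linalg.solve(Lu, L_s) ** 2)
kl += 0.5 * np.sum(m * np.linalg.solve(K, m))

assert np.isclose(kl, kl_ref)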
Example #2
# The same GPflow 1.x style imports as in Example #1 are assumed here.
class SVGP_Layer(Layer):
    def __init__(self,
                 kern,
                 Z,
                 num_outputs,
                 mean_function,
                 white=False,
                 input_prop_dim=None,
                 **kwargs):
        """
        A sparse variational GP layer in whitened representation. This layer holds the kernel,
        variational parameters, inducing points and mean function.

        The underlying model at inputs X is
        f = Lv + mean_function(X), where v \sim N(0, I) and LL^T = kern.K(X)

        The variational distribution over the inducing points is
        q(v) = N(q_mu, q_sqrt q_sqrt^T)

        The layer holds D_out independent GPs with the same kernel and inducing points.

        :param kern: The kernel for the layer (input_dim = D_in)
        :param Z: Inducing points (M, D_in)
        :param num_outputs: The number of GP outputs (q_mu is shape (M, num_outputs))
        :param mean_function: The mean function
        :return:
        """
        Layer.__init__(self, input_prop_dim, **kwargs)
        self.num_inducing = Z.shape[0]

        q_mu = np.zeros((self.num_inducing, num_outputs))
        self.q_mu = Parameter(q_mu)

        q_sqrt = np.tile(
            np.eye(self.num_inducing)[None, :, :], [num_outputs, 1, 1])
        transform = transforms.LowerTriangular(self.num_inducing,
                                               num_matrices=num_outputs)
        self.q_sqrt = Parameter(q_sqrt, transform=transform)

        self.feature = InducingPoints(Z)
        self.kern = kern
        self.mean_function = mean_function

        self.num_outputs = num_outputs
        self.white = white

        if not self.white:  # initialize to prior
            Ku = self.kern.compute_K_symm(Z)
            Lu = np.linalg.cholesky(Ku + np.eye(Z.shape[0]) * settings.jitter)
            self.q_sqrt = np.tile(Lu[None, :, :], [num_outputs, 1, 1])

        self.needs_build_cholesky = True

    @params_as_tensors
    def build_cholesky_if_needed(self):
        # make sure we only compute this once
        if self.needs_build_cholesky:
            self.Ku = self.feature.Kuu(self.kern, jitter=settings.jitter)
            self.Lu = tf.cholesky(self.Ku)
            self.Ku_tiled = tf.tile(self.Ku[None, :, :],
                                    [self.num_outputs, 1, 1])
            self.Lu_tiled = tf.tile(self.Lu[None, :, :],
                                    [self.num_outputs, 1, 1])
            # also compute the inverse of Ku (used by the alpha-Renyi branch of KL)
            if not self.white:
                inp_ = (self.Ku + tf.eye(self.num_inducing, dtype=tf.float64) *
                        settings.jitter * 10)
                self.K_inv = tf.linalg.inv(tf.cast(inp_, dtype=tf.float64))

            self.needs_build_cholesky = False

    def conditional_ND(self, X, full_cov=False):
        self.build_cholesky_if_needed()

        # mmean, vvar = conditional(X, self.feature.Z, self.kern,
        #             self.q_mu, q_sqrt=self.q_sqrt,
        #             full_cov=full_cov, white=self.white)
        Kuf = self.feature.Kuf(self.kern, X)

        A = tf.matrix_triangular_solve(self.Lu, Kuf, lower=True)
        if not self.white:
            A = tf.matrix_triangular_solve(tf.transpose(self.Lu),
                                           A,
                                           lower=False)

        mean = tf.matmul(A, self.q_mu, transpose_a=True)

        A_tiled = tf.tile(A[None, :, :], [self.num_outputs, 1, 1])
        I = tf.eye(self.num_inducing, dtype=settings.float_type)[None, :, :]

        if self.white:
            SK = -I
        else:
            SK = -self.Ku_tiled

        if self.q_sqrt is not None:
            SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True)

        B = tf.matmul(SK, A_tiled)

        if full_cov:
            # (num_latent, num_X, num_X)
            delta_cov = tf.matmul(A_tiled, B, transpose_a=True)
            Kff = self.kern.K(X)
        else:
            # (num_latent, num_X)
            delta_cov = tf.reduce_sum(A_tiled * B, 1)
            Kff = self.kern.Kdiag(X)

        # either (1, num_X) + (num_latent, num_X) or (1, num_X, num_X) + (num_latent, num_X, num_X)
        var = tf.expand_dims(Kff, 0) + delta_cov
        var = tf.transpose(var)

        return mean + self.mean_function(X), var

    def KL(self):
        """
        The KL divergence from the variational distribution to the prior.
        Notation in the paper is KL[q(u)||p(u)].

        If self.alpha is set, the alpha-Renyi divergence from the variational
        distribution to the prior is returned instead.

        :return: KL divergence from N(q_mu, q_sqrt * q_sqrt^T) to N(0, I) (if whitened)
                and to N(mu(Z), K(Z)) otherwise, independently for each GP
        """
        # if self.white:
        #     return gauss_kl(self.q_mu, self.q_sqrt)
        # else:
        #     return gauss_kl(self.q_mu, self.q_sqrt, self.Ku)
        #

        self.build_cholesky_if_needed()
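        # self.alpha and self.weight are assumed to be set elsewhere (e.g. by the
        # base Layer via **kwargs); they are not defined in this snippet.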
        if self.alpha is None:
            """Get KL regularizer"""

            KL = -0.5 * self.num_outputs * self.num_inducing
            KL -= 0.5 * tf.reduce_sum(
                tf.log(tf.matrix_diag_part(self.q_sqrt)**2))

            if not self.white:
                # Whitening is relative to the prior. Here, the prior is NOT
                # whitened, meaning that we have N(0, K(Z,Z)) as prior.
                KL += tf.reduce_sum(tf.log(tf.matrix_diag_part(
                    self.Lu))) * self.num_outputs
                KL += 0.5 * tf.reduce_sum(
                    tf.square(
                        tf.matrix_triangular_solve(
                            self.Lu_tiled, self.q_sqrt, lower=True)))
                Kinv_m = tf.cholesky_solve(self.Lu, self.q_mu)
                KL += 0.5 * tf.reduce_sum(self.q_mu * Kinv_m)
            else:
                KL += 0.5 * tf.reduce_sum(tf.square(self.q_sqrt))
                KL += 0.5 * tf.reduce_sum(self.q_mu**2)

            return self.weight * KL
        else:
            """Get AR regularizer. For the normal, this means 
            log(Normalizing Constant[alpha * eta_q + (1-alpha) * eta_0 ]) - 
            alpha*log(Normalizing Constant[eta_q]) - 
            (1-alpha)*log(Normalizing Constant[eta_0]).
            
            NOTE: the 2*pi factor will cancel, as well as the 0.5 * factor.
            NOTE: q_sqrt is s.t. q_sqrt * q_sqrt^T = variational variance, i.e.
                  q(v) = N(q_mu, q_sqrt q_sqrt^T).
            NOTE: self.Lu is cholesky decomp of self.Ku
            NOTE: self.feature are the inducing points Z, and 
                  self.Ku = self.feature.Kuu(kernel), meaning that self.Ku is
                  the kernel matrix computed at the inducing points Z.
            NOTE: We need the alpha-renyi div between prior and GP-variational
                  posterior for EACH of the GPs in this layer.
                  
            Shapes:
                q_sqrt:                                 13 x 100 x 100
                q_mu:                                   100 x 13
                tf.matrix_diag_part(self.q_sqrt):       13 x 100
                q_sqrt_inv:                             13 x 100 x 100
                Ku, Lu:                                 100 x 100
                num_inducing:                           100
                num_outputs:                            13
            """

            #convenience
            alpha = self.alpha

            #INEFFICIENT, can probably be done much better with cholesky solve
            inp_ = (tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True) +
                    tf.eye(self.num_inducing, dtype=tf.float64) *
                    settings.jitter * 100)
            q_inv = tf.linalg.inv(tf.cast(inp_, dtype=tf.float64))

            #gives Sigma_q^-1 * mu_q
            q_var_x_q_mu = tf.matmul(
                q_inv,
                tf.reshape(self.q_mu,
                           shape=(self.num_outputs, self.num_inducing, 1)))

            #Get the two components of the log-normalizer of the variational posterior
            q_component_1 = 0.5 * tf.reduce_sum(
                tf.log(tf.matrix_diag_part(self.q_sqrt)**2))
            q_component_2 = 0.5 * tf.reduce_sum(q_var_x_q_mu * self.q_mu)

            logZq = (q_component_1 + q_component_2)

            if not self.white:
                #prior using self.Lu, still 0 mean fct
                logZpi = 0.5 * tf.reduce_sum(
                    tf.log(tf.matrix_diag_part(self.Lu)**2)) * self.num_outputs

                new_Sigma_inv = (alpha * q_inv + (1.0 - alpha) * self.K_inv +
                                 tf.eye(self.num_inducing, dtype=tf.float64) *
                                 settings.jitter)

            else:
                logZpi = 0.0  #* self.num_outputs * self.num_inducing  - but that is still 0.
                new_Sigma_inv = (alpha * q_inv +
                                 (1.0 - alpha + settings.jitter) *
                                 tf.eye(self.num_inducing, dtype=tf.float64))

            new_Sigma_inv_chol = tf.cholesky(tf.cast(new_Sigma_inv,
                                                     tf.float64))
            log_det = -tf.reduce_sum(
                tf.log(tf.matrix_diag_part(new_Sigma_inv_chol)**2))

            #Get the new inverse variance of the exponential family member
            #corresponding to alpha * eta_q + (1-alpha) * eta_0.
            #var_inv_new = tf.matmul(chol_var_inv_new, chol_var_inv_new, transpose_b=True)

            #Compute mu_new: we know Sigma_new^-1 * mu_new = alpha * Sigma_q^-1 * q_mu
            #+ (1 - alpha) * 0, so solve this linear system with Sigma_new^-1
            #to obtain mu_new (Sigma_new itself is never formed explicitly).
            mu_new = tf.linalg.solve(
                tf.cast(new_Sigma_inv, dtype=tf.float64),
                tf.cast(alpha * q_var_x_q_mu, dtype=tf.float64))

            #Note: Sigma^{-1}_new * mu_new = Sigma^{-1}_q * mu_q, so
            #      mu_new' * Sigma^{-1}_new * mu_new = mu_new' * (Sigma^{-1}_q * mu_q)
            mu_new_x_new_Sigma_inv = tf.reduce_sum(alpha * q_var_x_q_mu *
                                                   mu_new)

            #Observing that log(|Sigma|) = - log(|Sigma|^-1), we can now get
            #the normalizing constant of the new exp. fam member.
            logZnew = (0.5 * mu_new_x_new_Sigma_inv + 0.5 * log_det)

            #return the AR-div between the normals, i.e.
            # (1/(alpha * (1-alpha))) * log(D), where D =
            #new normalizer / (q_normalizer^alpha * prior_normalizer^(1-alpha))
            AR = (1.0 / (alpha * (1.0 - alpha))) * (logZnew - alpha * logZq -
                                                    (1.0 - alpha) * logZpi)

            return self.weight * AR
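
The alpha-Renyi branch above is built entirely from Gaussian log-normalizers, as its docstring describes. The short NumPy sketch below (not part of the example; the scalars a, m, s2, k2 are made up) checks that identity in one dimension against a brute-force numerical integral.

import numpy as np

# Illustrative 1-D check of the exponential-family identity used above:
#   log \int q(x)^a p(x)^(1-a) dx
#       = logZ[a*eta_q + (1-a)*eta_p] - a*logZ_q - (1-a)*logZ_p,
# with logZ = 0.5 * mu^T Sigma^-1 mu + 0.5 * log|Sigma| (the 2*pi factors cancel).
# Here q = N(m, s2) plays the role of the variational posterior and p = N(0, k2)
# the prior.
a, m, s2, k2 = 0.5, 0.7, 0.4, 1.3

# Log-normalizer route, mirroring logZq, logZpi and logZnew in KL().
logZq = 0.5 * m ** 2 / s2 + 0.5 * np.log(s2)
logZp = 0.5 * np.log(k2)
prec_new = a / s2 + (1.0 - a) / k2            # alpha*Sigma_q^-1 + (1-alpha)*Sigma_p^-1
mean_term = (a * m / s2) ** 2 / prec_new      # mu_new^T Sigma_new^-1 mu_new
logZnew = 0.5 * mean_term - 0.5 * np.log(prec_new)
lhs = logZnew - a * logZq - (1.0 - a) * logZp

# Brute-force numerical integral of q^a * p^(1-a).
x = np.linspace(-15.0, 15.0, 200001)
dx = x[1] - x[0]
q = np.exp(-0.5 * (x - m) ** 2 / s2) / np.sqrt(2.0 * np.pi * s2)
p = np.exp(-0.5 * x ** 2 / k2) / np.sqrt(2.0 * np.pi * k2)
rhs = np.log(np.sum(q ** a * p ** (1.0 - a)) * dx)

assert np.isclose(lhs, rhs, atol=1e-5)

Scaling this log-normalizer combination by 1 / (alpha * (1 - alpha)) and summing over the independent outputs gives the regularizer that the method returns (times self.weight).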