Example #1
    def parameters_changed(self):
        N, D = self.Y.shape

        Kss = self.kern.K(self.X)
        Ksu = self.kern.K(self.X, self.Z)

        wv = self.posterior.woodbury_vector
        wi = self.posterior.woodbury_inv
        
        a = self.Y - Ksu.dot(wv)
        
        C = Kss  + np.eye(N)*self.likelihood.variance - Ksu.dot(wi).dot(Ksu.T)
        Lc = jitchol(C)
        LcInva = dtrtrs(Lc, a)[0]
        LcInv = dtrtri(Lc)
        CInva = dtrtrs(Lc, LcInva,trans=1)[0]

        self._log_marginal_likelihood = -N*D/2.*np.log(2*np.pi) - D*np.log(np.diag(Lc)).sum() - np.square(LcInva).sum()/2.

        dKsu = CInva.dot(wv.T)
        dKss = tdot(CInva)/2. -D* tdot(LcInv.T)/2.
        dKsu += -2. * dKss.dot(Ksu).dot(wi)
        
        X_grad = self.kern.gradients_X(dKss, self.X)
        X_grad += self.kern.gradients_X(dKsu, self.X, self.Z)
        self.X.gradient = X_grad      
        
        if self.uncertain_input:
            # Update Log-likelihood
            KL_div = self.variational_prior.KL_divergence(self.X)
            # update for the KL divergence
            self.variational_prior.update_gradients_KL(self.X)
            self._log_marginal_likelihood += -KL_div
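Note: the closing log-marginal line is the Gaussian marginal of the residual a = Y - Ksu.dot(wv) under the covariance C built a few lines earlier. A hedged reading of what the snippet computes, in its own names (sigma^2 the likelihood variance, Wi the posterior Woodbury inverse):

    \log p(Y) = -\frac{ND}{2}\log 2\pi - \frac{D}{2}\log\lvert C\rvert - \frac{1}{2}\operatorname{tr}\!\left(a^{\top}C^{-1}a\right),
    \qquad C = K_{ss} + \sigma^{2}I - K_{su}W_{i}K_{su}^{\top},\quad a = Y - K_{su}w_{v},

with the log-determinant taken through the Cholesky factor, so D*np.log(np.diag(Lc)).sum() equals (D/2)\log\lvert C\rvert and np.square(LcInva).sum()/2 equals \tfrac{1}{2}\operatorname{tr}(a^{\top}C^{-1}a).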
Example #2
File: util_tests.py Project: ysekky/GPy
    def test_checkFullRank(self):
        from GPy.util.debug import checkFullRank
        from GPy.util.linalg import tdot
        array = np.random.normal(0, 1, 100).reshape(25, 4)
        self.assertFalse(checkFullRank(tdot(array), name='test'))

        array = np.random.normal(0, 1, (25, 25))
        self.assertTrue(checkFullRank(tdot(array)))
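Note: the first assertion holds because tdot(A) = A.dot(A.T) has rank at most min(A.shape), so the 25x25 Gram matrix built from a 25x4 array cannot be full rank, while a 25x25 Gaussian array is full rank with probability one. A minimal plain-NumPy sketch of the same argument (np.dot standing in for GPy's tdot, which computes the same product):

    import numpy as np

    A = np.random.normal(0, 1, (25, 4))
    G = np.dot(A, A.T)                             # what tdot(A) computes
    print(np.linalg.matrix_rank(G))                # at most 4: G is rank-deficient

    B = np.random.normal(0, 1, (25, 25))
    print(np.linalg.matrix_rank(np.dot(B, B.T)))   # 25 with probability one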
Example #3
    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, K=None, variance=None, Z_tilde=None, A=None):
        """
        Returns a Posterior class containing essential quantities of the posterior.
        The comments below correspond to Algorithm 2.1 in the GPML textbook.
        """
        # print('ExactGaussianInferenceGroup inference:')
        if mean_function is None:
            m = 0
        else:
            m = mean_function.f(X)

        if variance is None:
            variance = likelihood.gaussian_variance(Y_metadata)

        YYT_factor = Y-m

        # NOTE: change K to AKA^T
        if K is None:
            if A is None:
                A = np.identity(X.shape[0])
            K = A.dot(kern.K(X)).dot(A.T) # A_t k(X_t, X_t) A_t^T
        else:
            raise NotImplementedError('Need to be extended to group case!')
            

        Ky = K.copy()
        diag.add(Ky, variance+1e-8) # A_t k(X_t, X_t)A_t^T + sigma^2 I

        # pdinv:
        # Wi: inverse of Ky
        # LW: the lower-triangular Cholesky factor of Ky -> L
        # LWi: the inverse of that Cholesky factor (not used here)
        # W_logdet: the log of the determinant of Ky
        Wi, LW, LWi, W_logdet = pdinv(Ky) 

        # LAPACK: DPOTRS solves a system of linear equations A*X = B with a symmetric
        # positive definite matrix A using the Cholesky factorization
        # A = U**T*U or A = L*L**T computed by DPOTRF.
        alpha, _ = dpotrs(LW, YYT_factor, lower=1)
        # so this gives 
        # (A_t k(X_t, X_t)A_t^T + sigma^2 I)^{-1} (Y_t - m)

        # Note: 20210827 confirm the log marginal likelihood 
        log_marginal =  0.5*(-Y.size * log_2_pi - Y.shape[1] * W_logdet - np.sum(alpha * YYT_factor))

        if Z_tilde is not None:
            # This is a correction term for the log marginal likelihood
            # In EP this is log Z_tilde, which is the difference between the
            # Gaussian marginal and Z_EP
            log_marginal += Z_tilde

        # REVIEW: since log_marginal does not change, the gradient does not need to change as well.
        # FIXME: confirm the gradient update is correct
        # dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi)
        dL_dK = 0.5 * A.T.dot((tdot(alpha) - Y.shape[1] * Wi)).dot(A)
        # print('dL_dK shape', dL_dK.shape)

        dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK), Y_metadata)

        return PosteriorExactGroup(woodbury_chol=LW, woodbury_vector=alpha, K=K, A = A), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL, 'dL_dm':alpha}
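Note: for reference, alpha and W_logdet above combine into the usual exact-GP log marginal with the grouped covariance substituted in. A hedged restatement in the notation of the comments:

    \log p(Y \mid X) = -\tfrac{1}{2}\Big(ND\log 2\pi + D\log\lvert K_{y}\rvert + \operatorname{tr}\big((Y-m)^{\top}K_{y}^{-1}(Y-m)\big)\Big),
    \qquad K_{y} = A\,k(X,X)\,A^{\top} + \sigma^{2}I .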
Example #4
 def _create_kernel(self, V):
     self._kerns = [
         RBF(1, ARD=True, active_dims=[i]) for i in range(self.n_dims)
     ]
     self._kernf = Fixed(self.n_dims, tdot(V))
     self._kernb = Bias(self.n_dims)
     self.kernel = np.sum(self._kerns) + self._kernf + self._kernb
Example #5
    def vb_grad_natgrad(self):
        """
        Natural Gradients of the bound with respect to phi, the variational
        parameters controlling assignment of the data to GPs
        """
        grad_Lm = np.zeros_like(self.phi)
        for i, kern in enumerate(self.kern):
            K = kern.K(self.X)
            I = np.eye(self.N)

            B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))
            K_B_inv = pdinv(K + B_inv)[0]
            alpha = np.dot(K_B_inv, self.Y)
            dL_dB = tdot(alpha) - K_B_inv

            for n in range(self.phi.shape[0]):
                grad_B_inv_nonzero = -self.variance / (self.phi[n, i] ** 2 + 1e-6)
                grad_Lm[n, i] = 0.5 * dL_dB[n, n] * grad_B_inv_nonzero

        grad_phi = grad_Lm + self.mixing_prop_bound_grad() + self.Hgrad

        natgrad = grad_phi - np.sum(self.phi * grad_phi, 1)[:, None]
        grad = natgrad * self.phi

        return grad.flatten(), natgrad.flatten()
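Note: the last two lines convert the gradient with respect to phi into the gradient with respect to the softmax logits: if phi = softmax(theta) row-wise and g = dL/dphi, then dL/dtheta_k = phi_k * (g_k - sum_j phi_j g_j), which is exactly natgrad * phi. A small finite-difference sketch of that identity (plain NumPy with an arbitrary linear test objective, not GPclust code):

    import numpy as np

    def softmax(theta):
        e = np.exp(theta - theta.max())
        return e / e.sum()

    theta = np.random.randn(4)
    g = np.random.randn(4)                    # pretend dL/dphi for L(phi) = g.dot(phi)
    phi = softmax(theta)

    natgrad = g - np.sum(phi * g)             # same form as the snippet, one row
    grad = natgrad * phi                      # chain rule through the softmax

    eps = 1e-6
    fd = np.array([(g.dot(softmax(theta + eps * np.eye(4)[k])) -
                    g.dot(softmax(theta - eps * np.eye(4)[k]))) / (2 * eps)
                   for k in range(4)])
    print(np.allclose(grad, fd, atol=1e-5))   # True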
Example #6
    def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs, D, missing_data):

        assert beta.size == 1

        if uncertain_inputs:
            psi0 = kern.psi0(Z, X)
            psi1 = kern.psi1(Z, X)*beta
            psi2 = kern.psi2(Z, X)*beta if not missing_data else kern.psi2n(Z, X)*beta
        else:
            psi0 = kern.Kdiag(X)
            psi1 = kern.K(X, Z)
            if missing_data:
                psi2 = psi1[:,None,:]*psi1[:,:,None]*beta
            else:
                psi2 = tdot(psi1.T)*beta
            psi1 = psi1*beta
            
        if isinstance(Y, VariationalPosterior):
            m, s = Y.mean, Y.variance
            psi1Y = np.dot(m.T,psi1) # DxM            
            YRY = (np.square(m).sum()+s.sum())*beta
            psi0 = (D*psi0).sum()*beta
        elif missing_data:
            psi1Y = np.dot((Y).T,psi1) # DxM
            trYYT = self.get_trYYT(Y)
            YRY = trYYT*beta
            psi0 = (psi0*D).sum()*beta
        else:
            psi1Y = np.dot(Y.T,psi1) # DxM
            trYYT = self.get_trYYT(Y)
            YRY = trYYT*beta
            psi0 = (psi0*D).sum()*beta
            
        return psi0, psi2, YRY, psi1, psi1Y
Example #7
def _inference(K: np.ndarray, ga_approx: GaussianApproximation, cav_params: CavityParams, Z_tilde: float,
               y: List[Tuple[int, float]], yc: List[List[Tuple[int, int]]]) -> Tuple[Posterior, int, Dict]:
    """
    Compute the posterior approximation
    :param K: prior covariance matrix
    :param ga_approx: Gaussian approximation of the batches
    :param cav_params: Cavity parameters of the posterior
    :param Z_tilde: Log marginal likelihood
    :param y: Direct observations as a list of tuples, each giving the location index (row in X) and the observation value.
    :param yc: Batch comparisons as a list of lists of tuples. Each batch is a list whose tuples give the comparisons as (winner index, loser index).
    :return: A tuple consisting of the posterior approximation, log marginal likelihood and gradient dictionary
    """
    
    log_marginal, post_params = _ep_marginal(K, ga_approx, Z_tilde,y,yc)
    tau_tilde_root = sqrtm_block(ga_approx.tau, y,yc)
    Sroot_tilde_K = np.dot(tau_tilde_root, K)
    aux_alpha , _ = dpotrs(post_params.L, np.dot(Sroot_tilde_K, ga_approx.v), lower=1)
    alpha = (ga_approx.v - np.dot(tau_tilde_root, aux_alpha))[:,None] # (K + Sigma_tilde)^{-1} mu_tilde
    LWi, _ = dtrtrs(post_params.L, tau_tilde_root, lower=1)

    Wi = np.dot(LWi.T,LWi)
    symmetrify(Wi) # (K + Sigma_tilde)^{-1}
    dL_dK = 0.5 * (tdot(alpha) - Wi)
    dL_dthetaL = 0
    return Posterior(woodbury_inv=np.asfortranarray(Wi), woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL, 'dL_dm':alpha}
Example #8
File: OMGP.py Project: fagan2888/GPclust
    def __init__(self,
                 X,
                 Y,
                 K=2,
                 kernels=None,
                 variance=1.,
                 alpha=1.,
                 prior_Z='symmetric',
                 name='OMGP'):

        N, self.D = Y.shape
        self.Y = Y
        self.YYT = tdot(self.Y)

        self.X = X

        if kernels is None:
            self.kern = []
            for i in range(K):
                self.kern.append(GPy.kern.RBF(input_dim=1))
        else:
            self.kern = kernels

        CollapsedMixture.__init__(self, N, K, prior_Z, alpha, name)

        self.link_parameter(
            GPy.core.parameterization.param.Param(
                'variance', variance,
                GPy.core.parameterization.transformations.Logexp()))
        self.link_parameters(*self.kern)
Example #9
File: OMGP.py Project: fagan2888/GPclust
    def vb_grad_natgrad(self):
        """
        Natural Gradients of the bound with respect to phi, the variational
        parameters controlling assignment of the data to GPs
        """
        grad_Lm = np.zeros_like(self.phi)
        for i, kern in enumerate(self.kern):
            K = kern.K(self.X)
            I = np.eye(self.N)

            B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))
            alpha = np.linalg.solve(K + B_inv, self.Y)
            K_B_inv = pdinv(K + B_inv)[0]
            dL_dB = tdot(alpha) - K_B_inv

            for n in range(self.phi.shape[0]):
                grad_B_inv = np.zeros_like(B_inv)
                grad_B_inv[n, n] = -self.variance / (self.phi[n, i]**2 + 1e-6)
                grad_Lm[n, i] = 0.5 * np.trace(np.dot(dL_dB, grad_B_inv))

        grad_phi = grad_Lm + self.mixing_prop_bound_grad() + self.Hgrad

        natgrad = grad_phi - np.sum(self.phi * grad_phi, 1)[:, None]
        grad = natgrad * self.phi

        return grad.flatten(), natgrad.flatten()
Example #10
    def inference(self,
                  kern,
                  X,
                  W,
                  likelihood,
                  Y,
                  mean_function=None,
                  Y_metadata=None,
                  K=None,
                  variance=None,
                  Z_tilde=None):
        """
        Returns a Posterior class containing essential quantities of the posterior
        """

        if mean_function is None:
            m = 0
        else:
            m = mean_function.f(X)

        if variance is None:
            variance = likelihood.gaussian_variance(Y_metadata)

        YYT_factor = Y - m

        if K is None:
            K = kern.K(X)

        Ky = K.copy()

        diag.add(Ky, variance + 1e-8)

        Wi, LW, LWi, W_logdet = pdinv(Ky)

        alpha, _ = dpotrs(LW, YYT_factor, lower=1)

        log_marginal = 0.5 * (-Y.size * log_2_pi - Y.shape[1] * W_logdet -
                              np.sum(alpha * YYT_factor))

        if Z_tilde is not None:
            # This is a correction term for the log marginal likelihood
            # In EP this is log Z_tilde, which is the difference between the
            # Gaussian marginal and Z_EP
            log_marginal += Z_tilde

        dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi)

        dL_dthetaL = likelihood.exact_inference_gradients(
            np.diag(dL_dK), Y_metadata)

        posterior_ = Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K)

        return posterior_, log_marginal, {
            'dL_dK': dL_dK,
            'dL_dthetaL': dL_dthetaL,
            'dL_dm': alpha
        }, W_logdet
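Note: the line dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi) is the multi-output form of the exact-GP kernel gradient (cf. GPML, Ch. 5), summed over the D output columns, hence the factor Y.shape[1] multiplying Wi. Hedged, in the same notation as above:

    \frac{\partial \log p(Y \mid X)}{\partial K_{y}} = \tfrac{1}{2}\big(\alpha\alpha^{\top} - D\,K_{y}^{-1}\big),
    \qquad \alpha = K_{y}^{-1}(Y-m).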
Example #11
 def compute_dL(self):
     # Common computation
     beta = 1./np.fmax(self.likelihood.variance, 1e-6)
     output_dim = self.Y.shape[-1]
     wv = self.posterior.woodbury_vector
     if self.missing_data:
         wv = wv[:,self.valid_dim]
         output_dim = self.valid_dim.sum()
         if self.ninan is not None:
             self.dL_dpsi2 = beta/2.*(self.posterior.woodbury_inv[:,:,self.valid_dim] - tdot(wv)[:, :, None]).sum(-1)
         else:
             self.dL_dpsi2 = beta/2.*(output_dim*self.posterior.woodbury_inv - tdot(wv))
         self.dL_dpsi1 = beta*np.dot(self.Y[:,self.valid_dim], wv.T)
         self.dL_dpsi0 = - beta/2.* np.ones(self.Y.shape[0])
     else:
         self.dL_dpsi2 = beta*(output_dim*self.posterior.woodbury_inv - tdot(wv))/2. #np.einsum('md,od->mo',wv, wv)
         self.dL_dpsi1 = beta*np.dot(self.Y, wv.T)
         self.dL_dpsi0 = -beta/2.*output_dim* np.ones(self.Y.shape[0])
Example #12
 def K(self, X, X2=None):
     if self.ARD:
         if X2 is None:
             return tdot(X * np.sqrt(self.variance))
         else:
             rv = np.sqrt(self.variance)
             return np.dot(X * rv, (X2 * rv).T)
     else:
         return self._dot_product(X, X2) * self.variance
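Note: in the ARD branch both code paths build the same Gram matrix K = X diag(variance) X2^T; tdot(X * sqrt(variance)) is just the X2-is-None special case. A quick NumPy check of the equivalence (np.dot standing in for tdot):

    import numpy as np

    X = np.random.randn(6, 3)
    variance = np.array([0.5, 2.0, 1.3])       # one ARD variance per input dimension

    rv = np.sqrt(variance)
    K_tdot = np.dot(X * rv, (X * rv).T)        # the tdot(X * np.sqrt(self.variance)) branch
    K_full = np.dot(X * variance, X.T)         # X diag(variance) X^T written out
    print(np.allclose(K_tdot, K_full))         # True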
Example #13
    def comp_KL_qU(self, qU_mean ,qU_var):
        M,D = qU_mean.shape[0], qU_mean.shape[1]

        qU_L = self.mid['qU_L']
        L = self.mid['L']
        Linvmu = self.mid['Linvmu']
        LinvLu = self.mid['LinvLu']
        KuuInv = dpotri(L, lower=1)[0]
        
        Lu = qU_L
        LuInv = dtrtri(Lu)
        
        KL = D*M/-2. - np.log(np.diag(Lu)).sum()*D +np.log(np.diag(L)).sum()*D + np.square(LinvLu).sum()/2.*D + np.square(Linvmu).sum()/2.
        
        dKL_dqU_mean = dtrtrs(L, Linvmu, trans=True)[0] 
        dKL_dqU_var = (tdot(LuInv.T)/-2. +  KuuInv/2.)*D
        dKL_dKuu = KuuInv*D/2. -KuuInv.dot( tdot(qU_mean)+qU_var*D).dot(KuuInv)/2.

        return float(KL), dKL_dqU_mean, dKL_dqU_var, dKL_dKuu
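Note: the KL term assembled here matches the closed form for D independent columns of U, each with q(u_d) = N(mu_d, S), against the prior N(0, Kuu); writing L and Lu for the Cholesky factors of Kuu and S, a hedged restatement of what the code evaluates:

    \mathrm{KL}\big(q(U)\,\Vert\,p(U)\big)
    = \frac{D}{2}\,\lVert L^{-1}L_{u}\rVert_{F}^{2} - \frac{DM}{2}
      + D\sum_{i}\log L_{ii} - D\sum_{i}\log (L_{u})_{ii}
      + \frac{1}{2}\,\lVert L^{-1}\mu\rVert_{F}^{2},

i.e. the standard Gaussian KL \tfrac{D}{2}\big(\operatorname{tr}(K_{uu}^{-1}S) - M + \log\lvert K_{uu}\rvert - \log\lvert S\rvert\big) + \tfrac{1}{2}\operatorname{tr}(\mu^{\top}K_{uu}^{-1}\mu).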
Example #14
    def _get_YYTfactor(self, Y):
        """
        Find a matrix L which satisfies L L^T = Y Y^T.

        Note that L may have fewer columns than Y.
        """
        N, D = Y.shape
        if (N>=D):
            return Y.view(np.ndarray)
        else:
            return jitchol(tdot(Y))
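Note: when N < D the method returns an N-by-N Cholesky factor whose outer product equals YY^T, which is all that downstream computations need and is much smaller than Y itself. A minimal sketch of that property, with np.linalg.cholesky standing in for jitchol (jitchol additionally adds a small jitter for numerical stability):

    import numpy as np

    Y = np.random.randn(5, 200)                          # N < D
    L = np.linalg.cholesky(np.dot(Y, Y.T))               # jitchol(tdot(Y)) without the jitter
    print(L.shape)                                       # (5, 5): far fewer columns than Y
    print(np.allclose(np.dot(L, L.T), np.dot(Y, Y.T)))   # True: same YY^T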
Example #15
    def gatherPsiStat(self, kern, X, Z, uncertain_inputs):

        if uncertain_inputs:
            psi0 = kern.psi0(Z, X).sum()
            psi1 = kern.psi1(Z, X)
            psi2 = kern.psi2(Z, X)
        else:
            psi0 = kern.Kdiag(X).sum()
            psi1 = kern.K(X, Z)
            psi2 = tdot(psi1.T)

        return psi0, psi1, psi2
Example #16
 def _raw_predict(self, kern, Xnew, A_ast, pred_var, full_cov=False):
     """
     pred_var: _predictive_variable, X_t (all X up to round t)
     """
     # print('PosteriorExactGroup _raw_predict')
     # NOTE: change Kx to AKx and add .dot(A_ast.T)
     # NOTE: 20210827 confirm mu and var (for self._woodbury_chol.ndim == 2 case)
     Kx = self.A.dot(kern.K(pred_var, Xnew)).dot(A_ast.T) # A_t k(X_t, X_\ast) A_\ast
     mu = np.dot(Kx.T, self.woodbury_vector) 
     # mu = A_\ast k(X_t, X_\ast)^T A_t^T (A_t k(X_t, X_t)A_t^T + sigma^2 I)^{-1} (Y_t - m)
     if len(mu.shape) == 1:
         mu = mu.reshape(-1, 1)
     if full_cov:
         Kxx = kern.K(Xnew) # k(X_ast, X_ast)
         # self._woodbury_chol Cholesky decomposition of A_t k(X_t, X_t)A_t^T + sigma^2 I
         if self._woodbury_chol.ndim == 2:
             # DTRTRS solves a triangular system of the form A * X = B  or  A**T * X = B, where A is a triangular matrix of order N, and B is an N-by-NRHS matrix.  A check is made to verify that A is nonsingular.
             tmp = dtrtrs(self._woodbury_chol, Kx)[0] # L^{-1} A_t k(X_t, X_ast) A_ast^T, where L is the Cholesky factor of the training covariance
             # tdot: returns np.dot(mat, mat.T), but faster for large 2D arrays of doubles.
             var = A_ast.dot(Kxx - tdot(tmp.T)).dot(A_ast.T)
         elif self._woodbury_chol.ndim == 3:  # Missing data
             raise NotImplementedError('Need to be extended to group case!')
             var = np.empty((Kxx.shape[0], Kxx.shape[1], self._woodbury_chol.shape[2]))
             for i in range(var.shape[2]):
                 tmp = dtrtrs(self._woodbury_chol[:, :, i], Kx)[0]
                 var[:, :, i] = (Kxx - tdot(tmp.T))
         var = var
     else:
         Kxx = np.diag(A_ast.dot(kern.K(Xnew, Xnew)).dot(A_ast.T))
         if self._woodbury_chol.ndim == 2:
             tmp = dtrtrs(self._woodbury_chol, Kx)[0]
             var = (Kxx - np.square(tmp).sum(0))[:, None]
         elif self._woodbury_chol.ndim == 3:  # Missing data
             raise NotImplementedError('Need to be extended to group case!')
             var = np.empty((Kxx.shape[0], self._woodbury_chol.shape[2]))
             for i in range(var.shape[1]):
                 tmp = dtrtrs(self._woodbury_chol[:, :, i], Kx)[0]
                 var[:, i] = (Kxx - np.square(tmp).sum(0))
         var = var
     return mu, var
Example #17
    def comp_KL_qU(self, qU_mean, qU_var):
        M, D = qU_mean.shape[0], qU_mean.shape[1]

        qU_L = self.mid['qU_L']
        L = self.mid['L']
        Linvmu = self.mid['Linvmu']
        LinvLu = self.mid['LinvLu']
        KuuInv = dpotri(L, lower=1)[0]

        Lu = qU_L
        LuInv = dtrtri(Lu)

        KL = D * M / -2. - np.log(np.diag(Lu)).sum() * D + np.log(
            np.diag(L)).sum() * D + np.square(
                LinvLu).sum() / 2. * D + np.square(Linvmu).sum() / 2.

        dKL_dqU_mean = dtrtrs(L, Linvmu, trans=True)[0]
        dKL_dqU_var = (tdot(LuInv.T) / -2. + KuuInv / 2.) * D
        dKL_dKuu = KuuInv * D / 2. - KuuInv.dot(tdot(qU_mean) +
                                                qU_var * D).dot(KuuInv) / 2.

        return float(KL), dKL_dqU_mean, dKL_dqU_var, dKL_dKuu
Example #18
def compute_dl_dK(posterior, K, eta, theta, prior_mean = 0):
    tau, v = theta, eta

    tau_tilde_root = np.sqrt(tau)
    Sroot_tilde_K = tau_tilde_root[:,None] * K
    aux_alpha , _ = dpotrs(posterior.L, np.dot(Sroot_tilde_K, v), lower=1)
    alpha = (v - tau_tilde_root * aux_alpha)[:,None] # (K + Sigma_tilde)^{-1} mu_tilde
    LWi, _ = dtrtrs(posterior.L, np.diag(tau_tilde_root), lower=1)
    Wi = np.dot(LWi.T, LWi)
    symmetrify(Wi) # (K + Sigma_tilde)^{-1}

    dL_dK = 0.5 * (tdot(alpha) - Wi)
    
    return dL_dK
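Note: the returned dL_dK is the usual EP expression for the derivative of the approximate log marginal with respect to the prior covariance. Hedged, in the notation of the inline comments:

    \frac{\partial \log Z_{\mathrm{EP}}}{\partial K}
    = \tfrac{1}{2}\big(\alpha\alpha^{\top} - (K+\tilde{\Sigma})^{-1}\big),
    \qquad \alpha = (K+\tilde{\Sigma})^{-1}\tilde{\mu}.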
Example #19
def _inference(K, ga_approx, cav_params, likelihood, Z_tilde, Y_metadata=None):
    log_marginal, post_params = _ep_marginal(K, ga_approx, Z_tilde)

    tau_tilde_root = np.sqrt(ga_approx.tau)
    Sroot_tilde_K = tau_tilde_root[:,None] * K

    aux_alpha , _ = dpotrs(post_params.L, np.dot(Sroot_tilde_K, ga_approx.v), lower=1)
    alpha = (ga_approx.v - tau_tilde_root * aux_alpha)[:,None] # (K + Sigma_tilde)^{-1} mu_tilde
    LWi, _ = dtrtrs(post_params.L, np.diag(tau_tilde_root), lower=1)
    Wi = np.dot(LWi.T,LWi)
    symmetrify(Wi) # (K + Sigma_tilde)^{-1}

    dL_dK = 0.5 * (tdot(alpha) - Wi)
    dL_dthetaL = 0 #likelihood.ep_gradients(Y, cav_params.tau, cav_params.v, np.diag(dL_dK), Y_metadata=Y_metadata, quad_mode='gh')
    #temp2 = likelihood.ep_gradients(Y, cav_params.tau, cav_params.v, np.diag(dL_dK), Y_metadata=Y_metadata, quad_mode='naive')
    #temp = likelihood.exact_inference_gradients(np.diag(dL_dK), Y_metadata = Y_metadata)
    #print("exact: {}, approx: {}, Ztilde: {}, naive: {}".format(temp, dL_dthetaL, Z_tilde, temp2))
    return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL, 'dL_dm':alpha}
Example #20
 def scaled_dist(self, X, X2=None):
     X = self.input_type.evaluate(X)
     if X2 is None:
         Xsq = np.sum(np.square(X), 1)
         r2 = -2. * tdot(X) + (Xsq[:, None] + Xsq[None, :])
         util.diag.view(r2)[:, ] = 0.  # force diagonal to be zero: sometimes numerically a little negative
         r2 = np.clip(r2, 0, np.inf)
         return np.sqrt(r2) / self.lengthscale
     else:
         #X2, = self._slice_X(X2)
         X2 = self.input_type.evaluate(X2)
         X1sq = np.sum(np.square(X), 1)
         X2sq = np.sum(np.square(X2), 1)
         r2 = -2. * np.dot(X, X2.T) + (X1sq[:, None] + X2sq[None, :])
         r2 = np.clip(r2, 0, np.inf)
         return np.sqrt(r2) / self.lengthscale
Example #21
    def __init__(self, X, Y, K=2, kernels=None, variance=1., alpha=1., prior_Z='symmetric', name='OMGP'):

        N, self.D = Y.shape
        self.Y = Y
        self.YYT = tdot(self.Y)

        self.X = X

        if kernels is None:
            self.kern = []
            for i in range(K):
                self.kern.append(GPy.kern.RBF(input_dim=1))
        else:
            self.kern = kernels

        CollapsedMixture.__init__(self, N, K, prior_Z, alpha, name)

        self.link_parameter(GPy.core.parameterization.param.Param('variance', variance, GPy.core.parameterization.transformations.Logexp()))
        self.link_parameters(*self.kern)
Example #22
 def _unscaled_dist(self, X, X2=None):
     """
     Compute the Euclidean distance between each row of X and X2, or between
     each pair of rows of X if X2 is None.
     """
     # X, = self._slice_X(X)
     if X2 is None:
         Xsq = np.sum(np.square(X), 1)
         r2 = -2. * tdot(X) + (Xsq[:, None] + Xsq[None, :])
         util.diag.view(r2)[:, ] = 0.  # force diagonal to be zero: sometimes numerically a little negative
         r2 = np.clip(r2, 0, np.inf)
         return np.sqrt(r2)
     else:
         # X2, = self._slice_X(X2)
         X1sq = np.sum(np.square(X), 1)
         X2sq = np.sum(np.square(X2), 1)
         r2 = -2. * np.dot(X, X2.T) + (X1sq[:, None] + X2sq[None, :])
         r2 = np.clip(r2, 0, np.inf)
         return np.sqrt(r2)
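Note: the -2.*tdot(X) + (Xsq[:, None] + Xsq[None, :]) construction is the expansion ||x_i - x_j||^2 = x_i.x_i + x_j.x_j - 2*x_i.x_j. A quick NumPy check against a direct computation (np.dot standing in for tdot):

    import numpy as np

    X = np.random.randn(7, 3)
    Xsq = np.sum(np.square(X), 1)
    r2 = -2. * np.dot(X, X.T) + (Xsq[:, None] + Xsq[None, :])     # the snippet's expansion
    r2_direct = np.square(X[:, None, :] - X[None, :, :]).sum(-1)  # ||x_i - x_j||^2 directly
    print(np.allclose(r2, r2_direct))                             # True up to rounding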
Example #23
    def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs, D,
                      missing_data):

        assert beta.size == 1

        if uncertain_inputs:
            psi0 = kern.psi0(Z, X)
            psi1 = kern.psi1(Z, X) * beta
            psi2 = kern.psi2(Z, X) * beta if not missing_data else kern.psi2n(
                Z, X) * beta
        else:
            psi0 = kern.Kdiag(X)
            psi1 = kern.K(X, Z)
            if missing_data:
                psi2 = psi1[:, None, :] * psi1[:, :, None] * beta
            else:
                psi2 = tdot(psi1.T) * beta
            psi1 = psi1 * beta

        if isinstance(Y, VariationalPosterior):
            m, s = Y.mean, Y.variance
            psi1Y = np.dot(m.T, psi1)  # DxM
            YRY = (np.square(m).sum() + s.sum()) * beta
            psi0 = (D * psi0).sum() * beta
        elif missing_data:
            psi1Y = np.dot((Y).T, psi1)  # DxM
            trYYT = self.get_trYYT(Y)
            YRY = trYYT * beta
            psi0 = (psi0 * D).sum() * beta
        else:
            psi1Y = np.dot(Y.T, psi1)  # DxM
            trYYT = self.get_trYYT(Y)
            YRY = trYYT * beta
            psi0 = (psi0 * D).sum() * beta

        return psi0, psi2, YRY, psi1, psi1Y
Example #24
 def get_YYTfactor(self, Y):
     N, D = Y.shape
     if (N>=D):
         return Y.view(np.ndarray)
     else:
         return jitchol(tdot(Y))
Example #25
    def _inference_vardtc(self):
        if self.svi:
            from GPy.util.linalg import tdot
            self.qU_var = tdot(self.qU_W)+np.eye(self.Z.shape[0])*self.qU_a
            self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y, self.qU_mean , self.qU_var, Kuu_sigma=self.Kuu_sigma)
            
            if self.mpi_comm is None or (self.mpi_comm is not None and self.mpi_comm.rank==self.mpi_root):
                KL, dKL_dqU_mean, dKL_dqU_var, dKL_dKuu = self.inference_method.comp_KL_qU(self.qU_mean ,self.qU_var)
                self._log_marginal_likelihood += -KL*self.qU_ratio        
                self.grad_dict['dL_dqU_mean'] += -dKL_dqU_mean*self.qU_ratio
                self.grad_dict['dL_dqU_var'] += -dKL_dqU_var*self.qU_ratio
                self.grad_dict['dL_dKmm'] += -dKL_dKuu*self.qU_ratio
        else:
            self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata, Kuu_sigma=self.Kuu_sigma if hasattr(self, 'Kuu_sigma') else None)

        self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
        dL_dKmm = self.grad_dict['dL_dKmm']
        if (self.mpi_comm is None or (self.mpi_comm is not None and self.mpi_comm.rank==self.mpi_root)) and (hasattr(self, 'Kuu_sigma') and self.Kuu_sigma is not None):
            self.Kuu_sigma.gradient = np.diag(dL_dKmm)

        if isinstance(self.X, VariationalPosterior):
            #gradients wrt kernel
            
            if self.psicov:
                self.kern.update_gradients_expectations_psicov(variational_posterior=self.X,
                                                        Z=self.Z,
                                                        dL_dpsi0=self.grad_dict['dL_dpsi0'],
                                                        dL_dpsi1=self.grad_dict['dL_dpsi1'],
                                                        dL_dpsicov=self.grad_dict['dL_dpsicov'])
            else:
                self.kern.update_gradients_expectations(variational_posterior=self.X,
                                                        Z=self.Z,
                                                        dL_dpsi0=self.grad_dict['dL_dpsi0'],
                                                        dL_dpsi1=self.grad_dict['dL_dpsi1'],
                                                        dL_dpsi2=self.grad_dict['dL_dpsi2'])
            kerngrad = self.kern.gradient.copy()
            if self.mpi_comm is None:
                self.kern.update_gradients_full(dL_dKmm, self.Z, None)
                kerngrad += self.kern.gradient.copy()
                self.kern.gradient = kerngrad
            else:
                kerngrad = reduceArrays([kerngrad], self.mpi_comm, self.mpi_root)[0]
                if self.mpi_comm.rank==self.mpi_root:
                    self.kern.update_gradients_full(dL_dKmm, self.Z, None)
                    kerngrad += self.kern.gradient.copy()
                    self.kern.gradient = kerngrad

            #gradients wrt Z
            if self.psicov:
                self.Z.gradient = self.kern.gradients_Z_expectations_psicov(
                                   self.grad_dict['dL_dpsi0'],
                                   self.grad_dict['dL_dpsi1'],
                                   self.grad_dict['dL_dpsicov'],
                                   Z=self.Z,
                                   variational_posterior=self.X)
            else:
                self.Z.gradient = self.kern.gradients_Z_expectations(
                                   self.grad_dict['dL_dpsi0'],
                                   self.grad_dict['dL_dpsi1'],
                                   self.grad_dict['dL_dpsi2'],
                                   Z=self.Z,
                                   variational_posterior=self.X)
            if self.mpi_comm is None:
                self.Z.gradient += self.kern.gradients_X(dL_dKmm, self.Z)
            else:
                self.Z.gradient =  reduceArrays([self.Z.gradient], self.mpi_comm, self.mpi_root)[0]
                if self.mpi_comm.rank == self.mpi_root:
                    self.Z.gradient += self.kern.gradients_X(dL_dKmm, self.Z)
        else:
            #gradients wrt kernel
            self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X)
            kerngrad = self.kern.gradient.copy()
            self.kern.update_gradients_full(self.grad_dict['dL_dKnm'], self.X, self.Z)
            kerngrad += self.kern.gradient
            if self.mpi_comm is None:
                self.kern.update_gradients_full(dL_dKmm, self.Z, None)
                self.kern.gradient += kerngrad
            else:
                kerngrad = reduceArrays([kerngrad], self.mpi_comm, self.mpi_root)[0]
                if self.mpi_comm.rank==self.mpi_root:
                    self.kern.update_gradients_full(dL_dKmm, self.Z, None)
                    kerngrad += self.kern.gradient.copy()
                    self.kern.gradient = kerngrad
            #gradients wrt Z
            self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
            if self.mpi_comm is None:
                self.Z.gradient += self.kern.gradients_X(dL_dKmm, self.Z)
            else:
                self.Z.gradient =  reduceArrays([self.Z.gradient], self.mpi_comm, self.mpi_root)[0]
                if self.mpi_comm.rank == self.mpi_root:
                    self.Z.gradient += self.kern.gradients_X(dL_dKmm, self.Z)

        if self.svi:
            self.qU_mean.gradient = self.grad_dict['dL_dqU_mean']
            self.qU_W.gradient = (self.grad_dict['dL_dqU_var']+self.grad_dict['dL_dqU_var'].T).dot(self.qU_W)
            self.qU_a.gradient = np.diag(self.grad_dict['dL_dqU_var']).sum()
Example #26
    def inference(self, kern, X, Z, likelihood, Y, qU):
        """
        The SVI-VarDTC inference
        """

        if isinstance(Y, np.ndarray) and np.any(np.isnan(Y)):
            missing_data = True
            N, M, Q = Y.shape[0], Z.shape[0], Z.shape[1]
            Ds = Y.shape[1] - (np.isnan(Y) * 1).sum(1)
            Ymask = 1 - np.isnan(Y) * 1
            Y_masked = np.zeros_like(Y)
            Y_masked[Ymask == 1] = Y[Ymask == 1]
            ND = Ymask.sum()
        else:
            missing_data = False
            N, D, M, Q = Y.shape[0], Y.shape[1], Z.shape[0], Z.shape[1]
            ND = N * D

        uncertain_inputs = isinstance(X, VariationalPosterior)
        uncertain_outputs = isinstance(Y, VariationalPosterior)

        beta = 1. / np.fmax(likelihood.variance, 1e-6)

        psi0, psi2, YRY, psi1, psi1Y = self.gatherPsiStat(
            kern, X, Z, Y if not missing_data else Y_masked, beta,
            uncertain_inputs, D if not missing_data else Ds, missing_data)

        #======================================================================
        # Compute Common Components
        #======================================================================

        mu, S = qU.mean, qU.covariance
        mupsi1Y = mu.dot(psi1Y)

        Kmm = kern.K(Z).copy()
        diag.add(Kmm, self.const_jitter)
        Lm = jitchol(Kmm)

        if missing_data:
            S_mu = S[None, :, :] + mu.T[:, :, None] * mu.T[:, None, :]
            NS_mu = S_mu.T.dot(Ymask.T).T
            LmInv = dtrtri(Lm)

            LmInvPsi2LmInvT = np.swapaxes(psi2.dot(LmInv.T), 1, 2).dot(LmInv.T)
            LmInvSmuLmInvT = np.swapaxes(NS_mu.dot(LmInv.T), 1, 2).dot(LmInv.T)

            B = mupsi1Y + mupsi1Y.T + (Ds[:, None, None] * psi2).sum(0)
            tmp = backsub_both_sides(Lm, B, 'right')

            logL =  -ND*log_2_pi/2. +ND*np.log(beta)/2. - psi0/2. - YRY/2.  \
                       -(LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. +np.trace(tmp)/2.
        else:
            S_mu = S * D + tdot(mu)
            if uncertain_inputs:
                LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
            else:
                LmInvPsi2LmInvT = tdot(dtrtrs(
                    Lm, psi1.T)[0]) / beta  #tdot(psi1.dot(LmInv.T).T) /beta
            LmInvSmuLmInvT = backsub_both_sides(Lm, S_mu, 'right')

            B = mupsi1Y + mupsi1Y.T + D * psi2
            tmp = backsub_both_sides(Lm, B, 'right')

            logL =  -ND*log_2_pi/2. +ND*np.log(beta)/2. - psi0/2. - YRY/2.  \
                       -(LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. +np.trace(tmp)/2.

        #======================================================================
        # Compute dL_dKmm
        #======================================================================

        dL_dKmm = np.eye(M)

        #======================================================================
        # Compute dL_dthetaL for uncertain input and non-hetero noise
        #======================================================================

        dL_dthetaL = None  #(YRY*beta + beta*output_dim*psi0 - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2).sum() - beta*np.trace(LLinvPsi1TYYTPsi1LLinvT)

        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        if missing_data:
            dL_dpsi0 = -Ds * (beta * np.ones((N, ))) / 2.
        else:
            dL_dpsi0 = -D * (beta * np.ones((N, ))) / 2.

        if uncertain_outputs:
            Ym, Ys = Y.mean, Y.variance
            dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm,
                                         Ym.dot(mu.T).T)[0],
                              trans=1)[0].T * beta
        else:
            if missing_data:
                dL_dpsi1 = dtrtrs(
                    Lm, dtrtrs(Lm,
                               (Y_masked).dot(mu.T).T)[0], trans=1)[0].T * beta
            else:
                dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm,
                                             Y.dot(mu.T).T)[0],
                                  trans=1)[0].T * beta

        if uncertain_inputs:
            if missing_data:
                dL_dpsi2 = np.swapaxes(
                    (Ds[:, None, None] * np.eye(M)[None, :, :] -
                     LmInvSmuLmInvT).dot(LmInv), 1, 2).dot(LmInv) * beta / 2.
            else:
                dL_dpsi2 = beta * backsub_both_sides(
                    Lm,
                    D * np.eye(M) - LmInvSmuLmInvT, 'left') / 2.
        else:
            dL_dpsi1 += beta * psi1.dot(dL_dpsi2 + dL_dpsi2.T)
            dL_dpsi2 = None

        if uncertain_inputs:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dpsi0': dL_dpsi0,
                'dL_dpsi1': dL_dpsi1,
                'dL_dpsi2': dL_dpsi2,
                'dL_dthetaL': dL_dthetaL
            }
        else:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dKdiag': dL_dpsi0,
                'dL_dKnm': dL_dpsi1,
                'dL_dthetaL': dL_dthetaL
            }

        if uncertain_outputs:
            Ym = Y.mean
            grad_dict['dL_dYmean'] = -Ym * beta + dtrtrs(Lm, psi1.T)[0].T.dot(
                dtrtrs(Lm, mu)[0])
            grad_dict['dL_dYvar'] = beta / -2.

        return logL, grad_dict
Example #27
    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
        assert mean_function is None, "inference with a mean function not implemented"

        num_inducing, _ = Z.shape
        num_data, output_dim = Y.shape

        #make sure the noise is not hetero
        sigma_n = likelihood.gaussian_variance(Y_metadata)
        if sigma_n.size >1:
            raise NotImplementedError("no hetero noise with this implementation of PEP")

        Kmm = kern.K(Z)
        Knn = kern.Kdiag(X)
        Knm = kern.K(X, Z)
        U = Knm

        #factor Kmm
        diag.add(Kmm, self.const_jitter)
        Kmmi, L, Li, _ = pdinv(Kmm)

        #compute beta_star, the effective noise precision
        LiUT = np.dot(Li, U.T)
        sigma_star = sigma_n + self.alpha * (Knn - np.sum(np.square(LiUT),0))
        beta_star = 1./sigma_star

        # Compute and factor A
        A = tdot(LiUT*np.sqrt(beta_star)) + np.eye(num_inducing)
        LA = jitchol(A)

        # back substitute to get b, P, v
        URiy = np.dot(U.T*beta_star,Y)
        tmp, _ = dtrtrs(L, URiy, lower=1)
        b, _ = dtrtrs(LA, tmp, lower=1)
        tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
        v, _ = dtrtrs(L, tmp, lower=1, trans=1)
        tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
        P = tdot(tmp.T)

        alpha_const_term = (1.0-self.alpha) / self.alpha

        #compute log marginal
        log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
                       -np.sum(np.log(np.diag(LA)))*output_dim + \
                       0.5*output_dim*(1+alpha_const_term)*np.sum(np.log(beta_star)) + \
                       -0.5*np.sum(np.square(Y.T*np.sqrt(beta_star))) + \
                       0.5*np.sum(np.square(b)) + 0.5*alpha_const_term*num_data*np.log(sigma_n)
        #compute dL_dR
        Uv = np.dot(U, v)
        dL_dR = 0.5*(np.sum(U*np.dot(U,P), 1) - (1.0+alpha_const_term)/beta_star + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1) \
            + np.sum(np.square(Uv), 1))*beta_star**2 

        # Compute dL_dKmm
        vvT_P = tdot(v.reshape(-1,1)) + P
        dL_dK = 0.5*(Kmmi - vvT_P)
        KiU = np.dot(Kmmi, U.T)
        dL_dK += self.alpha * np.dot(KiU*dL_dR, KiU.T)

        # Compute dL_dU
        vY = np.dot(v.reshape(-1,1),Y.T)
        dL_dU = vY - np.dot(vvT_P, U.T)
        dL_dU *= beta_star
        dL_dU -= self.alpha * 2.*KiU*dL_dR

        dL_dthetaL = likelihood.exact_inference_gradients(dL_dR)
        dL_dthetaL += 0.5*alpha_const_term*num_data / sigma_n
        grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag':dL_dR * self.alpha, 'dL_dKnm':dL_dU.T, 'dL_dthetaL':dL_dthetaL}

        #construct a posterior object
        post = Posterior(woodbury_inv=Kmmi-P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=L)

        return post, log_marginal, grad_dict
Example #28
    def inference(self,
                  kern,
                  X,
                  Z,
                  likelihood,
                  Y,
                  qU_mean,
                  qU_var,
                  Kuu_sigma=None):
        """
        The SVI-VarDTC inference
        """

        N, D, M, Q = Y.shape[0], Y.shape[1], Z.shape[0], Z.shape[1]

        uncertain_inputs = isinstance(X, VariationalPosterior)
        uncertain_outputs = isinstance(Y, VariationalPosterior)

        beta = 1. / likelihood.variance

        psi0, psi2, YRY, psi1, psi1Y = self.gatherPsiStat(
            kern, X, Z, Y, beta, uncertain_inputs)

        #======================================================================
        # Compute Common Components
        #======================================================================

        Kuu = kern.K(Z).copy()
        if Kuu_sigma is not None:
            diag.add(Kuu, Kuu_sigma)
        else:
            diag.add(Kuu, self.const_jitter)
        Lm = jitchol(Kuu)

        mu, S = qU_mean, qU_var
        Ls = jitchol(S)
        LinvLs = dtrtrs(Lm, Ls)[0]
        Linvmu = dtrtrs(Lm, mu)[0]
        psi1YLinvT = dtrtrs(Lm, psi1Y.T)[0].T

        self.mid = {'qU_L': Ls, 'LinvLu': LinvLs, 'L': Lm, 'Linvmu': Linvmu}

        if uncertain_inputs:
            LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
        else:
            LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0]) / beta

        LmInvSmuLmInvT = tdot(LinvLs) * D + tdot(Linvmu)

        #         logdet_L = np.sum(np.log(np.diag(Lm)))
        #         logdet_S = np.sum(np.log(np.diag(Ls)))

        #======================================================================
        # Compute log-likelihood
        #======================================================================

        logL_R = -N * np.log(beta)
        logL = -N*D*log_2_pi/2. -D*logL_R/2. - D*psi0/2. - YRY/2.  \
                     -(LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. + np.trace(LmInvPsi2LmInvT)*D/2.+(Linvmu*psi1YLinvT.T).sum()

        #======================================================================
        # Compute dL_dKmm
        #======================================================================

        tmp1 = backsub_both_sides(Lm, LmInvSmuLmInvT.dot(LmInvPsi2LmInvT),
                                  'left')
        tmp2 = Linvmu.dot(psi1YLinvT)
        tmp3 = backsub_both_sides(Lm, -D * LmInvPsi2LmInvT - tmp2 - tmp2.T,
                                  'left') / 2.

        dL_dKmm = (tmp1 + tmp1.T) / 2. + tmp3

        #======================================================================
        # Compute dL_dthetaL for uncertain input and non-hetero noise
        #======================================================================

        dL_dthetaL = -D * N * beta / 2. - (
            -D * psi0 / 2. - YRY / 2. -
            (LmInvSmuLmInvT * LmInvPsi2LmInvT).sum() / 2. +
            np.trace(LmInvPsi2LmInvT) * D / 2. +
            (Linvmu * psi1YLinvT.T).sum()) * beta

        #======================================================================
        # Compute dL_dqU
        #======================================================================

        tmp1 = backsub_both_sides(Lm, -LmInvPsi2LmInvT, 'left')
        dL_dqU_mean = tmp1.dot(mu) + dtrtrs(Lm, psi1YLinvT.T, trans=1)[0]
        dL_dqU_var = D / 2. * tmp1

        #======================================================================
        # Compute the Posterior distribution of inducing points p(u|Y)
        #======================================================================

        KuuInvmu = dtrtrs(Lm, Linvmu, trans=1)[0]
        tmp = backsub_both_sides(Lm, np.eye(M) - tdot(LinvLs), 'left')

        post = Posterior(woodbury_inv=tmp,
                         woodbury_vector=KuuInvmu,
                         K=Kuu,
                         mean=mu,
                         cov=S,
                         K_chol=Lm)

        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        dL_dpsi0 = -D * (beta * np.ones((N, ))) / 2.

        if uncertain_outputs:
            dL_dpsi1 = Y.mean.dot(dtrtrs(Lm, Linvmu, trans=1)[0].T) * beta
        else:
            dL_dpsi1 = Y.dot(dtrtrs(Lm, Linvmu, trans=1)[0].T) * beta

        dL_dpsi2 = beta * backsub_both_sides(Lm,
                                             D * np.eye(M) - LmInvSmuLmInvT,
                                             'left') / 2.
        if not uncertain_inputs:
            dL_dpsi1 += psi1.dot(dL_dpsi2 + dL_dpsi2.T) / beta
            dL_dpsi2 = None

        if uncertain_inputs:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dpsi0': dL_dpsi0,
                'dL_dpsi1': dL_dpsi1,
                'dL_dpsi2': dL_dpsi2,
                'dL_dthetaL': dL_dthetaL,
                'dL_dqU_mean': dL_dqU_mean,
                'dL_dqU_var': dL_dqU_var
            }
        else:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dKdiag': dL_dpsi0,
                'dL_dKnm': dL_dpsi1,
                'dL_dthetaL': dL_dthetaL,
                'dL_dqU_mean': dL_dqU_mean,
                'dL_dqU_var': dL_dqU_var
            }

        if uncertain_outputs:
            m, s = Y.mean, Y.variance
            grad_dict['dL_dYmean'] = -m * beta + dtrtrs(Lm, psi1.T)[0].T.dot(
                dtrtrs(Lm, mu)[0])
            grad_dict['dL_dYvar'] = beta / -2.

        return post, logL, grad_dict
Example #29
 def _dot_product(self, X, X2=None):
     if X2 is None:
         return tdot(X)
     else:
         return np.dot(X, X2.T)
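Note: per the comment in Example #16, tdot(mat) simply returns np.dot(mat, mat.T), using a faster BLAS path for large 2D arrays of doubles. A minimal check of that equivalence (assumes GPy is installed):

    import numpy as np
    from GPy.util.linalg import tdot

    X = np.random.randn(50, 8)
    print(np.allclose(tdot(X), np.dot(X, X.T)))   # True: same Gram matrix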
Example #30
    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None, fixed_covs_kerns=None, **kw):

        _, output_dim = Y.shape
        uncertain_inputs = isinstance(X, VariationalPosterior)

        #see whether we've got a different noise variance for each datum
        beta = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)
        # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
        #self.YYTfactor = self.get_YYTfactor(Y)
        #VVT_factor = self.get_VVTfactor(self.YYTfactor, beta)
        het_noise = beta.size > 1

        if het_noise:
            raise NotImplementedError("Heteroscedastic noise not implemented, should be possible though, feel free to try implementing it :)")

        if beta.ndim == 1:
            beta = beta[:, None]


        # do the inference:
        num_inducing = Z.shape[0]
        num_data = Y.shape[0]
        # kernel computations, using BGPLVM notation

        Kmm = kern.K(Z).copy()
        diag.add(Kmm, self.const_jitter)
        if Lm is None:
            Lm = jitchol(Kmm)

        # The rather complex computations of A, and the psi stats
        if uncertain_inputs:
            psi0 = kern.psi0(Z, X)
            psi1 = kern.psi1(Z, X)
            if het_noise:
                psi2_beta = np.sum([kern.psi2(Z,X[i:i+1,:]) * beta_i for i,beta_i in enumerate(beta)],0)
            else:
                psi2_beta = kern.psi2(Z,X) * beta
            LmInv = dtrtri(Lm)
            A = LmInv.dot(psi2_beta.dot(LmInv.T))
        else:
            psi0 = kern.Kdiag(X)
            psi1 = kern.K(X, Z)
            if het_noise:
                tmp = psi1 * (np.sqrt(beta))
            else:
                tmp = psi1 * (np.sqrt(beta))
            tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
            A = tdot(tmp)

        # factor B
        B = np.eye(num_inducing) + A
        LB = jitchol(B)
        # back substitute C into psi1Vf
        #tmp, _ = dtrtrs(Lm, psi1.T.dot(VVT_factor), lower=1, trans=0)
        #_LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
        #tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
        #Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

        # data fit and derivative of L w.r.t. Kmm
        #delit = tdot(_LBi_Lmi_psi1Vf)

        # Expose YYT to get additional covariates in (YYT + Kgg):
        tmp, _ = dtrtrs(Lm, psi1.T, lower=1, trans=0)
        _LBi_Lmi_psi1, _ = dtrtrs(LB, tmp, lower=1, trans=0)
        tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1, lower=1, trans=1)
        Cpsi1, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

        # TODO: cache this:
        # Compute fixed covariates covariance:
        if fixed_covs_kerns is not None:
            K_fixed = 0
            for name, [cov, k] in fixed_covs_kerns.items():
                K_fixed += k.K(cov)

            #trYYT = self.get_trYYT(Y)
            YYT_covs = (tdot(Y) + K_fixed)
            data_term = beta**2 * YYT_covs
            trYYT_covs = np.trace(YYT_covs)
        else:
            data_term = beta**2 * tdot(Y)
            trYYT_covs = self.get_trYYT(Y)

        #trYYT = self.get_trYYT(Y)
        delit = mdot(_LBi_Lmi_psi1, data_term, _LBi_Lmi_psi1.T)
        data_fit = np.trace(delit)

        DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
        if dL_dKmm is None:
            delit = -0.5 * DBi_plus_BiPBi
            delit += -0.5 * B * output_dim
            delit += output_dim * np.eye(num_inducing)
            # Compute dL_dKmm
            dL_dKmm = backsub_both_sides(Lm, delit)

        # derivatives of L w.r.t. psi
        dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm,
            data_term, Cpsi1, DBi_plus_BiPBi,
            psi1, het_noise, uncertain_inputs)

        # log marginal likelihood
        log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise,
            psi0, A, LB, trYYT_covs, data_fit, Y)

        if self.save_per_dim:
            self.saved_vals = [psi0, A, LB, _LBi_Lmi_psi1, beta]

        # No heteroscedastics, so no _LBi_Lmi_psi1Vf:
        # For the interested reader, try implementing the heteroscedastic version, it should be possible
        _LBi_Lmi_psi1Vf = None # Is just here for documentation, so you can see, what it was.

        #noise derivatives
        dL_dR = _compute_dL_dR(likelihood,
            het_noise, uncertain_inputs, LB,
            _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A,
            psi0, psi1, beta,
            data_fit, num_data, output_dim, trYYT_covs, Y, None)

        dL_dthetaL = likelihood.exact_inference_gradients(dL_dR,Y_metadata)

        #put the gradients in the right places
        if uncertain_inputs:
            grad_dict = {'dL_dKmm': dL_dKmm,
                         'dL_dpsi0':dL_dpsi0,
                         'dL_dpsi1':dL_dpsi1,
                         'dL_dpsi2':dL_dpsi2,
                         'dL_dthetaL':dL_dthetaL}
        else:
            grad_dict = {'dL_dKmm': dL_dKmm,
                         'dL_dKdiag':dL_dpsi0,
                         'dL_dKnm':dL_dpsi1,
                         'dL_dthetaL':dL_dthetaL}

        if fixed_covs_kerns is not None:
            # For now, we do not take the gradients, we can compute them,
            # but the maximum likelihood solution is to switch off the additional covariates....
            dL_dcovs = beta * np.eye(K_fixed.shape[0]) - beta**2*tdot(_LBi_Lmi_psi1.T)
            grad_dict['dL_dcovs'] = -.5 * dL_dcovs

        #get sufficient things for posterior prediction
        #TODO: do we really want to do this in  the loop?
        if 1:
            woodbury_vector = (beta*Cpsi1).dot(Y)
        else:
            import ipdb; ipdb.set_trace()
            psi1V = np.dot(Y.T*beta, psi1).T
            tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
            tmp, _ = dpotrs(LB, tmp, lower=1)
            woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
        Bi, _ = dpotri(LB, lower=1)
        symmetrify(Bi)
        Bi = -dpotri(LB, lower=1)[0]
        diag.add(Bi, 1)

        woodbury_inv = backsub_both_sides(Lm, Bi)

        #construct a posterior object
        post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
        return post, log_marginal, grad_dict
Example #31
    def inference(self, kern_r, kern_c, Xr, Xc, Zr, Zc, likelihood, Y, qU_mean,
                  qU_var_r, qU_var_c, indexD, output_dim):
        """
        The SVI-VarDTC inference
        """

        N, D, Mr, Mc, Qr, Qc = Y.shape[0], output_dim, Zr.shape[0], Zc.shape[
            0], Zr.shape[1], Zc.shape[1]

        uncertain_inputs_r = isinstance(Xr, VariationalPosterior)
        uncertain_inputs_c = isinstance(Xc, VariationalPosterior)
        uncertain_outputs = isinstance(Y, VariationalPosterior)

        grad_dict = self._init_grad_dict(N, D, Mr, Mc)

        beta = 1. / likelihood.variance
        if len(beta) == 1:
            beta = np.zeros(D) + beta

        psi0_r, psi1_r, psi2_r = self.gatherPsiStat(kern_r, Xr, Zr,
                                                    uncertain_inputs_r)
        psi0_c, psi1_c, psi2_c = self.gatherPsiStat(kern_c, Xc, Zc,
                                                    uncertain_inputs_c)

        #======================================================================
        # Compute Common Components
        #======================================================================

        Kuu_r = kern_r.K(Zr).copy()
        diag.add(Kuu_r, self.const_jitter)
        Lr = jitchol(Kuu_r)

        Kuu_c = kern_c.K(Zc).copy()
        diag.add(Kuu_c, self.const_jitter)
        Lc = jitchol(Kuu_c)

        mu, Sr, Sc = qU_mean, qU_var_r, qU_var_c
        LSr = jitchol(Sr)
        LSc = jitchol(Sc)

        LcInvMLrInvT = dtrtrs(Lc, dtrtrs(Lr, mu.T)[0].T)[0]
        LcInvLSc = dtrtrs(Lc, LSc)[0]
        LrInvLSr = dtrtrs(Lr, LSr)[0]
        LcInvScLcInvT = tdot(LcInvLSc)
        LrInvSrLrInvT = tdot(LrInvLSr)
        tr_LrInvSrLrInvT = np.square(LrInvLSr).sum()
        tr_LcInvScLcInvT = np.square(LcInvLSc).sum()

        mid_res = {
            'psi0_r': psi0_r,
            'psi1_r': psi1_r,
            'psi2_r': psi2_r,
            'psi0_c': psi0_c,
            'psi1_c': psi1_c,
            'psi2_c': psi2_c,
            'Lr': Lr,
            'Lc': Lc,
            'LcInvMLrInvT': LcInvMLrInvT,
            'LcInvScLcInvT': LcInvScLcInvT,
            'LrInvSrLrInvT': LrInvSrLrInvT,
        }

        #======================================================================
        # Compute log-likelihood
        #======================================================================

        logL = 0.
        for d in range(D):
            logL += self.inference_d(d, beta, Y, indexD, grad_dict, mid_res,
                                     uncertain_inputs_r, uncertain_inputs_c,
                                     Mr, Mc)

        logL += -Mc * (np.log(np.diag(Lr)).sum()-np.log(np.diag(LSr)).sum())  -Mr * (np.log(np.diag(Lc)).sum()-np.log(np.diag(LSc)).sum()) \
               - np.square(LcInvMLrInvT).sum()/2. - tr_LrInvSrLrInvT * tr_LcInvScLcInvT/2. + Mr*Mc/2.

        #======================================================================
        # Compute dL_dKuu
        #======================================================================

        tmp = tdot(
            LcInvMLrInvT
        ) / 2. + tr_LrInvSrLrInvT / 2. * LcInvScLcInvT - Mr / 2. * np.eye(Mc)

        dL_dKuu_c = backsub_both_sides(Lc, tmp, 'left')
        dL_dKuu_c += dL_dKuu_c.T
        dL_dKuu_c *= 0.5

        tmp = tdot(
            LcInvMLrInvT.T
        ) / 2. + tr_LcInvScLcInvT / 2. * LrInvSrLrInvT - Mc / 2. * np.eye(Mr)

        dL_dKuu_r = backsub_both_sides(Lr, tmp, 'left')
        dL_dKuu_r += dL_dKuu_r.T
        dL_dKuu_r *= 0.5

        #======================================================================
        # Compute dL_dqU
        #======================================================================

        tmp = -LcInvMLrInvT
        dL_dqU_mean = dtrtrs(Lc, dtrtrs(Lr, tmp.T, trans=1)[0].T, trans=1)[0]

        LScInv = dtrtri(LSc)
        tmp = -tr_LrInvSrLrInvT / 2. * np.eye(Mc)
        dL_dqU_var_c = backsub_both_sides(Lc, tmp,
                                          'left') + tdot(LScInv.T) * Mr / 2.

        LSrInv = dtrtri(LSr)
        tmp = -tr_LcInvScLcInvT / 2. * np.eye(Mr)
        dL_dqU_var_r = backsub_both_sides(Lr, tmp,
                                          'left') + tdot(LSrInv.T) * Mc / 2.

        #======================================================================
        # Compute the Posterior distribution of inducing points p(u|Y)
        #======================================================================

        post = PosteriorMultioutput(LcInvMLrInvT=LcInvMLrInvT,
                                    LcInvScLcInvT=LcInvScLcInvT,
                                    LrInvSrLrInvT=LrInvSrLrInvT,
                                    Lr=Lr,
                                    Lc=Lc,
                                    kern_r=kern_r,
                                    Xr=Xr,
                                    Zr=Zr)

        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        grad_dict['dL_dqU_mean'] += dL_dqU_mean
        grad_dict['dL_dqU_var_c'] += dL_dqU_var_c
        grad_dict['dL_dqU_var_r'] += dL_dqU_var_r
        grad_dict['dL_dKuu_c'] += dL_dKuu_c
        grad_dict['dL_dKuu_r'] += dL_dKuu_r

        if not uncertain_inputs_c:
            grad_dict['dL_dKdiag_c'] = grad_dict['dL_dpsi0_c']
            grad_dict['dL_dKfu_c'] = grad_dict['dL_dpsi1_c']

        if not uncertain_inputs_r:
            grad_dict['dL_dKdiag_r'] = grad_dict['dL_dpsi0_r']
            grad_dict['dL_dKfu_r'] = grad_dict['dL_dpsi1_r']

        return post, logL, grad_dict
Example #32
    def _inference_vardtc(self):
        if self.svi:
            from GPy.util.linalg import tdot
            self.qU_var = tdot(self.qU_W)+np.eye(self.Z.shape[0])*self.qU_a
            self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y, self.qU_mean , self.qU_var, Kuu_sigma=self.Kuu_sigma)
            
            if self.mpi_comm is None or (self.mpi_comm is not None and self.mpi_comm.rank==self.mpi_root):
                KL, dKL_dqU_mean, dKL_dqU_var, dKL_dKuu = self.inference_method.comp_KL_qU(self.qU_mean ,self.qU_var)
                self._log_marginal_likelihood += -KL*self.qU_ratio        
                self.grad_dict['dL_dqU_mean'] += -dKL_dqU_mean*self.qU_ratio
                self.grad_dict['dL_dqU_var'] += -dKL_dqU_var*self.qU_ratio
                self.grad_dict['dL_dKmm'] += -dKL_dKuu*self.qU_ratio
        else:
            self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata, Kuu_sigma=self.Kuu_sigma)

        self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
        dL_dKmm = self.grad_dict['dL_dKmm']
        if self.mpi_comm is None or (self.mpi_comm is not None and self.mpi_comm.rank==self.mpi_root):
            self.Kuu_sigma.gradient = np.diag(dL_dKmm)

        if isinstance(self.X, VariationalPosterior):
            #gradients wrt kernel
            
            if self.psicov:
                self.kern.update_gradients_expectations_psicov(variational_posterior=self.X,
                                                        Z=self.Z,
                                                        dL_dpsi0=self.grad_dict['dL_dpsi0'],
                                                        dL_dpsi1=self.grad_dict['dL_dpsi1'],
                                                        dL_dpsicov=self.grad_dict['dL_dpsicov'])
            else:
                self.kern.update_gradients_expectations(variational_posterior=self.X,
                                                        Z=self.Z,
                                                        dL_dpsi0=self.grad_dict['dL_dpsi0'],
                                                        dL_dpsi1=self.grad_dict['dL_dpsi1'],
                                                        dL_dpsi2=self.grad_dict['dL_dpsi2'])
            kerngrad = self.kern.gradient.copy()
            if self.mpi_comm is None:
                self.kern.update_gradients_full(dL_dKmm, self.Z, None)
                kerngrad += self.kern.gradient.copy()
                self.kern.gradient = kerngrad
            else:
                kerngrad = reduceArrays([kerngrad], self.mpi_comm, self.mpi_root)[0]
                if self.mpi_comm.rank==self.mpi_root:
                    self.kern.update_gradients_full(dL_dKmm, self.Z, None)
                    kerngrad += self.kern.gradient.copy()
                    self.kern.gradient = kerngrad

            #gradients wrt Z
            if self.psicov:
                self.Z.gradient = self.kern.gradients_Z_expectations_psicov(
                                   self.grad_dict['dL_dpsi0'],
                                   self.grad_dict['dL_dpsi1'],
                                   self.grad_dict['dL_dpsicov'],
                                   Z=self.Z,
                                   variational_posterior=self.X)
            else:
                self.Z.gradient = self.kern.gradients_Z_expectations(
                                   self.grad_dict['dL_dpsi0'],
                                   self.grad_dict['dL_dpsi1'],
                                   self.grad_dict['dL_dpsi2'],
                                   Z=self.Z,
                                   variational_posterior=self.X)
            if self.mpi_comm is None:
                self.Z.gradient += self.kern.gradients_X(dL_dKmm, self.Z)
            else:
                self.Z.gradient =  reduceArrays([self.Z.gradient], self.mpi_comm, self.mpi_root)[0]
                if self.mpi_comm.rank == self.mpi_root:
                    self.Z.gradient += self.kern.gradients_X(dL_dKmm, self.Z)
        else:
            #gradients wrt kernel
            self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X)
            kerngrad = self.kern.gradient.copy()
            self.kern.update_gradients_full(self.grad_dict['dL_dKnm'], self.X, self.Z)
            kerngrad += self.kern.gradient
            if self.mpi_comm is None:
                self.kern.update_gradients_full(dL_dKmm, self.Z, None)
                self.kern.gradient += kerngrad
            else:
                kerngrad = reduceArrays([kerngrad], self.mpi_comm, self.mpi_root)[0]
                if self.mpi_comm.rank==self.mpi_root:
                    self.kern.update_gradients_full(dL_dKmm, self.Z, None)
                    kerngrad += self.kern.gradient.copy()
                    self.kern.gradient = kerngrad
            #gradients wrt Z
            self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
            if self.mpi_comm is None:
                self.Z.gradient += self.kern.gradients_X(dL_dKmm, self.Z)
            else:
                self.Z.gradient =  reduceArrays([self.Z.gradient], self.mpi_comm, self.mpi_root)[0]
                if self.mpi_comm.rank == self.mpi_root:
                    self.Z.gradient += self.kern.gradients_X(dL_dKmm, self.Z)

        if self.svi:
            self.qU_mean.gradient = self.grad_dict['dL_dqU_mean']
            self.qU_W.gradient = (self.grad_dict['dL_dqU_var']+self.grad_dict['dL_dqU_var'].T).dot(self.qU_W)
            self.qU_a.gradient = np.diag(self.grad_dict['dL_dqU_var']).sum()
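The last three lines above rely on the chain rule for the parameterization qU_var = qU_W·qU_Wᵀ + qU_a·I. A minimal numpy sketch (a toy objective of my own, not GPy code) that checks those two gradient formulas numerically:

import numpy as np

M = 4
rng = np.random.default_rng(0)
W = rng.normal(size=(M, 2))
a = 0.5
A = rng.normal(size=(M, M))             # fixed matrix defining a toy objective

def loss(W, a):
    S = W.dot(W.T) + a * np.eye(M)      # same reparameterization as qU_var above
    return (A * S).sum()                # L = trace(A.T S), so dL/dS = A

dL_dS = A
grad_W = (dL_dS + dL_dS.T).dot(W)       # mirrors qU_W.gradient above
grad_a = np.diag(dL_dS).sum()           # mirrors qU_a.gradient above

eps, dW = 1e-6, np.zeros_like(W)
dW[0, 0] = eps
print(np.isclose((loss(W + dW, a) - loss(W - dW, a)) / (2 * eps), grad_W[0, 0]))  # True
print(np.isclose((loss(W, a + eps) - loss(W, a - eps)) / (2 * eps), grad_a))      # True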
Example #33
0
    def inference(self,
                  kern,
                  X,
                  Z,
                  likelihood,
                  Y,
                  Y_metadata=None,
                  Lm=None,
                  dL_dKmm=None):
        """
        The first phase of inference:
        Compute: log-likelihood, dL_dKmm

        Cached intermediate results: Kmm, KmmInv,
        """

        num_data, output_dim = Y.shape
        input_dim = Z.shape[0]

        uncertain_inputs = isinstance(X, VariationalPosterior)
        uncertain_outputs = isinstance(Y, VariationalPosterior)

        beta = 1. / np.fmax(likelihood.variance, 1e-6)

        psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat(
            kern, X, Z, Y, beta, uncertain_inputs)

        #======================================================================
        # Compute Common Components
        #======================================================================

        Kmm = kern.K(Z).copy()
        diag.add(Kmm, self.const_jitter)
        Lm = jitchol(Kmm)

        #LmInv = dtrtri(Lm)
        if uncertain_inputs:
            LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
        else:
            LmInvPsi2LmInvT = tdot(dtrtrs(
                Lm, psi1.T)[0]) / beta  #tdot(psi1.dot(LmInv.T).T) /beta

        Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
        LL = jitchol(Lambda)
        LmLL = Lm.dot(LL)
        #        LLInv = dtrtri(LL)
        #       LmLLInv = LLInv.dot(LmInv)

        logdet_L = 2. * np.sum(np.log(np.diag(LL)))
        b = dtrtrs(LmLL, psi1Y.T)[0].T  #psi1Y.dot(LmLLInv.T)
        bbt = np.square(b).sum()
        v = dtrtrs(LmLL, b.T, trans=1)[0].T  #b.dot(LmLLInv)
        LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

        if psi1S is not None:
            psi1SLLinv = dtrtrs(LmLL, psi1S.T)[0].T  #psi1S.dot(LmLLInv.T)
            bbt += np.square(psi1SLLinv).sum()
            LLinvPsi1TYYTPsi1LLinvT += tdot(psi1SLLinv.T)
            psi1SP = dtrtrs(LmLL, psi1SLLinv.T,
                            trans=1)[0].T  #psi1SLLinv.dot(LmLLInv)
        tmp = -backsub_both_sides(
            LL, LLinvPsi1TYYTPsi1LLinvT + output_dim * np.eye(input_dim))
        dL_dpsi2R = backsub_both_sides(
            Lm, tmp + output_dim * np.eye(input_dim)) / 2
        #tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT+output_dim*np.eye(input_dim)).dot(LLInv)
        #dL_dpsi2R = LmInv.T.dot(tmp+output_dim*np.eye(input_dim)).dot(LmInv)/2.

        #======================================================================
        # Compute log-likelihood
        #======================================================================
        logL_R = -num_data * np.log(beta)
        logL = -(
            output_dim *
            (num_data * log_2_pi + logL_R + psi0 - np.trace(LmInvPsi2LmInvT)) +
            YRY - bbt) / 2. - output_dim * logdet_L / 2.

        #======================================================================
        # Compute dL_dKmm
        #======================================================================

        dL_dKmm = dL_dpsi2R - output_dim * backsub_both_sides(
            Lm,
            LmInvPsi2LmInvT) / 2  #LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.

        #======================================================================
        # Compute the Posterior distribution of inducing points p(u|Y)
        #======================================================================

        wd_inv = backsub_both_sides(
            Lm,
            np.eye(input_dim) -
            backsub_both_sides(LL, np.identity(input_dim), transpose='left'),
            transpose='left')
        post = Posterior(woodbury_inv=wd_inv,
                         woodbury_vector=v.T,
                         K=Kmm,
                         mean=None,
                         cov=None,
                         K_chol=Lm)

        #======================================================================
        # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
        #======================================================================

        dL_dthetaL = (YRY * beta + beta * output_dim * psi0 - num_data * output_dim * beta) / 2. \
                     - beta * (dL_dpsi2R * psi2).sum() \
                     - beta * np.trace(LLinvPsi1TYYTPsi1LLinvT)

        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        dL_dpsi0 = -output_dim * (beta * np.ones((num_data, ))) / 2.

        if uncertain_outputs:
            m, s = Y.mean, Y.variance
            dL_dpsi1 = beta * (np.dot(m, v) + Shalf[:, None] * psi1SP)
        else:
            dL_dpsi1 = beta * np.dot(Y, v)

        if uncertain_inputs:
            dL_dpsi2 = beta * dL_dpsi2R
        else:
            dL_dpsi1 += np.dot(psi1, dL_dpsi2R) * 2.
            dL_dpsi2 = None

        if uncertain_inputs:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dpsi0': dL_dpsi0,
                'dL_dpsi1': dL_dpsi1,
                'dL_dpsi2': dL_dpsi2,
                'dL_dthetaL': dL_dthetaL
            }
        else:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dKdiag': dL_dpsi0,
                'dL_dKnm': dL_dpsi1,
                'dL_dthetaL': dL_dthetaL
            }

        if uncertain_outputs:
            m, s = Y.mean, Y.variance
            psi1LmiLLi = dtrtrs(LmLL, psi1.T)[0].T  #psi1.dot(LmLLInv.T)
            LLiLmipsi1Y = b.T
            grad_dict['dL_dYmean'] = -m * beta + psi1LmiLLi.dot(LLiLmipsi1Y)
            grad_dict['dL_dYvar'] = beta / -2. + np.square(psi1LmiLLi).sum(
                axis=1) / 2

        return post, logL, grad_dict
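These snippets lean on a handful of GPy.util.linalg helpers. As a reading aid, here is a numpy/scipy reference for what I take them to compute (an assumption about GPy's semantics, not a quote of its source): tdot(A) = A·Aᵀ, jitchol(K) is a lower Cholesky factor of K (adding jitter if needed), dtrtrs(L, B)[0] solves L·x = B (trans=1 solves Lᵀ·x = B), and backsub_both_sides(L, X, 'right') = L⁻¹·X·L⁻ᵀ while 'left' gives L⁻ᵀ·X·L⁻¹.

import numpy as np
from scipy.linalg import cholesky, solve_triangular

rng = np.random.default_rng(1)
A = rng.normal(size=(5, 3))
K = A.dot(A.T) + 1e-6 * np.eye(5)                    # a positive-definite matrix
L = cholesky(K, lower=True)                          # stands in for jitchol(K)

def backsub_right(L, X):                             # assumed equivalent of backsub_both_sides(L, X, 'right')
    tmp = solve_triangular(L, X, lower=True)         # L^{-1} X
    return solve_triangular(L, tmp.T, lower=True).T  # L^{-1} X L^{-T}

# sanity check: whitening K with its own Cholesky factor gives the identity
print(np.allclose(backsub_right(L, K), np.eye(5)))   # True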
Example #34
0
    def inference(self, kern, X, Z, likelihood, Y, qU_mean ,qU_var, Kuu_sigma=None):
        """
        The SVI-VarDTC inference
        """

        N, D, M, Q = Y.shape[0], Y.shape[1], Z.shape[0], Z.shape[1]

        uncertain_inputs = isinstance(X, VariationalPosterior)
        uncertain_outputs = isinstance(Y, VariationalPosterior)

        beta = 1./likelihood.variance

        psi0, psi2, YRY, psi1, psi1Y = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)
        
        #======================================================================
        # Compute Common Components
        #======================================================================

        Kuu = kern.K(Z).copy()
        if Kuu_sigma is not None:
            diag.add(Kuu, Kuu_sigma)
        else:
            diag.add(Kuu, self.const_jitter)
        Lm = jitchol(Kuu)
        
        mu, S = qU_mean, qU_var
        Ls = jitchol(S)
        LinvLs = dtrtrs(Lm, Ls)[0]
        Linvmu = dtrtrs(Lm, mu)[0]
        psi1YLinvT = dtrtrs(Lm,psi1Y.T)[0].T
        
        self.mid = {
                    'qU_L': Ls,
                    'LinvLu': LinvLs,
                    'L':Lm,
                    'Linvmu': Linvmu}
        
        if uncertain_inputs:
            LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
        else:
            LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0])/beta 
        
        LmInvSmuLmInvT = tdot(LinvLs)*D+tdot(Linvmu)
        
#         logdet_L = np.sum(np.log(np.diag(Lm)))
#         logdet_S = np.sum(np.log(np.diag(Ls)))
        
        #======================================================================
        # Compute log-likelihood
        #======================================================================
        
        logL_R = -N*np.log(beta)
        logL = -N*D*log_2_pi/2. -D*logL_R/2. - D*psi0/2. - YRY/2.  \
                     -(LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. + np.trace(LmInvPsi2LmInvT)*D/2.+(Linvmu*psi1YLinvT.T).sum()
                
        #======================================================================
        # Compute dL_dKmm
        #======================================================================

        tmp1 = backsub_both_sides(Lm,LmInvSmuLmInvT.dot(LmInvPsi2LmInvT), 'left')
        tmp2 = Linvmu.dot(psi1YLinvT)
        tmp3 = backsub_both_sides(Lm,  - D*LmInvPsi2LmInvT  -tmp2-tmp2.T, 'left')/2.

        dL_dKmm = (tmp1+tmp1.T)/2. + tmp3

        #======================================================================
        # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
        #======================================================================

        dL_dthetaL = -D*N*beta/2. -(- D*psi0/2. - YRY/2.-(LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. + np.trace(LmInvPsi2LmInvT)*D/2.+(Linvmu*psi1YLinvT.T).sum())*beta
        
        #======================================================================
        # Compute dL_dqU
        #======================================================================
        
        tmp1 = backsub_both_sides(Lm, - LmInvPsi2LmInvT, 'left')
        dL_dqU_mean = tmp1.dot(mu) + dtrtrs(Lm, psi1YLinvT.T,trans=1)[0]
        dL_dqU_var = D/2.*tmp1
        
        #======================================================================
        # Compute the Posterior distribution of inducing points p(u|Y)
        #======================================================================

        KuuInvmu = dtrtrs(Lm, Linvmu, trans=1)[0]
        tmp = backsub_both_sides(Lm,  np.eye(M) - tdot(LinvLs), 'left')

        post = Posterior(woodbury_inv=tmp, woodbury_vector=KuuInvmu, K=Kuu, mean=mu, cov=S, K_chol=Lm)
        
        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        dL_dpsi0 = -D * (beta * np.ones((N,)))/2.

        if uncertain_outputs:
            dL_dpsi1 = Y.mean.dot(dtrtrs(Lm,Linvmu,trans=1)[0].T)*beta
        else:
            dL_dpsi1 = Y.dot(dtrtrs(Lm,Linvmu,trans=1)[0].T)*beta

        dL_dpsi2 = beta*backsub_both_sides(Lm, D*np.eye(M)-LmInvSmuLmInvT, 'left')/2.
        if not uncertain_inputs:
            dL_dpsi1 += psi1.dot(dL_dpsi2+dL_dpsi2.T)/beta
            dL_dpsi2 = None
            
        if uncertain_inputs:
            grad_dict = {'dL_dKmm': dL_dKmm,
                         'dL_dpsi0':dL_dpsi0,
                         'dL_dpsi1':dL_dpsi1,
                         'dL_dpsi2':dL_dpsi2,
                         'dL_dthetaL':dL_dthetaL,
                         'dL_dqU_mean':dL_dqU_mean,
                         'dL_dqU_var':dL_dqU_var}
        else:
            grad_dict = {'dL_dKmm': dL_dKmm,
                         'dL_dKdiag':dL_dpsi0,
                         'dL_dKnm':dL_dpsi1,
                         'dL_dthetaL':dL_dthetaL,
                         'dL_dqU_mean':dL_dqU_mean,
                         'dL_dqU_var':dL_dqU_var}

        if uncertain_outputs:
            m,s = Y.mean, Y.variance
            grad_dict['dL_dYmean'] = -m*beta+ dtrtrs(Lm,psi1.T)[0].T.dot(dtrtrs(Lm,mu)[0])
            grad_dict['dL_dYvar'] = beta/-2.

        return post, logL, grad_dict
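For reference, the posterior constructed above stores the standard sparse-GP "Woodbury" quantities. A minimal dense-numpy sketch (assuming the usual variational sparse-GP identities) of what tmp and KuuInvmu amount to:

import numpy as np

M, D = 3, 2
rng = np.random.default_rng(2)
B = rng.normal(size=(M, M)); Kuu = B.dot(B.T) + 1e-6 * np.eye(M)
W = rng.normal(size=(M, M)); S = W.dot(W.T) + 1e-6 * np.eye(M)   # q(u) covariance
mu = rng.normal(size=(M, D))                                     # q(u) mean

Kuu_inv = np.linalg.inv(Kuu)
woodbury_vector = Kuu_inv.dot(mu)                      # = dtrtrs(Lm, Linvmu, trans=1)[0]
woodbury_inv = Kuu_inv - Kuu_inv.dot(S).dot(Kuu_inv)   # = backsub_both_sides(Lm, I - tdot(LinvLs), 'left')

# With these, predictions at test points X* take the usual sparse-GP form:
#   mean = Ksu.dot(woodbury_vector)
#   var  = Kss - Ksu.dot(woodbury_inv).dot(Ksu.T)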
Example #35
0
    def inference_root(self, kern, X, Z, likelihood, Y, Kuu_sigma=None, Y_metadata=None, Lm=None, dL_dKmm=None):
        """
        The first phase of inference:
        Compute: log-likelihood, dL_dKmm

        Cached intermediate results: Kmm, KmmInv,
        """

        num_data, output_dim = Y.shape
        input_dim = Z.shape[0]
        num_data_total = allReduceArrays([np.int32(num_data)], self.mpi_comm)[0]

        uncertain_inputs = isinstance(X, VariationalPosterior)
        uncertain_outputs = isinstance(Y, VariationalPosterior)

        beta = 1./np.fmax(likelihood.variance, 1e-6)

        psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)

        #======================================================================
        # Compute Common Components
        #======================================================================

        try:
            Kmm = kern.K(Z).copy()
            if Kuu_sigma is not None:
                diag.add(Kmm, Kuu_sigma)
            else:
                diag.add(Kmm, self.const_jitter)
            Lm = jitchol(Kmm)
    
            LmInv = dtrtri(Lm)
            LmInvPsi2LmInvT = LmInv.dot(psi2.dot(LmInv.T))
                
            Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT
            LL = jitchol(Lambda)        
            LLInv = dtrtri(LL)
            flag = np.zeros((1,),dtype=np.int32)
            self.mpi_comm.Bcast(flag,root=self.root)
        except LinAlgError as e:
            flag = np.ones((1,),dtype=np.int32)
            self.mpi_comm.Bcast(flag,root=self.root)
            raise e
            
        broadcastArrays([LmInv, LLInv],self.mpi_comm,  self.root)
        LmLLInv = LLInv.dot(LmInv)
        
        logdet_L = 2.*np.sum(np.log(np.diag(LL)))
        b  = psi1Y.dot(LmLLInv.T)
        bbt = np.square(b).sum()
        v = b.dot(LmLLInv)
        LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)
        
        if psi1S is not None:
            psi1SLLinv = psi1S.dot(LmLLInv.T)
            bbt_sum = np.square(psi1SLLinv).sum()
            LLinvPsi1TYYTPsi1LLinvT_sum = tdot(psi1SLLinv.T)
            bbt_sum, LLinvPsi1TYYTPsi1LLinvT_sum = reduceArrays([bbt_sum,  LLinvPsi1TYYTPsi1LLinvT_sum], self.mpi_comm, self.root)
            bbt += bbt_sum
            LLinvPsi1TYYTPsi1LLinvT += LLinvPsi1TYYTPsi1LLinvT_sum
            psi1SP = psi1SLLinv.dot(LmLLInv)
        tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT+output_dim*np.eye(input_dim)).dot(LLInv)
        dL_dpsi2R = LmInv.T.dot(tmp+output_dim*np.eye(input_dim)).dot(LmInv)/2.
        broadcastArrays([dL_dpsi2R], self.mpi_comm, self.root)

        #======================================================================
        # Compute log-likelihood
        #======================================================================
        logL_R = -num_data_total*np.log(beta)
        logL = -(output_dim*(num_data_total*log_2_pi+logL_R+psi0-np.trace(LmInvPsi2LmInvT))+YRY- bbt)/2.-output_dim*logdet_L/2.

        #======================================================================
        # Compute dL_dKmm
        #======================================================================

        dL_dKmm =  dL_dpsi2R - output_dim* LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.

        #======================================================================
        # Compute the Posterior distribution of inducing points p(u|Y)
        #======================================================================

        wd_inv = backsub_both_sides(Lm, np.eye(input_dim)- backsub_both_sides(LL, np.identity(input_dim), transpose='left'), transpose='left')
        post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v.T, K=Kmm, mean=None, cov=None, K_chol=Lm)

        #======================================================================
        # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
        #======================================================================

        dL_dthetaL = (YRY*beta + beta*output_dim*psi0 - num_data_total*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2).sum() - beta*np.trace(LLinvPsi1TYYTPsi1LLinvT)
        
        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        dL_dpsi0 = -output_dim * (beta * np.ones((num_data,)))/2.

        if uncertain_outputs:
            m,s = Y.mean, Y.variance
            dL_dpsi1 = beta*(np.dot(m,v)+Shalf[:,None]*psi1SP)
        else:
            dL_dpsi1 = beta*np.dot(Y,v)

        if uncertain_inputs:
            dL_dpsi2 = beta* dL_dpsi2R
        else:
            dL_dpsi1 += np.dot(psi1,dL_dpsi2R)*2.
            dL_dpsi2 = None
        
        if uncertain_inputs:
            grad_dict = {'dL_dKmm': dL_dKmm,
                         'dL_dpsi0':dL_dpsi0,
                         'dL_dpsi1':dL_dpsi1,
                         'dL_dpsi2':dL_dpsi2,
                         'dL_dthetaL':dL_dthetaL}
        else:
            grad_dict = {'dL_dKmm': dL_dKmm,
                         'dL_dKdiag':dL_dpsi0,
                         'dL_dKnm':dL_dpsi1,
                         'dL_dthetaL':dL_dthetaL}
            
        if uncertain_outputs:
            m,s = Y.mean, Y.variance
            psi1LmiLLi = psi1.dot(LmLLInv.T)
            LLiLmipsi1Y = b.T
            grad_dict['dL_dYmean'] = -m*beta+ psi1LmiLLi.dot(LLiLmipsi1Y)
            grad_dict['dL_dYvar'] = beta/-2.+ np.square(psi1LmiLLi).sum(axis=1)/2

        return post, logL, grad_dict
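The try/except block at the top of this snippet implements a success/failure handshake: the root rank broadcasts a flag (0 on success, 1 on a LinAlgError) so that non-root ranks (see inference_nonroot in Example #43 below) either carry on with the broadcast factors or abort. A minimal mpi4py sketch of that pattern (my own toy, assuming mpi4py; run with e.g. mpiexec -n 2 python demo.py):

import numpy as np
from mpi4py import MPI
from numpy.linalg import LinAlgError, cholesky

comm = MPI.COMM_WORLD
root = 0
flag = np.zeros(1, dtype=np.int32)

if comm.rank == root:
    try:
        K = np.array([[2., 1.], [1., 2.]])
        L = cholesky(K)                    # may raise LinAlgError
        comm.Bcast(flag, root=root)        # 0 => success, workers keep going
        comm.Bcast(L, root=root)
    except LinAlgError:
        flag[:] = 1
        comm.Bcast(flag, root=root)        # 1 => failure, workers bail out too
        raise
else:
    L = np.empty((2, 2))
    comm.Bcast(flag, root=root)
    if flag[0] == 1:
        raise LinAlgError('Cholesky failed on the root rank')
    comm.Bcast(L, root=root)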
Example #36
0
    def inference(self, kern_r, kern_c, Xr, Xc, Zr, Zc, likelihood, Y, qU_mean,
                  qU_var_r, qU_var_c):
        """
        The SVI-VarDTC inference
        """

        N, D, Mr, Mc, Qr, Qc = Y.shape[0], Y.shape[1], Zr.shape[0], Zc.shape[
            0], Zr.shape[1], Zc.shape[1]

        uncertain_inputs_r = isinstance(Xr, VariationalPosterior)
        uncertain_inputs_c = isinstance(Xc, VariationalPosterior)
        uncertain_outputs = isinstance(Y, VariationalPosterior)

        beta = 1. / likelihood.variance

        psi0_r, psi1_r, psi2_r = self.gatherPsiStat(kern_r, Xr, Zr,
                                                    uncertain_inputs_r)
        psi0_c, psi1_c, psi2_c = self.gatherPsiStat(kern_c, Xc, Zc,
                                                    uncertain_inputs_c)

        #======================================================================
        # Compute Common Components
        #======================================================================

        Kuu_r = kern_r.K(Zr).copy()
        diag.add(Kuu_r, self.const_jitter)
        Lr = jitchol(Kuu_r)

        Kuu_c = kern_c.K(Zc).copy()
        diag.add(Kuu_c, self.const_jitter)
        Lc = jitchol(Kuu_c)

        mu, Sr, Sc = qU_mean, qU_var_r, qU_var_c
        LSr = jitchol(Sr)
        LSc = jitchol(Sc)

        LcInvMLrInvT = dtrtrs(Lc, dtrtrs(Lr, mu.T)[0].T)[0]
        LcInvPsi2_cLcInvT = backsub_both_sides(Lc, psi2_c, 'right')
        LrInvPsi2_rLrInvT = backsub_both_sides(Lr, psi2_r, 'right')
        LcInvLSc = dtrtrs(Lc, LSc)[0]
        LrInvLSr = dtrtrs(Lr, LSr)[0]
        LcInvScLcInvT = tdot(LcInvLSc)
        LrInvSrLrInvT = tdot(LrInvLSr)
        LcInvPsi1_cT = dtrtrs(Lc, psi1_c.T)[0]
        LrInvPsi1_rT = dtrtrs(Lr, psi1_r.T)[0]

        tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT = (LrInvPsi2_rLrInvT *
                                              LrInvSrLrInvT).sum()
        tr_LcInvPsi2_cLcInvT_LcInvScLcInvT = (LcInvPsi2_cLcInvT *
                                              LcInvScLcInvT).sum()
        tr_LrInvSrLrInvT = np.square(LrInvLSr).sum()
        tr_LcInvScLcInvT = np.square(LcInvLSc).sum()
        tr_LrInvPsi2_rLrInvT = np.trace(LrInvPsi2_rLrInvT)
        tr_LcInvPsi2_cLcInvT = np.trace(LcInvPsi2_cLcInvT)

        #======================================================================
        # Compute log-likelihood
        #======================================================================

        logL_A = - np.square(Y).sum() \
               - (LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT)*LrInvPsi2_rLrInvT).sum() \
               -  tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT* tr_LcInvPsi2_cLcInvT_LcInvScLcInvT \
               + 2 * (Y * LcInvPsi1_cT.T.dot(LcInvMLrInvT).dot(LrInvPsi1_rT)).sum() - psi0_c * psi0_r \
               + tr_LrInvPsi2_rLrInvT * tr_LcInvPsi2_cLcInvT

        logL = -N*D/2.*(np.log(2.*np.pi)-np.log(beta)) + beta/2.* logL_A \
               -Mc * (np.log(np.diag(Lr)).sum()-np.log(np.diag(LSr)).sum())  -Mr * (np.log(np.diag(Lc)).sum()-np.log(np.diag(LSc)).sum()) \
               - np.square(LcInvMLrInvT).sum()/2. - tr_LrInvSrLrInvT * tr_LcInvScLcInvT/2. + Mr*Mc/2.

        #======================================================================
        # Compute dL_dKuu
        #======================================================================

        tmp =  beta* LcInvPsi2_cLcInvT.dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT).dot(LcInvMLrInvT.T) \
             + beta* tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvPsi2_cLcInvT.dot(LcInvScLcInvT) \
             - beta* LcInvMLrInvT.dot(LrInvPsi1_rT).dot(Y.T).dot(LcInvPsi1_cT.T) \
             - beta/2. * tr_LrInvPsi2_rLrInvT* LcInvPsi2_cLcInvT - Mr/2.*np.eye(Mc) \
             + tdot(LcInvMLrInvT)/2. + tr_LrInvSrLrInvT/2. * LcInvScLcInvT

        dL_dKuu_c = backsub_both_sides(Lc, tmp, 'left')
        dL_dKuu_c += dL_dKuu_c.T
        dL_dKuu_c *= 0.5

        tmp =  beta* LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT) \
             + beta* tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvPsi2_rLrInvT.dot(LrInvSrLrInvT) \
             - beta* LrInvPsi1_rT.dot(Y.T).dot(LcInvPsi1_cT.T).dot(LcInvMLrInvT) \
             - beta/2. * tr_LcInvPsi2_cLcInvT * LrInvPsi2_rLrInvT - Mc/2.*np.eye(Mr) \
             + tdot(LcInvMLrInvT.T)/2. + tr_LcInvScLcInvT/2. * LrInvSrLrInvT

        dL_dKuu_r = backsub_both_sides(Lr, tmp, 'left')
        dL_dKuu_r += dL_dKuu_r.T
        dL_dKuu_r *= 0.5

        #======================================================================
        # Compute dL_dthetaL
        #======================================================================

        dL_dthetaL = -D * N * beta / 2. - logL_A * beta * beta / 2.

        #======================================================================
        # Compute dL_dqU
        #======================================================================

        tmp = -beta * LcInvPsi2_cLcInvT.dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT)\
              + beta* LcInvPsi1_cT.dot(Y).dot(LrInvPsi1_rT.T) - LcInvMLrInvT

        dL_dqU_mean = dtrtrs(Lc, dtrtrs(Lr, tmp.T, trans=1)[0].T, trans=1)[0]

        LScInv = dtrtri(LSc)
        tmp = -beta / 2. * tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvPsi2_cLcInvT - tr_LrInvSrLrInvT / 2. * np.eye(
            Mc)
        dL_dqU_var_c = backsub_both_sides(Lc, tmp,
                                          'left') + tdot(LScInv.T) * Mr / 2.

        LSrInv = dtrtri(LSr)
        tmp = -beta / 2. * tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvPsi2_rLrInvT - tr_LcInvScLcInvT / 2. * np.eye(
            Mr)
        dL_dqU_var_r = backsub_both_sides(Lr, tmp,
                                          'left') + tdot(LSrInv.T) * Mc / 2.

        #======================================================================
        # Compute the Posterior distribution of inducing points p(u|Y)
        #======================================================================

        post = PosteriorMultioutput(LcInvMLrInvT=LcInvMLrInvT,
                                    LcInvScLcInvT=LcInvScLcInvT,
                                    LrInvSrLrInvT=LrInvSrLrInvT,
                                    Lr=Lr,
                                    Lc=Lc,
                                    kern_r=kern_r,
                                    Xr=Xr,
                                    Zr=Zr)

        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        dL_dpsi0_r = -psi0_c * beta / 2. * np.ones((D, ))
        dL_dpsi0_c = -psi0_r * beta / 2. * np.ones((N, ))

        dL_dpsi1_c = beta * dtrtrs(
            Lc, (Y.dot(LrInvPsi1_rT.T).dot(LcInvMLrInvT.T)).T, trans=1)[0].T
        dL_dpsi1_r = beta * dtrtrs(
            Lr, (Y.T.dot(LcInvPsi1_cT.T).dot(LcInvMLrInvT)).T, trans=1)[0].T

        tmp = beta / 2. * (
            -LcInvMLrInvT.dot(LrInvPsi2_rLrInvT).dot(LcInvMLrInvT.T) -
            tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvScLcInvT +
            tr_LrInvPsi2_rLrInvT * np.eye(Mc))
        dL_dpsi2_c = backsub_both_sides(Lc, tmp, 'left')
        tmp = beta / 2. * (
            -LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT) -
            tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvSrLrInvT +
            tr_LcInvPsi2_cLcInvT * np.eye(Mr))
        dL_dpsi2_r = backsub_both_sides(Lr, tmp, 'left')

        if not uncertain_inputs_r:
            dL_dpsi1_r += psi1_r.dot(dL_dpsi2_r + dL_dpsi2_r.T)
        if not uncertain_inputs_c:
            dL_dpsi1_c += psi1_c.dot(dL_dpsi2_c + dL_dpsi2_c.T)

        grad_dict = {
            'dL_dthetaL': dL_dthetaL,
            'dL_dqU_mean': dL_dqU_mean,
            'dL_dqU_var_c': dL_dqU_var_c,
            'dL_dqU_var_r': dL_dqU_var_r,
            'dL_dKuu_c': dL_dKuu_c,
            'dL_dKuu_r': dL_dKuu_r,
        }

        if uncertain_inputs_c:
            grad_dict['dL_dpsi0_c'] = dL_dpsi0_c
            grad_dict['dL_dpsi1_c'] = dL_dpsi1_c
            grad_dict['dL_dpsi2_c'] = dL_dpsi2_c
        else:
            grad_dict['dL_dKdiag_c'] = dL_dpsi0_c
            grad_dict['dL_dKfu_c'] = dL_dpsi1_c

        if uncertain_inputs_r:
            grad_dict['dL_dpsi0_r'] = dL_dpsi0_r
            grad_dict['dL_dpsi1_r'] = dL_dpsi1_r
            grad_dict['dL_dpsi2_r'] = dL_dpsi2_r
        else:
            grad_dict['dL_dKdiag_r'] = dL_dpsi0_r
            grad_dict['dL_dKfu_r'] = dL_dpsi1_r

        return post, logL, grad_dict
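Everything in this snippet reduces to separate triangular solves against Lc and Lr because (under the assumption that the prior over the vectorised inducing outputs has covariance Kuu_c ⊗ Kuu_r) the Cholesky factor of a Kronecker product is the Kronecker product of the factors. A small numpy check of that identity:

import numpy as np
from scipy.linalg import cholesky

rng = np.random.default_rng(5)
Ac = rng.normal(size=(3, 3)); Kc = Ac.dot(Ac.T) + np.eye(3)
Ar = rng.normal(size=(2, 2)); Kr = Ar.dot(Ar.T) + np.eye(2)
Lc, Lr = cholesky(Kc, lower=True), cholesky(Kr, lower=True)

# (Lc ⊗ Lr)(Lc ⊗ Lr)^T = (Lc Lc^T) ⊗ (Lr Lr^T) = Kc ⊗ Kr
print(np.allclose(np.kron(Lc, Lr).dot(np.kron(Lc, Lr).T), np.kron(Kc, Kr)))  # True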
Example #37
0
    def incremental_inference(self,
                              kern,
                              X,
                              likelihood,
                              Y,
                              mean_function=None,
                              Y_metadata=None,
                              K=None,
                              variance=None,
                              Z_tilde=None):

        # do incremental update
        if mean_function is None:
            m = 0
        else:
            m = mean_function.f(X)

        if variance is None:
            variance = likelihood.gaussian_variance(Y_metadata)

        YYT_factor = Y - m

        # K_tmp = kern.K(X, X[-1:])
        K_inc = kern._K[:-1, -1]
        K_inc2 = kern._K[-1:, -1]
        # self._K = np.block([[self._K, K_inc], [K_inc.T, K_inc2]])

        # Ky = K.copy()
        # variance can be given for each point individually; in that case we just take the last point
        jitter = variance[-1] + 1e-8
        # diag.add(Ky, jitter)

        # LW_old = self._old_posterior.woodbury_chol

        Wi, LW, LWi, W_logdet = pdinv_inc(self._old_LW, K_inc, K_inc2 + jitter,
                                          self._old_Wi)

        alpha, _ = dpotrs(LW, YYT_factor, lower=1)

        log_marginal = 0.5 * (-Y.size * log_2_pi - Y.shape[1] * W_logdet -
                              np.sum(alpha * YYT_factor))

        if Z_tilde is not None:
            # This is a correction term for the log marginal likelihood
            # In EP this is log Z_tilde, which is the difference between the
            # Gaussian marginal and Z_EP
            log_marginal += Z_tilde

        dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi)

        dL_dthetaL = likelihood.exact_inference_gradients(
            np.diag(dL_dK), Y_metadata)

        self._old_LW = LW
        self._old_Wi = Wi
        posterior = Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K)

        # TODO add logdet to posterior ?
        return posterior, log_marginal, {
            'dL_dK': dL_dK,
            'dL_dthetaL': dL_dthetaL,
            'dL_dm': alpha
        }
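The call to pdinv_inc above avoids re-factorising the full covariance when a single data point is appended. A minimal sketch (a hypothetical helper, not the actual pdinv_inc) of the rank-one Cholesky extension such an update is typically built on: given L with L·Lᵀ = K, a new column k = K(X, x_new) and scalar kappa = K(x_new, x_new) + jitter, the extended factor needs only one triangular solve and one square root.

import numpy as np
from scipy.linalg import cholesky, solve_triangular

def chol_extend(L, k, kappa):
    c = solve_triangular(L, k, lower=True)   # solve L c = k
    d = np.sqrt(kappa - c.dot(c))            # new diagonal entry
    n = L.shape[0]
    L_new = np.zeros((n + 1, n + 1))
    L_new[:n, :n] = L
    L_new[n, :n] = c
    L_new[n, n] = d
    return L_new

# quick check against a full re-factorization
rng = np.random.default_rng(3)
A = rng.normal(size=(5, 5)); K = A.dot(A.T) + 5 * np.eye(5)
L = cholesky(K[:4, :4], lower=True)
print(np.allclose(chol_extend(L, K[:4, 4], K[4, 4]), cholesky(K, lower=True)))  # True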
Example #38
0
    def get_YYTfactor(self, Y):
        N, D = Y.shape
        if (N >= D):
            return Y.view(np.ndarray)
        else:
            return jitchol(tdot(Y))
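get_YYTfactor only matters when D > N: any factor F with F·Fᵀ = Y·Yᵀ can replace Y in downstream expressions that touch Y only through Y·Yᵀ, shrinking them to N×N. A quick numpy check of that property (toy data; jitter added explicitly here, whereas jitchol adds it only when needed):

import numpy as np
from scipy.linalg import cholesky

rng = np.random.default_rng(4)
Y = rng.normal(size=(3, 10))                             # N=3 rows, D=10 columns (D > N)
F = cholesky(Y.dot(Y.T) + 1e-10 * np.eye(3), lower=True) # the jitchol(tdot(Y)) branch
print(np.allclose(F.dot(F.T), Y.dot(Y.T)))               # True (up to the tiny jitter)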
Example #39
0
    def inference(self, kern, X, Z, likelihood, Y, qU):
        """
        The SVI-VarDTC inference
        """

        if isinstance(Y, np.ndarray) and np.any(np.isnan(Y)):
            missing_data = True
            N, M, Q = Y.shape[0], Z.shape[0], Z.shape[1]
            Ds = Y.shape[1] - (np.isnan(Y)*1).sum(1)
            Ymask = 1-np.isnan(Y)*1
            Y_masked = np.zeros_like(Y)
            Y_masked[Ymask==1] = Y[Ymask==1]
            ND = Ymask.sum()
        else:
            missing_data = False
            N, D, M, Q = Y.shape[0], Y.shape[1], Z.shape[0], Z.shape[1]
            ND = N*D

        uncertain_inputs = isinstance(X, VariationalPosterior)
        uncertain_outputs = isinstance(Y, VariationalPosterior)

        beta = 1./np.fmax(likelihood.variance, 1e-6)

        psi0, psi2, YRY, psi1, psi1Y = self.gatherPsiStat(kern, X, Z, Y if not missing_data else Y_masked, beta, uncertain_inputs, D if not missing_data else Ds, missing_data)
        
        #======================================================================
        # Compute Common Components
        #======================================================================
        
        mu, S = qU.mean, qU.covariance
        mupsi1Y = mu.dot(psi1Y)

        Kmm = kern.K(Z).copy()
        diag.add(Kmm, self.const_jitter)
        Lm = jitchol(Kmm)
        
        if missing_data:
            S_mu = S[None,:,:]+mu.T[:,:,None]*mu.T[:,None,:]
            NS_mu = S_mu.T.dot(Ymask.T).T
            LmInv = dtrtri(Lm)
            
            LmInvPsi2LmInvT = np.swapaxes(psi2.dot(LmInv.T),1,2).dot(LmInv.T)            
            LmInvSmuLmInvT =  np.swapaxes(NS_mu.dot(LmInv.T),1,2).dot(LmInv.T)
            
            B = mupsi1Y+ mupsi1Y.T +(Ds[:,None,None]*psi2).sum(0)
            tmp = backsub_both_sides(Lm, B,'right')
            
            logL =  -ND*log_2_pi/2. +ND*np.log(beta)/2. - psi0/2. - YRY/2.  \
                       -(LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. +np.trace(tmp)/2.
        else:
            S_mu = S*D+tdot(mu)
            if uncertain_inputs:
                LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
            else:
                LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0])/beta #tdot(psi1.dot(LmInv.T).T) /beta        
            LmInvSmuLmInvT = backsub_both_sides(Lm, S_mu, 'right')
            
            B = mupsi1Y+ mupsi1Y.T +D*psi2
            tmp = backsub_both_sides(Lm, B,'right')
            
            logL =  -ND*log_2_pi/2. +ND*np.log(beta)/2. - psi0/2. - YRY/2.  \
                       -(LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. +np.trace(tmp)/2.

        #======================================================================
        # Compute dL_dKmm
        #======================================================================

        dL_dKmm = np.eye(M)

        #======================================================================
        # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
        #======================================================================

        dL_dthetaL = None #(YRY*beta + beta*output_dim*psi0 - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2).sum() - beta*np.trace(LLinvPsi1TYYTPsi1LLinvT)
        
        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        if missing_data:
            dL_dpsi0 = -Ds * (beta * np.ones((N,)))/2.
        else:
            dL_dpsi0 = -D * (beta * np.ones((N,)))/2.

        if uncertain_outputs:
            Ym,Ys = Y.mean, Y.variance
            dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, Ym.dot(mu.T).T)[0], trans=1)[0].T*beta
        else:
            if missing_data:
                dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, (Y_masked).dot(mu.T).T)[0], trans=1)[0].T*beta
            else:
                dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, Y.dot(mu.T).T)[0], trans=1)[0].T*beta

        if uncertain_inputs:
            if missing_data:
                dL_dpsi2 = np.swapaxes((Ds[:,None,None]*np.eye(M)[None,:,:]-LmInvSmuLmInvT).dot(LmInv),1,2).dot(LmInv)*beta/2.
            else:
                dL_dpsi2 = beta*backsub_both_sides(Lm, D*np.eye(M)-LmInvSmuLmInvT, 'left')/2.
        else:
            dL_dpsi1 += beta*psi1.dot(dL_dpsi2+dL_dpsi2.T) 
            dL_dpsi2 = None
            
        if uncertain_inputs:
            grad_dict = {'dL_dKmm': dL_dKmm,
                         'dL_dpsi0':dL_dpsi0,
                         'dL_dpsi1':dL_dpsi1,
                         'dL_dpsi2':dL_dpsi2,
                         'dL_dthetaL':dL_dthetaL}
        else:
            grad_dict = {'dL_dKmm': dL_dKmm,
                         'dL_dKdiag':dL_dpsi0,
                         'dL_dKnm':dL_dpsi1,
                         'dL_dthetaL':dL_dthetaL}

        if uncertain_outputs:
            Ym = Y.mean
            grad_dict['dL_dYmean'] = -Ym*beta+ dtrtrs(Lm,psi1.T)[0].T.dot(dtrtrs(Lm,mu)[0])
            grad_dict['dL_dYvar'] = beta/-2.

        return logL, grad_dict
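The missing-data branch at the top of this snippet just masks out NaNs: Ymask flags observed entries, Ds counts the observed outputs per row, and ND is the total number of observed values. A tiny numpy illustration with toy data (not part of the snippet):

import numpy as np

Y = np.array([[1.0, np.nan], [2.0, 3.0]])
Ymask = 1 - np.isnan(Y) * 1                    # 1 where observed, 0 where missing
Ds = Y.shape[1] - (np.isnan(Y) * 1).sum(1)     # observed outputs per row
Y_masked = np.zeros_like(Y)
Y_masked[Ymask == 1] = Y[Ymask == 1]           # NaNs replaced by zeros
print(Ds, Ymask.sum())                         # [1 2] 3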
Example #40
0
    def _create_kernel(self, V):
        self._kerns = [RBF(1, ARD=True, active_dims=[i])
                       for i in range(self.n_dims)]
        self._kernf = Fixed(self.n_dims, tdot(V))
        self._kernb = Bias(self.n_dims)
        self.kernel = np.sum(self._kerns) + self._kernf + self._kernb
Example #41
0
    def inference(self,
                  kern,
                  X,
                  Z,
                  likelihood,
                  Y,
                  indexD,
                  output_dim,
                  Y_metadata=None,
                  Lm=None,
                  dL_dKmm=None,
                  Kuu_sigma=None):
        """
        The first phase of inference:
        Compute: log-likelihood, dL_dKmm

        Cached intermediate results: Kmm, KmmInv,
        """

        input_dim = Z.shape[0]

        uncertain_inputs = isinstance(X, VariationalPosterior)

        beta = 1. / likelihood.variance
        if len(beta) == 1:
            beta = np.zeros(output_dim) + beta

        beta_exp = np.zeros(indexD.shape[0])
        for d in range(output_dim):
            beta_exp[indexD == d] = beta[d]

        psi0, psi1, psi2 = self.gatherPsiStat(kern, X, Z, Y, beta,
                                              uncertain_inputs)

        psi2_sum = (beta_exp[:, None, None] * psi2).sum(0) / output_dim

        #======================================================================
        # Compute Common Components
        #======================================================================

        Kmm = kern.K(Z).copy()
        if Kuu_sigma is not None:
            diag.add(Kmm, Kuu_sigma)
        else:
            diag.add(Kmm, self.const_jitter)
        Lm = jitchol(Kmm)

        logL = 0.
        dL_dthetaL = np.zeros(output_dim)
        dL_dKmm = np.zeros_like(Kmm)
        dL_dpsi0 = np.zeros_like(psi0)
        dL_dpsi1 = np.zeros_like(psi1)
        dL_dpsi2 = np.zeros_like(psi2)
        wv = np.empty((Kmm.shape[0], output_dim))

        for d in range(output_dim):
            idx_d = indexD == d
            Y_d = Y[idx_d]
            N_d = Y_d.shape[0]
            beta_d = beta[d]

            psi2_d = psi2[idx_d].sum(0) * beta_d
            psi1Y = Y_d.T.dot(psi1[idx_d]) * beta_d
            psi0_d = psi0[idx_d].sum() * beta_d
            YRY_d = np.square(Y_d).sum() * beta_d

            LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2_d, 'right')

            Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
            LL = jitchol(Lambda)
            LmLL = Lm.dot(LL)

            b = dtrtrs(LmLL, psi1Y.T)[0].T
            bbt = np.square(b).sum()
            v = dtrtrs(LmLL, b.T, trans=1)[0].T
            LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

            tmp = -backsub_both_sides(LL, LLinvPsi1TYYTPsi1LLinvT)
            dL_dpsi2R = backsub_both_sides(Lm, tmp + np.eye(input_dim)) / 2

            logL_R = -N_d * np.log(beta_d)
            logL += -((N_d * log_2_pi + logL_R + psi0_d -
                       np.trace(LmInvPsi2LmInvT)) + YRY_d - bbt) / 2.

            dL_dKmm += dL_dpsi2R - backsub_both_sides(Lm, LmInvPsi2LmInvT) / 2

            dL_dthetaL[d:d + 1] = (YRY_d * beta_d + beta_d * psi0_d - N_d * beta_d) / 2. \
                                  - beta_d * (dL_dpsi2R * psi2_d).sum() \
                                  - beta_d * np.trace(LLinvPsi1TYYTPsi1LLinvT)

            dL_dpsi0[idx_d] = -beta_d / 2.
            dL_dpsi1[idx_d] = beta_d * np.dot(Y_d, v)
            dL_dpsi2[idx_d] = beta_d * dL_dpsi2R
            wv[:, d] = v

        LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2_sum, 'right')

        Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
        LL = jitchol(Lambda)
        LmLL = Lm.dot(LL)
        logdet_L = 2. * np.sum(np.log(np.diag(LL)))
        dL_dpsi2R_common = dpotri(LmLL)[0] / -2.
        dL_dpsi2 += dL_dpsi2R_common[None, :, :] * beta_exp[:, None, None]

        for d in range(output_dim):
            dL_dthetaL[d] += (dL_dpsi2R_common * psi2[indexD == d].sum(0)
                              ).sum() * -beta[d] * beta[d]

        dL_dKmm += dL_dpsi2R_common * output_dim

        logL += -output_dim * logdet_L / 2.

        #======================================================================
        # Compute dL_dKmm
        #======================================================================

        # dL_dKmm =  dL_dpsi2R - output_dim* backsub_both_sides(Lm, LmInvPsi2LmInvT)/2 #LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.

        #======================================================================
        # Compute the Posterior distribution of inducing points p(u|Y)
        #======================================================================

        LLInvLmT = dtrtrs(LL, Lm.T)[0]
        cov = tdot(LLInvLmT.T)

        wd_inv = backsub_both_sides(
            Lm,
            np.eye(input_dim) -
            backsub_both_sides(LL, np.identity(input_dim), transpose='left'),
            transpose='left')
        post = Posterior(woodbury_inv=wd_inv,
                         woodbury_vector=wv,
                         K=Kmm,
                         mean=None,
                         cov=cov,
                         K_chol=Lm)

        #======================================================================
        # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
        #======================================================================

        # for d in range(output_dim):
        #     dL_dthetaL[d:d+1] += - beta[d]*beta[d]*(dL_dpsi2R[None,:,:] * psi2[indexD==d]/output_dim).sum()
        # dL_dthetaL += - (dL_dpsi2R[None,:,:] * psi2_sum*D beta*(dL_dpsi2R*psi2).sum()

        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        if not uncertain_inputs:
            dL_dpsi1 += (psi1[:, None, :] * dL_dpsi2).sum(2) * 2.

        if uncertain_inputs:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dpsi0': dL_dpsi0,
                'dL_dpsi1': dL_dpsi1,
                'dL_dpsi2': dL_dpsi2,
                'dL_dthetaL': dL_dthetaL
            }
        else:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dKdiag': dL_dpsi0,
                'dL_dKnm': dL_dpsi1,
                'dL_dthetaL': dL_dthetaL
            }

        return post, logL, grad_dict
Example #42
0
    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None, Kuu_sigma=None):
        """
        The first phase of inference:
        Compute: log-likelihood, dL_dKmm

        Cached intermediate results: Kmm, KmmInv,
        """


        num_data, output_dim = Y.shape
        input_dim = Z.shape[0]

        uncertain_inputs = isinstance(X, VariationalPosterior)
        uncertain_outputs = isinstance(Y, VariationalPosterior)

        # from ..models.sslvm import Gaussian_Gamma
        # if isinstance(likelihood, Gaussian_Gamma):
        #     beta = likelihood.expectation_beta()
        #     logL_R = -num_data*likelihood.expectation_logbeta()
        # else:
        beta = 1./np.fmax(likelihood.variance, 1e-6)
        logL_R = -num_data*np.log(beta)


        psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)

        #======================================================================
        # Compute Common Components
        #======================================================================

        Kmm = kern.K(Z).copy()
        if Kuu_sigma is not None:
            diag.add(Kmm, Kuu_sigma)
        else:
            diag.add(Kmm, self.const_jitter)
        Lm = jitchol(Kmm)

        #LmInv = dtrtri(Lm)
        if uncertain_inputs:
            LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
        else:
            LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0])/beta #tdot(psi1.dot(LmInv.T).T) /beta
            
        Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT
        LL = jitchol(Lambda)
        LmLL = Lm.dot(LL)
#        LLInv = dtrtri(LL)
 #       LmLLInv = LLInv.dot(LmInv)
        
        logdet_L = 2.*np.sum(np.log(np.diag(LL)))
        b  = dtrtrs(LmLL, psi1Y.T)[0].T #psi1Y.dot(LmLLInv.T)
        bbt = np.square(b).sum()
        v = dtrtrs(LmLL, b.T, trans=1)[0].T #b.dot(LmLLInv)
        LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)
        
        if psi1S is not None:
            psi1SLLinv = dtrtrs(LmLL, psi1S.T)[0].T #psi1S.dot(LmLLInv.T)
            bbt += np.square(psi1SLLinv).sum()
            LLinvPsi1TYYTPsi1LLinvT += tdot(psi1SLLinv.T)
            psi1SP = dtrtrs(LmLL, psi1SLLinv.T, trans=1)[0].T #psi1SLLinv.dot(LmLLInv)
        tmp = -backsub_both_sides(LL, LLinvPsi1TYYTPsi1LLinvT+output_dim*np.eye(input_dim))
        dL_dpsi2R = backsub_both_sides(Lm, tmp+output_dim*np.eye(input_dim))/2
        #tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT+output_dim*np.eye(input_dim)).dot(LLInv)
        #dL_dpsi2R = LmInv.T.dot(tmp+output_dim*np.eye(input_dim)).dot(LmInv)/2.
        
        #======================================================================
        # Compute log-likelihood
        #======================================================================
        
        logL = -(output_dim*(num_data*log_2_pi+logL_R+psi0-np.trace(LmInvPsi2LmInvT))+YRY- bbt)/2.-output_dim*logdet_L/2.

        #======================================================================
        # Compute dL_dKmm
        #======================================================================

        dL_dKmm =  dL_dpsi2R - output_dim* backsub_both_sides(Lm, LmInvPsi2LmInvT)/2 #LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.

        #======================================================================
        # Compute the Posterior distribution of inducing points p(u|Y)
        #======================================================================

        LLInvLmT = dtrtrs(LL, Lm.T)[0]
        cov = tdot(LLInvLmT.T)

        wd_inv = backsub_both_sides(Lm, np.eye(input_dim)- backsub_both_sides(LL, np.identity(input_dim), transpose='left'), transpose='left')
        post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v.T, K=Kmm, mean=None, cov=cov, K_chol=Lm)

        #======================================================================
        # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
        #======================================================================

        # if isinstance(likelihood, Gaussian_Gamma):
        #     from scipy.special import polygamma
        #     dL_dthetaL = ((YRY + output_dim*psi0)/2. - (dL_dpsi2R*psi2).sum() - np.trace(LLinvPsi1TYYTPsi1LLinvT))/-beta
        #     likelihood.q_a.gradient = num_data*output_dim/2.*polygamma(1, likelihood.q_a) + dL_dthetaL/likelihood.q_b
        #     likelihood.q_b.gradient = num_data*output_dim/(-2.*likelihood.q_b) +dL_dthetaL*(-likelihood.q_a/(likelihood.q_b*likelihood.q_b))
        # else:
        dL_dthetaL = (YRY*beta + beta*output_dim*psi0 - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2).sum() - beta*np.trace(LLinvPsi1TYYTPsi1LLinvT)
        
        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        dL_dpsi0 = -output_dim * (beta * np.ones((num_data,)))/2.

        if uncertain_outputs:
            m,s = Y.mean, Y.variance
            dL_dpsi1 = beta*(np.dot(m,v)+Shalf[:,None]*psi1SP)
        else:
            dL_dpsi1 = beta*np.dot(Y,v)

        if uncertain_inputs:
            dL_dpsi2 = beta* dL_dpsi2R
        else:
            dL_dpsi1 += np.dot(psi1,dL_dpsi2R)*2.
            dL_dpsi2 = None
        
        if uncertain_inputs:
            grad_dict = {'dL_dKmm': dL_dKmm,
                         'dL_dpsi0':dL_dpsi0,
                         'dL_dpsi1':dL_dpsi1,
                         'dL_dpsi2':dL_dpsi2,
                         'dL_dthetaL':dL_dthetaL}
        else:
            grad_dict = {'dL_dKmm': dL_dKmm,
                         'dL_dKdiag':dL_dpsi0,
                         'dL_dKnm':dL_dpsi1,
                         'dL_dthetaL':dL_dthetaL}
            
        if uncertain_outputs:
            m,s = Y.mean, Y.variance
            psi1LmiLLi = dtrtrs(LmLL, psi1.T)[0].T 
            LLiLmipsi1Y = b.T
            grad_dict['dL_dYmean'] = -m*beta+ psi1LmiLLi.dot(LLiLmipsi1Y)
            grad_dict['dL_dYvar'] = beta/-2.+ np.square(psi1LmiLLi).sum(axis=1)/2

        return post, logL, grad_dict
Example #43
0
    def inference_nonroot(self, kern, X, Z, likelihood, Y,Y_metadata=None, Lm=None, dL_dKmm=None):
        
        num_data, output_dim = Y.shape
        num_data_total = allReduceArrays([np.int32(num_data)], self.mpi_comm)[0]
        
        input_dim = Z.shape[0]
        uncertain_inputs = isinstance(X, VariationalPosterior)
        uncertain_outputs = isinstance(Y, VariationalPosterior)
        beta = 1./np.fmax(likelihood.variance, 1e-6)
        
        psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)

        flag = np.zeros((1,),dtype=np.int32)
        self.mpi_comm.Bcast(flag,root=self.root)
        if flag[0] == 1: raise LinAlgError('Linalg error!')

        LmInv, LLInv = np.empty((input_dim, input_dim)).T, np.empty((input_dim, input_dim)).T
        broadcastArrays([LmInv, LLInv], self.mpi_comm, self.root)
        LmLLInv = LLInv.dot(LmInv)
        b  = psi1Y.dot(LmLLInv.T)
        v = b.dot(LmLLInv)
        
        if psi1S is not None:
            psi1SLLinv = psi1S.dot(LmLLInv.T)
            bbt_sum = np.square(psi1SLLinv).sum()
            LLinvPsi1TYYTPsi1LLinvT_sum = tdot(psi1SLLinv.T)
            reduceArrays([bbt_sum,  LLinvPsi1TYYTPsi1LLinvT_sum], self.mpi_comm, self.root)
            psi1SP = psi1SLLinv.dot(LmLLInv)
        
        dL_dpsi2R = np.empty((input_dim, input_dim))
        broadcastArrays([dL_dpsi2R], self.mpi_comm, self.root)
            
        dL_dpsi0 = -output_dim * (beta * np.ones((num_data,)))/2.

        if uncertain_outputs:
            m,s = Y.mean, Y.variance
            dL_dpsi1 = beta*(np.dot(m,v)+Shalf[:,None]*psi1SP)
        else:
            dL_dpsi1 = beta*np.dot(Y,v)

        if uncertain_inputs:
            dL_dpsi2 = beta* dL_dpsi2R
        else:
            dL_dpsi1 += np.dot(psi1,dL_dpsi2R)*2.
            dL_dpsi2 = None
        
        if uncertain_inputs:
            grad_dict = {'dL_dKmm': None,
                         'dL_dpsi0':dL_dpsi0,
                         'dL_dpsi1':dL_dpsi1,
                         'dL_dpsi2':dL_dpsi2,
                         'dL_dthetaL':None}
        else:
            grad_dict = {'dL_dKmm': None,
                         'dL_dKdiag':dL_dpsi0,
                         'dL_dKnm':dL_dpsi1,
                         'dL_dthetaL':None}
        if uncertain_outputs:
            m,s = Y.mean, Y.variance
            psi1LmiLLi = psi1.dot(LmLLInv.T)
            LLiLmipsi1Y = b.T
            grad_dict['dL_dYmean'] = -m*beta+ psi1LmiLLi.dot(LLiLmipsi1Y)
            grad_dict['dL_dYvar'] = beta/-2.+ np.square(psi1LmiLLi).sum(axis=1)/2
        
        return None, 0, grad_dict