Example #1
    def parameters_changed(self):
        N, D = self.Y.shape

        Kss = self.kern.K(self.X)
        Ksu = self.kern.K(self.X, self.Z)

        wv = self.posterior.woodbury_vector
        wi = self.posterior.woodbury_inv
        
        # residual between the targets and the sparse posterior mean
        a = self.Y - Ksu.dot(wv)

        # C = Kss + sigma^2 I - Ksu (woodbury_inv) Ksu^T, factorised once so that
        # C^{-1} a and log|C| both come from triangular solves
        C = Kss + np.eye(N)*self.likelihood.variance - Ksu.dot(wi).dot(Ksu.T)
        Lc = jitchol(C)
        LcInva = dtrtrs(Lc, a)[0]
        LcInv = dtrtri(Lc)
        CInva = dtrtrs(Lc, LcInva, trans=1)[0]

        self._log_marginal_likelihood = -N*D/2.*np.log(2*np.pi) - D*np.log(np.diag(Lc)).sum() - np.square(LcInva).sum()/2.

        # gradients of the collapsed bound w.r.t. the kernel matrices
        dKsu = CInva.dot(wv.T)
        dKss = tdot(CInva)/2. - D*tdot(LcInv.T)/2.
        dKsu += -2. * dKss.dot(Ksu).dot(wi)
        
        # accumulate gradients w.r.t. the latent inputs X
        X_grad = self.kern.gradients_X(dKss, self.X)
        X_grad += self.kern.gradients_X(dKsu, self.X, self.Z)
        self.X.gradient = X_grad
        
        if self.uncertain_input:
            # subtract the KL divergence term of the variational prior
            # and update the corresponding gradients
            KL_div = self.variational_prior.KL_divergence(self.X)
            self.variational_prior.update_gradients_KL(self.X)
            self._log_marginal_likelihood -= KL_div
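
The snippet above never forms C^{-1} explicitly: jitchol factorises C once, dtrtrs applies triangular solves, and the log-determinant comes from the diagonal of the Cholesky factor. A minimal self-contained sketch of those identities, using numpy/scipy stand-ins for GPy's helpers (an assumption; GPy's jitchol additionally retries with increasing jitter on failure):

import numpy as np
from scipy.linalg import cholesky, solve_triangular

rng = np.random.default_rng(0)
W = rng.standard_normal((5, 5))
C = W.dot(W.T) + 5. * np.eye(5)   # SPD, standing in for Kss + sigma^2 I - Ksu wi Ksu^T
a = rng.standard_normal((5, 2))   # standing in for the residual Y - Ksu wv

Lc = cholesky(C, lower=True)                               # ~ jitchol(C)
LcInva = solve_triangular(Lc, a, lower=True)               # ~ dtrtrs(Lc, a)[0]
CInva = solve_triangular(Lc, LcInva, lower=True, trans=1)  # ~ dtrtrs(Lc, LcInva, trans=1)[0]

assert np.allclose(CInva, np.linalg.solve(C, a))
# log|C| = 2 * sum(log(diag(Lc))), so the D*np.log(np.diag(Lc)).sum() term
# in _log_marginal_likelihood equals D/2 * log|C|
assert np.isclose(2. * np.log(np.diag(Lc)).sum(), np.linalg.slogdet(C)[1])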
Example #2
    def comp_KL_qU(self, qU_mean, qU_var):
        M, D = qU_mean.shape

        qU_L = self.mid['qU_L']
        L = self.mid['L']
        Linvmu = self.mid['Linvmu']
        LinvLu = self.mid['LinvLu']
        KuuInv = dpotri(L, lower=1)[0]

        Lu = qU_L
        LuInv = dtrtri(Lu)

        # closed-form KL( q(U) || p(U) ): the columns of U share covariance
        # Su = Lu Lu^T under q, and the prior covariance is Kuu = L L^T
        KL = (-M * D / 2.
              - np.log(np.diag(Lu)).sum() * D
              + np.log(np.diag(L)).sum() * D
              + np.square(LinvLu).sum() / 2. * D
              + np.square(Linvmu).sum() / 2.)

        dKL_dqU_mean = dtrtrs(L, Linvmu, trans=True)[0]
        dKL_dqU_var = (tdot(LuInv.T) / -2. + KuuInv / 2.) * D
        dKL_dKuu = KuuInv * D / 2. \
                   - KuuInv.dot(tdot(qU_mean) + qU_var * D).dot(KuuInv) / 2.

        return float(KL), dKL_dqU_mean, dKL_dqU_var, dKL_dKuu
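
    # Note on the closed form above: with q(U) = prod_d N(mu[:, d], Su) and the
    # prior p(U) = prod_d N(0, Kuu), the per-column Gaussian KL is
    #   0.5 * ( tr(Kuu^{-1} Su) + mu_d^T Kuu^{-1} mu_d - M + log|Kuu| - log|Su| ),
    # and summing over the D columns yields exactly the terms computed from the
    # Cholesky factors: log-determinants as sums of log-diagonals, the trace as
    # ||L^{-1} Lu||_F^2, and the quadratic term as ||L^{-1} mu||_F^2.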
    def inference(self, kern_r, kern_c, Xr, Xc, Zr, Zc, likelihood, Y, qU_mean,
                  qU_var_r, qU_var_c, indexD, output_dim):
        """
        The SVI-VarDTC inference
        """

        N, D = Y.shape[0], output_dim
        Mr, Mc = Zr.shape[0], Zc.shape[0]
        Qr, Qc = Zr.shape[1], Zc.shape[1]

        uncertain_inputs_r = isinstance(Xr, VariationalPosterior)
        uncertain_inputs_c = isinstance(Xc, VariationalPosterior)
        uncertain_outputs = isinstance(Y, VariationalPosterior)

        grad_dict = self._init_grad_dict(N, D, Mr, Mc)

        beta = 1. / likelihood.variance
        if len(beta) == 1:
            beta = np.zeros(D) + beta

        psi0_r, psi1_r, psi2_r = self.gatherPsiStat(kern_r, Xr, Zr,
                                                    uncertain_inputs_r)
        psi0_c, psi1_c, psi2_c = self.gatherPsiStat(kern_c, Xc, Zc,
                                                    uncertain_inputs_c)

        #======================================================================
        # Compute Common Components
        #======================================================================

        Kuu_r = kern_r.K(Zr).copy()
        diag.add(Kuu_r, self.const_jitter)
        Lr = jitchol(Kuu_r)

        Kuu_c = kern_c.K(Zc).copy()
        diag.add(Kuu_c, self.const_jitter)
        Lc = jitchol(Kuu_c)

        mu, Sr, Sc = qU_mean, qU_var_r, qU_var_c
        LSr = jitchol(Sr)
        LSc = jitchol(Sc)

        LcInvMLrInvT = dtrtrs(Lc, dtrtrs(Lr, mu.T)[0].T)[0]
        LcInvLSc = dtrtrs(Lc, LSc)[0]
        LrInvLSr = dtrtrs(Lr, LSr)[0]
        LcInvScLcInvT = tdot(LcInvLSc)
        LrInvSrLrInvT = tdot(LrInvLSr)
        tr_LrInvSrLrInvT = np.square(LrInvLSr).sum()
        tr_LcInvScLcInvT = np.square(LcInvLSc).sum()

        mid_res = {
            'psi0_r': psi0_r,
            'psi1_r': psi1_r,
            'psi2_r': psi2_r,
            'psi0_c': psi0_c,
            'psi1_c': psi1_c,
            'psi2_c': psi2_c,
            'Lr': Lr,
            'Lc': Lc,
            'LcInvMLrInvT': LcInvMLrInvT,
            'LcInvScLcInvT': LcInvScLcInvT,
            'LrInvSrLrInvT': LrInvSrLrInvT,
        }

        #======================================================================
        # Compute log-likelihood
        #======================================================================

        logL = 0.
        for d in range(D):
            logL += self.inference_d(d, beta, Y, indexD, grad_dict, mid_res,
                                     uncertain_inputs_r, uncertain_inputs_c,
                                     Mr, Mc)

        logL += -Mc * (np.log(np.diag(Lr)).sum() - np.log(np.diag(LSr)).sum()) \
                - Mr * (np.log(np.diag(Lc)).sum() - np.log(np.diag(LSc)).sum()) \
                - np.square(LcInvMLrInvT).sum() / 2. \
                - tr_LrInvSrLrInvT * tr_LcInvScLcInvT / 2. + Mr * Mc / 2.

        #======================================================================
        # Compute dL_dKuu
        #======================================================================

        tmp = tdot(LcInvMLrInvT) / 2. \
              + tr_LrInvSrLrInvT / 2. * LcInvScLcInvT - Mr / 2. * np.eye(Mc)

        dL_dKuu_c = backsub_both_sides(Lc, tmp, 'left')
        dL_dKuu_c += dL_dKuu_c.T
        dL_dKuu_c *= 0.5

        tmp = tdot(LcInvMLrInvT.T) / 2. \
              + tr_LcInvScLcInvT / 2. * LrInvSrLrInvT - Mc / 2. * np.eye(Mr)

        dL_dKuu_r = backsub_both_sides(Lr, tmp, 'left')
        dL_dKuu_r += dL_dKuu_r.T
        dL_dKuu_r *= 0.5

        #======================================================================
        # Compute dL_dqU
        #======================================================================

        tmp = -LcInvMLrInvT
        dL_dqU_mean = dtrtrs(Lc, dtrtrs(Lr, tmp.T, trans=1)[0].T, trans=1)[0]

        LScInv = dtrtri(LSc)
        tmp = -tr_LrInvSrLrInvT / 2. * np.eye(Mc)
        dL_dqU_var_c = backsub_both_sides(Lc, tmp, 'left') + tdot(LScInv.T) * Mr / 2.

        LSrInv = dtrtri(LSr)
        tmp = -tr_LcInvScLcInvT / 2. * np.eye(Mr)
        dL_dqU_var_r = backsub_both_sides(Lr, tmp, 'left') + tdot(LSrInv.T) * Mc / 2.

        #======================================================================
        # Compute the Posterior distribution of inducing points p(u|Y)
        #======================================================================

        post = PosteriorMultioutput(LcInvMLrInvT=LcInvMLrInvT,
                                    LcInvScLcInvT=LcInvScLcInvT,
                                    LrInvSrLrInvT=LrInvSrLrInvT,
                                    Lr=Lr,
                                    Lc=Lc,
                                    kern_r=kern_r,
                                    Xr=Xr,
                                    Zr=Zr)

        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        grad_dict['dL_dqU_mean'] += dL_dqU_mean
        grad_dict['dL_dqU_var_c'] += dL_dqU_var_c
        grad_dict['dL_dqU_var_r'] += dL_dqU_var_r
        grad_dict['dL_dKuu_c'] += dL_dKuu_c
        grad_dict['dL_dKuu_r'] += dL_dKuu_r

        if not uncertain_inputs_c:
            grad_dict['dL_dKdiag_c'] = grad_dict['dL_dpsi0_c']
            grad_dict['dL_dKfu_c'] = grad_dict['dL_dpsi1_c']

        if not uncertain_inputs_r:
            grad_dict['dL_dKdiag_r'] = grad_dict['dL_dpsi0_r']
            grad_dict['dL_dKfu_r'] = grad_dict['dL_dpsi1_r']

        return post, logL, grad_dict
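
Several GPy linear-algebra helpers (tdot, jitchol, dtrtrs, dtrtri, backsub_both_sides) appear throughout these snippets. A hedged set of numpy/scipy stand-ins, under the assumption that backsub_both_sides(L, X, 'left') computes L^{-T} X L^{-1} (as used for the dL_dKuu terms above) and 'right' computes L^{-1} X L^{-T} (as used for whitened quantities like LmInvPsi2LmInvT below):

import numpy as np
from scipy.linalg import cholesky, solve_triangular

def tdot(A):
    # GPy's tdot(A): the symmetric product A @ A.T
    return A.dot(A.T)

def backsub_both_sides(L, X, transpose='left'):
    # 'left':  L^{-T} X L^{-1};  'right': L^{-1} X L^{-T}
    if transpose == 'left':
        tmp = solve_triangular(L, X, lower=True, trans=1)
        return solve_triangular(L, tmp.T, lower=True, trans=1).T
    tmp = solve_triangular(L, X, lower=True)
    return solve_triangular(L, tmp.T, lower=True).T

# quick self-check on a toy SPD matrix K = L L^T
rng = np.random.default_rng(0)
B = rng.standard_normal((4, 4))
K = B.dot(B.T) + 4. * np.eye(4)
L = cholesky(K, lower=True)
assert np.allclose(backsub_both_sides(L, np.eye(4), 'left'), np.linalg.inv(K))
assert np.allclose(backsub_both_sides(L, K, 'right'), np.eye(4))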
Example #4
    def inference(self, kern, X, Z, likelihood, Y, qU):
        """
        The SVI-VarDTC inference
        """

        if isinstance(Y, np.ndarray) and np.any(np.isnan(Y)):
            missing_data = True
            N, M, Q = Y.shape[0], Z.shape[0], Z.shape[1]
            Ds = Y.shape[1] - np.isnan(Y).sum(1)   # observed outputs per row
            Ymask = (~np.isnan(Y)).astype(int)     # 0/1 mask of observed entries
            Y_masked = np.zeros_like(Y)
            Y_masked[Ymask == 1] = Y[Ymask == 1]   # NaNs replaced by zeros
            ND = Ymask.sum()                       # total number of observed entries
        else:
            missing_data = False
            N, D, M, Q = Y.shape[0], Y.shape[1], Z.shape[0], Z.shape[1]
            ND = N * D

        uncertain_inputs = isinstance(X, VariationalPosterior)
        uncertain_outputs = isinstance(Y, VariationalPosterior)

        beta = 1. / np.fmax(likelihood.variance, 1e-6)

        psi0, psi2, YRY, psi1, psi1Y = self.gatherPsiStat(
            kern, X, Z, Y if not missing_data else Y_masked, beta,
            uncertain_inputs, D if not missing_data else Ds, missing_data)

        #======================================================================
        # Compute Common Components
        #======================================================================

        mu, S = qU.mean, qU.covariance
        mupsi1Y = mu.dot(psi1Y)

        Kmm = kern.K(Z).copy()
        diag.add(Kmm, self.const_jitter)
        Lm = jitchol(Kmm)

        if missing_data:
            S_mu = S[None, :, :] + mu.T[:, :, None] * mu.T[:, None, :]
            NS_mu = S_mu.T.dot(Ymask.T).T
            LmInv = dtrtri(Lm)

            LmInvPsi2LmInvT = np.swapaxes(psi2.dot(LmInv.T), 1, 2).dot(LmInv.T)
            LmInvSmuLmInvT = np.swapaxes(NS_mu.dot(LmInv.T), 1, 2).dot(LmInv.T)

            B = mupsi1Y + mupsi1Y.T + (Ds[:, None, None] * psi2).sum(0)
            tmp = backsub_both_sides(Lm, B, 'right')

            logL = -ND * log_2_pi / 2. + ND * np.log(beta) / 2. - psi0 / 2. - YRY / 2. \
                   - (LmInvSmuLmInvT * LmInvPsi2LmInvT).sum() / 2. + np.trace(tmp) / 2.
        else:
            S_mu = S * D + tdot(mu)
            if uncertain_inputs:
                LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
            else:
                LmInvPsi2LmInvT = tdot(
                    dtrtrs(Lm, psi1.T)[0]) / beta  # tdot(psi1.dot(LmInv.T).T) / beta
            LmInvSmuLmInvT = backsub_both_sides(Lm, S_mu, 'right')

            B = mupsi1Y + mupsi1Y.T + D * psi2
            tmp = backsub_both_sides(Lm, B, 'right')

            logL = -ND * log_2_pi / 2. + ND * np.log(beta) / 2. - psi0 / 2. - YRY / 2. \
                   - (LmInvSmuLmInvT * LmInvPsi2LmInvT).sum() / 2. + np.trace(tmp) / 2.

        #======================================================================
        # Compute dL_dKmm
        #======================================================================

        dL_dKmm = np.eye(M)

        #======================================================================
        # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
        #======================================================================

        dL_dthetaL = None  #(YRY*beta + beta*output_dim*psi0 - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2).sum() - beta*np.trace(LLinvPsi1TYYTPsi1LLinvT)

        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        if missing_data:
            dL_dpsi0 = -Ds * (beta * np.ones((N, ))) / 2.
        else:
            dL_dpsi0 = -D * (beta * np.ones((N, ))) / 2.

        if uncertain_outputs:
            Ym, Ys = Y.mean, Y.variance
            dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, Ym.dot(mu.T).T)[0],
                              trans=1)[0].T * beta
        elif missing_data:
            dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, Y_masked.dot(mu.T).T)[0],
                              trans=1)[0].T * beta
        else:
            dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, Y.dot(mu.T).T)[0],
                              trans=1)[0].T * beta

        if uncertain_inputs:
            if missing_data:
                dL_dpsi2 = np.swapaxes(
                    (Ds[:, None, None] * np.eye(M)[None, :, :] -
                     LmInvSmuLmInvT).dot(LmInv), 1, 2).dot(LmInv) * beta / 2.
            else:
                dL_dpsi2 = beta * backsub_both_sides(
                    Lm, D * np.eye(M) - LmInvSmuLmInvT, 'left') / 2.
        else:
            # With certain inputs psi2 is implicit (psi1^T psi1), so its gradient
            # is folded into dL_dpsi1. The beta-free psi2 gradient is computed
            # first (beta is applied in the fold below); this assumes the fully
            # observed case, where D is defined.
            dL_dpsi2 = backsub_both_sides(Lm, D * np.eye(M) - LmInvSmuLmInvT,
                                          'left') / 2.
            dL_dpsi1 += beta * psi1.dot(dL_dpsi2 + dL_dpsi2.T)
            dL_dpsi2 = None

        if uncertain_inputs:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dpsi0': dL_dpsi0,
                'dL_dpsi1': dL_dpsi1,
                'dL_dpsi2': dL_dpsi2,
                'dL_dthetaL': dL_dthetaL
            }
        else:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dKdiag': dL_dpsi0,
                'dL_dKnm': dL_dpsi1,
                'dL_dthetaL': dL_dthetaL
            }

        if uncertain_outputs:
            Ym = Y.mean
            grad_dict['dL_dYmean'] = -Ym * beta + dtrtrs(Lm, psi1.T)[0].T.dot(
                dtrtrs(Lm, mu)[0])
            grad_dict['dL_dYvar'] = beta / -2.

        return logL, grad_dict
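
The missing-data branch at the top of this inference builds a 0/1 mask, per-row counts of observed outputs Ds, and a NaN-free copy of Y. A toy run of that bookkeeping:

import numpy as np

Y = np.array([[1.0, np.nan],
              [2.0, 3.0],
              [np.nan, np.nan]])

Ds = Y.shape[1] - np.isnan(Y).sum(1)    # observed outputs per row -> [1 2 0]
Ymask = (~np.isnan(Y)).astype(int)      # 0/1 mask of observed entries
Y_masked = np.zeros_like(Y)
Y_masked[Ymask == 1] = Y[Ymask == 1]    # NaNs replaced by zeros, so they drop out of sums
ND = Ymask.sum()                        # total observed entries -> 3
print(Ds, ND)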
Example #5
    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None, fixed_covs_kerns=None, **kw):

        _, output_dim = Y.shape
        uncertain_inputs = isinstance(X, VariationalPosterior)

        #see whether we've got a different noise variance for each datum
        beta = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)
        # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
        #self.YYTfactor = self.get_YYTfactor(Y)
        #VVT_factor = self.get_VVTfactor(self.YYTfactor, beta)
        het_noise = beta.size > 1

        if het_noise:
            raise NotImplementedError(
                "Heteroscedastic noise not implemented; should be possible though, "
                "feel free to try implementing it :)")

        if beta.ndim == 1:
            beta = beta[:, None]


        # do the inference:
        num_inducing = Z.shape[0]
        num_data = Y.shape[0]
        # kernel computations, using BGPLVM notation

        Kmm = kern.K(Z).copy()
        diag.add(Kmm, self.const_jitter)
        if Lm is None:
            Lm = jitchol(Kmm)

        # The rather complex computations of A, and the psi stats
        if uncertain_inputs:
            psi0 = kern.psi0(Z, X)
            psi1 = kern.psi1(Z, X)
            if het_noise:
                psi2_beta = np.sum([kern.psi2(Z, X[i:i+1, :]) * beta_i
                                    for i, beta_i in enumerate(beta)], 0)
            else:
                psi2_beta = kern.psi2(Z, X) * beta
            LmInv = dtrtri(Lm)
            A = LmInv.dot(psi2_beta.dot(LmInv.T))
        else:
            psi0 = kern.Kdiag(X)
            psi1 = kern.K(X, Z)
            # the heteroscedastic case raises above, so beta is scalar here
            tmp = psi1 * np.sqrt(beta)
            tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
            A = tdot(tmp)

        # factor B
        B = np.eye(num_inducing) + A
        LB = jitchol(B)
        # back substitute C into psi1Vf
        #tmp, _ = dtrtrs(Lm, psi1.T.dot(VVT_factor), lower=1, trans=0)
        #_LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
        #tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
        #Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

        # data fit and derivative of L w.r.t. Kmm
        #delit = tdot(_LBi_Lmi_psi1Vf)

        # Expose YYT to get additional covariates in (YYT + Kgg):
        tmp, _ = dtrtrs(Lm, psi1.T, lower=1, trans=0)
        _LBi_Lmi_psi1, _ = dtrtrs(LB, tmp, lower=1, trans=0)
        tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1, lower=1, trans=1)
        Cpsi1, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

        # TODO: cache this:
        # Compute fixed covariates covariance:
        if fixed_covs_kerns is not None:
            K_fixed = 0
            for name, [cov, k] in fixed_covs_kerns.items():
                K_fixed += k.K(cov)

            #trYYT = self.get_trYYT(Y)
            YYT_covs = (tdot(Y) + K_fixed)
            data_term = beta**2 * YYT_covs
            trYYT_covs = np.trace(YYT_covs)
        else:
            data_term = beta**2 * tdot(Y)
            trYYT_covs = self.get_trYYT(Y)

        #trYYT = self.get_trYYT(Y)
        delit = mdot(_LBi_Lmi_psi1, data_term, _LBi_Lmi_psi1.T)
        data_fit = np.trace(delit)

        DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
        if dL_dKmm is None:
            delit = -0.5 * DBi_plus_BiPBi
            delit += -0.5 * B * output_dim
            delit += output_dim * np.eye(num_inducing)
            # Compute dL_dKmm
            dL_dKmm = backsub_both_sides(Lm, delit)

        # derivatives of L w.r.t. psi
        dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm,
            data_term, Cpsi1, DBi_plus_BiPBi,
            psi1, het_noise, uncertain_inputs)

        # log marginal likelihood
        log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise,
            psi0, A, LB, trYYT_covs, data_fit, Y)

        if self.save_per_dim:
            self.saved_vals = [psi0, A, LB, _LBi_Lmi_psi1, beta]

        # No heteroscedastics, so no _LBi_Lmi_psi1Vf:
        # for the interested reader, try implementing the heteroscedastic version; it should be possible
        _LBi_Lmi_psi1Vf = None  # just here for documentation, so you can see what it was

        #noise derivatives
        dL_dR = _compute_dL_dR(likelihood,
            het_noise, uncertain_inputs, LB,
            _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A,
            psi0, psi1, beta,
            data_fit, num_data, output_dim, trYYT_covs, Y, None)

        dL_dthetaL = likelihood.exact_inference_gradients(dL_dR,Y_metadata)

        #put the gradients in the right places
        if uncertain_inputs:
            grad_dict = {'dL_dKmm': dL_dKmm,
                         'dL_dpsi0':dL_dpsi0,
                         'dL_dpsi1':dL_dpsi1,
                         'dL_dpsi2':dL_dpsi2,
                         'dL_dthetaL':dL_dthetaL}
        else:
            grad_dict = {'dL_dKmm': dL_dKmm,
                         'dL_dKdiag':dL_dpsi0,
                         'dL_dKnm':dL_dpsi1,
                         'dL_dthetaL':dL_dthetaL}

        if fixed_covs_kerns is not None:
            # For now, we do not take the gradients; we can compute them,
            # but the maximum-likelihood solution is to switch off the additional covariates.
            dL_dcovs = beta * np.eye(K_fixed.shape[0]) - beta**2 * tdot(_LBi_Lmi_psi1.T)
            grad_dict['dL_dcovs'] = -.5 * dL_dcovs

        # get sufficient things for posterior prediction
        # TODO: do we really want to do this in the loop?
        woodbury_vector = (beta * Cpsi1).dot(Y)
        Bi, _ = dpotri(LB, lower=1)
        symmetrify(Bi)
        Bi = -Bi
        diag.add(Bi, 1)  # Bi = I - B^{-1}

        woodbury_inv = backsub_both_sides(Lm, Bi)

        #construct a posterior object
        post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
        return post, log_marginal, grad_dict
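
The posterior assembled at the end (Bi = I - B^{-1} pushed back through Lm) is the usual sparse-GP Woodbury inverse. A small numerical sketch of that identity, on hypothetical toy matrices with scalar beta (the certain-inputs, homoscedastic case):

import numpy as np
from scipy.linalg import cholesky, solve_triangular

rng = np.random.default_rng(1)
M, N, beta = 4, 10, 2.0
Knm = rng.standard_normal((N, M))          # standing in for psi1 = K(X, Z)
W = rng.standard_normal((M, M))
Kmm = W.dot(W.T) + M * np.eye(M)

Lm = cholesky(Kmm, lower=True)
tmp = solve_triangular(Lm, Knm.T, lower=True) * np.sqrt(beta)
A = tmp.dot(tmp.T)                         # A = beta * Lm^{-1} Knm^T Knm Lm^{-T}
B = np.eye(M) + A
Bi = np.eye(M) - np.linalg.inv(B)          # the 'Bi' built via dpotri above

LmInv = np.linalg.inv(Lm)
woodbury_inv = LmInv.T.dot(Bi).dot(LmInv)  # ~ backsub_both_sides(Lm, Bi)

# identity: woodbury_inv = Kmm^{-1} - (Kmm + beta * Knm^T Knm)^{-1}
direct = np.linalg.inv(Kmm) - np.linalg.inv(Kmm + beta * Knm.T.dot(Knm))
assert np.allclose(woodbury_inv, direct)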
Example #6
    def inference(self, kern_r, kern_c, Xr, Xc, Zr, Zc, likelihood, Y, qU_mean,
                  qU_var_r, qU_var_c):
        """
        The SVI-VarDTC inference
        """

        N, D = Y.shape[0], Y.shape[1]
        Mr, Mc = Zr.shape[0], Zc.shape[0]
        Qr, Qc = Zr.shape[1], Zc.shape[1]

        uncertain_inputs_r = isinstance(Xr, VariationalPosterior)
        uncertain_inputs_c = isinstance(Xc, VariationalPosterior)
        uncertain_outputs = isinstance(Y, VariationalPosterior)

        beta = 1. / likelihood.variance

        psi0_r, psi1_r, psi2_r = self.gatherPsiStat(kern_r, Xr, Zr,
                                                    uncertain_inputs_r)
        psi0_c, psi1_c, psi2_c = self.gatherPsiStat(kern_c, Xc, Zc,
                                                    uncertain_inputs_c)

        #======================================================================
        # Compute Common Components
        #======================================================================

        Kuu_r = kern_r.K(Zr).copy()
        diag.add(Kuu_r, self.const_jitter)
        Lr = jitchol(Kuu_r)

        Kuu_c = kern_c.K(Zc).copy()
        diag.add(Kuu_c, self.const_jitter)
        Lc = jitchol(Kuu_c)

        mu, Sr, Sc = qU_mean, qU_var_r, qU_var_c
        LSr = jitchol(Sr)
        LSc = jitchol(Sc)

        LcInvMLrInvT = dtrtrs(Lc, dtrtrs(Lr, mu.T)[0].T)[0]
        LcInvPsi2_cLcInvT = backsub_both_sides(Lc, psi2_c, 'right')
        LrInvPsi2_rLrInvT = backsub_both_sides(Lr, psi2_r, 'right')
        LcInvLSc = dtrtrs(Lc, LSc)[0]
        LrInvLSr = dtrtrs(Lr, LSr)[0]
        LcInvScLcInvT = tdot(LcInvLSc)
        LrInvSrLrInvT = tdot(LrInvLSr)
        LcInvPsi1_cT = dtrtrs(Lc, psi1_c.T)[0]
        LrInvPsi1_rT = dtrtrs(Lr, psi1_r.T)[0]

        tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT = (LrInvPsi2_rLrInvT *
                                              LrInvSrLrInvT).sum()
        tr_LcInvPsi2_cLcInvT_LcInvScLcInvT = (LcInvPsi2_cLcInvT *
                                              LcInvScLcInvT).sum()
        tr_LrInvSrLrInvT = np.square(LrInvLSr).sum()
        tr_LcInvScLcInvT = np.square(LcInvLSc).sum()
        tr_LrInvPsi2_rLrInvT = np.trace(LrInvPsi2_rLrInvT)
        tr_LcInvPsi2_cLcInvT = np.trace(LcInvPsi2_cLcInvT)

        #======================================================================
        # Compute log-likelihood
        #======================================================================

        logL_A = - np.square(Y).sum() \
               - (LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT)*LrInvPsi2_rLrInvT).sum() \
               -  tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT* tr_LcInvPsi2_cLcInvT_LcInvScLcInvT \
               + 2 * (Y * LcInvPsi1_cT.T.dot(LcInvMLrInvT).dot(LrInvPsi1_rT)).sum() - psi0_c * psi0_r \
               + tr_LrInvPsi2_rLrInvT * tr_LcInvPsi2_cLcInvT

        logL = -N*D/2.*(np.log(2.*np.pi)-np.log(beta)) + beta/2.* logL_A \
               -Mc * (np.log(np.diag(Lr)).sum()-np.log(np.diag(LSr)).sum())  -Mr * (np.log(np.diag(Lc)).sum()-np.log(np.diag(LSc)).sum()) \
               - np.square(LcInvMLrInvT).sum()/2. - tr_LrInvSrLrInvT * tr_LcInvScLcInvT/2. + Mr*Mc/2.

        #======================================================================
        # Compute dL_dKuu
        #======================================================================

        tmp =  beta* LcInvPsi2_cLcInvT.dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT).dot(LcInvMLrInvT.T) \
             + beta* tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvPsi2_cLcInvT.dot(LcInvScLcInvT) \
             - beta* LcInvMLrInvT.dot(LrInvPsi1_rT).dot(Y.T).dot(LcInvPsi1_cT.T) \
             - beta/2. * tr_LrInvPsi2_rLrInvT* LcInvPsi2_cLcInvT - Mr/2.*np.eye(Mc) \
             + tdot(LcInvMLrInvT)/2. + tr_LrInvSrLrInvT/2. * LcInvScLcInvT

        dL_dKuu_c = backsub_both_sides(Lc, tmp, 'left')
        dL_dKuu_c += dL_dKuu_c.T
        dL_dKuu_c *= 0.5

        tmp =  beta* LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT) \
             + beta* tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvPsi2_rLrInvT.dot(LrInvSrLrInvT) \
             - beta* LrInvPsi1_rT.dot(Y.T).dot(LcInvPsi1_cT.T).dot(LcInvMLrInvT) \
             - beta/2. * tr_LcInvPsi2_cLcInvT * LrInvPsi2_rLrInvT - Mc/2.*np.eye(Mr) \
             + tdot(LcInvMLrInvT.T)/2. + tr_LcInvScLcInvT/2. * LrInvSrLrInvT

        dL_dKuu_r = backsub_both_sides(Lr, tmp, 'left')
        dL_dKuu_r += dL_dKuu_r.T
        dL_dKuu_r *= 0.5

        #======================================================================
        # Compute dL_dthetaL
        #======================================================================

        dL_dthetaL = -D * N * beta / 2. - logL_A * beta * beta / 2.

        #======================================================================
        # Compute dL_dqU
        #======================================================================

        tmp = -beta * LcInvPsi2_cLcInvT.dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT)\
              + beta* LcInvPsi1_cT.dot(Y).dot(LrInvPsi1_rT.T) - LcInvMLrInvT

        dL_dqU_mean = dtrtrs(Lc, dtrtrs(Lr, tmp.T, trans=1)[0].T, trans=1)[0]

        LScInv = dtrtri(LSc)
        tmp = -beta / 2. * tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvPsi2_cLcInvT \
              - tr_LrInvSrLrInvT / 2. * np.eye(Mc)
        dL_dqU_var_c = backsub_both_sides(Lc, tmp, 'left') + tdot(LScInv.T) * Mr / 2.

        LSrInv = dtrtri(LSr)
        tmp = -beta / 2. * tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvPsi2_rLrInvT \
              - tr_LcInvScLcInvT / 2. * np.eye(Mr)
        dL_dqU_var_r = backsub_both_sides(Lr, tmp, 'left') + tdot(LSrInv.T) * Mc / 2.

        #======================================================================
        # Compute the Posterior distribution of inducing points p(u|Y)
        #======================================================================

        post = PosteriorMultioutput(LcInvMLrInvT=LcInvMLrInvT,
                                    LcInvScLcInvT=LcInvScLcInvT,
                                    LrInvSrLrInvT=LrInvSrLrInvT,
                                    Lr=Lr,
                                    Lc=Lc,
                                    kern_r=kern_r,
                                    Xr=Xr,
                                    Zr=Zr)

        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        dL_dpsi0_r = -psi0_c * beta / 2. * np.ones((D, ))
        dL_dpsi0_c = -psi0_r * beta / 2. * np.ones((N, ))

        dL_dpsi1_c = beta * dtrtrs(
            Lc, (Y.dot(LrInvPsi1_rT.T).dot(LcInvMLrInvT.T)).T, trans=1)[0].T
        dL_dpsi1_r = beta * dtrtrs(
            Lr, (Y.T.dot(LcInvPsi1_cT.T).dot(LcInvMLrInvT)).T, trans=1)[0].T

        tmp = beta / 2. * (
            -LcInvMLrInvT.dot(LrInvPsi2_rLrInvT).dot(LcInvMLrInvT.T) -
            tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvScLcInvT +
            tr_LrInvPsi2_rLrInvT * np.eye(Mc))
        dL_dpsi2_c = backsub_both_sides(Lc, tmp, 'left')
        tmp = beta / 2. * (
            -LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT) -
            tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvSrLrInvT +
            tr_LcInvPsi2_cLcInvT * np.eye(Mr))
        dL_dpsi2_r = backsub_both_sides(Lr, tmp, 'left')

        if not uncertain_inputs_r:
            dL_dpsi1_r += psi1_r.dot(dL_dpsi2_r + dL_dpsi2_r.T)
        if not uncertain_inputs_c:
            dL_dpsi1_c += psi1_c.dot(dL_dpsi2_c + dL_dpsi2_c.T)

        grad_dict = {
            'dL_dthetaL': dL_dthetaL,
            'dL_dqU_mean': dL_dqU_mean,
            'dL_dqU_var_c': dL_dqU_var_c,
            'dL_dqU_var_r': dL_dqU_var_r,
            'dL_dKuu_c': dL_dKuu_c,
            'dL_dKuu_r': dL_dKuu_r,
        }

        if uncertain_inputs_c:
            grad_dict['dL_dpsi0_c'] = dL_dpsi0_c
            grad_dict['dL_dpsi1_c'] = dL_dpsi1_c
            grad_dict['dL_dpsi2_c'] = dL_dpsi2_c
        else:
            grad_dict['dL_dKdiag_c'] = dL_dpsi0_c
            grad_dict['dL_dKfu_c'] = dL_dpsi1_c

        if uncertain_inputs_r:
            grad_dict['dL_dpsi0_r'] = dL_dpsi0_r
            grad_dict['dL_dpsi1_r'] = dL_dpsi1_r
            grad_dict['dL_dpsi2_r'] = dL_dpsi2_r
        else:
            grad_dict['dL_dKdiag_r'] = dL_dpsi0_r
            grad_dict['dL_dKfu_r'] = dL_dpsi1_r

        return post, logL, grad_dict
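
The row/column factorisation in this bound works because traces of Kronecker products split: tr((Kr ⊗ Kc)^{-1}(Sr ⊗ Sc)) = tr(Kr^{-1} Sr) * tr(Kc^{-1} Sc), which is why products such as tr_LrInvSrLrInvT * tr_LcInvScLcInvT appear in logL. A tiny check of that identity on hypothetical matrices:

import numpy as np

rng = np.random.default_rng(2)
def spd(n):
    A = rng.standard_normal((n, n))
    return A.dot(A.T) + n * np.eye(n)

Kr, Sr = spd(2), spd(2)   # row covariances (Mr = 2)
Kc, Sc = spd(3), spd(3)   # column covariances (Mc = 3)

lhs = np.trace(np.linalg.solve(np.kron(Kr, Kc), np.kron(Sr, Sc)))
rhs = np.trace(np.linalg.solve(Kr, Sr)) * np.trace(np.linalg.solve(Kc, Sc))
assert np.isclose(lhs, rhs)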
Example #7
    def inference_root(self,
                       kern,
                       X,
                       Z,
                       likelihood,
                       Y,
                       Kuu_sigma=None,
                       Y_metadata=None,
                       Lm=None,
                       dL_dKmm=None):
        """
        The first phase of inference:
        Compute: log-likelihood, dL_dKmm

        Cached intermediate results: Kmm, KmmInv
        """

        num_data, output_dim = Y.shape
        input_dim = Z.shape[0]  # the number of inducing points, M
        num_data_total = allReduceArrays([np.int32(num_data)],
                                         self.mpi_comm)[0]

        uncertain_inputs = isinstance(X, VariationalPosterior)
        uncertain_outputs = isinstance(Y, VariationalPosterior)

        beta = 1. / np.fmax(likelihood.variance, 1e-6)

        psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat(
            kern, X, Z, Y, beta, uncertain_inputs)

        #======================================================================
        # Compute Common Components
        #======================================================================

        try:
            Kmm = kern.K(Z).copy()
            if Kuu_sigma is not None:
                diag.add(Kmm, Kuu_sigma)
            else:
                diag.add(Kmm, self.const_jitter)
            Lm = jitchol(Kmm)

            LmInv = dtrtri(Lm)
            LmInvPsi2LmInvT = LmInv.dot(psi2.dot(LmInv.T))

            Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
            LL = jitchol(Lambda)
            LLInv = dtrtri(LL)
            flag = np.zeros((1, ), dtype=np.int32)
            self.mpi_comm.Bcast(flag, root=self.root)
        except LinAlgError as e:
            flag = np.ones((1, ), dtype=np.int32)
            self.mpi_comm.Bcast(flag, root=self.root)
            raise e

        broadcastArrays([LmInv, LLInv], self.mpi_comm, self.root)
        LmLLInv = LLInv.dot(LmInv)

        logdet_L = 2. * np.sum(np.log(np.diag(LL)))
        b = psi1Y.dot(LmLLInv.T)
        bbt = np.square(b).sum()
        v = b.dot(LmLLInv)
        LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

        if psi1S is not None:
            psi1SLLinv = psi1S.dot(LmLLInv.T)
            bbt_sum = np.square(psi1SLLinv).sum()
            LLinvPsi1TYYTPsi1LLinvT_sum = tdot(psi1SLLinv.T)
            bbt_sum, LLinvPsi1TYYTPsi1LLinvT_sum = reduceArrays(
                [bbt_sum, LLinvPsi1TYYTPsi1LLinvT_sum], self.mpi_comm,
                self.root)
            bbt += bbt_sum
            LLinvPsi1TYYTPsi1LLinvT += LLinvPsi1TYYTPsi1LLinvT_sum
            psi1SP = psi1SLLinv.dot(LmLLInv)
        tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT +
                           output_dim * np.eye(input_dim)).dot(LLInv)
        dL_dpsi2R = LmInv.T.dot(tmp +
                                output_dim * np.eye(input_dim)).dot(LmInv) / 2.
        broadcastArrays([dL_dpsi2R], self.mpi_comm, self.root)

        #======================================================================
        # Compute log-likelihood
        #======================================================================
        logL_R = -num_data_total * np.log(beta)
        logL = -(output_dim * (num_data_total * log_2_pi + logL_R + psi0
                               - np.trace(LmInvPsi2LmInvT))
                 + YRY - bbt) / 2. - output_dim * logdet_L / 2.

        #======================================================================
        # Compute dL_dKmm
        #======================================================================

        dL_dKmm = dL_dpsi2R - output_dim * LmInv.T.dot(LmInvPsi2LmInvT).dot(
            LmInv) / 2.

        #======================================================================
        # Compute the Posterior distribution of inducing points p(u|Y)
        #======================================================================

        wd_inv = backsub_both_sides(
            Lm,
            np.eye(input_dim) -
            backsub_both_sides(LL, np.identity(input_dim), transpose='left'),
            transpose='left')
        post = Posterior(woodbury_inv=wd_inv,
                         woodbury_vector=v.T,
                         K=Kmm,
                         mean=None,
                         cov=None,
                         K_chol=Lm)

        #======================================================================
        # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
        #======================================================================

        dL_dthetaL = (YRY * beta + beta * output_dim * psi0
                      - num_data_total * output_dim * beta) / 2. \
                     - beta * (dL_dpsi2R * psi2).sum() \
                     - beta * np.trace(LLinvPsi1TYYTPsi1LLinvT)

        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        dL_dpsi0 = -output_dim * (beta * np.ones((num_data, ))) / 2.

        if uncertain_outputs:
            m, s = Y.mean, Y.variance
            dL_dpsi1 = beta * (np.dot(m, v) + Shalf[:, None] * psi1SP)
        else:
            dL_dpsi1 = beta * np.dot(Y, v)

        if uncertain_inputs:
            dL_dpsi2 = beta * dL_dpsi2R
        else:
            dL_dpsi1 += np.dot(psi1, dL_dpsi2R) * 2.
            dL_dpsi2 = None

        if uncertain_inputs:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dpsi0': dL_dpsi0,
                'dL_dpsi1': dL_dpsi1,
                'dL_dpsi2': dL_dpsi2,
                'dL_dthetaL': dL_dthetaL
            }
        else:
            grad_dict = {
                'dL_dKmm': dL_dKmm,
                'dL_dKdiag': dL_dpsi0,
                'dL_dKnm': dL_dpsi1,
                'dL_dthetaL': dL_dthetaL
            }

        if uncertain_outputs:
            m, s = Y.mean, Y.variance
            psi1LmiLLi = psi1.dot(LmLLInv.T)
            LLiLmipsi1Y = b.T
            grad_dict['dL_dYmean'] = -m * beta + psi1LmiLLi.dot(LLiLmipsi1Y)
            grad_dict['dL_dYvar'] = beta / -2. + np.square(psi1LmiLLi).sum(axis=1) / 2.

        return post, logL, grad_dict
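
The try/except around the Cholesky factorisations broadcasts an integer flag so that worker ranks learn whether the root succeeded before they block on the subsequent array broadcasts. A hedged mpi4py sketch of that pattern (an assumption about how GPy's parallel helpers map onto raw mpi4py calls):

import numpy as np
from numpy.linalg import LinAlgError, cholesky
from mpi4py import MPI

comm = MPI.COMM_WORLD
root = 0
flag = np.zeros(1, dtype=np.int32)

if comm.rank == root:
    try:
        K = np.eye(3)      # stand-in for Kmm
        L = cholesky(K)    # the factorisation that may fail
    except LinAlgError:
        flag[:] = 1        # record the failure instead of raising immediately

comm.Bcast(flag, root=root)  # every rank learns the outcome
if flag[0] == 1:
    raise RuntimeError("Cholesky failed on the root; aborting on all ranks")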