def parameters_changed(self):
    N, D = self.Y.shape
    Kss = self.kern.K(self.X)
    Ksu = self.kern.K(self.X, self.Z)
    wv = self.posterior.woodbury_vector
    wi = self.posterior.woodbury_inv

    # Residual and its marginal covariance under the sparse posterior
    a = self.Y - Ksu.dot(wv)
    C = Kss + np.eye(N)*self.likelihood.variance - Ksu.dot(wi).dot(Ksu.T)
    Lc = jitchol(C)
    LcInva = dtrtrs(Lc, a)[0]
    LcInv = dtrtri(Lc)
    CInva = dtrtrs(Lc, LcInva, trans=1)[0]

    # Gaussian log marginal likelihood via the Cholesky factor of C
    self._log_marginal_likelihood = -N*D/2.*np.log(2*np.pi) - D*np.log(np.diag(Lc)).sum() - np.square(LcInva).sum()/2.

    # Gradients w.r.t. the kernel matrices, then w.r.t. X
    dKsu = CInva.dot(wv.T)
    dKss = tdot(CInva)/2. - D*tdot(LcInv.T)/2.
    dKsu += -2.*dKss.dot(Ksu).dot(wi)

    X_grad = self.kern.gradients_X(dKss, self.X)
    X_grad += self.kern.gradients_X(dKsu, self.X, self.Z)
    self.X.gradient = X_grad

    if self.uncertain_input:
        # Update the log-likelihood with the KL divergence term
        KL_div = self.variational_prior.KL_divergence(self.X)
        # update the gradients for the KL divergence
        self.variational_prior.update_gradients_KL(self.X)
        self._log_marginal_likelihood += -KL_div
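
# ----------------------------------------------------------------------
# Illustrative sketch (not part of the class above): the log-marginal in
# `parameters_changed` uses the standard Cholesky identities
#   log|C| = 2*sum(log(diag(Lc)))   and   a^T C^{-1} a = ||Lc^{-1} a||^2,
# where C = Lc Lc^T. The helper below is a hypothetical, self-contained
# check of that pattern with plain numpy/scipy for a generic SPD matrix C.
import numpy as np
from scipy.linalg import cholesky, solve_triangular
from scipy.stats import multivariate_normal

def _chol_gaussian_logpdf_sketch(a, C):
    # a: (N, D) residuals, C: (N, N) SPD covariance shared across the D columns
    N, D = a.shape
    Lc = cholesky(C, lower=True)                  # plays the role of jitchol(C)
    LcInva = solve_triangular(Lc, a, lower=True)  # plays the role of dtrtrs(Lc, a)[0]
    return (-N*D/2.*np.log(2*np.pi)
            - D*np.log(np.diag(Lc)).sum()
            - np.square(LcInva).sum()/2.)

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    A = rng.randn(5, 5)
    C = A.dot(A.T) + 5*np.eye(5)
    a = rng.randn(5, 3)
    direct = sum(multivariate_normal(mean=np.zeros(5), cov=C).logpdf(a[:, d]) for d in range(3))
    assert np.allclose(_chol_gaussian_logpdf_sketch(a, C), direct)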

def comp_KL_qU(self, qU_mean, qU_var):
    M, D = qU_mean.shape[0], qU_mean.shape[1]
    qU_L = self.mid['qU_L']
    L = self.mid['L']
    Linvmu = self.mid['Linvmu']
    LinvLu = self.mid['LinvLu']

    KuuInv = dpotri(L, lower=1)[0]
    Lu = qU_L
    LuInv = dtrtri(Lu)

    KL = D*M/-2. - np.log(np.diag(Lu)).sum()*D + np.log(np.diag(L)).sum()*D \
         + np.square(LinvLu).sum()/2.*D + np.square(Linvmu).sum()/2.

    dKL_dqU_mean = dtrtrs(L, Linvmu, trans=True)[0]
    dKL_dqU_var = (tdot(LuInv.T)/-2. + KuuInv/2.)*D
    dKL_dKuu = KuuInv*D/2. - KuuInv.dot(tdot(qU_mean) + qU_var*D).dot(KuuInv)/2.

    return float(KL), dKL_dqU_mean, dKL_dqU_var, dKL_dKuu
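
# ----------------------------------------------------------------------
# Illustrative sketch: `comp_KL_qU` evaluates the closed-form Gaussian KL
#   KL(q(U) || p(U)) = sum_d KL( N(mu_d, S) || N(0, Kuu) ),
# with the covariance S = Lu Lu^T shared across the D columns of qU_mean.
# The hypothetical helpers below cross-check the scalar KL value with plain
# numpy (gradients omitted); they are not used by the inference code above.
import numpy as np

def _gauss_kl_sketch(mu, Lu, L):
    # mu: (M, D) mean, Lu: chol of the q covariance, L: chol of Kuu (both lower)
    M, D = mu.shape
    LinvLu = np.linalg.solve(L, Lu)
    Linvmu = np.linalg.solve(L, mu)
    return (D*M/-2. - np.log(np.diag(Lu)).sum()*D + np.log(np.diag(L)).sum()*D
            + np.square(LinvLu).sum()/2.*D + np.square(Linvmu).sum()/2.)

def _gauss_kl_reference(mu, Lu, L):
    # Textbook KL(N(m, S) || N(0, K)) summed over the D columns of mu.
    M, D = mu.shape
    K, S = L.dot(L.T), Lu.dot(Lu.T)
    Kinv = np.linalg.inv(K)
    per_col = 0.5*(np.trace(Kinv.dot(S)) - M
                   + np.linalg.slogdet(K)[1] - np.linalg.slogdet(S)[1])
    return D*per_col + 0.5*np.einsum('id,ij,jd->', mu, Kinv, mu)

# e.g. for random lower-triangular Lu, L with positive diagonals the two
# functions agree to numerical precision.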

def inference(self, kern_r, kern_c, Xr, Xc, Zr, Zc, likelihood, Y, qU_mean, qU_var_r, qU_var_c, indexD, output_dim):
    """
    The SVI-VarDTC inference
    """

    N, D, Mr, Mc, Qr, Qc = Y.shape[0], output_dim, Zr.shape[0], Zc.shape[0], Zr.shape[1], Zc.shape[1]

    uncertain_inputs_r = isinstance(Xr, VariationalPosterior)
    uncertain_inputs_c = isinstance(Xc, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    grad_dict = self._init_grad_dict(N, D, Mr, Mc)

    beta = 1./likelihood.variance
    if len(beta) == 1:
        beta = np.zeros(D) + beta

    psi0_r, psi1_r, psi2_r = self.gatherPsiStat(kern_r, Xr, Zr, uncertain_inputs_r)
    psi0_c, psi1_c, psi2_c = self.gatherPsiStat(kern_c, Xc, Zc, uncertain_inputs_c)

    #======================================================================
    # Compute Common Components
    #======================================================================

    Kuu_r = kern_r.K(Zr).copy()
    diag.add(Kuu_r, self.const_jitter)
    Lr = jitchol(Kuu_r)

    Kuu_c = kern_c.K(Zc).copy()
    diag.add(Kuu_c, self.const_jitter)
    Lc = jitchol(Kuu_c)

    mu, Sr, Sc = qU_mean, qU_var_r, qU_var_c
    LSr = jitchol(Sr)
    LSc = jitchol(Sc)

    LcInvMLrInvT = dtrtrs(Lc, dtrtrs(Lr, mu.T)[0].T)[0]
    LcInvLSc = dtrtrs(Lc, LSc)[0]
    LrInvLSr = dtrtrs(Lr, LSr)[0]
    LcInvScLcInvT = tdot(LcInvLSc)
    LrInvSrLrInvT = tdot(LrInvLSr)
    tr_LrInvSrLrInvT = np.square(LrInvLSr).sum()
    tr_LcInvScLcInvT = np.square(LcInvLSc).sum()

    mid_res = {
        'psi0_r': psi0_r,
        'psi1_r': psi1_r,
        'psi2_r': psi2_r,
        'psi0_c': psi0_c,
        'psi1_c': psi1_c,
        'psi2_c': psi2_c,
        'Lr': Lr,
        'Lc': Lc,
        'LcInvMLrInvT': LcInvMLrInvT,
        'LcInvScLcInvT': LcInvScLcInvT,
        'LrInvSrLrInvT': LrInvSrLrInvT,
    }

    #======================================================================
    # Compute log-likelihood
    #======================================================================

    logL = 0.
    for d in range(D):
        logL += self.inference_d(d, beta, Y, indexD, grad_dict, mid_res,
                                 uncertain_inputs_r, uncertain_inputs_c, Mr, Mc)

    logL += -Mc * (np.log(np.diag(Lr)).sum() - np.log(np.diag(LSr)).sum()) \
            - Mr * (np.log(np.diag(Lc)).sum() - np.log(np.diag(LSc)).sum()) \
            - np.square(LcInvMLrInvT).sum()/2. - tr_LrInvSrLrInvT*tr_LcInvScLcInvT/2. + Mr*Mc/2.

    #======================================================================
    # Compute dL_dKuu
    #======================================================================

    tmp = tdot(LcInvMLrInvT)/2. + tr_LrInvSrLrInvT/2.*LcInvScLcInvT - Mr/2.*np.eye(Mc)

    dL_dKuu_c = backsub_both_sides(Lc, tmp, 'left')
    dL_dKuu_c += dL_dKuu_c.T
    dL_dKuu_c *= 0.5

    tmp = tdot(LcInvMLrInvT.T)/2. + tr_LcInvScLcInvT/2.*LrInvSrLrInvT - Mc/2.*np.eye(Mr)

    dL_dKuu_r = backsub_both_sides(Lr, tmp, 'left')
    dL_dKuu_r += dL_dKuu_r.T
    dL_dKuu_r *= 0.5

    #======================================================================
    # Compute dL_dqU
    #======================================================================

    tmp = -LcInvMLrInvT
    dL_dqU_mean = dtrtrs(Lc, dtrtrs(Lr, tmp.T, trans=1)[0].T, trans=1)[0]

    LScInv = dtrtri(LSc)
    tmp = -tr_LrInvSrLrInvT/2.*np.eye(Mc)
    dL_dqU_var_c = backsub_both_sides(Lc, tmp, 'left') + tdot(LScInv.T)*Mr/2.

    LSrInv = dtrtri(LSr)
    tmp = -tr_LcInvScLcInvT/2.*np.eye(Mr)
    dL_dqU_var_r = backsub_both_sides(Lr, tmp, 'left') + tdot(LSrInv.T)*Mc/2.

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    post = PosteriorMultioutput(LcInvMLrInvT=LcInvMLrInvT,
                                LcInvScLcInvT=LcInvScLcInvT,
                                LrInvSrLrInvT=LrInvSrLrInvT,
                                Lr=Lr, Lc=Lc, kern_r=kern_r, Xr=Xr, Zr=Zr)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    grad_dict['dL_dqU_mean'] += dL_dqU_mean
    grad_dict['dL_dqU_var_c'] += dL_dqU_var_c
    grad_dict['dL_dqU_var_r'] += dL_dqU_var_r
    grad_dict['dL_dKuu_c'] += dL_dKuu_c
    grad_dict['dL_dKuu_r'] += dL_dKuu_r

    if not uncertain_inputs_c:
        grad_dict['dL_dKdiag_c'] = grad_dict['dL_dpsi0_c']
        grad_dict['dL_dKfu_c'] = grad_dict['dL_dpsi1_c']

    if not uncertain_inputs_r:
        grad_dict['dL_dKdiag_r'] = grad_dict['dL_dpsi0_r']
        grad_dict['dL_dKfu_r'] = grad_dict['dL_dpsi1_r']

    return post, logL, grad_dict
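
# ----------------------------------------------------------------------
# Illustrative sketch: the Kronecker-structured terms above repeatedly
# "whiten" matrices with the row/column Cholesky factors, e.g.
#   LcInvMLrInvT = Lc^{-1} M Lr^{-T}
# via nested dtrtrs calls, while backsub_both_sides(L, X, 'left'/'right')
# is used for products of the form L^{-T} X L^{-1} / L^{-1} X L^{-T} (this
# is the assumed GPy convention; the 'left'/'right' naming is not defined
# in this listing). Hypothetical scipy equivalents of the two patterns:
import numpy as np
from scipy.linalg import solve_triangular

def _whiten_both_sides_sketch(Lc, M, Lr):
    # Equivalent of dtrtrs(Lc, dtrtrs(Lr, M.T)[0].T)[0]: returns Lc^{-1} M Lr^{-T}.
    tmp = solve_triangular(Lr, M.T, lower=True)      # Lr^{-1} M^T
    return solve_triangular(Lc, tmp.T, lower=True)   # Lc^{-1} M Lr^{-T}

def _backsub_both_sides_sketch(L, X, transpose='left'):
    # Assumed semantics: 'left' -> L^{-T} X L^{-1}, 'right' -> L^{-1} X L^{-T}
    # (X symmetric).
    if transpose == 'left':
        tmp = solve_triangular(L, X, lower=True, trans='T')        # L^{-T} X
        return solve_triangular(L, tmp.T, lower=True, trans='T').T
    else:
        tmp = solve_triangular(L, X, lower=True)                   # L^{-1} X
        return solve_triangular(L, tmp.T, lower=True).T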

def inference(self, kern, X, Z, likelihood, Y, qU):
    """
    The SVI-VarDTC inference
    """

    if isinstance(Y, np.ndarray) and np.any(np.isnan(Y)):
        missing_data = True
        N, M, Q = Y.shape[0], Z.shape[0], Z.shape[1]
        Ds = Y.shape[1] - (np.isnan(Y)*1).sum(1)
        Ymask = 1 - np.isnan(Y)*1
        Y_masked = np.zeros_like(Y)
        Y_masked[Ymask == 1] = Y[Ymask == 1]
        ND = Ymask.sum()
    else:
        missing_data = False
        N, D, M, Q = Y.shape[0], Y.shape[1], Z.shape[0], Z.shape[1]
        ND = N*D

    uncertain_inputs = isinstance(X, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    beta = 1./np.fmax(likelihood.variance, 1e-6)

    psi0, psi2, YRY, psi1, psi1Y = self.gatherPsiStat(
        kern, X, Z, Y if not missing_data else Y_masked, beta,
        uncertain_inputs, D if not missing_data else Ds, missing_data)

    #======================================================================
    # Compute Common Components
    #======================================================================

    mu, S = qU.mean, qU.covariance
    mupsi1Y = mu.dot(psi1Y)

    Kmm = kern.K(Z).copy()
    diag.add(Kmm, self.const_jitter)
    Lm = jitchol(Kmm)

    if missing_data:
        S_mu = S[None, :, :] + mu.T[:, :, None]*mu.T[:, None, :]
        NS_mu = S_mu.T.dot(Ymask.T).T
        LmInv = dtrtri(Lm)

        LmInvPsi2LmInvT = np.swapaxes(psi2.dot(LmInv.T), 1, 2).dot(LmInv.T)
        LmInvSmuLmInvT = np.swapaxes(NS_mu.dot(LmInv.T), 1, 2).dot(LmInv.T)

        B = mupsi1Y + mupsi1Y.T + (Ds[:, None, None]*psi2).sum(0)
        tmp = backsub_both_sides(Lm, B, 'right')

        logL = -ND*log_2_pi/2. + ND*np.log(beta)/2. - psi0/2. - YRY/2. \
               - (LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. + np.trace(tmp)/2.
    else:
        S_mu = S*D + tdot(mu)
        if uncertain_inputs:
            LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
        else:
            LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0])/beta  # tdot(psi1.dot(LmInv.T).T)/beta
        LmInvSmuLmInvT = backsub_both_sides(Lm, S_mu, 'right')

        B = mupsi1Y + mupsi1Y.T + D*psi2
        tmp = backsub_both_sides(Lm, B, 'right')

        logL = -ND*log_2_pi/2. + ND*np.log(beta)/2. - psi0/2. - YRY/2. \
               - (LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. + np.trace(tmp)/2.

    #======================================================================
    # Compute dL_dKmm
    #======================================================================

    dL_dKmm = np.eye(M)

    #======================================================================
    # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
    #======================================================================

    dL_dthetaL = None  # (YRY*beta + beta*output_dim*psi0 - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2).sum() - beta*np.trace(LLinvPsi1TYYTPsi1LLinvT)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    if missing_data:
        dL_dpsi0 = -Ds * (beta*np.ones((N,)))/2.
    else:
        dL_dpsi0 = -D * (beta*np.ones((N,)))/2.

    if uncertain_outputs:
        Ym, Ys = Y.mean, Y.variance
        dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, Ym.dot(mu.T).T)[0], trans=1)[0].T*beta
    else:
        if missing_data:
            dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, (Y_masked).dot(mu.T).T)[0], trans=1)[0].T*beta
        else:
            dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, Y.dot(mu.T).T)[0], trans=1)[0].T*beta

    if uncertain_inputs:
        if missing_data:
            dL_dpsi2 = np.swapaxes((Ds[:, None, None]*np.eye(M)[None, :, :] - LmInvSmuLmInvT).dot(LmInv), 1, 2).dot(LmInv)*beta/2.
        else:
            dL_dpsi2 = beta*backsub_both_sides(Lm, D*np.eye(M) - LmInvSmuLmInvT, 'left')/2.
    else:
        dL_dpsi1 += beta*psi1.dot(dL_dpsi2 + dL_dpsi2.T)
        dL_dpsi2 = None

    if uncertain_inputs:
        grad_dict = {
            'dL_dKmm': dL_dKmm,
            'dL_dpsi0': dL_dpsi0,
            'dL_dpsi1': dL_dpsi1,
            'dL_dpsi2': dL_dpsi2,
            'dL_dthetaL': dL_dthetaL
        }
    else:
        grad_dict = {
            'dL_dKmm': dL_dKmm,
            'dL_dKdiag': dL_dpsi0,
            'dL_dKnm': dL_dpsi1,
            'dL_dthetaL': dL_dthetaL
        }

    if uncertain_outputs:
        Ym = Y.mean
        grad_dict['dL_dYmean'] = -Ym*beta + dtrtrs(Lm, psi1.T)[0].T.dot(dtrtrs(Lm, mu)[0])
        grad_dict['dL_dYvar'] = beta/-2.

    return logL, grad_dict

def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None, fixed_covs_kerns=None, **kw):

    _, output_dim = Y.shape

    uncertain_inputs = isinstance(X, VariationalPosterior)

    # see whether we've got a different noise variance for each datum
    beta = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)
    # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
    # self.YYTfactor = self.get_YYTfactor(Y)
    # VVT_factor = self.get_VVTfactor(self.YYTfactor, beta)
    het_noise = beta.size > 1

    if het_noise:
        raise NotImplementedError("Heteroscedastic noise not implemented, should be possible though, feel free to try implementing it :)")

    if beta.ndim == 1:
        beta = beta[:, None]

    # do the inference:
    num_inducing = Z.shape[0]
    num_data = Y.shape[0]

    # kernel computations, using BGPLVM notation
    Kmm = kern.K(Z).copy()
    diag.add(Kmm, self.const_jitter)
    if Lm is None:
        Lm = jitchol(Kmm)

    # The rather complex computations of A, and the psi stats
    if uncertain_inputs:
        psi0 = kern.psi0(Z, X)
        psi1 = kern.psi1(Z, X)
        if het_noise:
            psi2_beta = np.sum([kern.psi2(Z, X[i:i+1, :])*beta_i for i, beta_i in enumerate(beta)], 0)
        else:
            psi2_beta = kern.psi2(Z, X)*beta
        LmInv = dtrtri(Lm)
        A = LmInv.dot(psi2_beta.dot(LmInv.T))
    else:
        psi0 = kern.Kdiag(X)
        psi1 = kern.K(X, Z)
        tmp = psi1*np.sqrt(beta)
        tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
        A = tdot(tmp)

    # factor B
    B = np.eye(num_inducing) + A
    LB = jitchol(B)

    # back substitute C into psi1Vf
    # tmp, _ = dtrtrs(Lm, psi1.T.dot(VVT_factor), lower=1, trans=0)
    # _LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
    # tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
    # Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

    # data fit and derivative of L w.r.t. Kmm
    # delit = tdot(_LBi_Lmi_psi1Vf)

    # Expose YYT to get additional covariates in (YYT + Kgg):
    tmp, _ = dtrtrs(Lm, psi1.T, lower=1, trans=0)
    _LBi_Lmi_psi1, _ = dtrtrs(LB, tmp, lower=1, trans=0)
    tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1, lower=1, trans=1)
    Cpsi1, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

    # TODO: cache this:
    # Compute fixed covariates covariance:
    if fixed_covs_kerns is not None:
        K_fixed = 0
        for name, [cov, k] in fixed_covs_kerns.items():
            K_fixed += k.K(cov)

        # trYYT = self.get_trYYT(Y)
        YYT_covs = tdot(Y) + K_fixed
        data_term = beta**2 * YYT_covs
        trYYT_covs = np.trace(YYT_covs)
    else:
        data_term = beta**2 * tdot(Y)
        trYYT_covs = self.get_trYYT(Y)

    delit = mdot(_LBi_Lmi_psi1, data_term, _LBi_Lmi_psi1.T)
    data_fit = np.trace(delit)

    DBi_plus_BiPBi = backsub_both_sides(LB, output_dim*np.eye(num_inducing) + delit)

    if dL_dKmm is None:
        delit = -0.5*DBi_plus_BiPBi
        delit += -0.5*B*output_dim
        delit += output_dim*np.eye(num_inducing)
        # Compute dL_dKmm
        dL_dKmm = backsub_both_sides(Lm, delit)

    # derivatives of L w.r.t. psi
    dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm,
                                                    data_term, Cpsi1, DBi_plus_BiPBi, psi1,
                                                    het_noise, uncertain_inputs)

    # log marginal likelihood
    log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise,
                                                    psi0, A, LB, trYYT_covs, data_fit, Y)

    if self.save_per_dim:
        self.saved_vals = [psi0, A, LB, _LBi_Lmi_psi1, beta]

    # No heteroscedastics, so no _LBi_Lmi_psi1Vf:
    # For the interested reader, try implementing the heteroscedastic version, it should be possible
    _LBi_Lmi_psi1Vf = None  # Is just here for documentation, so you can see what it was.

    # noise derivatives
    dL_dR = _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf,
                           DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, data_fit, num_data,
                           output_dim, trYYT_covs, Y, None)

    dL_dthetaL = likelihood.exact_inference_gradients(dL_dR, Y_metadata)

    # put the gradients in the right places
    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dpsi0': dL_dpsi0,
                     'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2,
                     'dL_dthetaL': dL_dthetaL}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL}

    if fixed_covs_kerns is not None:
        # For now, we do not take the gradients; we can compute them,
        # but the maximum likelihood solution is to switch off the additional covariates...
        dL_dcovs = beta*np.eye(K_fixed.shape[0]) - beta**2*tdot(_LBi_Lmi_psi1.T)
        grad_dict['dL_dcovs'] = -.5*dL_dcovs

    # get sufficient things for posterior prediction
    # TODO: do we really want to do this in the loop?
    woodbury_vector = (beta*Cpsi1).dot(Y)

    Bi = -dpotri(LB, lower=1)[0]
    diag.add(Bi, 1)
    woodbury_inv = backsub_both_sides(Lm, Bi)

    # construct a posterior object
    post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
                     K=Kmm, mean=None, cov=None, K_chol=Lm)

    return post, log_marginal, grad_dict
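
# ----------------------------------------------------------------------
# Illustrative sketch: the Posterior object is parameterised so that, for
# test points X*, the predictive moments follow the Woodbury form
#   mean = K_{*u} woodbury_vector,   cov = K_{**} - K_{*u} woodbury_inv K_{u*},
# which is the same convention `parameters_changed` at the top of this
# listing uses when consuming wv/wi. A hypothetical numpy-only predictor
# built from those two arrays:
import numpy as np

def _predict_from_woodbury_sketch(kern, Xstar, Z, woodbury_vector, woodbury_inv):
    Ksu = kern.K(Xstar, Z)     # cross-covariance to the inducing inputs
    Kss = kern.K(Xstar)        # prior covariance at the test points
    mean = Ksu.dot(woodbury_vector)
    cov = Kss - Ksu.dot(woodbury_inv).dot(Ksu.T)
    return mean, cov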

def inference(self, kern_r, kern_c, Xr, Xc, Zr, Zc, likelihood, Y, qU_mean, qU_var_r, qU_var_c):
    """
    The SVI-VarDTC inference
    """

    N, D, Mr, Mc, Qr, Qc = Y.shape[0], Y.shape[1], Zr.shape[0], Zc.shape[0], Zr.shape[1], Zc.shape[1]

    uncertain_inputs_r = isinstance(Xr, VariationalPosterior)
    uncertain_inputs_c = isinstance(Xc, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    beta = 1./likelihood.variance

    psi0_r, psi1_r, psi2_r = self.gatherPsiStat(kern_r, Xr, Zr, uncertain_inputs_r)
    psi0_c, psi1_c, psi2_c = self.gatherPsiStat(kern_c, Xc, Zc, uncertain_inputs_c)

    #======================================================================
    # Compute Common Components
    #======================================================================

    Kuu_r = kern_r.K(Zr).copy()
    diag.add(Kuu_r, self.const_jitter)
    Lr = jitchol(Kuu_r)

    Kuu_c = kern_c.K(Zc).copy()
    diag.add(Kuu_c, self.const_jitter)
    Lc = jitchol(Kuu_c)

    mu, Sr, Sc = qU_mean, qU_var_r, qU_var_c
    LSr = jitchol(Sr)
    LSc = jitchol(Sc)

    LcInvMLrInvT = dtrtrs(Lc, dtrtrs(Lr, mu.T)[0].T)[0]
    LcInvPsi2_cLcInvT = backsub_both_sides(Lc, psi2_c, 'right')
    LrInvPsi2_rLrInvT = backsub_both_sides(Lr, psi2_r, 'right')
    LcInvLSc = dtrtrs(Lc, LSc)[0]
    LrInvLSr = dtrtrs(Lr, LSr)[0]
    LcInvScLcInvT = tdot(LcInvLSc)
    LrInvSrLrInvT = tdot(LrInvLSr)
    LcInvPsi1_cT = dtrtrs(Lc, psi1_c.T)[0]
    LrInvPsi1_rT = dtrtrs(Lr, psi1_r.T)[0]

    tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT = (LrInvPsi2_rLrInvT*LrInvSrLrInvT).sum()
    tr_LcInvPsi2_cLcInvT_LcInvScLcInvT = (LcInvPsi2_cLcInvT*LcInvScLcInvT).sum()
    tr_LrInvSrLrInvT = np.square(LrInvLSr).sum()
    tr_LcInvScLcInvT = np.square(LcInvLSc).sum()
    tr_LrInvPsi2_rLrInvT = np.trace(LrInvPsi2_rLrInvT)
    tr_LcInvPsi2_cLcInvT = np.trace(LcInvPsi2_cLcInvT)

    #======================================================================
    # Compute log-likelihood
    #======================================================================

    logL_A = - np.square(Y).sum() \
             - (LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT)*LrInvPsi2_rLrInvT).sum() \
             - tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT*tr_LcInvPsi2_cLcInvT_LcInvScLcInvT \
             + 2*(Y*LcInvPsi1_cT.T.dot(LcInvMLrInvT).dot(LrInvPsi1_rT)).sum() - psi0_c*psi0_r \
             + tr_LrInvPsi2_rLrInvT*tr_LcInvPsi2_cLcInvT

    logL = -N*D/2.*(np.log(2.*np.pi) - np.log(beta)) + beta/2.*logL_A \
           - Mc*(np.log(np.diag(Lr)).sum() - np.log(np.diag(LSr)).sum()) \
           - Mr*(np.log(np.diag(Lc)).sum() - np.log(np.diag(LSc)).sum()) \
           - np.square(LcInvMLrInvT).sum()/2. - tr_LrInvSrLrInvT*tr_LcInvScLcInvT/2. + Mr*Mc/2.

    #======================================================================
    # Compute dL_dKuu
    #======================================================================

    tmp = beta*LcInvPsi2_cLcInvT.dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT).dot(LcInvMLrInvT.T) \
          + beta*tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT*LcInvPsi2_cLcInvT.dot(LcInvScLcInvT) \
          - beta*LcInvMLrInvT.dot(LrInvPsi1_rT).dot(Y.T).dot(LcInvPsi1_cT.T) \
          - beta/2.*tr_LrInvPsi2_rLrInvT*LcInvPsi2_cLcInvT - Mr/2.*np.eye(Mc) \
          + tdot(LcInvMLrInvT)/2. + tr_LrInvSrLrInvT/2.*LcInvScLcInvT

    dL_dKuu_c = backsub_both_sides(Lc, tmp, 'left')
    dL_dKuu_c += dL_dKuu_c.T
    dL_dKuu_c *= 0.5

    tmp = beta*LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT) \
          + beta*tr_LcInvPsi2_cLcInvT_LcInvScLcInvT*LrInvPsi2_rLrInvT.dot(LrInvSrLrInvT) \
          - beta*LrInvPsi1_rT.dot(Y.T).dot(LcInvPsi1_cT.T).dot(LcInvMLrInvT) \
          - beta/2.*tr_LcInvPsi2_cLcInvT*LrInvPsi2_rLrInvT - Mc/2.*np.eye(Mr) \
          + tdot(LcInvMLrInvT.T)/2. + tr_LcInvScLcInvT/2.*LrInvSrLrInvT

    dL_dKuu_r = backsub_both_sides(Lr, tmp, 'left')
    dL_dKuu_r += dL_dKuu_r.T
    dL_dKuu_r *= 0.5

    #======================================================================
    # Compute dL_dthetaL
    #======================================================================

    dL_dthetaL = -D*N*beta/2. - logL_A*beta*beta/2.

    #======================================================================
    # Compute dL_dqU
    #======================================================================

    tmp = -beta*LcInvPsi2_cLcInvT.dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT) \
          + beta*LcInvPsi1_cT.dot(Y).dot(LrInvPsi1_rT.T) - LcInvMLrInvT
    dL_dqU_mean = dtrtrs(Lc, dtrtrs(Lr, tmp.T, trans=1)[0].T, trans=1)[0]

    LScInv = dtrtri(LSc)
    tmp = -beta/2.*tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT*LcInvPsi2_cLcInvT - tr_LrInvSrLrInvT/2.*np.eye(Mc)
    dL_dqU_var_c = backsub_both_sides(Lc, tmp, 'left') + tdot(LScInv.T)*Mr/2.

    LSrInv = dtrtri(LSr)
    tmp = -beta/2.*tr_LcInvPsi2_cLcInvT_LcInvScLcInvT*LrInvPsi2_rLrInvT - tr_LcInvScLcInvT/2.*np.eye(Mr)
    dL_dqU_var_r = backsub_both_sides(Lr, tmp, 'left') + tdot(LSrInv.T)*Mc/2.

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    post = PosteriorMultioutput(LcInvMLrInvT=LcInvMLrInvT,
                                LcInvScLcInvT=LcInvScLcInvT,
                                LrInvSrLrInvT=LrInvSrLrInvT,
                                Lr=Lr, Lc=Lc, kern_r=kern_r, Xr=Xr, Zr=Zr)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    dL_dpsi0_r = -psi0_c*beta/2.*np.ones((D,))
    dL_dpsi0_c = -psi0_r*beta/2.*np.ones((N,))

    dL_dpsi1_c = beta*dtrtrs(Lc, (Y.dot(LrInvPsi1_rT.T).dot(LcInvMLrInvT.T)).T, trans=1)[0].T
    dL_dpsi1_r = beta*dtrtrs(Lr, (Y.T.dot(LcInvPsi1_cT.T).dot(LcInvMLrInvT)).T, trans=1)[0].T

    tmp = beta/2.*(-LcInvMLrInvT.dot(LrInvPsi2_rLrInvT).dot(LcInvMLrInvT.T)
                   - tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT*LcInvScLcInvT
                   + tr_LrInvPsi2_rLrInvT*np.eye(Mc))
    dL_dpsi2_c = backsub_both_sides(Lc, tmp, 'left')

    tmp = beta/2.*(-LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT)
                   - tr_LcInvPsi2_cLcInvT_LcInvScLcInvT*LrInvSrLrInvT
                   + tr_LcInvPsi2_cLcInvT*np.eye(Mr))
    dL_dpsi2_r = backsub_both_sides(Lr, tmp, 'left')

    if not uncertain_inputs_r:
        dL_dpsi1_r += psi1_r.dot(dL_dpsi2_r + dL_dpsi2_r.T)
    if not uncertain_inputs_c:
        dL_dpsi1_c += psi1_c.dot(dL_dpsi2_c + dL_dpsi2_c.T)

    grad_dict = {
        'dL_dthetaL': dL_dthetaL,
        'dL_dqU_mean': dL_dqU_mean,
        'dL_dqU_var_c': dL_dqU_var_c,
        'dL_dqU_var_r': dL_dqU_var_r,
        'dL_dKuu_c': dL_dKuu_c,
        'dL_dKuu_r': dL_dKuu_r,
    }

    if uncertain_inputs_c:
        grad_dict['dL_dpsi0_c'] = dL_dpsi0_c
        grad_dict['dL_dpsi1_c'] = dL_dpsi1_c
        grad_dict['dL_dpsi2_c'] = dL_dpsi2_c
    else:
        grad_dict['dL_dKdiag_c'] = dL_dpsi0_c
        grad_dict['dL_dKfu_c'] = dL_dpsi1_c

    if uncertain_inputs_r:
        grad_dict['dL_dpsi0_r'] = dL_dpsi0_r
        grad_dict['dL_dpsi1_r'] = dL_dpsi1_r
        grad_dict['dL_dpsi2_r'] = dL_dpsi2_r
    else:
        grad_dict['dL_dKdiag_r'] = dL_dpsi0_r
        grad_dict['dL_dKfu_r'] = dL_dpsi1_r

    return post, logL, grad_dict
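
# ----------------------------------------------------------------------
# Note on dL_dthetaL in the method above: with beta = 1/sigma^2 the only
# beta-dependent pieces of logL are  N*D/2 * log(beta)  and  beta/2 * logL_A,
# so
#   d logL / d sigma^2 = (N*D/2)*(1/beta)*(d beta/d sigma^2)
#                        + (logL_A/2)*(d beta/d sigma^2)
#                      = -N*D*beta/2. - logL_A*beta*beta/2.,
# using d beta/d sigma^2 = -beta^2. This is exactly the expression assigned
# to dL_dthetaL, i.e. the gradient w.r.t. the Gaussian noise variance.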

def inference_root(self, kern, X, Z, likelihood, Y, Kuu_sigma=None, Y_metadata=None, Lm=None, dL_dKmm=None):
    """
    The first phase of inference:
    Compute: log-likelihood, dL_dKmm

    Cached intermediate results: Kmm, KmmInv
    """

    num_data, output_dim = Y.shape
    input_dim = Z.shape[0]
    num_data_total = allReduceArrays([np.int32(num_data)], self.mpi_comm)[0]

    uncertain_inputs = isinstance(X, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    beta = 1./np.fmax(likelihood.variance, 1e-6)

    psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)

    #======================================================================
    # Compute Common Components
    #======================================================================

    try:
        Kmm = kern.K(Z).copy()
        if Kuu_sigma is not None:
            diag.add(Kmm, Kuu_sigma)
        else:
            diag.add(Kmm, self.const_jitter)
        Lm = jitchol(Kmm)

        LmInv = dtrtri(Lm)
        LmInvPsi2LmInvT = LmInv.dot(psi2.dot(LmInv.T))

        Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
        LL = jitchol(Lambda)
        LLInv = dtrtri(LL)
        flag = np.zeros((1,), dtype=np.int32)
        self.mpi_comm.Bcast(flag, root=self.root)
    except LinAlgError as e:
        flag = np.ones((1,), dtype=np.int32)
        self.mpi_comm.Bcast(flag, root=self.root)
        raise e

    broadcastArrays([LmInv, LLInv], self.mpi_comm, self.root)
    LmLLInv = LLInv.dot(LmInv)

    logdet_L = 2.*np.sum(np.log(np.diag(LL)))
    b = psi1Y.dot(LmLLInv.T)
    bbt = np.square(b).sum()
    v = b.dot(LmLLInv)
    LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

    if psi1S is not None:
        psi1SLLinv = psi1S.dot(LmLLInv.T)
        bbt_sum = np.square(psi1SLLinv).sum()
        LLinvPsi1TYYTPsi1LLinvT_sum = tdot(psi1SLLinv.T)
        bbt_sum, LLinvPsi1TYYTPsi1LLinvT_sum = reduceArrays(
            [bbt_sum, LLinvPsi1TYYTPsi1LLinvT_sum], self.mpi_comm, self.root)
        bbt += bbt_sum
        LLinvPsi1TYYTPsi1LLinvT += LLinvPsi1TYYTPsi1LLinvT_sum
        psi1SP = psi1SLLinv.dot(LmLLInv)

    tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT + output_dim*np.eye(input_dim)).dot(LLInv)
    dL_dpsi2R = LmInv.T.dot(tmp + output_dim*np.eye(input_dim)).dot(LmInv)/2.
    broadcastArrays([dL_dpsi2R], self.mpi_comm, self.root)

    #======================================================================
    # Compute log-likelihood
    #======================================================================

    logL_R = -num_data_total*np.log(beta)
    logL = -(output_dim*(num_data_total*log_2_pi + logL_R + psi0 - np.trace(LmInvPsi2LmInvT))
             + YRY - bbt)/2. - output_dim*logdet_L/2.

    #======================================================================
    # Compute dL_dKmm
    #======================================================================

    dL_dKmm = dL_dpsi2R - output_dim*LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    wd_inv = backsub_both_sides(
        Lm,
        np.eye(input_dim) - backsub_both_sides(LL, np.identity(input_dim), transpose='left'),
        transpose='left')
    post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v.T, K=Kmm, mean=None, cov=None, K_chol=Lm)

    #======================================================================
    # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
    #======================================================================

    dL_dthetaL = (YRY*beta + beta*output_dim*psi0 - num_data_total*output_dim*beta)/2. \
                 - beta*(dL_dpsi2R*psi2).sum() - beta*np.trace(LLinvPsi1TYYTPsi1LLinvT)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    dL_dpsi0 = -output_dim*(beta*np.ones((num_data,)))/2.

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        dL_dpsi1 = beta*(np.dot(m, v) + Shalf[:, None]*psi1SP)
    else:
        dL_dpsi1 = beta*np.dot(Y, v)

    if uncertain_inputs:
        dL_dpsi2 = beta*dL_dpsi2R
    else:
        dL_dpsi1 += np.dot(psi1, dL_dpsi2R)*2.
        dL_dpsi2 = None

    if uncertain_inputs:
        grad_dict = {
            'dL_dKmm': dL_dKmm,
            'dL_dpsi0': dL_dpsi0,
            'dL_dpsi1': dL_dpsi1,
            'dL_dpsi2': dL_dpsi2,
            'dL_dthetaL': dL_dthetaL
        }
    else:
        grad_dict = {
            'dL_dKmm': dL_dKmm,
            'dL_dKdiag': dL_dpsi0,
            'dL_dKnm': dL_dpsi1,
            'dL_dthetaL': dL_dthetaL
        }

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        psi1LmiLLi = psi1.dot(LmLLInv.T)
        LLiLmipsi1Y = b.T
        grad_dict['dL_dYmean'] = -m*beta + psi1LmiLLi.dot(LLiLmipsi1Y)
        grad_dict['dL_dYvar'] = beta/-2. + np.square(psi1LmiLLi).sum(axis=1)/2

    return post, logL, grad_dict
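
# ----------------------------------------------------------------------
# Assumed semantics of the GPy.util.linalg helpers used throughout this
# listing (numpy/scipy equivalents shown for orientation only; the real
# implementations wrap LAPACK, return (result, info) tuples where noted,
# and may fill only one triangle of symmetric results):
#   jitchol(A)               ~ np.linalg.cholesky(A), retrying with growing
#                              diagonal jitter if A is not quite positive definite
#   dtrtrs(L, B)[0]          ~ scipy.linalg.solve_triangular(L, B, lower=True)
#   dtrtrs(L, B, trans=1)[0] ~ same solve but with L.T in place of L
#   dtrtri(L)                ~ inverse of the (lower) triangular factor L
#   dpotrs(L, B)[0]          ~ solve (L L^T) X = B given the Cholesky factor L
#   dpotri(L)[0]             ~ (L L^T)^{-1}
#   tdot(A)                  ~ A.dot(A.T)
#   mdot(A, B, C)            ~ A.dot(B).dot(C)
#   diag.add(A, c)           ~ in-place A[np.diag_indices_from(A)] += c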