def latent_function_covKuu(Z, B, kernel_list, kernel_list_Gdj, kff_aux):
    """
    Builds the covariance Kuu = cov[u_d(z), u_d(z)] of a Convolved Multi-output GP
    :param Z: Inducing points
    :param B: Coregionalization matrices
    :param kernel_list: Kernels of u_q functions
    :param kernel_list_Gdj: Kernel smoothing functions G(x)
    :param kff_aux: kernel that solves the convolution integral between G(x) and kern_uq
    :return: Kuu, its Cholesky factor Luu and its inverse Kuui
    """
    J = len(kernel_list_Gdj)
    M, Dz = Z.shape
    Xdim = int(Dz / J)
    # Kuu = np.zeros([Q*M,Q*M])
    Kuu = np.zeros((J, M, M))
    Luu = np.empty((J, M, M))
    Kuui = np.empty((J, M, M))
    for j in range(J):
        for q, B_q in enumerate(B):
            update_conv_Kff(kernel_list[q], kernel_list_Gdj[j], kff_aux)
            Kuu[j, :, :] += B_q.B[j, j] * kff_aux.K(Z[:, j * Xdim:j * Xdim + Xdim],
                                                    Z[:, j * Xdim:j * Xdim + Xdim])
        Luu[j, :, :] = linalg.jitchol(Kuu[j, :, :], maxtries=10)
        Kuui[j, :, :], _ = linalg.dpotri(np.asfortranarray(Luu[j, :, :]))
    return Kuu, Luu, Kuui

def build_covariance(t, K, hyperparams):
    ls = hyperparams[0]
    a0 = hyperparams[1]
    a = hyperparams[2]
    b = hyperparams[3]
    C, _ = a.shape  # number of Fourier coefficients
    T, _ = t.shape
    S = np.empty((T, T, K))
    L = np.empty((T, T, K))
    Si = np.empty((T, T, K))
    Diag, _ = build_diagonal(t, hyperparams)
    for k in range(K):
        # TODO: the periodic term is still missing
        hyperparam_k_list = [ls[0, k], a0[0, k], a[:, k], b[:, k]]
        per_term = fourier_series(t, T, C, hyperparam_k_list)
        s = per_term**2
        per_S = s * s.T
        E = periodic_exponential(t, T, hyperparam_k_list)
        S[:, :, k] = per_S * E
        S[:, :, k] += Diag
        L[:, :, k] = linalg.jitchol(S[:, :, k])
        Si[:, :, k], _ = linalg.dpotri(np.asfortranarray(L[:, :, k]))
        # remove this:
        #S[:,:,k] = np.eye(T, T)
        #Si[:,:,k] = np.eye(T, T)
    return S, L, Si

def inv_c(M):
    A = np.ascontiguousarray(M)
    L_M, info = lapack.dpotrf(A, lower=1)
    #L_M = np.linalg.cholesky(M)
    iM, _ = dpotri(L_M)
    return iM

def inv_logDet(M):
    A = np.ascontiguousarray(M)
    L_M, info = lapack.dpotrf(A, lower=1)
    iM, _ = dpotri(L_M)
    logDetM = 2 * sum(np.log(np.diag(L_M)))
    return iM, logDetM

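# Hedged usage sketch (added illustration, not part of the original module): the
# dpotrf/dpotri pattern that inv_c and inv_logDet wrap, written out with SciPy's
# LAPACK bindings so it is self-contained. The SPD test matrix is hypothetical.
import numpy as np
from scipy.linalg import lapack


def _inv_logdet_demo(A_spd):
    # lower Cholesky factor of the SPD matrix, then inversion via dpotri
    L, _ = lapack.dpotrf(np.ascontiguousarray(A_spd), lower=1)
    iA, _ = lapack.dpotri(L, lower=1)        # only the lower triangle is filled
    iA = np.tril(iA) + np.tril(iA, -1).T     # symmetrise the result
    logdet = 2.0 * np.sum(np.log(np.diag(L)))
    return iA, logdet


# Example (hypothetical data):
#   X = np.random.randn(5, 3)
#   A_spd = X.dot(X.T) + 5.0 * np.eye(5)
#   iA, logdet = _inv_logdet_demo(A_spd)
#   assert np.allclose(iA, np.linalg.inv(A_spd))
#   assert np.isclose(logdet, np.linalg.slogdet(A_spd)[1])
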
def inv_chol(L):
    """
    Given that ``L`` is the Cholesky decomposition of A, this method returns A^-1

    Note
    ----
    This method is adopted from the GPy package
    """
    Ai, _ = dpotri(np.asfortranarray(L), lower=1)
    return Ai

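# Hedged usage sketch for inv_chol (added illustration): it assumes `dpotri` above
# comes from GPy.util.linalg, as the docstring suggests. The 2x2 matrix is a
# hypothetical test input; depending on the wrapper, only one triangle of the
# result may be filled, so it is symmetrised before comparing.
def _inv_chol_demo():
    import numpy as np
    A = np.array([[4.0, 1.0],
                  [1.0, 3.0]])
    L = np.linalg.cholesky(A)               # lower Cholesky factor of A
    Ai = inv_chol(L)                        # A^{-1} via LAPACK dpotri
    Ai = np.tril(Ai) + np.tril(Ai, -1).T    # ensure a full symmetric matrix
    assert np.allclose(Ai, np.linalg.inv(A))
    return Ai
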
def calculate_mu_var(self, X, Y, Z, q_u_mean, q_u_chol, kern, mean_function, num_inducing, num_data, num_outputs):
    """
    Calculate posterior mean and variance for the latent function values for use in the
    expectation over the likelihood
    """
    #expand cholesky representation
    L = choleskies.flat_to_triang(q_u_chol)
    #S = linalg.ijk_ljk_to_ilk(L, L) #L.dot(L.T)
    S = np.empty((num_outputs, num_inducing, num_inducing))
    [np.dot(L[i, :, :], L[i, :, :].T, S[i, :, :]) for i in range(num_outputs)]
    #logdetS = np.array([2.*np.sum(np.log(np.abs(np.diag(L[:,:,i])))) for i in range(L.shape[-1])])
    logdetS = np.array([2. * np.sum(np.log(np.abs(np.diag(L[i, :, :])))) for i in range(L.shape[0])])

    #compute mean function stuff
    if mean_function is not None:
        prior_mean_u = mean_function.f(Z)
        prior_mean_f = mean_function.f(X)
    else:
        prior_mean_u = np.zeros((num_inducing, num_outputs))
        prior_mean_f = np.zeros((num_data, num_outputs))

    #compute kernel related stuff
    Kmm = kern.K(Z)
    #Knm = kern.K(X, Z)
    Kmn = kern.K(Z, X)
    Knn_diag = kern.Kdiag(X)
    #Kmmi, Lm, Lmi, logdetKmm = linalg.pdinv(Kmm)
    Lm = linalg.jitchol(Kmm)
    logdetKmm = 2. * np.sum(np.log(np.diag(Lm)))
    Kmmi, _ = linalg.dpotri(Lm)

    #compute the marginal means and variances of q(f)
    #A = np.dot(Knm, Kmmi)
    A, _ = linalg.dpotrs(Lm, Kmn)
    #mu = prior_mean_f + np.dot(A, q_u_mean - prior_mean_u)
    mu = prior_mean_f + np.dot(A.T, q_u_mean - prior_mean_u)
    #v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * linalg.ij_jlk_to_ilk(A, S), 1)
    v = np.empty((num_data, num_outputs))
    for i in range(num_outputs):
        tmp = dtrmm(1.0, L[i].T, A, lower=0, trans_a=0)
        v[:, i] = np.sum(np.square(tmp), 0)
    v += (Knn_diag - np.sum(A * Kmn, 0))[:, None]

    #compute the KL term
    Kmmim = np.dot(Kmmi, q_u_mean)
    #KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.einsum('ij,ijk->k', Kmmi, S) + 0.5*np.sum(q_u_mean*Kmmim,0)
    KLs = -0.5 * logdetS - 0.5 * num_inducing + 0.5 * logdetKmm \
          + 0.5 * np.sum(Kmmi[None, :, :] * S, 1).sum(1) + 0.5 * np.sum(q_u_mean * Kmmim, 0)
    KL = KLs.sum()

    latent_detail = LatentFunctionDetails(q_u_mean=q_u_mean, q_u_chol=q_u_chol, mean_function=mean_function,
                                          mu=mu, v=v, prior_mean_u=prior_mean_u,
                                          L=L, A=A, S=S, Kmm=Kmm, Kmmi=Kmmi, Kmmim=Kmmim, KL=KL)
    return latent_detail

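# Reading aid (added commentary, not part of the original code): calculate_mu_var
# above evaluates the standard sparse variational GP identities, with q(u) = N(m, S),
# S = L L^T from the flat Cholesky parameterisation, and A = Kmm^{-1} Kmn:
#   mu   = m_f(X) + A^T (m - m_u(Z))
#   v_i  = Knn_diag_i - [Kmn^T Kmm^{-1} Kmn]_ii + [A^T S A]_ii    (per output column)
#   KL[q(u) || p(u)] = 0.5 * ( tr(Kmm^{-1} S) + m^T Kmm^{-1} m - M + log|Kmm| - log|S| )
# summed over the independent output columns.
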
def latent_funs_cov(Z, kernel_list):
    """
    Description: Builds the full-covariance cov[u(z),u(z)] of a Multi-output GP for a Sparse approximation
    :param Z: Inducing Points
    :param kernel_list: Kernels of u_q functions priors
    :return: Kuu
    """
    Q = len(kernel_list)
    M, Dz = Z.shape
    Xdim = int(Dz / Q)
    Kuu = np.empty((Q, M, M))
    Luu = np.empty((Q, M, M))
    Kuui = np.empty((Q, M, M))
    for q, kern in enumerate(kernel_list):
        Kuu[q, :, :] = kern.K(Z[:, q * Xdim:q * Xdim + Xdim], Z[:, q * Xdim:q * Xdim + Xdim])
        Luu[q, :, :] = linalg.jitchol(Kuu[q, :, :])
        Kuui[q, :, :], _ = linalg.dpotri(np.asfortranarray(Luu[q, :, :]))
    return Kuu, Luu, Kuui

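# Hedged usage sketch for latent_funs_cov (added illustration, not part of the
# original module): it assumes GPy is installed and that `linalg` above refers to
# GPy.util.linalg. Q, M, Xdim and the RBF kernels are hypothetical choices, used
# only to show the expected block layout of Z and the (Q, M, M) output shapes.
def _latent_funs_cov_demo():
    import numpy as np
    import GPy
    Q, M, Xdim = 3, 10, 2
    Z = np.random.randn(M, Q * Xdim)                      # one block of Xdim columns per u_q
    kernel_list = [GPy.kern.RBF(Xdim) for _ in range(Q)]
    Kuu, Luu, Kuui = latent_funs_cov(Z, kernel_list)
    return Kuu.shape, Luu.shape, Kuui.shape               # (Q, M, M) each
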
def woodbury_inv(self):
    """
    The inverse of the woodbury matrix, in the gaussian likelihood case it is defined as
    $$
    (K_{xx} + \Sigma_{xx})^{-1}
    \Sigma_{xx} := \texttt{Likelihood.variance / Approximate likelihood covariance}
    $$
    """
    if self._woodbury_inv is None:
        if self._woodbury_chol is not None:
            self._woodbury_inv, _ = dpotri(self._woodbury_chol, lower=1)
            symmetrify(self._woodbury_inv)
        elif self._covariance is not None:
            B = np.atleast_3d(self._K) - np.atleast_3d(self._covariance)
            self._woodbury_inv = np.empty_like(B)
            for i in range(B.shape[-1]):
                tmp, _ = dpotrs(self.K_chol, B[:, :, i])
                self._woodbury_inv[:, :, i], _ = dpotrs(self.K_chol, tmp.T)
    return self._woodbury_inv

def comp_KL_qU(self, qU_mean, qU_var):
    M, D = qU_mean.shape[0], qU_mean.shape[1]
    qU_L = self.mid['qU_L']
    L = self.mid['L']
    Linvmu = self.mid['Linvmu']
    LinvLu = self.mid['LinvLu']
    KuuInv = dpotri(L, lower=1)[0]

    Lu = qU_L
    LuInv = dtrtri(Lu)

    KL = D * M / -2. - np.log(np.diag(Lu)).sum() * D + np.log(np.diag(L)).sum() * D \
         + np.square(LinvLu).sum() / 2. * D + np.square(Linvmu).sum() / 2.

    dKL_dqU_mean = dtrtrs(L, Linvmu, trans=True)[0]
    dKL_dqU_var = (tdot(LuInv.T) / -2. + KuuInv / 2.) * D
    dKL_dKuu = KuuInv * D / 2. - KuuInv.dot(tdot(qU_mean) + qU_var * D).dot(KuuInv) / 2.
    return float(KL), dKL_dqU_mean, dKL_dqU_var, dKL_dKuu

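# Reading aid (added commentary, not new functionality): with q(u_:d) = N(mu_:d, S)
# sharing S = Lu Lu^T across the D output columns and p(u_:d) = N(0, Kuu), Kuu = L L^T,
# comp_KL_qU above evaluates the closed form
#   KL = 0.5 * ( D * tr(Kuu^{-1} S) + sum_d mu_:d^T Kuu^{-1} mu_:d
#                - D * M + D * log|Kuu| - D * log|S| )
# using tr(Kuu^{-1} S) = ||L^{-1} Lu||_F^2 and mu_:d^T Kuu^{-1} mu_:d = ||L^{-1} mu_:d||^2.
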
def latent_funs_cov(Z, kernel_list):
    """
    Builds the full-covariance cov[u(z),u(z)] of a Multi-output GP for a Sparse approximation
    :param Z: Inducing Points
    :param kernel_list: Kernels of u_q functions priors
    :return: Kuu
    """
    Q = len(kernel_list)
    M, Dz = Z.shape
    Xdim = int(Dz / Q)
    #Kuu = np.zeros([Q*M,Q*M])
    Kuu = np.empty((Q, M, M))
    Luu = np.empty((Q, M, M))
    Kuui = np.empty((Q, M, M))
    for q, kern in enumerate(kernel_list):
        Kuu[q, :, :] = kern.K(Z[:, q * Xdim:q * Xdim + Xdim], Z[:, q * Xdim:q * Xdim + Xdim])
        Kuu[q, :, :] = Kuu[q, :, :]  #+ 1.0e-6*np.eye(*Kuu[q, :, :].shape)  # This line included by Juan for numerical stability
        Luu[q, :, :] = linalg.jitchol(Kuu[q, :, :], maxtries=10)
        Kuui[q, :, :], _ = linalg.dpotri(np.asfortranarray(Luu[q, :, :]))
    return Kuu, Luu, Kuui

def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, K=None, variance=None, Z_tilde=None):
    if variance is None:
        variance = likelihood.gaussian_variance(Y_metadata)

    posterior = super(ExactGaussianInferenceIncremental, self).inference(kern, X, likelihood, Y, mean_function,
                                                                         Y_metadata, K, variance, Z_tilde)
    self._old_LW = posterior[0].woodbury_chol
    self._K = kern.K(X).copy()
    self._old_Wi, _ = dpotri(self._old_LW, lower=1)
    # diag.add(self._K, variance + 1e-8)
    return posterior

def inference(self, kern, X, Z, likelihood, Y, indexD, output_dim, Y_metadata=None, Lm=None, dL_dKmm=None, Kuu_sigma=None):
    """
    The first phase of inference:
    Compute: log-likelihood, dL_dKmm

    Cached intermediate results: Kmm, KmmInv,
    """
    input_dim = Z.shape[0]

    uncertain_inputs = isinstance(X, VariationalPosterior)

    beta = 1. / likelihood.variance
    if len(beta) == 1:
        beta = np.zeros(output_dim) + beta

    beta_exp = np.zeros(indexD.shape[0])
    for d in range(output_dim):
        beta_exp[indexD == d] = beta[d]

    psi0, psi1, psi2 = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)

    psi2_sum = (beta_exp[:, None, None] * psi2).sum(0) / output_dim

    #======================================================================
    # Compute Common Components
    #======================================================================

    Kmm = kern.K(Z).copy()
    if Kuu_sigma is not None:
        diag.add(Kmm, Kuu_sigma)
    else:
        diag.add(Kmm, self.const_jitter)
    Lm = jitchol(Kmm)

    logL = 0.
    dL_dthetaL = np.zeros(output_dim)
    dL_dKmm = np.zeros_like(Kmm)
    dL_dpsi0 = np.zeros_like(psi0)
    dL_dpsi1 = np.zeros_like(psi1)
    dL_dpsi2 = np.zeros_like(psi2)
    wv = np.empty((Kmm.shape[0], output_dim))

    for d in range(output_dim):
        idx_d = indexD == d
        Y_d = Y[idx_d]
        N_d = Y_d.shape[0]
        beta_d = beta[d]

        psi2_d = psi2[idx_d].sum(0) * beta_d
        psi1Y = Y_d.T.dot(psi1[idx_d]) * beta_d
        psi0_d = psi0[idx_d].sum() * beta_d
        YRY_d = np.square(Y_d).sum() * beta_d

        LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2_d, 'right')

        Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
        LL = jitchol(Lambda)
        LmLL = Lm.dot(LL)

        b = dtrtrs(LmLL, psi1Y.T)[0].T
        bbt = np.square(b).sum()
        v = dtrtrs(LmLL, b.T, trans=1)[0].T
        LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

        tmp = -backsub_both_sides(LL, LLinvPsi1TYYTPsi1LLinvT)
        dL_dpsi2R = backsub_both_sides(Lm, tmp + np.eye(input_dim)) / 2

        logL_R = -N_d * np.log(beta_d)
        logL += -((N_d * log_2_pi + logL_R + psi0_d - np.trace(LmInvPsi2LmInvT)) + YRY_d - bbt) / 2.

        dL_dKmm += dL_dpsi2R - backsub_both_sides(Lm, LmInvPsi2LmInvT) / 2

        dL_dthetaL[d:d + 1] = (YRY_d * beta_d + beta_d * psi0_d - N_d * beta_d) / 2. \
                              - beta_d * (dL_dpsi2R * psi2_d).sum() \
                              - beta_d * np.trace(LLinvPsi1TYYTPsi1LLinvT)

        dL_dpsi0[idx_d] = -beta_d / 2.
        dL_dpsi1[idx_d] = beta_d * np.dot(Y_d, v)
        dL_dpsi2[idx_d] = beta_d * dL_dpsi2R
        wv[:, d] = v

    LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2_sum, 'right')

    Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
    LL = jitchol(Lambda)
    LmLL = Lm.dot(LL)
    logdet_L = 2. * np.sum(np.log(np.diag(LL)))
    dL_dpsi2R_common = dpotri(LmLL)[0] / -2.
    dL_dpsi2 += dL_dpsi2R_common[None, :, :] * beta_exp[:, None, None]

    for d in range(output_dim):
        dL_dthetaL[d] += (dL_dpsi2R_common * psi2[indexD == d].sum(0)).sum() * -beta[d] * beta[d]

    dL_dKmm += dL_dpsi2R_common * output_dim

    logL += -output_dim * logdet_L / 2.

    #======================================================================
    # Compute dL_dKmm
    #======================================================================

    # dL_dKmm = dL_dpsi2R - output_dim* backsub_both_sides(Lm, LmInvPsi2LmInvT)/2  #LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.
    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    LLInvLmT = dtrtrs(LL, Lm.T)[0]
    cov = tdot(LLInvLmT.T)

    wd_inv = backsub_both_sides(Lm,
                                np.eye(input_dim) - backsub_both_sides(LL, np.identity(input_dim), transpose='left'),
                                transpose='left')
    post = Posterior(woodbury_inv=wd_inv, woodbury_vector=wv, K=Kmm, mean=None, cov=cov, K_chol=Lm)

    #======================================================================
    # Compute dL_dthetaL for uncertain input and non-heter noise
    #======================================================================

    # for d in range(output_dim):
    #     dL_dthetaL[d:d+1] += - beta[d]*beta[d]*(dL_dpsi2R[None,:,:] * psi2[indexD==d]/output_dim).sum()
    # dL_dthetaL += - (dL_dpsi2R[None,:,:] * psi2_sum*D beta*(dL_dpsi2R*psi2).sum()

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    if not uncertain_inputs:
        dL_dpsi1 += (psi1[:, None, :] * dL_dpsi2).sum(2) * 2.

    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dpsi0': dL_dpsi0,
                     'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2,
                     'dL_dthetaL': dL_dthetaL}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL}

    return post, logL, grad_dict

def calculate_gradients(self, q_U, p_U_new, p_U_old, p_U_var, q_F, VE_dm, VE_dv, Ntask, M, Q, D, f_index, d_index, q):
    """
    Calculates gradients of the Log-marginal distribution p(Y) wrt variational parameters mu_q, S_q
    """
    # Algebra for q(u):
    m_u = q_U.mu_u.copy()
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    S_u = np.empty((Q, M, M))
    [np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]
    S_qi, _ = linalg.dpotri(np.asfortranarray(L_u[q, :, :]))

    if np.any(np.isinf(S_qi)):
        raise ValueError("Sqi: Cholesky representation unstable")

    # Algebra for p(u)
    Kuu_new = p_U_new.Kuu.copy()
    Luu_new = p_U_new.Luu.copy()
    Kuui_new = p_U_new.Kuui.copy()

    Kuu_old = p_U_old.Kuu.copy()
    Luu_old = p_U_old.Luu.copy()
    Kuui_old = p_U_old.Kuui.copy()

    Mu_var = p_U_var.Mu.copy()
    Kuu_var = p_U_var.Kuu.copy()
    Luu_var = p_U_var.Luu.copy()
    Kuui_var = p_U_var.Kuui.copy()

    # KL Terms
    dKLnew_dmu_q = np.dot(Kuui_new[q, :, :], m_u[:, q, None])
    dKLnew_dS_q = 0.5 * (Kuui_new[q, :, :] - S_qi)
    dKLold_dmu_q = np.dot(Kuui_old[q, :, :], m_u[:, q, None])
    dKLold_dS_q = 0.5 * (Kuui_old[q, :, :] - S_qi)
    dKLvar_dmu_q = np.dot(Kuui_var[q, :, :], (m_u[:, q, None] - Mu_var[q, :, :]))  # important!! (Eq. 69 MCB)
    dKLvar_dS_q = 0.5 * (Kuui_var[q, :, :] - S_qi)

    dKLnew_dKqq = 0.5 * Kuui_new[q, :, :] - 0.5 * Kuui_new[q, :, :].dot(S_u[q, :, :]).dot(Kuui_new[q, :, :]) \
                  - 0.5 * np.dot(Kuui_new[q, :, :], np.dot(m_u[:, q, None], m_u[:, q, None].T)).dot(Kuui_new[q, :, :].T)

    dKLold_dKqq = 0.5 * Kuui_old[q, :, :] - 0.5 * Kuui_old[q, :, :].dot(S_u[q, :, :]).dot(Kuui_old[q, :, :]) \
                  - 0.5 * np.dot(Kuui_old[q, :, :], np.dot(m_u[:, q, None], m_u[:, q, None].T)).dot(Kuui_old[q, :, :].T)

    #dKLvar_dKqq = 0.5 * Kuui_var[q,:,:] - 0.5 * Kuui_var[q,:,:].dot(S_u[q, :, :]).dot(Kuui_var[q,:,:]) \
    #              - 0.5 * np.dot(Kuui_var[q,:,:],np.dot(m_u[:, q, None],m_u[:, q, None].T)).dot(Kuui_var[q,:,:].T) \
    #              + 0.5 * np.dot(Kuui_var[q,:,:], np.dot(m_u[:,q,None], Mu_var[q,:,:].T)).dot(Kuui_var[q,:,:].T) \
    #              + 0.5 * np.dot(Kuui_var[q,:,:], np.dot(Mu_var[q,:,:], m_u[:,q,None].T)).dot(Kuui_var[q,:,:].T) \
    #              - 0.5 * np.dot(Kuui_var[q,:,:],np.dot(Mu_var[q,:,:], Mu_var[q,:,:].T)).dot(Kuui_var[q,:,:].T)

    #KLvar += 0.5 * np.sum(Kuui_var[q, :, :] * S_u[q, :, :]) \
    #         + 0.5 * np.dot((Mu_var[q, :, :] - m_u[:, q, None]).T, np.dot(Kuui_var[q, :, :], (Mu_var[q, :, :] - m_u[:, q, None]))) \
    #         - 0.5 * M \
    #         + 0.5 * 2. * np.sum(np.log(np.abs(np.diag(Luu_var[q, :, :])))) \
    #         - 0.5 * 2. * np.sum(np.log(np.abs(np.diag(L_u[q, :, :]))))
    #

    # VE Terms
    dVE_dmu_q = np.zeros((M, 1))
    dVE_dS_q = np.zeros((M, M))
    dVE_dKqq = np.zeros((M, M))
    dVE_dKqd = []
    dVE_dKdiag = []

    for d, q_fd in enumerate(q_F):
        Nt = Ntask[f_index[d]]
        dVE_dmu_q += np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:, d_index[d]])[:, None]
        Adv = q_fd.Afdu[q, :, :].T * VE_dv[f_index[d]][:, d_index[d], None].T
        Adv = np.ascontiguousarray(Adv)
        AdvA = np.dot(Adv.reshape(-1, Nt), q_fd.Afdu[q, :, :]).reshape(M, M)
        dVE_dS_q += AdvA

        # Derivatives dKuquq
        tmp_dv = np.dot(AdvA, S_u[q, :, :]).dot(Kuui_new[q, :, :])
        dVE_dKqq += AdvA - tmp_dv - tmp_dv.T
        Adm = np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:, d_index[d], None])
        dVE_dKqq += -np.dot(Adm, np.dot(Kuui_new[q, :, :], m_u[:, q, None]).T)

        # Derivatives dKuqfd
        tmp = np.dot(S_u[q, :, :], Kuui_new[q, :, :])
        tmp = 2. * (tmp - np.eye(M))
        dve_kqd = np.dot(np.dot(Kuui_new[q, :, :], m_u[:, q, None]), VE_dm[f_index[d]][:, d_index[d], None].T)
        dve_kqd += np.dot(tmp.T, Adv)
        dVE_dKqd.append(dve_kqd)

        # Derivatives dKdiag
        dVE_dKdiag.append(VE_dv[f_index[d]][:, d_index[d]])

    dVE_dKqq = 0.5 * (dVE_dKqq + dVE_dKqq.T)

    # Derivatives of variational parameters
    dL_dmu_q = dVE_dmu_q - dKLnew_dmu_q + dKLold_dmu_q - dKLvar_dmu_q
    dL_dS_q = dVE_dS_q - dKLnew_dS_q + dKLold_dS_q - dKLvar_dS_q

    # Derivatives of prior hyperparameters
    # if using Zgrad, dL_dKqq = dVE_dKqq - dKLnew_dKqq + dKLold_dKqq - dKLvar_dKqq
    # otherwise for hyperparameters: dL_dKqq = dVE_dKqq - dKLnew_dKqq
    dL_dKqq = dVE_dKqq - dKLnew_dKqq  #+ dKLold_dKqq - dKLvar_dKqq  # dKLold_dKqq only for Zgrad; dKLvar_dKqq to be done (for Zgrad)
    dL_dKdq = dVE_dKqd
    dL_dKdiag = dVE_dKdiag

    # Pass S_q gradients to its lower-triangular representation L_q
    chol_u = q_U.chols_u.copy()
    L_q = choleskies.flat_to_triang(chol_u[:, q:q + 1])
    dL_dL_q = 2. * np.array([np.dot(a, b) for a, b in zip(dL_dS_q[None, :, :], L_q)])
    dL_dL_q = choleskies.triang_to_flat(dL_dL_q)

    # Posterior
    posterior_q = Posterior(mean=m_u[:, q, None], cov=S_u[q, :, :], K=Kuu_new[q, :, :],
                            prior_mean=np.zeros(m_u[:, q, None].shape))

    return dL_dmu_q, dL_dL_q, dL_dS_q, posterior_q, dL_dKqq, dL_dKdq, dL_dKdiag

def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None, fixed_covs_kerns=None, **kw):

    _, output_dim = Y.shape
    uncertain_inputs = isinstance(X, VariationalPosterior)

    #see whether we've got a different noise variance for each datum
    beta = 1. / np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)
    # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
    #self.YYTfactor = self.get_YYTfactor(Y)
    #VVT_factor = self.get_VVTfactor(self.YYTfactor, beta)
    het_noise = beta.size > 1

    if het_noise:
        raise NotImplementedError("Heteroscedastic noise not implemented, should be possible though, feel free to try implementing it :)")

    if beta.ndim == 1:
        beta = beta[:, None]

    # do the inference:
    num_inducing = Z.shape[0]
    num_data = Y.shape[0]
    # kernel computations, using BGPLVM notation

    Kmm = kern.K(Z).copy()
    diag.add(Kmm, self.const_jitter)
    if Lm is None:
        Lm = jitchol(Kmm)

    # The rather complex computations of A, and the psi stats
    if uncertain_inputs:
        psi0 = kern.psi0(Z, X)
        psi1 = kern.psi1(Z, X)
        if het_noise:
            psi2_beta = np.sum([kern.psi2(Z, X[i:i + 1, :]) * beta_i for i, beta_i in enumerate(beta)], 0)
        else:
            psi2_beta = kern.psi2(Z, X) * beta
        LmInv = dtrtri(Lm)
        A = LmInv.dot(psi2_beta.dot(LmInv.T))
    else:
        psi0 = kern.Kdiag(X)
        psi1 = kern.K(X, Z)
        if het_noise:
            tmp = psi1 * (np.sqrt(beta))
        else:
            tmp = psi1 * (np.sqrt(beta))
        tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
        A = tdot(tmp)

    # factor B
    B = np.eye(num_inducing) + A
    LB = jitchol(B)

    # back substitute C into psi1Vf
    #tmp, _ = dtrtrs(Lm, psi1.T.dot(VVT_factor), lower=1, trans=0)
    #_LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
    #tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
    #Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

    # data fit and derivative of L w.r.t. Kmm
    #delit = tdot(_LBi_Lmi_psi1Vf)

    # Expose YYT to get additional covariates in (YYT + Kgg):
    tmp, _ = dtrtrs(Lm, psi1.T, lower=1, trans=0)
    _LBi_Lmi_psi1, _ = dtrtrs(LB, tmp, lower=1, trans=0)
    tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1, lower=1, trans=1)
    Cpsi1, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

    # TODO: cache this:
    # Compute fixed covariates covariance:
    if fixed_covs_kerns is not None:
        K_fixed = 0
        for name, [cov, k] in fixed_covs_kerns.items():
            K_fixed += k.K(cov)
        #trYYT = self.get_trYYT(Y)
        YYT_covs = (tdot(Y) + K_fixed)
        data_term = beta**2 * YYT_covs
        trYYT_covs = np.trace(YYT_covs)
    else:
        data_term = beta**2 * tdot(Y)
        trYYT_covs = self.get_trYYT(Y)

    #trYYT = self.get_trYYT(Y)
    delit = mdot(_LBi_Lmi_psi1, data_term, _LBi_Lmi_psi1.T)
    data_fit = np.trace(delit)

    DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)

    if dL_dKmm is None:
        delit = -0.5 * DBi_plus_BiPBi
        delit += -0.5 * B * output_dim
        delit += output_dim * np.eye(num_inducing)
        # Compute dL_dKmm
        dL_dKmm = backsub_both_sides(Lm, delit)

    # derivatives of L w.r.t. psi
    dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, data_term,
                                                    Cpsi1, DBi_plus_BiPBi, psi1, het_noise, uncertain_inputs)

    # log marginal likelihood
    log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise,
                                                    psi0, A, LB, trYYT_covs, data_fit, Y)

    if self.save_per_dim:
        self.saved_vals = [psi0, A, LB, _LBi_Lmi_psi1, beta]

    # No heteroscedastics, so no _LBi_Lmi_psi1Vf:
    # For the interested reader, try implementing the heteroscedastic version, it should be possible
    _LBi_Lmi_psi1Vf = None  # Is just here for documentation, so you can see what it was.
    #noise derivatives
    dL_dR = _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi,
                           Lm, A, psi0, psi1, beta, data_fit, num_data, output_dim, trYYT_covs, Y, None)

    dL_dthetaL = likelihood.exact_inference_gradients(dL_dR, Y_metadata)

    #put the gradients in the right places
    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dpsi0': dL_dpsi0,
                     'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2,
                     'dL_dthetaL': dL_dthetaL}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL}

    if fixed_covs_kerns is not None:
        # For now, we do not take the gradients, we can compute them,
        # but the maximum likelihood solution is to switch off the additional covariates....
        dL_dcovs = beta * np.eye(K_fixed.shape[0]) - beta**2 * tdot(_LBi_Lmi_psi1.T)
        grad_dict['dL_dcovs'] = -.5 * dL_dcovs

    #get sufficient things for posterior prediction
    #TODO: do we really want to do this in the loop?
    if 1:
        woodbury_vector = (beta * Cpsi1).dot(Y)
    else:
        import ipdb; ipdb.set_trace()
        psi1V = np.dot(Y.T * beta, psi1).T
        tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
        tmp, _ = dpotrs(LB, tmp, lower=1)
        woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
    Bi, _ = dpotri(LB, lower=1)
    symmetrify(Bi)
    Bi = -dpotri(LB, lower=1)[0]
    diag.add(Bi, 1)

    woodbury_inv = backsub_both_sides(Lm, Bi)

    #construct a posterior object
    post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)

    return post, log_marginal, grad_dict

def inference(self, n0, C0, P0, log_marginal_likelihood0, log_Det_C0,
              dn_dR, dC_dR, dψ_dR, dn_dσ02, dC_dσ02, dψ_dσ02,
              dn_dl, dC_dl, dψ_dl, dn_dσn2, dC_dσn2, dψ_dσn2, X, Y):

    α = self.α
    α_const = (1 - α) / α

    num_data, _ = Y.shape
    num_inducing = n0.shape[0]  # it only works with num_outputs = 1
    y = Y[:, 0]  # it only works with num_outputs = 1

    # update kernel with new hyperparams
    self.kern.lengthscale = self.params['ls'].copy()
    self.kern.variance = self.params['σ0']**2
    σ_n2 = self.params['σn']**2
    Z = self.params['R']

    # compute kernel quantities
    Krr = self.kern.K(Z)  # kernel matrix of inducing inputs
    diag.add(Krr, self.const_jitter)  # add some jitter for stability reasons
    Kxr = self.kern.K(X, Z)  # kernel matrix between mini-batch and inducing inputs
    kxx = self.kern.Kdiag(X)  #+const_jitter  # diagonal of kernel matrix of mini-batch
    L_K = jitchol(Krr)  # lower Cholesky factor of kernel matrix
    iKrr, _ = dpotri(L_K)  # inverse of kernel matrix of inducing inputs

    self.Krr = Krr
    self.iKrr = iKrr

    # compute state space matrices (and temporary matrices)
    H = np.dot(Kxr, iKrr)  # observation matrix
    Ht = H.T  # transpose of observation matrix
    d = kxx - np.sum(H * Kxr, 1)  # diagonal of correction matrix
    v = α * d + σ_n2  # diagonal of actual noise matrix
    a = α_const * (np.sum(np.log(v)) - num_data * np.log(σ_n2))  # PEP correction term in marginal likelihood
    A_ = Ht / v
    α_ = np.dot(P0, n0)
    r = y - np.dot(H, α_)

    # update natural mean and precision + inversion yielding covariance matrix
    # n1 = ns + np.dot(A_,y)
    # C1 = Cs + np.dot(A_,H)
    n1 = n0 + np.dot(A_, y)
    C1 = C0 + np.dot(A_, H)
    L_C = jitchol(C1)
    P1, _ = dpotri(L_C)

    # more temporary matrices
    B_ = np.dot(H, P1)  # iV * H * Li'  # LAPACK?
    β_ = r / v
    γ_ = np.dot(B_.T, β_)
    δ_ = β_ - np.dot(A_.T, γ_)

    # update marginal log likelihood
    log_Det_C1 = 2 * sum(np.log(np.diag(L_C)))
    log_Ddet_V = sum(np.log(v))
    Δ0 = num_data * np.log(2 * np.pi) + log_Det_C1 - log_Det_C0 + log_Ddet_V + np.sum(r * δ_) + a
    log_marginal_likelihood1 = log_marginal_likelihood0 - 0.5 * Δ0
    # print('lik_i '+str(0.5*Δ0))

    # compute constant derivatives of likelihood wrt kernel matrices
    dL_dH = 2 * ((B_.T / v).T - np.outer(δ_, α_ + γ_))
    dL_dv = -(np.sum(H * B_, 1) - v / α + (r - np.dot(H, γ_))**2) / (v**2)
    D_ = α * (Ht * dL_dv).T
    E_ = np.dot(dL_dH, iKrr)
    dL_dKxr = E_ - 2 * D_
    dL_dKrr = -np.dot(Ht, E_ - D_)
    dL_dkxx = α * dL_dv
    dL_dn = -2 * np.dot(P0, np.dot(Ht, δ_))
    dL_dC = P1 - P0 - np.outer(dL_dn, α_) + np.outer(γ_, γ_)
    # dL_d_dn = 2*σ_n2 *sum(dL_dv) -2*num_data*α_const  # wrt to dn
    dL_d_dn = sum(dL_dv) - num_data * α_const / σ_n2  # wrt to σn2

    iVy = y / v
    dH = np.zeros((num_data, num_inducing))
    scaleFact = 1  ###

    if self.params_EST['R']:
        # compute sparse kernel derivatives
        # dKrr_sparse = np.zeros((J,J,D))
        dKrr_sparse = self.kern.dK_dX(Z)  #, dK_dR=dKrr_sparse)
        # dKxr_sparse = np.zeros((B,J,D))
        dKxr_sparse = self.kern.dK_dX(X, Z)  #, dK_dR=dKxr_sparse)

        # loop over all inducing points
        for j in range(0, num_inducing):
            for d in range(0, self.D):
                jd = j * self.D + d
                kjd = dKrr_sparse[:, j, d]
                k2jd = dKxr_sparse[:, j, d]
                #dψ_dR[j,d] = dψ_dR[j,d] -0.5*( np.sum(dL_dKrr[:,j]*kjd) + np.sum(dL_dKrr[j,:]*kjd) + np.sum(dL_dKxr[:,j]*k2jd) + np.sum( dL_dn*dn_dR[:,jd]) + np.sum( dL_dC*dC_dR[:,:,jd]) )
                ### dψ_dR[j,d] = dψ_dR[j,d] -0.5*( np.sum(dL_dkxx *dKxx_diag) + dL_d_dn )
                delta = -0.5 * (np.sum(dL_dKrr[:, j] * kjd) + np.sum(dL_dKrr[j, :] * kjd)
                                + np.sum(dL_dKxr[:, j] * k2jd) + np.sum(dL_dn * dn_dR[:, jd])
                                + np.sum(dL_dC * dC_dR[:, :, jd]))
                dψ_dR[j, d] = delta * scaleFact

                dH = -np.outer(H[:, j], kjd)
                dH[:, j] += -np.dot(H, kjd) + k2jd
                dH = np.dot(dH, iKrr)
                dd = -np.sum(dH * Kxr, 1) - H[:, j] * k2jd  #### dKxx_diag for theta!!
                div = -α * dd / (v**2)
                dn_dR[:, jd] = dn_dR[:, jd] + np.dot(dH.T, iVy) + np.dot(Ht, div * y)
                F_ = np.dot(A_, dH)
                dC_dR[:, :, jd] = dC_dR[:, :, jd] + F_ + F_.T + np.dot(Ht * div, H)

    # compute kernel derivatives wrt variance_0
    dKrr_dσ02 = self.kern.dK_dσ02(Z)
    dKxr_dσ02 = self.kern.dK_dσ02(X, Z)
    dkxx_dσ02 = self.kern.dK_dσ02_diag(X)
    # dψ_dσ02 = dψ_dσ02 - 0.5*( np.sum(dL_dKrr*dKrr_dσ02) + np.sum(dL_dKxr*dKxr_dσ02) + np.sum( dL_dn*dn_dσ02) + np.sum( dL_dC*dC_dσ02) )
    # dψ_dσ02 = dψ_dσ02 - 0.5* np.sum(dL_dkxx *dkxx_dσ02)
    delta = -0.5 * (np.sum(dL_dKrr * dKrr_dσ02) + np.sum(dL_dKxr * dKxr_dσ02)
                    + np.sum(dL_dn * dn_dσ02) + np.sum(dL_dC * dC_dσ02))
    delta = delta - 0.5 * np.sum(dL_dkxx * dkxx_dσ02)
    dψ_dσ02 = delta * scaleFact

    dH = dKxr_dσ02 - np.dot(H, dKrr_dσ02)
    dH = np.dot(dH, iKrr)
    dd = dkxx_dσ02 - np.sum(dH * Kxr, 1) - np.sum(H * dKxr_dσ02, 1)
    div = -α * dd / (v**2)
    dn_dσ02 = dn_dσ02 + np.dot(dH.T, iVy) + np.dot(Ht, div * y)
    F_ = np.dot(A_, dH)
    dC_dσ02 = dC_dσ02 + F_ + F_.T + np.dot(Ht * div, H)

    # compute kernel derivatives wrt lengthscale(s)
    dKrr_dl = self.kern.dK_dl(Z)
    dKxr_dl = self.kern.dK_dl(X, Z)
    # dkxx_dl = kern.dK_dl_diag(X)  # zero anyway

    # loop over all lengthscales
    num_lengthscales = dKrr_dl.shape[2]
    for d in range(0, num_lengthscales):
        delta = -0.5 * (np.sum(dL_dKrr * dKrr_dl[:, :, d]) + np.sum(dL_dKxr * dKxr_dl[:, :, d])
                        + np.sum(dL_dn * dn_dl[:, d]) + np.sum(dL_dC * dC_dl[:, :, d]))  #############################
        dψ_dl[d] = delta * scaleFact

        dH = dKxr_dl[:, :, d] - np.dot(H, dKrr_dl[:, :, d])
        dH = np.dot(dH, iKrr)
        dd = -np.sum(dH * Kxr, 1) - np.sum(H * dKxr_dl[:, :, d], 1)
        div = -α * dd / (v**2)
        dn_dl[:, d] = dn_dl[:, d] + np.dot(dH.T, iVy) + np.dot(Ht, div * y)
        F_ = np.dot(A_, dH)
        dC_dl[:, :, d] = dC_dl[:, :, d] + F_ + F_.T + np.dot(Ht * div, H)

    # gaussian noise variance
    delta = -0.5 * (np.sum(dL_dn * dn_dσn2) + np.sum(dL_dC * dC_dσn2) + dL_d_dn)
    # dψ_dσn2 = dψ_dσn2
    dψ_dσn2 = delta * scaleFact

    div = -1.0 / (v**2)
    dn_dσn2 = dn_dσn2 + np.dot(Ht, div * y)
    dC_dσn2 = dC_dσn2 + np.dot(Ht * div, H)

    m1 = np.dot(P1, n1)

    return (log_marginal_likelihood1, n1, m1, C1, P1, log_Det_C1,
            dn_dR, dC_dR, dψ_dR, dn_dσ02, dC_dσ02, dψ_dσ02,
            dn_dl, dC_dl, dψ_dl, dn_dσn2, dC_dσn2, dψ_dσn2)

def maximization(self, Y, K, C, t, parameters, hyperparameters, expectations):
    self.N = Y.shape[0]
    self.T = Y.shape[1]

    # Model parameters
    pi = parameters[0].copy()
    f = parameters[1].copy()
    mu = parameters[2].copy()

    # Model hyperparameters
    ls = hyperparameters[0].copy()
    a0 = hyperparameters[1].copy()
    a = hyperparameters[2].copy()
    b = hyperparameters[3].copy()
    sigmas = hyperparameters[4].copy()
    var_precision = sigmas.shape[0]

    # Expected values
    r_ik = expectations['r_ik']
    #c_ik = expectations['c_ik']
    Y_exp = expectations['Y_exp']
    matrices = expectations['matrices']

    # old building of matrices
    Sold = matrices['S_old']
    Lold = matrices['L_old']
    Siold = matrices['Si_old']

    # new building of matrices
    hyperparam_list = [ls, a0, a, b, sigmas]
    S, L, Si = util.build_covariance(t, K, hyperparam_list)  # dims: (T,T,K)

    # Identifying missing (NaN) values
    nans = np.isnan(Y[:, :, 0])
    notnans = np.invert(nans)

    # Expected Log-Likelihood (Cost Function)
    log_likelihood = 0.0
    het_logpdf = np.empty((self.N, K))

    # Log-likelihood derivatives wrt hyperparameters
    dL_dl = np.zeros((1, K))
    dL_da0 = np.zeros((1, K))
    dL_da = np.zeros((C, K))
    dL_db = np.zeros((C, K))
    dL_dsigmas = np.zeros((var_precision, 1))

    c_ik = np.empty((self.N, K))
    for k in range(K):
        S_k = S[:, :, k]          # new
        Si_k = Si[:, :, k]        # new
        Sold_k = Sold[:, :, k]    # old
        Siold_k = Siold[:, :, k]  # old
        Y_exp_k = Y_exp[k]
        Y_exp_real = Y_exp_k[:, :, 0]
        Y_exp_bin = Y_exp_k[:, :, 1]
        detS_k = np.linalg.det(S_k)
        for i in range(self.N):
            Sold_k_oo = Sold_k[np.ix_(notnans[i, :], notnans[i, :])]
            Sold_k_mm = Sold_k[np.ix_(nans[i, :], nans[i, :])]
            Sold_k_mo = Sold_k[np.ix_(nans[i, :], notnans[i, :])]
            Sold_k_om = Sold_k_mo.T
            Si_k_mm = Si_k[np.ix_(nans[i, :], nans[i, :])]  # mm submatrix of Si_k
            Lold_k_oo = linalg.jitchol(Sold_k_oo)
            iSold_k_oo, _ = linalg.dpotri(np.asfortranarray(Lold_k_oo))  # inverse of oo submatrix
            Cov_m = Sold_k_mm - (Sold_k_mo.dot(iSold_k_oo).dot(Sold_k_om))
            c_ik[i, k] = np.trace(Si_k_mm.dot(Cov_m))

            A_m = np.zeros((self.T, self.T))
            A_m[np.ix_(nans[i, :], nans[i, :])] = Cov_m

            y = Y_exp_real[i, :].T
            y = y[:, np.newaxis]
            yy_T = np.dot(y, y.T)
            aa_T = Si_k.dot(yy_T).dot(Si_k.T)
            Q1 = aa_T - Si_k
            Q2 = Si_k.dot(A_m).dot(Si_k)
            dK_dl, dK_da0, dK_da, dK_db, dK_dsigmas = self.kernel_gradients(Q1, Q2, t, k, C, hyperparam_list)

            dL_dl[0, k] += 0.5 * r_ik[i, k] * dK_dl
            dL_da0[0, k] += 0.5 * r_ik[i, k] * dK_da0
            dL_da[:, k] += 0.5 * r_ik[i, k] * dK_da.flatten()
            dL_db[:, k] += 0.5 * r_ik[i, k] * dK_db.flatten()
            dL_dsigmas += 0.5 * r_ik[i, k] * dK_dsigmas

            # Bernoulli log-likelihood term: y*log(mu) + (1-y)*log(1-mu)
            log_likelihood += - 0.5*r_ik[i, k]*np.log(pi[0, k]) - 0.5*r_ik[i, k]*np.log(detS_k) \
                              - 0.5*r_ik[i, k] * np.dot(Y_exp_real[i, :], Si_k).dot(Y_exp_real[i, :].T) \
                              - 0.5*r_ik[i, k]*c_ik[i, k] \
                              + r_ik[i, k]*np.sum(Y_exp_bin[i, :]*np.log(mu[:, k])) \
                              + r_ik[i, k]*np.sum((1 - Y_exp_bin[i, :])*np.log(1 - mu[:, k]))
            # + r_ik[i,k]*[]
            # TODO: the Gaussian's pi constant is still missing

        #param_list = [f[:, k], S[:, :, k], Si[:, :, k], mu[:, k]]

    gradients = {'dL_dl': dL_dl, 'dL_da0': dL_da0, 'dL_da': dL_da, 'dL_db': dL_db, 'dL_dsigmas': dL_dsigmas}
    return log_likelihood, gradients

def reset_epoch(self):
    # update kernel with new hyperparams
    self.kern.lengthscale = self.params['ls'].copy()
    self.kern.variance = self.params['σ0']**2
    σ_n2 = self.params['σn']**2
    Z = self.params['R']

    # initialize all prior quantities
    self.n = np.zeros(self.num_inducing)  # natural mean vector (num_output = 1!)
    self.P = self.kern.K(Z)  # covariance matrix
    diag.add(self.P, self.const_jitter)
    L_P = jitchol(self.P)
    self.C, _ = dpotri(L_P, lower=1)  # precision matrix
    self._log_marginal_likelihood = 0.0  # log marginal likelihood
    self._log_Det_C = -2 * sum(np.log(np.diag(L_P)))  # log determinant of C

    self.Krr = self.P
    self.iKrr = self.C

    # derivative quantities
    J = self.num_inducing  # number of inducing points
    JD = self.num_inducing * self.kern.input_dim  # number of inducing points times dimension

    if self.params_EST['R']:
        self.dn_dR = np.zeros((J, JD))  # derivative of natural mean wrt inducing inputs (Rjd: R11,...,R1D, R21,...,RJD)
        self.dC_dR = np.zeros((J, J, JD))  # derivative of precision matrix wrt inducing inputs (Rjd: R11,...,R1D, R21,...,RJD)
        self.dψ_dR = np.zeros((J, self.kern.input_dim))  # gradients of inducing inputs
        dKrr_sparse = self.kern.dK_dX(Z)
        for j in range(0, self.num_inducing):
            for d in range(0, self.kern.input_dim):
                jd = j * self.kern.input_dim + d
                self.dC_dR[:, :, jd] = -np.outer(np.dot(self.C, dKrr_sparse[:, j, d]), self.C[:, j])
                self.dC_dR[:, :, jd] = self.dC_dR[:, :, jd] + self.dC_dR[:, :, jd].T
    else:
        self.dψ_dR = 0.0
        self.dn_dR = 0.0
        self.dC_dR = 0.0

    dKrr_dσ02 = self.kern.dK_dσ02(Z)
    self.dn_dσ02 = np.zeros(J)
    self.dC_dσ02 = -np.dot(np.dot(self.C, dKrr_dσ02), self.C)
    self.dψ_dσ02 = 0.0

    dKrr_dl = self.kern.dK_dl(Z)
    num_lengthscales = dKrr_dl.shape[2]
    self.dn_dl = np.zeros((J, num_lengthscales))
    self.dC_dl = np.zeros((J, J, num_lengthscales))
    self.dψ_dl = np.zeros(num_lengthscales)
    for d in range(0, num_lengthscales):
        self.dC_dl[:, :, d] = -np.dot(np.dot(self.C, dKrr_dl[:, :, d]), self.C)

    self.dn_dσn2 = np.zeros(J)
    self.dC_dσn2 = np.zeros((J, J))
    self.dψ_dσn2 = 0.0

def calculate_gradients(self, q_U, p_U, q_F, VE_dm, VE_dv, Ntask, M, Q, D, f_index, d_index, j):
    """
    Calculates gradients of the Log-marginal distribution p(Y) wrt variational parameters mu_q, S_q
    """
    # Algebra for q(u) and p(u):
    m_u = q_U.mu_u.copy()
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    #S_u = np.empty((Q, M, M))
    S_u = np.dot(L_u[j, :, :], L_u[j, :, :].T)  # This could be done outside and received here to reduce computation
    #[np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]
    Kuu = p_U.Kuu.copy()
    Luu = p_U.Luu.copy()
    Kuui = p_U.Kuui.copy()
    S_qi, _ = linalg.dpotri(np.asfortranarray(L_u[j, :, :]))

    if np.any(np.isinf(S_qi)):
        raise ValueError("Sqi: Cholesky representation unstable")

    # KL Terms
    dKL_dmu_j = np.dot(Kuui[j, :, :], m_u[:, j, None])
    dKL_dS_j = 0.5 * (Kuui[j, :, :] - S_qi)
    dKL_dKjj = 0.5 * Kuui[j, :, :] - 0.5 * Kuui[j, :, :].dot(S_u).dot(Kuui[j, :, :]) \
               - 0.5 * np.dot(Kuui[j, :, :], np.dot(m_u[:, j, None], m_u[:, j, None].T)).dot(Kuui[j, :, :].T)

    # VE Terms
    dVE_dmu_j = np.zeros((M, 1))
    dVE_dS_j = np.zeros((M, M))
    dVE_dKjj = np.zeros((M, M))
    dVE_dKjd = []
    dVE_dKdiag = []

    Nt = Ntask[f_index[j]]
    dVE_dmu_j += np.dot(q_F[j].Afdu.T, VE_dm[f_index[j]][:, d_index[j]])[:, None]
    Adv = q_F[j].Afdu.T * VE_dv[f_index[j]][:, d_index[j], None].T
    Adv = np.ascontiguousarray(Adv)
    AdvA = np.dot(Adv.reshape(-1, Nt), q_F[j].Afdu).reshape(M, M)
    dVE_dS_j += AdvA

    # Derivatives dKuquq
    tmp_dv = np.dot(AdvA, S_u).dot(Kuui[j, :, :])
    dVE_dKjj += AdvA - tmp_dv - tmp_dv.T
    Adm = np.dot(q_F[j].Afdu.T, VE_dm[f_index[j]][:, d_index[j], None])
    dVE_dKjj += -np.dot(Adm, np.dot(Kuui[j, :, :], m_u[:, j, None]).T)

    # Derivatives dKuqfd
    tmp = np.dot(S_u, Kuui[j, :, :])
    tmp = 2. * (tmp - np.eye(M))
    dve_kjd = np.dot(np.dot(Kuui[j, :, :], m_u[:, j, None]), VE_dm[f_index[j]][:, d_index[j], None].T)
    dve_kjd += np.dot(tmp.T, Adv)
    dVE_dKjd.append(dve_kjd)

    # Derivatives dKdiag
    dVE_dKdiag.append(VE_dv[f_index[j]][:, d_index[j]])

    dVE_dKjj = 0.5 * (dVE_dKjj + dVE_dKjj.T)

    # Sum of VE and KL terms
    dL_dmu_j = dVE_dmu_j - dKL_dmu_j
    dL_dS_j = dVE_dS_j - dKL_dS_j
    dL_dKjj = dVE_dKjj - dKL_dKjj
    dL_dKdj = dVE_dKjd[0].copy()  # Here we just pass the unique position
    dL_dKdiag = dVE_dKdiag[0].copy()  # Here we just pass the unique position

    # Pass S_q gradients to its lower-triangular representation L_q
    chol_u = q_U.chols_u.copy()
    L_j = choleskies.flat_to_triang(chol_u[:, j:j + 1])
    dL_dL_j = 2. * np.array([np.dot(a, b) for a, b in zip(dL_dS_j[None, :, :], L_j)])
    dL_dL_j = choleskies.triang_to_flat(dL_dL_j)

    # Posterior
    posterior_j = Posterior(mean=m_u[:, j, None], cov=S_u, K=Kuu[j, :, :],
                            prior_mean=np.zeros(m_u[:, j, None].shape))

    return dL_dmu_j, dL_dL_j, dL_dS_j, posterior_j, dL_dKjj, dL_dKdj, dL_dKdiag

def elbo_derivatives(self, q_U, p_U, q_F, VE_dm, VE_dv, Ntask, dims, f_index, d_index, q):
    """
    Description:  Returns ELBO derivatives w.r.t. variational parameters and hyperparameters
    Equation:     gradients = {dL/dmu, dL/dS, dL/dKmm, dL/Kmn, dL/dKdiag}
    Paper:        In Appendix 4 and 5
    Extra_Info:   Gradients w.r.t. hyperparameters use chain-rule and GPy. Note that Kmm, Kmn, Kdiag are matrices
    """
    Q = dims['Q']
    M = dims['M']

    #------------------------------------# ALGEBRA FOR DERIVATIVES #------------------------------------#

    ####### Algebra for q(u) and p(u) #######
    m_u = q_U.mu_u.copy()
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    S_u = np.empty((Q, M, M))
    [np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]

    Kuu = p_U.Kuu.copy()
    Kuui = p_U.Kuui.copy()

    S_qi, _ = linalg.dpotri(np.asfortranarray(L_u[q, :, :]))
    if np.any(np.isinf(S_qi)):
        raise ValueError("Sqi: Cholesky representation unstable")

    #-------------------------------------# DERIVATIVES OF ELBO TERMS #----------------------------------#

    ####### KL Terms #######
    dKL_dmu_q = np.dot(Kuui[q, :, :], m_u[:, q, None])
    dKL_dS_q = 0.5 * (Kuui[q, :, :] - S_qi)
    dKL_dKqq = 0.5 * Kuui[q, :, :] - 0.5 * Kuui[q, :, :].dot(S_u[q, :, :]).dot(Kuui[q, :, :]) \
               - 0.5 * np.dot(Kuui[q, :, :], np.dot(m_u[:, q, None], m_u[:, q, None].T)).dot(Kuui[q, :, :].T)

    ####### Variational Expectation (VE) Terms #######
    dVE_dmu_q = np.zeros((M, 1))
    dVE_dS_q = np.zeros((M, M))
    dVE_dKqq = np.zeros((M, M))
    dVE_dKqd = []
    dVE_dKdiag = []

    for d, q_fd in enumerate(q_F):
        Nt = Ntask[f_index[d]]
        dVE_dmu_q += np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:, d_index[d]])[:, None]
        Adv = q_fd.Afdu[q, :, :].T * VE_dv[f_index[d]][:, d_index[d], None].T
        Adv = np.ascontiguousarray(Adv)
        AdvA = np.dot(Adv.reshape(-1, Nt), q_fd.Afdu[q, :, :]).reshape(M, M)
        dVE_dS_q += AdvA

        ####### Derivatives dKuquq #######
        tmp_dv = np.dot(AdvA, S_u[q, :, :]).dot(Kuui[q, :, :])
        dVE_dKqq += AdvA - tmp_dv - tmp_dv.T
        Adm = np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:, d_index[d], None])
        dVE_dKqq += -np.dot(Adm, np.dot(Kuui[q, :, :], m_u[:, q, None]).T)

        ####### Derivatives dKuqfd #######
        tmp = np.dot(S_u[q, :, :], Kuui[q, :, :])
        tmp = 2. * (tmp - np.eye(M))
        dve_kqd = np.dot(np.dot(Kuui[q, :, :], m_u[:, q, None]), VE_dm[f_index[d]][:, d_index[d], None].T)
        dve_kqd += np.dot(tmp.T, Adv)
        dVE_dKqd.append(dve_kqd)

        ####### Derivatives dKdiag #######
        dVE_dKdiag.append(VE_dv[f_index[d]][:, d_index[d]])

    dVE_dKqq = 0.5 * (dVE_dKqq + dVE_dKqq.T)

    #--------------------------------------# FINAL ELBO DERIVATIVES #------------------------------------#

    ####### ELBO derivatives ---> sum of VE and KL terms #######
    dL_dmu_q = dVE_dmu_q - dKL_dmu_q
    dL_dS_q = dVE_dS_q - dKL_dS_q
    dL_dKqq = dVE_dKqq - dKL_dKqq
    dL_dKdq = dVE_dKqd
    dL_dKdiag = dVE_dKdiag

    ####### Pass S_q gradients to its lower-triangular representation L_q #######
    chol_u = q_U.chols_u.copy()
    L_q = choleskies.flat_to_triang(chol_u[:, q:q + 1])
    dL_dL_q = 2. * np.array([np.dot(a, b) for a, b in zip(dL_dS_q[None, :, :], L_q)])
    dL_dL_q = choleskies.triang_to_flat(dL_dL_q)

    return dL_dmu_q, dL_dL_q, dL_dS_q, dL_dKqq, dL_dKdq, dL_dKdiag

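# Reading aid (added commentary matching the KL terms above, not new code): with
# q(u_q) = N(m_q, S_q) and p(u_q) = N(0, K_qq), the KL gradients used here are
#   dKL/dm_q  = K_qq^{-1} m_q
#   dKL/dS_q  = 0.5 * ( K_qq^{-1} - S_q^{-1} )
#   dKL/dK_qq = 0.5 * ( K_qq^{-1} - K_qq^{-1} (S_q + m_q m_q^T) K_qq^{-1} )
# and dL/dS_q is mapped to the Cholesky factor via dL/dL_q = 2 * (dL/dS_q) L_q.
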
def calculate_gradients(self, q_U, S_u, Su_add_Kuu_chol, p_U, q_F, VE_dm, VE_dv, Ntask, M, Q, D, f_index, d_index, q):
    """
    Calculates gradients of the Log-marginal distribution p(Y) wrt variational parameters mu_q, S_q
    """
    # Algebra for q(u) and p(u):
    m_u = q_U.mu_u.copy()
    #L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    #S_u = np.empty((Q, M, M))
    #[np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]
    Kuu = p_U.Kuu.copy()
    Luu = p_U.Luu.copy()
    Kuui = p_U.Kuui.copy()
    S_qi, _ = linalg.dpotri(np.asfortranarray(Su_add_Kuu_chol[q, :, :]))

    if np.any(np.isinf(S_qi)):
        raise ValueError("Sqi: Cholesky representation unstable")

    # KL Terms
    dKL_dmu_q = []
    dKL_dKqq = 0
    for d in range(D):
        dKL_dmu_q.append(np.dot(Kuui[q, :, :], m_u[d][:, q, None]))  # same
        dKL_dKqq += -0.5 * S_qi + 0.5 * Kuui[q, :, :] - 0.5 * Kuui[q, :, :].dot(S_u[q, :, :]).dot(Kuui[q, :, :]) \
                    - 0.5 * np.dot(Kuui[q, :, :], np.dot(m_u[d][:, q, None], m_u[d][:, q, None].T)).dot(Kuui[q, :, :].T)  # same

    #dKL_dS_q = 0.5 * (Kuui[q,:,:] - S_qi)  # old
    dKL_dS_q = 0.5 * (Kuui[q, :, :] - S_qi) * D

    # VE Terms
    #dVE_dmu_q = np.zeros((M, 1))
    dVE_dmu_q = []
    dVE_dS_q = np.zeros((M, M))
    dVE_dKqq = np.zeros((M, M))
    dVE_dKqd = []
    dVE_dKdiag = []
    dL_dmu_q = []

    for d, q_fd in enumerate(q_F):
        Nt = Ntask[f_index[d]]
        dVE_dmu_q.append(np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:, d_index[d]])[:, None])
        dL_dmu_q.append(dVE_dmu_q[d] - dKL_dmu_q[d])
        Adv = q_fd.Afdu[q, :, :].T * VE_dv[f_index[d]][:, d_index[d], None].T
        Adv = np.ascontiguousarray(Adv)
        AdvA = np.dot(Adv.reshape(-1, Nt), q_fd.Afdu[q, :, :]).reshape(M, M)
        dVE_dS_q += AdvA

        # Derivatives dKuquq
        tmp_dv = np.dot(AdvA, S_u[q, :, :]).dot(Kuui[q, :, :])
        dVE_dKqq += -tmp_dv - tmp_dv.T  #+ AdvA  last term not included in the derivative
        Adm = np.dot(q_fd.Afdu[q, :, :].T, VE_dm[f_index[d]][:, d_index[d], None])
        dVE_dKqq += -np.dot(Adm, np.dot(Kuui[q, :, :], m_u[d][:, q, None]).T)

        # Derivatives dKuqfd
        tmp = np.dot(S_u[q, :, :], Kuui[q, :, :])
        tmp = 2. * tmp  #2. * (tmp - np.eye(M))  # the term -2Adv not included
        dve_kqd = np.dot(np.dot(Kuui[q, :, :], m_u[d][:, q, None]), VE_dm[f_index[d]][:, d_index[d], None].T)
        dve_kqd += np.dot(tmp.T, Adv)
        dVE_dKqd.append(dve_kqd)

        # Derivatives dKdiag
        dVE_dKdiag.append(VE_dv[f_index[d]][:, d_index[d]])

    dVE_dKqq = 0.5 * (dVE_dKqq + dVE_dKqq.T)

    # Sum of VE and KL terms
    #dL_dmu_q = dVE_dmu_q - dKL_dmu_q
    dL_dS_q = dVE_dS_q - dKL_dS_q
    dL_dKqq = dVE_dKqq - dKL_dKqq
    dL_dKdq = dVE_dKqd
    dL_dKdiag = dVE_dKdiag

    # Pass S_q gradients to its lower-triangular representation L_q
    chol_u = q_U.chols_u.copy()
    L_q = choleskies.flat_to_triang(chol_u[:, q:q + 1])
    dL_dL_q = 2. * np.array([np.dot(a, b) for a, b in zip(dL_dS_q[None, :, :], L_q)])
    dL_dL_q = choleskies.triang_to_flat(dL_dL_q)

    # Posterior
    posterior_q = []
    for d in range(D):
        posterior_q.append(Posterior(mean=m_u[d][:, q, None], cov=S_u[q, :, :] + Kuu[q, :, :],
                                     K=Kuu[q, :, :], prior_mean=np.zeros(m_u[d][:, q, None].shape)))

    return dL_dmu_q, dL_dL_q, dL_dS_q, posterior_q, dL_dKqq, dL_dKdq, dL_dKdiag

def expectation(self, Y, K, C, t, pi, parameters, hyperparameters):
    self.N = Y.shape[0]
    self.T = Y.shape[1]

    # Model parameters
    pi = parameters[0].copy()
    f = parameters[1].copy()
    mu = parameters[2].copy()

    # Model hyperparameters
    ls = hyperparameters[0].copy()
    a0 = hyperparameters[1].copy()
    a = hyperparameters[2].copy()
    b = hyperparameters[3].copy()
    sigmas = hyperparameters[4].copy()

    # Missing values
    Yreal = Y[:, :, 0]
    Ybin = Y[:, :, 1]
    nans = np.isnan(Yreal)
    notnans = np.invert(nans)

    # Covariance
    hyperparam_list = [ls, a0, a, b, sigmas]
    S, L, Si = util.build_covariance(t, K, hyperparam_list)  # dims: (T,T,K)
    matrices = {'S_old': S, 'L_old': L, 'Si_old': Si}

    # Posterior of latent classes
    r_ik = np.empty((self.N, K))

    # Expectations on latent variables
    for k in range(K):
        #param_list = [f[:,k], S[:,:,k], Si[:,:,k], mu[:,k]]
        S_k = S[:, :, k]
        Si_k = Si[:, :, k]
        mu_k = mu[:, k]
        detS_k = np.linalg.det(S_k)
        #r_ik[:, k] = pi[0, k] * util.heterogeneous_pdf(Y, param_list)
        for i in range(self.N):
            # Gaussian density on observed reals times Bernoulli likelihood mu^y * (1-mu)^(1-y)
            r_ik[i, k] = pi[0, k] * (1 / np.sqrt(detS_k * np.pi**self.T)) \
                         * np.exp(-0.5 * np.dot(Yreal[i, np.ix_(notnans[i, :])],
                                                Si_k[np.ix_(notnans[i, :], notnans[i, :])]).dot(Yreal[i, np.ix_(notnans[i, :])].T)) \
                         * np.prod((mu_k[np.ix_(notnans[i, :])])**Ybin[i, np.ix_(notnans[i, :])]
                                   * (1 - mu_k[np.ix_(notnans[i, :])])**(1 - Ybin[i, np.ix_(notnans[i, :])]))

    r_ik = r_ik / np.tile(r_ik.sum(1)[:, np.newaxis], (1, K))

    # Expectations on missing values
    c_ik = np.empty((self.N, K))
    Y_expectation = []
    for k in range(K):
        # Real observations
        Yreal_fill = Yreal.copy()
        S_k = S[:, :, k]
        Si_k = Si[:, :, k]
        for i in range(self.N):
            S_k_oo = S_k[np.ix_(notnans[i, :], notnans[i, :])]
            S_k_mm = S_k[np.ix_(nans[i, :], nans[i, :])]
            S_k_mo = S_k[np.ix_(nans[i, :], notnans[i, :])]
            S_k_om = S_k_mo.T
            Si_k_mm = Si_k[np.ix_(nans[i, :], nans[i, :])]  # mm submatrix of Si_k
            L_k_oo = linalg.jitchol(S_k_oo)
            iS_k_oo, _ = linalg.dpotri(np.asfortranarray(L_k_oo))  # inverse of oo submatrix
            Cov_m = S_k_mm - (S_k_mo.dot(iS_k_oo).dot(S_k_om))
            c_ik[i, k] = np.trace(Si_k_mm.dot(Cov_m))
            Yreal_fill[i, nans[i, :]] = S_k_mo.dot(iS_k_oo).dot(Yreal[i, notnans[i, :]])

        # Binary observations
        Ybin_fill = Ybin.copy()
        mu_matrix = np.tile(mu[:, k].T + 0.0, (self.N, 1))
        Ybin_fill[nans] = mu_matrix[nans]

        # Missing observations are now filled
        Y_fill_k = np.empty((self.N, self.T, 2))
        Y_fill_k[:, :, 0] = Yreal_fill
        Y_fill_k[:, :, 1] = Ybin_fill
        Y_expectation.append(Y_fill_k)

    return r_ik, c_ik, Y_expectation, matrices