def build_covariance(t, K, hyperparams):
    ls = hyperparams[0]
    a0 = hyperparams[1]
    a = hyperparams[2]
    b = hyperparams[3]
    C, _ = a.shape  # number of Fourier coefficients
    T, _ = t.shape
    S = np.empty((T, T, K))
    L = np.empty((T, T, K))
    Si = np.empty((T, T, K))
    Diag, _ = build_diagonal(t, hyperparams)
    for k in range(K):  # TODO: include the periodic term
        hyperparam_k_list = [ls[0, k], a0[0, k], a[:, k], b[:, k]]
        per_term = fourier_series(t, T, C, hyperparam_k_list)
        s = per_term**2
        per_S = s * s.T
        E = periodic_exponential(t, T, hyperparam_k_list)
        S[:, :, k] = per_S * E
        S[:, :, k] += Diag
        L[:, :, k] = linalg.jitchol(S[:, :, k])
        Si[:, :, k], _ = linalg.dpotri(np.asfortranarray(L[:, :, k]))
        # remove this:
        # S[:, :, k] = np.eye(T, T)
        # Si[:, :, k] = np.eye(T, T)
    return S, L, Si
def K_chol(self):
    """
    Cholesky of the prior covariance K
    """
    if self._K_chol is None:
        self._K_chol = jitchol(self.K)
    return self._K_chol
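# A minimal sketch (an assumption, not GPy's actual implementation) of what a
# "jitchol"-style helper does: if the plain Cholesky fails because the matrix is
# only numerically positive semi-definite, retry with increasing diagonal jitter.
import numpy as np

def jittered_cholesky_sketch(A, maxtries=5):
    jitter = np.mean(np.diag(A)) * 1e-6
    for _ in range(maxtries):
        try:
            return np.linalg.cholesky(A)
        except np.linalg.LinAlgError:
            A = A + jitter * np.eye(A.shape[0])  # add jitter and try again
            jitter *= 10
    raise np.linalg.LinAlgError("matrix not positive definite, even with jitter")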
def do_computations(self):
    """
    Here we do all the computations that are required whenever the kernels
    or the variational parameters are changed.
    """
    # sufficient stats.
    self.ybark = np.dot(self.phi.T, self.Y).T

    # compute posterior variances of each cluster (lambda_inv)
    tmp = backsub_both_sides(self.Sy_chol, self.Sf, transpose="right")
    self.Cs = [np.eye(self.D) + tmp * phi_hat_i for phi_hat_i in self.phi_hat]

    self._C_chols = [jitchol(C) for C in self.Cs]
    self.log_det_diff = np.array([2.0 * np.sum(np.log(np.diag(L))) for L in self._C_chols])
    tmp = [dtrtrs(L, self.Sy_chol.T, lower=1)[0] for L in self._C_chols]
    self.Lambda_inv = np.array(
        [
            (self.Sy - np.dot(tmp_i.T, tmp_i)) / phi_hat_i if (phi_hat_i > 1e-6) else self.Sf
            for phi_hat_i, tmp_i in zip(self.phi_hat, tmp)
        ]
    )

    # posterior mean and other useful quantities
    self.Syi_ybark, _ = dpotrs(self.Sy_chol, self.ybark, lower=1)
    self.Syi_ybarkybarkT_Syi = self.Syi_ybark.T[:, None, :] * self.Syi_ybark.T[:, :, None]
    self.muk = (self.Lambda_inv * self.Syi_ybark.T[:, :, None]).sum(1).T
def parameters_changed(self):
    N, D = self.Y.shape
    Kss = self.kern.K(self.X)
    Ksu = self.kern.K(self.X, self.Z)
    wv = self.posterior.woodbury_vector
    wi = self.posterior.woodbury_inv

    a = self.Y - Ksu.dot(wv)
    C = Kss + np.eye(N) * self.likelihood.variance - Ksu.dot(wi).dot(Ksu.T)

    Lc = jitchol(C)
    LcInva = dtrtrs(Lc, a)[0]
    LcInv = dtrtri(Lc)
    CInva = dtrtrs(Lc, LcInva, trans=1)[0]

    self._log_marginal_likelihood = -N * D / 2. * np.log(2 * np.pi) \
        - D * np.log(np.diag(Lc)).sum() - np.square(LcInva).sum() / 2.

    dKsu = CInva.dot(wv.T)
    dKss = tdot(CInva) / 2. - D * tdot(LcInv.T) / 2.
    dKsu += -2. * dKss.dot(Ksu).dot(wi)

    X_grad = self.kern.gradients_X(dKss, self.X)
    X_grad += self.kern.gradients_X(dKsu, self.X, self.Z)
    self.X.gradient = X_grad

    if self.uncertain_input:
        # Update the log-likelihood with the KL divergence term
        KL_div = self.variational_prior.KL_divergence(self.X)
        # update the gradients for the KL divergence
        self.variational_prior.update_gradients_KL(self.X)
        self._log_marginal_likelihood += -KL_div
def latent_function_covKuu(Z, B, kernel_list, kernel_list_Gdj, kff_aux):
    """
    Builds the cross-covariance Kudud = cov[u_d(x), u_d(x)] of a Convolved Multi-output GP
    :param Z: Inducing points
    :param B: Coregionalization matrix
    :param kernel_list: Kernels of u_q functions
    :param kernel_list_Gdj: Kernel smoothing functions G(x)
    :param kff_aux: the kernel that solves the convolution integral between G(x) and kern_uq
    :return: Kuu
    """
    J = len(kernel_list_Gdj)
    M, Dz = Z.shape
    Xdim = int(Dz / J)
    # Kuu = np.zeros([Q*M, Q*M])
    Kuu = np.zeros((J, M, M))
    Luu = np.empty((J, M, M))
    Kuui = np.empty((J, M, M))
    for j in range(J):
        for q, B_q in enumerate(B):
            update_conv_Kff(kernel_list[q], kernel_list_Gdj[j], kff_aux)
            Kuu[j, :, :] += B_q.B[j, j] * kff_aux.K(
                Z[:, j * Xdim:j * Xdim + Xdim], Z[:, j * Xdim:j * Xdim + Xdim])
        Luu[j, :, :] = linalg.jitchol(Kuu[j, :, :], maxtries=10)
        Kuui[j, :, :], _ = linalg.dpotri(np.asfortranarray(Luu[j, :, :]))
    return Kuu, Luu, Kuui
def update_posterior(
    K: np.ndarray,
    v: np.ndarray,
    tau: np.ndarray,
    y: List[Tuple[int, float]],
    yc: List[List[Tuple[int, int]]],
    jitter: float = 1e-9,
    get_logger: Callable = None,
) -> posteriorParams:
    """
    Update the posterior approximation. See e.g. 3.59 in
    http://www.gaussianprocess.org/gpml/chapters/RW.pdf
    :param K: prior covariance matrix
    :param v: Scale of the Gaussian approximation
    :param tau: Precision of the Gaussian approximation
    :param y: Observations indicating where we have a diagonal element
    :param yc: Comparisons indicating where we have a block diagonal element
    :param jitter: small number added to the diagonal to increase robustness
    :param get_logger: Function for receiving the logger where the prints are forwarded
    :return: posterior approximation
    """
    D = K.shape[0]
    sqrt_tau = sqrtm_block(tau + np.diag(jitter * np.ones(D)), y, yc)
    G = np.dot(sqrt_tau, K)
    B = np.identity(D) + np.dot(G, sqrt_tau)
    L = jitchol(B)
    V = np.linalg.solve(L, G)
    Sigma_full = K - np.dot(V.T, V)
    mu = np.dot(Sigma_full, v)
    return posteriorParams(mu=mu, Sigma=Sigma_full, L=L)
def natural_grad_qu(model, n_iter=1, step_size=step_rate, momentum=0.0):
    """Natural-gradient update of q(u); initialize the step-sizes first."""
    global mk_ant, mk_aux, mk, V_i, Vk, Lk, Vki_ant
    beta2_k = step_size  # use step_size*0.1 for Convolutional MOGP
    gamma2_k = momentum
    alpha2_k = step_size
    N_posteriors = model.q_u_means.shape[1]

    if n_iter == 1:
        V_i = choleskies.multiple_dpotri(
            choleskies.flat_to_triang(model.q_u_chols.values)).copy()
        Vk = np.zeros_like(V_i)
        for i in range(N_posteriors):
            Vk[i, :, :] = 0.5 * (model.posteriors[i].covariance.copy() +
                                 model.posteriors[i].covariance.T.copy())
        Lk = np.zeros_like(Vk)
        mk = model.q_u_means.values.copy()
        Vki_ant = V_i.copy()
        mk_aux = mk.copy()

    dL_dm, dL_dV = compute_stoch_grads_for_qu_HetMOGP(model=model)

    mk_ant = mk_aux.copy()
    mk_aux = mk.copy()
    if not model.q_u_means.is_fixed and not model.q_u_chols.is_fixed:
        mk_ant = mk_aux.copy()
        mk_aux = mk.copy()
        for i in range(N_posteriors):
            try:
                V_i[i, :, :] = V_i[i, :, :] + 2 * beta2_k * dL_dV[i]  # + 1.0e-6*np.eye(*Vk[i,:,:].shape)
                Vk[i, :, :] = np.linalg.inv(V_i[i, :, :])
                Vk[i, :, :] = 0.5 * (np.array(Vk[i, :, :]) + np.array(Vk[i, :, :].T))
                Lk[i, :, :] = np.linalg.cholesky(Vk[i, :, :])
                mk[:, i] = mk[:, i] - alpha2_k * np.dot(Vk[i, :, :], dL_dm[i]) \
                    + gamma2_k * np.dot(np.dot(Vk[i, :, :], Vki_ant[i, :, :]), (mk[:, i] - mk_ant[:, i]))
            except LinAlgError:
                # Numerical failure: reset this posterior to a small isotropic covariance
                print("Overflow")
                Vk[i, :, :] = np.linalg.inv(V_i[i, :, :])
                Vk[i, :, :] = 1.0e-1 * np.eye(*Vk[i, :, :].shape)  # nearestPD(Vk[i,:,:])  # + 1.0e-3*np.eye(*Vk[i,:,:].shape)
                Lk[i, :, :] = linalg.jitchol(Vk[i, :, :])
                V_i[i, :, :] = np.linalg.inv(Vk[i, :, :])
                mk[:, i] = mk[:, i] * 0.0
        Vki_ant = V_i.copy()
        model.L_u.setfield(choleskies.triang_to_flat(Lk.copy()), np.float64)
        model.m_u.setfield(mk.copy(), np.float64)
def compute_covariance(x: np.ndarray, kernel: RBF) -> tuple:
    assert x.ndim <= 2
    if x.ndim == 1:
        x = x.reshape(-1, 1)
    K_xx = kernel.K(x)
    # Invert K_xx via its (jittered) Cholesky factor
    K_xx_cho = jitchol(K_xx)
    cholesky_inv = np.linalg.inv(K_xx_cho)
    K_xx_inv = cholesky_inv.T @ cholesky_inv
    return K_xx, K_xx_inv
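# A hedged usage sketch for compute_covariance above (assumes GPy is installed and
# that jitchol has been imported from GPy.util.linalg; the data is made up).
import numpy as np
import GPy

x = np.linspace(0.0, 1.0, 20)                        # 1-D inputs; reshaped to (N, 1) inside
kernel = GPy.kern.RBF(input_dim=1, lengthscale=0.3)
K_xx, K_xx_inv = compute_covariance(x, kernel)
resid = np.max(np.abs(K_xx @ K_xx_inv - np.eye(len(x))))
print("max |K K^-1 - I| =", resid)                   # small, up to the jitter added by jitchol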
def mean_and_chol_covariance(self, X):
    """
    :param X: prediction inputs
    :return: posterior mean and the Cholesky factor of the posterior covariance
    """
    m, cov = self.predict_noiseless(X, full_cov=True)
    chol_cov = jitchol(cov)
    return m, chol_cov
def _get_YYTfactor(self, Y):
    """
    Find a matrix L which satisfies L L^T = Y Y^T.

    Note that L may have fewer columns than Y.
    """
    N, D = Y.shape
    if N >= D:
        return Y.view(np.ndarray)
    else:
        return jitchol(tdot(Y))
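# A quick numerical check (illustrative only) of the property _get_YYTfactor relies on:
# when D > N, the jittered Cholesky of Y Y^T gives an N x N factor L with L L^T = Y Y^T
# (up to jitter), so downstream code can use the narrow L in place of Y.
import numpy as np
from GPy.util.linalg import jitchol, tdot

rng = np.random.default_rng(0)
Y = rng.standard_normal((5, 100))          # N=5 rows, D=100 columns
L = jitchol(tdot(Y))                       # tdot(Y) == Y @ Y.T
print(np.max(np.abs(L @ L.T - Y @ Y.T)))   # should be tiny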
def update_posterior(K, eta, theta):
    D = K.shape[0]
    sqrt_theta = np.sqrt(theta)
    G = sqrt_theta[:, None] * K
    B = np.identity(D) + G * sqrt_theta
    L = jitchol(B)
    V = np.linalg.solve(L, G)
    Sigma_full = K - np.dot(V.T, V)
    mu = np.dot(Sigma_full, eta)
    # Sigma = np.diag(Sigma_full)
    return posteriorParams(mu=mu, Sigma=Sigma_full, L=L)
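# Illustrative check (not part of the original code) of what update_posterior computes:
# Sigma = (K^-1 + diag(theta))^-1 and mu = Sigma @ eta, written via the Cholesky of
# B = I + sqrt(theta) K sqrt(theta) so that K is never inverted directly.
import numpy as np

rng = np.random.default_rng(1)
A = rng.standard_normal((6, 6))
K = A @ A.T + 6 * np.eye(6)                # a well-conditioned "prior" covariance
theta = rng.uniform(0.5, 2.0, size=6)      # site precisions
sqrt_theta = np.sqrt(theta)
G = sqrt_theta[:, None] * K
B = np.eye(6) + G * sqrt_theta
V = np.linalg.solve(np.linalg.cholesky(B), G)
Sigma = K - V.T @ V
Sigma_direct = np.linalg.inv(np.linalg.inv(K) + np.diag(theta))
print(np.max(np.abs(Sigma - Sigma_direct)))   # ~1e-12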
def __init__(
    self,
    X: np.ndarray,
    y: List[Tuple[int, float]],
    yc: List[List[Tuple[int, int]]],
    kernel: GPy.kern.Kern,
    likelihood: Gaussian,
    vi_mode: str = "fr",
    name: str = "VIComparisonGP",
    max_iters: int = 50,
    get_logger: Callable = None,
):
    super(VIComparisonGP, self).__init__(name=name)

    self.N, self.D = X.shape[0], X.shape[1]
    self.output_dim = 1
    self.get_logger = get_logger
    self.X = X
    self.y = y
    self.yc = yc

    self.max_iters = max_iters
    self.vi_mode = vi_mode
    self.kern = kernel
    self.likelihood = likelihood

    self.sigma2s = self.likelihood.variance * np.ones((X.shape[0], 1), dtype=int)
    jitter = 1e-6
    K = self.kern.K(X)
    L = np.linalg.cholesky(K + np.identity(K.shape[0]) * jitter)

    self.alpha = np.zeros((self.N, 1))
    self.beta = np.ones((self.N, 1))
    self.posterior = None
    # If we are using full-rank VI, we initialize it with mean-field VI
    if self.vi_mode == "FRVI":
        self.posterior, _, _, self.alpha, self.beta = vi.vi_comparison(
            self.X, self.y, self.yc, self.kern, self.sigma2s,
            self.alpha, self.beta, max_iters=50, method="mf")
        self.beta = choleskies._triang_to_flat_pure(
            jitchol(self.posterior.covariance)[None, :])
def calculate_mu_var(self, X, Y, Z, q_u_mean, q_u_chol, kern, mean_function,
                     num_inducing, num_data, num_outputs):
    """
    Calculate posterior mean and variance for the latent function values for
    use in the expectation over the likelihood
    """
    # expand cholesky representation
    L = choleskies.flat_to_triang(q_u_chol)
    # S = linalg.ijk_ljk_to_ilk(L, L)  # L.dot(L.T)
    S = np.empty((num_outputs, num_inducing, num_inducing))
    for i in range(num_outputs):
        np.dot(L[i, :, :], L[i, :, :].T, S[i, :, :])  # S[i] = L[i] L[i]^T, written in place
    # logdetS = np.array([2.*np.sum(np.log(np.abs(np.diag(L[:,:,i])))) for i in range(L.shape[-1])])
    logdetS = np.array([2. * np.sum(np.log(np.abs(np.diag(L[i, :, :])))) for i in range(L.shape[0])])

    # compute mean function stuff
    if mean_function is not None:
        prior_mean_u = mean_function.f(Z)
        prior_mean_f = mean_function.f(X)
    else:
        prior_mean_u = np.zeros((num_inducing, num_outputs))
        prior_mean_f = np.zeros((num_data, num_outputs))

    # compute kernel related stuff
    Kmm = kern.K(Z)
    # Knm = kern.K(X, Z)
    Kmn = kern.K(Z, X)
    Knn_diag = kern.Kdiag(X)
    # Kmmi, Lm, Lmi, logdetKmm = linalg.pdinv(Kmm)
    Lm = linalg.jitchol(Kmm)
    logdetKmm = 2. * np.sum(np.log(np.diag(Lm)))
    Kmmi, _ = linalg.dpotri(Lm)

    # compute the marginal means and variances of q(f)
    # A = np.dot(Knm, Kmmi)
    A, _ = linalg.dpotrs(Lm, Kmn)
    # mu = prior_mean_f + np.dot(A, q_u_mean - prior_mean_u)
    mu = prior_mean_f + np.dot(A.T, q_u_mean - prior_mean_u)
    # v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * linalg.ij_jlk_to_ilk(A, S), 1)
    v = np.empty((num_data, num_outputs))
    for i in range(num_outputs):
        tmp = dtrmm(1.0, L[i].T, A, lower=0, trans_a=0)
        v[:, i] = np.sum(np.square(tmp), 0)
    v += (Knn_diag - np.sum(A * Kmn, 0))[:, None]

    # compute the KL term
    Kmmim = np.dot(Kmmi, q_u_mean)
    # KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.einsum('ij,ijk->k', Kmmi, S) + 0.5*np.sum(q_u_mean*Kmmim,0)
    KLs = -0.5 * logdetS - 0.5 * num_inducing + 0.5 * logdetKmm \
          + 0.5 * np.sum(Kmmi[None, :, :] * S, 1).sum(1) + 0.5 * np.sum(q_u_mean * Kmmim, 0)
    KL = KLs.sum()

    latent_detail = LatentFunctionDetails(q_u_mean=q_u_mean, q_u_chol=q_u_chol,
                                          mean_function=mean_function,
                                          mu=mu, v=v, prior_mean_u=prior_mean_u,
                                          L=L, A=A, S=S, Kmm=Kmm, Kmmi=Kmmi,
                                          Kmmim=Kmmim, KL=KL)
    return latent_detail
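# For reference (standard sparse-GP identities that the code above implements; the
# notation is assumed, not taken from the original docstring): with A = Kmm^-1 Kmn
# and q(u) = N(m, S) per output,
#   mu = mean_f + A^T (m - mean_u)
#   v  = diag(Knn) - diag(Knm Kmm^-1 Kmn) + diag(Knm Kmm^-1 S Kmm^-1 Kmn)
#   KL = 0.5 * ( tr(Kmm^-1 S) + m^T Kmm^-1 m - M + log|Kmm| - log|S| )
# which matches the logdetS / logdetKmm / Kmmim terms accumulated into KLs above.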
def integral_mean_rebased(gpy_gp, prior_mean, prior_var, compute_var=False):
    X = gpy_gp.X
    Y = gpy_gp.Y
    n, d = X.shape[0], X.shape[1]
    assert prior_mean.ndim == 1
    assert prior_var.ndim == 2
    assert prior_mean.shape[0] == d
    assert prior_var.shape[0] == d
    assert prior_var.shape[0] == prior_var.shape[1]

    scaling = np.max(Y)
    # print(scaling)
    Y = np.exp(Y - scaling)
    mu = prior_mean

    # Kernel parameters
    w = np.exp(gpy_gp.kern.lengthscale.values)
    h = np.exp(gpy_gp.kern.variance.values[0])
    if len(w) == 1:
        w = np.array([w] * d).reshape(-1)
    W = np.diag(w)  # Assuming isotropic covariance, build the W matrix from the w parameters
    V = prior_var

    n_s = np.zeros((n,))
    for i in range(n):
        n_s[i] = h * multivariate_normal._pdf_point_est(X[i, :], mean=mu, cov=W + V)
    # print(Y)
    c_f = np.linalg.det(2 * np.pi * (2 * W + V)) ** -0.5

    K_xx = gpy_gp.kern.K(X)
    # Find the inverse of the K_xx matrix via Cholesky decomposition (with jitter)
    K_xx_cho = jitchol(K_xx)
    cholesky_inverse = np.linalg.inv(K_xx_cho)
    K_xx_inv = cholesky_inverse.T @ cholesky_inverse

    unscaled_integral_mean = n_s.T @ K_xx_inv @ Y
    if compute_var:
        unscaled_integral_var = c_f - n_s.T @ K_xx_inv @ n_s
        scaled_var = np.log(unscaled_integral_var) + 2 * scaling
    else:
        scaled_var = np.nan
    scaled_mean = np.log(unscaled_integral_mean) + scaling
    return scaled_mean, scaled_var
def _compute_B_statistics(K, W):
    if np.any(np.isnan(W)):
        raise ValueError('One or more element(s) of W is NaN')
    W_12 = np.sqrt(W)
    B = np.eye(K.shape[0]) + W_12 * K * W_12.T
    L = jitchol(B)

    LiW12, _ = dtrtrs(L, np.diagflat(W_12), lower=1, trans=0)
    K_Wi_i = np.dot(LiW12.T, LiW12)  # R = W12 B^-1 W12, in R&W p 126, eq 5.25
    C = np.dot(LiW12, K)
    Ki_W_i = K - C.T.dot(C)
    I_KW_i = np.eye(K.shape[0]) - np.dot(K, K_Wi_i)
    logdet_I_KW = 2 * np.sum(np.log(np.diag(L)))
    return K_Wi_i, logdet_I_KW, I_KW_i, Ki_W_i
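# An illustrative check of what _compute_B_statistics above returns (assuming it and its
# GPy.util.linalg imports are in scope, and that W is a column of positive precisions as
# in the Laplace approximation of R&W): K_Wi_i == (K + W^-1)^-1 and Ki_W_i == (K^-1 + W)^-1,
# both obtained without explicitly inverting K.
import numpy as np

rng = np.random.default_rng(2)
A = rng.standard_normal((5, 5))
K = A @ A.T + 5 * np.eye(5)
W = rng.uniform(0.5, 2.0, size=(5, 1))
K_Wi_i, logdet_I_KW, I_KW_i, Ki_W_i = _compute_B_statistics(K, W)
print(np.max(np.abs(K_Wi_i - np.linalg.inv(K + np.diag(1.0 / W[:, 0])))))
print(np.max(np.abs(Ki_W_i - np.linalg.inv(np.linalg.inv(K) + np.diag(W[:, 0])))))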
def _get_mu_L(self, x_pred: np.ndarray, N: int = None,
              woodbury_inv: bool = False, with_index: int = None) -> Tuple:
    """
    Returns posterior mean and cholesky decomposition of the posterior samples

    :param x_pred: locations where the mean and posterior covariance are computed
    :param N: number of posterior samples
    :param woodbury_inv: boolean indicating whether the function should return woodbury_inv vector as well
    :param with_index: index of the specific posterior sample the function should return
    :return params: tuple containing the posterior means and choleskies of the covariances.
        Also woodbury inverses and woodbury choleskies if woodbury_inv is true
    """
    indices = np.arange(self.samples["f"].shape[0])
    if N is not None:
        indices = np.random.choice(indices, N)
    if with_index is not None:
        indices = np.array([with_index], dtype=int)
    N = len(indices)
    x_pred = np.atleast_2d(x_pred)

    f2_mu = np.empty((N, x_pred.shape[0]))
    f2_L = np.empty((N, x_pred.shape[0], x_pred.shape[0]))
    k_x1_x2 = self.kern.K(self.X, x_pred)
    k_x2_x2 = self.kern.K(x_pred)

    for ni, i in enumerate(indices):
        L_div_k_x1_x2 = la.solve_triangular(self.samples["L_K"][i, :, :],
                                            k_x1_x2, lower=True, overwrite_b=False)
        f2_mu[ni, :] = np.dot(L_div_k_x1_x2.T, self.samples["eta"][i, :])  # self.L_div_f[i,:]
        f2_cov = k_x2_x2 - np.dot(L_div_k_x1_x2.T, L_div_k_x1_x2)
        f2_L[ni, :, :] = jitchol(f2_cov)

    if woodbury_inv:
        w_inv = np.empty((N, self.X.shape[0], self.X.shape[0]))
        w_chol = np.empty((N, self.X.shape[0]))
        for ni, i in enumerate(indices):
            L_Kinv = la.inv(self.samples["L_K"][i, :, :])
            w_inv[ni, :, :] = L_Kinv.T @ L_Kinv
            w_chol[ni, :] = (L_Kinv.T @ self.samples["eta"][i, :, None])[:, 0]
            # (Kinv @ self.samples['eta'][i,:, None])[:, 0]
            # (L_Kinv.T @ self.samples['eta'][i,:, None])[:, 0]  # self.L_div_f[i,:]
        return f2_mu, f2_L, w_inv, w_chol
    else:
        return f2_mu, f2_L
def latent_funs_cov(Z, kernel_list):
    """
    Description: Builds the full covariance cov[u(z), u(z)] of a Multi-output GP
    for a Sparse approximation
    :param Z: Inducing Points
    :param kernel_list: Kernels of u_q functions priors
    :return: Kuu
    """
    Q = len(kernel_list)
    M, Dz = Z.shape
    Xdim = int(Dz / Q)
    Kuu = np.empty((Q, M, M))
    Luu = np.empty((Q, M, M))
    Kuui = np.empty((Q, M, M))
    for q, kern in enumerate(kernel_list):
        Kuu[q, :, :] = kern.K(Z[:, q * Xdim:q * Xdim + Xdim], Z[:, q * Xdim:q * Xdim + Xdim])
        Luu[q, :, :] = linalg.jitchol(Kuu[q, :, :])
        Kuui[q, :, :], _ = linalg.dpotri(np.asfortranarray(Luu[q, :, :]))
    return Kuu, Luu, Kuui
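# A hedged usage sketch for latent_funs_cov (assumes the function above is in scope
# together with numpy as np and GPy's util.linalg module). Each latent function u_q gets
# its own block of columns of Z and its own kernel; the Q covariance blocks come back stacked.
import numpy as np
import GPy

Q, M, Xdim = 3, 10, 2
Z = np.random.randn(M, Q * Xdim)
kernel_list = [GPy.kern.RBF(input_dim=Xdim) for _ in range(Q)]
Kuu, Luu, Kuui = latent_funs_cov(Z, kernel_list)
print(Kuu.shape, Luu.shape, Kuui.shape)    # (3, 10, 10) each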
def gp_sample(model, x, n_samples):
    if len(x.shape) == 1:
        x = np.reshape(x, (1, -1))
        n_points = 1
    else:
        n_points = x.shape[0]

    # special case if we only have 1 realisation of 1 point
    if n_points == 1 and n_samples == 1:
        m, cov = model.predict(x, full_cov=False)
        L = np.sqrt(cov)
        U = numpy_normal()
        return m + L * U

    # else general case, do things properly
    m, cov = model.predict(x, full_cov=True)
    L = jitchol(cov)
    U = numpy_normal(size=(n_points, n_samples))
    return m + L @ U
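# A hedged usage sketch for gp_sample (assumes `numpy_normal` is an alias for
# np.random.normal and `jitchol` comes from GPy.util.linalg; the toy data is made up).
import numpy as np
import GPy
from GPy.util.linalg import jitchol

numpy_normal = np.random.normal            # assumption about the helper used above

X = np.linspace(0, 1, 15)[:, None]
Y = np.sin(6 * X) + 0.1 * np.random.randn(15, 1)
model = GPy.models.GPRegression(X, Y, GPy.kern.RBF(1))
model.optimize()

x_new = np.linspace(0, 1, 50)[:, None]
samples = gp_sample(model, x_new, n_samples=5)   # 50 x 5 draws from the predictive
print(samples.shape)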
def alpha(self):
    '''
    Function to compute alpha = K^-1 y

    Args:
        None

    Returns:
        (array) alpha of size N x 1
    '''
    # compute the kernel matrix of size N x N
    k = self.kernel('trainSet', self.theta_, self.theta_)

    # compute the Cholesky factor
    self.chol_fact = gpl.jitchol(k)

    # Use triangular method to solve for alpha
    alp = gpl.dpotrs(self.chol_fact, self.output, lower=True)[0]
    return alp
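# Illustrative check (not part of the original class) that dpotrs applied to the
# Cholesky factor reproduces alpha = K^-1 y; gpl is assumed to be GPy.util.linalg.
import numpy as np
import GPy.util.linalg as gpl

rng = np.random.default_rng(3)
A = rng.standard_normal((8, 8))
K = A @ A.T + 8 * np.eye(8)
y = rng.standard_normal((8, 1))
L = gpl.jitchol(K)
alpha = gpl.dpotrs(L, y, lower=True)[0]
print(np.max(np.abs(K @ alpha - y)))       # ~1e-13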
def latent_funs_cov(Z, kernel_list):
    """
    Builds the full covariance cov[u(z), u(z)] of a Multi-output GP
    for a Sparse approximation
    :param Z: Inducing Points
    :param kernel_list: Kernels of u_q functions priors
    :return: Kuu
    """
    Q = len(kernel_list)
    M, Dz = Z.shape
    Xdim = int(Dz / Q)
    # Kuu = np.zeros([Q*M,Q*M])
    Kuu = np.empty((Q, M, M))
    Luu = np.empty((Q, M, M))
    Kuui = np.empty((Q, M, M))
    for q, kern in enumerate(kernel_list):
        Kuu[q, :, :] = kern.K(Z[:, q * Xdim:q * Xdim + Xdim], Z[:, q * Xdim:q * Xdim + Xdim])
        Kuu[q, :, :] = Kuu[q, :, :]  # + 1.0e-6*np.eye(*Kuu[q, :, :].shape)  # line included by Juan for numerical stability
        Luu[q, :, :] = linalg.jitchol(Kuu[q, :, :], maxtries=10)
        Kuui[q, :, :], _ = linalg.dpotri(np.asfortranarray(Luu[q, :, :]))
    return Kuu, Luu, Kuui
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None, Kuu_sigma=None):
    """
    The first phase of inference:
    Compute: log-likelihood, dL_dKmm
    Cached intermediate results: Kmm, KmmInv,
    """
    num_data, output_dim = Y.shape
    input_dim = Z.shape[0]

    uncertain_inputs = isinstance(X, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    # from ..models.sslvm import Gaussian_Gamma
    # if isinstance(likelihood, Gaussian_Gamma):
    #     beta = likelihood.expectation_beta()
    #     logL_R = -num_data*likelihood.expectation_logbeta()
    # else:
    beta = 1. / np.fmax(likelihood.variance, 1e-6)
    logL_R = -num_data * np.log(beta)

    psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)

    #======================================================================
    # Compute Common Components
    #======================================================================

    Kmm = kern.K(Z).copy()
    if Kuu_sigma is not None:
        diag.add(Kmm, Kuu_sigma)
    else:
        diag.add(Kmm, self.const_jitter)
    Lm = jitchol(Kmm)
    # LmInv = dtrtri(Lm)

    if uncertain_inputs:
        LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
    else:
        LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0]) / beta  # tdot(psi1.dot(LmInv.T).T)/beta

    Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
    LL = jitchol(Lambda)
    LmLL = Lm.dot(LL)
    # LLInv = dtrtri(LL)
    # LmLLInv = LLInv.dot(LmInv)
    logdet_L = 2. * np.sum(np.log(np.diag(LL)))

    b = dtrtrs(LmLL, psi1Y.T)[0].T  # psi1Y.dot(LmLLInv.T)
    bbt = np.square(b).sum()
    v = dtrtrs(LmLL, b.T, trans=1)[0].T  # b.dot(LmLLInv)
    LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

    if psi1S is not None:
        psi1SLLinv = dtrtrs(LmLL, psi1S.T)[0].T  # psi1S.dot(LmLLInv.T)
        bbt += np.square(psi1SLLinv).sum()
        LLinvPsi1TYYTPsi1LLinvT += tdot(psi1SLLinv.T)
        psi1SP = dtrtrs(LmLL, psi1SLLinv.T, trans=1)[0].T  # psi1SLLinv.dot(LmLLInv)

    tmp = -backsub_both_sides(LL, LLinvPsi1TYYTPsi1LLinvT + output_dim * np.eye(input_dim))
    dL_dpsi2R = backsub_both_sides(Lm, tmp + output_dim * np.eye(input_dim)) / 2
    # tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT+output_dim*np.eye(input_dim)).dot(LLInv)
    # dL_dpsi2R = LmInv.T.dot(tmp+output_dim*np.eye(input_dim)).dot(LmInv)/2.

    #======================================================================
    # Compute log-likelihood
    #======================================================================

    logL = -(output_dim * (num_data * log_2_pi + logL_R + psi0 - np.trace(LmInvPsi2LmInvT))
             + YRY - bbt) / 2. - output_dim * logdet_L / 2.

    #======================================================================
    # Compute dL_dKmm
    #======================================================================

    dL_dKmm = dL_dpsi2R - output_dim * backsub_both_sides(Lm, LmInvPsi2LmInvT) / 2  # LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    LLInvLmT = dtrtrs(LL, Lm.T)[0]
    cov = tdot(LLInvLmT.T)

    wd_inv = backsub_both_sides(Lm,
                                np.eye(input_dim) - backsub_both_sides(LL, np.identity(input_dim), transpose='left'),
                                transpose='left')
    post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v.T, K=Kmm, mean=None, cov=cov, K_chol=Lm)

    #======================================================================
    # Compute dL_dthetaL for uncertain input and non-heter noise
    #======================================================================

    # if isinstance(likelihood, Gaussian_Gamma):
    #     from scipy.special import polygamma
    #     dL_dthetaL = ((YRY + output_dim*psi0)/2. - (dL_dpsi2R*psi2).sum() - np.trace(LLinvPsi1TYYTPsi1LLinvT))/-beta
    #     likelihood.q_a.gradient = num_data*output_dim/2.*polygamma(1, likelihood.q_a) + dL_dthetaL/likelihood.q_b
    #     likelihood.q_b.gradient = num_data*output_dim/(-2.*likelihood.q_b) + dL_dthetaL*(-likelihood.q_a/(likelihood.q_b*likelihood.q_b))
    # else:
    dL_dthetaL = (YRY * beta + beta * output_dim * psi0 - num_data * output_dim * beta) / 2. \
                 - beta * (dL_dpsi2R * psi2).sum() - beta * np.trace(LLinvPsi1TYYTPsi1LLinvT)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    dL_dpsi0 = -output_dim * (beta * np.ones((num_data,))) / 2.

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        dL_dpsi1 = beta * (np.dot(m, v) + Shalf[:, None] * psi1SP)
    else:
        dL_dpsi1 = beta * np.dot(Y, v)

    if uncertain_inputs:
        dL_dpsi2 = beta * dL_dpsi2R
    else:
        dL_dpsi1 += np.dot(psi1, dL_dpsi2R) * 2.
        dL_dpsi2 = None

    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dpsi0': dL_dpsi0,
                     'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2,
                     'dL_dthetaL': dL_dthetaL}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL}

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        psi1LmiLLi = dtrtrs(LmLL, psi1.T)[0].T
        LLiLmipsi1Y = b.T
        grad_dict['dL_dYmean'] = -m * beta + psi1LmiLLi.dot(LLiLmipsi1Y)
        grad_dict['dL_dYvar'] = beta / -2. + np.square(psi1LmiLLi).sum(axis=1) / 2

    return post, logL, grad_dict
def get_YYTfactor(self, Y):
    N, D = Y.shape
    if N >= D:
        return Y.view(np.ndarray)
    else:
        return jitchol(tdot(Y))
def inference(self, q_u_means, q_u_chols, X, Y, Z, kern_list, kern_list_Gdj, kern_aux,
              likelihood, B_list, Y_metadata, KL_scale=1.0, batch_scale=None,
              predictive=False, Gauss_Newton=False):
    M = Z.shape[0]
    T = len(Y)
    if batch_scale is None:
        batch_scale = [1.0] * T
    Ntask = [Y[t].shape[0] for t in range(T)]
    Q = len(kern_list)
    D = likelihood.num_output_functions(Y_metadata)

    Kuu, Luu, Kuui = util.latent_funs_cov(Z, kern_list)
    p_U = pu(Kuu=Kuu, Luu=Luu, Kuui=Kuui)
    q_U = qu(mu_u=q_u_means.copy(), chols_u=q_u_chols.copy())

    S_u = np.empty((Q, M, M))
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    for q in range(Q):
        np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :])  # S_u[q] = L_u[q] L_u[q]^T

    Su_add_Kuu = np.zeros((Q, M, M))
    Su_add_Kuu_chol = np.zeros((Q, M, M))
    for q in range(Q):
        Su_add_Kuu[q, :, :] = S_u[q, :, :] + Kuu[q, :, :]
        Su_add_Kuu_chol[q, :, :] = linalg.jitchol(Su_add_Kuu[q, :, :])

    # for every latent function f_d calculate q(f_d) and keep it as q(F):
    q_F = []
    posteriors_F = []
    f_index = Y_metadata['function_index'].flatten()
    d_index = Y_metadata['d_index'].flatten()

    for d in range(D):
        Xtask = X[f_index[d]]
        q_fd, q_U = self.calculate_q_f(X=Xtask, Z=Z, q_U=q_U, S_u=S_u, p_U=p_U,
                                       kern_list=kern_list, kern_list_Gdj=kern_list_Gdj,
                                       kern_aux=kern_aux, B=B_list, M=M,
                                       N=Xtask.shape[0], Q=Q, D=D, d=d)
        # Posterior objects for output functions (used in prediction)
        # TODO: the Posterior object below may not be strictly necessary
        posterior_fd = Posterior(mean=q_fd.m_fd.copy(), cov=q_fd.S_fd.copy(),
                                 K=util.conv_function_covariance(X=Xtask, B=B_list,
                                                                 kernel_list=kern_list,
                                                                 kernel_list_Gdj=kern_list_Gdj,
                                                                 kff_aux=kern_aux, d=d),
                                 prior_mean=np.zeros(q_fd.m_fd.shape))
        posteriors_F.append(posterior_fd)
        q_F.append(q_fd)

    mu_F = []
    v_F = []
    for t in range(T):
        mu_F_task = np.empty((X[t].shape[0], 1))
        v_F_task = np.empty((X[t].shape[0], 1))
        for d, q_fd in enumerate(q_F):
            if f_index[d] == t:
                mu_F_task = np.hstack((mu_F_task, q_fd.m_fd))
                v_F_task = np.hstack((v_F_task, q_fd.v_fd))
        mu_F.append(mu_F_task[:, 1:])
        v_F.append(v_F_task[:, 1:])

    # posterior_Fnew for predictive
    if predictive:
        return posteriors_F
    # inference for rest of cases
    else:
        # Variational Expectations
        VE = likelihood.var_exp(Y, mu_F, v_F, Y_metadata)
        VE_dm, VE_dv = likelihood.var_exp_derivatives(Y, mu_F, v_F, Y_metadata, Gauss_Newton)
        for t in range(T):
            VE[t] = VE[t] * batch_scale[t]
            VE_dm[t] = VE_dm[t] * batch_scale[t]
            VE_dv[t] = VE_dv[t] * batch_scale[t]

        # KL Divergence
        KL = self.calculate_KL(q_U=q_U, Su_add_Kuu=Su_add_Kuu,
                               Su_add_Kuu_chol=Su_add_Kuu_chol, p_U=p_U, M=M, Q=Q, D=D)

        # Log Marginal log(p(Y))
        F = 0
        for t in range(T):
            F += VE[t].sum()
        log_marginal = F - KL

        # Gradients and Posteriors
        dL_dS_u = []
        dL_dmu_u = []
        dL_dL_u = []
        dL_dKmm = []
        dL_dKmn = []
        dL_dKdiag = []
        posteriors = []
        for q in range(Q):
            (dL_dmu_q, dL_dL_q, dL_dS_q, posterior_q, dL_dKqq, dL_dKdq,
             dL_dKdiag_q) = self.calculate_gradients(q_U=q_U, S_u=S_u,
                                                     Su_add_Kuu_chol=Su_add_Kuu_chol,
                                                     p_U=p_U, q_F=q_F, VE_dm=VE_dm,
                                                     VE_dv=VE_dv, Ntask=Ntask, M=M,
                                                     Q=Q, D=D, f_index=f_index,
                                                     d_index=d_index, q=q)
            dL_dmu_u.append(dL_dmu_q)
            dL_dL_u.append(dL_dL_q)
            dL_dS_u.append(dL_dS_q)
            dL_dKmm.append(dL_dKqq)
            dL_dKmn.append(dL_dKdq)
            dL_dKdiag.append(dL_dKdiag_q)
            posteriors.append(posterior_q)

        gradients = {
            'dL_dmu_u': dL_dmu_u,
            'dL_dL_u': dL_dL_u,
            'dL_dS_u': dL_dS_u,
            'dL_dKmm': dL_dKmm,
            'dL_dKmn': dL_dKmn,
            'dL_dKdiag': dL_dKdiag
        }
        return log_marginal, gradients, posteriors, posteriors_F
def inference(self, kern_r, kern_c, Xr, Xc, Zr, Zc, likelihood, Y, qU_mean,
              qU_var_r, qU_var_c, indexD, output_dim):
    """
    The SVI-VarDTC inference
    """
    N, D, Mr, Mc, Qr, Qc = (Y.shape[0], output_dim, Zr.shape[0], Zc.shape[0],
                            Zr.shape[1], Zc.shape[1])

    uncertain_inputs_r = isinstance(Xr, VariationalPosterior)
    uncertain_inputs_c = isinstance(Xc, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    grad_dict = self._init_grad_dict(N, D, Mr, Mc)

    beta = 1. / likelihood.variance
    if len(beta) == 1:
        beta = np.zeros(D) + beta

    psi0_r, psi1_r, psi2_r = self.gatherPsiStat(kern_r, Xr, Zr, uncertain_inputs_r)
    psi0_c, psi1_c, psi2_c = self.gatherPsiStat(kern_c, Xc, Zc, uncertain_inputs_c)

    #======================================================================
    # Compute Common Components
    #======================================================================

    Kuu_r = kern_r.K(Zr).copy()
    diag.add(Kuu_r, self.const_jitter)
    Lr = jitchol(Kuu_r)

    Kuu_c = kern_c.K(Zc).copy()
    diag.add(Kuu_c, self.const_jitter)
    Lc = jitchol(Kuu_c)

    mu, Sr, Sc = qU_mean, qU_var_r, qU_var_c
    LSr = jitchol(Sr)
    LSc = jitchol(Sc)

    LcInvMLrInvT = dtrtrs(Lc, dtrtrs(Lr, mu.T)[0].T)[0]
    LcInvLSc = dtrtrs(Lc, LSc)[0]
    LrInvLSr = dtrtrs(Lr, LSr)[0]
    LcInvScLcInvT = tdot(LcInvLSc)
    LrInvSrLrInvT = tdot(LrInvLSr)
    tr_LrInvSrLrInvT = np.square(LrInvLSr).sum()
    tr_LcInvScLcInvT = np.square(LcInvLSc).sum()

    mid_res = {
        'psi0_r': psi0_r, 'psi1_r': psi1_r, 'psi2_r': psi2_r,
        'psi0_c': psi0_c, 'psi1_c': psi1_c, 'psi2_c': psi2_c,
        'Lr': Lr, 'Lc': Lc,
        'LcInvMLrInvT': LcInvMLrInvT,
        'LcInvScLcInvT': LcInvScLcInvT,
        'LrInvSrLrInvT': LrInvSrLrInvT,
    }

    #======================================================================
    # Compute log-likelihood
    #======================================================================

    logL = 0.
    for d in range(D):
        logL += self.inference_d(d, beta, Y, indexD, grad_dict, mid_res,
                                 uncertain_inputs_r, uncertain_inputs_c, Mr, Mc)

    logL += -Mc * (np.log(np.diag(Lr)).sum() - np.log(np.diag(LSr)).sum()) \
            - Mr * (np.log(np.diag(Lc)).sum() - np.log(np.diag(LSc)).sum()) \
            - np.square(LcInvMLrInvT).sum() / 2. - tr_LrInvSrLrInvT * tr_LcInvScLcInvT / 2. + Mr * Mc / 2.

    #======================================================================
    # Compute dL_dKuu
    #======================================================================

    tmp = tdot(LcInvMLrInvT) / 2. + tr_LrInvSrLrInvT / 2. * LcInvScLcInvT - Mr / 2. * np.eye(Mc)

    dL_dKuu_c = backsub_both_sides(Lc, tmp, 'left')
    dL_dKuu_c += dL_dKuu_c.T
    dL_dKuu_c *= 0.5

    tmp = tdot(LcInvMLrInvT.T) / 2. + tr_LcInvScLcInvT / 2. * LrInvSrLrInvT - Mc / 2. * np.eye(Mr)

    dL_dKuu_r = backsub_both_sides(Lr, tmp, 'left')
    dL_dKuu_r += dL_dKuu_r.T
    dL_dKuu_r *= 0.5

    #======================================================================
    # Compute dL_dqU
    #======================================================================

    tmp = -LcInvMLrInvT
    dL_dqU_mean = dtrtrs(Lc, dtrtrs(Lr, tmp.T, trans=1)[0].T, trans=1)[0]

    LScInv = dtrtri(LSc)
    tmp = -tr_LrInvSrLrInvT / 2. * np.eye(Mc)
    dL_dqU_var_c = backsub_both_sides(Lc, tmp, 'left') + tdot(LScInv.T) * Mr / 2.

    LSrInv = dtrtri(LSr)
    tmp = -tr_LcInvScLcInvT / 2. * np.eye(Mr)
    dL_dqU_var_r = backsub_both_sides(Lr, tmp, 'left') + tdot(LSrInv.T) * Mc / 2.

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    post = PosteriorMultioutput(LcInvMLrInvT=LcInvMLrInvT,
                                LcInvScLcInvT=LcInvScLcInvT,
                                LrInvSrLrInvT=LrInvSrLrInvT,
                                Lr=Lr, Lc=Lc, kern_r=kern_r, Xr=Xr, Zr=Zr)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    grad_dict['dL_dqU_mean'] += dL_dqU_mean
    grad_dict['dL_dqU_var_c'] += dL_dqU_var_c
    grad_dict['dL_dqU_var_r'] += dL_dqU_var_r
    grad_dict['dL_dKuu_c'] += dL_dKuu_c
    grad_dict['dL_dKuu_r'] += dL_dKuu_r

    if not uncertain_inputs_c:
        grad_dict['dL_dKdiag_c'] = grad_dict['dL_dpsi0_c']
        grad_dict['dL_dKfu_c'] = grad_dict['dL_dpsi1_c']

    if not uncertain_inputs_r:
        grad_dict['dL_dKdiag_r'] = grad_dict['dL_dpsi0_r']
        grad_dict['dL_dKfu_r'] = grad_dict['dL_dpsi1_r']

    return post, logL, grad_dict
def inference(self, kern, X, Z, likelihood, Y, qU):
    """
    The SVI-VarDTC inference
    """
    if isinstance(Y, np.ndarray) and np.any(np.isnan(Y)):
        missing_data = True
        N, M, Q = Y.shape[0], Z.shape[0], Z.shape[1]
        Ds = Y.shape[1] - (np.isnan(Y) * 1).sum(1)
        Ymask = 1 - np.isnan(Y) * 1
        Y_masked = np.zeros_like(Y)
        Y_masked[Ymask == 1] = Y[Ymask == 1]
        ND = Ymask.sum()
    else:
        missing_data = False
        N, D, M, Q = Y.shape[0], Y.shape[1], Z.shape[0], Z.shape[1]
        ND = N * D

    uncertain_inputs = isinstance(X, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    beta = 1. / np.fmax(likelihood.variance, 1e-6)

    psi0, psi2, YRY, psi1, psi1Y = self.gatherPsiStat(kern, X, Z,
                                                      Y if not missing_data else Y_masked,
                                                      beta, uncertain_inputs,
                                                      D if not missing_data else Ds,
                                                      missing_data)

    #======================================================================
    # Compute Common Components
    #======================================================================

    mu, S = qU.mean, qU.covariance
    mupsi1Y = mu.dot(psi1Y)

    Kmm = kern.K(Z).copy()
    diag.add(Kmm, self.const_jitter)
    Lm = jitchol(Kmm)

    if missing_data:
        S_mu = S[None, :, :] + mu.T[:, :, None] * mu.T[:, None, :]
        NS_mu = S_mu.T.dot(Ymask.T).T
        LmInv = dtrtri(Lm)

        LmInvPsi2LmInvT = np.swapaxes(psi2.dot(LmInv.T), 1, 2).dot(LmInv.T)
        LmInvSmuLmInvT = np.swapaxes(NS_mu.dot(LmInv.T), 1, 2).dot(LmInv.T)

        B = mupsi1Y + mupsi1Y.T + (Ds[:, None, None] * psi2).sum(0)
        tmp = backsub_both_sides(Lm, B, 'right')

        logL = -ND * log_2_pi / 2. + ND * np.log(beta) / 2. - psi0 / 2. - YRY / 2. \
               - (LmInvSmuLmInvT * LmInvPsi2LmInvT).sum() / 2. + np.trace(tmp) / 2.
    else:
        S_mu = S * D + tdot(mu)
        if uncertain_inputs:
            LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
        else:
            LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0]) / beta  # tdot(psi1.dot(LmInv.T).T)/beta
        LmInvSmuLmInvT = backsub_both_sides(Lm, S_mu, 'right')

        B = mupsi1Y + mupsi1Y.T + D * psi2
        tmp = backsub_both_sides(Lm, B, 'right')

        logL = -ND * log_2_pi / 2. + ND * np.log(beta) / 2. - psi0 / 2. - YRY / 2. \
               - (LmInvSmuLmInvT * LmInvPsi2LmInvT).sum() / 2. + np.trace(tmp) / 2.

    #======================================================================
    # Compute dL_dKmm
    #======================================================================

    dL_dKmm = np.eye(M)

    #======================================================================
    # Compute dL_dthetaL for uncertain input and non-heter noise
    #======================================================================

    dL_dthetaL = None  # (YRY*beta + beta*output_dim*psi0 - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2).sum() - beta*np.trace(LLinvPsi1TYYTPsi1LLinvT)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    if missing_data:
        dL_dpsi0 = -Ds * (beta * np.ones((N,))) / 2.
    else:
        dL_dpsi0 = -D * (beta * np.ones((N,))) / 2.

    if uncertain_outputs:
        Ym, Ys = Y.mean, Y.variance
        dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, Ym.dot(mu.T).T)[0], trans=1)[0].T * beta
    else:
        if missing_data:
            dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, (Y_masked).dot(mu.T).T)[0], trans=1)[0].T * beta
        else:
            dL_dpsi1 = dtrtrs(Lm, dtrtrs(Lm, Y.dot(mu.T).T)[0], trans=1)[0].T * beta

    if uncertain_inputs:
        if missing_data:
            dL_dpsi2 = np.swapaxes((Ds[:, None, None] * np.eye(M)[None, :, :] - LmInvSmuLmInvT).dot(LmInv), 1, 2).dot(LmInv) * beta / 2.
        else:
            dL_dpsi2 = beta * backsub_both_sides(Lm, D * np.eye(M) - LmInvSmuLmInvT, 'left') / 2.
    else:
        dL_dpsi1 += beta * psi1.dot(dL_dpsi2 + dL_dpsi2.T)
        dL_dpsi2 = None

    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dpsi0': dL_dpsi0,
                     'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2,
                     'dL_dthetaL': dL_dthetaL}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL}

    if uncertain_outputs:
        Ym = Y.mean
        grad_dict['dL_dYmean'] = -Ym * beta + dtrtrs(Lm, psi1.T)[0].T.dot(dtrtrs(Lm, mu)[0])
        grad_dict['dL_dYvar'] = beta / -2.

    return logL, grad_dict
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None):
    """
    The first phase of inference:
    Compute: log-likelihood, dL_dKmm
    Cached intermediate results: Kmm, KmmInv,
    """
    num_data, output_dim = Y.shape
    input_dim = Z.shape[0]

    uncertain_inputs = isinstance(X, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    beta = 1. / np.fmax(likelihood.variance, 1e-6)

    psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)

    #======================================================================
    # Compute Common Components
    #======================================================================

    Kmm = kern.K(Z).copy()
    diag.add(Kmm, self.const_jitter)
    Lm = jitchol(Kmm)
    # LmInv = dtrtri(Lm)

    if uncertain_inputs:
        LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right')
    else:
        LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0]) / beta  # tdot(psi1.dot(LmInv.T).T)/beta

    Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
    LL = jitchol(Lambda)
    LmLL = Lm.dot(LL)
    # LLInv = dtrtri(LL)
    # LmLLInv = LLInv.dot(LmInv)
    logdet_L = 2. * np.sum(np.log(np.diag(LL)))

    b = dtrtrs(LmLL, psi1Y.T)[0].T  # psi1Y.dot(LmLLInv.T)
    bbt = np.square(b).sum()
    v = dtrtrs(LmLL, b.T, trans=1)[0].T  # b.dot(LmLLInv)
    LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

    if psi1S is not None:
        psi1SLLinv = dtrtrs(LmLL, psi1S.T)[0].T  # psi1S.dot(LmLLInv.T)
        bbt += np.square(psi1SLLinv).sum()
        LLinvPsi1TYYTPsi1LLinvT += tdot(psi1SLLinv.T)
        psi1SP = dtrtrs(LmLL, psi1SLLinv.T, trans=1)[0].T  # psi1SLLinv.dot(LmLLInv)

    tmp = -backsub_both_sides(LL, LLinvPsi1TYYTPsi1LLinvT + output_dim * np.eye(input_dim))
    dL_dpsi2R = backsub_both_sides(Lm, tmp + output_dim * np.eye(input_dim)) / 2
    # tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT+output_dim*np.eye(input_dim)).dot(LLInv)
    # dL_dpsi2R = LmInv.T.dot(tmp+output_dim*np.eye(input_dim)).dot(LmInv)/2.

    #======================================================================
    # Compute log-likelihood
    #======================================================================

    logL_R = -num_data * np.log(beta)
    logL = -(output_dim * (num_data * log_2_pi + logL_R + psi0 - np.trace(LmInvPsi2LmInvT))
             + YRY - bbt) / 2. - output_dim * logdet_L / 2.

    #======================================================================
    # Compute dL_dKmm
    #======================================================================

    dL_dKmm = dL_dpsi2R - output_dim * backsub_both_sides(Lm, LmInvPsi2LmInvT) / 2  # LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    wd_inv = backsub_both_sides(Lm,
                                np.eye(input_dim) - backsub_both_sides(LL, np.identity(input_dim), transpose='left'),
                                transpose='left')
    post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v.T, K=Kmm, mean=None, cov=None, K_chol=Lm)

    #======================================================================
    # Compute dL_dthetaL for uncertain input and non-heter noise
    #======================================================================

    dL_dthetaL = (YRY * beta + beta * output_dim * psi0 - num_data * output_dim * beta) / 2. \
                 - beta * (dL_dpsi2R * psi2).sum() - beta * np.trace(LLinvPsi1TYYTPsi1LLinvT)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    dL_dpsi0 = -output_dim * (beta * np.ones((num_data,))) / 2.

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        dL_dpsi1 = beta * (np.dot(m, v) + Shalf[:, None] * psi1SP)
    else:
        dL_dpsi1 = beta * np.dot(Y, v)

    if uncertain_inputs:
        dL_dpsi2 = beta * dL_dpsi2R
    else:
        dL_dpsi1 += np.dot(psi1, dL_dpsi2R) * 2.
        dL_dpsi2 = None

    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dpsi0': dL_dpsi0,
                     'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2,
                     'dL_dthetaL': dL_dthetaL}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL}

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        psi1LmiLLi = dtrtrs(LmLL, psi1.T)[0].T  # psi1.dot(LmLLInv.T)
        LLiLmipsi1Y = b.T
        grad_dict['dL_dYmean'] = -m * beta + psi1LmiLLi.dot(LLiLmipsi1Y)
        grad_dict['dL_dYvar'] = beta / -2. + np.square(psi1LmiLLi).sum(axis=1) / 2

    return post, logL, grad_dict
def inference(self, kern_r, kern_c, Xr, Xc, Zr, Zc, likelihood, Y, qU_mean,
              qU_var_r, qU_var_c):
    """
    The SVI-VarDTC inference
    """
    N, D, Mr, Mc, Qr, Qc = (Y.shape[0], Y.shape[1], Zr.shape[0], Zc.shape[0],
                            Zr.shape[1], Zc.shape[1])

    uncertain_inputs_r = isinstance(Xr, VariationalPosterior)
    uncertain_inputs_c = isinstance(Xc, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    beta = 1. / likelihood.variance

    psi0_r, psi1_r, psi2_r = self.gatherPsiStat(kern_r, Xr, Zr, uncertain_inputs_r)
    psi0_c, psi1_c, psi2_c = self.gatherPsiStat(kern_c, Xc, Zc, uncertain_inputs_c)

    #======================================================================
    # Compute Common Components
    #======================================================================

    Kuu_r = kern_r.K(Zr).copy()
    diag.add(Kuu_r, self.const_jitter)
    Lr = jitchol(Kuu_r)

    Kuu_c = kern_c.K(Zc).copy()
    diag.add(Kuu_c, self.const_jitter)
    Lc = jitchol(Kuu_c)

    mu, Sr, Sc = qU_mean, qU_var_r, qU_var_c
    LSr = jitchol(Sr)
    LSc = jitchol(Sc)

    LcInvMLrInvT = dtrtrs(Lc, dtrtrs(Lr, mu.T)[0].T)[0]
    LcInvPsi2_cLcInvT = backsub_both_sides(Lc, psi2_c, 'right')
    LrInvPsi2_rLrInvT = backsub_both_sides(Lr, psi2_r, 'right')
    LcInvLSc = dtrtrs(Lc, LSc)[0]
    LrInvLSr = dtrtrs(Lr, LSr)[0]
    LcInvScLcInvT = tdot(LcInvLSc)
    LrInvSrLrInvT = tdot(LrInvLSr)
    LcInvPsi1_cT = dtrtrs(Lc, psi1_c.T)[0]
    LrInvPsi1_rT = dtrtrs(Lr, psi1_r.T)[0]

    tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT = (LrInvPsi2_rLrInvT * LrInvSrLrInvT).sum()
    tr_LcInvPsi2_cLcInvT_LcInvScLcInvT = (LcInvPsi2_cLcInvT * LcInvScLcInvT).sum()
    tr_LrInvSrLrInvT = np.square(LrInvLSr).sum()
    tr_LcInvScLcInvT = np.square(LcInvLSc).sum()
    tr_LrInvPsi2_rLrInvT = np.trace(LrInvPsi2_rLrInvT)
    tr_LcInvPsi2_cLcInvT = np.trace(LcInvPsi2_cLcInvT)

    #======================================================================
    # Compute log-likelihood
    #======================================================================

    logL_A = - np.square(Y).sum() \
             - (LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT) * LrInvPsi2_rLrInvT).sum() \
             - tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * tr_LcInvPsi2_cLcInvT_LcInvScLcInvT \
             + 2 * (Y * LcInvPsi1_cT.T.dot(LcInvMLrInvT).dot(LrInvPsi1_rT)).sum() - psi0_c * psi0_r \
             + tr_LrInvPsi2_rLrInvT * tr_LcInvPsi2_cLcInvT

    logL = -N * D / 2. * (np.log(2. * np.pi) - np.log(beta)) + beta / 2. * logL_A \
           - Mc * (np.log(np.diag(Lr)).sum() - np.log(np.diag(LSr)).sum()) \
           - Mr * (np.log(np.diag(Lc)).sum() - np.log(np.diag(LSc)).sum()) \
           - np.square(LcInvMLrInvT).sum() / 2. - tr_LrInvSrLrInvT * tr_LcInvScLcInvT / 2. + Mr * Mc / 2.

    #======================================================================
    # Compute dL_dKuu
    #======================================================================

    tmp = beta * LcInvPsi2_cLcInvT.dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT).dot(LcInvMLrInvT.T) \
          + beta * tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvPsi2_cLcInvT.dot(LcInvScLcInvT) \
          - beta * LcInvMLrInvT.dot(LrInvPsi1_rT).dot(Y.T).dot(LcInvPsi1_cT.T) \
          - beta / 2. * tr_LrInvPsi2_rLrInvT * LcInvPsi2_cLcInvT - Mr / 2. * np.eye(Mc) \
          + tdot(LcInvMLrInvT) / 2. + tr_LrInvSrLrInvT / 2. * LcInvScLcInvT

    dL_dKuu_c = backsub_both_sides(Lc, tmp, 'left')
    dL_dKuu_c += dL_dKuu_c.T
    dL_dKuu_c *= 0.5

    tmp = beta * LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT) \
          + beta * tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvPsi2_rLrInvT.dot(LrInvSrLrInvT) \
          - beta * LrInvPsi1_rT.dot(Y.T).dot(LcInvPsi1_cT.T).dot(LcInvMLrInvT) \
          - beta / 2. * tr_LcInvPsi2_cLcInvT * LrInvPsi2_rLrInvT - Mc / 2. * np.eye(Mr) \
          + tdot(LcInvMLrInvT.T) / 2. + tr_LcInvScLcInvT / 2. * LrInvSrLrInvT

    dL_dKuu_r = backsub_both_sides(Lr, tmp, 'left')
    dL_dKuu_r += dL_dKuu_r.T
    dL_dKuu_r *= 0.5

    #======================================================================
    # Compute dL_dthetaL
    #======================================================================

    dL_dthetaL = -D * N * beta / 2. - logL_A * beta * beta / 2.

    #======================================================================
    # Compute dL_dqU
    #======================================================================

    tmp = -beta * LcInvPsi2_cLcInvT.dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT) \
          + beta * LcInvPsi1_cT.dot(Y).dot(LrInvPsi1_rT.T) - LcInvMLrInvT
    dL_dqU_mean = dtrtrs(Lc, dtrtrs(Lr, tmp.T, trans=1)[0].T, trans=1)[0]

    LScInv = dtrtri(LSc)
    tmp = -beta / 2. * tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvPsi2_cLcInvT - tr_LrInvSrLrInvT / 2. * np.eye(Mc)
    dL_dqU_var_c = backsub_both_sides(Lc, tmp, 'left') + tdot(LScInv.T) * Mr / 2.

    LSrInv = dtrtri(LSr)
    tmp = -beta / 2. * tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvPsi2_rLrInvT - tr_LcInvScLcInvT / 2. * np.eye(Mr)
    dL_dqU_var_r = backsub_both_sides(Lr, tmp, 'left') + tdot(LSrInv.T) * Mc / 2.

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    post = PosteriorMultioutput(LcInvMLrInvT=LcInvMLrInvT,
                                LcInvScLcInvT=LcInvScLcInvT,
                                LrInvSrLrInvT=LrInvSrLrInvT,
                                Lr=Lr, Lc=Lc, kern_r=kern_r, Xr=Xr, Zr=Zr)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    dL_dpsi0_r = -psi0_c * beta / 2. * np.ones((D,))
    dL_dpsi0_c = -psi0_r * beta / 2. * np.ones((N,))

    dL_dpsi1_c = beta * dtrtrs(Lc, (Y.dot(LrInvPsi1_rT.T).dot(LcInvMLrInvT.T)).T, trans=1)[0].T
    dL_dpsi1_r = beta * dtrtrs(Lr, (Y.T.dot(LcInvPsi1_cT.T).dot(LcInvMLrInvT)).T, trans=1)[0].T

    tmp = beta / 2. * (-LcInvMLrInvT.dot(LrInvPsi2_rLrInvT).dot(LcInvMLrInvT.T)
                       - tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvScLcInvT
                       + tr_LrInvPsi2_rLrInvT * np.eye(Mc))
    dL_dpsi2_c = backsub_both_sides(Lc, tmp, 'left')

    tmp = beta / 2. * (-LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT)
                       - tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvSrLrInvT
                       + tr_LcInvPsi2_cLcInvT * np.eye(Mr))
    dL_dpsi2_r = backsub_both_sides(Lr, tmp, 'left')

    if not uncertain_inputs_r:
        dL_dpsi1_r += psi1_r.dot(dL_dpsi2_r + dL_dpsi2_r.T)
    if not uncertain_inputs_c:
        dL_dpsi1_c += psi1_c.dot(dL_dpsi2_c + dL_dpsi2_c.T)

    grad_dict = {
        'dL_dthetaL': dL_dthetaL,
        'dL_dqU_mean': dL_dqU_mean,
        'dL_dqU_var_c': dL_dqU_var_c,
        'dL_dqU_var_r': dL_dqU_var_r,
        'dL_dKuu_c': dL_dKuu_c,
        'dL_dKuu_r': dL_dKuu_r,
    }

    if uncertain_inputs_c:
        grad_dict['dL_dpsi0_c'] = dL_dpsi0_c
        grad_dict['dL_dpsi1_c'] = dL_dpsi1_c
        grad_dict['dL_dpsi2_c'] = dL_dpsi2_c
    else:
        grad_dict['dL_dKdiag_c'] = dL_dpsi0_c
        grad_dict['dL_dKfu_c'] = dL_dpsi1_c

    if uncertain_inputs_r:
        grad_dict['dL_dpsi0_r'] = dL_dpsi0_r
        grad_dict['dL_dpsi1_r'] = dL_dpsi1_r
        grad_dict['dL_dpsi2_r'] = dL_dpsi2_r
    else:
        grad_dict['dL_dKdiag_r'] = dL_dpsi0_r
        grad_dict['dL_dKfu_r'] = dL_dpsi1_r

    return post, logL, grad_dict
def inference(self, kern, X, Z, likelihood, Y, indexD, output_dim,
              Y_metadata=None, Lm=None, dL_dKmm=None, Kuu_sigma=None):
    """
    The first phase of inference:
    Compute: log-likelihood, dL_dKmm
    Cached intermediate results: Kmm, KmmInv,
    """
    input_dim = Z.shape[0]

    uncertain_inputs = isinstance(X, VariationalPosterior)

    beta = 1. / likelihood.variance
    if len(beta) == 1:
        beta = np.zeros(output_dim) + beta
    beta_exp = np.zeros(indexD.shape[0])
    for d in range(output_dim):
        beta_exp[indexD == d] = beta[d]

    psi0, psi1, psi2 = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)

    psi2_sum = (beta_exp[:, None, None] * psi2).sum(0) / output_dim

    #======================================================================
    # Compute Common Components
    #======================================================================

    Kmm = kern.K(Z).copy()
    if Kuu_sigma is not None:
        diag.add(Kmm, Kuu_sigma)
    else:
        diag.add(Kmm, self.const_jitter)
    Lm = jitchol(Kmm)

    logL = 0.
    dL_dthetaL = np.zeros(output_dim)
    dL_dKmm = np.zeros_like(Kmm)
    dL_dpsi0 = np.zeros_like(psi0)
    dL_dpsi1 = np.zeros_like(psi1)
    dL_dpsi2 = np.zeros_like(psi2)
    wv = np.empty((Kmm.shape[0], output_dim))

    for d in range(output_dim):
        idx_d = indexD == d
        Y_d = Y[idx_d]
        N_d = Y_d.shape[0]
        beta_d = beta[d]

        psi2_d = psi2[idx_d].sum(0) * beta_d
        psi1Y = Y_d.T.dot(psi1[idx_d]) * beta_d
        psi0_d = psi0[idx_d].sum() * beta_d
        YRY_d = np.square(Y_d).sum() * beta_d

        LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2_d, 'right')

        Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
        LL = jitchol(Lambda)
        LmLL = Lm.dot(LL)

        b = dtrtrs(LmLL, psi1Y.T)[0].T
        bbt = np.square(b).sum()
        v = dtrtrs(LmLL, b.T, trans=1)[0].T
        LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

        tmp = -backsub_both_sides(LL, LLinvPsi1TYYTPsi1LLinvT)
        dL_dpsi2R = backsub_both_sides(Lm, tmp + np.eye(input_dim)) / 2

        logL_R = -N_d * np.log(beta_d)
        logL += -((N_d * log_2_pi + logL_R + psi0_d - np.trace(LmInvPsi2LmInvT)) + YRY_d - bbt) / 2.

        dL_dKmm += dL_dpsi2R - backsub_both_sides(Lm, LmInvPsi2LmInvT) / 2

        dL_dthetaL[d:d + 1] = (YRY_d * beta_d + beta_d * psi0_d - N_d * beta_d) / 2. \
                              - beta_d * (dL_dpsi2R * psi2_d).sum() \
                              - beta_d * np.trace(LLinvPsi1TYYTPsi1LLinvT)

        dL_dpsi0[idx_d] = -beta_d / 2.
        dL_dpsi1[idx_d] = beta_d * np.dot(Y_d, v)
        dL_dpsi2[idx_d] = beta_d * dL_dpsi2R
        wv[:, d] = v

    LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2_sum, 'right')
    Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
    LL = jitchol(Lambda)
    LmLL = Lm.dot(LL)
    logdet_L = 2. * np.sum(np.log(np.diag(LL)))
    dL_dpsi2R_common = dpotri(LmLL)[0] / -2.
    dL_dpsi2 += dL_dpsi2R_common[None, :, :] * beta_exp[:, None, None]

    for d in range(output_dim):
        dL_dthetaL[d] += (dL_dpsi2R_common * psi2[indexD == d].sum(0)).sum() * -beta[d] * beta[d]

    dL_dKmm += dL_dpsi2R_common * output_dim

    logL += -output_dim * logdet_L / 2.

    #======================================================================
    # Compute dL_dKmm
    #======================================================================

    # dL_dKmm = dL_dpsi2R - output_dim * backsub_both_sides(Lm, LmInvPsi2LmInvT)/2  # LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    LLInvLmT = dtrtrs(LL, Lm.T)[0]
    cov = tdot(LLInvLmT.T)

    wd_inv = backsub_both_sides(Lm,
                                np.eye(input_dim) - backsub_both_sides(LL, np.identity(input_dim), transpose='left'),
                                transpose='left')
    post = Posterior(woodbury_inv=wd_inv, woodbury_vector=wv, K=Kmm, mean=None, cov=cov, K_chol=Lm)

    #======================================================================
    # Compute dL_dthetaL for uncertain input and non-heter noise
    #======================================================================

    # for d in range(output_dim):
    #     dL_dthetaL[d:d+1] += - beta[d]*beta[d]*(dL_dpsi2R[None,:,:] * psi2[indexD==d]/output_dim).sum()
    # dL_dthetaL += - (dL_dpsi2R[None,:,:] * psi2_sum*D beta*(dL_dpsi2R*psi2).sum()

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    if not uncertain_inputs:
        dL_dpsi1 += (psi1[:, None, :] * dL_dpsi2).sum(2) * 2.

    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dpsi0': dL_dpsi0,
                     'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2,
                     'dL_dthetaL': dL_dthetaL}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL}

    return post, logL, grad_dict
def inference(self, kern, X, Z, likelihood, Y, qU_mean ,qU_var, Kuu_sigma=None): """ The SVI-VarDTC inference """ N, D, M, Q = Y.shape[0], Y.shape[1], Z.shape[0], Z.shape[1] uncertain_inputs = isinstance(X, VariationalPosterior) uncertain_outputs = isinstance(Y, VariationalPosterior) beta = 1./likelihood.variance psi0, psi2, YRY, psi1, psi1Y = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs) #====================================================================== # Compute Common Components #====================================================================== Kuu = kern.K(Z).copy() if Kuu_sigma is not None: diag.add(Kuu, Kuu_sigma) else: diag.add(Kuu, self.const_jitter) Lm = jitchol(Kuu) mu, S = qU_mean, qU_var Ls = jitchol(S) LinvLs = dtrtrs(Lm, Ls)[0] Linvmu = dtrtrs(Lm, mu)[0] psi1YLinvT = dtrtrs(Lm,psi1Y.T)[0].T self.mid = { 'qU_L': Ls, 'LinvLu': LinvLs, 'L':Lm, 'Linvmu': Linvmu} if uncertain_inputs: LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2, 'right') else: LmInvPsi2LmInvT = tdot(dtrtrs(Lm, psi1.T)[0])/beta LmInvSmuLmInvT = tdot(LinvLs)*D+tdot(Linvmu) # logdet_L = np.sum(np.log(np.diag(Lm))) # logdet_S = np.sum(np.log(np.diag(Ls))) #====================================================================== # Compute log-likelihood #====================================================================== logL_R = -N*np.log(beta) logL = -N*D*log_2_pi/2. -D*logL_R/2. - D*psi0/2. - YRY/2. \ -(LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. + np.trace(LmInvPsi2LmInvT)*D/2.+(Linvmu*psi1YLinvT.T).sum() #====================================================================== # Compute dL_dKmm #====================================================================== tmp1 = backsub_both_sides(Lm,LmInvSmuLmInvT.dot(LmInvPsi2LmInvT), 'left') tmp2 = Linvmu.dot(psi1YLinvT) tmp3 = backsub_both_sides(Lm, - D*LmInvPsi2LmInvT -tmp2-tmp2.T, 'left')/2. dL_dKmm = (tmp1+tmp1.T)/2. + tmp3 #====================================================================== # Compute dL_dthetaL for uncertian input and non-heter noise #====================================================================== dL_dthetaL = -D*N*beta/2. -(- D*psi0/2. - YRY/2.-(LmInvSmuLmInvT*LmInvPsi2LmInvT).sum()/2. + np.trace(LmInvPsi2LmInvT)*D/2.+(Linvmu*psi1YLinvT.T).sum())*beta #====================================================================== # Compute dL_dqU #====================================================================== tmp1 = backsub_both_sides(Lm, - LmInvPsi2LmInvT, 'left') dL_dqU_mean = tmp1.dot(mu) + dtrtrs(Lm, psi1YLinvT.T,trans=1)[0] dL_dqU_var = D/2.*tmp1 #====================================================================== # Compute the Posterior distribution of inducing points p(u|Y) #====================================================================== KuuInvmu = dtrtrs(Lm, Linvmu, trans=1)[0] tmp = backsub_both_sides(Lm, np.eye(M) - tdot(LinvLs), 'left') post = Posterior(woodbury_inv=tmp, woodbury_vector=KuuInvmu, K=Kuu, mean=mu, cov=S, K_chol=Lm) #====================================================================== # Compute dL_dpsi #====================================================================== dL_dpsi0 = -D * (beta * np.ones((N,)))/2. if uncertain_outputs: dL_dpsi1 = Y.mean.dot(dtrtrs(Lm,Linvmu,trans=1)[0].T)*beta else: dL_dpsi1 = Y.dot(dtrtrs(Lm,Linvmu,trans=1)[0].T)*beta dL_dpsi2 = beta*backsub_both_sides(Lm, D*np.eye(M)-LmInvSmuLmInvT, 'left')/2. 
    if not uncertain_inputs:
        dL_dpsi1 += psi1.dot(dL_dpsi2 + dL_dpsi2.T) / beta
        dL_dpsi2 = None

    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dpsi0': dL_dpsi0,
                     'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2,
                     'dL_dthetaL': dL_dthetaL,
                     'dL_dqU_mean': dL_dqU_mean,
                     'dL_dqU_var': dL_dqU_var}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL,
                     'dL_dqU_mean': dL_dqU_mean,
                     'dL_dqU_var': dL_dqU_var}

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        grad_dict['dL_dYmean'] = -m * beta + dtrtrs(Lm, psi1.T)[0].T.dot(dtrtrs(Lm, mu)[0])
        grad_dict['dL_dYvar'] = beta / -2.

    return post, logL, grad_dict
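# ---------------------------------------------------------------------------
# A quick numerical check (my own sketch, toy matrices, assuming GPy is installed) of
# the convention used by GPy.util.linalg.backsub_both_sides throughout the inference
# code above: with a lower Cholesky factor L of K, transpose='left' returns
# L^{-T} X L^{-1} and transpose='right' returns L^{-1} X L^{-T}.
import numpy as np
from GPy.util.linalg import jitchol, backsub_both_sides

rng = np.random.RandomState(1)
A_demo = rng.randn(5, 5)
K_demo = A_demo.dot(A_demo.T) + 5 * np.eye(5)   # SPD matrix standing in for Kuu
X_demo = rng.randn(5, 5)
X_demo = X_demo + X_demo.T                      # the helper assumes a symmetric X

L_demo = jitchol(K_demo)
Linv_demo = np.linalg.inv(L_demo)

left = backsub_both_sides(L_demo, X_demo, 'left')
right = backsub_both_sides(L_demo, X_demo, 'right')

assert np.allclose(left, Linv_demo.T.dot(X_demo).dot(Linv_demo))    # L^{-T} X L^{-1}
assert np.allclose(right, Linv_demo.dot(X_demo).dot(Linv_demo.T))   # L^{-1} X L^{-T}
# ---------------------------------------------------------------------------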
def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
    assert mean_function is None, "inference with a mean function not implemented"

    num_inducing, _ = Z.shape
    num_data, output_dim = Y.shape

    # make sure the noise is not hetero
    sigma_n = likelihood.gaussian_variance(Y_metadata)
    if sigma_n.size > 1:
        raise NotImplementedError("no hetero noise with this implementation of PEP")

    Kmm = kern.K(Z)
    Knn = kern.Kdiag(X)
    Knm = kern.K(X, Z)
    U = Knm

    # factor Kmm
    diag.add(Kmm, self.const_jitter)
    Kmmi, L, Li, _ = pdinv(Kmm)

    # compute beta_star, the effective noise precision
    LiUT = np.dot(Li, U.T)
    sigma_star = sigma_n + self.alpha * (Knn - np.sum(np.square(LiUT), 0))
    beta_star = 1. / sigma_star

    # Compute and factor A
    A = tdot(LiUT * np.sqrt(beta_star)) + np.eye(num_inducing)
    LA = jitchol(A)

    # back substitute to get b, P, v
    URiy = np.dot(U.T * beta_star, Y)
    tmp, _ = dtrtrs(L, URiy, lower=1)
    b, _ = dtrtrs(LA, tmp, lower=1)
    tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
    v, _ = dtrtrs(L, tmp, lower=1, trans=1)
    tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
    P = tdot(tmp.T)

    alpha_const_term = (1.0 - self.alpha) / self.alpha

    # compute log marginal
    log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
                   -np.sum(np.log(np.diag(LA)))*output_dim + \
                   0.5*output_dim*(1 + alpha_const_term)*np.sum(np.log(beta_star)) + \
                   -0.5*np.sum(np.square(Y.T*np.sqrt(beta_star))) + \
                   0.5*np.sum(np.square(b)) + 0.5*alpha_const_term*num_data*np.log(sigma_n)

    # compute dL_dR
    Uv = np.dot(U, v)
    dL_dR = 0.5*(np.sum(U*np.dot(U, P), 1) - (1.0 + alpha_const_term)/beta_star
                 + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1)
                 + np.sum(np.square(Uv), 1))*beta_star**2

    # Compute dL_dKmm
    vvT_P = tdot(v.reshape(-1, 1)) + P
    dL_dK = 0.5*(Kmmi - vvT_P)
    KiU = np.dot(Kmmi, U.T)
    dL_dK += self.alpha * np.dot(KiU*dL_dR, KiU.T)

    # Compute dL_dU
    vY = np.dot(v.reshape(-1, 1), Y.T)
    dL_dU = vY - np.dot(vvT_P, U.T)
    dL_dU *= beta_star
    dL_dU -= self.alpha * 2.*KiU*dL_dR

    dL_dthetaL = likelihood.exact_inference_gradients(dL_dR)
    dL_dthetaL += 0.5*alpha_const_term*num_data / sigma_n

    grad_dict = {'dL_dKmm': dL_dK,
                 'dL_dKdiag': dL_dR * self.alpha,
                 'dL_dKnm': dL_dU.T,
                 'dL_dthetaL': dL_dthetaL}

    # construct a posterior object
    post = Posterior(woodbury_inv=Kmmi - P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=L)

    return post, log_marginal, grad_dict
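# ---------------------------------------------------------------------------
# A NumPy-only sketch (toy data, not library code) of the first quantity the PEP
# inference builds: the effective per-point noise sigma_star = sigma_n + alpha*(Knn - Qnn),
# where Qnn = diag(Knm Kmm^{-1} Kmn). alpha = 1 gives the FITC-style correction, while
# alpha -> 0 shrinks the correction towards plain sigma_n. The squared-exponential
# kernel and the inducing-point layout below are assumptions made for illustration.
import numpy as np

def _se_kernel(A, B, lengthscale=0.3, variance=1.0):
    d2 = (A[:, None, 0] - B[None, :, 0]) ** 2
    return variance * np.exp(-0.5 * d2 / lengthscale ** 2)

X_toy = np.linspace(0, 1, 50)[:, None]   # data inputs
Z_toy = np.linspace(0, 1, 8)[:, None]    # inducing inputs
sigma_n_toy = 0.1                        # Gaussian noise variance

Kmm_toy = _se_kernel(Z_toy, Z_toy) + 1e-8 * np.eye(len(Z_toy))
Knm_toy = _se_kernel(X_toy, Z_toy)
Knn_toy = np.ones(len(X_toy))            # diagonal of Kxx for a unit-variance SE kernel

Qnn_toy = np.sum(Knm_toy.dot(np.linalg.inv(Kmm_toy)) * Knm_toy, axis=1)  # diag(Knm Kmm^{-1} Kmn)
for alpha_toy in (1.0, 0.5, 1e-3):
    sigma_star_toy = sigma_n_toy + alpha_toy * (Knn_toy - Qnn_toy)
    print(alpha_toy, sigma_star_toy.min(), sigma_star_toy.max())
# ---------------------------------------------------------------------------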
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None,
              fixed_covs_kerns=None, **kw):

    _, output_dim = Y.shape

    uncertain_inputs = isinstance(X, VariationalPosterior)

    # see whether we've got a different noise variance for each datum
    beta = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)

    # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
    #self.YYTfactor = self.get_YYTfactor(Y)
    #VVT_factor = self.get_VVTfactor(self.YYTfactor, beta)
    het_noise = beta.size > 1
    if het_noise:
        raise NotImplementedError("Heteroscedastic noise not implemented, should be possible though, feel free to try implementing it :)")
    if beta.ndim == 1:
        beta = beta[:, None]

    # do the inference:
    num_inducing = Z.shape[0]
    num_data = Y.shape[0]

    # kernel computations, using BGPLVM notation
    Kmm = kern.K(Z).copy()
    diag.add(Kmm, self.const_jitter)
    if Lm is None:
        Lm = jitchol(Kmm)

    # The rather complex computations of A, and the psi stats
    if uncertain_inputs:
        psi0 = kern.psi0(Z, X)
        psi1 = kern.psi1(Z, X)
        if het_noise:
            psi2_beta = np.sum([kern.psi2(Z, X[i:i+1, :]) * beta_i for i, beta_i in enumerate(beta)], 0)
        else:
            psi2_beta = kern.psi2(Z, X) * beta
        LmInv = dtrtri(Lm)
        A = LmInv.dot(psi2_beta.dot(LmInv.T))
    else:
        psi0 = kern.Kdiag(X)
        psi1 = kern.K(X, Z)
        tmp = psi1 * np.sqrt(beta)   # identical for het/homoscedastic noise; het noise is rejected above
        tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
        A = tdot(tmp)

    # factor B
    B = np.eye(num_inducing) + A
    LB = jitchol(B)

    # back substitute C into psi1Vf
    #tmp, _ = dtrtrs(Lm, psi1.T.dot(VVT_factor), lower=1, trans=0)
    #_LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
    #tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
    #Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

    # data fit and derivative of L w.r.t. Kmm
    #delit = tdot(_LBi_Lmi_psi1Vf)

    # Expose YYT to get additional covariates in (YYT + Kgg):
    tmp, _ = dtrtrs(Lm, psi1.T, lower=1, trans=0)
    _LBi_Lmi_psi1, _ = dtrtrs(LB, tmp, lower=1, trans=0)
    tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1, lower=1, trans=1)
    Cpsi1, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

    # TODO: cache this:
    # Compute fixed covariates covariance:
    if fixed_covs_kerns is not None:
        K_fixed = 0
        for name, [cov, k] in fixed_covs_kerns.items():
            K_fixed += k.K(cov)

        #trYYT = self.get_trYYT(Y)
        YYT_covs = (tdot(Y) + K_fixed)
        data_term = beta**2 * YYT_covs
        trYYT_covs = np.trace(YYT_covs)
    else:
        data_term = beta**2 * tdot(Y)
        trYYT_covs = self.get_trYYT(Y)

    #trYYT = self.get_trYYT(Y)
    delit = mdot(_LBi_Lmi_psi1, data_term, _LBi_Lmi_psi1.T)
    data_fit = np.trace(delit)

    DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)

    if dL_dKmm is None:
        delit = -0.5 * DBi_plus_BiPBi
        delit += -0.5 * B * output_dim
        delit += output_dim * np.eye(num_inducing)
        # Compute dL_dKmm
        dL_dKmm = backsub_both_sides(Lm, delit)

    # derivatives of L w.r.t. psi
    dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta,
                                                    Lm, data_term, Cpsi1, DBi_plus_BiPBi,
                                                    psi1, het_noise, uncertain_inputs)

    # log marginal likelihood
    log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta,
                                                    het_noise, psi0, A, LB, trYYT_covs,
                                                    data_fit, Y)

    if self.save_per_dim:
        self.saved_vals = [psi0, A, LB, _LBi_Lmi_psi1, beta]

    # No heteroscedastics, so no _LBi_Lmi_psi1Vf:
    # For the interested reader, try implementing the heteroscedastic version, it should be possible
    _LBi_Lmi_psi1Vf = None  # is just here for documentation, so you can see what it was
    # noise derivatives
    dL_dR = _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB,
                           _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1,
                           beta, data_fit, num_data, output_dim, trYYT_covs, Y, None)

    dL_dthetaL = likelihood.exact_inference_gradients(dL_dR, Y_metadata)

    # put the gradients in the right places
    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dpsi0': dL_dpsi0,
                     'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2,
                     'dL_dthetaL': dL_dthetaL}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL}

    if fixed_covs_kerns is not None:
        # For now, we do not take the gradients, we can compute them,
        # but the maximum likelihood solution is to switch off the additional covariates...
        dL_dcovs = beta * np.eye(K_fixed.shape[0]) - beta**2 * tdot(_LBi_Lmi_psi1.T)
        grad_dict['dL_dcovs'] = -.5 * dL_dcovs

    # get sufficient things for posterior prediction
    # TODO: do we really want to do this in the loop?
    woodbury_vector = (beta * Cpsi1).dot(Y)
    # equivalent explicit computation, kept for reference:
    # psi1V = np.dot(Y.T*beta, psi1).T
    # tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
    # tmp, _ = dpotrs(LB, tmp, lower=1)
    # woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

    Bi, _ = dpotri(LB, lower=1)
    symmetrify(Bi)
    Bi = -Bi
    diag.add(Bi, 1)

    woodbury_inv = backsub_both_sides(Lm, Bi)

    # construct a posterior object
    post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
                     K=Kmm, mean=None, cov=None, K_chol=Lm)

    return post, log_marginal, grad_dict
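# ---------------------------------------------------------------------------
# My reading of the Posterior convention used above (a sketch with toy NumPy data, not
# GPy code): predictions at test inputs only need the "woodbury" quantities,
#     mean = K_{s,u} . woodbury_vector,   cov = K_{s,s} - K_{s,u} . woodbury_inv . K_{u,s},
# where u indexes whatever the posterior is parameterised over (inducing points above,
# training points for an exact GP). The exact-GP case is easy to verify directly.
import numpy as np

def _se_k(A, B, lengthscale=0.2):
    return np.exp(-0.5 * (A[:, None, 0] - B[None, :, 0]) ** 2 / lengthscale ** 2)

rng = np.random.RandomState(2)
X_tr = np.linspace(0, 1, 30)[:, None]
y_tr = np.sin(6 * X_tr) + 0.05 * rng.randn(30, 1)
X_te = np.linspace(0, 1, 7)[:, None]
noise_var = 0.05 ** 2

K_tr = _se_k(X_tr, X_tr) + noise_var * np.eye(30)
woodbury_inv = np.linalg.inv(K_tr)            # (K + sigma^2 I)^{-1}
woodbury_vector = woodbury_inv.dot(y_tr)      # (K + sigma^2 I)^{-1} y

K_te_tr = _se_k(X_te, X_tr)
pred_mean = K_te_tr.dot(woodbury_vector)
pred_cov = _se_k(X_te, X_te) - K_te_tr.dot(woodbury_inv).dot(K_te_tr.T)

print(pred_mean.ravel())
print(np.diag(pred_cov))
# ---------------------------------------------------------------------------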
def inference_root(self, kern, X, Z, likelihood, Y, Kuu_sigma=None, Y_metadata=None, Lm=None, dL_dKmm=None):
    """
    The first phase of inference:
    Compute: log-likelihood, dL_dKmm

    Cached intermediate results: Kmm, KmmInv,
    """
    num_data, output_dim = Y.shape
    input_dim = Z.shape[0]
    num_data_total = allReduceArrays([np.int32(num_data)], self.mpi_comm)[0]

    uncertain_inputs = isinstance(X, VariationalPosterior)
    uncertain_outputs = isinstance(Y, VariationalPosterior)

    beta = 1./np.fmax(likelihood.variance, 1e-6)

    psi0, psi2, YRY, psi1, psi1Y, Shalf, psi1S = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)

    #======================================================================
    # Compute Common Components
    #======================================================================

    try:
        Kmm = kern.K(Z).copy()
        if Kuu_sigma is not None:
            diag.add(Kmm, Kuu_sigma)
        else:
            diag.add(Kmm, self.const_jitter)
        Lm = jitchol(Kmm)
        LmInv = dtrtri(Lm)

        LmInvPsi2LmInvT = LmInv.dot(psi2.dot(LmInv.T))
        Lambda = np.eye(Kmm.shape[0]) + LmInvPsi2LmInvT
        LL = jitchol(Lambda)
        LLInv = dtrtri(LL)
        flag = np.zeros((1,), dtype=np.int32)
        self.mpi_comm.Bcast(flag, root=self.root)
    except LinAlgError as e:
        flag = np.ones((1,), dtype=np.int32)
        self.mpi_comm.Bcast(flag, root=self.root)
        raise e

    broadcastArrays([LmInv, LLInv], self.mpi_comm, self.root)
    LmLLInv = LLInv.dot(LmInv)

    logdet_L = 2.*np.sum(np.log(np.diag(LL)))
    b = psi1Y.dot(LmLLInv.T)
    bbt = np.square(b).sum()
    v = b.dot(LmLLInv)
    LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

    if psi1S is not None:
        psi1SLLinv = psi1S.dot(LmLLInv.T)
        bbt_sum = np.square(psi1SLLinv).sum()
        LLinvPsi1TYYTPsi1LLinvT_sum = tdot(psi1SLLinv.T)
        bbt_sum, LLinvPsi1TYYTPsi1LLinvT_sum = reduceArrays([bbt_sum, LLinvPsi1TYYTPsi1LLinvT_sum], self.mpi_comm, self.root)
        bbt += bbt_sum
        LLinvPsi1TYYTPsi1LLinvT += LLinvPsi1TYYTPsi1LLinvT_sum
        psi1SP = psi1SLLinv.dot(LmLLInv)

    tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT + output_dim*np.eye(input_dim)).dot(LLInv)
    dL_dpsi2R = LmInv.T.dot(tmp + output_dim*np.eye(input_dim)).dot(LmInv)/2.
    broadcastArrays([dL_dpsi2R], self.mpi_comm, self.root)

    #======================================================================
    # Compute log-likelihood
    #======================================================================

    logL_R = -num_data_total*np.log(beta)
    logL = -(output_dim*(num_data_total*log_2_pi + logL_R + psi0 - np.trace(LmInvPsi2LmInvT)) + YRY - bbt)/2. \
           - output_dim*logdet_L/2.

    #======================================================================
    # Compute dL_dKmm
    #======================================================================

    dL_dKmm = dL_dpsi2R - output_dim*LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.

    #======================================================================
    # Compute the Posterior distribution of inducing points p(u|Y)
    #======================================================================

    wd_inv = backsub_both_sides(Lm,
                                np.eye(input_dim) - backsub_both_sides(LL, np.identity(input_dim), transpose='left'),
                                transpose='left')
    post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v.T, K=Kmm, mean=None, cov=None, K_chol=Lm)

    #======================================================================
    # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
    #======================================================================

    dL_dthetaL = (YRY*beta + beta*output_dim*psi0 - num_data_total*output_dim*beta)/2. \
                 - beta*(dL_dpsi2R*psi2).sum() - beta*np.trace(LLinvPsi1TYYTPsi1LLinvT)

    #======================================================================
    # Compute dL_dpsi
    #======================================================================

    dL_dpsi0 = -output_dim * (beta * np.ones((num_data,)))/2.

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        dL_dpsi1 = beta*(np.dot(m, v) + Shalf[:, None]*psi1SP)
    else:
        dL_dpsi1 = beta*np.dot(Y, v)

    if uncertain_inputs:
        dL_dpsi2 = beta*dL_dpsi2R
    else:
        dL_dpsi1 += np.dot(psi1, dL_dpsi2R)*2.
        dL_dpsi2 = None

    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dpsi0': dL_dpsi0,
                     'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2,
                     'dL_dthetaL': dL_dthetaL}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL}

    if uncertain_outputs:
        m, s = Y.mean, Y.variance
        psi1LmiLLi = psi1.dot(LmLLInv.T)
        LLiLmipsi1Y = b.T
        grad_dict['dL_dYmean'] = -m*beta + psi1LmiLLi.dot(LLiLmipsi1Y)
        grad_dict['dL_dYvar'] = beta/-2. + np.square(psi1LmiLLi).sum(axis=1)/2

    return post, logL, grad_dict
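# ---------------------------------------------------------------------------
# A small NumPy check (my own sketch, toy matrices) of what the wd_inv line above computes:
#     Lm^{-T} (I - Lambda^{-1}) Lm^{-1} = Kmm^{-1} - (Kmm + Psi2)^{-1},
# with Kmm = Lm Lm^T and Lambda = I + Lm^{-1} Psi2 Lm^{-T}, i.e. the usual variational-DTC
# woodbury inverse expressed entirely through triangular factors.
import numpy as np

rng = np.random.RandomState(3)
M_dim = 6
A_rand = rng.randn(M_dim, M_dim)
Kmm_toy = A_rand.dot(A_rand.T) + M_dim * np.eye(M_dim)   # SPD stand-in for Kuu
B_rand = rng.randn(M_dim, M_dim)
Psi2_toy = B_rand.dot(B_rand.T)                          # PSD stand-in for the psi2 statistic

Lm_toy = np.linalg.cholesky(Kmm_toy)
LmInv_toy = np.linalg.inv(Lm_toy)
Lambda_toy = np.eye(M_dim) + LmInv_toy.dot(Psi2_toy).dot(LmInv_toy.T)

wd_inv_toy = LmInv_toy.T.dot(np.eye(M_dim) - np.linalg.inv(Lambda_toy)).dot(LmInv_toy)
direct = np.linalg.inv(Kmm_toy) - np.linalg.inv(Kmm_toy + Psi2_toy)

assert np.allclose(wd_inv_toy, direct)
# ---------------------------------------------------------------------------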
def maximization(self, Y, K, C, t, parameters, hyperparameters, expectations):
    self.N = Y.shape[0]
    self.T = Y.shape[1]

    # Model parameters
    pi = parameters[0].copy()
    f = parameters[1].copy()
    mu = parameters[2].copy()

    # Model hyperparameters
    ls = hyperparameters[0].copy()
    a0 = hyperparameters[1].copy()
    a = hyperparameters[2].copy()
    b = hyperparameters[3].copy()
    sigmas = hyperparameters[4].copy()
    var_precision = sigmas.shape[0]

    # Expected values
    r_ik = expectations['r_ik']
    #c_ik = expectations['c_ik']
    Y_exp = expectations['Y_exp']
    matrices = expectations['matrices']

    # covariance matrices built from the old hyperparameters
    Sold = matrices['S_old']
    Lold = matrices['L_old']
    Siold = matrices['Si_old']

    # covariance matrices built from the new hyperparameters
    hyperparam_list = [ls, a0, a, b, sigmas]
    S, L, Si = util.build_covariance(t, K, hyperparam_list)  # dims: (T,T,K)

    # Identifying missing (NaN) values
    nans = np.isnan(Y[:, :, 0])
    notnans = np.invert(nans)

    # Expected Log-Likelihood (Cost Function)
    log_likelihood = 0.0
    het_logpdf = np.empty((self.N, K))

    # Log-likelihood derivatives wrt hyperparameters
    dL_dl = np.zeros((1, K))
    dL_da0 = np.zeros((1, K))
    dL_da = np.zeros((C, K))
    dL_db = np.zeros((C, K))
    dL_dsigmas = np.zeros((var_precision, 1))

    c_ik = np.empty((self.N, K))
    for k in range(K):
        S_k = S[:, :, k]          # new
        Si_k = Si[:, :, k]        # new
        Sold_k = Sold[:, :, k]    # old
        Siold_k = Siold[:, :, k]  # old
        Y_exp_k = Y_exp[k]
        Y_exp_real = Y_exp_k[:, :, 0]
        Y_exp_bin = Y_exp_k[:, :, 1]
        # log|S_k| from its Cholesky factor (numerically safer than np.linalg.det)
        logdetS_k = 2.0 * np.sum(np.log(np.diag(L[:, :, k])))

        for i in range(self.N):
            Sold_k_oo = Sold_k[np.ix_(notnans[i, :], notnans[i, :])]
            Sold_k_mm = Sold_k[np.ix_(nans[i, :], nans[i, :])]
            Sold_k_mo = Sold_k[np.ix_(nans[i, :], notnans[i, :])]
            Sold_k_om = Sold_k_mo.T
            Si_k_mm = Si_k[np.ix_(nans[i, :], nans[i, :])]  # mm submatrix of Si_k

            Lold_k_oo = linalg.jitchol(Sold_k_oo)
            iSold_k_oo, _ = linalg.dpotri(np.asfortranarray(Lold_k_oo))  # inverse of oo submatrix
            Cov_m = Sold_k_mm - (Sold_k_mo.dot(iSold_k_oo).dot(Sold_k_om))
            c_ik[i, k] = np.trace(Si_k_mm.dot(Cov_m))

            A_m = np.zeros((self.T, self.T))
            A_m[np.ix_(nans[i, :], nans[i, :])] = Cov_m

            y = Y_exp_real[i, :].T
            y = y[:, np.newaxis]
            yy_T = np.dot(y, y.T)
            aa_T = Si_k.dot(yy_T).dot(Si_k.T)
            Q1 = aa_T - Si_k
            Q2 = Si_k.dot(A_m).dot(Si_k)

            dK_dl, dK_da0, dK_da, dK_db, dK_dsigmas = self.kernel_gradients(Q1, Q2, t, k, C, hyperparam_list)

            dL_dl[0, k] += 0.5*r_ik[i, k]*dK_dl
            dL_da0[0, k] += 0.5*r_ik[i, k]*dK_da0
            dL_da[:, k] += 0.5*r_ik[i, k]*dK_da.flatten()
            dL_db[:, k] += 0.5*r_ik[i, k]*dK_db.flatten()
            dL_dsigmas += 0.5*r_ik[i, k]*dK_dsigmas

            # Bernoulli term uses y*log(mu) + (1 - y)*log(1 - mu)
            log_likelihood += - 0.5*r_ik[i, k]*np.log(pi[0, k]) - 0.5*r_ik[i, k]*logdetS_k \
                              - 0.5*r_ik[i, k] * np.dot(Y_exp_real[i, :], Si_k).dot(Y_exp_real[i, :].T) \
                              - 0.5*r_ik[i, k]*c_ik[i, k] \
                              + r_ik[i, k]*np.sum(Y_exp_bin[i, :]*np.log(mu[:, k])) \
                              + r_ik[i, k]*np.sum((1 - Y_exp_bin[i, :])*np.log(1 - mu[:, k]))
            # + r_ik[i,k]*[]  # the Gaussian normalising constant (2*pi term) is still missing here

    #param_list = [f[:, k], S[:, :, k], Si[:, :, k], mu[:, k]]
    gradients = {'dL_dl': dL_dl,
                 'dL_da0': dL_da0,
                 'dL_da': dL_da,
                 'dL_db': dL_db,
                 'dL_dsigmas': dL_dsigmas}

    return log_likelihood, gradients
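# ---------------------------------------------------------------------------
# A standalone NumPy sketch (toy numbers, not part of the model code) of the conditioning
# step used in maximization for missing entries: for a joint Gaussian with covariance S
# over all time points,
#     mean_m = S_mo S_oo^{-1} y_o,    Cov_m = S_mm - S_mo S_oo^{-1} S_om,
# which is the quantity Cov_m computes per sample when handling the NaN pattern above.
import numpy as np

rng = np.random.RandomState(4)
T_toy = 5
A_toy = rng.randn(T_toy, T_toy)
S_toy = A_toy.dot(A_toy.T) + T_toy * np.eye(T_toy)   # toy covariance over T time points
y_toy = rng.randn(T_toy)
missing = np.array([False, True, False, True, False])
observed = ~missing

S_oo = S_toy[np.ix_(observed, observed)]
S_mm = S_toy[np.ix_(missing, missing)]
S_mo = S_toy[np.ix_(missing, observed)]

iS_oo = np.linalg.inv(S_oo)
mean_m = S_mo.dot(iS_oo).dot(y_toy[observed])
Cov_m = S_mm - S_mo.dot(iS_oo).dot(S_mo.T)

print(mean_m)
print(np.diag(Cov_m))
# ---------------------------------------------------------------------------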