def bound(self):
    """
    Compute the lower bound on the marginal likelihood (conditioned on the
    GP hyper parameters).
    """
    GP_bound = 0.0

    for i, kern in enumerate(self.kern):
        K = kern.K(self.X)
        B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))

        # Make more stable using cholesky factorization:
        Bi, LB, LBi, Blogdet = pdinv(K + B_inv)

        # Data fit
        # alpha = linalg.cho_solve(linalg.cho_factor(K + B_inv), self.Y)
        # GP_bound += -0.5 * np.dot(self.Y.T, alpha).trace()
        GP_bound -= .5 * dpotrs(LB, self.YYT)[0].trace()

        # Penalty
        # GP_bound += -0.5 * np.linalg.slogdet(K + B_inv)[1]
        GP_bound -= 0.5 * Blogdet

        # Constant, weighted by model assignment per point
        # GP_bound += -0.5 * (self.phi[:, i] * np.log(2 * np.pi * self.variance)).sum()
        GP_bound -= .5 * self.D * np.einsum('j,j->', self.phi[:, i],
                                            np.log(2 * np.pi * self.variance))

    return GP_bound + self.mixing_prop_bound() + self.H

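
# --- Example (not from the source): the Cholesky-based data-fit term used in bound() ---
# A minimal sketch verifying that dpotrs(LB, YYT)[0].trace() computes
# trace((K + B_inv)^{-1} Y Y^T), with scipy's cho_solve standing in for GPy's dpotrs
# and a random SPD matrix standing in for K + B_inv.
import numpy as np
from scipy.linalg import cholesky, cho_solve

rng = np.random.default_rng(0)
N = 5
A = rng.standard_normal((N, N))
K_plus_Binv = A @ A.T + N * np.eye(N)   # stand-in for K + B_inv (symmetric positive definite)
Y = rng.standard_normal((N, 2))
YYT = Y @ Y.T

naive = -0.5 * np.trace(np.linalg.solve(K_plus_Binv, YYT))
LB = cholesky(K_plus_Binv, lower=True)
chol_route = -0.5 * np.trace(cho_solve((LB, True), YYT))
assert np.allclose(naive, chol_route)
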
def do_computations(self):
    """
    Here we do all the computations that are required whenever the kernels
    or the variational parameters are changed.
    """
    # sufficient stats.
    self.ybark = np.dot(self.phi.T, self.Y).T

    # compute posterior variances of each cluster (lambda_inv)
    tmp = backsub_both_sides(self.Sy_chol, self.Sf, transpose="right")
    self.Cs = [np.eye(self.D) + tmp * phi_hat_i for phi_hat_i in self.phi_hat]

    self._C_chols = [jitchol(C) for C in self.Cs]
    self.log_det_diff = np.array([2.0 * np.sum(np.log(np.diag(L))) for L in self._C_chols])
    tmp = [dtrtrs(L, self.Sy_chol.T, lower=1)[0] for L in self._C_chols]
    self.Lambda_inv = np.array([
        (self.Sy - np.dot(tmp_i.T, tmp_i)) / phi_hat_i if (phi_hat_i > 1e-6) else self.Sf
        for phi_hat_i, tmp_i in zip(self.phi_hat, tmp)
    ])

    # posterior mean and other useful quantities
    self.Syi_ybark, _ = dpotrs(self.Sy_chol, self.ybark, lower=1)
    self.Syi_ybarkybarkT_Syi = self.Syi_ybark.T[:, None, :] * self.Syi_ybark.T[:, :, None]
    self.muk = (self.Lambda_inv * self.Syi_ybark.T[:, :, None]).sum(1).T

def add_new_data_point(self, x, y):
    """
    Add a new function observation to the GP.

    Parameters
    ----------
    x: 2d-array
    y: 2d-array
    """
    x = np.atleast_2d(x)
    y = np.atleast_2d(y)

    if self.gp is None:
        # Initialize GP
        # inference_method = GPy.inference.latent_function_inference.\
        #     exact_gaussian_inference.ExactGaussianInference()
        self.gp = GPy.core.GP(X=x, Y=y,
                              kernel=self.kernel,
                              # inference_method=inference_method,
                              likelihood=self.likelihood)
    else:
        # Add data to GP
        # self.gp.set_XY(np.vstack([self.gp.X, x]),
        #                np.vstack([self.gp.Y, y]))

        # Add data row/col to kernel (a, b)
        # [ K    a ]
        # [ a.T  b ]
        #
        # Now K = L.dot(L.T)
        # The new Cholesky decomposition is then
        # L_new = [ L    0 ]
        #         [ c.T  d ]
        a = self.gp.kern.K(self.gp.X, x)
        b = self.gp.kern.K(x, x)

        b += 1e-8 + self.gp.likelihood.gaussian_variance(self.gp.Y_metadata)

        L = self.gp.posterior.woodbury_chol
        c = sp.linalg.solve_triangular(self.gp.posterior.woodbury_chol, a,
                                       lower=True)
        d = np.sqrt(b - c.T.dot(c))

        L_new = np.asfortranarray(np.bmat([[L, np.zeros_like(c)],
                                           [c.T, d]]))
        K_new = np.bmat([[self.gp.posterior._K, a],
                         [a.T, b]])

        self.gp.X = np.vstack((self.gp.X, x))
        self.gp.Y = np.vstack((self.gp.Y, y))

        alpha, _ = dpotrs(L_new, self.gp.Y, lower=1)
        self.gp.posterior = Posterior(woodbury_chol=L_new,
                                      woodbury_vector=alpha,
                                      K=K_new)

    # Increment time step
    self.t += 1

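
# --- Example (not from the source): the rank-one Cholesky extension used in add_new_data_point() ---
# A self-contained sketch, using numpy/scipy in place of the GPy helpers, showing that
# appending the row [c.T, d] to L reproduces the Cholesky factor of the extended matrix.
import numpy as np
from scipy.linalg import cholesky, solve_triangular

rng = np.random.default_rng(1)
n = 4
A = rng.standard_normal((n, n))
K = A @ A.T + n * np.eye(n)                                  # existing (noisy) kernel matrix
a = rng.standard_normal((n, 1))                              # cross-covariances k(X, x_new)
b = np.array([[float(a.T @ np.linalg.solve(K, a)) + 1.0]])   # k(x_new, x_new) + noise, kept PD

L = cholesky(K, lower=True)
c = solve_triangular(L, a, lower=True)
d = np.sqrt(b - c.T @ c)
L_new = np.block([[L, np.zeros_like(c)], [c.T, d]])

K_new = np.block([[K, a], [a.T, b]])
assert np.allclose(L_new, cholesky(K_new, lower=True))
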
def woodbury_inv(self):
    """
    The inverse of the woodbury matrix, in the gaussian likelihood case it is defined as
    $$
    (K_{xx} + \Sigma_{xx})^{-1}
    \Sigma_{xx} := \texttt{Likelihood.variance / Approximate likelihood covariance}
    $$
    """
    if self._woodbury_inv is None:
        if self._woodbury_chol is not None:
            self._woodbury_inv, _ = dpotri(self._woodbury_chol, lower=1)
            symmetrify(self._woodbury_inv)
        elif self._covariance is not None:
            B = np.atleast_3d(self._K) - np.atleast_3d(self._covariance)
            self._woodbury_inv = np.empty_like(B)
            for i in range(B.shape[-1]):
                tmp, _ = dpotrs(self.K_chol, B[:, :, i])
                self._woodbury_inv[:, :, i], _ = dpotrs(self.K_chol, tmp.T)
    return self._woodbury_inv

def _bias_loss(self, c):
    # calculate mean and norm for new bias via a new woodbury_vector
    new_woodbury_vector, _ = dpotrs(self._woodbury_chol, self._Y - c, lower=1)
    K = self.gp.kern.K(self.gp.X)
    mean = np.dot(K, new_woodbury_vector)
    norm = new_woodbury_vector.T.dot(mean)
    # loss is least_squares_error + norm
    # (.item() replaces np.asscalar, which was removed from recent numpy versions)
    return (np.sum(np.square(mean + c - self._Y)) + norm).item()

def woodbury_vector(self):
    """
    Woodbury vector in the gaussian likelihood case only is defined as
    $$
    (K_{xx} + \Sigma)^{-1}Y
    \Sigma := \texttt{Likelihood.variance / Approximate likelihood covariance}
    $$
    """
    if self._woodbury_vector is None:
        self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean - self._prior_mean)
    return self._woodbury_vector

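
# --- Example (not from the source): role of the woodbury vector in GP prediction ---
# A hypothetical sketch: with a Gaussian likelihood the woodbury vector is
# (K_xx + sigma^2 I)^{-1} y, and the predictive mean at new inputs is K_* times that vector.
import numpy as np

rng = np.random.default_rng(5)
X = np.linspace(0., 1., 8)[:, None]
Xs = np.linspace(0., 1., 3)[:, None]
y = np.sin(6. * X)
noise = 0.1

def rbf(A, B, ell=0.2):
    return np.exp(-0.5 * (A - B.T) ** 2 / ell ** 2)

K = rbf(X, X) + noise * np.eye(len(X))
woodbury_vector = np.linalg.solve(K, y)       # (K_xx + sigma^2 I)^{-1} y
mean_at_Xs = rbf(Xs, X) @ woodbury_vector     # predictive mean K_* w
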
def update_kern_grads(self):
    """
    Set the derivative of the lower bound wrt the (kernel) parameters
    """
    grad_Lm_variance = 0.0

    for i, kern in enumerate(self.kern):
        K = kern.K(self.X)
        B_inv = np.diag(1. / (self.phi[:, i] / self.variance))

        # Numerically more stable version using cholesky decomposition
        # alpha = linalg.cho_solve(linalg.cho_factor(K + B_inv), self.Y)
        # K_B_inv = pdinv(K + B_inv)[0]
        # dL_dK = .5*(tdot(alpha) - K_B_inv)

        # Make more stable using cholesky factorization:
        Bi, LB, LBi, Blogdet = pdinv(K + B_inv)
        tmp = dpotrs(LB, self.YYT)[0]
        GPy.util.diag.subtract(tmp, 1)
        dL_dB = dpotrs(LB, tmp.T)[0]

        kern.update_gradients_full(dL_dK=.5 * dL_dB, X=self.X)

        # variance gradient
        # for i, kern in enumerate(self.kern):
        K = kern.K(self.X)
        # I = np.eye(self.N)

        B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))
        # alpha = np.linalg.solve(K + B_inv, self.Y)
        # K_B_inv = pdinv(K + B_inv)[0]
        # dL_dB = tdot(alpha) - K_B_inv

        grad_B_inv = np.diag(1. / (self.phi[:, i] + 1e-6))

        grad_Lm_variance += 0.5 * np.trace(np.dot(dL_dB, grad_B_inv))
        grad_Lm_variance -= .5 * self.D * np.einsum('j,j->', self.phi[:, i], 1. / self.variance)

    self.variance.gradient = grad_Lm_variance

def calculate_mu_var(self, X, Y, Z, q_u_mean, q_u_chol, kern, mean_function,
                     num_inducing, num_data, num_outputs):
    """
    Calculate posterior mean and variance for the latent function values for
    use in the expectation over the likelihood
    """
    # expand cholesky representation
    L = choleskies.flat_to_triang(q_u_chol)
    # S = linalg.ijk_ljk_to_ilk(L, L)  # L.dot(L.T)
    S = np.empty((num_outputs, num_inducing, num_inducing))
    [np.dot(L[i, :, :], L[i, :, :].T, S[i, :, :]) for i in range(num_outputs)]
    # logdetS = np.array([2.*np.sum(np.log(np.abs(np.diag(L[:,:,i])))) for i in range(L.shape[-1])])
    logdetS = np.array([2. * np.sum(np.log(np.abs(np.diag(L[i, :, :])))) for i in range(L.shape[0])])

    # compute mean function stuff
    if mean_function is not None:
        prior_mean_u = mean_function.f(Z)
        prior_mean_f = mean_function.f(X)
    else:
        prior_mean_u = np.zeros((num_inducing, num_outputs))
        prior_mean_f = np.zeros((num_data, num_outputs))

    # compute kernel related stuff
    Kmm = kern.K(Z)
    # Knm = kern.K(X, Z)
    Kmn = kern.K(Z, X)
    Knn_diag = kern.Kdiag(X)
    # Kmmi, Lm, Lmi, logdetKmm = linalg.pdinv(Kmm)
    Lm = linalg.jitchol(Kmm)
    logdetKmm = 2. * np.sum(np.log(np.diag(Lm)))
    Kmmi, _ = linalg.dpotri(Lm)

    # compute the marginal means and variances of q(f)
    # A = np.dot(Knm, Kmmi)
    A, _ = linalg.dpotrs(Lm, Kmn)
    # mu = prior_mean_f + np.dot(A, q_u_mean - prior_mean_u)
    mu = prior_mean_f + np.dot(A.T, q_u_mean - prior_mean_u)
    # v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * linalg.ij_jlk_to_ilk(A, S), 1)
    v = np.empty((num_data, num_outputs))
    for i in range(num_outputs):
        tmp = dtrmm(1.0, L[i].T, A, lower=0, trans_a=0)
        v[:, i] = np.sum(np.square(tmp), 0)
    v += (Knn_diag - np.sum(A * Kmn, 0))[:, None]

    # compute the KL term
    Kmmim = np.dot(Kmmi, q_u_mean)
    # KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.einsum('ij,ijk->k', Kmmi, S) + 0.5*np.sum(q_u_mean*Kmmim,0)
    KLs = (-0.5 * logdetS - 0.5 * num_inducing + 0.5 * logdetKmm
           + 0.5 * np.sum(Kmmi[None, :, :] * S, 1).sum(1)
           + 0.5 * np.sum(q_u_mean * Kmmim, 0))
    KL = KLs.sum()

    latent_detail = LatentFunctionDetails(q_u_mean=q_u_mean, q_u_chol=q_u_chol,
                                          mean_function=mean_function,
                                          mu=mu, v=v,
                                          prior_mean_u=prior_mean_u,
                                          L=L, A=A, S=S, Kmm=Kmm, Kmmi=Kmmi,
                                          Kmmim=Kmmim, KL=KL)
    return latent_detail

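
# --- Example (not from the source): the dtrmm trick used for the marginal variances above ---
# diag(A^T S A) with S = L L^T equals the column sums of (L^T A)**2, so the N x N
# matrix A^T S A never needs to be formed. A small numpy check of that identity:
import numpy as np

rng = np.random.default_rng(6)
M, N = 4, 7
L = np.tril(rng.standard_normal((M, M))) + 2 * np.eye(M)   # a Cholesky-like lower factor
S = L @ L.T
A = rng.standard_normal((M, N))

tmp = L.T @ A                                # what dtrmm(1.0, L.T, A) computes
fast_diag = np.sum(tmp ** 2, axis=0)
assert np.allclose(fast_diag, np.diag(A.T @ S @ A))
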
def grad_log_like(self, theta):
    '''
    Function to calculate the gradient of the cost (negative log-marginal
    likelihood) with respect to the kernel hyperparameters

    Args:
        (array) theta: the kernel hyperparameters in the correct order

    Returns:
        (array) gradient: vector of the gradient
    '''
    # the kernel hyperparameters
    theta = theta.flatten()

    # amplitude
    self.width = theta[0]

    # characteristic lengthscales
    self.scale = theta[1:]

    # Number of parameters
    n_params = len(theta)

    # empty array to record the gradient
    gradient = np.zeros(n_params)

    # compute alpha
    alpha_ = self.alpha()

    # compute k^-1 via triangular method
    kinv = gpl.dpotrs(self.chol_fact, np.eye(self.ntrain), lower=True)[0]

    # see expression for gradient
    dummy = np.einsum('i,j', alpha_.flatten(), alpha_.flatten()) - kinv

    # Gradient calculation with respect to hyperparameters (hard-coded)
    grad = {}
    k_rbf = self.rbf('trainSet', self.theta_, self.theta_)
    grad['0'] = 2.0 * k_rbf

    for i in range(self.ndim):
        dist_ = distanceperdim(self.theta_[:, i], self.theta_[:, i])
        grad[str(i + 1)] = k_rbf * dist_ / np.exp(2.0 * self.scale[i])

    for i in range(n_params):
        gradient[i] = 0.5 * gpl.trace_dot(dummy, grad[str(i)])

    return -gradient

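
# --- Example (not from the source): the gradient identity used in grad_log_like() ---
# For the negative log marginal likelihood, dNLL/dtheta = -0.5 * tr((alpha alpha^T - K^{-1}) dK/dtheta).
# A hypothetical one-hyperparameter check (RBF kernel, gradient w.r.t. the log lengthscale),
# verified against a central finite difference.
import numpy as np

rng = np.random.default_rng(2)
X = rng.standard_normal((6, 1))
y = rng.standard_normal((6, 1))
d2 = (X - X.T) ** 2

def K_of(log_ell):
    return np.exp(-0.5 * d2 / np.exp(2.0 * log_ell)) + 1e-6 * np.eye(len(X))

def nll(log_ell):
    K = K_of(log_ell)
    L = np.linalg.cholesky(K)
    alpha = np.linalg.solve(K, y)
    return 0.5 * (y.T @ alpha).item() + np.log(np.diag(L)).sum() + 0.5 * len(X) * np.log(2 * np.pi)

log_ell = 0.3
K = K_of(log_ell)
Kinv = np.linalg.inv(K)
alpha = Kinv @ y
dK = (K - 1e-6 * np.eye(len(X))) * d2 / np.exp(2.0 * log_ell)   # dK/d(log ell); jitter is constant
analytic = -0.5 * np.trace((alpha @ alpha.T - Kinv) @ dK)
numeric = (nll(log_ell + 1e-5) - nll(log_ell - 1e-5)) / 2e-5
assert np.allclose(analytic, numeric, rtol=1e-4)
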
def compute_dl_dK(posterior, K, eta, theta, prior_mean=0):
    tau, v = theta, eta
    tau_tilde_root = np.sqrt(tau)
    Sroot_tilde_K = tau_tilde_root[:, None] * K
    aux_alpha, _ = dpotrs(posterior.L, np.dot(Sroot_tilde_K, v), lower=1)
    alpha = (v - tau_tilde_root * aux_alpha)[:, None]  # (K + Sigma^(\tilde))^(-1) \mu^(\tilde)
    LWi, _ = dtrtrs(posterior.L, np.diag(tau_tilde_root), lower=1)
    Wi = np.dot(LWi.T, LWi)
    symmetrify(Wi)  # (K + Sigma^(\tilde))^(-1)

    dL_dK = 0.5 * (tdot(alpha) - Wi)
    return dL_dK

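
# --- Example (not from the source): the EP identity behind Wi above ---
# A sketch assuming, as in standard EP implementations, that posterior.L is the Cholesky
# factor of B = I + S^{1/2} K S^{1/2} with S = diag(tau_tilde). Then
# LWi = L^{-1} S^{1/2} gives Wi = LWi^T LWi = (K + diag(1/tau_tilde))^{-1}.
import numpy as np
from scipy.linalg import cholesky, solve_triangular

rng = np.random.default_rng(7)
n = 5
A = rng.standard_normal((n, n))
K = A @ A.T + n * np.eye(n)
tau = rng.uniform(0.5, 2.0, n)            # site precisions tau_tilde
sqrt_tau = np.sqrt(tau)

B = np.eye(n) + (sqrt_tau[:, None] * K) * sqrt_tau[None, :]
L = cholesky(B, lower=True)

LWi = solve_triangular(L, np.diag(sqrt_tau), lower=True)
Wi = LWi.T @ LWi
assert np.allclose(Wi, np.linalg.inv(K + np.diag(1.0 / tau)))
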
def variational_q_fd(self, X, Z, q_U, p_U, kern_list, B, N, dims, d):
    """
    Description: Returns the posterior approximation q(f) for the latent
                 output functions (LOFs)
    Equation:    q(f) = \int p(f|u)q(u)du
    Paper:       In Section 2.2.2 / Variational Bounds
    """
    Q = dims['Q']
    M = dims['M']

    # ---------------------------------# POSTERIOR ALGEBRA #---------------------------------#

    ####### Algebra for q(u) #######
    m_u = q_U.mu_u.copy()
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    S_u = np.empty((Q, M, M))
    [np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]

    ####### Algebra for p(f_d|u) #######
    Kfdu = multi_output.cross_covariance(X, Z, B, kern_list, d)
    Luu = p_U.Luu.copy()
    Kff = multi_output.function_covariance(X, B, kern_list, d)
    Kff_diag = np.diag(Kff)

    ####### Algebra for q(f_d) = E_{q(u)}[p(f_d|u)] #######
    Afdu = np.empty((Q, N, M))  # Afdu = K_{fduq}Ki_{uquq}
    m_fd = np.zeros((N, 1))
    v_fd = np.zeros((N, 1))
    S_fd = np.zeros((N, N))
    v_fd += Kff_diag[:, None]
    S_fd += Kff
    for q in range(Q):
        ####### Expectation w.r.t. u_q part #######
        R, _ = linalg.dpotrs(np.asfortranarray(Luu[q, :, :]), Kfdu[:, q * M:(q * M) + M].T)
        Afdu[q, :, :] = R.T
        m_fd += np.dot(Afdu[q, :, :], m_u[:, q, None])  # exp
        tmp = dtrmm(alpha=1.0, a=L_u[q, :, :].T, b=R, lower=0, trans_a=0)
        v_fd += np.sum(np.square(tmp), 0)[:, None] - np.sum(R * Kfdu[:, q * M:(q * M) + M].T, 0)[:, None]  # exp
        S_fd += np.dot(np.dot(R.T, S_u[q, :, :]), R) - np.dot(Kfdu[:, q * M:(q * M) + M], R)

    if (v_fd < 0).any():
        print('v negative!')

    # ------------------------# VARIATIONAL POSTERIOR (LOFs) #------------------------#

    ####### Variational output distribution q_fd() #######
    q_fd = qfd(m_fd=m_fd, v_fd=v_fd, Kfdu=Kfdu, Afdu=Afdu, S_fd=S_fd)
    return q_fd

def woodbury_chol(self):
    """
    return $L_{W}$ where L is the lower triangular Cholesky decomposition of the Woodbury matrix
    $$
    L_{W}L_{W}^{\top} = W^{-1}
    W^{-1} := \texttt{Woodbury inv}
    $$
    """
    if self._woodbury_chol is None:
        # compute woodbury chol from
        if self._woodbury_inv is not None:
            winv = np.atleast_3d(self._woodbury_inv)
            self._woodbury_chol = np.zeros(winv.shape)
            for p in range(winv.shape[-1]):
                self._woodbury_chol[:, :, p] = pdinv(winv[:, :, p])[2]
        elif self._covariance is not None:
            raise NotImplementedError("TODO: check code here")
            B = self._K - self._covariance
            tmp, _ = dpotrs(self.K_chol, B)
            self._woodbury_inv, _ = dpotrs(self.K_chol, tmp.T)
            _, _, self._woodbury_chol, _ = pdinv(self._woodbury_inv)
        else:
            raise ValueError("insufficient information to compute posterior")
    return self._woodbury_chol

def _inference(
    K: np.ndarray,
    ga_approx: GaussianApproximation,
    cav_params: CavityParams,
    Z_tilde: float,
    y: List[Tuple[int, float]],
    yc: List[List[Tuple[int, int]]],
) -> Tuple[Posterior, int, Dict]:
    """
    Compute the posterior approximation

    :param K: prior covariance matrix
    :param ga_approx: Gaussian approximation of the batches
    :param cav_params: Cavity parameters of the posterior
    :param Z_tilde: Log marginal likelihood
    :param y: Direct observations as a list of tuples telling location index (row in X) and observation value.
    :param yc: Batch comparisons in a list of lists of tuples. Each batch is a list and tuples tell the comparisons (winner index, loser index)
    :return: A tuple consisting of the posterior approximation, log marginal likelihood and gradient dictionary
    """
    log_marginal, post_params = _ep_marginal(K, ga_approx, Z_tilde, y, yc)
    tau_tilde_root = sqrtm_block(ga_approx.tau, y, yc)
    Sroot_tilde_K = np.dot(tau_tilde_root, K)
    aux_alpha, _ = dpotrs(post_params.L, np.dot(Sroot_tilde_K, ga_approx.v), lower=1)
    alpha = (ga_approx.v - np.dot(tau_tilde_root, aux_alpha))[:, None]  # (K + Sigma^(\tilde))^(-1) \mu^(\tilde)
    LWi, _ = dtrtrs(post_params.L, tau_tilde_root, lower=1)

    Wi = np.dot(LWi.T, LWi)
    symmetrify(Wi)  # (K + Sigma^(\tilde))^(-1)

    dL_dK = 0.5 * (tdot(alpha) - Wi)
    dL_dthetaL = 0
    return (
        Posterior(woodbury_inv=np.asfortranarray(Wi), woodbury_vector=alpha, K=K),
        log_marginal,
        {"dL_dK": dL_dK, "dL_dthetaL": dL_dthetaL, "dL_dm": alpha},
    )

def prediction(self, testpoint, returnvar=True):
    '''
    Function to make predictions given a test point

    Args:
        (array) testpoint: a test point of length ndim
        (bool) returnvar: If True, the GP variance will be computed

    Returns:
        (array) mean, var: if returnvar=True
        (array) mean: if returnvar=False
    '''
    # use numpy array instead of list (if any)
    testpoint = np.array(testpoint).flatten()
    assert len(testpoint) == self.ndim, 'different dimension'

    # transform point first
    testpoint_trans = np.dot(self.mu_matrix, testpoint)
    testpoint_trans = testpoint_trans.reshape(1, self.ndim)

    # compute the k_star vector
    k_s = self.kernel('trainTest', self.theta_, testpoint_trans)

    # compute mean GP - super quick
    mean_gp = np.array([(k_s.flatten() * self.alpha_.flatten()).sum(0)])

    # rescale back
    mean_scaled = self.mean_y + self.std_y * mean_gp

    # do extra computations if we want GP variance
    if returnvar:
        variance = gpl.dpotrs(self.chol_fact, k_s, lower=True)[0].flatten()
        k_ss = self.kernel('testSet', testpoint_trans, testpoint_trans)
        var_gp = k_ss - (k_s.flatten() * variance).sum(0)
        var_gp = var_gp.flatten()

        # rescale back
        var = self.std_y**2 * var_gp
        return mean_scaled, var

    return mean_scaled

def prediction(self, testPoint, returnVar=True):
    '''
    Function to make predictions given a test point

    Args:
        (array) testPoint: a test point of length ndim
        (bool) returnVar: If True, the GP variance will be computed

    Returns:
        (array) mean, var: if returnVar=True
        (array) mean: if returnVar=False
    '''
    # use numpy array instead of list (if any)
    testPoint = np.array(testPoint).flatten()
    assert len(testPoint) == self.ndim, 'Different dimension'

    # transform point first
    testPoint_trans = np.dot(self.MU, testPoint)
    testPoint_trans = testPoint_trans.reshape(1, self.d)

    # compute the k_star vector
    ks = self.kernel('trainTest', self.theta_, testPoint_trans)

    # compute mean GP - super quick
    meanGP = np.array([(ks.flatten() * self.alpha_.flatten()).sum(0)])

    # rescale back
    mu = self.mean_y + self.std_y * meanGP

    # do extra computations if we want GP variance
    if returnVar:
        v = gpl.dpotrs(self.L, ks, lower=True)[0].flatten()
        kss = self.kernel('testSet', testPoint_trans, testPoint_trans)
        varGP = kss - (ks.flatten() * v).sum(0)
        varGP = varGP.flatten()

        # rescale back
        var = self.std_y**2 * varGP
        return mu, var
    else:
        return mu

def _inference(K, ga_approx, cav_params, likelihood, Z_tilde, Y_metadata=None):
    log_marginal, post_params = _ep_marginal(K, ga_approx, Z_tilde)

    tau_tilde_root = np.sqrt(ga_approx.tau)
    Sroot_tilde_K = tau_tilde_root[:, None] * K

    aux_alpha, _ = dpotrs(post_params.L, np.dot(Sroot_tilde_K, ga_approx.v), lower=1)
    alpha = (ga_approx.v - tau_tilde_root * aux_alpha)[:, None]  # (K + Sigma^(\tilde))^(-1) \mu^(\tilde)

    LWi, _ = dtrtrs(post_params.L, np.diag(tau_tilde_root), lower=1)
    Wi = np.dot(LWi.T, LWi)
    symmetrify(Wi)  # (K + Sigma^(\tilde))^(-1)

    dL_dK = 0.5 * (tdot(alpha) - Wi)
    dL_dthetaL = 0
    # likelihood.ep_gradients(Y, cav_params.tau, cav_params.v, np.diag(dL_dK), Y_metadata=Y_metadata, quad_mode='gh')
    # temp2 = likelihood.ep_gradients(Y, cav_params.tau, cav_params.v, np.diag(dL_dK), Y_metadata=Y_metadata, quad_mode='naive')
    # temp = likelihood.exact_inference_gradients(np.diag(dL_dK), Y_metadata=Y_metadata)
    # print("exact: {}, approx: {}, Ztilde: {}, naive: {}".format(temp, dL_dthetaL, Z_tilde, temp2))
    return (Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal,
            {'dL_dK': dL_dK, 'dL_dthetaL': dL_dthetaL, 'dL_dm': alpha})

def _log_likelihood(self, log_params):
    # Returns log likelihood, p(D|hyperparams)
    params = np.exp(log_params)
    l_scales = params[0:self.X_dim]
    output_var = params[self.X_dim]  # QUESTION: difference between output and noise variance
    noise_var = params[self.X_dim + 1]

    # compute eta
    eta = np.min(self.Y) - params[self.X_dim + 2]  # QUESTION: what is this?

    # compute the observed value for g instead of y
    g_ob = np.sqrt(2.0 * (self.Y - eta))

    kernel = GPy.kern.RBF(input_dim=self.X_dim, ARD=True, variance=output_var,
                          lengthscale=l_scales)
    Kng = kernel.K(self.X)

    # QUESTION: does not seem to follow conditional variance form in eqn 6
    # compute posterior mean distribution for g  TODO update this
    # GPg = GPy.models.GPRegression(self.X, g_ob, kernel, noise_var=1e-8)
    # mg, _ = GPg.predict(self.X)
    mg = g_ob

    # approximate covariance matrix of y using linearisation technique
    Kny = mg * Kng * mg.T + (noise_var + 1e-8) * np.eye(Kng.shape[0])

    # compute likelihood terms
    Wi, LW, LWi, W_logdet = pdinv(Kny)  # from GPy module
    # Wi = inverse of Kny (ndarray)
    # LW = Cholesky factor of Kny (ndarray)
    # LWi = inverse of the Cholesky factor of Kny (ndarray)
    # W_logdet = log determinant of Kny (float)
    alpha, _ = dpotrs(LW, self.Y, lower=1)

    # Log marginal likelihood for GP, based on Rasmussen eqn 2.30
    loglikelihood = 0.5 * (-self.Y.size * np.log(2 * np.pi)
                           - self.Y.shape[1] * W_logdet
                           - np.sum(alpha * self.Y))

    return loglikelihood

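
# --- Example (not from the source): the broadcasting used for the linearised covariance above ---
# With mg of shape (n, 1), the expression mg * Kng * mg.T scales row i by mg[i] and
# column j by mg[j], i.e. it equals diag(mg) @ Kng @ diag(mg). A small check:
import numpy as np

rng = np.random.default_rng(8)
n = 4
Kng = rng.standard_normal((n, n))
mg = rng.standard_normal((n, 1))

assert np.allclose(mg * Kng * mg.T, np.diag(mg.ravel()) @ Kng @ np.diag(mg.ravel()))
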
def calculate_q_f(self, X, Z, q_U, p_U, kern_list, B, M, N, Q, D, d):
    """
    Calculates the mean and variance of q(f_d) as
    Equation: E_q(U)\{p(f_d|U)\}
    """
    # Algebra for q(u):
    m_u = q_U.mu_u.copy()
    L_u = choleskies.flat_to_triang(q_U.chols_u.copy())
    S_u = np.empty((Q, M, M))
    [np.dot(L_u[q, :, :], L_u[q, :, :].T, S_u[q, :, :]) for q in range(Q)]

    # Algebra for p(f_d|u):
    Kfdu = util.cross_covariance(X, Z, B, kern_list, d)
    Kuu = p_U.Kuu.copy()
    Luu = p_U.Luu.copy()
    Kuui = p_U.Kuui.copy()
    Kff = util.function_covariance(X, B, kern_list, d)
    Kff_diag = np.diag(Kff)

    # Algebra for q(f_d) = E_{q(u)}[p(f_d|u)]
    Afdu = np.empty((Q, N, M))  # Afdu = K_{fduq}Ki_{uquq}
    m_fd = np.zeros((N, 1))
    v_fd = np.zeros((N, 1))
    S_fd = np.zeros((N, N))
    v_fd += Kff_diag[:, None]
    S_fd += Kff
    for q in range(Q):
        # Expectation part
        R, _ = linalg.dpotrs(np.asfortranarray(Luu[q, :, :]), Kfdu[:, q * M:(q * M) + M].T)
        Afdu[q, :, :] = R.T
        m_fd += np.dot(Afdu[q, :, :], m_u[:, q, None])  # exp
        tmp = dtrmm(alpha=1.0, a=L_u[q, :, :].T, b=R, lower=0, trans_a=0)
        v_fd += np.sum(np.square(tmp), 0)[:, None] - np.sum(R * Kfdu[:, q * M:(q * M) + M].T, 0)[:, None]  # exp
        S_fd += np.dot(np.dot(R.T, S_u[q, :, :]), R) - np.dot(Kfdu[:, q * M:(q * M) + M], R)

    if (v_fd < 0).any():
        print('v negative!')

    q_fd = qfd(m_fd=m_fd, v_fd=v_fd, Kfdu=Kfdu, Afdu=Afdu, S_fd=S_fd)
    return q_fd

def alpha(self):
    '''
    Function to compute alpha = k^-1 y

    Args:
        None

    Returns:
        (array) alpha of size N x 1
    '''
    # compute the kernel matrix of size N x N
    k = self.kernel('trainSet', self.theta_, self.theta_)

    # compute the Cholesky factor
    self.chol_fact = gpl.jitchol(k)

    # Use triangular method to solve for alpha
    alp = gpl.dpotrs(self.chol_fact, self.output, lower=True)[0]

    return alp

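
# --- Example (not from the source): computing alpha = K^{-1} y through a Cholesky factor ---
# A minimal sketch of the pattern used in alpha() above, with scipy's cho_factor/cho_solve
# standing in for gpl.jitchol and gpl.dpotrs.
import numpy as np
from scipy.linalg import cho_factor, cho_solve

rng = np.random.default_rng(3)
A = rng.standard_normal((5, 5))
K = A @ A.T + 5 * np.eye(5)               # stand-in for the kernel matrix (SPD)
y = rng.standard_normal((5, 1))

alpha = cho_solve(cho_factor(K, lower=True), y)
assert np.allclose(alpha, np.linalg.solve(K, y))
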
def vb_grad_natgrad(self):
    """
    Natural Gradients of the bound with respect to phi, the variational
    parameters controlling assignment of the data to GPs
    """
    grad_Lm = np.zeros_like(self.phi)
    for i, kern in enumerate(self.kern):
        K = kern.K(self.X)
        I = np.eye(self.N)

        B_inv = np.diag(1. / ((self.phi[:, i] + 1e-6) / self.variance))
        K_B_inv, L_B, _, _ = pdinv(K + B_inv)
        alpha, _ = dpotrs(L_B, self.Y)
        dL_dB_diag = np.sum(np.square(alpha), 1) - np.diag(K_B_inv)

        grad_Lm[:, i] = -0.5 * self.variance * dL_dB_diag / (self.phi[:, i]**2 + 1e-6)

    grad_phi = grad_Lm + self.mixing_prop_bound_grad() + self.Hgrad

    natgrad = grad_phi - np.sum(self.phi * grad_phi, 1)[:, None]
    grad = natgrad * self.phi

    return grad.flatten(), natgrad.flatten()

def loglikelihood(self, parameters):
    self.assign_parameters(parameters)
    cosmo_ = self.cosmo_params.values()
    a_ia = self.systematics['A_IA']
    a_bary = self.systematics['A_bary']

    # careful here - we have to supply sum of neutrinos, that is,
    # parameters[-1], not 'm_ncdm'
    neut = parameters[-1]

    testpoint = np.concatenate([
        list(cosmo_),
        np.ones(1) * a_bary,
        np.ones(1) * neut,
        np.ones(1) * a_ia
    ])

    index_ee = self.all_bands_ee_to_use == 1
    index_bb = self.all_bands_bb_to_use == 1

    if self.set_random:
        if self.n_realisation == 1:
            cl_ee_total = self.random_sample(testpoint).flatten()
        else:
            cl_ee_total = self.random_sample(testpoint)
    else:
        cl_ee_total = self.mean_prediction(testpoint)

    param_name = 'm_corr'
    if param_name in self.settings.use_nuisance:
        m_m, m_c = self.calc_m_correction()

        # apply the multiplicative correction, then select the bands to use
        covariance = self.covariance / np.asarray(m_c)
        covariance = covariance[np.ix_(self.indices_for_bands_to_use,
                                       self.indices_for_bands_to_use)]

        band_powers = self.band_powers / np.asarray(m_m)
        band_powers = band_powers[self.indices_for_bands_to_use]
    else:
        band_powers = self.band_powers
        covariance = self.covariance

    cl_sys_bb, cl_sys_ee_noise, cl_sys_bb_noise = self.systematics_calc()

    theory_ee = cl_ee_total + cl_sys_ee_noise[index_ee]
    theory_bb = cl_sys_bb[index_bb] + cl_sys_bb_noise[index_bb]

    if (self.set_random and self.n_realisation > 1):
        theory_bb_nr = np.repeat(theory_bb.reshape(1, len(theory_bb)),
                                 self.n_realisation, axis=0)
        band_powers_theory = np.concatenate((theory_ee, theory_bb_nr), axis=1)
        difference_vector = band_powers_theory - band_powers
    else:
        band_powers_theory = np.concatenate((theory_ee, theory_bb))
        difference_vector = band_powers_theory - band_powers

    if np.isinf(band_powers_theory).any() or np.isnan(band_powers_theory).any():
        return -1E32

    elif param_name in self.settings.use_nuisance:
        # use a Cholesky decomposition instead:
        chol_fact = cholesky(covariance, lower=True)

        if (self.set_random and self.n_realisation > 1):
            cinv = gpl.dpotrs(chol_fact, np.eye(chol_fact.shape[0]), lower=True)[0]
            cinv_diff = np.dot(cinv, difference_vector.T)
            chi2 = np.einsum('ij,ij->j', difference_vector.T, cinv_diff)
            return logsumexp(-0.5 * chi2) - np.log(self.n_realisation)
        else:
            yt = solve_triangular(chol_fact, difference_vector.T, lower=True)
            chi2 = yt.dot(yt)
            return -0.5 * chi2

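
# --- Example (not from the source): the two chi-squared routes used in loglikelihood() ---
# A sketch showing that the triangular-solve route (yt = L^{-1} d, chi2 = yt.yt) and the
# explicit-inverse route give the same chi-squared value for an SPD covariance.
import numpy as np
from scipy.linalg import cholesky, solve_triangular

rng = np.random.default_rng(4)
A = rng.standard_normal((6, 6))
cov = A @ A.T + 6 * np.eye(6)
diff = rng.standard_normal(6)

chol = cholesky(cov, lower=True)
yt = solve_triangular(chol, diff, lower=True)
chi2_triangular = yt @ yt

chi2_direct = diff @ np.linalg.solve(cov, diff)
assert np.allclose(chi2_triangular, chi2_direct)
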
def incremental_inference(self, kern, X, likelihood, Y, mean_function=None,
                          Y_metadata=None, K=None, variance=None, Z_tilde=None):
    # do incremental update
    if mean_function is None:
        m = 0
    else:
        m = mean_function.f(X)

    if variance is None:
        variance = likelihood.gaussian_variance(Y_metadata)

    YYT_factor = Y - m

    # K_tmp = kern.K(X, X[-1:])
    K_inc = kern._K[:-1, -1]
    K_inc2 = kern._K[-1:, -1]
    # self._K = np.block([[self._K, K_inc], [K_inc.T, K_inc2]])

    # Ky = K.copy()
    # variance can be given for each point individually, in which case we just take the last point
    jitter = variance[-1] + 1e-8
    # diag.add(Ky, jitter)

    # LW_old = self._old_posterior.woodbury_chol
    Wi, LW, LWi, W_logdet = pdinv_inc(self._old_LW, K_inc, K_inc2 + jitter, self._old_Wi)

    alpha, _ = dpotrs(LW, YYT_factor, lower=1)

    log_marginal = 0.5 * (-Y.size * log_2_pi - Y.shape[1] * W_logdet - np.sum(alpha * YYT_factor))

    if Z_tilde is not None:
        # This is a correction term for the log marginal likelihood
        # In EP this is log Z_tilde, which is the difference between the
        # Gaussian marginal and Z_EP
        log_marginal += Z_tilde

    dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi)

    dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK), Y_metadata)

    self._old_LW = LW
    self._old_Wi = Wi

    posterior = Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K)
    # TODO add logdet to posterior?

    return posterior, log_marginal, {'dL_dK': dL_dK,
                                     'dL_dthetaL': dL_dthetaL,
                                     'dL_dm': alpha}

def bifurcation_statistics(omgp_gene, expression_matrix):
    ''' Given an OMGP model and an expression matrix, evaluate how well
    every gene fits the model.
    '''
    bif_stats = pd.DataFrame(index=expression_matrix.index)
    bif_stats['bif_ll'] = np.nan
    bif_stats['amb_ll'] = np.nan
    bif_stats['shuff_bif_ll'] = np.nan
    bif_stats['shuff_amb_ll'] = np.nan

    # Make a "copy" of provided OMGP but assign ambiguous mixture parameters
    omgp_gene_a = OMGP(omgp_gene.X, omgp_gene.Y,
                       K=omgp_gene.K,
                       kernels=[k.copy() for k in omgp_gene.kern],
                       prior_Z=omgp_gene.prior_Z,
                       variance=float(omgp_gene.variance))

    omgp_gene_a.phi = np.ones_like(omgp_gene.phi) * 1. / omgp_gene.K

    # To control FDR, perform the same likelihood calculation, but with permuted X values
    shuff_X = np.array(omgp_gene.X).copy()
    np.random.shuffle(shuff_X)

    omgp_gene_shuff = OMGP(shuff_X, omgp_gene.Y,
                           K=omgp_gene.K,
                           kernels=[k.copy() for k in omgp_gene.kern],
                           prior_Z=omgp_gene.prior_Z,
                           variance=float(omgp_gene.variance))

    omgp_gene_shuff.phi = omgp_gene.phi

    omgp_gene_shuff_a = OMGP(shuff_X, omgp_gene.Y,
                             K=omgp_gene.K,
                             kernels=[k.copy() for k in omgp_gene.kern],
                             prior_Z=omgp_gene.prior_Z,
                             variance=float(omgp_gene.variance))

    omgp_gene_shuff_a.phi = np.ones_like(omgp_gene.phi) * 1. / omgp_gene.K

    # Precalculate response-variable independent parts
    omgps = [omgp_gene, omgp_gene_a, omgp_gene_shuff, omgp_gene_shuff_a]
    column_list = ['bif_ll', 'amb_ll', 'shuff_bif_ll', 'shuff_amb_ll']
    precalcs = [omgp_model_bound(omgp) for omgp in omgps]

    # Calculate the likelihoods of the models for every gene
    for gene in tqdm(expression_matrix.index):
        # .loc replaces the DataFrame.ix accessor, which was removed from pandas
        Y = expression_matrix.loc[gene]
        YYT = np.outer(Y, Y)

        for precalc, column in zip(precalcs, column_list):
            model_bound, LBs = precalc
            GP_data_fit = 0.
            for LB in LBs:
                GP_data_fit -= .5 * dpotrs(LB, YYT)[0].trace()

            bif_stats.loc[gene, column] = model_bound + GP_data_fit

    bif_stats['phi0_corr'] = expression_matrix.corrwith(
        pd.Series(omgp_gene.phi[:, 0], index=expression_matrix.columns), 1)
    bif_stats['D'] = bif_stats['bif_ll'] - bif_stats['amb_ll']
    bif_stats['shuff_D'] = bif_stats['shuff_bif_ll'] - bif_stats['shuff_amb_ll']

    return bif_stats

def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None,
              dL_dKmm=None, fixed_covs_kerns=None, **kw):

    _, output_dim = Y.shape
    uncertain_inputs = isinstance(X, VariationalPosterior)

    # see whether we've got a different noise variance for each datum
    beta = 1. / np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)
    # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
    # self.YYTfactor = self.get_YYTfactor(Y)
    # VVT_factor = self.get_VVTfactor(self.YYTfactor, beta)
    het_noise = beta.size > 1
    if het_noise:
        raise NotImplementedError("Heteroscedastic noise not implemented, "
                                  "should be possible though, feel free to try implementing it :)")
    if beta.ndim == 1:
        beta = beta[:, None]

    # do the inference:
    num_inducing = Z.shape[0]
    num_data = Y.shape[0]

    # kernel computations, using BGPLVM notation
    Kmm = kern.K(Z).copy()
    diag.add(Kmm, self.const_jitter)
    if Lm is None:
        Lm = jitchol(Kmm)

    # The rather complex computations of A, and the psi stats
    if uncertain_inputs:
        psi0 = kern.psi0(Z, X)
        psi1 = kern.psi1(Z, X)
        if het_noise:
            psi2_beta = np.sum([kern.psi2(Z, X[i:i + 1, :]) * beta_i
                                for i, beta_i in enumerate(beta)], 0)
        else:
            psi2_beta = kern.psi2(Z, X) * beta
        LmInv = dtrtri(Lm)
        A = LmInv.dot(psi2_beta.dot(LmInv.T))
    else:
        psi0 = kern.Kdiag(X)
        psi1 = kern.K(X, Z)
        if het_noise:
            tmp = psi1 * (np.sqrt(beta))
        else:
            tmp = psi1 * (np.sqrt(beta))
        tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
        A = tdot(tmp)

    # factor B
    B = np.eye(num_inducing) + A
    LB = jitchol(B)

    # back substitute C into psi1Vf
    # tmp, _ = dtrtrs(Lm, psi1.T.dot(VVT_factor), lower=1, trans=0)
    # _LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
    # tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
    # Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

    # data fit and derivative of L w.r.t. Kmm
    # delit = tdot(_LBi_Lmi_psi1Vf)

    # Expose YYT to get additional covariates in (YYT + Kgg):
    tmp, _ = dtrtrs(Lm, psi1.T, lower=1, trans=0)
    _LBi_Lmi_psi1, _ = dtrtrs(LB, tmp, lower=1, trans=0)
    tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1, lower=1, trans=1)
    Cpsi1, _ = dtrtrs(Lm, tmp, lower=1, trans=1)

    # TODO: cache this:
    # Compute fixed covariates covariance:
    if fixed_covs_kerns is not None:
        K_fixed = 0
        for name, [cov, k] in fixed_covs_kerns.items():  # .items() for Python 3 (was .iteritems())
            K_fixed += k.K(cov)

        # trYYT = self.get_trYYT(Y)
        YYT_covs = (tdot(Y) + K_fixed)
        data_term = beta**2 * YYT_covs
        trYYT_covs = np.trace(YYT_covs)
    else:
        data_term = beta**2 * tdot(Y)
        trYYT_covs = self.get_trYYT(Y)

    # trYYT = self.get_trYYT(Y)
    delit = mdot(_LBi_Lmi_psi1, data_term, _LBi_Lmi_psi1.T)
    data_fit = np.trace(delit)

    DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
    if dL_dKmm is None:
        delit = -0.5 * DBi_plus_BiPBi
        delit += -0.5 * B * output_dim
        delit += output_dim * np.eye(num_inducing)
        # Compute dL_dKmm
        dL_dKmm = backsub_both_sides(Lm, delit)

    # derivatives of L w.r.t. psi
    dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim,
                                                    beta, Lm, data_term, Cpsi1,
                                                    DBi_plus_BiPBi, psi1,
                                                    het_noise, uncertain_inputs)

    # log marginal likelihood
    log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim,
                                                    beta, het_noise, psi0, A, LB,
                                                    trYYT_covs, data_fit, Y)

    if self.save_per_dim:
        self.saved_vals = [psi0, A, LB, _LBi_Lmi_psi1, beta]

    # No heteroscedastics, so no _LBi_Lmi_psi1Vf:
    # For the interested reader, try implementing the heteroscedastic version, it should be possible
    _LBi_Lmi_psi1Vf = None  # Is just here for documentation, so you can see what it was.
    # noise derivatives
    dL_dR = _compute_dL_dR(likelihood,
                           het_noise, uncertain_inputs, LB,
                           _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A,
                           psi0, psi1, beta,
                           data_fit, num_data, output_dim, trYYT_covs, Y, None)

    dL_dthetaL = likelihood.exact_inference_gradients(dL_dR, Y_metadata)

    # put the gradients in the right places
    if uncertain_inputs:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dpsi0': dL_dpsi0,
                     'dL_dpsi1': dL_dpsi1,
                     'dL_dpsi2': dL_dpsi2,
                     'dL_dthetaL': dL_dthetaL}
    else:
        grad_dict = {'dL_dKmm': dL_dKmm,
                     'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL}

    if fixed_covs_kerns is not None:
        # For now, we do not take the gradients, we can compute them,
        # but the maximum likelihood solution is to switch off the additional covariates....
        dL_dcovs = beta * np.eye(K_fixed.shape[0]) - beta**2 * tdot(_LBi_Lmi_psi1.T)
        grad_dict['dL_dcovs'] = -.5 * dL_dcovs

    # get sufficient things for posterior prediction
    # TODO: do we really want to do this in the loop?
    if 1:
        woodbury_vector = (beta * Cpsi1).dot(Y)
    else:
        import ipdb; ipdb.set_trace()
        psi1V = np.dot(Y.T * beta, psi1).T
        tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
        tmp, _ = dpotrs(LB, tmp, lower=1)
        woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
    Bi, _ = dpotri(LB, lower=1)
    symmetrify(Bi)
    Bi = -dpotri(LB, lower=1)[0]
    diag.add(Bi, 1)

    woodbury_inv = backsub_both_sides(Lm, Bi)

    # construct a posterior object
    post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
                     K=Kmm, mean=None, cov=None, K_chol=Lm)

    return post, log_marginal, grad_dict
