def getCovariances(self, logtheta):
    """[L,Alpha] = getCovariances()
    - special overwritten version of getCovariance (gpr.py)
    - here: EP updates are employed"""
    if (logtheta == self.logtheta).all() and (self.cached_L is not None):
        return [self.cached_L, self.cached_alpha]
    # 1. copy logtheta
    self.logtheta = logtheta.copy()
    assert self.Nlogtheta == logtheta.shape[0], "incorrect shape of kernel parameter matrix"
    # 2. vanilla kernel matrix
    K = self.covar.K(logtheta[self.IlogthetaK], self.x)
    # 3. run EP updates
    # EP effectively creates a new kernel matrix (with input-dependent noise)
    # and new effective training means; in addition we store a 0th moment
    # which is used for the lMl calculation
    self.updateEP(K, logtheta[self.IlogthetaL])
    # updateEP computes the site parameters, which we use here to calculate
    # the full covariance for test predictions
    Keff = K + SP.diag(self.vEP)
    self.cached_L = linalg.cholesky(Keff)
    self.cached_alpha = solve_chol(self.cached_L.transpose(), self.muEP)
    return [self.cached_L, self.cached_alpha]
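# `solve_chol` is not defined in this snippet. A minimal sketch under the
# GPML-toolbox convention (an assumption here): given an upper Cholesky
# factor U with U.T @ U == A, solve A x = b by two triangular solves.
import numpy as np
from scipy import linalg

def solve_chol(U, b):
    # A = U.T @ U, so A x = b  <=>  U.T y = b, then U x = y
    y = linalg.solve_triangular(U.T, b, lower=True)
    return linalg.solve_triangular(U, y, lower=False)

# quick check against a dense solve
A = np.array([[4.0, 1.0], [1.0, 3.0]])
b = np.array([1.0, 2.0])
U = linalg.cholesky(A)  # upper factor by default in scipy
assert np.allclose(solve_chol(U, b), np.linalg.solve(A, b))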
def find_mode_newton(self, return_full=False):
    """
    Newton search for mode of p(y|f)p(f)

    from GP book, algorithm 3.1, added step size
    """
    K = self.gp.K

    if self.newton_start is None:
        f = zeros(len(K))
    else:
        f = self.newton_start

    if return_full:
        steps = [f]

    iteration = 0
    norm_difference = inf
    objective_value = -inf

    while iteration < self.newton_max_iterations and norm_difference > self.newton_epsilon:
        # from GP book, algorithm 3.1, added step size
        # scale log_lik_grad_vector and K^-1 f = a
        w = -self.gp.likelihood.log_lik_hessian_vector(self.gp.y, f)
        w_sqrt = sqrt(w)

        # diag(w_sqrt).dot(K.dot(diag(w_sqrt))) == (K.T*w_sqrt).T*w_sqrt
        L = cholesky(eye(len(K)) + (K.T * w_sqrt).T * w_sqrt)
        b = f * w + self.newton_step * self.gp.likelihood.log_lik_grad_vector(self.gp.y, f)

        # a = b - diag(w_sqrt).dot(inv(eye(len(K)) + (K.T*w_sqrt).T*w_sqrt).dot(diag(w_sqrt).dot(K.dot(b))))
        a = w_sqrt * (K.dot(b))
        a = solve_triangular(L, a, lower=True)
        a = solve_triangular(L.T, a, lower=False)
        a = w_sqrt * a
        a = b - a
        f_new = K.dot(self.newton_step * a)

        # convergence check and next iteration; the objective is evaluated
        # at the proposed iterate f_new (GP book, algorithm 3.1, line 9)
        objective_value_new = -0.5 * a.T.dot(f_new) + \
            sum(self.gp.likelihood.log_lik_vector(self.gp.y, f_new))
        norm_difference = norm(f - f_new)

        if objective_value_new > objective_value:
            f = f_new
            if return_full:
                steps.append(f)
        else:
            self.newton_step /= 2

        iteration += 1
        objective_value = objective_value_new

    self.computed = True

    if return_full:
        return f, L, asarray(steps)
    else:
        return f
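# Hedged standalone sketch of the same Newton iteration (GP book,
# algorithm 3.1) for a toy binary GP classifier with a logistic
# likelihood; the kernel, data, and likelihood here are illustrative
# assumptions, not the classes used above.
import numpy as np
from numpy.linalg import cholesky, norm
from scipy.linalg import solve_triangular

def find_mode(K, y, max_iter=50, eps=1e-8):
    n = len(K)
    f = np.zeros(n)
    for _ in range(max_iter):
        pi = 1.0 / (1.0 + np.exp(-f))
        w = pi * (1.0 - pi)                # minus the Hessian of the logistic log-lik
        w_sqrt = np.sqrt(w)
        L = cholesky(np.eye(n) + (K.T * w_sqrt).T * w_sqrt)
        b = w * f + (y + 1) / 2.0 - pi     # gradient of log-lik for y in {-1,+1}
        a = w_sqrt * K.dot(b)
        a = solve_triangular(L, a, lower=True)
        a = solve_triangular(L.T, a, lower=False)
        a = b - w_sqrt * a
        f_new = K.dot(a)
        if norm(f - f_new) < eps:
            return f_new
        f = f_new
    return f

# toy data: two well-separated points, squared-exponential kernel
X = np.array([[-1.0], [1.0]])
y = np.array([-1.0, 1.0])
K = np.exp(-0.5 * (X - X.T) ** 2)
print(find_mode(K, y))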
def sample_conditional(self, index):
    if index < 0 or index >= self.dimension:
        raise ValueError("Conditional index out of bounds")

    # all indices but the current
    cond_inds = hstack((arange(0, index), arange(index + 1, self.dimension)))
    # print "conditioning on index %d" % index
    # print "other indices:", cond_inds

    # partition the Gaussian x|y, precompute matrix inversion
    mu_x = self.full_target.mu[index]
    Sigma_xx = self.full_Sigma[index, index]
    mu_y = self.full_target.mu[cond_inds]
    # note: ix_ is needed to extract the full submatrix;
    # full_Sigma[cond_inds, cond_inds] would only return its diagonal
    Sigma_yy = self.full_Sigma[ix_(cond_inds, cond_inds)]
    L_yy = cholesky(Sigma_yy)
    Sigma_xy = self.full_Sigma[index, cond_inds]
    Sigma_yx = self.full_Sigma[cond_inds, index]
    y = self.current_state[cond_inds]

    # mu = mu_x + Sigma_xy Sigma_yy^(-1) (y - mu_y)
    mu = mu_x + Sigma_xy.dot(MatrixTools.cholesky_solve(L_yy, y - mu_y))

    # Sigma = Sigma_xx - Sigma_xy Sigma_yy^(-1) Sigma_yx
    Sigma = Sigma_xx - Sigma_xy.dot(MatrixTools.cholesky_solve(L_yy, Sigma_yx))

    # return sample from x|y
    conditional_sample = randn() * sqrt(Sigma) + mu
    return conditional_sample
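# Hedged sketch: the same x|y conditioning formulas on a concrete 3-D
# Gaussian, using plain numpy instead of the MatrixTools helper assumed
# above, to make the partitioning explicit.
import numpy as np

rng = np.random.RandomState(0)
mu = np.array([1.0, -2.0, 0.5])
A = rng.randn(3, 3)
Sigma = A.dot(A.T) + 3 * np.eye(3)   # a valid covariance

index = 1
cond = [0, 2]
x_rest = np.array([0.3, -0.1])       # values we condition on

S_xx = Sigma[index, index]
S_xy = Sigma[index, cond]
S_yy = Sigma[np.ix_(cond, cond)]

# mu_cond = mu_x + S_xy S_yy^{-1} (y - mu_y)
mu_cond = mu[index] + S_xy.dot(np.linalg.solve(S_yy, x_rest - mu[cond]))
# var_cond = S_xx - S_xy S_yy^{-1} S_yx
var_cond = S_xx - S_xy.dot(np.linalg.solve(S_yy, S_xy))

sample = rng.randn() * np.sqrt(var_cond) + mu_cond
print(mu_cond, var_cond, sample)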
def __init__(self, mu=asarray([0, 0]), Sigma=eye(2), is_cholesky=False):
    DensityFunction.__init__(self, len(Sigma))

    assert len(shape(mu)) == 1
    assert max(shape(Sigma)) == len(mu)

    self.mu = mu
    if is_cholesky:
        self.L = Sigma
    else:
        assert shape(Sigma)[0] == shape(Sigma)[1]
        self.L = cholesky(Sigma)
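# Hedged sketch of how a stored Cholesky factor is typically used to draw
# samples; `sample_gaussian` is an illustrative helper, not confirmed by
# the snippet above. With Sigma = L L^T and z ~ N(0, I), x = mu + L z is
# distributed as N(mu, Sigma).
import numpy as np

def sample_gaussian(mu, L, n=1, rng=np.random):
    z = rng.randn(len(mu), n)
    return (mu[:, None] + L.dot(z)).T

mu = np.zeros(2)
L = np.linalg.cholesky(np.array([[2.0, 0.5], [0.5, 1.0]]))
X = sample_gaussian(mu, L, n=10000)
print(np.cov(X.T))  # close to the original Sigma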
def predict(self, X_test, f_mode=None):
    """
    Predictions for GP with Laplace approximation.

    from GP book, algorithm 3.2,
    """
    if f_mode is None:
        f_mode = self.find_mode_newton()

    predictions = zeros(len(X_test))

    K = self.gp.K
    K_train_test = self.gp.covariance.compute(self.gp.X, X_test)

    w = -self.gp.likelihood.log_lik_hessian_vector(self.gp.y, f_mode)
    w_sqrt = sqrt(w)

    # diag(w_sqrt).dot(K.dot(diag(w_sqrt))) == (K.T*w_sqrt).T*w_sqrt
    L = cholesky(eye(len(K)) + (K.T * w_sqrt).T * w_sqrt)

    # iterate over all test points
    for i in range(len(X_test)):
        k = K_train_test[:, i]
        k_self = self.gp.covariance.compute([X_test[i]], [X_test[i]])[0]

        f_mean = k.dot(self.gp.likelihood.log_lik_grad_vector(self.gp.y, f_mode))
        v = solve_triangular(L, w_sqrt * k, lower=True)
        f_var = k_self - v.T.dot(v)

        # average the likelihood of y=+1 over the Gaussian on the latent
        # function value; note norm.pdf takes the standard deviation, not
        # the variance, and integrating the Gaussian pdf alone (as the
        # original line did) always yields 1
        predictions[i] = integrate.quad(
            lambda x: exp(self.gp.likelihood.log_lik_vector(1.0, asarray([x]))[0])
            * norm.pdf(x, f_mean, sqrt(f_var)),
            -inf, inf)[0]

        # # integrate over Gaussian using some crude numerical integration
        # samples = randn(1000)*sqrt(f_var) + f_mean
        # log_liks = self.gp.likelihood.log_lik_vector(1.0, samples)
        # predictions[i] = 1.0/len(samples)*GPTools.log_sum_exp(log_liks)

    return predictions
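# Hedged sketch of the predictive integral from algorithm 3.2 with an
# explicit logistic likelihood: pi* = E[sigmoid(f*)] under
# f* ~ N(f_mean, f_var). The numbers below are illustrative.
import numpy as np
from scipy import integrate
from scipy.stats import norm

f_mean, f_var = 0.8, 0.5

p, _ = integrate.quad(
    lambda f: 1.0 / (1.0 + np.exp(-f)) * norm.pdf(f, f_mean, np.sqrt(f_var)),
    -np.inf, np.inf)
print(p)  # predictive probability of class +1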
def sigma_points(xm, P, kappa):
    """
    Calculate the sigma points of an unscented Kalman filter

    Mark Wickert December 2017
    Translated P. Kim's program from m-code
    """
    n = xm.size
    Xi = np.zeros((n, 2 * n + 1))  # sigma points = columns of Xi
    W = np.zeros(2 * n + 1)

    Xi[:, 0, None] = xm
    W[0] = kappa / (n + kappa)

    U = cholesky((n + kappa) * P)  # upper factor: U'*U = (n+kappa)*P

    for k in range(n):
        Xi[:, k + 1, None] = xm + U[k, None, :].T  # row of U
        W[k + 1] = 1 / (2 * (n + kappa))

    for k in range(n):
        Xi[:, n + k + 1, None] = xm - U[k, None, :].T
        W[n + k + 1] = 1 / (2 * (n + kappa))
    return Xi, W
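# Hedged sanity check of the sigma-point construction, assuming the
# U'*U = (n+kappa)*P convention noted above (scipy.linalg.cholesky
# returns this upper factor by default): the weighted sigma points
# reproduce the original mean and covariance exactly.
import numpy as np
from scipy.linalg import cholesky

n, kappa = 2, 1.0
xm = np.array([[1.0], [2.0]])
P = np.array([[2.0, 0.3], [0.3, 1.0]])

U = cholesky((n + kappa) * P)          # upper: U.T @ U == (n+kappa)*P
Xi = np.hstack([xm] + [xm + U[k:k + 1, :].T for k in range(n)]
               + [xm - U[k:k + 1, :].T for k in range(n)])
W = np.hstack([kappa / (n + kappa), np.full(2 * n, 1 / (2 * (n + kappa)))])

mean = (Xi * W).sum(axis=1, keepdims=True)
diff = Xi - mean
print(np.allclose(mean, xm), np.allclose((W * diff).dot(diff.T), P))  # True True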
exit()

experiment_dir_base = str(sys.argv[1])
n = int(str(sys.argv[2]))

# loop over parameters here
experiment_dir = experiment_dir_base + str(os.path.abspath(sys.argv[0])).split(os.sep)[-1].split(".")[0] + os.sep
print "running experiments", n, "times at base", experiment_dir

# load data
data, labels = GPData.get_glass_data()

# normalise and whiten dataset
data -= mean(data, 0)
L = cholesky(cov(data.T))
data = solve_triangular(L, data.T, lower=True).T
dim = shape(data)[1]

# prior on theta and posterior target estimate
theta_prior = Gaussian(mu=0 * ones(dim), Sigma=eye(dim) * 5)
distribution = PseudoMarginalHyperparameterDistribution(data, labels,
                                                        n_importance=100, prior=theta_prior,
                                                        ridge=1e-3)

sigma = 23.0
print "using sigma", sigma
kernel = GaussianKernel(sigma=sigma)

for i in range(n):
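# Hedged sketch of the whitening step above in isolation: after centring
# the data and solving the triangular system with the Cholesky factor of
# the sample covariance, the data have (approximately) identity covariance.
import numpy as np
from numpy.linalg import cholesky
from scipy.linalg import solve_triangular

rng = np.random.RandomState(0)
raw = rng.randn(500, 3).dot(rng.randn(3, 3))  # correlated toy data

data = raw - np.mean(raw, 0)
L = cholesky(np.cov(data.T))           # lower factor, L L^T = cov
data = solve_triangular(L, data.T, lower=True).T

print(np.round(np.cov(data.T), 2))     # close to the identity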
savetxt(graphlab_lines.add_edge(node, out_message, "R_t"), Rt)
savetxt(graphlab_lines.add_edge(node, out_message, "P_t"), Pt)
savetxt(graphlab_lines.add_edge(node, out_message, "W"), Ws.dot(Wt.T))

print "precomputing systems for messages from non-observed nodes"
graphlab_lines.lines.append(os.linesep + "# edges with non-observed targets")
for edge in edges:
    # exclude edges which involve observed nodes
    is_edge_target_observed = len(Set(observations.keys()).intersection(Set(edge))) > 0
    if not is_edge_target_observed:
        graphlab_lines.new_edge_observed_target(edge[1], edge[0])

        data_source = data[edge][0]
        Ks = kernel.kernel(data_source)
        Ls = cholesky(Ks + eye(shape(Ks)[0]) * reg_lambda)

        Ls_filename = graphlab_lines.add_edge(edge[1], edge[0], "L_s")
        # print Ls_filename
        savetxt(Ls_filename, Ls)

print "precomputing (non-symmetric) kernels for incoming messages at a node"
graphlab_lines.lines.append("# non-observed nodes")
for node in graph:
    added_node = False
    for in_message in graph[node]:
        for out_message in graph[node]:
            if in_message == out_message:
                continue
if __name__ == '__main__':
    # load data
    data, labels = GPData.get_glass_data()

    # throw away some data
    n = 250
    seed(1)
    idx = permutation(len(data))
    idx = idx[:n]
    data = data[idx]
    labels = labels[idx]

    # normalise and whiten dataset
    data -= mean(data, 0)
    L = cholesky(cov(data.T))
    data = solve_triangular(L, data.T, lower=True).T
    dim = shape(data)[1]

    # prior on theta and posterior target estimate
    theta_prior = Gaussian(mu=0 * ones(dim), Sigma=eye(dim) * 5)
    target = PseudoMarginalHyperparameterDistribution(data, labels,
                                                      n_importance=100, prior=theta_prior,
                                                      ridge=1e-3)

    # create sampler
    burnin = 10000
    num_iterations = burnin + 300000
    kernel = GaussianKernel(sigma=23.0)
    sampler = KameleonWindowLearnScale(target, kernel, stop_adapt=burnin)
    # sampler = AdaptiveMetropolisLearnScale(target)
def updateEP(self, K, logthetaL=None):
    """update a kernel matrix K using the EP approximation
    [K,t,C0] = updateEP(K,logthetaL)
    logthetaL: likelihood hyperparameters
    t: new means of training targets
    K: new effective kernel matrix
    C0: 0th moments
    """
    assert K.shape[0] == K.shape[1], "Kernel matrix must be square"
    assert K.shape[0] == self.n, "Kernel matrix has wrong dimension"

    # approximate site parameters; 3 moments
    # note: g is in natural parameter representation (1,2)
    g = SP.zeros([self.n, 2])
    # a copy for damping
    g2 = SP.zeros([self.n, 2])
    # 0th moment is just captured in z
    z = SP.zeros([self.n])
    # damping factors
    damp = SP.ones([self.n])

    # the approximation is
    # p(f) = N(f|mu,Sigma)
    # where Sigma = (K^{-1} + PI^{-1})^{-1}; PI is created from the diagonal
    # entries in g; PI = diag(Var(g))
    # mu = Sigma*PI^{-1}*Mean(g)
    # where mu is also formed from the site parameters in g

    # add some jitter to make the matrix invertible
    K += SP.eye(K.shape[0]) * 1E-6
    # initialize current approx. of full covariance
    Sigma = K.copy()
    # invert kernel matrix; this is used later on
    # TODO: replace by chol
    KI = linalg.inv(K)
    # current approx. mean
    mu = SP.zeros([self.n])

    # conversion natural parameter <-> moment representation
    n2mode = lambda x: SP.array([x[0] / x[1], 1 / x[1]])

    # set hyperparameters of the likelihood object
    self.likelihood.setLogtheta(logthetaL)

    for nep in range(self.Nep):
        # get order of site function updates
        # (a random permutation is computed but currently overridden by a
        # fixed sweep order)
        perm = SP.random.permutation(self.n)
        perm = SP.arange(self.n)
        for ni in perm:
            # cavity in natural parameter representation
            cav_np = n2mode([mu[ni], Sigma[ni, ni]]) - g[ni]
            # ensure we don't have negative variances. good idea?
            cav_np[1] = abs(cav_np[1])
            # calculate expectation values (int_, int_y, int_y^2)
            ML = self.likelihood.calcExpectations(self.t[ni], cav_np, x=self.x[ni])
            # 1st and 2nd moments can be back-calculated to new site parameters;
            # update the site parameters:
            # in natural parameters this is just dividing out the site function; v. convenient
            gn = n2mode(ML[0:2]) - cav_np
            # delta gn in nat. parameters
            dg = gn - g[ni]
            # difference of second moment (old-new)
            ds2 = gn[1] - g[ni, 1]
            # update with damping factor damp[ni]
            g[ni] = g[ni] + damp[ni] * dg
            if g[ni, 1] < 0:
                g[ni, 1] = 1E-10
            z[ni] = ML[2]
            if 1:
                # rank-one update
                Sigma2 = Sigma
                Sigma = Sigma - ds2 / (1 + ds2 * Sigma[ni, ni]) * SP.outer(Sigma[:, ni], Sigma[ni, :])
                if 1:
                    # check that Sigma is still pos. definite, otherwise we need to do some damping...
                    try:
                        Csigma = linalg.cholesky(Sigma)
                    # except linalg.linalg.LinAlgError:
                    except LinAlgError:
                        logging.debug('damping')
                        Sigma = Sigma2
                        g[ni] = g2[ni]
                        # increase damping
                        damp[ni] *= 0.9
                # update mu; mu[i] = Sigma[i,i]*(1/Var(g[i]))*Mean(g[i])
                # as g is in nat. parameters this is always like this
                mu = SP.dot(Sigma, g[:, 0])
            else:
                # slow updates
                Sigma = linalg.inv(KI + SP.diag(g[:, 1]))
                mu = SP.dot(Sigma, g[:, 0])

        # after every sweep recalculate the entire covariance structure
        [Sigma, mu, lml] = self.epComputeParams(K, KI, g)
        # create a copy for damping
        g2 = g.copy()

        if nep == (self.Nep - 1):
            # LG.warn('maximum number of EP iterations reached')
            pass

    # update site parameters
    self.muEP = g[:, 0] / g[:, 1]
    self.vEP = 1 / g[:, 1]
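# Hedged sketch checking the rank-one update used above: when a single
# site precision changes by ds2 at index ni,
#   Sigma_new = (Sigma^{-1} + ds2 * e_ni e_ni^T)^{-1}
#             = Sigma - ds2/(1 + ds2*Sigma[ni,ni]) * outer(Sigma[:,ni], Sigma[ni,:])
# (Sherman-Morrison), which avoids a full matrix inversion per site update.
import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(4, 4)
Sigma = A.dot(A.T) + 4 * np.eye(4)
ni, ds2 = 2, 0.7

e = np.zeros(4)
e[ni] = 1.0
direct = np.linalg.inv(np.linalg.inv(Sigma) + ds2 * np.outer(e, e))
rank_one = Sigma - ds2 / (1 + ds2 * Sigma[ni, ni]) * np.outer(Sigma[:, ni], Sigma[ni, :])
print(np.allclose(direct, rank_one))  # True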
def precompute(self):
    # collect lines for Graphlab graph definition file for full rank case
    graphlab_lines = GraphlabLines(output_filename=self.output_filename)

    # compute all non-symmetric kernels for incoming messages at a node
    print "precomputing (non-symmetric) kernels for incoming messages at a node"
    graphlab_lines.lines.append("# non-observed nodes")
    for node in self.graph:
        added_node = False
        for in_message in self.graph[node]:
            for out_message in self.graph[node]:
                if in_message == out_message:
                    continue

                # don't add nodes which have no kernels, and only do so once if they have
                if not added_node:
                    graphlab_lines.new_non_observed_node(node)
                    added_node = True

                edge_in_message = (node, in_message)
                edge_out_message = (out_message, node)

                lhs = self.data[edge_in_message][0]
                rhs = self.data[edge_out_message][1]
                lhs = reshape(lhs, (len(lhs), 1))
                rhs = reshape(rhs, (len(rhs), 1))
                K = self.kernel.kernel(lhs, rhs)
                graphlab_lines.add_non_observed_node(node, out_message, in_message, K)

    print "precomputing kernel (vectors) at observed nodes"
    graphlab_lines.lines.append(os.linesep + "# observed nodes")
    for node, observation in self.observations.items():
        graphlab_lines.new_observed_node(node)
        for out_message in self.graph[node]:
            edge = (out_message, node)
            lhs = self.data[edge][1]
            lhs = reshape(lhs, (len(lhs), 1))
            rhs = [[observation]]
            K = self.kernel.kernel(lhs, rhs)
            graphlab_lines.add_observed_node(node, out_message, K)

    # now precompute systems for inference
    print "precomputing systems for messages from observed nodes"
    graphlab_lines.lines.append(os.linesep + "# edges with observed targets")
    for node, observation in self.observations.items():
        for out_message in self.graph[node]:
            edge = (out_message, node)
            graphlab_lines.new_edge_observed_target(node, out_message)

            data_source = self.data[edge][0]
            data_source = reshape(data_source, (len(data_source), 1))
            data_target = self.data[edge][1]
            data_target = reshape(data_target, (len(data_target), 1))

            Ks = self.kernel.kernel(data_source)
            Kt = self.kernel.kernel(data_target)
            Ls = cholesky(Ks + eye(shape(Ks)[0]) * self.reg_lambda)
            Lt = cholesky(Kt + eye(shape(Kt)[0]) * self.reg_lambda)

            graphlab_lines.add_edge(node, out_message, "L_s", Ls)
            graphlab_lines.add_edge(node, out_message, "L_t", Lt)

    print "precomputing systems for messages from non-observed nodes"
    graphlab_lines.lines.append(os.linesep + "# edges with non-observed targets")
    for edge in self.edges:
        # exclude edges which involve observed nodes
        is_edge_target_observed = len(Set(self.observations.keys()).intersection(Set(edge))) > 0
        if not is_edge_target_observed:
            graphlab_lines.new_edge_observed_target(edge[1], edge[0])

            data_source = self.data[edge][0]
            data_source = reshape(data_source, (len(data_source), 1))
            Ks = self.kernel.kernel(data_source)
            Ls = cholesky(Ks + eye(shape(Ks)[0]) * self.reg_lambda)
            graphlab_lines.add_edge(edge[1], edge[0], "L_s", Ls)

    # write graph definition file to disc
    graphlab_lines.flush()
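# Hedged sketch of the regularisation pattern used throughout these
# snippets: a small ridge (reg_lambda times the identity) is added to each
# kernel matrix so the Cholesky factorisation succeeds even when the Gram
# matrix is numerically rank-deficient. The kernel and values below are
# illustrative.
import numpy as np
from numpy.linalg import cholesky, LinAlgError

x = np.linspace(0, 1, 50)[:, None]
K = np.exp(-0.5 * (x - x.T) ** 2 / 0.1 ** 2)  # nearly singular Gram matrix

try:
    L = cholesky(K)
except LinAlgError:
    print("plain Cholesky failed")

reg_lambda = 1e-3
L = cholesky(K + np.eye(len(K)) * reg_lambda)  # succeeds with the ridge
print(L.shape)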