def _build_cross_ent(self, weights, means, covars, kernel_chol):
    cross_ent = 0.0
    for i in range(self.num_components):
        sum_val = 0.0
        for j in range(self.num_latent):
            if self.diag_post:
                # TODO(karl): this is a bit inefficient since we're not making use of the fact
                # that covars is diagonal. A solution most likely involves a custom tf op.
                trace = tf.trace(tf.cholesky_solve(kernel_chol[j, :, :],
                                                   tf.diag(covars[i, j, :])))
            else:
                trace = tf.reduce_sum(util.diag_mul(
                    tf.cholesky_solve(kernel_chol[j, :, :], covars[i, j, :, :]),
                    tf.transpose(covars[i, j, :, :])))
            sum_val += (util.CholNormal(means[i, j, :], kernel_chol[j, :, :]).log_prob(0.0) -
                        0.5 * trace)
        cross_ent += weights[i] * sum_val
    return cross_ent
def cholesky_solve(chol, rhs, name=None):
    """Broadcasting batch cholesky solve."""
    try:
        return tf.cholesky_solve(chol, rhs, name=name)
    except ValueError:
        chol, rhs = tf_utils.broadcast_outer_dims((chol, 2), (rhs, 2))
        return tf.cholesky_solve(chol, rhs, name=name)
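# A minimal sketch of the failure case the wrapper above handles, assuming
# TensorFlow 1.x graph mode (the project's tf_utils.broadcast_outer_dims
# helper is not shown here, so the broadcasting is done manually with tf.tile):
import numpy as np
import tensorflow as tf

# A batch of three 2x2 SPD matrices and a single, unbatched right-hand side.
A = np.stack([np.eye(2) + 0.1 * (i + 1) * np.ones((2, 2)) for i in range(3)])
chol = tf.cholesky(tf.constant(A))   # shape (3, 2, 2)
rhs = tf.constant(np.ones((2, 1)))   # shape (2, 1): batch dims don't match chol

# tf.cholesky_solve(chol, rhs) would raise a ValueError here; tiling the rhs
# to the batch shape of chol (what the wrapper automates) fixes it.
rhs_b = tf.tile(rhs[None, :, :], [3, 1, 1])  # shape (3, 2, 1)
x = tf.cholesky_solve(chol, rhs_b)           # shape (3, 2, 1)

with tf.Session() as sess:
    print(sess.run(x).shape)  # (3, 2, 1)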
def calculate_factorizations(self):
    batched_eye = tf.eye(self.num_induced_points,
                         batch_shape=[self.num_outputs],
                         dtype=float_type)
    # TODO: Change 1e-6 to the respective constant of GPflow
    Kmm = self.K(self.Z) + 1e-6 * batched_eye
    Kmn = self.K(self.Z, self.X)
    L = tf.cholesky(Kmm)
    V = tf.matrix_triangular_solve(L, Kmn)
    G = self.variance[:, None] - tf.reduce_sum(tf.square(V), axis=[1])
    G = tf.sqrt(1.0 + G / self.noise[:, None])
    V = V / G[:, None]
    Am = tf.cholesky(tf.matmul(V, V, transpose_b=True) +
                     self.noise[:, None, None] * batched_eye)
    At = tf.matmul(L, Am)
    iAt = tf.matrix_triangular_solve(At, batched_eye)
    Y_ = tf.transpose(self.Y)[:, :, None]
    beta = tf.matrix_triangular_solve(
        L, tf.cholesky_solve(Am, (V / G[:, None]) @ Y_), adjoint=True)[:, :, 0]
    iB = tf.matmul(iAt, iAt, transpose_a=True) * self.noise[:, None, None]
    iK = tf.cholesky_solve(L, batched_eye) - iB
    return iK, beta
def gauss_kl(min_q_mu, q_sq, K):
    """
    Compute the KL divergence from q(x) = N(q_mu, q_sq) to p(x) = N(0, K).

    q_mu is a column vector containing the mean; q_sq is the (positive
    definite) covariance of q; K is a positive definite matrix, the
    covariance of p.
    """
    q_mu = -1 * min_q_mu  # the caller passes the negated mean
    V2 = tf.cholesky(K)
    V1 = tf.cholesky(q_sq)
    # Mahalanobis term: q_mu^T K^{-1} q_mu
    KL = h.Mul(tf.transpose(q_mu), tf.cholesky_solve(V2, q_mu))
    # Trace term: tr(K^{-1} q_sq)
    KL += tf.trace(tf.cholesky_solve(V2, q_sq))
    # Constant term: -d
    KL -= h.get_dim(K, 0)
    # Log-determinant ratio: log|K| - log|q_sq|
    KL += tf.reduce_sum(2 * tf.log(tf.diag_part(V2)) - 2 * tf.log(tf.diag_part(V1)))
    return KL / 2
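# For reference, the closed form evaluated by gauss_kl above (a standard identity):
#   KL( N(m, S) || N(0, K) )
#     = 0.5 * ( m^T K^{-1} m + tr(K^{-1} S) - d + log det K - log det S )
# where d is the dimensionality; the four KL accumulation lines map one-to-one
# onto these four terms, with both solves done against the Cholesky factor of K.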
def F_bound2_v2(y, S, Kmm, Knm, Kmnnm, Tr_Knn, sigma):
    # matrices to be used
    N = get_dim(y, 0)
    Kmm_chol = tf.cholesky(Kmm)
    Q_nn = tf.square(sigma) * np.eye(N) + Mul(Knm, tf.cholesky_solve(Kmm_chol, tf.transpose(Knm)))
    bound = -0.5 * (Tr_Knn - tf.trace(tf.cholesky_solve(Kmm_chol, Kmnnm))) / tf.square(sigma)
    bound += multivariate_normal(y, tf.zeros([N, 1], dtype=tf.float32), tf.cholesky(Q_nn))
    return bound
def calculate_factorizations(self):
    K = self.K(self.X)  # self.K: kernel function
    # A batch of num_outputs identity matrices, each of shape
    # (self.X.shape[0], self.X.shape[0]).
    batched_eye = tf.eye(tf.shape(self.X)[0],
                         batch_shape=[self.num_outputs],
                         dtype=float_type)
    L = tf.cholesky(K + self.noise[:, None, None] * batched_eye)  # Cholesky factor
    iK = tf.cholesky_solve(L, batched_eye)  # (K + noise*I) @ iK = batched_eye
    # Transpose so each output dimension indexes its own column of targets:
    # Y_ has shape (num_outputs, N, 1).
    Y_ = tf.transpose(self.Y)[:, :, None]
    beta = tf.cholesky_solve(L, Y_)[:, :, 0]  # (K + noise*I) @ beta = Y_
    return iK, beta
def calculate_factorizations(self):
    K = self.K(self.X)
    batched_eye = tf.eye(tf.shape(self.X)[0],
                         batch_shape=[self.num_outputs],
                         dtype=float_type)
    L = tf.cholesky(K + self.noise[:, None, None] * batched_eye)
    iK = tf.cholesky_solve(L, batched_eye)
    # Y is transposed so that each output dimension gets its own column of
    # targets: Y_ has shape (num_outputs, N, 1).
    Y_ = tf.transpose(self.Y)[:, :, None]
    beta = tf.cholesky_solve(L, Y_)[:, :, 0]  # beta from the paper
    return iK, beta
def _build_cross_ent(self, weights, means, covars, link_covars, kernel_chol, kernlink_chol):
    cross_ent = 0.0
    for i in range(self.num_components):
        sum_val = 0.0
        for r in range(self.num_block):
            dim_block = len(self.block_struct[r])
            # construct Khh^-1
            if dim_block == 1:
                # convert float dummy==1.0 to rank 2 tensor
                Khh_inv = tf.expand_dims(tf.expand_dims(kernlink_chol[r], 0), 1)
                log_det = util.log_cholesky_det(kernel_chol[r])
            else:
                Khh_inv = tf.cholesky_solve(kernlink_chol[r], tf.eye(dim_block))
                # construct ln|Kr_uu|
                log_det = self.num_inducing * util.log_cholesky_det(kernlink_chol[r]) + \
                    dim_block * util.log_cholesky_det(kernel_chol[r])

            # calculate m_r'(Kuu^-1)m_r
            means_r = [tf.expand_dims(means[i, j, :], 1) for j in self.block_struct[r]]
            quad_form = 0.0
            for j in range(dim_block):
                sum_means = tf.add_n([Khh_inv[j, h] * means_r[h] for h in range(dim_block)])
                quad_form += tf.reduce_sum(means_r[j] * tf.cholesky_solve(kernel_chol[r], sum_means))

            # calculate trace[(Kuu^-1)Sk_r]
            if self.diag_post:
                # where Sk_r diagonal, trace reduces to sum of diagonal inner products
                # over j in block r, scaled by Khh_inv[j,j]
                diag_inv = tf.diag_part(tf.cholesky_solve(kernel_chol[r], tf.eye(self.num_inducing)))
                cov_diag = [covars[i, j, :] for j in self.block_struct[r]]
                trace = tf.reduce_sum(diag_inv * tf.add_n(
                    [Khh_inv[j, j] * cov_diag[j] for j in range(dim_block)]))
            else:
                trace = tf.trace(tf.matmul(Khh_inv,
                                           tf.matmul(link_covars[i][r], link_covars[i][r],
                                                     transpose_b=True))) * \
                        tf.trace(tf.matmul(tf.cholesky_solve(kernel_chol[r], covars[i, r, :, :]),
                                           covars[i, r, :, :], transpose_b=True))

            sum_val += dim_block * self.num_inducing * tf.log(2.0 * np.pi) + log_det + quad_form + trace
        cross_ent += -0.5 * weights[i] * sum_val
    return cross_ent
def call(self, v):
    if self.constrained:
        z = tf.cholesky_solve(self.chol, self.h + v)
        y = tf.cholesky_solve(self.chol_constraint, tf.matmul(self.C, z) - self.d)
        return tf.cholesky_solve(self.chol,
                                 self.h + v - tf.matmul(self.C, y, transpose_a=True))
    else:
        return tf.cholesky_solve(self.chol, self.h + v)
def calculate_factorizations(self):
    K = self.K(self.X)
    batched_eye = tf.eye(tf.shape(self.X)[0],
                         batch_shape=[self.num_outputs],
                         dtype=float_type)
    L = tf.cholesky(K + self.noise[:, None, None] * batched_eye)
    iK = tf.cholesky_solve(L, batched_eye)
    Y_ = tf.transpose(self.Y)[:, :, None]
    beta = tf.cholesky_solve(L, Y_)[:, :, 0]
    return iK, beta
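# A minimal NumPy sanity check (illustrative only) of the identities behind the
# factorizations above: cholesky_solve(L, I) yields (K + noise*I)^{-1} and
# cholesky_solve(L, y) yields (K + noise*I)^{-1} y.
import numpy as np

rng = np.random.RandomState(0)
Z = rng.randn(5, 5)
K = Z @ Z.T + 5.0 * np.eye(5)  # stand-in for the noisy kernel matrix
y = rng.randn(5, 1)
L = np.linalg.cholesky(K)
iK = np.linalg.solve(L.T, np.linalg.solve(L, np.eye(5)))
beta = np.linalg.solve(L.T, np.linalg.solve(L, y))
assert np.allclose(K @ iK, np.eye(5))
assert np.allclose(K @ beta, y)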
def myopicController_noBdiff(X_est, PI_est, Control, gamma, true_model_est,
                             true_model_est_null, target_model_est, xdim, udim):
    # Graphs for updating state and observation, but B is not differentiable
    # with respect to the state.
    # true_model_est: state est. gradient, controlled dynamics; must depend on X_plus, Control
    # true_model_est_null: state est. gradient, null control; must depend on X_plus
    # target_model_est: state est. target dynamics; must depend on X_plus

    # Control coupling matrix, evaluated at the state estimate, NOT the true state.
    B = grad_elemwise(true_model_est, Control)

    # First expected term E(B^T B) + gamma I; gamma is a regularization constant.
    exp1_1 = tf.matmul(tf.transpose(B), B) + gamma * np.eye(xdim, xdim)
    # B^T (f - g)
    exp2_1 = mvMul(tf.transpose(B), tf.squeeze(true_model_est_null - target_model_est))
    # 0.25 * B^T * Tr_{2,3}([f'' - g''] Sigma)
    Pistack3 = tf.stack([PI_est, PI_est])
    fdp = hess_elemwise(true_model_est, X_est)
    gdp = hess_elemwise(target_model_est, X_est)
    exp2_2 = 0.25 * mvMul(tf.transpose(B), tf.trace(tf.matmul((fdp - gdp), Pistack3)))

    exp1_approx_meanonly = exp1_1
    exp2_approx_meanonly = exp2_1 + exp2_2

    # Avoid explicit matrix inversion: solve via the Cholesky factorization instead.
    Control_new = tf.squeeze(
        tf.cholesky_solve(tf.cholesky(exp1_approx_meanonly),
                          -1.0 * tf.expand_dims(exp2_approx_meanonly, 1)))
    return Control_new
def myopicController_meanonly(X_est, PI_est, Control, gamma, true_model_est,
                              true_model_est_null, target_model_est, xdim, udim):
    # Graphs for updating state and observation.
    # true_model_est: state est. gradient, controlled dynamics; must depend on X_plus, Control
    # true_model_est_null: state est. gradient, null control; must depend on X_plus
    # target_model_est: state est. target dynamics; must depend on X_plus

    # Control coupling matrix, evaluated at the state estimate, NOT the true state.
    B = grad_elemwise(true_model_est, Control)

    # First expected term E(B^T B) + gamma I; gamma is a regularization constant.
    exp1_1 = tf.matmul(tf.transpose(B), B) + gamma * np.eye(xdim, xdim)
    # B^T (f - g)
    exp2_1 = mvMul(tf.transpose(B), tf.squeeze(true_model_est_null - target_model_est))

    exp1_approx_meanonly = exp1_1
    exp2_approx_meanonly = exp2_1

    # Avoid explicit matrix inversion: solve via the Cholesky factorization instead.
    Control_new = tf.squeeze(
        tf.cholesky_solve(tf.cholesky(exp1_approx_meanonly),
                          -1.0 * tf.expand_dims(exp2_approx_meanonly, 1)))
    return Control_new
def nlml(self, Xu, Xf, Yu1, Yu2, Yu3, Yf, dt, hyp1, hyp3, hyp5, sig_n,
         lambda1, lambda2, un_u, un_f, kernel_type, jitter=1.0e-10):
    # Negative log marginal likelihood.
    N = 3 * (self.Nu + self.Nf)
    self.K0 = self.kernel_uf_train(Xu, Xf, self.Nu, self.Nf, hyp1, hyp3, hyp5,
                                   self.a, self.b, self.c, lambda1, lambda2,
                                   un_u, un_f, dt)
    K = self.K0 + (sig_n**2 + jitter) * tf.eye(N, dtype=tf.float64)
    self.L = tf.cholesky(K)
    r = tf.concat((Yu1, Yu2, Yu3, Yf, Yf, Yf), axis=0)
    self.alpha = tf.cholesky_solve(self.L, r)
    # Profiled signal variance: sig2 = r^T K^{-1} r / N.
    self.sig2_tf = tf.matmul(r, self.alpha, transpose_a=True) / N
    return 0.5 * N * tf.log(2.0 * np.pi * self.sig2_tf) \
        + tf.reduce_sum(tf.log(tf.diag_part(self.L))) \
        + N / 2.0
def KL(self):
    """
    The KL divergence from the variational distribution to the prior

    :return: KL divergence from N(q_mu, q_sqrt) to N(0, I), independently for each GP
    """
    self.build_cholesky_if_needed()

    KL = -0.5 * self.num_inducing * self.num_nodes * self.dim_per_out
    for nd in range(self.num_nodes):
        q_sqrt_nd = self.q_sqrt_lst[nd]
        with params_as_tensors_for(q_sqrt_nd, convert=True):
            KL -= 0.5 * tf.reduce_sum(tf.log(tf.matrix_diag_part(q_sqrt_nd) ** 2))
            KL += tf.reduce_sum(tf.log(tf.matrix_diag_part(self.Lu[nd]))) * self.dim_per_out
            KL += 0.5 * tf.reduce_sum(
                tf.square(tf.matrix_triangular_solve(
                    self.Lu_tiled_lst[nd], q_sqrt_nd, lower=True)))
            q_mu_nd = self.q_mu[:, nd * self.dim_per_out:(nd + 1) * self.dim_per_out]
            Kinv_m_nd = tf.cholesky_solve(self.Lu[nd], q_mu_nd)
            KL += 0.5 * tf.reduce_sum(q_mu_nd * Kinv_m_nd)
    return KL
def _build_interim_vals(self, kernel_chol, inducing_inputs, train_inputs):
    """Helper function for `_build_ell`

    Args:
        kernel_chol: Tensor(num_latents, num_inducing, num_inducing)
        inducing_inputs: Tensor(num_latents, num_inducing, input_dim)
        train_inputs: Tensor(batch_size, input_dim)
    Returns:
        `kern_prods` (num_latents, batch_size, num_inducing)
        and `kern_sums` (num_latents, batch_size)
    """
    # shape of ind_train_kern: (num_latents, num_inducing, batch_size)
    kern_prods = [0.0 for _ in range(self.num_latents)]
    kern_sums = [0.0 for _ in range(self.num_latents)]
    for i in range(self.num_latents):
        ind_train_kern = self.cov[i].cov_func(inducing_inputs[i, :, :], train_inputs)
        # Compute A = Kxz.Kzz^(-1) = (Kzz^(-1).Kzx)^T.
        kern_prods[i] = tf.transpose(
            tf.cholesky_solve(kernel_chol[i, :, :], ind_train_kern))
        # We only need the diagonal components.
        kern_sums[i] = (self.cov[i].diag_cov_func(train_inputs) -
                        util.mul_sum(kern_prods[i], tf.matrix_transpose(ind_train_kern)))
    kern_prods = tf.stack(kern_prods, 0)
    kern_sums = tf.stack(kern_sums, 0)
    return kern_prods, kern_sums
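# For reference, the two quantities computed per latent process above:
#   kern_prods[i] = A = K_xz K_zz^{-1}      (via a Cholesky solve against K_zx)
#   kern_sums[i]  = diag( K_xx - A K_zx )
# i.e. the projection onto the inducing points and the diagonal of the
# Nystrom residual, the standard interim terms of a sparse variational GP.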
def mlpg_univariate(means, stds, weights):
    """Generate a trajectory out of a time sequence of gaussian parameters.

    The algorithm used is taken from Tokuda, K. et alii (2000). Speech
    Parameter Generation Algorithms for HMM-based speech synthesis. It aims
    at generating the most likely trajectory sequence based on gaussian
    parameters fitted to an input sequence of some kind.

    means   : time sequence of means (1-D tensor)
    stds    : time sequence of standard deviations (1-D tensor)
    weights : matrix of weights to derive successive orders of dynamic
              features out of static ones (2-D tensor)

    The means and standard deviations should consist of the time sequence of
    parameters for static features first, followed by the time sequence of
    delta features parameters and finally by that of delta delta features
    parameters.
    """
    # Test arguments' rank validity. The assertions must be attached as
    # control dependencies, otherwise they are never executed.
    with tf.control_dependencies([
            tf.assert_rank(means, 1),
            tf.assert_rank(stds, 1),
            tf.assert_rank(weights, 2)
    ]):
        # Compute the terms of the parameters generation system.
        inv_stds = tf.matrix_diag(1 / (tf.square(stds) + 1e-30))
        timed_variance = tf.matmul(tf.matrix_transpose(weights), inv_stds)
        left_term = tf.matmul(timed_variance, weights)
        right_term = tf.matmul(timed_variance, tf.expand_dims(means, 1))
        # Solve the system using Cholesky decomposition.
        static_features = tf.cholesky_solve(tf.cholesky(left_term), right_term)
        # Add dynamic features to the predicted static ones and return them.
        return tf.matmul(weights, static_features)
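# Hypothetical usage sketch (assuming TensorFlow 1.x): T static frames with
# first-order deltas stacked below them, so means/stds have length 2*T and
# weights maps T static values to the 2*T observed features. The delta window
# here is a simple central difference; real systems would use their own windows.
import numpy as np
import tensorflow as tf

T = 4
eye = np.eye(T, dtype=np.float32)
delta = (np.roll(eye, -1, axis=0) - np.roll(eye, 1, axis=0)) / 2.0
W = np.concatenate([eye, delta], axis=0)  # shape (2*T, T)

means = tf.constant(np.concatenate([np.linspace(0., 1., T),
                                    np.zeros(T)]).astype(np.float32))
stds = tf.constant(np.ones(2 * T, dtype=np.float32))
traj = mlpg_univariate(means, stds, tf.constant(W))

with tf.Session() as sess:
    print(sess.run(traj).shape)  # (2*T, 1): static predictions plus their deltas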
def create_prediction(tf_input):
    t_AInv = tf.cholesky_solve(t_L_aa, tf.eye(t_M, dtype=dtype))
    t_PredictAlpha = (1.0 / t_beta) * tf.matrix_triangular_solve(
        t_L_uu, tf.matmul(t_AInv, t_GammaT), lower=True, adjoint=True)
    t_K_x_Z = kernel.covar_matrix(tf_input, t_Z)
    t_y_mean = t_beta * tf.matmul(t_K_x_Z, t_PredictAlpha)

    t_K_x_x_diag = kernel.covar_diag(tf_input)
    t_L_uuInv_K_Z_x = tf.matrix_triangular_solve(t_L_uu, tf.transpose(t_K_x_Z), lower=True)
    t_G = (1.0 / t_beta) * t_AInv - tf.eye(t_M, dtype=dtype)
    t_y_var = t_K_x_x_diag \
        + tf.reduce_sum(t_L_uuInv_K_Z_x * tf.matmul(t_G, t_L_uuInv_K_Z_x), axis=0) \
        + (1.0 / t_beta) * tf.ones([tf.shape(tf_input)[0]], dtype=dtype)
    t_y_var = t_y_var[:, tf.newaxis]
    return t_y_mean, t_y_var
def add_pair_lamd(self, static, moving, n_neighbors):
    lfd1 = 1
    lfd2 = 0.05
    RotationTiled = tf.tile(tf.expand_dims(self.Rotation, 0), (moving.NCELLS, 1, 1))
    TransformedMeans = tf.matmul(RotationTiled, tf.expand_dims(moving.Means, -1))
    TransformedMeans = tf.squeeze(TransformedMeans, -1) + self.Translation
    TransformedCovars = tf.matmul(
        tf.matmul(RotationTiled, moving.Covariances, transpose_a=True),
        RotationTiled)
    Distances = tf.expand_dims(TransformedMeans, 1) - tf.expand_dims(static.Means, 0)
    CSum = tf.expand_dims(TransformedCovars, 1) + tf.expand_dims(static.Covariances, 0)
    MCov = tf.reshape(
        tf.tile(tf.expand_dims(moving.Covariances, 1), (1, 2, 1, 1)), [-1, 3, 3])
    Distances, CSum = self.n_nearest(Distances, CSum, n_neighbors)
    # Instead of an explicit inverse, solve against the identity via the
    # Cholesky factorization. Note tf.cholesky_solve expects the Cholesky
    # factor as its first argument, so CSum must be factorized first.
    with tf.device('/device:CPU:0'):
        CInv = tf.cholesky_solve(
            tf.cholesky(CSum),
            tf.tile(tf.expand_dims(tf.eye(3), 0), (tf.shape(CSum)[0], 1, 1)))
    m_ij = tf.expand_dims(Distances, 2)
    l = tf.matmul(tf.matmul(m_ij, CInv, transpose_a=True), m_ij)
    likelihood = tf.exp(-lfd2 * l / 2)
    loss = -lfd1 * tf.reduce_sum(likelihood)
    G, H = gradients(m_ij, CInv, MCov, likelihood, lfd2, self.PARAMS[3:])
    return loss, G, H
def KL(self):
    """
    The KL divergence from the variational distribution to the prior

    :return: KL divergence from N(q_mu, q_sqrt) to N(0, I), independently for each GP
    """
    # if self.white:
    #     return gauss_kl(self.q_mu, self.q_sqrt)
    # else:
    #     return gauss_kl(self.q_mu, self.q_sqrt, self.Ku)

    self.build_cholesky_if_needed()

    KL = -0.5 * self.num_outputs * self.num_inducing
    KL -= 0.5 * tf.reduce_sum(tf.log(tf.matrix_diag_part(self.q_sqrt) ** 2))

    if not self.white:
        KL += tf.reduce_sum(tf.log(tf.matrix_diag_part(self.Lu))) * self.num_outputs
        KL += 0.5 * tf.reduce_sum(
            tf.square(tf.matrix_triangular_solve(self.Lu_tiled, self.q_sqrt, lower=True)))
        Kinv_m = tf.cholesky_solve(self.Lu, self.q_mu)
        KL += 0.5 * tf.reduce_sum(self.q_mu * Kinv_m)
    else:
        KL += 0.5 * tf.reduce_sum(tf.square(self.q_sqrt))
        KL += 0.5 * tf.reduce_sum(self.q_mu ** 2)
    return KL
def build_backward_variance(self, Yvar):
    """
    Additional method for scaling variance backward (used in :class:`.Normalizer`).
    Can process both the diagonal variances returned by predict_f, as well as
    full covariance matrices.

    :param Yvar: size N x N x P or size N x P
    :return: Yvar scaled, same rank and size as input
    """
    rank = tf.rank(Yvar)
    # Because TensorFlow evaluates both fn1 and fn2, the transpose can't be in
    # the same line. If a full cov matrix is provided, fn1 turns it into a
    # rank 4, then tries to transpose it as a rank 3.
    # Splitting it in two steps however works fine.
    Yvar = tf.cond(tf.equal(rank, 2),
                   lambda: tf.matrix_diag(tf.transpose(Yvar)),
                   lambda: Yvar)
    Yvar = tf.cond(tf.equal(rank, 2),
                   lambda: tf.transpose(Yvar, perm=[1, 2, 0]),
                   lambda: Yvar)

    N = tf.shape(Yvar)[0]
    D = tf.shape(Yvar)[2]
    L = tf.cholesky(tf.square(tf.transpose(self.A)))
    Yvar = tf.reshape(Yvar, [N * N, D])
    scaled_var = tf.reshape(
        tf.transpose(tf.cholesky_solve(L, tf.transpose(Yvar))), [N, N, D])
    return tf.cond(tf.equal(rank, 2),
                   lambda: tf.reduce_sum(scaled_var, axis=1),
                   lambda: scaled_var)
def build_backward(self, Y):
    """
    TensorFlow implementation of the inverse mapping
    """
    L = tf.cholesky(tf.transpose(self.A))
    XT = tf.cholesky_solve(L, tf.transpose(Y - self.b))
    return tf.transpose(XT)
def inference(self, features, outputs, is_train):
    """Build graph for computing predictive mean and variance and negative
    log probability.

    Args:
        features: input features
        outputs: targets
        is_train: whether we're training
    Returns:
        negative log marginal likelihood
    """
    inputs = features['input']
    assignments = []
    if is_train:
        # During training, we have to store the training data to compute
        # predictions later on.
        assignments.append(self.train_inputs.assign(inputs))
        assignments.append(self.train_outputs.assign(outputs))
    # This ensures that the assignments are executed.
    with tf.control_dependencies(assignments):
        chol, alpha = self._build_interim_vals(inputs, outputs)
    # precision = inv(kxx)
    precision = tf.cholesky_solve(chol, tf.eye(tf.shape(inputs)[-2]))
    precision_diag = tf.matrix_diag_part(precision)

    loo_fmu = outputs - alpha / precision_diag  # GPML book eq. 5.12
    loo_fs2 = 1.0 / precision_diag              # GPML book eq. 5.12

    # Log probability (lp), also called log pseudo-likelihood.
    lp = self._build_loo(outputs, loo_fmu, loo_fs2)

    return {'loss': -lp, 'LP': lp}, []
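# The leave-one-out moments used above are eq. 5.12 of Rasmussen & Williams,
# "Gaussian Processes for Machine Learning":
#   mu_i      = y_i - [K^{-1} y]_i / [K^{-1}]_{ii}
#   sigma_i^2 = 1 / [K^{-1}]_{ii}
# with alpha = K^{-1} y obtained from a single Cholesky solve, so all N
# leave-one-out predictions come from one factorization of K.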
def nlml(self, Xu, Xf, Yu, Yf, dt, hyp1, hyp3, hyp5, sig_n, lambda1, lambda2,
         un_u, un_f, kernel_type, jitter=1.0e-10):
    # Negative log marginal likelihood.
    Nu = Xu[0].shape[0] + Xu[1].shape[0] + Xu[2].shape[0]
    N = Nu + 3 * Xf.shape[0]
    self.K0 = self.kernel_uf_train(Xu, Xf, hyp1, hyp3, hyp5,
                                   self.a, self.b, self.c,
                                   lambda1, lambda2, un_u, un_f, dt)
    K = self.K0 + (sig_n**2 + jitter) * tf.eye(N, dtype=tf.float64)
    self.L = tf.cholesky(K)
    r = np.concatenate((Yu[0], Yu[1], Yu[2], Yf, Yf, Yf), axis=0) \
        - np.concatenate((np.zeros((Nu, 1), dtype=np.float64),
                          self.prior_mean_train[0],
                          self.prior_mean_train[1],
                          self.prior_mean_train[2]), axis=0)
    self.alpha = tf.cholesky_solve(self.L, r)
    self.sig2_tf = tf.matmul(r, self.alpha, transpose_a=True) / N
    return 0.5 * N * tf.log(2.0 * np.pi * self.sig2_tf) \
        + tf.reduce_sum(tf.log(tf.diag_part(self.L))) \
        + N / 2.0
def _expectation(p, mean, none, kern, feat, nghp=None):
    """
    Compute the expectation:
    expectation[n] = <x_n K_{x_n, Z}>_p(x_n)
        - K_{.,.} :: RBF kernel

    :return: NxDxM
    """
    Xmu, Xcov = p.mu, p.cov

    with tf.control_dependencies([tf.assert_equal(
            tf.shape(Xmu)[1], tf.constant(kern.input_dim, settings.tf_int),
            message="Currently cannot handle slicing in exKxz.")]):
        Xmu = tf.identity(Xmu)

    with params_as_tensors_for(kern, feat):
        D = tf.shape(Xmu)[1]
        lengthscales = kern.lengthscales if kern.ARD \
            else tf.zeros((D,), dtype=settings.float_type) + kern.lengthscales

        chol_L_plus_Xcov = tf.cholesky(tf.matrix_diag(lengthscales ** 2) + Xcov)  # NxDxD
        all_diffs = tf.transpose(feat.Z) - tf.expand_dims(Xmu, 2)  # NxDxM

        sqrt_det_L = tf.reduce_prod(lengthscales)
        sqrt_det_L_plus_Xcov = tf.exp(tf.reduce_sum(
            tf.log(tf.matrix_diag_part(chol_L_plus_Xcov)), axis=1))
        determinants = sqrt_det_L / sqrt_det_L_plus_Xcov  # N

        exponent_mahalanobis = tf.cholesky_solve(chol_L_plus_Xcov, all_diffs)  # NxDxM
        non_exponent_term = tf.matmul(Xcov, exponent_mahalanobis, transpose_a=True)
        non_exponent_term = tf.expand_dims(Xmu, 2) + non_exponent_term  # NxDxM

        exponent_mahalanobis = tf.reduce_sum(all_diffs * exponent_mahalanobis, 1)  # NxM
        exponent_mahalanobis = tf.exp(-0.5 * exponent_mahalanobis)  # NxM

        return kern.variance * (determinants[:, None] * exponent_mahalanobis)[:, None, :] * non_exponent_term
def update_W_external(self, X, Y):
    Kdiag = self.kern.Kdiag(X, full_output_cov=False)
    Kux = features.Kuf(self.feature, self.kern, X)
    # Kuu is shared across output dimensions (one block per dimension).
    Kuu = features.Kuu(self.feature, self.kern, jitter=settings.jitter)
    L = tf.cholesky(Kuu)
    sigma2 = self.likelihood.variance

    A = tf.cholesky_solve(L, Kux)  # K x M x N
    mean = tf.matmul(A, tf.transpose(self.q_mu)[:, :, None], transpose_a=True)
    err = Y - mean

    reg1 = tf.reduce_sum(tf.pow(tf.matmul(A, self.q_sqrt, transpose_a=True), 2), 2)
    reg2 = tf.transpose(Kdiag) - tf.einsum('kmn,kmn->kn', A, Kux)

    logW = -0.5 * tf.log(2 * np.pi * sigma2) \
        - 0.5 * tf.reduce_sum(tf.pow(err, 2), 2) / sigma2 \
        - 0.5 * reg1 / sigma2 - 0.5 * reg2 / sigma2 + tf.log(self.W_prior)[:, None]

    logW = logW - tf.reduce_logsumexp(logW, axis=0, keepdims=True)
    return tf.transpose(logW)
def Bound2(phi_0, phi_1, phi_2, sigma_noise, K_mm, mean_y):
    # Preliminary bound
    beta = 1 / tf.square(sigma_noise)
    bound = 0
    N = h.get_dim(mean_y, 0)
    M = h.get_dim(K_mm, 0)
    W_inv_part = beta * phi_2 + K_mm
    global phi_200
    phi_200 = tf.matrix_solve(W_inv_part, tf.transpose(phi_1))
    W = beta * np.eye(N) - tf.square(beta) * h.Mul(
        phi_1, tf.matrix_solve(W_inv_part, tf.transpose(phi_1)))
    # Computations
    bound += N * tf.log(beta)
    bound += h.log_det(K_mm + 1e-3 * np.eye(M))
    bound -= h.Mul(tf.transpose(mean_y), W, mean_y)
    global matrix_determinant
    matrix_determinant = tf.ones(1)  # disabled: h.log_det(W_inv_part + 1e2*np.eye(M))
    bound -= h.log_det(W_inv_part + 1e-3 * tf.reduce_mean(W_inv_part) * np.eye(M))
    bound -= beta * phi_0
    bound += beta * tf.trace(tf.cholesky_solve(tf.cholesky(K_mm), phi_2))
    bound = bound * 0.5
    return bound
def _kernel(self, X1, X2, jitter=False, debug=False):
    self.inducing_locations = self.context.parameters.get(
        name='inducing_locations_{r}'.format(r=self.context.use_latent_f_direction))

    k_g_zz = self.K1.k1.kernel(self.inducing_locations, self.inducing_locations, jitter=True)
    k_g_z_x2 = self.K2.k1.kernel(self.inducing_locations, X2)
    k_g_x1_z = self.K2.k1.kernel(X1, self.inducing_locations)

    # Nystrom-style approximation through the inducing points.
    K = tf.matmul(k_g_x1_z, tf.cholesky_solve(tf.cholesky(k_g_zz), k_g_z_x2))

    if debug:
        K = tf.Print(K, [X1], 'X1')
        K = tf.Print(K, [k_g_x1_z], 'k_g_x1_z_latent')
        K = tf.Print(K, [k_g_z_x2], 'k_g_z_x2_latent')
        K = tf.Print(K, [K], 'K_latent', summarize=500)

    # The approximation above is currently overridden by the exact product kernel.
    K = self.K1.k1.kernel(X1, X2)
    K = tf.multiply(K, self.K2.k2.kernel(X1, X2, jitter=jitter))

    if jitter:
        K = util.add_jitter(K, self.context.jitter)
    return K
def _expectation(p, mean, none, kern, feat, nghp=None):
    """
    Compute the expectation:
    expectation[n] = <x_n K_{x_n, Z}>_p(x_n)
        - K_{.,.} :: RBF kernel

    :return: NxDxM
    """
    Xmu, Xcov = p.mu, p.cov

    with tf.control_dependencies([tf.assert_equal(
            tf.shape(Xmu)[1], tf.constant(kern.input_dim, settings.tf_int),
            message="Currently cannot handle slicing in exKxz.")]):
        Xmu = tf.identity(Xmu)

    with params_as_tensors_for(kern), params_as_tensors_for(feat):
        D = tf.shape(Xmu)[1]
        lengthscales = kern.lengthscales if kern.ARD \
            else tf.zeros((D,), dtype=settings.float_type) + kern.lengthscales

        chol_L_plus_Xcov = tf.cholesky(tf.matrix_diag(lengthscales ** 2) + Xcov)  # NxDxD
        all_diffs = tf.transpose(feat.Z) - tf.expand_dims(Xmu, 2)  # NxDxM

        sqrt_det_L = tf.reduce_prod(lengthscales)
        sqrt_det_L_plus_Xcov = tf.exp(tf.reduce_sum(
            tf.log(tf.matrix_diag_part(chol_L_plus_Xcov)), axis=1))
        determinants = sqrt_det_L / sqrt_det_L_plus_Xcov  # N

        exponent_mahalanobis = tf.cholesky_solve(chol_L_plus_Xcov, all_diffs)  # NxDxM
        non_exponent_term = tf.matmul(Xcov, exponent_mahalanobis, transpose_a=True)
        non_exponent_term = tf.expand_dims(Xmu, 2) + non_exponent_term  # NxDxM

        exponent_mahalanobis = tf.reduce_sum(all_diffs * exponent_mahalanobis, 1)  # NxM
        exponent_mahalanobis = tf.exp(-0.5 * exponent_mahalanobis)  # NxM

        return kern.variance * (determinants[:, None] * exponent_mahalanobis)[:, None, :] * non_exponent_term
def propagate(self, mu, Sigma):
    """
    Implementation of the function propagate required by GDSM.
    (see template.py)
    """
    do_batch = mu.get_shape().ndims != 2
    batch_shape = tf.shape(mu)[:-2]
    l, L, H = self._propagation_terms(mu, Sigma)

    # Compute predicted mean
    l_rank2 = l if not do_batch else tf.reshape(l, [-1, self._n])
    m = tf.matmul(l_rank2, self._beta)
    if do_batch:
        m = tf.reshape(m, tf.concat([batch_shape, [self.output_dim, 1]], 0))
    else:
        m = tf.transpose(m)

    L_rank2 = L if not do_batch else tf.reshape(L, [-1, self._n])

    # Compute predicted output variance
    temp = tf.matmul(L_rank2, self._beta)
    if do_batch:
        temp = tf.reshape(temp, [-1, self._n, self.output_dim])
        temp = tf.matrix_transpose(temp)
        temp = tf.reshape(temp, [-1, self._n])
        temp = tf.matmul(temp, self._beta)
        temp = tf.reshape(
            temp, tf.concat([batch_shape, [self.output_dim, self.output_dim]], 0))
        L_rank2 = tf.transpose(L_rank2)
    else:
        temp = tf.matmul(temp, self._beta, transpose_a=True)
    C = temp - tf.matmul(m, m, transpose_b=True)
    temp = tf.cholesky_solve(self._Kchol, L_rank2)
    if do_batch:
        temp = tf.reshape(
            tf.transpose(temp),
            tf.concat([batch_shape, [1, 1, self._n, self._n]], 0))
    C += self._Iout * (self._sigma2 + self._sigma_noise2 - tf.trace(temp))

    # Compute input/output covariance
    C_oi = tf.matmul(l * tf.transpose(self._beta), H - mu, transpose_b=True)

    if self._id_mean:
        m += mu
        C += Sigma + C_oi + tf.matrix_transpose(C_oi)
        C_oi += Sigma

    return m, C, C_oi
def _expectation(p, rbf_kern, feat1, lin_kern, feat2, nghp=None):
    """
    Compute the expectation:
    expectation[n] = <Ka_{Z1, x_n} Kb_{x_n, Z2}>_p(x_n)
        - K_lin_{.,.} :: RBF kernel
        - K_rbf_{.,.} :: Linear kernel
    Different Z1 and Z2 are handled if p is diagonal and K_lin and K_rbf have
    disjoint active_dims, in which case the joint expectations simplify into a
    product of expectations

    :return: NxM1xM2
    """
    if rbf_kern.on_separate_dims(lin_kern) and isinstance(p, DiagonalGaussian):
        # no joint expectations required
        eKxz1 = expectation(p, (rbf_kern, feat1))
        eKxz2 = expectation(p, (lin_kern, feat2))
        return eKxz1[:, :, None] * eKxz2[:, None, :]

    if feat1 != feat2:
        raise NotImplementedError("Features have to be the same for both kernels.")

    if rbf_kern.active_dims != lin_kern.active_dims:
        raise NotImplementedError("active_dims have to be the same for both kernels.")

    with params_as_tensors_for(rbf_kern), params_as_tensors_for(lin_kern), \
         params_as_tensors_for(feat1), params_as_tensors_for(feat2):
        # use only active dimensions
        Xcov = rbf_kern._slice_cov(
            tf.matrix_diag(p.cov) if isinstance(p, DiagonalGaussian) else p.cov)
        Z, Xmu = rbf_kern._slice(feat1.Z, p.mu)

        N = tf.shape(Xmu)[0]
        D = tf.shape(Xmu)[1]

        lin_kern_variances = lin_kern.variance if lin_kern.ARD \
            else tf.zeros((D,), dtype=settings.tf_float) + lin_kern.variance
        rbf_kern_lengthscales = rbf_kern.lengthscales if rbf_kern.ARD \
            else tf.zeros((D,), dtype=settings.tf_float) + rbf_kern.lengthscales

        ## Begin RBF eKxz code:
        chol_L_plus_Xcov = tf.cholesky(tf.matrix_diag(rbf_kern_lengthscales ** 2) + Xcov)  # NxDxD

        Z_transpose = tf.transpose(Z)
        all_diffs = Z_transpose - tf.expand_dims(Xmu, 2)  # NxDxM
        exponent_mahalanobis = tf.matrix_triangular_solve(chol_L_plus_Xcov, all_diffs, lower=True)  # NxDxM
        exponent_mahalanobis = tf.reduce_sum(tf.square(exponent_mahalanobis), 1)  # NxM
        exponent_mahalanobis = tf.exp(-0.5 * exponent_mahalanobis)  # NxM

        sqrt_det_L = tf.reduce_prod(rbf_kern_lengthscales)
        sqrt_det_L_plus_Xcov = tf.exp(tf.reduce_sum(
            tf.log(tf.matrix_diag_part(chol_L_plus_Xcov)), axis=1))
        determinants = sqrt_det_L / sqrt_det_L_plus_Xcov  # N

        eKxz_rbf = rbf_kern.variance * (determinants[:, None] * exponent_mahalanobis)  # NxM <- End RBF eKxz code

        tiled_Z = tf.tile(tf.expand_dims(Z_transpose, 0), (N, 1, 1))  # NxDxM
        z_L_inv_Xcov = tf.matmul(tiled_Z, Xcov / rbf_kern_lengthscales[:, None] ** 2.,
                                 transpose_a=True)  # NxMxD

        cross_eKzxKxz = tf.cholesky_solve(
            chol_L_plus_Xcov,
            (lin_kern_variances * rbf_kern_lengthscales ** 2.)[..., None] * tiled_Z)  # NxDxM

        cross_eKzxKxz = tf.matmul((z_L_inv_Xcov + Xmu[:, None, :]) * eKxz_rbf[..., None],
                                  cross_eKzxKxz)  # NxMxM
        return cross_eKzxKxz
def _build_cross_entropy_sum(self, k1, m1, s1, n, debug=False):
    k_chol = tf.cholesky(k1)
    m1 = tf.expand_dims(m1, 1)
    d = tf.trace(tf.cholesky_solve(k_chol, s1))
    p = util.log_normal_chol(x=0.0, mu=m1, chol=k_chol, n=n)
    result = p - 0.5 * d
    return result
def predict2():
    # predictions
    cov = h.Mul(K_mm_2, tf.matrix_inverse(K_mm_2 + K_mnnm_2 / tf.square(sigma_2)), K_mm_2)
    cov_chol = tf.cholesky(cov)
    mu = h.Mul(K_mm_2, tf.cholesky_solve(cov_chol, K_mn_2), Ytr) / tf.square(sigma_2)
    mean = h.Mul(K_nm_2, tf.matrix_solve(K_mm_1, mu))
    variance = K_nn_2 - h.Mul(K_nm_2, h.safe_chol(K_mm_2, tf.transpose(K_nm_2)))
    var_terms = 2 * tf.sqrt(tf.reshape(tf.diag_part(variance) + tf.square(sigma_2), [N, 1]))
    return mean, var_terms
def _cho():
    # batch_size, n, n
    L = tf.cholesky(Kf, name='L')
    # batch_size, n, 1
    alpha = tf.cholesky_solve(L, dy, name='alpha')
    data_fit = 0.5 * tf.reduce_sum(dy * alpha, axis=-1)[..., 0]
    complexity = tf.trace(tf.log(L))
    scale = 0.5 * n * np.log(2. * np.pi)
    return data_fit + complexity + scale
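# The three terms above form the standard GP negative log marginal likelihood:
#   -log p(y) = 0.5 * y^T K^{-1} y  +  0.5 * log det K  +  0.5 * n * log(2*pi)
# Since K = L L^T, log det K = 2 * sum(log diag(L)); tf.trace(tf.log(L)) sums
# the log of the diagonal of L, i.e. it evaluates 0.5 * log det K directly.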
def test_works_with_five_different_random_pos_def_matricies(self):
    with self.test_session():
        for n in range(1, 6):
            for np_type in [np.float32, np.float64]:
                matrix = _random_pd_matrix(n, self.rng).astype(np_type)
                chol = tf.cholesky(matrix)
                for k in range(1, 3):
                    rhs = self.rng.randn(n, k).astype(np_type)
                    x = tf.cholesky_solve(chol, rhs)
                    self.assertAllClose(rhs, tf.matmul(matrix, x).eval(), atol=1e-4)
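# For reference, tf.cholesky_solve(chol, rhs) returns the x with A x = rhs
# given A = chol @ chol^T, computed via two triangular solves:
#   chol y = rhs,  then  chol^T x = y   =>   A x = chol chol^T x = rhs
# which is exactly the identity the assertAllClose above verifies.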
def Bound1(y, S, Kmm, Knm, Tr_Knn, sigma):
    # matrices to be used
    Kmm_chol = tf.cholesky(Kmm)
    sig_2 = tf.square(sigma)
    N = h.get_dim(y, 0)
    Q_nn = h.Mul(Knm, tf.cholesky_solve(Kmm_chol, tf.transpose(Knm)))
    Q_I_chol = tf.cholesky(sig_2 * np.eye(N) + Q_nn)
    bound = -0.5 * (Tr_Knn - Q_nn) / sig_2
    bound += h.multivariate_normal(y, tf.zeros([N, 1], dtype=tf.float32), Q_I_chol)
    bound -= 0.5 * tf.reduce_sum(S) / sig_2 + 0.1 * 0.5 * tf.reduce_sum(tf.log(S))
    return bound
def testDiffusionBehavesCorrectly(self):
    """Test that SGLD finds the minimum of the 3D Gaussian energy."""
    with self.test_session(graph=tf.Graph()) as sess:
        # Set up random seed for the optimizer
        tf.set_random_seed(42)
        dtype = np.float32
        true_mean = dtype([0, 0, 0])
        true_cov = dtype([[1, 0.25, 0.25], [0.25, 1, 0.25], [0.25, 0.25, 1]])
        # The loss is defined through the Cholesky decomposition
        chol = tf.linalg.cholesky(true_cov)
        var_1 = tf.get_variable('var_1', initializer=[1., 1.])
        var_2 = tf.get_variable('var_2', initializer=[1.])

        var = tf.concat([var_1, var_2], axis=-1)
        # Partially defined loss function
        loss_part = tf.cholesky_solve(chol, tf.expand_dims(var, -1))
        # Loss function
        loss = 0.5 * tf.squeeze(tf.matmul(loss_part, tf.expand_dims(var, -1),
                                          transpose_a=True))

        # Set up the learning rate with a polynomial decay
        global_step = tf.Variable(0, trainable=False)
        starter_learning_rate = .3
        end_learning_rate = 1e-4
        decay_steps = 1e4
        learning_rate = tf.train.polynomial_decay(starter_learning_rate,
                                                  global_step, decay_steps,
                                                  end_learning_rate, power=1.)

        # Set up the optimizer
        optimizer_kernel = tfp.optimizer.StochasticGradientLangevinDynamics(
            learning_rate=learning_rate, preconditioner_decay_rate=0.99)
        optimizer = optimizer_kernel.minimize(loss)

        init = tf.global_variables_initializer()
        # Number of training steps
        training_steps = 5000
        # Record the steps and treat them as samples
        samples = [np.zeros([training_steps, 2]), np.zeros([training_steps, 1])]

        sess.run(init)
        for step in range(training_steps):
            sess.run([optimizer, loss])
            sample = [sess.run(var_1), sess.run(var_2)]
            samples[0][step, :] = sample[0]
            samples[1][step, :] = sample[1]

        samples_ = np.concatenate(samples, axis=-1)
        sample_mean = np.mean(samples_, 0)
        self.assertAllClose(sample_mean, true_mean, atol=0.1, rtol=0.1)
def test_works_with_five_different_random_pos_def_matrices(self):
    with self.test_session():
        for n in range(1, 6):
            for np_type, atol in [(np.float32, 0.05), (np.float64, 1e-5)]:
                # Create 2 x n x n matrix
                array = np.array(
                    [_random_pd_matrix(n, self.rng), _random_pd_matrix(n, self.rng)]
                ).astype(np_type)
                chol = tf.cholesky(array)
                for k in range(1, 3):
                    rhs = self.rng.randn(2, n, k).astype(np_type)
                    x = tf.cholesky_solve(chol, rhs)
                    self.assertAllClose(rhs, tf.matmul(array, x).eval(), atol=atol)
def safe_chol(A, RHS):
    conditioned = condition((A + tf.transpose(A)) / 2)
    chol = tf.cholesky(conditioned)
    return tf.cholesky_solve(chol, RHS)
def build_model(self):
    """Defines the GP model.

    The loss is computed for partial feedback settings (bandits), so only
    the observed outcome is backpropagated (see weighted loss).
    Selects the optimizer and, finally, it also initializes the graph.
    """
    logging.info("Initializing model %s.", self.name)
    self.global_step = tf.train.get_or_create_global_step()

    # Define state for the model (inputs, etc.)
    self.x_train = tf.get_variable(
        "training_data",
        initializer=tf.ones([self.hparams.batch_size, self.n_in], dtype=tf.float64),
        validate_shape=False,
        trainable=False)
    self.y_train = tf.get_variable(
        "training_labels",
        initializer=tf.zeros([self.hparams.batch_size, 1], dtype=tf.float64),
        validate_shape=False,
        trainable=False)
    self.weights_train = tf.get_variable(
        "weights_train",
        initializer=tf.ones([self.hparams.batch_size, self.n_out], dtype=tf.float64),
        validate_shape=False,
        trainable=False)
    self.input_op = tf.assign(self.x_train, self.x_in, validate_shape=False)
    self.input_w_op = tf.assign(self.weights_train, self.weights, validate_shape=False)

    self.input_std = tf.get_variable(
        "data_standard_deviation",
        initializer=tf.ones([1, self.n_out], dtype=tf.float64),
        dtype=tf.float64,
        trainable=False)
    self.input_mean = tf.get_variable(
        "data_mean",
        initializer=tf.zeros([1, self.n_out], dtype=tf.float64),
        dtype=tf.float64,
        trainable=True)

    # GP Hyperparameters
    self.noise = tf.get_variable("noise", initializer=tf.cast(0.0, dtype=tf.float64))
    self.amplitude = tf.get_variable("amplitude", initializer=tf.cast(1.0, dtype=tf.float64))
    self.amplitude_linear = tf.get_variable(
        "linear_amplitude", initializer=tf.cast(1.0, dtype=tf.float64))
    self.length_scales = tf.get_variable(
        "length_scales", initializer=tf.zeros([1, self.n_in], dtype=tf.float64))
    self.length_scales_lin = tf.get_variable(
        "length_scales_linear", initializer=tf.zeros([1, self.n_in], dtype=tf.float64))

    # Latent embeddings of the different outputs for task covariance
    self.task_vectors = tf.get_variable(
        "latent_task_vectors",
        initializer=tf.random_normal([self.n_out, self.task_latent_dim], dtype=tf.float64))

    # Normalize outputs across each dimension.
    # Since we have different numbers of observations across each task, we
    # normalize by their respective counts.
    index_counts = self.atleast_2d(tf.reduce_sum(self.weights, axis=0), self.n_out)
    index_counts = tf.where(index_counts > 0, index_counts,
                            tf.ones(tf.shape(index_counts), dtype=tf.float64))
    self.mean_op = tf.assign(self.input_mean,
                             tf.reduce_sum(self.y, axis=0) / index_counts)
    self.var_op = tf.assign(
        self.input_std,
        tf.sqrt(1e-4 + tf.reduce_sum(tf.square(
            self.y - tf.reduce_sum(self.y, axis=0) / index_counts), axis=0) / index_counts))

    with tf.control_dependencies([self.var_op]):
        y_normed = self.atleast_2d((self.y - self.input_mean) / self.input_std, self.n_out)
        y_normed = self.atleast_2d(tf.boolean_mask(y_normed, self.weights > 0), 1)
    self.out_op = tf.assign(self.y_train, y_normed, validate_shape=False)

    # Observation noise
    alpha = tf.nn.softplus(self.noise) + 1e-6

    # Covariance
    with tf.control_dependencies([self.input_op, self.input_w_op, self.out_op]):
        self.self_cov = (self.cov(self.x_in, self.x_in) *
                         self.task_cov(self.weights, self.weights) +
                         tf.eye(tf.shape(self.x_in)[0], dtype=tf.float64) * alpha)

    self.chol = tf.cholesky(self.self_cov)
    self.kinv = tf.cholesky_solve(self.chol,
                                  tf.eye(tf.shape(self.x_in)[0], dtype=tf.float64))

    self.input_inv = tf.Variable(
        tf.eye(self.hparams.batch_size, dtype=tf.float64),
        validate_shape=False,
        trainable=False)
    self.input_cov_op = tf.assign(self.input_inv, self.kinv, validate_shape=False)

    # Log determinant by taking the singular values along the diagonal of self.chol
    with tf.control_dependencies([self.input_cov_op]):
        logdet = 2.0 * tf.reduce_sum(tf.log(tf.diag_part(self.chol) + 1e-16))

    # Log marginal likelihood
    self.marginal_ll = -tf.reduce_sum(
        -0.5 * tf.matmul(tf.transpose(y_normed), tf.matmul(self.kinv, y_normed)) -
        0.5 * logdet - 0.5 * self.n * np.log(2 * np.pi))

    zero = tf.cast(0., dtype=tf.float64)
    one = tf.cast(1., dtype=tf.float64)
    standard_normal = tfd.Normal(loc=zero, scale=one)

    # Loss is marginal likelihood and priors
    self.loss = tf.reduce_sum(
        self.marginal_ll -
        (standard_normal.log_prob(self.amplitude) +
         standard_normal.log_prob(tf.exp(self.noise)) +
         standard_normal.log_prob(self.amplitude_linear) +
         tfd.Normal(loc=zero, scale=one * 10.).log_prob(self.task_vectors)))

    # Optimizer for hyperparameters
    optimizer = tf.train.AdamOptimizer(learning_rate=self.hparams.lr)
    vars_to_optimize = [
        self.amplitude, self.length_scales, self.length_scales_lin,
        self.amplitude_linear, self.noise, self.input_mean
    ]
    if self.learn_embeddings:
        vars_to_optimize.append(self.task_vectors)
    grads = optimizer.compute_gradients(self.loss, vars_to_optimize)
    self.train_op = optimizer.apply_gradients(grads, global_step=self.global_step)

    # Predictions for test data
    self.y_mean, self.y_pred = self.posterior_mean_and_sample(self.x)

    # Create tensorboard metrics
    self.create_summaries()
    self.summary_writer = tf.summary.FileWriter(
        "{}/graph_{}".format(FLAGS.logdir, self.name), self.sess.graph)
    self.check = tf.add_check_numerics_ops()
def solve_linear(A, L, w_x, w_y):
    rhs = w_x - tf.matmul(A, w_y, transpose_a=True)
    z_x = tf.cholesky_solve(L, rhs)
    z_y = w_y + tf.matmul(A, z_x)
    return z_x, z_y
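# Illustrative NumPy check of solve_linear's algebra (shapes hypothetical):
# with M = L @ L.T, it computes z_x = M^{-1} (w_x - A^T w_y), z_y = w_y + A z_x.
import numpy as np

rng = np.random.RandomState(1)
A = rng.randn(3, 2)
M = A.T @ A + 2.0 * np.eye(2)  # any SPD matrix works as M here
L = np.linalg.cholesky(M)
w_x, w_y = rng.randn(2, 1), rng.randn(3, 1)
z_x = np.linalg.solve(M, w_x - A.T @ w_y)
z_y = w_y + A @ z_x
assert np.allclose(M @ z_x, w_x - A.T @ w_y)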
def safe_chol(A, RHS):
    chol = tf.cholesky(condition(A))
    return tf.cholesky_solve(chol, RHS)
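# A self-contained variant of safe_chol (a sketch only; the project's
# `condition` helper is not shown here, so its behavior is approximated by
# symmetrising A and adding a small diagonal jitter so the Cholesky
# factorization succeeds on nearly-PSD input):
def jittered_chol_solve(A, RHS, jitter=1e-6):
    A_sym = 0.5 * (A + tf.transpose(A))
    n = tf.shape(A)[0]
    chol = tf.cholesky(A_sym + jitter * tf.eye(n, dtype=A.dtype))
    return tf.cholesky_solve(chol, RHS)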