def build_predict(self, Xnew, full_cov=False): """ Compute the mean and variance of the latent function at some new points Xnew. Note that this is very similar to the SGPR prediction, for whcih there are notes in the SGPR notebook. """ num_inducing = tf.shape(self.Z)[0] psi0, psi1, psi2 = ke.build_psi_stats(self.Z, self.kern, self.X_mean, self.X_var) Kuu = self.kern.K(self.Z) + eye(num_inducing) * 1e-6 Kus = self.kern.K(self.Z, Xnew) sigma2 = self.likelihood.variance sigma = tf.sqrt(sigma2) L = tf.cholesky(Kuu) A = tf.matrix_triangular_solve(L, tf.transpose(psi1), lower=True) / sigma tmp = tf.matrix_triangular_solve(L, psi2, lower=True) AAT = tf.matrix_triangular_solve(L, tf.transpose(tmp), lower=True) / sigma2 B = AAT + eye(num_inducing) LB = tf.cholesky(B) c = tf.matrix_triangular_solve(LB, tf.matmul(A, self.Y), lower=True) / sigma tmp1 = tf.matrix_triangular_solve(L, Kus, lower=True) tmp2 = tf.matrix_triangular_solve(LB, tmp1, lower=True) mean = tf.matmul(tf.transpose(tmp2), c) if full_cov: var = self.kern.K(Xnew) + tf.matmul(tf.transpose(tmp2), tmp2)\ - tf.matmul(tf.transpose(tmp1), tmp1) shape = tf.pack([1, 1, tf.shape(self.Y)[1]]) var = tf.tile(tf.expand_dims(var, 2), shape) else: var = self.kern.Kdiag(Xnew) + tf.reduce_sum(tf.square(tmp2), 0)\ - tf.reduce_sum(tf.square(tmp1), 0) shape = tf.pack([1, tf.shape(self.Y)[1]]) var = tf.tile(tf.expand_dims(var, 1), shape) return mean + self.mean_function(Xnew), var
def build_predict(self, Xnew, full_cov=False):
    err = self.Y
    Kuf = self.RBF(self.Z, self.X)
    Kuu = self.RBF(self.Z, self.Z) + eye(self.num_inducing) * 1e-6
    Kus = self.RBF(self.Z, Xnew)
    sigma = tf.sqrt(self.likelihood_variance)
    L = tf.cholesky(Kuu)
    A = tf.matrix_triangular_solve(L, Kuf, lower=True) / sigma
    B = tf.matmul(A, tf.transpose(A)) + eye(self.num_inducing)
    LB = tf.cholesky(B)
    Aerr = tf.matmul(A, err)
    c = tf.matrix_triangular_solve(LB, Aerr, lower=True) / sigma
    tmp1 = tf.matrix_triangular_solve(L, Kus, lower=True)
    tmp2 = tf.matrix_triangular_solve(LB, tmp1, lower=True)
    mean = tf.matmul(tf.transpose(tmp2), c)
    if full_cov:
        var = self.RBF(Xnew, Xnew) + tf.matmul(tf.transpose(tmp2), tmp2) \
            - tf.matmul(tf.transpose(tmp1), tmp1)
        shape = tf.pack([1, 1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 2), shape)
    else:
        # marginal variances: only the diagonal of the test kernel matrix is needed
        var = tf.diag_part(self.RBF(Xnew, Xnew)) + tf.reduce_sum(tf.square(tmp2), 0) \
            - tf.reduce_sum(tf.square(tmp1), 0)
        shape = tf.pack([1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 1), shape)
    return mean, var
def build_likelihood(self):
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood. For a derivation of the terms in here, see the associated
    SGPR notebook.
    """
    num_inducing = tf.shape(self.Z)[0]
    num_data = tf.shape(self.Y)[0]
    output_dim = tf.shape(self.Y)[1]

    err = self.Y - self.mean_function(self.X)
    Kdiag = self.kern.Kdiag(self.X)
    Kuf = self.kern.K(self.Z, self.X)
    Kuu = self.kern.K(self.Z) + eye(num_inducing) * 1e-6
    L = tf.cholesky(Kuu)

    # Compute intermediate matrices
    A = tf.matrix_triangular_solve(L, Kuf, lower=True) * tf.sqrt(1. / self.likelihood.variance)
    AAT = tf.matmul(A, tf.transpose(A))
    B = AAT + eye(num_inducing)
    LB = tf.cholesky(B)
    c = tf.matrix_triangular_solve(LB, tf.matmul(A, err), lower=True) * tf.sqrt(1. / self.likelihood.variance)

    # compute log marginal bound
    bound = -0.5 * tf.cast(num_data * output_dim, tf.float64) * np.log(2 * np.pi)
    bound += -tf.cast(output_dim, tf.float64) * tf.reduce_sum(tf.log(tf.user_ops.get_diag(LB)))
    bound += -0.5 * tf.cast(num_data * output_dim, tf.float64) * tf.log(self.likelihood.variance)
    bound += -0.5 * tf.reduce_sum(tf.square(err)) / self.likelihood.variance
    bound += 0.5 * tf.reduce_sum(tf.square(c))
    bound += -0.5 * (tf.reduce_sum(Kdiag) / self.likelihood.variance - tf.reduce_sum(tf.user_ops.get_diag(AAT)))

    return bound
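A quick way to sanity-check the log-determinant term in this collapsed bound is the identity |Q_ff + sigma^2 I| = sigma^(2N) |B|, with B = I + A A^T as above. The following standalone NumPy sketch verifies it; the `rbf` helper, the input sizes, and the noise value are illustrative assumptions, not part of the model code above.

import numpy as np
from scipy.spatial.distance import cdist

rng = np.random.RandomState(0)
X = rng.randn(50, 1)          # hypothetical training inputs
Z = rng.randn(7, 1)           # hypothetical inducing inputs
sigma2 = 0.1

def rbf(A, B):
    # unit-variance, unit-lengthscale squared-exponential kernel
    return np.exp(-0.5 * cdist(A, B, 'sqeuclidean'))

Kuu = rbf(Z, Z) + 1e-6 * np.eye(len(Z))
Kuf = rbf(Z, X)
L = np.linalg.cholesky(Kuu)
A = np.linalg.solve(L, Kuf) / np.sqrt(sigma2)
B = np.eye(len(Z)) + A @ A.T
LB = np.linalg.cholesky(B)

Qff = Kuf.T @ np.linalg.solve(Kuu, Kuf)
lhs = np.linalg.slogdet(Qff + sigma2 * np.eye(len(X)))[1]
rhs = len(X) * np.log(sigma2) + 2 * np.sum(np.log(np.diag(LB)))
assert np.allclose(lhs, rhs)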
def testNonSquareMatrix(self):
    with self.assertRaises(ValueError):
        tf.cholesky(np.array([[1., 2., 3.], [3., 4., 5.]]))
    with self.assertRaises(ValueError):
        tf.cholesky(
            np.array([[[1., 2., 3.], [3., 4., 5.]],
                      [[1., 2., 3.], [3., 4., 5.]]]))
def build_predict(self, Xnew, full_cov=False):
    """
    Compute the mean and variance of the latent function at some new points
    Xnew. For a derivation of the terms in here, see the associated SGPR
    notebook.
    """
    num_inducing = tf.shape(self.Z)[0]
    err = self.Y - self.mean_function(self.X)
    Kuf = self.kern.K(self.Z, self.X)
    Kuu = self.kern.K(self.Z) + eye(num_inducing) * 1e-6
    Kus = self.kern.K(self.Z, Xnew)
    sigma = tf.sqrt(self.likelihood.variance)
    L = tf.cholesky(Kuu)
    A = tf.matrix_triangular_solve(L, Kuf, lower=True) / sigma
    B = tf.matmul(A, tf.transpose(A)) + eye(num_inducing)
    LB = tf.cholesky(B)
    Aerr = tf.matmul(A, err)
    c = tf.matrix_triangular_solve(LB, Aerr, lower=True) / sigma
    tmp1 = tf.matrix_triangular_solve(L, Kus, lower=True)
    tmp2 = tf.matrix_triangular_solve(LB, tmp1, lower=True)
    mean = tf.matmul(tf.transpose(tmp2), c)
    if full_cov:
        var = self.kern.K(Xnew) + tf.matmul(tf.transpose(tmp2), tmp2) \
            - tf.matmul(tf.transpose(tmp1), tmp1)
        shape = tf.pack([1, 1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 2), shape)
    else:
        var = self.kern.Kdiag(Xnew) + tf.reduce_sum(tf.square(tmp2), 0) \
            - tf.reduce_sum(tf.square(tmp1), 0)
        shape = tf.pack([1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 1), shape)
    return mean + self.mean_function(Xnew), var
def _build_predict(self, Xnew, full_cov=False):
    """
    Compute the mean and variance of the latent function at some new points
    Xnew. For a derivation of the terms in here, see the associated SGPR
    notebook.
    """
    num_inducing = len(self.feature)
    err = self.Y - self.mean_function(self.X)
    Kuf = self.feature.Kuf(self.kern, self.X)
    Kuu = self.feature.Kuu(self.kern, jitter=settings.numerics.jitter_level)
    Kus = self.feature.Kuf(self.kern, Xnew)
    sigma = tf.sqrt(self.likelihood.variance)
    L = tf.cholesky(Kuu)
    A = tf.matrix_triangular_solve(L, Kuf, lower=True) / sigma
    B = tf.matmul(A, A, transpose_b=True) + tf.eye(num_inducing, dtype=settings.float_type)
    LB = tf.cholesky(B)
    Aerr = tf.matmul(A, err)
    c = tf.matrix_triangular_solve(LB, Aerr, lower=True) / sigma
    tmp1 = tf.matrix_triangular_solve(L, Kus, lower=True)
    tmp2 = tf.matrix_triangular_solve(LB, tmp1, lower=True)
    mean = tf.matmul(tmp2, c, transpose_a=True)
    if full_cov:
        var = self.kern.K(Xnew) + tf.matmul(tmp2, tmp2, transpose_a=True) \
            - tf.matmul(tmp1, tmp1, transpose_a=True)
        shape = tf.stack([1, 1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 2), shape)
    else:
        var = self.kern.Kdiag(Xnew) + tf.reduce_sum(tf.square(tmp2), 0) \
            - tf.reduce_sum(tf.square(tmp1), 0)
        shape = tf.stack([1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 1), shape)
    return mean + self.mean_function(Xnew), var
def compute_upper_bound(self):
    num_data = tf.cast(tf.shape(self.Y)[0], settings.float_type)

    Kdiag = self.kern.Kdiag(self.X)
    Kuu = self.feature.Kuu(self.kern, jitter=settings.numerics.jitter_level)
    Kuf = self.feature.Kuf(self.kern, self.X)

    L = tf.cholesky(Kuu)
    LB = tf.cholesky(Kuu + self.likelihood.variance ** -1.0 * tf.matmul(Kuf, Kuf, transpose_b=True))

    LinvKuf = tf.matrix_triangular_solve(L, Kuf, lower=True)
    # Using the Trace bound, from Titsias' presentation
    c = tf.reduce_sum(Kdiag) - tf.reduce_sum(LinvKuf ** 2.0)
    # Kff = self.kern.K(self.X)
    # Qff = tf.matmul(Kuf, LinvKuf, transpose_a=True)

    # Alternative bound on max eigenval:
    # c = tf.reduce_max(tf.reduce_sum(tf.abs(Kff - Qff), 0))
    corrected_noise = self.likelihood.variance + c

    const = -0.5 * num_data * tf.log(2 * np.pi * self.likelihood.variance)
    logdet = tf.reduce_sum(tf.log(tf.diag_part(L))) - tf.reduce_sum(tf.log(tf.diag_part(LB)))

    LC = tf.cholesky(Kuu + corrected_noise ** -1.0 * tf.matmul(Kuf, Kuf, transpose_b=True))
    v = tf.matrix_triangular_solve(LC, corrected_noise ** -1.0 * tf.matmul(Kuf, self.Y), lower=True)
    quad = -0.5 * corrected_noise ** -1.0 * tf.reduce_sum(self.Y ** 2.0) + 0.5 * tf.reduce_sum(v ** 2.0)

    return const + logdet + quad
def gauss_kl(min_q_mu, q_sq,K): q_mu=-1*min_q_mu #q_sqrt=tf.cholesky(tf.squeeze(q_sqrt)) # K is a variance...we sqrt later ''' N=1 Q=5 q_mu=tf.random_normal([Q,1],dtype=tf.float64) q_var=tf.random_normal([Q,Q],dtype=tf.float64) q_var=q_var+tf.transpose(q_var [1,0])+1e+1*np.eye(Q) K=q_var q_sqrt=tf.cholesky(q_var) q_sqrt=tf.expand_dims(q_sqrt,-1) num_latent=1 s=tf.Session() s.run(tf.initialize_all_variables()) ''' """ Compute the KL divergence from q(x) = N(q_mu, q_sqrt^2) to p(x) = N(0, K) We assume num_latent independent distributions, given by the columns of q_mu and the last dimension of q_sqrt. q_mu is a matrix, each column contains a mean. q_sqrt is a 3D tensor, each matrix within is a lower triangular square-root matrix of the covariance of q. K is a positive definite matrix: the covariance of p. num_latent is an integer: the number of independent distributions (equal to the columns of q_mu and the last dim of q_sqrt). q_sqrt=tf.cholesky(K) L = tf.cholesky(q_sq) alpha = tf.matrix_triangular_solve(L, q_mu, lower=True) KL = 0.5 * tf.reduce_sum(tf.square(alpha)) # Mahalanobis term. KL += 0.5 * tf.reduce_sum( tf.log(tf.square(tf.diag_part(L)))) # Prior log-det term. KL += -0.5 * tf.cast(tf.shape(q_sqrt)[0], tf.float64) Lq = tf.batch_matrix_band_part(q_sqrt, -1, 0) # Log determinant of q covariance: KL += -0.5*tf.reduce_sum(tf.log(tf.square(tf.diag_part(Lq)))) LiLq = tf.matrix_triangular_solve(L, Lq, lower=True) KL += 0.5 * tf.reduce_sum(tf.square(LiLq)) # Trace term """ V2=tf.cholesky(K) V1=tf.cholesky(q_sq) KL=h.Mul(tf.transpose(q_mu),tf.cholesky_solve(V2,q_mu)) KL+=tf.trace(tf.cholesky_solve(V2,q_sq)) KL-=h.get_dim(K,0) KL+=tf.reduce_sum(2*tf.log(tf.diag_part(V2))-2*tf.log(tf.diag_part(V1))) return KL/2
def F_bound2_v2(y, S, Kmm, Knm, Kmnnm, Tr_Knn, sigma):
    # matrices to be used
    N = get_dim(y, 0)
    Kmm_chol = tf.cholesky(Kmm)
    Q_nn = tf.square(sigma) * np.eye(N) + Mul(Knm, tf.cholesky_solve(Kmm_chol, tf.transpose(Knm)))
    bound = -0.5 * (Tr_Knn - tf.trace(tf.cholesky_solve(Kmm_chol, Kmnnm))) / tf.square(sigma)
    bound += multivariate_normal(y, tf.zeros([N, 1], dtype=tf.float32), tf.cholesky(Q_nn))
    return bound
def log_det(Z):
    # conditioned = condition(Z)
    # symmetrise Z before factorising, then use log|Z| = 2 * sum(log(diag(chol(Z))))
    Z = (Z + tf.transpose(Z)) / 2
    chol = tf.cholesky(Z)
    logdet = 2 * tf.reduce_sum(tf.log(tf.diag_part(chol)))
    return logdet
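The function above relies on the identity log|Z| = 2 * sum(log(diag(chol(Z)))) for symmetric positive-definite Z. A minimal standalone NumPy check of that identity, on a made-up SPD matrix:

import numpy as np

rng = np.random.RandomState(0)
A = rng.randn(5, 5)
Z = A @ A.T + 5 * np.eye(5)          # a made-up SPD matrix

chol = np.linalg.cholesky(Z)
logdet_chol = 2.0 * np.sum(np.log(np.diag(chol)))
sign, logdet_np = np.linalg.slogdet(Z)
assert sign == 1.0 and np.allclose(logdet_chol, logdet_np)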
def natural_to_meanvarsqrt(nat_1, nat_2):
    var_sqrt_inv = tf.cholesky(-2 * nat_2)
    var_sqrt = _inverse_lower_triangular(var_sqrt_inv)
    S = tf.matmul(var_sqrt, var_sqrt, transpose_a=True)
    mu = tf.matmul(S, nat_1)
    # We need the decomposition of S as L L^T, not as L^T L,
    # hence we need another cholesky.
    return mu, tf.cholesky(S)
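For context, here is a standalone NumPy sketch of the same inversion, assuming the usual Gaussian natural parameters nat_1 = S^{-1} mu and nat_2 = -0.5 * S^{-1}; the test matrices are made up for illustration.

import numpy as np

rng = np.random.RandomState(1)
A = rng.randn(4, 4)
S_true = A @ A.T + np.eye(4)              # made-up covariance
mu_true = rng.randn(4, 1)

nat_1 = np.linalg.solve(S_true, mu_true)
nat_2 = -0.5 * np.linalg.inv(S_true)

# Invert the map as the function above does: chol(-2 * nat_2) is a lower
# square root of S^{-1}; inverting it and forming L^T L recovers S.
var_sqrt_inv = np.linalg.cholesky(-2 * nat_2)
var_sqrt = np.linalg.inv(var_sqrt_inv)
S = var_sqrt.T @ var_sqrt
mu = S @ nat_1

assert np.allclose(S, S_true) and np.allclose(mu, mu_true)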
def multivariate_gaussian_log_density(x, mu, Sigma=None, L=None, prec=None, L_prec=None): """ Assume X is a single vector described by a multivariate Gaussian distribution with x ~ N(mu, Sigma). We accept parameterization in terms of the covariance matrix or its cholesky decomposition L (more efficient if available), or the precision matrix or its cholesky decomposition L_prec. The latter is useful when representing a Gaussian in its natural parameterization. Note that we still require the explicit mean mu (not the natural parameter prec*mu) since I'm too lazy to cover all the permutations of possible arguments (though this should be straightforward). """ s = extract_shape(x) try: n, = s except: n, m = s assert(m==1) if L is None and Sigma is not None: L = tf.cholesky(Sigma) if L_prec is None and prec is not None: L_prec = tf.cholesky(prec) if L is not None: neg_half_logdet = -tf.reduce_sum(tf.log(tf.diag_part(L))) else: assert(L_prec is not None) neg_half_logdet = tf.reduce_sum(tf.log(tf.diag_part(L_prec))) d = tf.reshape(x - mu, (n,1)) if L is not None: alpha = tf.matrix_triangular_solve(L, d, lower=True) exponential_part= tf.reduce_sum(tf.square(alpha)) elif prec is not None: d = tf.reshape(d, (n, 1)) exponential_part = tf.reduce_sum(d * tf.matmul(prec, d)) else: assert(L_prec is not None) d = tf.reshape(d, (1, n)) alpha = tf.matmul(d, L_prec) exponential_part= tf.reduce_sum(tf.square(alpha)) n_log2pi = n * 1.83787706641 logp = -0.5 * n_log2pi logp += neg_half_logdet logp += -0.5 * exponential_part return logp
def Bound1(y, S, Kmm, Knm, Tr_Knn, sigma):
    # matrices to be used
    Kmm_chol = tf.cholesky(Kmm)
    sig_2 = tf.square(sigma)
    N = h.get_dim(y, 0)
    Q_nn = h.Mul(Knm, tf.cholesky_solve(Kmm_chol, tf.transpose(Knm)))
    Q_I_chol = tf.cholesky(sig_2 * np.eye(N) + Q_nn)
    # take the trace of Q_nn so that the correction term is a scalar
    bound = -0.5 * (Tr_Knn - tf.trace(Q_nn)) / sig_2
    bound += h.multivariate_normal(y, tf.zeros([N, 1], dtype=tf.float32), Q_I_chol)
    bound -= 0.5 * tf.reduce_sum(S) / sig_2 + 0.1 * 0.5 * tf.reduce_sum(tf.log(S))
    return bound
def build_predict(self, Xnew, full_cov=False):
    """
    Xnew is a data matrix, point at which we want to predict

    This method computes

        p(F* | Y)

    where F* are points on the GP at Xnew, Y are noisy observations at X.
    """
    Kx = self.kern.K(self.X, Xnew)
    K = self.kern.K(self.X) + eye(self.num_data) * self.likelihood.variance
    L = tf.cholesky(K)
    A = tf.matrix_triangular_solve(L, Kx, lower=True)
    V = tf.matrix_triangular_solve(L, self.Y - self.mean_function(self.X))
    fmean = tf.matmul(tf.transpose(A), V) + self.mean_function(Xnew)
    if full_cov:
        fvar = self.kern.K(Xnew) - tf.matmul(tf.transpose(A), A)
        shape = tf.pack([1, 1, tf.shape(self.Y)[1]])
        fvar = tf.tile(tf.expand_dims(fvar, 2), shape)
    else:
        fvar = self.kern.Kdiag(Xnew) - tf.reduce_sum(tf.square(A), 0)
        fvar = tf.tile(tf.reshape(fvar, (-1, 1)), [1, self.Y.shape[1]])
    return fmean, fvar
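The same algebra in standalone NumPy form may make the two triangular solves easier to follow; the `rbf` helper, the data, the noise value, and the zero mean function are assumptions for illustration only.

import numpy as np
from scipy.linalg import solve_triangular
from scipy.spatial.distance import cdist

rng = np.random.RandomState(0)
X = rng.randn(30, 1)
Xnew = rng.randn(5, 1)
y = np.sin(X)
noise = 0.1

def rbf(A, B):
    return np.exp(-0.5 * cdist(A, B, 'sqeuclidean'))

Kx = rbf(X, Xnew)
K = rbf(X, X) + noise * np.eye(len(X))
L = np.linalg.cholesky(K)
A = solve_triangular(L, Kx, lower=True)       # L^{-1} K_{f*}
V = solve_triangular(L, y, lower=True)        # L^{-1} y

fmean = A.T @ V                               # K_{*f} (K + noise I)^{-1} y
fvar = np.diag(rbf(Xnew, Xnew)) - np.sum(A ** 2, axis=0)

assert np.allclose(fmean, Kx.T @ np.linalg.solve(K, y))
assert np.allclose(fvar, np.diag(rbf(Xnew, Xnew) - Kx.T @ np.linalg.solve(K, Kx)))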
def gauss_kl_diag(q_mu, q_sqrt, K, num_latent):
    """
    Compute the KL divergence from

          q(x) = N(q_mu, q_sqrt^2)
    to
          p(x) = N(0, K)

    We assume num_latent independent distributions, given by the columns of
    q_mu and q_sqrt.

    q_mu is a matrix, each column contains a mean

    q_sqrt is a matrix, each column represents the diagonal of a square-root
        matrix of the covariance of q.

    K is a positive definite matrix: the covariance of p.

    num_latent is an integer: the number of independent distributions (equal
        to the columns of q_mu and q_sqrt).
    """
    L = tf.cholesky(K)
    alpha = tf.matrix_triangular_solve(L, q_mu, lower=True)
    KL = 0.5 * tf.reduce_sum(tf.square(alpha))  # Mahalanobis term.
    KL += num_latent * 0.5 * tf.reduce_sum(
        tf.log(tf.square(tf.diag_part(L))))  # Prior log-det term.
    KL += -0.5 * tf.cast(tf.shape(q_sqrt)[0] * num_latent, tf.float64)
    KL += -0.5 * tf.reduce_sum(tf.log(tf.square(q_sqrt)))  # Log-det of q-cov
    L_inv = tf.matrix_triangular_solve(L, eye(tf.shape(L)[0]), lower=True)
    K_inv = tf.matrix_triangular_solve(tf.transpose(L), L_inv, lower=False)
    KL += 0.5 * tf.reduce_sum(tf.expand_dims(tf.diag_part(K_inv), 1) *
                              tf.square(q_sqrt))  # Trace term.
    return KL
def _build_likelihood(self): """ q_alpha, q_lambda are variational parameters, size N x R This method computes the variational lower bound on the likelihood, which is: E_{q(F)} [ \log p(Y|F) ] - KL[ q(F) || p(F)] with q(f) = N(f | K alpha + mean, [K^-1 + diag(square(lambda))]^-1) . """ K = self.kern.K(self.X) K_alpha = tf.matmul(K, self.q_alpha) f_mean = K_alpha + self.mean_function(self.X) # compute the variance for each of the outputs I = tf.tile(tf.expand_dims(tf.eye(self.num_data, dtype=settings.float_type), 0), [self.num_latent, 1, 1]) A = I + tf.expand_dims(tf.transpose(self.q_lambda), 1) * \ tf.expand_dims(tf.transpose(self.q_lambda), 2) * K L = tf.cholesky(A) Li = tf.matrix_triangular_solve(L, I) tmp = Li / tf.expand_dims(tf.transpose(self.q_lambda), 1) f_var = 1. / tf.square(self.q_lambda) - tf.transpose(tf.reduce_sum(tf.square(tmp), 1)) # some statistics about A are used in the KL A_logdet = 2.0 * tf.reduce_sum(tf.log(tf.matrix_diag_part(L))) trAi = tf.reduce_sum(tf.square(Li)) KL = 0.5 * (A_logdet + trAi - self.num_data * self.num_latent + tf.reduce_sum(K_alpha * self.q_alpha)) v_exp = self.likelihood.variational_expectations(f_mean, f_var, self.Y) return tf.reduce_sum(v_exp) - KL
def initialize(self, *args, **kwargs): # Store latent variables in a temporary attribute; MAP will # optimize `PointMass` random variables, which subsequently # optimizes mean parameters of the normal approximations. latent_vars_normal = self.latent_vars.copy() self.latent_vars = {z: PointMass(params=qz.loc) for z, qz in six.iteritems(latent_vars_normal)} super(Laplace, self).initialize(*args, **kwargs) hessians = tf.hessians(self.loss, list(six.itervalues(self.latent_vars))) self.finalize_ops = [] for z, hessian in zip(six.iterkeys(self.latent_vars), hessians): qz = latent_vars_normal[z] if isinstance(qz, (MultivariateNormalDiag, Normal)): scale_var = get_variables(qz.variance())[0] scale = 1.0 / tf.diag_part(hessian) else: # qz is MultivariateNormalTriL scale_var = get_variables(qz.covariance())[0] scale = tf.matrix_inverse(tf.cholesky(hessian)) self.finalize_ops.append(scale_var.assign(scale)) self.latent_vars = latent_vars_normal.copy() del latent_vars_normal
def main(_):
    ed.set_seed(42)

    # MODEL
    z = MultivariateNormalTriL(
        loc=tf.ones(2),
        scale_tril=tf.cholesky(tf.constant([[1.0, 0.8], [0.8, 1.0]])))

    # INFERENCE
    qz = Empirical(params=tf.get_variable("qz/params", [1000, 2]))

    inference = ed.HMC({z: qz})
    inference.run()

    # CRITICISM
    sess = ed.get_session()
    mean, stddev = sess.run([qz.mean(), qz.stddev()])
    print("Inferred posterior mean:")
    print(mean)
    print("Inferred posterior stddev:")
    print(stddev)

    fig, ax = plt.subplots()
    trace = sess.run(qz.params)
    ax.scatter(trace[:, 0], trace[:, 1], marker=".")
    mvn_plot_contours(z, ax=ax)
    plt.show()
def gauss_kl(q_mu, q_sqrt, K):
    """
    Compute the KL divergence from

          q(x) = N(q_mu, q_sqrt^2)
    to
          p(x) = N(0, K)

    We assume multiple independent distributions, given by the columns of
    q_mu and the last dimension of q_sqrt.

    q_mu is a matrix, each column contains a mean.

    q_sqrt is a 3D tensor, each matrix within is a lower triangular
        square-root matrix of the covariance of q.

    K is a positive definite matrix: the covariance of p.
    """
    L = tf.cholesky(K)
    alpha = tf.matrix_triangular_solve(L, q_mu, lower=True)
    KL = 0.5 * tf.reduce_sum(tf.square(alpha))  # Mahalanobis term.
    num_latent = tf.cast(tf.shape(q_sqrt)[2], float_type)
    KL += num_latent * 0.5 * tf.reduce_sum(tf.log(tf.square(tf.diag_part(L))))  # Prior log-det term.
    KL += -0.5 * tf.cast(tf.reduce_prod(tf.shape(q_sqrt)[1:]), float_type)  # constant term
    Lq = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1, 0)  # force lower triangle
    KL += -0.5 * tf.reduce_sum(tf.log(tf.square(tf.matrix_diag_part(Lq))))  # logdet
    L_tiled = tf.tile(tf.expand_dims(L, 0), tf.pack([tf.shape(Lq)[0], 1, 1]))
    LiLq = tf.matrix_triangular_solve(L_tiled, Lq, lower=True)
    KL += 0.5 * tf.reduce_sum(tf.square(LiLq))  # Trace term
    return KL
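A standalone NumPy check (single latent function, made-up test matrices) that this term-by-term accumulation matches the closed-form Gaussian KL, KL(N(m, S) || N(0, K)) = 0.5 * (tr(K^{-1} S) + m^T K^{-1} m - d + log|K| - log|S|):

import numpy as np

rng = np.random.RandomState(0)
d = 6
q_mu = rng.randn(d, 1)
Lq = np.tril(rng.randn(d, d), -1) + np.diag(rng.rand(d) + 0.5)   # made-up lower-triangular square root of S
S = Lq @ Lq.T
A = rng.randn(d, d)
K = A @ A.T + d * np.eye(d)

L = np.linalg.cholesky(K)
alpha = np.linalg.solve(L, q_mu)
KL = 0.5 * np.sum(alpha ** 2)                          # Mahalanobis term
KL += np.sum(np.log(np.diag(L)))                       # prior log-det term
KL -= 0.5 * d                                          # constant term
KL -= np.sum(np.log(np.diag(Lq)))                      # log-det of q covariance
KL += 0.5 * np.sum(np.linalg.solve(L, Lq) ** 2)        # trace term

Kinv = np.linalg.inv(K)
closed_form = 0.5 * (np.trace(Kinv @ S) + q_mu.T @ Kinv @ q_mu - d
                     + np.linalg.slogdet(K)[1] - np.linalg.slogdet(S)[1])
assert np.allclose(KL, closed_form)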
def runFiniteDifferences(self, shapes, dtypes=(tf.float32, tf.float64), scalarTest=False): with self.test_session(use_gpu=False): for shape in shapes: for batch in False, True: for dtype in dtypes: if not scalarTest: x = tf.constant(np.random.randn(shape[0], shape[1]), dtype) tensor = tf.matmul(x, tf.transpose(x)) / shape[0] else: # This is designed to be a faster test for larger matrices. x = tf.constant(np.random.randn(), dtype) R = tf.constant(np.random.randn(shape[0], shape[1]), dtype) e = tf.mul(R, x) tensor = tf.matmul(e, tf.transpose(e)) / shape[0] # Inner-most matrices in tensor are positive definite. if batch: tensor = tf.tile(tf.expand_dims(tensor, 0), [4, 1, 1]) y = tf.cholesky(tensor) if scalarTest: y = tf.reduce_mean(y) error = tf.test.compute_gradient_error(x, x._shape_as_list(), y, y._shape_as_list()) tf.logging.info("error = %f", error) if dtype == tf.float64: self.assertLess(error, 1e-5) else: self.assertLess(error, 3e-3)
def test_whiten(self):
    """
    Make sure that predicting using the whitened representation is the same
    as the non-whitened one.
    """
    with self.test_context() as sess:
        rng = np.random.RandomState(0)
        Xs, X, F, k, num_data, feed_dict = self.prepare()
        k.compile(session=sess)

        F_sqrt = tf.placeholder(settings.float_type, [num_data, 1])
        F_sqrt_data = rng.rand(num_data, 1)
        feed_dict[F_sqrt] = F_sqrt_data

        K = k.K(X)
        L = tf.cholesky(K)
        V = tf.matrix_triangular_solve(L, F, lower=True)
        V_sqrt = tf.matrix_triangular_solve(L, tf.diag(F_sqrt[:, 0]), lower=True)[None, :, :]

        Fstar_mean, Fstar_var = gpflow.conditionals.conditional(
            Xs, X, k, F, q_sqrt=F_sqrt)
        Fstar_w_mean, Fstar_w_var = gpflow.conditionals.conditional(
            Xs, X, k, V, q_sqrt=V_sqrt, white=True)

        mean_difference = sess.run(Fstar_w_mean - Fstar_mean, feed_dict=feed_dict)
        var_difference = sess.run(Fstar_w_var - Fstar_var, feed_dict=feed_dict)

        assert_allclose(mean_difference, 0, atol=4)
        assert_allclose(var_difference, 0, atol=4)
def gauss_kl(q_mu, q_sqrt, K, num_latent):
    """
    Compute the KL divergence from

          q(x) = N(q_mu, q_sqrt^2)
    to
          p(x) = N(0, K)

    We assume num_latent independent distributions, given by the columns of
    q_mu and the last dimension of q_sqrt.

    q_mu is a matrix, each column contains a mean.

    q_sqrt is a 3D tensor, each matrix within is a lower triangular
        square-root matrix of the covariance of q.

    K is a positive definite matrix: the covariance of p.

    num_latent is an integer: the number of independent distributions (equal
        to the columns of q_mu and the last dim of q_sqrt).
    """
    L = tf.cholesky(K)
    alpha = tf.matrix_triangular_solve(L, q_mu, lower=True)
    KL = 0.5 * tf.reduce_sum(tf.square(alpha))  # Mahalanobis term.
    KL += num_latent * 0.5 * tf.reduce_sum(
        tf.log(tf.square(tf.diag_part(L))))  # Prior log-det term.
    KL += -0.5 * tf.cast(tf.shape(q_sqrt)[0] * num_latent, tf.float64)
    for d in range(num_latent):
        Lq = tf.batch_matrix_band_part(q_sqrt[:, :, d], -1, 0)
        # Log determinant of q covariance:
        KL += -0.5 * tf.reduce_sum(tf.log(tf.square(tf.diag_part(Lq))))
        LiLq = tf.matrix_triangular_solve(L, Lq, lower=True)
        KL += 0.5 * tf.reduce_sum(tf.square(LiLq))  # Trace term
    return KL
def _expectation(p, mean, none, kern, feat, nghp=None): """ Compute the expectation: expectation[n] = <x_n K_{x_n, Z}>_p(x_n) - K_{.,.} :: RBF kernel :return: NxDxM """ Xmu, Xcov = p.mu, p.cov with tf.control_dependencies([tf.assert_equal( tf.shape(Xmu)[1], tf.constant(kern.input_dim, settings.tf_int), message="Currently cannot handle slicing in exKxz.")]): Xmu = tf.identity(Xmu) with params_as_tensors_for(kern), params_as_tensors_for(feat): D = tf.shape(Xmu)[1] lengthscales = kern.lengthscales if kern.ARD \ else tf.zeros((D,), dtype=settings.float_type) + kern.lengthscales chol_L_plus_Xcov = tf.cholesky(tf.matrix_diag(lengthscales ** 2) + Xcov) # NxDxD all_diffs = tf.transpose(feat.Z) - tf.expand_dims(Xmu, 2) # NxDxM sqrt_det_L = tf.reduce_prod(lengthscales) sqrt_det_L_plus_Xcov = tf.exp(tf.reduce_sum(tf.log(tf.matrix_diag_part(chol_L_plus_Xcov)), axis=1)) determinants = sqrt_det_L / sqrt_det_L_plus_Xcov # N exponent_mahalanobis = tf.cholesky_solve(chol_L_plus_Xcov, all_diffs) # NxDxM non_exponent_term = tf.matmul(Xcov, exponent_mahalanobis, transpose_a=True) non_exponent_term = tf.expand_dims(Xmu, 2) + non_exponent_term # NxDxM exponent_mahalanobis = tf.reduce_sum(all_diffs * exponent_mahalanobis, 1) # NxM exponent_mahalanobis = tf.exp(-0.5 * exponent_mahalanobis) # NxM return kern.variance * (determinants[:, None] * exponent_mahalanobis)[:, None, :] * non_exponent_term
def build_likelihood(self):
    """
    q_alpha, q_lambda are variational parameters, size N x R

    This method computes the variational lower bound on the likelihood, which is:

        E_{q(F)} [ \log p(Y|F) ] - KL[ q(F) || p(F)]

    with

        q(f) = N(f | K alpha, [K^-1 + diag(square(lambda))]^-1) .

    """
    K = self.kern.K(self.X)
    f_mean = tf.matmul(K, self.q_alpha) + self.mean_function(self.X)

    # for each of the data-dimensions (columns of Y), find the diagonal of the
    # variance, and also relevant parts of the KL.
    f_var, A_logdet, trAi = [], tf.zeros((1,), tf.float64), tf.zeros((1,), tf.float64)
    for d in range(self.num_latent):
        b = self.q_lambda[:, d]
        B = tf.expand_dims(b, 1)
        A = eye(self.num_data) + K * B * tf.transpose(B)
        L = tf.cholesky(A)
        Li = tf.matrix_triangular_solve(L, eye(self.num_data), lower=True)
        LiBi = Li / b
        # full_sigma: return tf.diag(b**-2) - LiBi.T.dot(LiBi)
        f_var.append(1. / tf.square(b) - tf.reduce_sum(tf.square(LiBi), 0))
        A_logdet += 2 * tf.reduce_sum(tf.log(tf.user_ops.get_diag(L)))
        trAi += tf.reduce_sum(tf.square(Li))

    f_var = tf.transpose(tf.pack(f_var))

    KL = 0.5 * (A_logdet + trAi - self.num_data * self.num_latent +
                tf.reduce_sum(f_mean * self.q_alpha))

    return tf.reduce_sum(self.likelihood.variational_expectations(f_mean, f_var, self.Y)) - KL
def __init__(self, mean, cov, d=None):
    mean = tf.convert_to_tensor(mean)
    cov = tf.convert_to_tensor(cov)

    try:
        d1, = util.extract_shape(mean)
        mean = tf.reshape(mean, (d1, 1))
    except:
        d1, k = util.extract_shape(mean)
        assert(k == 1)

    d2, _ = util.extract_shape(cov)
    assert(d1 == d2)

    if d is None:
        d = d1
    else:
        assert(d == d1)

    super(MVGaussianMeanCov, self).__init__(d=d)

    self._mean = mean
    self._cov = cov
    self._L_cov = tf.cholesky(cov)

    self._entropy = bf.dists.multivariate_gaussian_entropy(L=self._L_cov)

    L_prec_transpose = util.triangular_inv(self._L_cov)
    self._L_prec = tf.transpose(L_prec_transpose)
    self._prec = tf.matmul(self._L_prec, L_prec_transpose)
    self._prec_mean = tf.matmul(self._prec, self._mean)
def gp_predict_whitened(Xnew, X, kern, V):
    """
    Given a whitened representation of the GP at the points X (V), produce the
    mean and variance of the GP at the points Xnew (F*).

    The GP has been centered (whitened) so that

        p(v) = N( 0, I)
        f = L v ,

    and so

        p(f) = N(0, LL^T) = N(0, K).

    We assume K independent GPs, represented by the columns of V. The GP
    conditional is:

        p(F*[:,i] | V[:,i]) = N (K_{*f} L^{-T} V[:,i],
                                 K_{**} - K_{*f} L^{-1} L^{-T} K_{f*})

    Xnew is a data matrix, size N* x D
    X is a data matrix, size N x D
    V is a matrix containing whitened GP values, size N x K

    See also:
        gaussian_gp_predict_whitened -- where there is no uncertainty in V
        gp_predict -- same, without the whitening
    """
    Kd = kern.Kdiag(Xnew)
    Kx = kern.K(X, Xnew)
    K = kern.K(X)
    L = tf.cholesky(K)
    A = tf.user_ops.triangular_solve(L, Kx, 'lower')
    fmean = tf.matmul(tf.transpose(A), V)
    fvar = Kd - tf.reduce_sum(tf.square(A), 0)
    return fmean, tf.expand_dims(fvar, 1) * tf.ones_like(V[0, :])
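A small standalone NumPy sketch of why the whitened mean is equivalent to the usual conditional mean when f = L v; the `rbf` helper, the jitter, and the data are illustrative assumptions.

import numpy as np
from scipy.linalg import solve_triangular
from scipy.spatial.distance import cdist

rng = np.random.RandomState(2)
X = rng.randn(10, 1)
Xnew = rng.randn(4, 1)

def rbf(A, B):
    return np.exp(-0.5 * cdist(A, B, 'sqeuclidean'))

K = rbf(X, X) + 1e-6 * np.eye(len(X))
Kx = rbf(X, Xnew)
L = np.linalg.cholesky(K)

v = rng.randn(len(X), 1)                     # whitened values
f = L @ v                                    # corresponding function values

A = solve_triangular(L, Kx, lower=True)      # A = L^{-1} K_{f*}
fmean_whitened = A.T @ v                     # K_{*f} L^{-T} v
fmean_direct = Kx.T @ np.linalg.solve(K, f)  # K_{*f} K^{-1} f
assert np.allclose(fmean_whitened, fmean_direct)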
def _build_predict(self, Xnew, full_cov=False):
    """
    The posterior variance of F is given by

        q(f) = N(f | K alpha + mean, [K^-1 + diag(lambda**2)]^-1)

    Here we project this to F*, the values of the GP at Xnew which is given
    by

       q(F*) = N ( F* | K_{*F} alpha + mean,
                   K_{**} - K_{*f}[K_{ff} + diag(lambda**-2)]^-1 K_{f*} )

    """
    # compute kernel things
    Kx = self.kern.K(self.X, Xnew)
    K = self.kern.K(self.X)

    # predictive mean
    f_mean = tf.matmul(Kx, self.q_alpha, transpose_a=True) + self.mean_function(Xnew)

    # predictive var
    A = K + tf.matrix_diag(tf.transpose(1. / tf.square(self.q_lambda)))
    L = tf.cholesky(A)
    Kx_tiled = tf.tile(tf.expand_dims(Kx, 0), [self.num_latent, 1, 1])
    LiKx = tf.matrix_triangular_solve(L, Kx_tiled)
    if full_cov:
        f_var = self.kern.K(Xnew) - tf.matmul(LiKx, LiKx, transpose_a=True)
    else:
        f_var = self.kern.Kdiag(Xnew) - tf.reduce_sum(tf.square(LiKx), 1)
    return f_mean, tf.transpose(f_var)
def _expectation(p, kern, feat, none1, none2, nghp=None): """ Compute the expectation: <K_{X, Z}>_p(X) - K_{.,.} :: RBF kernel :return: NxM """ with params_as_tensors_for(kern), params_as_tensors_for(feat): # use only active dimensions Xcov = kern._slice_cov(p.cov) Z, Xmu = kern._slice(feat.Z, p.mu) D = tf.shape(Xmu)[1] if kern.ARD: lengthscales = kern.lengthscales else: lengthscales = tf.zeros((D,), dtype=settings.tf_float) + kern.lengthscales chol_L_plus_Xcov = tf.cholesky(tf.matrix_diag(lengthscales ** 2) + Xcov) # NxDxD all_diffs = tf.transpose(Z) - tf.expand_dims(Xmu, 2) # NxDxM exponent_mahalanobis = tf.matrix_triangular_solve(chol_L_plus_Xcov, all_diffs, lower=True) # NxDxM exponent_mahalanobis = tf.reduce_sum(tf.square(exponent_mahalanobis), 1) # NxM exponent_mahalanobis = tf.exp(-0.5 * exponent_mahalanobis) # NxM sqrt_det_L = tf.reduce_prod(lengthscales) sqrt_det_L_plus_Xcov = tf.exp(tf.reduce_sum(tf.log(tf.matrix_diag_part(chol_L_plus_Xcov)), axis=1)) determinants = sqrt_det_L / sqrt_det_L_plus_Xcov # N return kern.variance * (determinants[:, None] * exponent_mahalanobis)
def build_predict(self, Xnew, full_cov=False):
    """
    The posterior variance of F is given by

        q(f) = N(f | K alpha, [K^-1 + diag(lambda**2)]^-1)

    Here we project this to F*, the values of the GP at Xnew which is given
    by

       q(F*) = N ( F* | K_{*F} alpha,
                   K_{**} - K_{*f}[K_{ff} + diag(lambda**-2)]^-1 K_{f*} )

    """
    # compute kernel things
    Kx = self.kern.K(Xnew, self.X)
    K = self.kern.K(self.X)

    # predictive mean
    f_mean = tf.matmul(Kx, self.q_alpha) + self.mean_function(Xnew)

    # predictive var
    f_var = []
    for d in range(self.num_latent):
        b = self.q_lambda[:, d]
        A = K + tf.diag(1. / tf.square(b))
        L = tf.cholesky(A)
        LiKx = tf.matrix_triangular_solve(L, tf.transpose(Kx), lower=True)
        if full_cov:
            f_var.append(self.kern.K(Xnew) - tf.matmul(tf.transpose(LiKx), LiKx))
        else:
            f_var.append(self.kern.Kdiag(Xnew) - tf.reduce_sum(tf.square(LiKx), 0))
    f_var = tf.pack(f_var)
    return f_mean, tf.transpose(f_var)
def logpdf(self, x, mean=None, cov=1): """Log of the probability density function. Parameters ---------- x : tf.Tensor A 1-D or 2-D tensor. mean : tf.Tensor, optional A 1-D tensor. Defaults to zero mean. cov : tf.Tensor, optional A 1-D or 2-D tensor. Defaults to identity matrix. Returns ------- tf.Tensor A tensor of one dimension less than the input. """ x = tf.cast(x, dtype=tf.float32) x_shape = get_dims(x) if len(x_shape) == 1: d = x_shape[0] else: d = x_shape[1] if mean is None: r = x else: mean = tf.cast(mean, dtype=tf.float32) r = x - mean if cov is 1: L_inv = tf.diag(tf.ones([d])) det_cov = tf.constant(1.0) else: cov = tf.cast(cov, dtype=tf.float32) if len(cov.get_shape()) == 1: # vector L_inv = tf.diag(1.0 / tf.sqrt(cov)) det_cov = tf.reduce_prod(cov) else: # matrix L = tf.cholesky(cov) L_inv = tf.matrix_inverse(L) det_cov = tf.pow(tf.reduce_prod(tf.diag_part(L)), 2) lps = -0.5*d*tf.log(2*np.pi) - 0.5*tf.log(det_cov) if len(x_shape) == 1: # vector r = tf.reshape(r, shape=(d, 1)) inner = tf.matmul(L_inv, r) lps -= 0.5 * tf.matmul(inner, inner, transpose_a=True) return tf.squeeze(lps) else: # matrix # TODO vectorize further out = [] for r_vec in tf.unpack(r): r_vec = tf.reshape(r_vec, shape=(d, 1)) inner = tf.matmul(L_inv, r_vec) out += [tf.squeeze(lps - 0.5 * tf.matmul(inner, inner, transpose_a=True))] return tf.pack(out)
def _grad_and_hessian_loss_fn(x):
    loss = _neg_log_likelihood(x)
    grad_loss = tf.gradients(loss, [x])[0]
    hessian_loss = tf.hessians(loss, [x])[0]
    hessian_chol = tf.cholesky(hessian_loss)
    return grad_loss, hessian_chol, tf.ones_like(grad_loss)
varianceM52_pre = tf.Variable(np.log(np.exp(0.1) - 1), dtype=tf.float32) lengthscaleM52 = tf.nn.softplus(lengthscaleM52_pre) varianceM52 = tf.nn.softplus(varianceM52_pre) period_pre = tf.Variable(np.log(np.exp(7.0 * len_init) - 1), dtype=tf.float32) period_len_pre = tf.Variable(1.0) period_var_pre = tf.Variable(np.log(np.exp(0.5) - 1), dtype=tf.float32) # period = tf.nn.softplus(period_pre) period_length = tf.nn.softplus(period_len_pre) Kuu = kernelfx(xu, xu) fu_loc = tf.zeros((p, m)) fu_scale = tf.cast(tf.cholesky(Kuu + offset * tf.eye(m, dtype=tf.float64), name='fu_scale'), dtype=tf.float32) u = MultivariateNormalTriL(loc=fu_loc, scale_tril=fu_scale, name='pu') x = Normal(loc=tf.zeros((M, Q)), scale=1.0) Kfu = kernelfx(x, xu) Kff = kernelfx(x, x) Kuuinv = tf.matrix_inverse(Kuu + offset * tf.eye(m, dtype=tf.float64)) KfuKuuinv = tf.matmul(Kfu, Kuuinv) KffKuuinvU = [ tf.reshape( tf.matmul(KfuKuuinv, tf.expand_dims(tf.cast(u[i], dtype=tf.float64), axis=1)),
def testNonSquareMatrix(self):
    with self.assertRaises(ValueError):
        tf.cholesky(np.array([[1., 2., 3.], [3., 4., 5.]]))
def independent_interdomain_conditional(Kmn, Kmm, Knn, f, *, full_cov=False, full_output_cov=False, q_sqrt=None, white=False): """ The inducing outputs live in the g-space (R^L). Interdomain conditional calculation. :param Kmn: M x L x N x P :param Kmm: L x M x M :param Knn: N x P or N x N or P x N x N or N x P x N x P :param f: data matrix, M x L :param q_sqrt: L x M x M or M x L :param full_cov: calculate covariance between inputs :param full_output_cov: calculate covariance between outputs :param white: use whitened representation :return: - mean: N x P - variance: N x P, N x P x P, P x N x N, N x P x N x P """ logger.debug("independent_interdomain_conditional") M, L, N, P = [tf.shape(Kmn)[i] for i in range(Kmn.shape.ndims)] Lm = tf.cholesky(Kmm) # L x M x M # Compute the projection matrix A Kmn = tf.reshape(tf.transpose(Kmn, (1, 0, 2, 3)), (L, M, N * P)) A = tf.matrix_triangular_solve(Lm, Kmn, lower=True) # L x M x M * L x M x NP -> L x M x NP Ar = tf.reshape(A, (L, M, N, P)) # compute the covariance due to the conditioning if full_cov and full_output_cov: fvar = Knn - tf.tensordot(Ar, Ar, [[0, 1], [0, 1]]) # N x P x N x P elif full_cov and not full_output_cov: At = tf.reshape(tf.transpose(Ar), (P, N, M * L)) # P x N x ML fvar = Knn - tf.matmul(At, At, transpose_b=True) # P x N x N elif not full_cov and full_output_cov: At = tf.reshape(tf.transpose(Ar, [2, 3, 1, 0]), (N, P, M * L)) # N x P x ML fvar = Knn - tf.matmul(At, At, transpose_b=True) # N x P x P elif not full_cov and not full_output_cov: fvar = Knn - tf.reshape(tf.reduce_sum(tf.square(A), [0, 1]), (N, P)) # Knn: N x P # another backsubstitution in the unwhitened case if not white: A = tf.matrix_triangular_solve(Lm, Ar) # L x M x M * L x M x NP -> L x M x NP Ar = tf.reshape(A, (L, M, N, P)) fmean = tf.tensordot(Ar, f, [[1, 0], [0, 1]]) # N x P if q_sqrt is not None: if q_sqrt.shape.ndims == 3: Lf = tf.matrix_band_part(q_sqrt, -1, 0) # L x M x M LTA = tf.matmul(Lf, A, transpose_a=True) # L x M x M * L x M x NP -> L x M x NP else: # q_sqrt M x L LTA = (A * tf.transpose(q_sqrt)[..., None]) # L x M x NP if full_cov and full_output_cov: LTAr = tf.reshape(LTA, (L * M, N * P)) fvar = fvar + tf.reshape(tf.matmul(LTAr, LTAr, transpose_a=True), (N, P, N, P)) elif full_cov and not full_output_cov: LTAr = tf.transpose(tf.reshape(LTA, (L * M, N, P)), [2, 0, 1]) # P x LM x N fvar = fvar + tf.matmul(LTAr, LTAr, transpose_a=True) # P x N x N elif not full_cov and full_output_cov: LTAr = tf.transpose(tf.reshape(LTA, (L * M, N, P)), [1, 0, 2]) # N x LM x P fvar = fvar + tf.matmul(LTAr, LTAr, transpose_a=True) # N x P x P elif not full_cov and not full_output_cov: fvar = fvar + tf.reshape(tf.reduce_sum(tf.square(LTA), (0, 1)), (N, P)) return fmean, fvar
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import ops
ops.reset_default_graph()

sess = tf.Session()

x_vals = np.linspace(0, 10, 100)
y_vals = x_vals + np.random.normal(0, 1, 100)

x_vals_column = np.transpose(np.matrix(x_vals))
ones_column = np.transpose(np.matrix(np.repeat(1, 100)))
A = np.column_stack((x_vals_column, ones_column))
b = np.transpose(np.matrix(y_vals))

A_tensor = tf.constant(A)
b_tensor = tf.constant(b)
# print(A_tensor)

# Solve the normal equations A^T A x = A^T b via a Cholesky factorisation:
# forward-solve with L, then back-solve with L^T.
tA_A = tf.matmul(tf.transpose(A_tensor), A_tensor)
L = tf.cholesky(tA_A)
tA_b = tf.matmul(tf.transpose(A_tensor), b)
sol1 = tf.matrix_solve(L, tA_b)
sol2 = tf.matrix_solve(tf.transpose(L), sol1)

solution_eval = sess.run(sol2)

slope = solution_eval[0][0]
y_intercept = solution_eval[1][0]
print('slope = ' + str(slope))
print('y_intercept = ' + str(y_intercept))

best_fit = []
for i in x_vals:
    best_fit.append(slope * i + y_intercept)
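As an optional sanity check (assuming the NumPy variables A and b defined above are still in scope), the Cholesky route through the normal equations should agree with NumPy's direct least-squares solver:

lstsq_solution = np.linalg.lstsq(A, b, rcond=None)[0]
print('np.linalg.lstsq slope/intercept:', np.asarray(lstsq_solution).ravel())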
def build_model(self): """Defines the GP model. The loss is computed for partial feedback settings (bandits), so only the observed outcome is backpropagated (see weighted loss). Selects the optimizer and, finally, it also initializes the graph. """ LOGGING.info("Initializing model %s.", self.name) self.global_step = tf.train.get_or_create_global_step() # Define state for the model (inputs, etc.) self.x_train = tf.get_variable( "training_data", initializer=tf.ones( [self.hparams.batch_size, self.n_in], dtype=tf.float64), validate_shape=False, trainable=False) self.y_train = tf.get_variable( "training_labels", initializer=tf.zeros([self.hparams.batch_size, 1], dtype=tf.float64), validate_shape=False, trainable=False) self.weights_train = tf.get_variable( "weights_train", initializer=tf.ones( [self.hparams.batch_size, self.n_out], dtype=tf.float64), validate_shape=False, trainable=False) self.input_op = tf.assign(self.x_train, self.x_in, validate_shape=False) self.input_w_op = tf.assign( self.weights_train, self.weights, validate_shape=False) self.input_std = tf.get_variable( "data_standard_deviation", initializer=tf.ones([1, self.n_out], dtype=tf.float64), dtype=tf.float64, trainable=False) self.input_mean = tf.get_variable( "data_mean", initializer=tf.zeros([1, self.n_out], dtype=tf.float64), dtype=tf.float64, trainable=True) # GP Hyperparameters self.noise = tf.get_variable( "noise", initializer=tf.cast(0.0, dtype=tf.float64)) self.amplitude = tf.get_variable( "amplitude", initializer=tf.cast(1.0, dtype=tf.float64)) self.amplitude_linear = tf.get_variable( "linear_amplitude", initializer=tf.cast(1.0, dtype=tf.float64)) self.length_scales = tf.get_variable( "length_scales", initializer=tf.zeros([1, self.n_in], dtype=tf.float64)) self.length_scales_lin = tf.get_variable( "length_scales_linear", initializer=tf.zeros([1, self.n_in], dtype=tf.float64)) # Latent embeddings of the different outputs for task covariance self.task_vectors = tf.get_variable( "latent_task_vectors", initializer=tf.random_normal( [self.n_out, self.task_latent_dim], dtype=tf.float64)) # Normalize outputs across each dimension # Since we have different numbers of observations across each task, we # normalize by their respective counts. 
index_counts = self.atleast_2d(tf.reduce_sum(self.weights, axis=0), self.n_out) index_counts = tf.where(index_counts > 0, index_counts, tf.ones(tf.shape(index_counts), dtype=tf.float64)) self.mean_op = tf.assign(self.input_mean, tf.reduce_sum(self.y, axis=0) / index_counts) self.var_op = tf.assign( self.input_std, tf.sqrt(1e-4 + tf.reduce_sum(tf.square( self.y - tf.reduce_sum(self.y, axis=0) / index_counts), axis=0) / index_counts)) with tf.control_dependencies([self.var_op]): y_normed = self.atleast_2d( (self.y - self.input_mean) / self.input_std, self.n_out) y_normed = self.atleast_2d(tf.boolean_mask(y_normed, self.weights > 0), 1) self.out_op = tf.assign(self.y_train, y_normed, validate_shape=False) # Observation noise alpha = tf.nn.softplus(self.noise) + 1e-6 # Covariance with tf.control_dependencies([self.input_op, self.input_w_op, self.out_op]): self.self_cov = (self.cov(self.x_in, self.x_in) * self.task_cov(self.weights, self.weights) + tf.eye(tf.shape(self.x_in)[0], dtype=tf.float64) * alpha) self.chol = tf.cholesky(self.self_cov) self.kinv = tf.cholesky_solve(self.chol, tf.eye(tf.shape(self.x_in)[0], dtype=tf.float64)) self.input_inv = tf.Variable( tf.eye(self.hparams.batch_size, dtype=tf.float64), validate_shape=False, trainable=False) self.input_cov_op = tf.assign(self.input_inv, self.kinv, validate_shape=False) # Log determinant by taking the singular values along the diagonal # of self.chol with tf.control_dependencies([self.input_cov_op]): logdet = 2.0 * tf.reduce_sum(tf.log(tf.diag_part(self.chol) + 1e-16)) # Log Marginal likelihood self.marginal_ll = -tf.reduce_sum(-0.5 * tf.matmul( tf.transpose(y_normed), tf.matmul(self.kinv, y_normed)) - 0.5 * logdet - 0.5 * self.n * np.log(2 * np.pi)) zero = tf.cast(0., dtype=tf.float64) one = tf.cast(1., dtype=tf.float64) standard_normal = tfd.Normal(loc=zero, scale=one) # Loss is marginal likelihood and priors self.loss = tf.reduce_sum( self.marginal_ll - (standard_normal.log_prob(self.amplitude) + standard_normal.log_prob(tf.exp(self.noise)) + standard_normal.log_prob(self.amplitude_linear) + tfd.Normal(loc=zero, scale=one * 10.).log_prob( self.task_vectors)) ) # Optimizer for hyperparameters optimizer = tf.train.AdamOptimizer(learning_rate=self.hparams.lr) vars_to_optimize = [ self.amplitude, self.length_scales, self.length_scales_lin, self.amplitude_linear, self.noise, self.input_mean ] if self.learn_embeddings: vars_to_optimize.append(self.task_vectors) grads = optimizer.compute_gradients(self.loss, vars_to_optimize) self.train_op = optimizer.apply_gradients(grads, global_step=self.global_step) # Predictions for test data self.y_mean, self.y_pred = self.posterior_mean_and_sample(self.x) # create tensorboard metrics self.create_summaries() self.summary_writer = tf.summary.FileWriter("{}/graph_{}".format( FLAGS.logdir, self.name), self.sess.graph) self.check = tf.add_check_numerics_ops()
def _inverse(self, y):
    return tf.cholesky(y)
def FactorAnalysisMethod(self): ''' Build Graph and execute in here so don't have to pass variables one by one Bad Coding Style but higher programmer productivity ''' trainData = tf.placeholder(tf.float32, shape=[None, self.D], name="trainingData") batchSize = tf.shape(trainData)[0] # Build Graph print "trainShape", self.trainData.shape print "validShape", self.validData.shape print "testShape", self.testData.shape factorMean = tf.Variable(tf.random_normal([1, self.D])) # Cholesky doesn't accept negative weights #factorWeightsConstraint = tf.Variable(tf.random_normal([self.D, self.K])) factorWeights = tf.Variable(tf.random_normal([self.D, self.K])) factorStdDeviationConstraint = tf.Variable(tf.random_normal([self.D])) #factorWeights = tf.exp(factorWeightsConstraint) factorTraceCoVariance = tf.matrix_diag(tf.exp(factorStdDeviationConstraint)) factorCovariance = tf.add(factorTraceCoVariance, tf.matmul(factorWeights, tf.transpose(factorWeights))) #factorTraceCoVariance = tf.exp(factorStdDeviationConstraint) # factorCovariance = tf.add(tf.diag(factorTraceCoVariance), tf.matmul(factorWeights, tf.transpose(factorWeights))) factorCovarianceInv = tf.matrix_inverse(factorCovariance) logDeterminantCovariance = 2.0 * tf.reduce_sum(tf.log(tf.diag_part(tf.cholesky(factorCovariance)))) # Train Loss # xDeductUTranspose = tf.transpose(xDeductU, (0, 2, 1)) # B * D * 1 # Could have used trace here, doesn't make a difference to your calculation xDeductU = tf.subtract(trainData, factorMean) # B * D total = tf.trace(tf.matmul(tf.matmul(xDeductU, factorCovarianceInv), tf.transpose(xDeductU))) ''' # METHOD 1 # TODO: FIXME THIS IS WRONG! MULTIPLYING BATCH SIZE DOESNT FIX THE PROBLEM logProbability = tf.multiply(tf.cast(batchSize, tf.float32), (-self.D * tf.log(2.0 * np.pi) - logDeterminantCovariance))/2.0 logProbability = logProbability + (total/2.0) loss = tf.negative(logProbability) ''' #total = tf.reduce_sum(tf.multiply(tf.multiply(xDeductU, factorCovarianceInv), xDeductUTranspose)) # Calculate log probability for entire batch, [B] # METHOD 2 #factorCovarianceInv = tf.add(tf.expand_dims(tf.matrix_inverse(factorCovariance), 0), tf.zeros((batchSize, 1, 1))) xExpand= tf.expand_dims(xDeductU, 2) # B * D * 1 total = tf.reduce_sum(tf.multiply(tf.reduce_sum(tf.multiply(xExpand, factorCovarianceInv)), xDeductU), [1]) logProbability = (-self.D * tf.log(2.0 * np.pi) - total - logDeterminantCovariance)/2.0 totalLogProbability = tf.reduce_sum(logProbability) # sum over the entire batch loss = tf.negative(totalLogProbability) # ''' train = self.optimizer.minimize(loss) # Session init = tf.global_variables_initializer() sess = tf.InteractiveSession() sess.run(init) currEpoch = 0 minAssignTrain = 0 minAssignValid = 0 centers = 0 xAxis = [] yTrainErr = [] yValidErr = [] yTestErr = [] numUpdate = 0 step = 0 currTrainDataShuffle = self.trainData feedDictV = {trainData: self.validData} feedDictT = {trainData: self.testData} while currEpoch < self.numEpoch: #np.random.shuffle(self.trainData) # Shuffle Batches step = 0 while step*self.miniBatchSize < self.trainData.shape[0]: feedDicts = {trainData: self.trainData[step*self.miniBatchSize:(step+1)*self.miniBatchSize]} _, errTrain = sess.run([train, loss], feed_dict = feedDicts) # Calculate loss without training for validation errValid = sess.run([loss], feed_dict = feedDictV) errTest = sess.run([loss], feed_dict = feedDictT) ''' kara, hahah, heheh, huhuh = sess.run([loss, haha, hehe, huhu], feed_dict = feedDicts) logStdOut("NPPI: " + str(hahah)) logStdOut("logDetCov: " + str(heheh)) 
logStdOut("totalL: " + str(huhuh)) ''' xAxis.append(numUpdate) yTrainErr.append(errTrain) yValidErr.append(errValid) yTestErr.append(errTest) step += 1 numUpdate += 1 currEpoch += 1 # if currEpoch%10 == 0: logStdOut("e: " + str(currEpoch)) # Calculate everything again without training to ensure randomization is right feedDictsFinal = {trainData: self.trainData} errTrain, paramFactorMean, paramFactorCovariance, paramFactorWeights = sess.run([loss, factorMean, factorCovariance, factorWeights], feed_dict = feedDictsFinal) # Count how many assigned to each class currTrainDataShuffle = self.trainData self.printPlotResults(xAxis, yTrainErr, yValidErr, yTestErr, numUpdate, currTrainDataShuffle, paramFactorMean, paramFactorCovariance, paramFactorWeights)
def build_net(in_dim, n_hidden, data_type, link='exp', total_size=None, var_init=0.01, bw_indiv=1.0, indiv_y_bol=False, kernel='ard', initialse='identity', seed=23, dtype=tf.float32, landmarks=None, avg_label=1.0, **others): #print('avg_label', avg_label) net = Network(in_dim, data_type, n_hidden=n_hidden, link=link, kernel=kernel, var_init=var_init, indiv_bol=indiv_y_bol, dtype=dtype, seed=seed) inputs = net.inputs params = net.params land_size = n_hidden cst = partial(tf.cast, dtype=dtype) # Model parameters initializer = tf.initializers.random_normal( seed=seed, dtype=dtype) # normal initialiser z_initializer = tf.zeros_initializer(dtype=dtype) o_initializer = tf.ones_initializer(dtype=dtype) #initializer = tf.keras.initializers.he_normal(seed=seed) print('bw_indiv', bw_indiv) if initialse == 'identity': triangle_vec = tf.constant(triangular_vec(None, n=land_size), dtype=dtype) elif initialse == 'kernel': if kernel == 'additive': init_kernel = net.kernel(landmarks, landmarks, stddev_ard=bw_indiv[:-2], scale_ard=0.5, stddev_mat=bw_indiv[-2:], scale_mat=0.5, tensorf=False) elif kernel in ['rbf', 'ard']: init_kernel = net.kernel(landmarks, landmarks, stddev=bw_indiv, scale=1.0, tensorf=False) L = np.linalg.cholesky(init_kernel) triangle_vec = tf.constant(triangular_vec(L, n=land_size), dtype=dtype) # Intialise with L = I for safe inversion at start. params['L'] = tf.Variable(triangle_vec, name='L', dtype=dtype) params['mean'] = tf.Variable(avg_label * o_initializer([land_size, 1]), name='mean', dtype=dtype) #tf.Variable(tf.tile(tf.constant([7.0], dtype=dtype), land_size)) #tf.Variable(z_initializer([land_size, 1]), name = 'mean', dtype=dtype) params['prior_mean'] = tf.Variable(z_initializer([1]), name='prior_mean', dtype=dtype) #tf.Variable(tf.constant([7.0], dtype=dtype), name = 'prior_mean', dtype=dtype) #tf.Variable(initializer([1]), name = 'prior_mean', dtype=dtype) if kernel in ['ard', 'additive']: params['log_bw_sq'] = tf.Variable(tf.log( tf.square(tf.constant(bw_indiv, dtype=dtype))), name='log_bw_sq') #params['log_bw_sq'] = tf.log(tf.square(tf.constant(bw_indiv, dtype=dtype)), name = 'log_bw_sq') elif kernel == 'rbf': print('Vary Bandwidth RBF') params['log_bw_sq'] = tf.Variable(tf.log( tf.square(tf.constant(bw_indiv, dtype=dtype))), name='log_bw_sq') #params['log_bw_sq'] = tf.log(tf.square(tf.constant(bw_indiv, dtype=dtype))) n_bags = cst(tf.shape(inputs['sizes'])[0]) n_indiv = cst(tf.shape(inputs['X'])[0]) sigma_sq = tf.exp(params['log_sig_sq']) scale = tf.exp(params['log_scale']) stddev = tf.sqrt(tf.exp(params['log_bw_sq'])) #stddev = tf.Print(stddev, [stddev], message='bw', summarize=18) landmarks = inputs['landmarks'] inputs_int = tf.concat([ tf.constant([0], tf.int32), tf.cumsum(tf.cast(inputs['sizes'], tf.int32)) ], 0) #inputs_int = tf.Print(inputs_int, [inputs_int]) Sigma_term0 = tf.map_fn(fn=lambda k: tf.reduce_sum(scale * net.kernel( inputs['X'][inputs_int[k]:inputs_int[k + 1], :], inputs['X'][inputs_int[k]:inputs_int[k + 1], :], stddev=stddev, scale=1.0)), elems=tf.range(tf.cast(n_bags, dtype=tf.int32)), dtype=dtype) if kernel in ['ard', 'rbf']: k_ww = scale * net.kernel( landmarks, landmarks, stddev=stddev, scale=1.0) k_wz = scale * net.kernel( landmarks, inputs['X'], stddev=stddev, scale=1.0) #K_wz k_zz = scale * net.kernel( inputs['X'], inputs['X'], stddev=stddev, scale=1.0) #Change k_zz #k_ww = tf.Print(k_ww, [k_ww], message='k_ww', summarize=100) #k_wz = tf.Print(k_wz, [k_wz], message='k_wz', summarize=100) #k_zz = tf.Print(k_zz, [k_zz], message='k_zz', 
summarize=100) #k_wz = tf.Print(k_wz, [k_wz]) term_0_diag = scale * tf.ones([tf.cast(n_indiv, dtype=tf.int32)], dtype=dtype) #k_zz diagonal elif kernel == 'additive': scale_mat = tf.exp(params['log_scale_m']) k_ww = net.kernel(landmarks, landmarks, stddev_ard=stddev[:-2], scale_ard=scale, stddev_mat=stddev[-2:], scale_mat=scale_mat) k_wz = net.kernel(landmarks, inputs['X'], stddev_ard=stddev[:-2], scale_ard=scale, stddev_mat=stddev[-2:], scale_mat=scale_mat) #term_0_diag = (scale + scale_mat) * tf.ones([tf.cast(n_indiv, dtype=tf.int32)], dtype=dtype) #k_zz diagonal # SLOW: Compute full kernel matrix and then pool pool then take diag. #Sigma_term0 = tf.diag_part(net.bag_pool(tf.transpose(net.bag_pool(k_zz)))) #Sigma_term0 = tf.Print(Sigma_term0, [Sigma_term0, net.bag_pool(k_zz), net.bag_pool(tf.transpose(net.bag_pool(k_zz)))], message='Sigma0', summarize=1000) #Sigma_term0 = tf.Print(Sigma_term0, [Sigma_term0, batch_items], summarize=100) chol_k = tf.cholesky(k_ww) k_ww_inv = tf.matrix_inverse(k_ww) # K_ww^-1 triangular = fill_triangular(params['L']) #\Sigma_u=LL^T Sigma_u = tf.matmul(triangular, tf.transpose(triangular)) # Sigma_u = L L^T pool_kzw = net.bag_pool(tf.transpose(k_wz)) #pool_kzw = tf.Print(pool_kzw, [tf.transpose(k_wz), pool_kzw], message='pool_kzw', summarize=100) pool_k_zw_k_ww_inv = tf.matmul(pool_kzw, k_ww_inv) kw_zw_k_ww_inv = tf.matmul(tf.transpose(k_wz), k_ww_inv) #pool_k_zw_k_ww_inv = tf.Print(pool_k_zw_k_ww_inv, [tf.matmul(tf.matmul(tf.transpose(k_wz), k_ww_inv), k_wz)], summarize=100, message='sum') #Sigma_term1_check = tf.diag_part(tf.matmul(pool_k_zw_k_ww_inv, tf.transpose(pool_kzw))) # Check this: transpose latter and elementwise multiply, sum across axis=1 Sigma_term1 = tf.reduce_sum(tf.multiply(pool_k_zw_k_ww_inv, pool_kzw), axis=1) #Sigma_term1 = tf.Print(Sigma_term1, [Sigma_term1, Sigma_term1_check], message='Sigma_term1') pool_k_zw_k_ww_inv_Sig_u = tf.matmul(pool_k_zw_k_ww_inv, Sigma_u) #pool_k_zw_k_ww_inv = tf.Print(pool_k_zw_k_ww_inv, [tf.matmul(tf.matmul(kw_zw_k_ww_inv, Sigma_u), tf.transpose(kw_zw_k_ww_inv))], summarize=100, message='sum_2') #Sigma_term2_check = tf.diag_part(tf.matmul(pool_k_zw_k_ww_inv_Sig_u, tf.transpose(pool_k_zw_k_ww_inv))) # Check this: transpose latter and elementwise multiply, sum across axis=1 Sigma_term2 = tf.reduce_sum(tf.multiply(pool_k_zw_k_ww_inv_Sig_u, pool_k_zw_k_ww_inv), axis=1) #Sigma_term2 = tf.Print(Sigma_term2, [Sigma_term2, Sigma_term2_check], message='Sigma_term2') Sigma_sum_term = Sigma_term0 - Sigma_term1 + Sigma_term2 #Sigma_sum_term = tf.Print(Sigma_sum_term, [Sigma_term0, Sigma_term1, Sigma_term2]) k_inv_k_wz = tf.matmul(k_ww_inv, k_wz) # K_ww^-1 K_wz mean_diff = params['mean'] - params['prior_mean'] #mean_diff = tf.Print(mean_diff, [tf.shape(mean_diff)], message='mean_diff') net.mu = mu = params['prior_mean'] + tf.squeeze( tf.matmul(tf.transpose(k_inv_k_wz), mean_diff)) # mu_prior + K_zw K_ww^-1 (mu_u - mu_prior) mu_pool = tf.squeeze(net.bag_pool(tf.expand_dims(mu, 1))) # 1^T mu [bags] #mu_pool = tf.Print(mu_pool, [mu_pool, mu], message='mu_pool') term_1_0 = tf.square(inputs['y']) #sum_j y_j^2 term_1_1 = 2.0 * tf.multiply(inputs['y'], mu_pool) # 2 * sum_j(y_j *1^T mu) term_1_2 = Sigma_sum_term # 1^T S 1 term_1_3 = tf.square( mu_pool) # \sum_j 1^T mu_j mu_j^t 1 = \sum_j (mu_j^t 1)^2 # Term 1 #sigma_sq = tf.Print(sigma_sq, [sigma_sq], 'sigma^2') bag_sigma_sq = sigma_sq * inputs['sizes'] #bag_sigma_sq = tf.Print(bag_sigma_sq, [bag_sigma_sq, term_1_0, term_1_1, term_1_2, term_1_3], message='bag_sigma_sq') 
term_1_rescale = tf.divide(term_1_0 - term_1_1 + term_1_2 + term_1_3, bag_sigma_sq) term_1 = tf.reduce_sum(term_1_rescale) # Term 2 \sum_j log(2 pi sigma^2_j) term_2 = tf.reduce_sum(tf.log(2.0 * pi * bag_sigma_sq)) # Term 3 tfd = tf.contrib.distributions mvn_q = tfd.MultivariateNormalTriL(loc=tf.squeeze(params['mean']), scale_tril=triangular) mvn_u = tfd.MultivariateNormalTriL(loc=tf.tile(params['prior_mean'], [land_size]), scale_tril=chol_k) term_3 = tf.distributions.kl_divergence(mvn_q, mvn_u) #term_3 = tf.Print(term_3, [0.5* term_1/n_bags, 0.5* term_2/n_bags, term_3/total_size], message='all_terms') term_1_diag = tf.reduce_sum(tf.multiply(k_wz, k_inv_k_wz), axis=0) #diag K_zw K_ww^-1 k_wz #term_1_diag_check = tf.diag_part(tf.matmul(tf.transpose(k_wz), k_inv_k_wz)) k_zw_k_inv_S = tf.matmul(tf.transpose(k_inv_k_wz), Sigma_u) # k_zw K_ww^-1 Sigma_u #term_2_diag_check = tf.diag_part(tf.matmul(k_zw_k_inv_S, k_inv_k_wz)) term_2_diag = tf.reduce_sum(tf.multiply(tf.transpose(k_zw_k_inv_S), k_inv_k_wz), axis=0) # diagonal as [n_indiv] #Sigma_diag_check = Sigma_diag = term_0_diag - term_1_diag + term_2_diag net.Sigma_diag = Sigma_diag = term_0_diag - term_1_diag + term_2_diag #term_1 = tf.Print(term_1, [term_0_diag,term_1_diag, term_2_diag, tf.sqrt(Sigma_diag), tf.sqrt(Sigma_diag_check)], summarize=3, message='Sigma_diag') net.loss = loss = -1.0 / n_bags * (-0.5 * term_1 - 0.5 * term_2) + term_3 / total_size #if MAP: #net.indiv = indiv = tf.exp(mu - Sigma_diag) #else: net.indiv = indiv = mu #tf.squeeze(mu + 0.5 * Sigma_diag)) #indiv = tf.Print(indiv, [indiv], message='mu', summarize=5) #net.indiv = indiv = tf.exp(mu - Sigma_diag) net.indiv_se = net.square_err(inputs['indiv_true_y'], indiv) net.indiv_nll = net.nll_term(inputs['indiv_y'], indiv) #indiv = tf.Print(indiv, [indiv], summarize =200, message='indiv') #indiv_mean = tf.exp(mu + 0.5 * Sigma_diag) net.indiv_y = indiv_y_pop = tf.multiply(inputs['indiv_pop'], indiv) indiv_y_pop = tf.expand_dims(indiv_y_pop, 1) net.bag_y = bag_y = tf.squeeze(net.bag_pool(indiv_y_pop)) bag_y = tf.Print(bag_y, [bag_y, inputs['y']], message='bag') net.bag_se = net.square_err(inputs['y'], bag_y, bags=True) net.bag_nll = net.nll_term(inputs['y'], bag_y, bags=True) #indiv_y_mean = tf.multiply(inputs['indiv_pop'], tf.exp(mu + 0.5 * Sigma_diag)) #indiv_y_var = tf.multiply(tf.exp(Sigma_diag) - 1.0, tf.exp( 2.0* mu + Sigma_diag) ) #indiv_y = tf.Print(indiv_y, [indiv_y_mean, inputs['indiv_y'], indiv_y_var], summarize=2) #net.bag_se = tf.reduce_sum(tf.square(bag_y - inputs['y'])) #if indiv_y_bol: # net.indiv_se = tf.reduce_sum(tf.square(indiv_y - inputs['indiv_y'])) # Can add net.print_out return net
def uncertain_conditional(Xnew_mu, Xnew_var, feat, kern, q_mu, q_sqrt, *, full_cov_output=False, full_cov=False, white=False): """ Calculates the conditional for uncertain inputs Xnew, p(Xnew) = N(Xnew_mu, Xnew_var). See ``conditional`` documentation for further reference. :param Xnew_mu: mean of the inputs, size N x Din :param Xnew_var: covariance matrix of the inputs, size N x Din x Din :param feat: gpflow.InducingFeature object, only InducingPoints is supported :param kern: gpflow kernel or ekernel object. :param q_mu: mean inducing points, size M x Dout :param q_sqrt: cholesky of the covariance matrix of the inducing points, size M x M x Dout :param full_cov_output: boolean wheter to compute covariance between output dimension. Influences the shape of return value ``fvar``. Default is False :param white: boolean whether to use whitened representation. Default is False. :return fmean, fvar: mean and covariance of the conditional, size ``fmean`` is N x Dout, size ``fvar`` depends on ``full_cov_output``: if True ``f_var`` is N x Dout x Dout, if False then ``f_var`` is N x Dout """ # TODO: Tensorflow 1.3 doesn't support broadcasting in``tf.matmul`` and # ``tf.matrix_triangular_solve``. This is reported in issue 216. # As a temporary workaround, we are using ``tf.einsum`` for the matrix # multiplications and tiling in the triangular solves. # The code that should be used once the bug is resolved is added in comments. if not isinstance(feat, InducingPoints): raise NotImplementedError if full_cov: # TODO: ``full_cov`` True would return a ``fvar`` of shape N x N x D x D, # encoding the covariance between input datapoints as well. # This is not implemented as this feature is only used for plotting purposes. raise NotImplementedError num_data = tf.shape(Xnew_mu)[0] # number of new inputs (N) num_func = tf.shape(q_mu)[1] # output dimension (D) q_sqrt_r = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1, 0) # D x M x M eKuf = tf.transpose(feat.eKfu(kern, Xnew_mu, Xnew_var)) # M x N Kuu = feat.Kuu(kern, jitter=settings.numerics.jitter_level) # M x M Luu = tf.cholesky(Kuu) # M x M if not white: q_mu = tf.matrix_triangular_solve(Luu, q_mu, lower=True) Luu_tiled = tf.tile(Luu[None, :, :], [num_func, 1, 1]) # remove line once issue 216 is fixed q_sqrt_r = tf.matrix_triangular_solve(Luu_tiled, q_sqrt_r, lower=True) Li_eKuf = tf.matrix_triangular_solve(Luu, eKuf, lower=True) # M x N fmean = tf.matmul(Li_eKuf, q_mu, transpose_a=True) eKff = kern.eKdiag(Xnew_mu, Xnew_var) # N eKuffu = feat.eKufKfu(kern, Xnew_mu, Xnew_var) # N x M x M Luu_tiled = tf.tile(Luu[None, :, :], [num_data, 1, 1]) # remove this line, once issue 216 is fixed Li_eKuffu_Lit = tf.matrix_triangular_solve(Luu_tiled, tf.matrix_transpose(eKuffu), lower=True) Li_eKuffu_Lit = tf.matrix_triangular_solve(Luu_tiled, tf.matrix_transpose(Li_eKuffu_Lit), lower=True) # N x M x M cov = tf.matmul(q_sqrt_r, q_sqrt_r, transpose_b=True) # D x M x M if full_cov_output: fvar = ( tf.matrix_diag(tf.tile((eKff - tf.trace(Li_eKuffu_Lit))[:, None], [1, num_func])) + tf.matrix_diag(tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov)) + # tf.matrix_diag(tf.trace(tf.matmul(Li_eKuffu_Lit, cov))) + tf.einsum("ig,nij,jh->ngh", q_mu, Li_eKuffu_Lit, q_mu) - # tf.matmul(q_mu, tf.matmul(Li_eKuffu_Lit, q_mu), transpose_a=True) - tf.matmul(fmean[:, :, None], fmean[:, :, None], transpose_b=True) ) else: fvar = ( (eKff - tf.trace(Li_eKuffu_Lit))[:, None] + tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov) + tf.einsum("ig,nij,jg->ng", q_mu, Li_eKuffu_Lit, q_mu) - fmean ** 2 ) 
return fmean, fvar
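# For reference (a sketch of what the branches above return in the unwhitened case):
# with q(u) = N(m, S) and kernel expectations under p(x) = N(Xnew_mu, Xnew_var),
#   psi_0 = E[k(x, x)],  Psi_1 = E[k(x, Z)],  Psi_2 = E[k(Z, x) k(x, Z)],
# the predictive moments are
#   mu_*      = Psi_1 K_{uu}^{-1} m
#   sigma_*^2 = psi_0 - tr(K_{uu}^{-1} Psi_2)
#               + tr(K_{uu}^{-1} Psi_2 K_{uu}^{-1} S)
#               + m^T K_{uu}^{-1} Psi_2 K_{uu}^{-1} m - mu_*^2
# which is what the eKuf / eKff / eKuffu terms implement row by row.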
def _build_graph(self, raw_weights, raw_means, raw_covars, raw_link_covars, raw_priorlink_covars, raw_inducing_inputs, train_inputs, train_outputs, num_train, test_inputs, test_outputs): # normalise weights weights = tf.exp(raw_weights) / tf.reduce_sum(tf.exp(raw_weights)) if self.diag_post: covars = tf.exp(raw_covars) link_covars = None else: covars_list = [None] * self.num_components for i in range(self.num_components): mat = util.vec_to_tri( raw_covars[i, :, :]) #creates mats by row ie r so RxMxM diag_mat = tf.matrix_diag(tf.matrix_diag_part(mat)) exp_diag_mat = tf.matrix_diag(tf.exp(tf.matrix_diag_part(mat))) if self.sparse_post: matcol = tf.expand_dims( (mat - diag_mat)[:, :, 1], 2) # extract first col with first element==0 padding = [[0, 0], [0, 0], [0, self.num_inducing - 1]] covars_list[i] = tf.pad(matcol, padding) + exp_diag_mat else: covars_list[i] = mat - diag_mat + exp_diag_mat covars = tf.stack(covars_list, 0) # create nested list of posterior link parameters #TODO: standardize dummies for prior and post link components (floats vs tensors) #TODO: remove free value for block size==1, replace with fixed==1.0 and check dependents link_covars = [None] * self.num_components for i in range(self.num_components): mat = util.vec_to_tri( raw_link_covars[i, :, :] ) #creates mats by row ie r so R x max(Qr) x max(Qr) diag_mat = tf.matrix_diag(tf.matrix_diag_part(mat)) exp_diag_mat = tf.matrix_diag(tf.exp(tf.matrix_diag_part(mat))) if self.sparse_post: matcol = tf.expand_dims( (mat - diag_mat)[:, :, 1], 2) # extract first col with first element==0 padding = [[0, 0], [0, 0], [0, tf.shape(mat)[2] - 1]] mats_in = tf.pad( matcol, padding) + exp_diag_mat # R x max(Qr) x max(Qr) else: mats_in = mat - diag_mat + exp_diag_mat # R x max(Qr) x max(Qr) # trim ragged block sizes and retain as list mats_in = tf.unstack( mats_in, axis=0) # split into R mats shaped max(Qr) x max(Qr) for r in range(self.num_block): if len(self.block_struct[r] ) == 1: # keep dims where trimmed to scalar mats_in[r] = tf.expand_dims(tf.expand_dims( mats_in[r][0, 0], axis=0), axis=1) else: mats_in[r] = mats_in[r][:len(self.block_struct[r]), : len(self.block_struct[r])] link_covars[i] = mats_in # Both inducing inputs and the posterior means can vary freely so don't change them. means = raw_means inducing_inputs = raw_inducing_inputs # Build the matrices of covariances between inducing inputs. kernel_mat = [ self.kernels[r].kernel(inducing_inputs[r, :, :]) for r in range(self.num_block) ] kernel_chol = [tf.cholesky(k) for k in kernel_mat] # generate freely parameterized K(j,j') for each block of latent functions # where dim (block) = 1 (i.e. independent latent function), mat/chol set == 1 mat = util.vec_to_tri( raw_priorlink_covars ) #creates lower diag mats by row ie r so R x max(Qr) x max(Qr) diag_mat = tf.matrix_diag(tf.matrix_diag_part(mat)) exp_diag_mat = tf.matrix_diag(tf.exp(tf.matrix_diag_part(mat))) if self.sparse_prior: matcol = tf.expand_dims( (mat - diag_mat)[:, :, 1], 2) # extract first col with first element==0 padding = [[0, 0], [0, 0], [0, tf.shape(mat)[2] - 1]] mats_in = tf.pad(matcol, padding) + exp_diag_mat # R x max(Qr) x max(Qr) else: mats_in = mat - diag_mat + exp_diag_mat # R x max(Qr) x max(Qr) kernlink_chol = util.init_list(1.0, [self.num_block]) for r in range(self.num_block): if len(self.block_struct[r]) == 1: # leave as dummy value == 1.0 continue else: kernlink_chol[r] = mats_in[ r, :len(self.block_struct[r]), :len(self.block_struct[r])] # Now build the objective function. 
entropy = self._build_entropy(weights, means, covars, link_covars) cross_ent = self._build_cross_ent(weights, means, covars, link_covars, kernel_chol, kernlink_chol) ell = self._build_ell(weights, means, covars, link_covars, inducing_inputs, kernel_chol, kernlink_chol, train_inputs, train_outputs) batch_size = tf.to_float(tf.shape(train_inputs)[0]) nelbo = -((batch_size / num_train) * (entropy + cross_ent) + ell) # Finally, build the prediction function. predictions = self._build_predict(weights, means, covars, link_covars, inducing_inputs, kernel_chol, kernlink_chol, test_inputs) # Build the nlpd function. general_nlpd = self._build_nlpd(weights, means, covars, link_covars, inducing_inputs, kernel_chol, kernlink_chol, test_inputs, test_outputs) return nelbo, predictions, general_nlpd
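# As a reading aid: with minibatch size B = batch_size and training-set size N = num_train,
# the objective assembled above is the negative evidence lower bound as implemented here,
#   NELBO = -[ (B / N) * (entropy + cross_ent) + ell ],
# which is then minimised; the prediction and NLPD graphs reuse the same variational parameters.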
def testWrongDimensions(self):
    tensor3 = tf.constant([1., 2.])
    with self.assertRaises(ValueError):
        tf.cholesky(tensor3)
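# A minimal sketch of the behaviour this test relies on (TF 1.x graph mode assumed):
# tf.cholesky expects a square, symmetric positive-definite matrix of rank >= 2 and
# returns its lower-triangular factor, whereas a rank-1 tensor raises ValueError.
import numpy as np
import tensorflow as tf

spd = tf.constant([[4., 2.], [2., 3.]])  # symmetric positive definite, 2 x 2
chol = tf.cholesky(spd)                  # lower-triangular L with L L^T = spd
with tf.Session() as sess:
    L = sess.run(chol)
np.testing.assert_allclose(L @ L.T, [[4., 2.], [2., 3.]], rtol=1e-5)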
def variational_sgpr(X, Z, ls=1., kern_func=rbf, ridge_factor=1e-3, mfvi_mixture=False, n_mixture=1): """Defines the mean-field variational family for GPR. Args: X: (np.ndarray of float32) input training features, with dimension (Nx, D). Z: (np.ndarray of float32) inducing points, with dimension (Nz, D). ls: (float32) length scale parameter. kern_func: (function) kernel function. ridge_factor: (float32) small ridge factor to stabilize Cholesky decomposition mfvi_mixture: (float32) Whether to output variational family with a mixture of MFVI. n_mixture: (int) Number of MFVI mixture component to add. Returns: q_f, q_sig: (ed.RandomVariable) variational family. q_f_mean, q_f_sdev: (tf.Variable) variational parameters for q_f mixture_par_list: (list of tf.Variable) variational parameters for MFVI mixture ('mixture_logits', 'mixture_logits_mfvi_mix', 'mean_mfvi', 'sdev_mfvi') if mfvi_mixture=True, else []. """ X = tf.convert_to_tensor(X) Z = tf.convert_to_tensor(Z) Nx, Nz = X.shape.as_list()[0], Z.shape.as_list()[0] # 1. Prepare constants # compute matrix constants Kxx = kern_func(X, ls=ls) Kxz = kern_func(X, Z, ls=ls) Kzz = kern_func(Z, ls=ls, ridge_factor=ridge_factor) # compute null covariance matrix using Cholesky decomposition Kzz_chol_inv = tf.matrix_inverse(tf.cholesky(Kzz)) Kzz_inv = tf.matmul(Kzz_chol_inv, Kzz_chol_inv, transpose_a=True) Kxz_Kzz_chol_inv = tf.matmul(Kxz, Kzz_chol_inv, transpose_b=True) Kxz_Kzz_inv = tf.matmul(Kxz, Kzz_inv) Sigma_pre = Kxx - tf.matmul( Kxz_Kzz_chol_inv, Kxz_Kzz_chol_inv, transpose_b=True) # 2. Define variational parameters # define mean and variance for sigma q_sig_mean = tf.get_variable(shape=[], name='q_sig_mean') q_sig_sdev = tf.exp(tf.get_variable(shape=[], name='q_sig_sdev')) # define free parameters (i.e. mean and full covariance of f_latent) m = tf.get_variable(shape=[Nz], name='qf_m') s = tf.get_variable( shape=[Nz * (Nz + 1) / 2], # initializer=tf.zeros_initializer(), name='qf_s') L = fill_triangular(s, name='qf_chol') S = tf.matmul(L, L, transpose_b=True) # compute sparse gp variational parameter (i.e. mean and covariance of P(f_obs | f_latent)) qf_mean = tf.tensordot(Kxz_Kzz_inv, m, [[1], [0]], name='qf_mean') qf_cov = ( Sigma_pre + tf.matmul(Kxz_Kzz_inv, tf.matmul(S, Kxz_Kzz_inv, transpose_b=True)) + ridge_factor * tf.eye(Nx, dtype=tf.float32)) # define variational family mixture_par_list = [] if mfvi_mixture: gp_dist = tfd.MultivariateNormalFullCovariance( loc=qf_mean, covariance_matrix=qf_cov) q_f, mixture_par_list = inference_util.make_mfvi_sgp_mixture_family( n_mixture=n_mixture, N=Nx, gp_dist=gp_dist, name='q_f') else: q_f = ed.MultivariateNormalFullCovariance(loc=qf_mean, covariance_matrix=qf_cov, name='q_f') q_sig = ed.Normal(loc=q_sig_mean, scale=q_sig_sdev, name='q_sig') return q_f, q_sig, qf_mean, qf_cov, mixture_par_list
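# For reference, with S = L L^T the variational predictive assembled above is the
# standard sparse-GP form (a sketch of the algebra, before the ridge term is added):
#   q(f) = N( K_xz K_zz^{-1} m,
#             K_xx - K_xz K_zz^{-1} K_zx + K_xz K_zz^{-1} S K_zz^{-1} K_zx )
# where m and S are the free mean and covariance of the inducing outputs.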
def build_KL(self):
    """
    The covariance of q(u) has a kronecker structure, so appropriate
    reductions apply for the trace and logdet terms.
    """
    # Mahalanobis term, m^T K^{-1} m
    Kuu = [make_Kuu(kern, a, b, self.ms)
           for kern, a, b, in zip(self.kerns, self.a, self.b)]
    Kim = kron_vec_apply(Kuu, self.q_mu, 'solve')
    KL = 0.5 * tf.reduce_sum(self.q_mu * Kim)

    # Constant term
    KL += -0.5 * tf.cast(tf.size(self.q_mu), float_type)

    # Log det term
    Ls = [tf.matrix_band_part(q_sqrt_d, -1, 0) for q_sqrt_d in self.q_sqrt_kron]
    N_others = [float(np.prod(self.Ms)) / M for M in self.Ms]
    Q_logdets = [tf.reduce_sum(tf.log(tf.square(tf.diag_part(L)))) for L in Ls]
    KL += -0.5 * reduce(tf.add, [N * logdet for N, logdet in zip(N_others, Q_logdets)])

    # trace term tr(K^{-1} Sigma_q)
    Ss = [tf.matmul(L, tf.transpose(L)) for L in Ls]
    traces = [K.trace_KiX(S) for K, S, in zip(Kuu, Ss)]
    KL += 0.5 * reduce(tf.multiply, traces)  # kron-trace is the product of traces

    # log det term Kuu
    Kuu_logdets = [K.logdet() for K in Kuu]
    KL += 0.5 * reduce(tf.add, [N * logdet for N, logdet in zip(N_others, Kuu_logdets)])

    if self.use_two_krons:
        # extra logdet terms:
        Ls_2 = [tf.matrix_band_part(q_sqrt_d, -1, 0) for q_sqrt_d in self.q_sqrt_kron_2]
        LiL = [tf.matrix_triangular_solve(L1, L2) for L1, L2 in zip(Ls, Ls_2)]
        eigvals = [tf.self_adjoint_eig(tf.matmul(tf.transpose(mat), mat))[0]
                   for mat in LiL]  # discard eigenvectors
        eigvals_kronned = kron([tf.reshape(e, [1, -1]) for e in eigvals])
        KL += -0.5 * tf.reduce_sum(tf.log(1 + eigvals_kronned))

        # extra trace terms
        Ss = [tf.matmul(L, tf.transpose(L)) for L in Ls_2]
        traces = [K.trace_KiX(S) for K, S, in zip(Kuu, Ss)]
        KL += 0.5 * reduce(tf.multiply, traces)  # kron-trace is the product of traces

    elif self.use_extra_ranks:
        # extra logdet terms
        KiW = kron_mat_apply(Kuu, self.q_sqrt_W, 'solve', self.use_extra_ranks)
        WTKiW = tf.matmul(tf.transpose(self.q_sqrt_W), KiW)
        L_extra = tf.cholesky(np.eye(self.use_extra_ranks) + WTKiW)
        KL += -0.5 * tf.reduce_sum(tf.log(tf.square(tf.diag_part(L_extra))))

        # extra trace terms
        KL += 0.5 * tf.reduce_sum(tf.diag_part(WTKiW))

    return KL
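# The terms accumulated above follow the Gaussian KL between q(u) = N(m, Sigma_q) and
# the prior p(u) = N(0, K), with K a Kronecker product over the per-dimension Kuu blocks:
#   KL = 0.5 * [ m^T K^{-1} m - M + tr(K^{-1} Sigma_q) + log|K| - log|Sigma_q| ]
# using tr(A_1 kron ... kron A_D) = prod_d tr(A_d) and
# log|A_1 kron ... kron A_D| = sum_d (M / M_d) log|A_d|, which is what N_others encodes.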
def uncertain_conditional(Xnew_mu, Xnew_var, feat, kern, q_mu, q_sqrt, *, mean_function=None, full_cov_output=False, full_cov=False, white=False): """ Calculates the conditional for uncertain inputs Xnew, p(Xnew) = N(Xnew_mu, Xnew_var). See ``conditional`` documentation for further reference. :param Xnew_mu: mean of the inputs, size N x Din :param Xnew_var: covariance matrix of the inputs, size N x Din x Din :param feat: gpflow.InducingFeature object, only InducingPoints is supported :param kern: gpflow kernel or ekernel object. :param q_mu: mean inducing points, size M x Dout :param q_sqrt: cholesky of the covariance matrix of the inducing points, size Dout x M x M :param full_cov_output: boolean wheter to compute covariance between output dimension. Influences the shape of return value ``fvar``. Default is False :param white: boolean whether to use whitened representation. Default is False. :return fmean, fvar: mean and covariance of the conditional, size ``fmean`` is N x Dout, size ``fvar`` depends on ``full_cov_output``: if True ``f_var`` is N x Dout x Dout, if False then ``f_var`` is N x Dout """ # TODO: Tensorflow 1.4 doesn't support broadcasting in``tf.matmul`` and # ``tf.matrix_triangular_solve``. This is reported in issue 216. # As a temporary workaround, we are using ``tf.einsum`` for the matrix # multiplications and tiling in the triangular solves. # The code that should be used once the bug is resolved is added in comments. if not isinstance(feat, InducingPoints): raise NotImplementedError if full_cov: # TODO: ``full_cov`` True would return a ``fvar`` of shape N x N x D x D, # encoding the covariance between input datapoints as well. # This is not implemented as this feature is only used for plotting purposes. raise NotImplementedError pXnew = Gaussian(Xnew_mu, Xnew_var) num_data = tf.shape(Xnew_mu)[0] # number of new inputs (N) num_ind = tf.shape(q_mu)[0] # number of inducing points (M) num_func = tf.shape(q_mu)[1] # output dimension (D) q_sqrt_r = tf.matrix_band_part(q_sqrt, -1, 0) # D x M x M eKuf = tf.transpose(expectation(pXnew, (kern, feat))) # M x N (psi1) Kuu = feat.Kuu(kern, jitter=settings.numerics.jitter_level) # M x M Luu = tf.cholesky(Kuu) # M x M if not white: q_mu = tf.matrix_triangular_solve(Luu, q_mu, lower=True) Luu_tiled = tf.tile( Luu[None, :, :], [num_func, 1, 1]) # remove line once issue 216 is fixed q_sqrt_r = tf.matrix_triangular_solve(Luu_tiled, q_sqrt_r, lower=True) Li_eKuf = tf.matrix_triangular_solve(Luu, eKuf, lower=True) # M x N fmean = tf.matmul(Li_eKuf, q_mu, transpose_a=True) eKff = expectation(pXnew, kern) # N (psi0) eKuffu = expectation(pXnew, (kern, feat), (kern, feat)) # N x M x M (psi2) Luu_tiled = tf.tile( Luu[None, :, :], [num_data, 1, 1]) # remove this line, once issue 216 is fixed Li_eKuffu_Lit = tf.matrix_triangular_solve(Luu_tiled, tf.matrix_transpose(eKuffu), lower=True) Li_eKuffu_Lit = tf.matrix_triangular_solve( Luu_tiled, tf.matrix_transpose(Li_eKuffu_Lit), lower=True) # N x M x M cov = tf.matmul(q_sqrt_r, q_sqrt_r, transpose_b=True) # D x M x M if mean_function is None or isinstance(mean_function, mean_functions.Zero): e_related_to_mean = tf.zeros((num_data, num_func, num_func), dtype=settings.float_type) else: # Update mean: \mu(x) + m(x) fmean = fmean + expectation(pXnew, mean_function) # Calculate: m(x) m(x)^T + m(x) \mu(x)^T + \mu(x) m(x)^T, # where m(x) is the mean_function and \mu(x) is fmean e_mean_mean = expectation(pXnew, mean_function, mean_function) # N x D x D Lit_q_mu = tf.matrix_triangular_solve(Luu, q_mu, 
adjoint=True) e_mean_Kuf = expectation(pXnew, mean_function, (kern, feat)) # N x D x M # einsum isn't able to infer the rank of e_mean_Kuf, hence we explicitly set the rank of the tensor: e_mean_Kuf = tf.reshape(e_mean_Kuf, [num_data, num_func, num_ind]) e_fmean_mean = tf.einsum("nqm,mz->nqz", e_mean_Kuf, Lit_q_mu) # N x D x D e_related_to_mean = e_fmean_mean + tf.matrix_transpose( e_fmean_mean) + e_mean_mean if full_cov_output: fvar = ( tf.matrix_diag( tf.tile( (eKff - tf.trace(Li_eKuffu_Lit))[:, None], [1, num_func])) + tf.matrix_diag(tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov)) + # tf.matrix_diag(tf.trace(tf.matmul(Li_eKuffu_Lit, cov))) + tf.einsum("ig,nij,jh->ngh", q_mu, Li_eKuffu_Lit, q_mu) - # tf.matmul(q_mu, tf.matmul(Li_eKuffu_Lit, q_mu), transpose_a=True) - fmean[:, :, None] * fmean[:, None, :] + e_related_to_mean) else: fvar = ((eKff - tf.trace(Li_eKuffu_Lit))[:, None] + tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov) + tf.einsum("ig,nij,jg->ng", q_mu, Li_eKuffu_Lit, q_mu) - fmean**2 + tf.matrix_diag_part(e_related_to_mean)) return fmean, fvar
def cholesky_covariance(x, sample_axis=0, keepdims=False, name=None): """Cholesky factor of the covariance matrix of vector-variate random samples. This function can be use to fit a multivariate normal to data. ```python tf.enable_eager_execution() import tensorflow_probability as tfp tfd = tfp.distributions # Assume data.shape = (1000, 2). 1000 samples of a random variable in R^2. observed_data = read_data_samples(...) # The mean is easy mu = tf.reduce_mean(observed_data, axis=0) # Get the scale matrix L = tfp.stats.cholesky_covariance(observed_data) # Make the best fit multivariate normal (under maximum likelihood condition). mvn = tfd.MultivariateNormalTriL(loc=mu, scale_tril=L) # Plot contours of the pdf. xs, ys = tf.meshgrid( tf.linspace(-5., 5., 50), tf.linspace(-5., 5., 50), indexing='ij') xy = tf.stack((tf.reshape(xs, [-1]), tf.reshape(ys, [-1])), axis=-1) pdf = tf.reshape(mvn.prob(xy), (50, 50)) CS = plt.contour(xs, ys, pdf, 10) plt.clabel(CS, inline=1, fontsize=10) ``` Why does this work? Given vector-variate random variables `X = (X1, ..., Xd)`, one may obtain the sample covariance matrix in `R^{d x d}` (see `tfp.stats.covariance`). The [Cholesky factor](https://en.wikipedia.org/wiki/Cholesky_decomposition) of this matrix is analogous to standard deviation for scalar random variables: Suppose `X` has covariance matrix `C`, with Cholesky factorization `C = L L^T` Then multiplying a vector of iid random variables which have unit variance by `L` produces a vector with covariance `L L^T`, which is the same as `X`. ```python observed_data = read_data_samples(...) L = tfp.stats.cholesky_covariance(observed_data, sample_axis=0) # Make fake_data with the same covariance as observed_data. uncorrelated_normal = tf.random_normal(shape=(500, 10)) fake_data = tf.linalg.matvec(L, uncorrelated_normal) ``` Args: x: Numeric `Tensor`. The rightmost dimension of `x` indexes events. E.g. dimensions of a random vector. sample_axis: Scalar or vector `Tensor` designating axis holding samples. Default value: `0` (leftmost dimension). Cannot be the rightmost dimension (since this indexes events). keepdims: Boolean. Whether to keep the sample axis as singletons. name: Python `str` name prefixed to Ops created by this function. Default value: `None` (i.e., `'covariance'`). Returns: chol: `Tensor` of same `dtype` as `x`. The last two dimensions hold lower triangular matrices (the Cholesky factors). """ with tf.name_scope(name, 'cholesky_covariance', values=[x, sample_axis]): sample_axis = tf.convert_to_tensor(sample_axis, dtype=tf.int32) cov = covariance(x, sample_axis=sample_axis, event_axis=-1, keepdims=keepdims) return tf.cholesky(cov)
def symmetric_log_det(x, name=None):
    """
    Compute the log determinant of a symmetric positive definite matrix.
    """
    chol = tf.cholesky(as_tensor(x))
    return cholesky_log_det(chol, name)
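# This relies on the standard identity for symmetric positive-definite A = L L^T:
#   log|A| = 2 * sum_i log(L_ii)
# with the sum over the Cholesky diagonal presumably carried out by cholesky_log_det.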
print("# Convert to tensor") ConvertToTensor = tf.convert_to_tensor( np.array([[1., 2.0, 3.0], [-3.0, -7.0, -1.], [0., 5.0, -2.]])) print(sess.run(ConvertToTensor)) print("##Matric Add Operation") print(sess.run(TwotimeThreeMartix + TwotimeThreeConstantMartic)) print("##matrix Sub Operation") print(sess.run(TwotimeThreeMartix - TwotimeThreeConstantMartic)) print("##Matrix Multiplication") print(sess.run(tf.matmul(TwotimeThreeConstantMartic, identity_matrix))) print("##martix Transpose") print(sess.run(tf.transpose(TwotimeThreeConstantMartic))) print("##Matrix Determinant") print(sess.run(tf.matrix_determinant(ConvertToTensor))) print("##Martic Inverse") print(sess.run(tf.matrix_inverse(ConvertToTensor))) print("## Cholesky Decomposition") print(sess.run(tf.cholesky(identity_matrix))) eigenvalue, eigenvectors = sess.run(tf.self_adjoint_eig(identity_matrix)) print("Value is", eigenvalue) print("Vector is ", eigenvectors)
# it seems like the test and training data need to have the same N
X_test, y_test = X_test[:-1, :], y_test[:-1]

# unfortunately not sure how to make the linear kernel work at this moment
N, P = X_train.shape
X_tf = tf.placeholder(tf.float32, [N, P])

# latent stochastic function
# ok so here in the loc position is where we can get (x *element-wise* b)
b = Bernoulli(varbvs_prior, dtype=np.float32)  # prior from varbvs
gp_mu = tf.reduce_mean(
    tf.multiply(X_tf, tf.reshape(tf.tile(b, [N]), [N, P])), 1)  # mean for prior over GP

f = MultivariateNormalTriL(
    loc=gp_mu,
    scale_tril=tf.cholesky(rbf(X_tf))  # uses rbf kernel for covariance of GP for now
)

qf = Normal(loc=tf.get_variable("qf/loc", [N]),
            scale=tf.nn.softplus(tf.get_variable("qf/scale", [N])))

# response
y_tf = Bernoulli(logits=f)

# inference
infer = ed.KLqp({f: qf}, data={X_tf: X_train, y_tf: y_train})
infer.run(n_samples=3, n_iter=5000)

# criticism
y_post = ed.copy(y_tf, {f: qf})
ed.evaluate('binary_accuracy', data={X_tf: X_test, y_post: y_test})
def fully_correlated_conditional_repeat(Kmn, Kmm, Knn, f, *, full_cov=False, full_output_cov=False, q_sqrt=None, white=False): """ This function handles conditioning of multi-output GPs in the case where the conditioning points are all fully correlated, in both the prior and posterior. Note: This conditional can handle 'repetitions' R, given in `f` and `q_sqrt`. :param Kmn: LM x N x P :param Kmm: LM x LM :param Knn: N x P or N x P x N x P :param f: data matrix, LM x R :param q_sqrt: R x LM x LM or R x ML :param full_cov: calculate covariance between inputs :param full_output_cov: calculate covariance between outputs :param white: use whitened representation :return: - mean: R x N x P - variance: R x N x P, R x N x P x P, R x P x N x N, R x N x P x N x P """ logger.debug("fully correlated conditional") R = tf.shape(f)[1] M, N, K = [tf.shape(Kmn)[i] for i in range(Kmn.shape.ndims)] Lm = tf.cholesky(Kmm) # Compute the projection matrix A # Lm: M x M Kmn: M x NK Kmn = tf.reshape(Kmn, (M, N * K)) # M x NK A = tf.matrix_triangular_solve(Lm, Kmn, lower=True) # M x NK Ar = tf.reshape(A, (M, N, K)) # compute the covariance due to the conditioning if full_cov and full_output_cov: # fvar = Knn - tf.matmul(Ar, Ar, transpose_a=True) # NK x NK, then reshape? fvar = Knn - tf.tensordot(Ar, Ar, [[0], [0]]) # N x K x N x K elif full_cov and not full_output_cov: At = tf.transpose(Ar) # K x N x M fvar = Knn - tf.matmul(At, At, transpose_b=True) # K x N x N elif not full_cov and full_output_cov: # This transpose is annoying At = tf.transpose(Ar, [1, 0, 2]) # N x M x K # fvar = Knn - tf.einsum('mnk,mnl->nkl', Ar, Ar) fvar = Knn - tf.matmul(At, At, transpose_a=True) # N x K x K elif not full_cov and not full_output_cov: # Knn: N x K fvar = Knn - tf.reshape(tf.reduce_sum(tf.square(A), [0]), (N, K)) # Can also do this with a matmul # another backsubstitution in the unwhitened case if not white: # A = tf.matrix_triangular_solve(tf.matrix_transpose(Lm), A, lower=False) # M x NK raise NotImplementedError("Need to verify this.") # pragma: no cover # f: M x R fmean = tf.matmul(f, A, transpose_a=True) # R x M * M x NK -> R x NK fmean = tf.reshape(fmean, (R, N, K)) # R x N x K if q_sqrt is not None: Lf = tf.matrix_band_part(q_sqrt, -1, 0) # R x M x M if q_sqrt.get_shape().ndims == 3: A_tiled = tf.tile(A[None, :, :], tf.stack([R, 1, 1])) # R x M x NK LTA = tf.matmul(Lf, A_tiled, transpose_a=True) # R x M x NK elif q_sqrt.get_shape().ndims == 2: # pragma: no cover raise NotImplementedError("Does not support diagonal q_sqrt yet...") else: # pragma: no cover raise ValueError("Bad dimension for q_sqrt: %s" % str(q_sqrt.get_shape().ndims)) if full_cov and full_output_cov: addvar = tf.matmul(LTA, LTA, transpose_a=True) # R x NK x NK fvar = fvar[None, :, :, :, :] + tf.reshape(addvar, (R, N, K, N, K)) elif full_cov and not full_output_cov: LTAr = tf.transpose(tf.reshape(LTA, [R, M, N, K]), [0, 3, 1, 2]) # R x K x M x N addvar = tf.matmul(LTAr, LTAr, transpose_a=True) # R x K x N x N fvar = fvar[None, ...] + addvar # R x K x N x N elif not full_cov and full_output_cov: LTAr = tf.transpose(tf.reshape(LTA, (R, M, N, K)), [0, 2, 3, 1]) # R x N x K x M fvar = fvar[None, ...] + tf.matmul(LTAr, LTAr, transpose_b=True) # R x N x K x K elif not full_cov and not full_output_cov: addvar = tf.reshape(tf.reduce_sum(tf.square(LTA), axis=1), (R, N, K)) # R x N x K fvar = fvar[None, ...] + addvar # R x N x K else: fvar = tf.broadcast_to(fvar[None], tf.shape(fmean)) return fmean, fvar
def main(): u.reset_timeit() iters = 11 n = 10000 print(f"Benchmarking n={n}") ############################################################ # Numpy ############################################################ A = scipy.randn(n, n) # random matrix A = A @ A.T # positive definite matrix A = scipy.linalg.cholesky(A) # upper diagonal matrix b = scipy.randn(n) u.reset_timeit() for i in range(iters): with u.timeit('numpy'): scipy.linalg.solve_triangular(A, b) ############################################################ # PyTorch GPU ############################################################ A = torch.randn(n, n) A = A @ A.t() + torch.diag(torch.ones(n)) A = torch.potrf(A).cuda() b = torch.randn(n, 1).cuda() # prewarm torch.trtrs(b, A) for i in range(iters): torch.cuda.synchronize() with u.timeit('Pytorch GPU'): result = torch.trtrs(b, A) torch.cuda.synchronize() del result ############################################################ # PyTorch CPU ############################################################ A = torch.randn(n, n) A = A @ A.t() + torch.diag(torch.ones(n)) A = torch.potrf(A) b = torch.randn(n, 1) # prewarm (result, A_clone) = torch.trtrs(b, A) assert result.device.type == 'cpu' for i in range(iters): torch.cuda.synchronize() with u.timeit('Pytorch CPU'): result = torch.trtrs(b, A) torch.cuda.synchronize() del result ############################################################ # PyTorch GPU ############################################################ A = torch.randn(n, n) A = A @ A.t() + torch.diag(torch.ones(n)) A = torch.potrf(A).cuda() b = torch.randn(n, 1).cuda() # prewarm (result, A_clone) = torch.trtrs(b, A) assert result.device.type == 'cuda' for i in range(iters): torch.cuda.synchronize() with u.timeit('Pytorch GPU'): (result, dummy) = torch.trtrs(b, A) print(result[0, 0]) # torch.cuda.synchronize() del result ############################################################ # Tensorflow GPU ############################################################ A = tf.random_normal((n, n)).gpu() b = tf.random_normal((n, 1)).gpu() A = A @ tf.transpose(A) + tf.diag(tf.ones( (n, ))) # bug, diag is needed, or Cholesky fails A = tf.cholesky(A) # bug, Should be able to do constant conversion, but fails with # Internal: failed to query device pointer for context: CUDA_ERROR_INVALID_VALUE # A = tf.constant(A).gpu() # b = tf.constant(b).gpu() # prewarm result = tf.contrib.eager.Variable(tf.zeros((n, 1))) result.assign(tf.linalg.triangular_solve(A, b)) assert 'gpu' in result.device.lower() for i in range(iters): b += 1 # prevent caching with u.timeit('TF GPU'): result.assign(tf.linalg.triangular_solve(A, b)) print(result[0, 0]) ############################################################ # Tensorflow CPU ############################################################ A = tf.random_normal((n, n)).cpu() b = tf.random_normal((n, 1)).cpu() A = A @ tf.transpose(A) + tf.diag(tf.ones( (n, ))) # bug, diag is needed, or Cholesky fails A = tf.cholesky(A) A = A.cpu() b = b.cpu() # prewarm with tf.device('/cpu:0'): result = tf.contrib.eager.Variable(tf.zeros((n, 1))) result.assign(tf.linalg.triangular_solve(A, b)) assert 'cpu' in result.device.lower() for i in range(iters): b += 1 # prevent caching with u.timeit('TF CPU'): result.assign(tf.linalg.triangular_solve(A, b)) u.summarize_timeit()
def get_sgpr_parameters(self): """Get parameters from a Gpflow Sparse Variational GP Regressor.""" num_inducing_points = len(self.feature) # Reference # https://github.com/GPflow/GPflow/blob/develop/doc/source/notebooks/SGPR_notes.ipynb # # Predictive distribution # p(f*) = Normal(mean=K_{*u} L^{-T}L_B^{-T}c, # cov=K_{**} - K_{*u} L^{-T} (1-B^{-1}) L^{-1} K_{u*}) # # where # u: Inducing points # f: Data points # *: Prediction points # # Code based on SGPR._build_predict with tf.name_scope("Kuf"): # [NUM_INDUCING, NUM_DATA] Kuf = gpflow.features.Kuf(self.feature, self.kern, self.X) with tf.name_scope("Kuu"): # [NUM_INDUCING, NUM_INDUCING] Kuu = gpflow.features.Kuu(self.feature, self.kern, jitter=gpflow.settings.numerics.jitter_level) with tf.name_scope("sigma"): # [] sigma = tf.sqrt(self.likelihood.variance) with tf.name_scope("eye"): # [NUM_INDUCING, NUM_INDUCING] eye = tf.eye(num_inducing_points, dtype=gpflow.settings.float_type) with tf.name_scope("L"): # [NUM_INDUCING, NUM_INDUCING] L = tf.cholesky(Kuu) with tf.name_scope("A"): # [NUM_INDUCING, NUM_DATA] A = tf.matrix_triangular_solve(L, Kuf, lower=True) / sigma with tf.name_scope("B"): # [NUM_INDUCING, NUM_INDUCING] B = tf.matmul(A, A, transpose_b=True) + eye with tf.name_scope("LB"): # [NUM_INDUCING, NUM_INDUCING] LB = tf.cholesky(B) with tf.name_scope("Ay"): # [NUM_INDUCING, OUT_DIM] Ay = tf.matmul(A, self.Y) with tf.name_scope("c"): # [NUM_INDUCING, OUT_DIM] c = tf.matrix_triangular_solve(LB, Ay, lower=True) / sigma with tf.name_scope("tmp1"): # [NUM_INDUCING, NUM_INDUCING] tmp1 = tf.matrix_triangular_solve(L, eye, lower=True) with tf.name_scope("tmp2"): # [NUM_INDUCING, NUM_INDUCING] tmp2 = tf.matrix_triangular_solve(LB, tmp1, lower=True) with tf.name_scope("alpha"): # [NUM_INDUCING, OUT_DIM] alpha = tf.matmul(tmp2, c, transpose_a=True) return { "inducing_points": self.feature.Z, "coefficients": tf.matrix_transpose(alpha), "signal_variance": self.kern.variance[None], "length_scale": self.kern.lengthscales[None, :], "noise_variance": self.likelihood.variance[None], "gram_L": L[None, :, :], "B_L": LB[None, :, :], }
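# In the notation of the comment block above, the returned coefficients are
#   alpha = L^{-T} L_B^{-T} c,  with  A = sigma^{-1} L^{-1} K_uf,  B = A A^T + I,
#   L_B = chol(B)  and  c = sigma^{-1} L_B^{-1} A y,
# so that the SGPR predictive mean at test points is K_{*u} alpha.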
def draw_GP(Yi, Ti, Xi, ind_kfi, ind_kti, method, gp_params): """ given GP hyperparams and data values at observation times, draw from conditional GP inputs: length,noises,Lf,Kf: GP params Yi: observation values Ti: observation times Xi: grid points (new times for tcn) ind_kfi,ind_kti: indices into Y returns: draws from the GP at the evenly spaced grid times Xi, given hyperparams and data """ n_mc_smps, length, noises, Lf, Kf = gp_params.n_mc_smps, gp_params.length, gp_params.noises, gp_params.Lf, gp_params.Kf M = gp_params.M ny = tf.shape(Yi)[0] K_tt = OU_kernel(length, Ti, Ti) D = tf.diag(noises) grid_f = tf.meshgrid(ind_kfi, ind_kfi) #same as np.meshgrid Kf_big = tf.gather_nd(Kf, tf.stack((grid_f[0], grid_f[1]), -1)) grid_t = tf.meshgrid(ind_kti, ind_kti) Kt_big = tf.gather_nd(K_tt, tf.stack((grid_t[0], grid_t[1]), -1)) Kf_Ktt = tf.multiply(Kf_big, Kt_big) DI_big = tf.gather_nd(D, tf.stack((grid_f[0], grid_f[1]), -1)) DI = tf.diag(tf.diag_part(DI_big)) #D kron I #data covariance. #Either need to take Cholesky of this or use CG / block CG for matrix-vector products Ky = Kf_Ktt + DI + method.add_diag * tf.eye(ny) ### build out cross-covariances and covariance at grid nx = tf.shape(Xi)[0] K_xx = OU_kernel(length, Xi, Xi) K_xt = OU_kernel(length, Xi, Ti) ind = tf.concat([tf.tile([i], [nx]) for i in range(M)], 0) grid = tf.meshgrid(ind, ind) Kf_big = tf.gather_nd(Kf, tf.stack((grid[0], grid[1]), -1)) ind2 = tf.tile(tf.range(nx), [M]) grid2 = tf.meshgrid(ind2, ind2) Kxx_big = tf.gather_nd(K_xx, tf.stack((grid2[0], grid2[1]), -1)) K_ff = tf.multiply(Kf_big, Kxx_big) #cov at grid points full_f = tf.concat([tf.tile([i], [nx]) for i in range(M)], 0) grid_1 = tf.meshgrid(full_f, ind_kfi, indexing='ij') Kf_big = tf.gather_nd(Kf, tf.stack((grid_1[0], grid_1[1]), -1)) full_x = tf.tile(tf.range(nx), [M]) grid_2 = tf.meshgrid(full_x, ind_kti, indexing='ij') Kxt_big = tf.gather_nd(K_xt, tf.stack((grid_2[0], grid_2[1]), -1)) K_fy = tf.multiply(Kf_big, Kxt_big) #now get draws! y_ = tf.reshape(Yi, [-1, 1]) xi = tf.random_normal((nx * M, n_mc_smps)) #print('xi shape:') #print(xi.shape) if method.methodname == 'chol': Ly = tf.cholesky(Ky) Mu = tf.matmul(K_fy, tf.cholesky_solve(Ly, y_)) Sigma = K_ff - tf.matmul(K_fy, tf.cholesky_solve( Ly, tf.transpose(K_fy))) + method.add_diag * tf.eye(tf.shape(K_ff)[0]) #Exp2: increase noise on Sigma 1e-6 to 1e-3, to 1e-1? #Sigma = tf.cast(Sigma, tf.float64) ## Experiment: is chol instable and needs float64? Will this crash Memory? #draw = Mu + tf.matmul(tf.cast(tf.cholesky(Sigma),tf.float32),xi) draw = Mu + tf.matmul(tf.cholesky(Sigma), xi) draw_reshape = tf.transpose(tf.reshape(tf.transpose(draw), [n_mc_smps, M, nx]), perm=[0, 2, 1]) #print('cholesky draw:') #print(sess.run(draw_reshape)) elif method.methodname == 'cg': Mu = tf.matmul(K_fy, CG(Ky, y_)) #May be faster with CG for large problems #Never need to explicitly compute Sigma! Just need matrix products with Sigma in Lanczos algorithm def Sigma_mul(vec): # vec must be a 2d tensor, shape (?,?) return tf.matmul(K_ff, vec) - tf.matmul( K_fy, block_CG(Ky, tf.matmul(tf.transpose(K_fy), vec))) def large_draw(): return Mu + block_Lanczos( Sigma_mul, xi, n_mc_smps) #no need to explicitly reshape Mu #draw = tf.cond(tf.less(nx*M,BLOCK_LANC_THRESH),small_draw,large_draw) draw = large_draw() draw_reshape = tf.transpose(tf.reshape(tf.transpose(draw), [n_mc_smps, M, nx]), perm=[0, 2, 1]) #print('cg draw shape:') #print(draw_reshape.shape) return draw_reshape
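# Both branches above draw from the same conditional; in the Cholesky branch the
# moments are (with a small jitter added to Sigma for numerical stability)
#   Mu    = K_fy K_y^{-1} y
#   Sigma = K_ff - K_fy K_y^{-1} K_yf
#   draw  = Mu + chol(Sigma) xi,   xi ~ N(0, I)
# while the CG branch only ever uses matrix-vector products with Sigma.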
def conditional(Xnew, X, kern, f, full_cov=False, q_sqrt=None, whiten=False): """ Given F, representing the GP at the points X, produce the mean and (co-)variance of the GP at the points Xnew. Additionally, there my be Gaussian uncertainty about F as represented by q_sqrt. In this case `f` represents the mean of the distribution and q_sqrt the square-root of the covariance. Additionally, the GP may have been centered (whitened) so that p(v) = N( 0, I) f = L v thus p(f) = N(0, LL^T) = N(0, K). In this case 'f' represents the values taken by v. The method can either return the diagonals of the covariance matrix for each output of the full covariance matrix (full_cov). We assume K independent GPs, represented by the columns of f (and the last dimension of q_sqrt). - Xnew is a data matrix, size N x D - X are data points, size M x D - kern is a GPflow kernel - f is a data matrix, M x K, representing the function values at X, for K functions. - q_sqrt (optional) is a matrix of standard-deviations or Cholesky matrices, size M x K or M x M x K - whiten (optional) is a boolean: whether to whiten the representation as described above. These functions are now considered deprecated, subsumed into this one: gp_predict gaussian_gp_predict gp_predict_whitened gaussian_gp_predict_whitened """ # compute kernel stuff num_data = tf.shape(X)[0] Kmn = kern.K(X, Xnew) Kmm = kern.K(X) + eye(num_data) * 1e-6 Lm = tf.cholesky(Kmm) # Compute the projection matrix A A = tf.matrix_triangular_solve(Lm, Kmn, lower=True) # compute the covariance due to the conditioning if full_cov: fvar = kern.K(Xnew) - tf.matmul(A, A, transpose_a=True) shape = tf.pack([tf.shape(f)[1], 1, 1]) else: fvar = kern.Kdiag(Xnew) - tf.reduce_sum(tf.square(A), 0) shape = tf.pack([tf.shape(f)[1], 1]) fvar = tf.tile(tf.expand_dims(fvar, 0), shape) # D x N x N or D x N # another backsubstitution in the unwhitened case if not whiten: A = tf.matrix_triangular_solve(tf.transpose(Lm), A, lower=False) # construct the conditional mean fmean = tf.matmul(tf.transpose(A), f) if q_sqrt is not None: if q_sqrt.get_shape().ndims == 2: LTA = A * tf.expand_dims(tf.transpose(q_sqrt), 2) # D x M x N elif q_sqrt.get_shape().ndims == 3: L = tf.batch_matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1, 0) # D x M x M A_tiled = tf.tile(tf.expand_dims(A, 0), tf.pack([tf.shape(f)[1], 1, 1])) LTA = tf.batch_matmul(L, A_tiled, adj_x=True) # D x M x N else: # pragma: no cover raise ValueError("Bad dimension for q_sqrt: %s" % str(q_sqrt.get_shape().ndims)) if full_cov: fvar = fvar + tf.batch_matmul(LTA, LTA, adj_x=True) # D x N x N else: fvar = fvar + tf.reduce_sum(tf.square(LTA), 1) # D x N fvar = tf.transpose(fvar) # N x D or N x N x D return fmean, fvar
def backward_tensor(self, y):
    return tf.cholesky(y)
def base_conditional(Kmn, Kmm, Knn, f, *, full_cov=False, q_sqrt=None, white=False):
    """
    Given jointly Gaussian g1 and g2 with

        p(g2) = N(g2; 0, Kmm)
        p(g1) = N(g1; 0, Knn)
        cov[g1, g2] = Knm

    and a Gaussian approximation

        q(g2) = N(g2; f, q_sqrt q_sqrt^T)

    this method computes the mean and (co)variance of

        q(g1) = \int q(g2) p(g1|g2) dg2

    :param Kmn: M x N
    :param Kmm: M x M
    :param Knn: N x N or N
    :param f: M x R
    :param full_cov: bool
    :param q_sqrt: None or R x M x M (lower triangular)
    :param white: bool
    :return: mean (N x R) and variance (N x R, or R x N x N if full_cov)
    """
    logger.debug("base conditional")
    # compute kernel stuff
    num_func = tf.shape(f)[1]  # R
    Lm = tf.cholesky(Kmm)

    # Compute the projection matrix A
    A = tf.matrix_triangular_solve(Lm, Kmn, lower=True)

    # compute the covariance due to the conditioning
    if full_cov:
        fvar = Knn - tf.matmul(A, A, transpose_a=True)
        fvar = tf.tile(fvar[None, :, :], [num_func, 1, 1])  # R x N x N
    else:
        fvar = Knn - tf.reduce_sum(tf.square(A), 0)
        fvar = tf.tile(fvar[None, :], [num_func, 1])  # R x N

    # another backsubstitution in the unwhitened case
    if not white:
        A = tf.matrix_triangular_solve(tf.transpose(Lm), A, lower=False)

    # construct the conditional mean
    fmean = tf.matmul(A, f, transpose_a=True)

    if q_sqrt is not None:
        if q_sqrt.get_shape().ndims == 2:
            LTA = A * tf.expand_dims(tf.transpose(q_sqrt), 2)  # R x M x N
        elif q_sqrt.get_shape().ndims == 3:
            L = tf.matrix_band_part(q_sqrt, -1, 0)  # R x M x M
            A_tiled = tf.tile(tf.expand_dims(A, 0), tf.stack([num_func, 1, 1]))
            LTA = tf.matmul(L, A_tiled, transpose_a=True)  # R x M x N
        else:  # pragma: no cover
            raise ValueError("Bad dimension for q_sqrt: %s" % str(q_sqrt.get_shape().ndims))
        if full_cov:
            fvar = fvar + tf.matmul(LTA, LTA, transpose_a=True)  # R x N x N
        else:
            fvar = fvar + tf.reduce_sum(tf.square(LTA), 1)  # R x N

    if not full_cov:
        fvar = tf.transpose(fvar)  # N x R

    return fmean, fvar  # N x R, R x N x N or N x R
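# For the unwhitened case (white=False) with q(g2) = N(f, S), S = q_sqrt q_sqrt^T,
# the moments returned above reduce to
#   mean = K_nm K_mm^{-1} f
#   var  = K_nn - K_nm K_mm^{-1} K_mn + K_nm K_mm^{-1} S K_mm^{-1} K_mn
# In the whitened case the K_mm^{-1} factors flanking f and S are replaced by
# L_m^{-T} (on the left) and L_m^{-1} (on the right), with L_m = chol(K_mm).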
def _encode(self, tensor_dict: Dict[bytes, tf.Tensor]) -> tf.Tensor: """Encode an input tensor based on DKF model. Args: tensor_dict: A dict of tensor. Returns: dict of network output tensor. """ train_and_predict_tensors = self._get_train_and_predict_tensors( tensor_dict, self._config.sys_id_len) obs_train_tensor, obs_train_mask_tensor, intervention_train_tensor, obs_to_trigger_tensor, obs_to_trigger_mask_tensor, intervention_to_trigger_tensor, biomarker_boolean_mask_tensor = train_and_predict_tensors # pylint: batch_size = obs_train_tensor.get_shape().as_list()[0] states = self.deep_smooth(obs_train_tensor, intervention_train_tensor, obs_train_mask_tensor) state_for_prediction = self.deep_smooth(obs_to_trigger_tensor, intervention_to_trigger_tensor, obs_to_trigger_mask_tensor) # mu_smooth shape [bs, tlen, _z_dim] mu_smooth = states[0] mu_prediction = state_for_prediction[0] # mu_smooth shape [bs, tlen, _z_dim, _z_dim] sigma_smooth = states[1] sigma_prediction = state_for_prediction[1] # Sample from smoothing distribution if self._config.use_jitter: jitter = 1e-2 * tf.eye( tf.shape(sigma_smooth)[-1], batch_shape=tf.shape(sigma_smooth)[0:-2]) mvn_smooth = tfp.distributions.MultivariateNormalTriL( mu_smooth, sigma_smooth + jitter) else: mvn_smooth = tfp.distributions.MultivariateNormalTriL( mu_smooth, sigma_smooth) # Note the following method is not stable on cholesky op. # mvn_smooth = MultivariateNormalTriL(mu_smooth, tf.cholesky(Sigma_smooth)) # z_smooth shape [bs, tlen, _z_dim]; z_smooth = mvn_smooth.sample() # Transition distribution \prod_{t=2}^T p(z_t|z_{t-1}, u_{t}) # We use tm1 to denote t-1; # state_tran_z_tm1 to denote state_tran(z_{t-1}). # control_tran_u_t to denote control_tran(u_t). # We need to evaluate N(z_t; state_tran_z_tm1 + control_tran_u_t, Q) # Roll left to remove the first input # intervention_tensor: [bs, tlen, _u_dim] z_tm1 = z_smooth[:, :-1, :] u_t = intervention_train_tensor[:, 1:, :] tf.logging.info(u_t) # mu_transition shape [bs * (tlen - 1), _z_dim] mu_transition = tf.reshape( self.state_tran(z_tm1) + self.control_tran(u_t), [-1, self._z_dim]) # z_t_transition [bs * (tlen - 1), _z_dim] z_t_transition = tf.reshape(z_smooth[:, 1:, :], [-1, self._z_dim]) # We transform the rand var to be zero-mean: # N(z_t; Az_tm1 + Bu_t, Q) as N(z_t - Az_tm1 - Bu_t; 0, Q) trans_centered = z_t_transition - mu_transition # mvn_transition [bs * (tlen - 1), self._z_dim] mvn_transition = MultivariateNormalTriL( tf.zeros(self._z_dim), tf.cholesky(self.state_noise)) # log_prob_transition [bs * (tlen - 1)] log_prob_transition = mvn_transition.log_prob(trans_centered) ## Emission distribution \prod_{t=1}^T p(obs_t|z_t) # We need to evaluate N(y_t; Cz_t, R). We write it as N(y_t - Cz_t; 0, R) # z_smooth shape [bs, tlen, z_dim]; # self.obs_emission shape [a_dim, z_dim]; # obs_emission_z_t shape [bs, tlen, _a_dim] obs_emission_z_t = self.obs_emission(z_smooth) obs_emission_z_t_resh = tf.reshape(obs_emission_z_t, [-1, self._out_obs_dim]) # observation tensor reshaped. 
tf.logging.info(biomarker_boolean_mask_tensor) # [num_obs] tf.logging.info(obs_train_tensor) # [bs, tlen, num_obs] y_t_resh = tf.reshape( tf.transpose( tf.boolean_mask( tf.transpose(obs_train_tensor, [2, 0, 1]), biomarker_boolean_mask_tensor), [1, 2, 0]), [-1, self._out_obs_dim]) emiss_centered = y_t_resh - obs_emission_z_t_resh mask_flat = tf.reshape( tf.transpose( tf.boolean_mask( tf.transpose(obs_train_mask_tensor, [2, 0, 1]), biomarker_boolean_mask_tensor), [1, 2, 0]), [-1, self._out_obs_dim]) # set missing obs emission center to be zero. # emiss_centered shape [bs * tlen, _a_dim] emiss_centered = tf.multiply(mask_flat, emiss_centered) mvn_emission = MultivariateNormalTriL( tf.zeros(self._out_obs_dim), tf.cholesky(self.obs_noise)) # log_prob_emission shape [bs * tlen]. log_prob_emission = mvn_emission.log_prob(emiss_centered) if self._config.pretrain_interv: # Interv distribution \prod_{t=0}^T-1 p(interv_t+1|z_t) interv_forecast_z_t = self.interv_forecast(z_tm1) interv_forecast_z_t_resh = tf.reshape(interv_forecast_z_t, [-1, self._u_dim]) u_t_resh = tf.reshape(u_t, [-1, self._u_dim]) interv_centered = u_t_resh - interv_forecast_z_t_resh mvn_interv = MultivariateNormalTriL( tf.zeros(self._u_dim), tf.cholesky(self.interv_noise)) # log_prob_interv shape [bs * tlen]. log_prob_interv = mvn_interv.log_prob(interv_centered) ## Distribution of the initial state p(z_1|z_0) z_0 = z_smooth[:, 0, :] init_mu = tf.zeros([batch_size, self._z_dim]) init_sigma = tf.reshape( tf.tile( tf.eye(self._z_dim, num_columns=self._z_dim), tf.constant([batch_size, 1])), [batch_size, self._z_dim, self._z_dim]) mvn_0 = MultivariateNormalTriL(init_mu, tf.cholesky(init_sigma)) log_prob_0 = mvn_0.log_prob(z_0) # Entropy log(\prod_{t=1}^T p(z_t|y_{1:T}, u_{1:T})) entropy = -mvn_smooth.log_prob(z_smooth) entropy = tf.reshape(entropy, [-1]) # entropy = tf.zeros(()) log_probs = [ tf.reduce_mean(log_prob_transition), tf.reduce_mean(log_prob_emission), tf.reduce_mean(log_prob_0), tf.reduce_mean(entropy) ] if self._config.pretrain_interv: log_probs = log_probs + [tf.reduce_mean(log_prob_interv)] kf_elbo = tf.reduce_sum(log_probs) state_loss = [ tf.reduce_mean(log_prob_transition), tf.reduce_mean(log_prob_0), tf.reduce_mean(entropy) ] state_only_loss = tf.reduce_sum(state_loss) output = dict() # loss and obs prediction. if self._config.sys_id_len > 0: tlen = self._config.sys_id_len else: tlen = self._config.context_window_size # obs_est starting from t=2 # obs_est only for output prediction, not used for loss computation. output['obs_est'] = tf.reshape(obs_emission_z_t, [-1, tlen, self._out_obs_dim]) # mu_smooth shape [bs, tlen, z_dim]; # final state_encoding shape [bs, z_dim] output['state_encoding'] = mu_prediction[:, -1, :] # final state_traj_encoding shape [bs, tlen, z_dim] output['state_traj_encoding'] = mu_prediction[:, :, :] # full_state_encoding carries mu_smooth[:, -1, :] and # sigma_smooth [:, -1, :, :] to reconstruct the full distribution. # Its shape is [bs, z_dim, z_dim + 1] output['full_state_encoding'] = tf.concat([ tf.expand_dims(mu_prediction[:, -1, :], axis=-1), sigma_prediction[:, -1, :, :] ], axis=2) if self._config.state_only_loss: output['loss'] = -state_only_loss else: output['loss'] = -kf_elbo # output['last_obs'] shape [bs, _out_obs_dim] output['last_obs'] = tf.squeeze( tf.slice(obs_to_trigger_tensor, [0, self._config.context_len_to_trigger - 1, 0], [-1, 1, -1])) if self._config.forecast_biomarkers: # switch shape to [_out_obs_dim, bs] for applying mask. 
output['last_obs'] = tf.boolean_mask( tf.transpose(output['last_obs']), biomarker_boolean_mask_tensor) # transpose shape back. output['last_obs'] = tf.transpose(output['last_obs']) output['state_loss'] = state_loss tf.logging.info(output) return output
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

sess = tf.Session()

# synthetic data: y = x + noise
x_vals = np.linspace(0, 10, 100)
y_vals = x_vals + np.random.normal(0, 1, 100)

# design matrix A = [x, 1] and target column b
x_vals_column = np.transpose(np.matrix(x_vals))
ones_column = np.transpose(np.matrix(np.repeat(1, 100)))
A = np.column_stack((x_vals_column, ones_column))
b = np.transpose(np.matrix(y_vals))

A_tensor = tf.constant(A)
b_tensor = tf.constant(b)

# Cholesky factor of the normal-equations matrix A^T A
A_A = tf.matmul(tf.transpose(A_tensor), A_tensor)
L = tf.cholesky(A_A)
A_b = tf.matmul(tf.transpose(A_tensor), b)

# two triangular solves recover the least-squares solution
sol1 = tf.matrix_solve(L, A_b)
sol2 = tf.matrix_solve(tf.transpose(L), sol1)

solution_eval = sess.run(sol2)
slope = solution_eval[0][0]
y_intercept = solution_eval[1][0]
print('slope: ' + str(slope))
print('y_intercept: ' + str(y_intercept))

best_fit = []
for i in x_vals:
    best_fit.append(slope * i + y_intercept)
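# The block above fits the line by solving the normal equations with a Cholesky factor:
#   A^T A x = A^T b,  A^T A = L L^T,
# so sol1 solves L z = A^T b (forward substitution) and sol2 solves L^T x = z
# (back substitution), giving x = (A^T A)^{-1} A^T b = [slope, intercept]^T.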
def __init__(self, loc=None, covariance_matrix=None, validate_args=False, allow_nan_stats=True, name="MultivariateNormalFullCovariance"): """Construct Multivariate Normal distribution on `R^k`. The `batch_shape` is the broadcast shape between `loc` and `covariance_matrix` arguments. The `event_shape` is given by last dimension of the matrix implied by `covariance_matrix`. The last dimension of `loc` (if provided) must broadcast with this. A non-batch `covariance_matrix` matrix is a `k x k` symmetric positive definite matrix. In other words it is (real) symmetric with all eigenvalues strictly positive. Additional leading dimensions (if any) will index batches. Args: loc: Floating-point `Tensor`. If this is set to `None`, `loc` is implicitly `0`. When specified, may have shape `[B1, ..., Bb, k]` where `b >= 0` and `k` is the event size. covariance_matrix: Floating-point, symmetric positive definite `Tensor` of same `dtype` as `loc`. The strict upper triangle of `covariance_matrix` is ignored, so if `covariance_matrix` is not symmetric no error will be raised (unless `validate_args is True`). `covariance_matrix` has shape `[B1, ..., Bb, k, k]` where `b >= 0` and `k` is the event size. validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect outputs. allow_nan_stats: Python `bool`, default `True`. When `True`, statistics (e.g., mean, mode, variance) use the value "`NaN`" to indicate the result is undefined. When `False`, an exception is raised if one or more of the statistic's batch members are undefined. name: Python `str` name prefixed to Ops created by this class. Raises: ValueError: if neither `loc` nor `covariance_matrix` are specified. """ parameters = dict(locals()) # Convert the covariance_matrix up to a scale_tril and call MVNTriL. with tf.name_scope(name) as name: with tf.name_scope("init", values=[loc, covariance_matrix]): dtype = dtype_util.common_dtype([loc, covariance_matrix], tf.float32) loc = loc if loc is None else tf.convert_to_tensor( loc, name="loc", dtype=dtype) if covariance_matrix is None: scale_tril = None else: covariance_matrix = tf.convert_to_tensor( covariance_matrix, name="covariance_matrix", dtype=dtype) if validate_args: covariance_matrix = control_flow_ops.with_dependencies([ tf.assert_near( covariance_matrix, tf.matrix_transpose(covariance_matrix), message="Matrix was not symmetric") ], covariance_matrix) # No need to validate that covariance_matrix is non-singular. # LinearOperatorLowerTriangular has an assert_non_singular method that # is called by the Bijector. # However, cholesky() ignores the upper triangular part, so we do need # to separately assert symmetric. scale_tril = tf.cholesky(covariance_matrix) super(MultivariateNormalFullCovariance, self).__init__( loc=loc, scale_tril=scale_tril, validate_args=validate_args, allow_nan_stats=allow_nan_stats, name=name) self._parameters = parameters
def draw_GP(Yi, Ti, Xi, ind_kfi, ind_kti): """ given GP hyperparams and data values at observation times, draw from conditional GP inputs: length,noises,Lf,Kf: GP params Yi: observation values Ti: observation times Xi: grid points (new times for rnn) ind_kfi,ind_kti: indices into Y returns: draws from the GP at the evenly spaced grid times Xi, given hyperparams and data """ ny = tf.shape(Yi)[0] K_tt = OU_kernel(length, Ti, Ti) D = tf.diag(noises) grid_f = tf.meshgrid(ind_kfi, ind_kfi) #same as np.meshgrid Kf_big = tf.gather_nd(Kf, tf.stack((grid_f[0], grid_f[1]), -1)) grid_t = tf.meshgrid(ind_kti, ind_kti) Kt_big = tf.gather_nd(K_tt, tf.stack((grid_t[0], grid_t[1]), -1)) Kf_Ktt = tf.multiply(Kf_big, Kt_big) DI_big = tf.gather_nd(D, tf.stack((grid_f[0], grid_f[1]), -1)) DI = tf.diag(tf.diag_part(DI_big)) #D kron I #data covariance. #Either need to take Cholesky of this or use CG / block CG for matrix-vector products Ky = Kf_Ktt + DI + 1e-6 * tf.eye(ny) ### build out cross-covariances and covariance at grid nx = tf.shape(Xi)[0] K_xx = OU_kernel(length, Xi, Xi) K_xt = OU_kernel(length, Xi, Ti) ind = tf.concat([tf.tile([i], [nx]) for i in range(M)], 0) grid = tf.meshgrid(ind, ind) Kf_big = tf.gather_nd(Kf, tf.stack((grid[0], grid[1]), -1)) ind2 = tf.tile(tf.range(nx), [M]) grid2 = tf.meshgrid(ind2, ind2) Kxx_big = tf.gather_nd(K_xx, tf.stack((grid2[0], grid2[1]), -1)) K_ff = tf.multiply(Kf_big, Kxx_big) #cov at grid points full_f = tf.concat([tf.tile([i], [nx]) for i in range(M)], 0) grid_1 = tf.meshgrid(full_f, ind_kfi, indexing='ij') Kf_big = tf.gather_nd(Kf, tf.stack((grid_1[0], grid_1[1]), -1)) full_x = tf.tile(tf.range(nx), [M]) grid_2 = tf.meshgrid(full_x, ind_kti, indexing='ij') Kxt_big = tf.gather_nd(K_xt, tf.stack((grid_2[0], grid_2[1]), -1)) K_fy = tf.multiply(Kf_big, Kxt_big) #now get draws! y_ = tf.reshape(Yi, [-1, 1]) #Mu = tf.matmul(K_fy,CG(Ky,y_)) #May be faster with CG for large problems Ly = tf.cholesky(Ky) Mu = tf.matmul(K_fy, tf.cholesky_solve(Ly, y_)) #TODO: it's worth testing to see at what point computation speedup of Lanczos algorithm is useful & needed. # For smaller examples, using Cholesky will probably be faster than this unoptimized Lanczos implementation. # Likewise for CG and BCG vs just taking the Cholesky of Ky once """ #Never need to explicitly compute Sigma! Just need matrix products with Sigma in Lanczos algorithm def Sigma_mul(vec): # vec must be a 2d tensor, shape (?,?) return tf.matmul(K_ff,vec) - tf.matmul(K_fy,block_CG(Ky,tf.matmul(tf.transpose(K_fy),vec))) def small_draw(): return Mu + tf.matmul(tf.cholesky(Sigma),xi) def large_draw(): return Mu + block_Lanczos(Sigma_mul,xi,n_mc_smps) #no need to explicitly reshape Mu BLOCK_LANC_THRESH = 1000 draw = tf.cond(tf.less(nx*M,BLOCK_LANC_THRESH),small_draw,large_draw) """ xi = tf.random_normal((nx * M, n_mc_smps)) Sigma = K_ff - tf.matmul(K_fy, tf.cholesky_solve( Ly, tf.transpose(K_fy))) + 1e-6 * tf.eye(tf.shape(K_ff)[0]) draw = Mu + tf.matmul(tf.cholesky(Sigma), xi) draw_reshape = tf.transpose(tf.reshape(tf.transpose(draw), [n_mc_smps, M, nx]), perm=[0, 2, 1]) return draw_reshape