def compare_KLs(sess, feed_dict, mu, Q_chol, P_chols):
    mu_gpflow = tf.transpose(mu) if mu.shape.ndims == 2 else mu[:, None]
    Q_chol_gpflow = Q_chol if Q_chol.shape.ndims == 3 else Q_chol[None, ...]
    KL_gpflow = sess.run(gauss_kl(q_mu=mu_gpflow, q_sqrt=Q_chol_gpflow, K=None),
                         feed_dict=feed_dict)
    KL_gpt = sess.run(KL(mu_diff=mu, Q_chol=Q_chol, P_chol=None, P=None),
                      feed_dict=feed_dict)
    assert_allclose(KL_gpflow, KL_gpt)
    for P_chol in P_chols:
        P_ndims = P_chol.shape.ndims
        P = tf.square(P_chol) if P_ndims == 1 else tf.matmul(P_chol, P_chol, transpose_b=True)
        KL_gpflow = sess.run(gauss_kl(q_mu=mu_gpflow, q_sqrt=Q_chol_gpflow,
                                      K=tf.diag(P) if P_ndims == 1 else P),
                             feed_dict=feed_dict)
        KL_gpt = sess.run(KL(mu_diff=mu, Q_chol=Q_chol, P_chol=P_chol, P=None),
                          feed_dict=feed_dict)
        assert_allclose(KL_gpflow, KL_gpt)
        KL_gpt = sess.run(KL(mu_diff=mu, Q_chol=Q_chol, P_chol=None, P=P),
                          feed_dict=feed_dict)
        assert_allclose(KL_gpflow, KL_gpt)
        KL_gpt = sess.run(KL(mu_diff=mu, Q_chol=Q_chol, P_chol=P_chol, P=P),
                          feed_dict=feed_dict)
        assert_allclose(KL_gpflow, KL_gpt)
def test_kl_k_cholesky(diag):
    """
    Test that passing K or K_cholesky yields the same answer
    """
    q_mu = Datum.mu
    q_sqrt = Datum.sqrt_diag if diag else Datum.sqrt
    kl_K = gauss_kl(q_mu, q_sqrt, K=Datum.K)
    kl_K_chol = gauss_kl(q_mu, q_sqrt, K_cholesky=Datum.K_cholesky)
    np.testing.assert_allclose(kl_K.numpy(), kl_K_chol.numpy())
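# A minimal usage sketch for the equivalence exercised above (the shapes, names and
# random setup here are assumptions for illustration, not part of the test suite):
# build a positive-definite matrix, take its Cholesky factor with NumPy, and confirm
# that gauss_kl accepts either K or K_cholesky with matching results.
import numpy as np
from gpflow.kullback_leiblers import gauss_kl

M, L = 5, 2
rng = np.random.RandomState(0)
q_mu = rng.randn(M, L)  # [M, L]
q_sqrt = np.stack([np.tril(rng.randn(M, M)) for _ in range(L)])  # [L, M, M]
A = rng.randn(M, M)
K = A @ A.T + 1e-6 * np.eye(M)  # positive-definite prior covariance, [M, M]
K_cholesky = np.linalg.cholesky(K)  # its lower-triangular Cholesky factor

np.testing.assert_allclose(
    gauss_kl(q_mu, q_sqrt, K=K).numpy(),
    gauss_kl(q_mu, q_sqrt, K_cholesky=K_cholesky).numpy(),
)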
def KL(self): """The KL divergence from variational distribution to the prior.""" if self.white: return kullback_leiblers.gauss_kl(self.q_mu, self.q_sqrt[None, :, :], None) else: K = self.kernel(self.inducing_points) K += default_jitter() * tf.eye(self.num_inducing, dtype=K.dtype) return kullback_leiblers.gauss_kl(self.q_mu, self.q_sqrt[None, :, :], K)
def test_kl_k_cholesky(session_tf, mu, sqrt, sqrt_diag, K, K_cholesky, diag):
    """
    Test that passing K or K_cholesky yields the same answer
    """
    kl_K = gauss_kl(mu, sqrt_diag if diag else sqrt, K=K)
    kl_K_chol = gauss_kl(mu, sqrt_diag if diag else sqrt, K_cholesky=K_cholesky)
    np.testing.assert_allclose(kl_K.eval(), kl_K_chol.eval())
def test_whitened(session_tf, diag, mu, sqrt_diag, I):
    """
    Check that K=Identity and K=None give the same answer
    """
    chol_from_diag = tf.stack([tf.diag(sqrt_diag[:, i]) for i in range(Datum.N)])  # N x M x M
    s = sqrt_diag if diag else chol_from_diag
    kl_white = gauss_kl(mu, s)
    kl_nonwhite = gauss_kl(mu, s, I)
    np.testing.assert_allclose(kl_white.eval(), kl_nonwhite.eval())
def KL(self): """ The KL divergence from the variational distribution to the prior. q ~ N(\mu, S) :return: KL divergence from q(u) = N(q_mu, q_s) to p(u) ~ N(0, Kuu), independently for each GP """ if self.white: return gauss_kl(self.q_mu, self.q_sqrt, K=None) else: return gauss_kl(self.q_mu, self.q_sqrt, self.MM_Ku_prior)
def test_diags(session_tf, white, mu, sqrt_diag, K):
    """
    The covariance of q(x) can be parameterized by full Cholesky factors or by
    diagonal matrices. Here we make sure the behaviours overlap.
    """
    # the chols are diagonal matrices, with the same entries as the diag representation.
    chol_from_diag = tf.stack([tf.diag(sqrt_diag[:, i]) for i in range(Datum.N)])  # N x M x M
    # run
    kl_diag = gauss_kl(mu, sqrt_diag, K if white else None)
    kl_dense = gauss_kl(mu, chol_from_diag, K if white else None)
    np.testing.assert_allclose(kl_diag.eval(), kl_dense.eval())
def test_unknown_size_inputs():
    """
    Test for #725 and #734. When the shape of the Gaussian's mean had at least
    one unknown parameter, `gauss_kl` would blow up. This happened because
    `tf.size` can only output types `tf.int32` or `tf.int64`.
    """
    mu = np.ones([1, 4], dtype=default_float())
    sqrt = np.ones([4, 1, 1], dtype=default_float())

    known_shape = gauss_kl(*map(tf.constant, [mu, sqrt]))
    unknown_shape = gauss_kl(mu, sqrt)

    np.testing.assert_allclose(known_shape, unknown_shape)
def test_whitened(diag):
    """
    Check that K=Identity and K=None give the same answer
    """
    chol_from_diag = tf.stack(
        [tf.linalg.diag(Datum.sqrt_diag[:, i]) for i in range(Datum.N)]  # [N, M, M]
    )
    s = Datum.sqrt_diag if diag else chol_from_diag
    kl_white = gauss_kl(Datum.mu, s)
    kl_nonwhite = gauss_kl(Datum.mu, s, Datum.I)
    np.testing.assert_allclose(kl_white, kl_nonwhite)
def test_diags(white):
    """
    The covariance of q(x) can be parameterized by full Cholesky factors or by
    diagonal matrices. Here we make sure the behaviours overlap.
    """
    # the chols are diagonal matrices, with the same entries as the diag representation.
    chol_from_diag = tf.stack(
        [tf.linalg.diag(Datum.sqrt_diag[:, i]) for i in range(Datum.N)]  # [N, M, M]
    )
    kl_diag = gauss_kl(Datum.mu, Datum.sqrt_diag, Datum.K if white else None)
    kl_dense = gauss_kl(Datum.mu, chol_from_diag, Datum.K if white else None)
    np.testing.assert_allclose(kl_diag, kl_dense)
def KL(self): """ The KL divergence from the variational distribution to the prior :return: KL divergence from N(q_mu, q_sqrt) to N(0, I), independently for each GP """ return gauss_kl(self.q_mu, self.q_sqrt)
def test_sumkl_equals_batchkl_shared_k_not_diag_mocked_tf21():
    """
    Version of test_sumkl_equals_batchkl with shared_k=True and diag=False that
    tests that the TensorFlow < 2.2 workaround with tiling still works.
    """
    kl_batch = gauss_kl(Datum.mu, Datum.sqrt, Datum.K)
    kl_sum = []
    for n in range(Datum.N):
        q_mu_n = Datum.mu[:, n][:, None]  # [M, 1]
        q_sqrt_n = Datum.sqrt[n, :, :][None, :, :]  # [1, M, M]
        K_n = Datum.K  # [M, M]
        kl_n = gauss_kl(q_mu_n, q_sqrt_n, K=K_n)
        kl_sum.append(kl_n)
    kl_sum = tf.reduce_sum(kl_sum)
    assert_almost_equal(kl_sum, kl_batch)
def build_prior_KL(self):
    if self.whiten:
        K = None
    else:
        K = Kuu(self.feature, self.kern, jitter=settings.numerics.jitter_level)  # (P x) x M x M
    return kullback_leiblers.gauss_kl(self.q_mu, self.q_sqrt, K)
def myKL2(self):
    X = np.array([[1., 2., 3.], [1., 2.1, 3.], [1.1, 2., 3.], [1., 2., 3.1]])
    Y = np.array([[1.], [2.], [.2], [3.]])
    Z = np.array([[1., 2., 3.], [1.3, 2.2, 3.1]])
    A = np.tril(np.random.rand(6, 6))  # "cholesky" of S_M
    B = np.random.rand(6, 1)  # mu_M
    all_kernels = [kernels.RBF(3), kernels.RBF(2, lengthscales=3., variance=2.)]
    all_Zs, all_mfs = init_linear(X, Z, all_kernels)
    mylayers = Fully_Coupled_Layers(X, Y, Z, all_kernels, all_mfs, all_Zs, mu_M=B, S_M=A)
    kl = mylayers.KL()
    session = get_default_session()
    kl = session.run(kl)
    Kmm1 = all_kernels[0].compute_K_symm(all_Zs[0]) + np.eye(Z.shape[0]) * settings.jitter
    Kmm2 = all_kernels[1].compute_K_symm(all_Zs[1]) + np.eye(all_Zs[1].shape[0]) * settings.jitter
    K_big = scipy.linalg.block_diag(Kmm1, Kmm1, Kmm2)
    tfKL = gauss_kl(tf.constant(B), tf.constant(A[np.newaxis]), K=tf.constant(K_big))
    sess = tf.Session()
    return kl, sess.run(tfKL)
def prior_kl(self) -> tf.Tensor:
    """
    The KL divergence from the variational distribution to the prior

    :return: KL divergence from N(w_mu, w_sqrt) to N(0, I)
    """
    return gauss_kl(
        self.w_mu[:, None],
        self.w_sqrt[None] if not self.is_mean_field else self.w_sqrt[:, None],
    )
def prior_kl(self) -> tf.Tensor:
    """
    Returns the KL divergence ``KL[q(u)∥p(u)]`` between the variational
    distribution ``q(u) = N(w_mu, w_sqrt²)`` and the prior ``p(u) = N(0, I)``.
    """
    return gauss_kl(
        self.w_mu[:, None],
        self.w_sqrt[None] if not self.is_mean_field else self.w_sqrt[:, None],
    )
def test_unknown_size_inputs(session_tf):
    """
    Test for #725 and #734. When the shape of the Gaussian's mean had at least
    one unknown parameter, `gauss_kl` would blow up. This happened because
    `tf.size` can only output types `tf.int32` or `tf.int64`.
    """
    mu_ph = tf.placeholder(settings.float_type, [None, None])
    sqrt_ph = tf.placeholder(settings.float_type, [None, None, None])
    mu = np.ones([1, 4], dtype=settings.float_type)
    sqrt = np.ones([4, 1, 1], dtype=settings.float_type)
    feed_dict = {mu_ph: mu, sqrt_ph: sqrt}

    known_shape_tf = gauss_kl(*map(tf.constant, [mu, sqrt]))
    unknown_shape_tf = gauss_kl(mu_ph, sqrt_ph)

    known_shape = session_tf.run(known_shape_tf)
    unknown_shape = session_tf.run(unknown_shape_tf, feed_dict=feed_dict)

    np.testing.assert_allclose(known_shape, unknown_shape)
def _build_likelihood(self):
    X = self.X
    Y = self.Y
    num_samples = tf.shape(X)[0]

    if self.whiten:
        f_mean, f_var = self._build_predict(X, full_cov=False, full_output_cov=False)
        KL = gauss_kl(self.q_mu, tf.matrix_band_part(self.q_sqrt, -1, 0))
    else:
        f_mean, f_var, Kzz = self._build_predict(X, full_cov=False, full_output_cov=False,
                                                 return_Kzz=True)
        KL = gauss_kl(self.q_mu, tf.matrix_band_part(self.q_sqrt, -1, 0), K=Kzz)

    # compute variational expectations
    var_exp = self.likelihood.variational_expectations(f_mean, f_var, Y)

    # scaling for batch size
    scale = tf.cast(self.num_data, settings.float_type) / tf.cast(num_samples, settings.float_type)
    return tf.reduce_sum(var_exp) * scale - KL
def build_prior_KL(self, K):
    if K:
        KL = 0.
        for i, k in enumerate(K):
            # non-whitened KL for each block of latent GPs against its own prior covariance
            KL += gauss_kl(self.q_mu[:, (i * self.offset):((i + 1) * self.offset)],
                           self.q_sqrt[(i * self.offset):((i + 1) * self.offset), :, :],
                           K=k)
        return KL
    else:
        return gauss_kl(self.q_mu, self.q_sqrt, K=K)
def test_sumkl_equals_batchkl(session_tf, shared_k, diag, mu, sqrt, sqrt_diag, K_batch, K):
    """
    gauss_kl implicitly performs a sum of KL divergences.
    This test checks that doing the sum outside of the function is equivalent.
    For q(X) = prod q(x_l) and p(X) = prod p(x_l), check that
        sum KL(q(x_l) || p(x_l)) = KL(q(X) || p(X))
    Here, q(X) has covariance L x M x M,
          p(X) has covariance L x M x M (or M x M),
    and   q(x_i) has covariance 1 x M x M,
          p(x_i) has covariance M x M.
    """
    s = sqrt_diag if diag else sqrt
    kl_batch = gauss_kl(mu, s, K if shared_k else K_batch)
    kl_sum = []
    for n in range(Datum.N):
        kl_sum.append(
            gauss_kl(mu[:, n][:, None],  # M x 1
                     sqrt_diag[:, n][:, None] if diag else sqrt[n, :, :][None, :, :],  # M x 1 or 1 x M x M
                     K if shared_k else K_batch[n, :, :][None, :, :]))  # M x M or 1 x M x M
    kl_sum = tf.reduce_sum(kl_sum)
    assert_almost_equal(kl_sum.eval(), kl_batch.eval())
def test_oned(session_tf, white, mu, sqrt, K_batch):
    """
    Check that the KL divergence matches a 1D by-hand calculation.
    """
    m = 0
    mu1d = mu[m, :][None, :]  # 1 x N
    s1d = sqrt[:, m, m][:, None, None]  # N x 1 x 1
    K1d = K_batch[:, m, m][:, None, None]  # N x 1 x 1

    kl = gauss_kl(mu1d, s1d, K1d if not white else None)
    kl_tf = tf_kl_1d(tf.reshape(mu1d, (-1,)),  # N
                     tf.reshape(s1d, (-1,)),  # N
                     None if white else tf.reshape(K1d, (-1,)))  # N

    np.testing.assert_allclose(kl.eval(), kl_tf.eval())
def logp(self, X, Y):
    r"""
    :param X: latent state (n_samples x T x E)
    :param Y: observations (n_samples x T x D)
    :return: variational lower bound on \log P(Y|X) (n_samples x T)
    """
    KL = kullback_leiblers.gauss_kl(self.Umu, self.Ucov_chol, None)  # ()
    fmean, fvar = self.conditional(
        X, add_observation_noise=False
    )  # (n_samples x T x D) and (n_samples x T x D)
    var_exp = tf.reduce_sum(
        self.likelihood.variational_expectations(fmean, fvar, Y), -1)  # (n_samples x T)
    return var_exp - KL / tf.cast(tf.shape(X)[1], gp.settings.float_type)
def build_prior_KL(self):  # whitening of priors can be implemented here
    """
    This gives the KL divergence between the inducing-point priors and the
    approximate posteriors:
        KL(q(u_g)||p(u_g)) + KL(q(u_f)||p(u_f))
    q(u_f) = N(u_f|u_fm, u_fs)    p(u_f) = N(u_f|0, Kfmm)
    q(u_g) = N(u_g|u_gm, u_gs)    p(u_g) = N(u_g|0, Kgmm)
    """
    if self.whiten:
        if self.q_diag:
            KL = kullback_leiblers.gauss_kl_white_diag(self.u_fm, self.u_fs_sqrt) + \
                 kullback_leiblers.gauss_kl_white_diag(self.u_gm, self.u_gs_sqrt)
        else:
            KL = kullback_leiblers.gauss_kl_white(self.u_fm, self.u_fs_sqrt) + \
                 kullback_leiblers.gauss_kl_white(self.u_gm, self.u_gs_sqrt)
    else:
        Kfmm = self.kernf.K(self.Zf) + tf.eye(self.num_inducing_f, dtype=float_type) * \
               settings.numerics.jitter_level
        Kgmm = self.kerng.K(self.Zg) + tf.eye(self.num_inducing_g, dtype=float_type) * \
               settings.numerics.jitter_level
        if self.q_diag:
            KL = kullback_leiblers.gauss_kl_diag(self.u_fm, self.u_fs_sqrt, Kfmm) + \
                 kullback_leiblers.gauss_kl_diag(self.u_gm, self.u_gs_sqrt, Kgmm)
        else:
            KL = kullback_leiblers.gauss_kl(self.u_fm, self.u_fs_sqrt, Kfmm) + \
                 kullback_leiblers.gauss_kl(self.u_gm, self.u_gs_sqrt, Kgmm)
    return KL
def build_prior_KL(self):
    if self.whiten:
        if self.q_diag:
            KL = kullback_leiblers.gauss_kl_white_diag(self.q_mu, self.q_sqrt)
        else:
            KL = kullback_leiblers.gauss_kl_white(self.q_mu, self.q_sqrt)
    else:
        K = self.kern.K(self.Z) + tf.eye(self.num_inducing, dtype=float_type) * \
            settings.numerics.jitter_level
        if self.q_diag:
            KL = kullback_leiblers.gauss_kl_diag(self.q_mu, self.q_sqrt, K)
        else:
            KL = kullback_leiblers.gauss_kl(self.q_mu, self.q_sqrt, K)
    return KL
def test_sumkl_equals_batchkl(shared_k, diag):
    """
    gauss_kl implicitly performs a sum of KL divergences.
    This test checks that doing the sum outside of the function is equivalent.
    For q(X) = prod q(x_l) and p(X) = prod p(x_l), check that
        sum KL(q(x_l) || p(x_l)) = KL(q(X) || p(X))
    Here, q(X) has covariance [L, M, M],
          p(X) has covariance [L, M, M] (or [M, M]),
    and   q(x_i) has covariance [1, M, M],
          p(x_i) has covariance [M, M].
    """
    s = Datum.sqrt_diag if diag else Datum.sqrt
    kl_batch = gauss_kl(Datum.mu, s, Datum.K if shared_k else Datum.K_batch)
    kl_sum = []
    for n in range(Datum.N):
        q_mu_n = Datum.mu[:, n][:, None]  # [M, 1]
        q_sqrt_n = (
            Datum.sqrt_diag[:, n][:, None] if diag else Datum.sqrt[n, :, :][None, :, :]
        )  # [M, 1] or [1, M, M]
        K_n = Datum.K if shared_k else Datum.K_batch[n, :, :][None, :, :]  # [M, M] or [1, M, M]
        kl_n = gauss_kl(q_mu_n, q_sqrt_n, K=K_n)
        kl_sum.append(kl_n)
    kl_sum = tf.reduce_sum(kl_sum)
    assert_almost_equal(kl_sum, kl_batch)
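# A worked note on why the equality tested above holds (a standalone NumPy/SciPy
# sketch; the helper and shapes are assumptions for illustration, not part of the
# test suite): both q(X) and p(X) factorise over the L independent dimensions, so
# the joint covariances are block-diagonal and
#   KL( prod_l q(x_l) || prod_l p(x_l) ) = sum_l KL( q(x_l) || p(x_l) ),
# because the log-ratio of the product densities is the sum of the log-ratios.
import numpy as np
from scipy.linalg import block_diag

def kl_full_gaussian(mu, S, K):
    """KL( N(mu, S) || N(0, K) ) for dense covariances."""
    D = mu.shape[0]
    trace = np.trace(np.linalg.solve(K, S))
    mahalanobis = mu @ np.linalg.solve(K, mu)
    logdet_ratio = np.linalg.slogdet(K)[1] - np.linalg.slogdet(S)[1]
    return 0.5 * (trace + mahalanobis - D + logdet_ratio)

rng = np.random.RandomState(0)
M, L = 3, 2
mus, Ss, Ks = [], [], []
for _ in range(L):
    mus.append(rng.randn(M))
    A, B = rng.randn(M, M), rng.randn(M, M)
    Ss.append(A @ A.T + np.eye(M))  # per-dimension posterior covariance
    Ks.append(B @ B.T + np.eye(M))  # per-dimension prior covariance

kl_sum = sum(kl_full_gaussian(m, S, K) for m, S, K in zip(mus, Ss, Ks))
kl_joint = kl_full_gaussian(np.concatenate(mus), block_diag(*Ss), block_diag(*Ks))
np.testing.assert_allclose(kl_sum, kl_joint)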
def test_oned(white, dim):
    """
    Check that the KL divergence matches a 1D by-hand calculation.
    """
    mu1d = Datum.mu[dim, :][None, :]  # [1, N]
    s1d = Datum.sqrt[:, dim, dim][:, None, None]  # [N, 1, 1]
    K1d = Datum.K_batch[:, dim, dim][:, None, None]  # [N, 1, 1]

    kl = gauss_kl(mu1d, s1d, K1d if not white else None)
    kl_1d = compute_kl_1d(
        tf.reshape(mu1d, (-1,)),  # N
        tf.reshape(s1d, (-1,)),  # N
        None if white else tf.reshape(K1d, (-1,)),  # N
    )
    np.testing.assert_allclose(kl, kl_1d)
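# A sketch of what the compute_kl_1d helper used above might look like (its real
# definition lives elsewhere in the test module; the body here is an assumption
# for illustration). It implements the scalar closed form
#   KL( N(mu, s^2) || N(0, k) ) = 0.5 * ( s^2/k + mu^2/k - 1 + ln k - ln s^2 )
# summed over the N independent entries, with k = 1 when the prior is whitened.
import tensorflow as tf

def compute_kl_1d_sketch(q_mu, q_sigma, p_var=None):
    p_var = tf.ones_like(q_sigma) if p_var is None else p_var
    q_var = tf.square(q_sigma)
    kl_per_point = 0.5 * (
        q_var / p_var + tf.square(q_mu) / p_var - 1.0 + tf.math.log(p_var) - tf.math.log(q_var)
    )
    return tf.reduce_sum(kl_per_point)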
def test_local_kl_gpflow_consistency(w_dim):
    num_data = 400
    means = np.random.randn(num_data, w_dim)
    encoder = DirectlyParameterizedNormalDiag(num_data, w_dim, means)

    lv = LatentVariableLayer(encoder=encoder, prior=_zero_one_normal_prior(w_dim))
    posteriors = lv._inference_posteriors(
        [np.random.randn(num_data, 3), np.random.randn(num_data, 2)]
    )

    q_mu = posteriors.parameters["loc"]
    q_sqrt = posteriors.parameters["scale_diag"]

    gpflow_local_kls = gauss_kl(q_mu, q_sqrt)
    tfp_local_kls = tf.reduce_sum(lv._local_kls(posteriors))

    np.testing.assert_allclose(tfp_local_kls, gpflow_local_kls, rtol=1e-10)
def build_prior_KL(self, K):
    return gauss_kl(self.q_mu, self.q_sqrt, K=K)
def prior_kl(self, Kuu):
    """
    KL divergence between p(u) = N(0, Kuu) and q(u) = N(μ, S)
    """
    return kullback_leiblers.gauss_kl(self.q_mu[:, None], self.q_sqrt[None, :, :], Kuu)
def KL(self):
    Ku = None if self.white else self.Ku
    return gauss_kl(self.Um, self.Us_sqrt, Ku)