def predict_f(self, Xnew: InputData, full_cov: bool = False,
              full_output_cov: bool = False) -> MeanAndVariance:
    """
    Compute the mean and variance of the latent function at some new points.
    Note that this is very similar to the SGPR prediction, for which there
    are notes in the SGPR notebook.

    Note: This model does not allow full output covariances.

    :param Xnew: points at which to predict
    """
    if full_output_cov:
        raise NotImplementedError
    # Variational distribution over the latent inputs X (diagonal Gaussian
    # built from the model's stored mean/variance parameters).
    pX = DiagonalGaussian(self.X_data_mean, self.X_data_var)
    Y_data = self.data
    num_inducing = self.inducing_variable.num_inducing
    # Kernel expectations under q(X): psi1 = E_q[Kfu] (shape [N, M]),
    # psi2 = sum_n E_q[Kuf Kfu] (summed over data points to [M, M]).
    psi1 = expectation(pX, (self.kernel, self.inducing_variable))
    psi2 = tf.reduce_sum(
        expectation(pX, (self.kernel, self.inducing_variable),
                    (self.kernel, self.inducing_variable)),
        axis=0,
    )
    jitter = default_jitter()
    Kus = covariances.Kuf(self.inducing_variable, self.kernel, Xnew)
    sigma2 = self.likelihood.variance
    sigma = tf.sqrt(sigma2)
    L = tf.linalg.cholesky(
        covariances.Kuu(self.inducing_variable, self.kernel, jitter=jitter))
    # Same linear algebra as SGPR, but with psi1/psi2 in place of Kuf/Kuf Kfu:
    # A = L^{-1} psi1^T / sigma, AAT = L^{-1} psi2 L^{-T} / sigma^2.
    A = tf.linalg.triangular_solve(L, tf.transpose(psi1), lower=True) / sigma
    tmp = tf.linalg.triangular_solve(L, psi2, lower=True)
    AAT = tf.linalg.triangular_solve(L, tf.transpose(tmp), lower=True) / sigma2
    B = AAT + tf.eye(num_inducing, dtype=default_float())
    LB = tf.linalg.cholesky(B)
    c = tf.linalg.triangular_solve(
        LB, tf.linalg.matmul(A, Y_data), lower=True) / sigma
    # Back-substitutions mapping the inducing-point posterior to Xnew.
    tmp1 = tf.linalg.triangular_solve(L, Kus, lower=True)
    tmp2 = tf.linalg.triangular_solve(LB, tmp1, lower=True)
    mean = tf.linalg.matmul(tmp2, c, transpose_a=True)
    if full_cov:
        var = (self.kernel(Xnew) +
               tf.linalg.matmul(tmp2, tmp2, transpose_a=True) -
               tf.linalg.matmul(tmp1, tmp1, transpose_a=True))
        # The covariance is shared across outputs; tile it per output column.
        shape = tf.stack([1, 1, tf.shape(Y_data)[1]])
        var = tf.tile(tf.expand_dims(var, 2), shape)
    else:
        var = (self.kernel(Xnew, full_cov=False) +
               tf.reduce_sum(tf.square(tmp2), axis=0) -
               tf.reduce_sum(tf.square(tmp1), axis=0))
        shape = tf.stack([1, tf.shape(Y_data)[1]])
        var = tf.tile(tf.expand_dims(var, 1), shape)
    return mean + self.mean_function(Xnew), var
def build_cholesky_if_needed(self):
    """Compute and cache Kuu, its Cholesky factor, and per-output tiled copies.

    The factorisation is intended to be done only once (see the guard flag);
    callers are expected to set ``self.needs_build_cholesky`` back to True
    when the kernel hyperparameters or inducing inputs change.
    """
    # make sure we only compute this once
    if self.needs_build_cholesky:
        self.Ku = covs.Kuu(self.feature, self.kern,
                           jitter=gpflow.default_jitter())
        self.Lu = tf.linalg.cholesky(self.Ku)
        # Tiled copies, one slice per output, for batched matmuls downstream.
        self.Ku_tiled = tf.tile(self.Ku[None, :, :], [self.num_outputs, 1, 1])
        self.Lu_tiled = tf.tile(self.Lu[None, :, :], [self.num_outputs, 1, 1])
        # Bug fix: clear the flag so the "compute this once" guard actually
        # takes effect — previously it was never reset, so every call redid
        # the full Cholesky factorisation.
        self.needs_build_cholesky = False
def __call__(self, Xnew, full_cov=False, full_output_cov=False): q_mu = self.q_mu # M x K x O q_sqrt = self.q_sqrt # K x O x M x M Kuu = covariances.Kuu(self.inducing_variables, self.kernel, jitter=default_jitter()) # K x M x M Kuf = covariances.Kuf(self.inducing_variables, self.kernel, Xnew) # K x M x N Knn = self.kernel.K(Xnew, full_output_cov=False)
def compute_qu(self, full_cov: bool = True) -> Tuple[tf.Tensor, tf.Tensor]:
    """
    Computes the mean and variance of q(u) = N(mu, cov), the variational
    distribution on inducing outputs. SVGP with this q(u) should predict
    identically to SGPR.

    The derivation is as follows, writing beta = sigma^{-2} and using the
    kernel expectations Psi1 = E_q(X)[Kfu], Psi2 = sum_n E_q(X)[Kuf Kfu]
    under the encoded input distribution q(X):

        cov = Kuu (Kuu + beta * Psi2)^{-1} Kuu
        mu  = beta * Kuu (Kuu + beta * Psi2)^{-1} Kuf y

    :return: mu, cov
    """
    Y_data = self.data
    # Amortised variational posterior over the latent inputs.
    X_data_mean, X_data_var = self.encoder(Y_data)
    pX = DiagonalGaussian(X_data_mean, X_data_var)
    # num_inducing = self.inducing_variable.num_inducing
    # E_qx[Kfu]
    psi1 = expectation(pX, (self.kernel, self.inducing_variable))
    # E_qx[Kuf @ Kfu]
    psi2 = tf.reduce_sum(
        expectation(pX, (self.kernel, self.inducing_variable),
                    (self.kernel, self.inducing_variable)),
        axis=0)
    kuu = covariances.Kuu(self.inducing_variable, self.kernel,
                          jitter=default_jitter())
    kuf = tf.transpose(psi1)
    # sig = Kuu + beta * Psi2 — the matrix whose inverse appears in mu, cov.
    sig = kuu + psi2 * (self.likelihood.variance**-1)
    sig_sqrt = tf.linalg.cholesky(sig)
    # sig_sqrt_kuu = Lsig^{-1} Kuu, so sig_sqrt_kuu^T sig_sqrt_kuu
    # = Kuu sig^{-1} Kuu.
    sig_sqrt_kuu = tf.linalg.triangular_solve(sig_sqrt, kuu)
    # [M,M] -> [M(M +1)//2] =/= [M,D]
    cov = tf.linalg.matmul(sig_sqrt_kuu, sig_sqrt_kuu, transpose_a=True)
    err = Y_data - self.mean_function(X_data_mean)
    mu = (tf.linalg.matmul(sig_sqrt_kuu,
                           tf.linalg.triangular_solve(
                               sig_sqrt, tf.linalg.matmul(kuf, err)),
                           transpose_a=True) / self.likelihood.variance)
    if not full_cov:
        return mu, cov
    else:
        # Same covariance shared by every output dimension, tiled [D, M, M].
        return mu, tf.tile(cov[None, :, :], [mu.shape[-1], 1, 1])
def _test_cg_svgp(config: ConfigDense, model: SVGP, Xnew: tf.Tensor) -> tf.Tensor: """ Sample generation subroutine common to each unit test """ # Prepare preconditioner for CG Z = model.inducing_variable Kff = covariances.Kuu(Z, model.kernel, jitter=0) max_rank = config.num_cond//(2 if config.num_cond > 1 else 1) preconditioner = get_default_preconditioner(Kff, diag=default_jitter(), max_rank=max_rank) count = 0 samples = [] L_joint = None while count < config.num_samples: # Sample $u ~ N(q_mu, q_sqrt q_sqrt^{T})$ size = min(config.shard_size, config.num_samples - count) shape = model.num_latent_gps, config.num_cond, size rvs = tf.random.normal(shape=shape, dtype=floatx()) u = tf.transpose(model.q_sqrt @ rvs) # Generate draws from the joint distribution $p(f(X), g(Z))$ (f, fnew), L_joint = common.sample_joint(model.kernel, Z, Xnew, num_samples=size, L=L_joint) # Solve for update functions update_fns = cg_update(model.kernel, Z, u, f, tol=1e-6, max_iter=config.num_cond, preconditioner=preconditioner) samples.append(fnew + update_fns(Xnew)) count += size samples = tf.concat(samples, axis=0) if model.mean_function is not None: samples += model.mean_function(Xnew) return samples
def _precompute(self):
    """Precompute the posterior helper quantities.

    Returns ``alpha = Kuu^{-1} q_mu`` and
    ``Qinv = Kuu^{-1} - Kuu^{-1} S Kuu^{-1}`` where ``S = q_sqrt q_sqrt^T``
    is the variational covariance of q(u).

    :return: (alpha, Qinv)
    :raises NotImplementedError: if the whitened parameterisation is used.
    """
    # The original checked self.whiten twice with identical (raising)
    # branches; a single guard clause is equivalent and clearer.
    if self.whiten:
        raise NotImplementedError

    Kuu = cov.Kuu(self.inducing_variable, self.kernel)  # this is now a LinearOperator
    q_mu = self._q_dist.q_mu
    q_sqrt = self._q_dist.q_sqrt

    # alpha = Kuu⁻¹ q_mu
    alpha = Kuu.solve(q_mu)  # type: tf.Tensor

    # Qinv = Kuu⁻¹ - Kuu⁻¹ S Kuu⁻¹, with
    # Kuu⁻¹ S Kuu⁻¹ = (Kuu⁻¹ q_sqrt)(Kuu⁻¹ q_sqrt)ᵀ
    KuuInv_qsqrt = Kuu.solve(q_sqrt)
    KuuInv_covu_KuuInv = tf.matmul(KuuInv_qsqrt, KuuInv_qsqrt,
                                   transpose_b=True)
    Qinv = Kuu.inverse().to_dense() - KuuInv_covu_KuuInv
    return alpha, Qinv
def custom_predict_f(self, Xnew: InputData, full_cov: bool = False,
                     full_output_cov: bool = False) -> MeanAndVariance:
    """
    Compute the mean and variance of the latent function at some new points.
    Note that this is very similar to the SGPR prediction, for which there
    are notes in the SGPR notebook.

    Note: This model does not allow full output covariances. Also note that
    the returned variance is the q(u) covariance from ``compute_qu`` — the
    ``full_cov`` flag is not used in this "custom" variant.

    :param Xnew: points at which to predict
    """
    if full_output_cov:
        raise NotImplementedError
    # Dead code removed: the original also ran self.encoder(self.data) and
    # built a DiagonalGaussian that was never used (compute_qu performs its
    # own encoding internally); assuming the encoder call is side-effect
    # free, dropping it is behavior-preserving.
    mu, cov = self.compute_qu()
    Kus = covariances.Kuf(self.inducing_variable, self.kernel, Xnew)
    L = tf.linalg.cholesky(
        covariances.Kuu(self.inducing_variable, self.kernel,
                        jitter=default_jitter()))
    var = cov
    tmp1 = tf.linalg.triangular_solve(L, Kus, lower=True)  # L^{-1} K_{us}
    tmp2 = tf.linalg.triangular_solve(L, mu, lower=True)  # L^{-1} m
    # K_{su} L^{-T} L^{-1} m = K_{su} K_{uu}^{-1} m
    mean = tf.linalg.matmul(tmp1, tmp2, transpose_a=True)
    return mean + self.mean_function(Xnew), var
def uncertain_conditional_diag(
    Xnew_mu: tf.Tensor,
    Xnew_var: tf.Tensor,
    inducing_variable: InducingVariables,
    kernel: Kernel,
    q_mu,
    q_sqrt,
    *,
    mean_function=None,
    full_output_cov=False,
    full_cov=False,
    white=False,
):
    """
    Calculates the conditional for uncertain inputs Xnew,
    p(Xnew) = N(Xnew_mu, Xnew_var).
    See ``conditional`` documentation for further reference.

    :param Xnew_mu: mean of the inputs, size [N, D]
    :param Xnew_var: covariance matrix of the inputs, size [N, n, n]
    :param inducing_variable: gpflow.InducingVariable object, only
        InducingPoints is supported
    :param kernel: gpflow kernel object.
    :param q_mu: mean inducing points, size [M, Dout]
    :param q_sqrt: cholesky of the covariance matrix of the inducing points,
        size [t, M, M]
    :param full_output_cov: boolean whether to compute covariance between
        output dimensions. Influences the shape of return value ``fvar``.
        Default is False
    :param white: boolean whether to use whitened representation.
        Default is False.
    :return fmean, fvar: mean and covariance of the conditional, size
        ``fmean`` is [N, Dout], size ``fvar`` depends on ``full_output_cov``:
        if True ``f_var`` is [N, t, t], if False then ``f_var`` is [N, Dout]
    """
    if not isinstance(inducing_variable, InducingPoints):
        raise NotImplementedError
    if full_cov:
        raise NotImplementedError(
            "uncertain_conditional() currently does not support full_cov=True")

    # NOTE(review): despite the [N, n, n] note above, DiagonalGaussian takes
    # per-dimension variances — presumably Xnew_var is [N, D]; confirm.
    pXnew = DiagonalGaussian(Xnew_mu, Xnew_var)

    num_data = tf.shape(Xnew_mu)[0]  # number of new inputs (N)
    num_ind, num_func = tf.unstack(
        tf.shape(q_mu), num=2,
        axis=0)  # number of inducing points (M), output dimension (D)
    q_sqrt_r = tf.linalg.band_part(
        q_sqrt, -1, 0)  # [D, M, M] #taking the lower triangular part

    eKuf = tf.transpose(expectation(
        pXnew, (kernel, inducing_variable)))  # [M, N] (psi1)
    Kuu = covariances.Kuu(inducing_variable, kernel,
                          jitter=default_jitter())  # [M, M]
    Luu = tf.linalg.cholesky(Kuu)  # [M, M]

    if not white:
        # Convert the unwhitened q(u) into the whitened frame:
        # q_mu <- Luu^{-1} q_mu, q_sqrt_r <- Luu^{-1} q_sqrt_r.
        q_mu = tf.linalg.triangular_solve(Luu, q_mu, lower=True)
        Luu_tiled = tf.tile(
            Luu[None, :, :],
            [num_func, 1, 1])  # remove line once issue 216 is fixed
        q_sqrt_r = tf.linalg.triangular_solve(Luu_tiled, q_sqrt_r, lower=True)

    Li_eKuf = tf.linalg.triangular_solve(Luu, eKuf, lower=True)  # [M, N]
    fmean = tf.linalg.matmul(Li_eKuf, q_mu, transpose_a=True)

    eKff = expectation(pXnew, kernel)  # N (psi0)
    eKuffu = expectation(pXnew, (kernel, inducing_variable),
                         (kernel, inducing_variable))  # [N, M, M] (psi2)
    Luu_tiled = tf.tile(
        Luu[None, :, :],
        [num_data, 1, 1])  # remove this line, once issue 216 is fixed
    Li_eKuffu = tf.linalg.triangular_solve(Luu_tiled, eKuffu, lower=True)
    Li_eKuffu_Lit = tf.linalg.triangular_solve(Luu_tiled,
                                               tf.linalg.adjoint(Li_eKuffu),
                                               lower=True)  # [N, M, M]
    cov = tf.linalg.matmul(q_sqrt_r, q_sqrt_r, transpose_b=True)  # [D, M, M]

    if mean_function is None or isinstance(mean_function, mean_functions.Zero):
        e_related_to_mean = tf.zeros((num_data, num_func, num_func),
                                     dtype=default_float())
    else:
        # Update mean: \mu(x) + m(x)
        fmean = fmean + expectation(pXnew, mean_function)
        # Calculate: m(x) m(x)^T + m(x) \mu(x)^T + \mu(x) m(x)^T,
        # where m(x) is the mean_function and \mu(x) is fmean
        e_mean_mean = expectation(pXnew, mean_function,
                                  mean_function)  # [N, D, D]
        Lit_q_mu = tf.linalg.triangular_solve(Luu, q_mu, adjoint=True)
        e_mean_Kuf = expectation(pXnew, mean_function,
                                 (kernel, inducing_variable))  # [N, D, M]
        # einsum isn't able to infer the rank of e_mean_Kuf, hence we
        # explicitly set the rank of the tensor:
        e_mean_Kuf = tf.reshape(e_mean_Kuf, [num_data, num_func, num_ind])
        e_fmean_mean = tf.einsum("nqm,mz->nqz", e_mean_Kuf,
                                 Lit_q_mu)  # [N, D, D]
        e_related_to_mean = e_fmean_mean + tf.linalg.adjoint(
            e_fmean_mean) + e_mean_mean

    if full_output_cov:
        fvar = (
            tf.linalg.diag(
                tf.tile((eKff - tf.linalg.trace(Li_eKuffu_Lit))[:, None],
                        [1, num_func])) +
            tf.linalg.diag(tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov)) +
            # tf.linalg.diag(tf.linalg.trace(tf.linalg.matmul(Li_eKuffu_Lit, cov))) +
            tf.einsum("ig,nij,jh->ngh", q_mu, Li_eKuffu_Lit, q_mu) -
            # tf.linalg.matmul(q_mu, tf.linalg.matmul(Li_eKuffu_Lit, q_mu), transpose_a=True) -
            fmean[:, :, None] * fmean[:, None, :] + e_related_to_mean)
    else:
        fvar = (
            (eKff - tf.linalg.trace(Li_eKuffu_Lit))[:, None] +
            tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov)
            # tf.linalg.diag(tf.linalg.trace(tf.linalg.matmul(Li_eKuffu_Lit, cov))) +
            + tf.einsum("ig,nij,jg->ng", q_mu, Li_eKuffu_Lit, q_mu)
            # tf.linalg.matmul(q_mu, tf.linalg.matmul(Li_eKuffu_Lit, q_mu), transpose_a=True) -
            - fmean**2 + tf.linalg.diag_part(e_related_to_mean))

    return fmean, fvar
def conditional_vff(Xnew, inducing_variable, kernel, f, *,
                    full_cov=False, full_output_cov=False,
                    q_sqrt=None, white=False):
    """
    - Xnew are the points of the data or minibatch, size N x D (tf.array, 2d)
    - inducing_variable is an instance of features.InducingFeature that
      provides `Kuu` and `Kuf` methods for Fourier features; this contains
      the limits of the bounding box and the frequencies
    - f is the value (or mean value) of the features (i.e. the weights)
    - q_sqrt (default None) is the Cholesky factor of the uncertainty about f
      (to be propagated through the conditional as per the GPflow
      inducing-point implementation)
    - white (defaults False) specifies whether the whitening has been applied

    Given the GP represented by the inducing points specified in
    `inducing_variable`, produce the mean and (co-)variance of the GP at the
    points Xnew.

       Xnew :: N x D
       Kuu :: M x M
       Kuf :: M x N
       f :: M x K, K = 1
       q_sqrt :: K x M x M, with K = 1
    """
    if full_output_cov:
        raise NotImplementedError

    # num_data = tf.shape(Xnew)[0]  # M
    num_func = tf.shape(f)[1]  # K

    Kuu = cov.Kuu(inducing_variable, kernel)  # this is now a LinearOperator
    Kuf = cov.Kuf(inducing_variable, kernel, Xnew)  # still a Tensor
    KuuInv_Kuf = Kuu.solve(Kuf)

    # compute the covariance due to the conditioning
    if full_cov:
        fvar = kernel(Xnew) - tf.matmul(Kuf, KuuInv_Kuf, transpose_a=True)
        shape = (num_func, 1, 1)
    else:
        KufT_KuuInv_Kuf_diag = tf.reduce_sum(Kuf * KuuInv_Kuf, axis=-2)
        # Bug fix: GPflow 2 kernels take `full_cov`, not `full` — the sibling
        # _conditional_fused implementation already uses full_cov=False.
        fvar = kernel(Xnew, full_cov=False) - KufT_KuuInv_Kuf_diag
        shape = (num_func, 1)
    fvar = tf.expand_dims(fvar, 0) * tf.ones(
        shape, dtype=gpflow.default_float())  # K x N x N or K x N

    # another backsubstitution in the unwhitened case
    if white:
        raise NotImplementedError
    A = KuuInv_Kuf

    # construct the conditional mean
    fmean = tf.matmul(A, f, transpose_a=True)
    if q_sqrt is not None:
        if q_sqrt.get_shape().ndims == 2:
            # LTA = A * tf.expand_dims(q_sqrt, 2)  # K x M x N # won't work # make ticket for this?
            raise NotImplementedError
        elif q_sqrt.get_shape().ndims == 3:
            # L = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1, 0)  # K x M x M
            # A_tiled = tf.expand_dims(A.get(), 0) * tf.ones((num_func, 1, 1), dtype=float_type)
            # LTA = tf.matmul(L, A_tiled, transpose_a=True)  # K x M x N
            # TODO the following won't work for K > 1
            assert q_sqrt.shape[0] == 1
            # LTA = (A.T @ DenseMatrix(q_sqrt[:,:,0])).T.get()[None, :, :]
            ATL = tf.matmul(A, q_sqrt, transpose_a=True)
        else:
            raise ValueError("Bad dimension for q_sqrt: %s" %
                             str(q_sqrt.get_shape().ndims))
        if full_cov:
            # fvar = fvar + tf.matmul(LTA, LTA, transpose_a=True)  # K x N x N
            fvar = fvar + tf.matmul(ATL, ATL, transpose_b=True)  # K x N x N
        else:
            # fvar = fvar + tf.reduce_sum(tf.square(LTA), 1)  # K x N
            fvar = fvar + tf.reduce_sum(tf.square(ATL), 2)  # K x N
    fvar = tf.transpose(fvar)  # N x K or N x N x K

    return fmean, fvar
def prior_kl_vff(inducing_variable, kernel, q_mu, q_sqrt, whiten=False):
    """KL divergence from the VFF prior to q(u) = N(q_mu, q_sqrt q_sqrt^T).

    Only the unwhitened parameterisation is supported.
    """
    if whiten:
        raise NotImplementedError
    prior_cov = cov.Kuu(inducing_variable, kernel)
    return gauss_kl_vff(q_mu, q_sqrt, prior_cov)
def _conditional_fused(self, Xnew, full_cov, full_output_cov):
    """
    Xnew is a tensor with the points of the data or minibatch, shape N x D
    """
    if full_output_cov:
        raise NotImplementedError
    f = self._q_dist.q_mu
    q_sqrt = self._q_dist.q_sqrt

    # num_data = tf.shape(Xnew)[0]  # M
    num_func = tf.shape(f)[1]  # K

    Kuu = cov.Kuu(self.X_data, self.kernel)  # this is now a LinearOperator
    Kuf = cov.Kuf(self.X_data, self.kernel, Xnew)  # still a Tensor
    KuuInv_Kuf = Kuu.solve(Kuf)

    # compute the covariance due to the conditioning
    if full_cov:
        fvar = self.kernel(Xnew) - tf.matmul(
            Kuf, KuuInv_Kuf, transpose_a=True)
        shape = (num_func, 1, 1)
    else:
        # Diagonal of Kfu Kuu^{-1} Kuf without forming the full matrix.
        KufT_KuuInv_Kuf_diag = tf.reduce_sum(Kuf * KuuInv_Kuf, axis=-2)
        fvar = self.kernel(Xnew, full_cov=False) - KufT_KuuInv_Kuf_diag
        shape = (num_func, 1)
    # Broadcast the shared (co)variance across the K output functions.
    fvar = tf.expand_dims(fvar, 0) * tf.ones(
        shape, dtype=gpflow.default_float())  # K x N x N or K x N

    # Only the unwhitened representation is implemented here.
    if self.whiten:
        raise NotImplementedError
    A = KuuInv_Kuf

    # construct the conditional mean
    fmean = tf.matmul(A, f, transpose_a=True)
    if q_sqrt is not None:
        if q_sqrt.get_shape().ndims == 2:
            # LTA = A * tf.expand_dims(q_sqrt, 2)  # K x M x N # won't work # make ticket for this?
            raise NotImplementedError
        elif q_sqrt.get_shape().ndims == 3:
            # L = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1, 0)  # K x M x M
            # A_tiled = tf.expand_dims(A.get(), 0) * tf.ones((num_func, 1, 1), dtype=float_type)
            # LTA = tf.matmul(L, A_tiled, transpose_a=True)  # K x M x N
            # TODO the following won't work for K > 1
            assert q_sqrt.shape[0] == 1
            # LTA = (A.T @ DenseMatrix(q_sqrt[:,:,0])).T.get()[None, :, :]
            ATL = tf.matmul(A, q_sqrt, transpose_a=True)
        else:
            raise ValueError("Bad dimension for q_sqrt: %s" %
                             str(q_sqrt.get_shape().ndims))
        if full_cov:
            # fvar = fvar + tf.matmul(LTA, LTA, transpose_a=True)  # K x N x N
            fvar = fvar + tf.matmul(ATL, ATL, transpose_b=True)  # K x N x N
        else:
            # fvar = fvar + tf.reduce_sum(tf.square(LTA), 1)  # K x N
            fvar = fvar + tf.reduce_sum(tf.square(ATL), 2)  # K x N
    fvar = tf.transpose(fvar)  # N x K or N x N x K

    return fmean, fvar
def elbo(self) -> tf.Tensor:
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood.

    This variant keeps the shared-space (s) and per-cluster (k) terms
    separate and adds the s/k cross-covariance contributions explicitly.
    """

    # defining a sets of vectorized function for usage in `tf.vectorized_map`

    # take the outer product of a pair of rows
    @tf.function
    def row_outer_product(args):
        a, b = args
        a = tf.expand_dims(a, -1)
        b = tf.expand_dims(b, -1)
        return a @ tf.transpose(b)

    # repeat matrix A N times on a newly created first axis
    # so the new shape is [N, A.shape]
    @tf.function
    def repeat_N(A):
        return tf.repeat(tf.expand_dims(A, 0), self.N, axis=0)

    @tf.function
    def triang_solve(args):
        L, rhs = args
        return tf.linalg.triangular_solve(L, rhs)

    @tf.function
    def triang_solve_transpose(args):
        L, rhs = args
        return tf.linalg.triangular_solve(tf.transpose(L), rhs, lower=False)

    @tf.function
    def matmul_vectorized(args):
        A, B = args
        return tf.matmul(A, B)

    # [N, D, M, M] --> [N]
    # each term is sum_{d=1}^D Tr[M, M]
    # arg: [D, M, M], needs to be squared
    @tf.function
    def sum_d_trace(arg):
        trace_D = tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)),
                                    arg)
        return tf.reduce_sum(trace_D)

    # trace of a matrix
    @tf.function
    def trace_tf(A):
        return tf.reduce_sum(tf.linalg.diag_part(A))

    Y = self.data
    # Shared-space psi statistics under q(Xs) and Kuu^s Cholesky.
    qXs = DiagonalGaussian(self.Xs_mean, self.Xs_var)
    psi0s = expectation(qXs, self.kernel_s)
    psi1s = expectation(qXs, (self.kernel_s, self.Zs))
    psi2s = expectation(qXs, (self.kernel_s, self.Zs),
                        (self.kernel_s, self.Zs))
    cov_uu_s = covariances.Kuu(self.Zs, self.kernel_s,
                               jitter=default_jitter())
    Ls = tf.linalg.cholesky(cov_uu_s)
    Ls = repeat_N(Ls)  # [N x M x M]

    # loop over k, for each k use kernel_K[k] and qXp, compute psi0k, psi1k,
    # psi2k, then store the psi statistics for all k together
    # for each k: psi0[:, k] = psi0k, psi1[:, :, k] = psi1k, psi2[:, :, :, k] = psi2k
    # psi0 is [N, K] so psi0[n, k] gives a real value
    # psi1 is [N, M, K], so psi1[n, :, k] gives us a M-vector
    # psi2 is [N, M, M, K], so psi2[n, :, :, k] gives us a [M x M] matrix
    qXp = DiagonalGaussian(self.Xp_mean, self.Xp_var)
    psi0k = []
    psi1k = []
    psi2k = []
    psi2ks = []
    psi2sk = []
    for k, kernel_k in enumerate(self.kernel_K):
        psi0 = expectation(qXp, kernel_k)
        psi1 = expectation(qXp, (kernel_k, self.Zp))
        psi2 = expectation(qXp, (kernel_k, self.Zp), (kernel_k, self.Zp))
        psi0k.append(psi0)
        psi1k.append(psi1)
        psi2k.append(psi2)
        # add the cross-covariance terms, require computation separately for each n
        psi2sk.append(tf.vectorized_map(row_outer_product, (psi1s, psi1)))
        #psi2ks.append(tf.vectorized_map(row_outer_product, (psi1, psi1s)))
    psi0k = tf.stack(psi0k, axis=-1)
    psi1k = tf.stack(psi1k, axis=-1)
    psi2k = tf.stack(psi2k, axis=-1)
    psi2sk = tf.stack(psi2sk, axis=-1)
    #psi2ks = tf.stack(psi2ks, axis=-1)

    # make K cov_uu_k using Zp and kernel_k
    # K cholesky, repeat N times for later use
    # L is [N x M x M x K]
    # these are the Kuu matrices
    Lk = []
    for k, kernel_k in enumerate(self.kernel_K):
        cov_uu_k = covariances.Kuu(self.Zp, kernel_k,
                                   jitter=default_jitter())
        Lk.append(tf.linalg.cholesky(cov_uu_k))
    Lk = tf.stack(Lk, axis=-1)
    Lk = repeat_N(Lk)

    sigma2 = self.likelihood.variance
    jitter_mtx = 1e-10 * tf.eye(self.M, dtype=default_float())

    # Shared-space intermediate matrices (names mirror the per-k loop below).
    tmp = tf.vectorized_map(triang_solve, (Ls, psi2s))
    As = tf.vectorized_map(triang_solve_transpose,
                           (Ls, tmp))  # \inv{Kuu^s} * Psi2s: [N, M, M]
    LBs = tf.vectorized_map(lambda x: tf.linalg.cholesky(x + jitter_mtx),
                            psi2s)  # [N, M, M]
    tmp1 = tf.vectorized_map(triang_solve, (Ls, LBs))  # [N, M, M]
    Cs = tf.vectorized_map(
        triang_solve_transpose,
        (Ls, tmp1))  # sqrt(\inv{Kuu^s} * Psi2s * \inv{Kuu^s}): [N, M, M]
    Ds = tf.vectorized_map(
        matmul_vectorized, (repeat_N(tf.transpose(self.q_mu_s)), Cs)
    )  # sqrt(Ms^T * \inv{Kuu^s} * Psi2s * \inv{Kuu^s} * Ms): [N, D, M]
    Fs = tf.vectorized_map(
        matmul_vectorized,
        (repeat_N(tf.transpose(self.q_sqrt_s, perm=[0, 2, 1])), Cs)
    )  # sqrt(Ss * \inv{Kuu^s} * Psi2s * \inv{Kuu^s}): [N, D, M, M]
    tmp2 = tf.vectorized_map(triang_solve, (Ls, repeat_N(self.q_mu_s)))
    Es = tf.vectorized_map(triang_solve_transpose,
                           (Ls, tmp2))  # \inv{Kuu^s} * Ms: [N, M, D]
    tmp3 = tf.vectorized_map(row_outer_product,
                             (Y, psi1s))  # Y^T * Psi1: [N, D, M]
    Gs = tf.vectorized_map(
        matmul_vectorized,
        (tmp3, Es))  # Y^T * Psi1s * \inv{Kuu^s} * Ms: [N, D, D]

    Fq = []
    Yn2 = tf.reduce_sum(tf.square(Y), axis=1)
    for k in range(self.K):
        tmp = tf.vectorized_map(triang_solve,
                                (Lk[..., k], psi2k[..., k]))  # [N, M, M]
        Ak = tf.vectorized_map(
            triang_solve_transpose,
            (Lk[..., k], tmp))  # \inv{Kuu^k} * Psi2k: [N, M, M]
        LBk = tf.vectorized_map(lambda x: tf.linalg.cholesky(x + jitter_mtx),
                                psi2k[..., k])  # [N, M, M]
        tmp1k = tf.vectorized_map(triang_solve,
                                  (Lk[..., k], LBk))  # [N, M, M]
        Ck = tf.vectorized_map(
            triang_solve_transpose, (Lk[..., k], tmp1k)
        )  # sqrt(\inv{Kuu^k} * Psi2k * \inv{Kuu^k}): [N, M, M]
        Dk = tf.vectorized_map(
            matmul_vectorized, (repeat_N(tf.transpose(self.q_mu[k])), Ck)
        )  # sqrt(Mk^T * \inv{Kuu^k} * Psi2k * \inv{Kuu^k} * Mk): [N, D, M]
        # q_sqrt is already the cholesky
        Fk = tf.vectorized_map(
            matmul_vectorized,
            (repeat_N(tf.transpose(self.q_sqrt[k], perm=[0, 2, 1])), Ck)
        )  # sqrt(Sk * \inv{Kuu^k} * Psi2k * \inv{Kuu^k}): [N, D, M, M]
        tmp2 = tf.vectorized_map(triang_solve,
                                 (Lk[..., k], repeat_N(self.q_mu[k])))
        Ek = tf.vectorized_map(triang_solve_transpose,
                               (Lk[..., k], tmp2))  # \inv{Kuu^k} * Mk: [N, M, D]
        tmp3 = tf.vectorized_map(row_outer_product,
                                 (Y, psi1k[..., k]))  # Y^T * Psi1k: [N, D, M]
        Gk = tf.vectorized_map(
            matmul_vectorized,
            (tmp3, Ek))  # Y^T * Psi1k * \inv{Kuu^k} * Mk: [N, D, D]

        # compute the cross terms
        tmp1sk = tf.vectorized_map(triang_solve, (Ls, psi2sk[..., k]))
        tmp2sk = tf.vectorized_map(
            triang_solve_transpose,
            (Ls, tmp1sk))  # \inv{Kuu^s} * Psi2sk: [N, M, M]
        tmp3sk = tf.vectorized_map(
            matmul_vectorized,
            (tmp2sk, Ek))  # \inv{Kuu^s} * Psi2sk * \inv{Kuu^k} * Mk: [N, M, D]
        Dsk = tf.vectorized_map(
            matmul_vectorized, (repeat_N(tf.transpose(self.q_mu_s)), tmp3sk)
        )  # Ms^T * \inv{Kuu^s} * Psi2sk * \inv{Kuu^k} * Mk: [N, D, D]

        # compute the lower bound
        # each term added here is length-N vector, each entry representing
        # \sum_{d=1}^D Fdnk for a particular n, k
        Fnk = -0.5 * Yn2 / sigma2
        Fnk += tf.vectorized_map(trace_tf, Gs + Gk) / sigma2
        Fnk += -0.5 * tf.vectorized_map(
            lambda x: tf.reduce_sum(tf.square(x)), Ds) / sigma2
        Fnk += -0.5 * tf.vectorized_map(
            lambda x: tf.reduce_sum(tf.square(x)), Dk) / sigma2
        # the sum of trace of the 2 cross terms is 2 times the trace of one
        # since they are transpose of one another
        Fnk += - tf.vectorized_map(trace_tf, Dsk) / sigma2
        Fnk += 0.5 * self.D * tf.vectorized_map(trace_tf, As + Ak) / sigma2
        Fnk += -0.5 * tf.vectorized_map(sum_d_trace, Fs) / sigma2
        Fnk += -0.5 * tf.vectorized_map(sum_d_trace, Fk) / sigma2
        Fq.append(Fnk)
    Fq = tf.stack(Fq, axis=-1)  # [N, K]
    # psi0 is already [N, K]
    Fq += -0.5 * self.D * (tf.repeat(tf.expand_dims(psi0s, -1), self.K,
                                     axis=1) + psi0k) / sigma2
    Fq += -0.5 * self.D * tf.math.log(2 * np.pi * sigma2)

    # weight each entry by the mixture responsibility, then sum over N, K
    bound = tf.reduce_sum(Fq * self.pi)

    # compute KL
    KL_p = self.kl_mvn(self.Xp_mean, self.Xp_var, self.Xp_prior_mean,
                       self.Xp_prior_var)
    KL_c = self.kl_categorical(self.pi, self.pi_prior)
    KL_s = self.kl_mvn(self.Xs_mean, self.Xs_var, self.Xs_prior_mean,
                       self.Xs_prior_var)
    prior_Kuu_s = covariances.Kuu(self.Zs, self.kernel_s,
                                  jitter=default_jitter())
    KL_us = kullback_leiblers.gauss_kl(q_mu=self.q_mu_s,
                                       q_sqrt=self.q_sqrt_s,
                                       K=prior_Kuu_s)
    KL_uk = 0
    for k in range(self.K):
        prior_Kuu_k = covariances.Kuu(self.Zp, self.kernel_K[k],
                                      jitter=default_jitter())
        KL_uk += kullback_leiblers.gauss_kl(q_mu=self.q_mu[k],
                                            q_sqrt=self.q_sqrt[k],
                                            K=prior_Kuu_k)
    bound += - KL_s - KL_p - KL_us - KL_uk - KL_c
    return bound
def elbo(self) -> tf.Tensor:
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood.

    This variant optionally merges a shared space into the per-cluster psi
    statistics and Kuu matrices (``self.split_space``) rather than keeping
    the two contributions separate.
    """

    # defining a sets of vectorized function for usage in `tf.vectorized_map`

    # take the outer product of a pair of rows
    @tf.function
    def row_outer_product(args):
        a, b = args
        a = tf.expand_dims(a, -1)
        b = tf.expand_dims(b, -1)
        return a @ tf.transpose(b)

    # repeat matrix A N times on a newly created first axis
    # so the new shape is [N, A.shape]
    @tf.function
    def repeat_N(A):
        return tf.repeat(tf.expand_dims(A, 0), self.N, axis=0)

    @tf.function
    def triang_solve(args):
        L, rhs = args
        return tf.linalg.triangular_solve(L, rhs)

    @tf.function
    def triang_solve_transpose(args):
        L, rhs = args
        return tf.linalg.triangular_solve(tf.transpose(L), rhs, lower=False)

    @tf.function
    def matmul_vectorized(args):
        A, B = args
        return tf.matmul(A, B)

    # [N, D, M, M] --> [N]
    # each term is sum_{d=1}^D Tr[M, M]
    # arg: [D, M, M], needs to be squared
    @tf.function
    def sum_d_trace(arg):
        trace_D = tf.vectorized_map(lambda x: tf.reduce_sum(tf.square(x)),
                                    arg)
        return tf.reduce_sum(trace_D)

    # trace of a matrix
    @tf.function
    def trace_tf(A):
        return tf.reduce_sum(tf.linalg.diag_part(A))

    Y = self.data
    # specify qXp, the variational distribution q(X): each x_n is independent
    # w/ N(x_n | \mu_n, S_n)
    # \mu_n \in R^q given by each row of `X_data_mean`
    # S_n \in R^qxq diagonal, so equivalently given by each row of `X_data_var`
    qXp = DiagonalGaussian(self.Xp_mean, self.Xp_var)

    # if split space, specify qXs
    # compute psi statistics for the shared space, keep the original shape of
    # psi statistics, use qXs and kernel_s
    # psi0s is N-vector
    # psi1s is [N, M]
    # psi2s is [N, M, M]
    # also compute the covariance matrix Kuu for the shared space
    if self.split_space:
        qXs = DiagonalGaussian(self.Xs_mean, self.Xs_var)
        psi0s = expectation(qXs, self.kernel_s)
        psi1s = expectation(qXs, (self.kernel_s, self.Zs))
        psi2s = expectation(qXs, (self.kernel_s, self.Zs),
                            (self.kernel_s, self.Zs))
        cov_uu_s = covariances.Kuu(self.Zs, self.kernel_s,
                                   jitter=default_jitter())

    # loop over k, for each k use kernel_K[k] and qXp, compute psi0k, psi1k,
    # psi2k, then store the psi statistics for all k together
    # for each k: if no shared space, then psi0[:, k] = psi0k,
    # psi1[:, :, k] = psi1k, psi2[:, :, :, k] = psi2k
    # if have shared space, then psi0[:, k] = psi0s + psi0k,
    # psi1[:, :, k] = psi1s + psi1k
    # psi2[:, :, :, k] = psi2s + psi2k (the cross terms are added later)
    # then, for each n, psi2[n, :, :, k] = psi1s[n, :]^T dot psi1k[n, :]
    # + psi1k[n, :]^T dot psi1s[n, :] (both are [M, M])
    # psi0 is [N, K] so psi0[n, k] gives a real value
    # psi1 is [N, M, K], so psi1[n, :, k] gives us a M-vector
    # psi2 is [N, M, M, K], so psi2[n, :, :, k] gives us a [M x M] matrix
    psi0 = []
    psi1 = []
    psi2 = []
    for k, kernel_k in enumerate(self.kernel_K):
        psi0k = expectation(qXp, kernel_k)
        psi1k = expectation(qXp, (kernel_k, self.Zp))
        psi2k = expectation(qXp, (kernel_k, self.Zp), (kernel_k, self.Zp))
        if self.split_space:
            psi0.append(psi0s + psi0k)
            psi1.append(psi1s + psi1k)
            # add the cross-covariance terms, require computation separately for each n
            sxk = tf.vectorized_map(row_outer_product, (psi1s, psi1k))
            kxs = tf.vectorized_map(row_outer_product, (psi1k, psi1s))
            psi2.append(psi2s + psi2k + sxk + kxs)
        else:
            psi0.append(psi0k)
            psi1.append(psi1k)
            psi2.append(psi2k)
    psi0 = tf.stack(psi0, axis=-1)
    psi1 = tf.stack(psi1, axis=-1)
    psi2 = tf.stack(psi2, axis=-1)

    # make K cov_uu_k using Zp and kernel_k
    # K cholesky, repeat N times for later use
    # L is [N x M x M x K]
    # these are the Kuu matrices
    L = []
    for k, kernel_k in enumerate(self.kernel_K):
        cov_uu_k = covariances.Kuu(self.Zp, kernel_k,
                                   jitter=default_jitter())
        if self.split_space:
            L.append(tf.linalg.cholesky(cov_uu_s + cov_uu_k))
        else:
            L.append(tf.linalg.cholesky(cov_uu_k))
    L = tf.stack(L, axis=-1)
    L = repeat_N(L)

    sigma2 = self.likelihood.variance
    # self.pred_Y = []

    # use `tf.vectorized_map` to avoid writing a loop over N, but it requires
    # every matrix to have N on axis 0
    # so we need to repeat certain matrices that are the same for all N (e.g. L)
    # note we can use `tf.vectorized_map` because the computations are
    # decomposable for each n,
    # i.e. they can be computed in any order over n
    Fq = []
    Yn2 = tf.reduce_sum(tf.square(Y), axis=1)
    for k in range(self.K):
        # compute intermediate matrices for easier computation involving \inv{Kuu}
        # A is the same as AAT in gplvm, transposing L is the correct thing to do
        # but the two end up being the same since we only care about the trace
        tmp = tf.vectorized_map(triang_solve,
                                (L[..., k], psi2[..., k]))  # [N, M, M]
        A = tf.vectorized_map(triang_solve_transpose,
                              (L[..., k], tmp))  # \inv{Kuu} * Psi2: [N, M, M]
        #pos_def = tf.vectorized_map(lambda x: is_pos_def(x), psi2[..., k])
        #print(np.all(pos_def))
        # psi2 is not produced using w/ `covariances.Kuu`, but it should still be PD
        # we should add jitter before doing cholesky
        #jitter_mtx = default_jitter() * tf.eye(self.M, dtype=default_float())
        jitter_mtx = 1e-10 * tf.eye(self.M, dtype=default_float())
        LB = tf.vectorized_map(lambda x: tf.linalg.cholesky(x + jitter_mtx),
                               psi2[..., k])  # [N, M, M]
        tmp1 = tf.vectorized_map(triang_solve, (L[..., k], LB))  # [N, M, M]
        C = tf.vectorized_map(
            triang_solve_transpose,
            (L[..., k], tmp1))  # sqrt(\inv{Kuu} * Psi2 * \inv{Kuu}): [N, M, M]
        D = tf.vectorized_map(
            matmul_vectorized, (repeat_N(tf.transpose(self.q_mu[k])), C)
        )  # sqrt(M^T * \inv{Kuu} * Psi2 * \inv{Kuu} * M): [N, D, M]
        tmp2 = tf.vectorized_map(triang_solve,
                                 (L[..., k], repeat_N(self.q_mu[k])))
        E = tf.vectorized_map(triang_solve_transpose,
                              (L[..., k], tmp2))  # \inv{Kuu} * M: [N, M, D]
        # q_sqrt is already the cholesky
        F = tf.vectorized_map(
            matmul_vectorized,
            (repeat_N(tf.transpose(self.q_sqrt[k], perm=[0, 2, 1])), C)
        )  # sqrt(S * \inv{Kuu} * Psi2 * \inv{Kuu}): [N, D, M, M]
        tmp3 = tf.vectorized_map(row_outer_product,
                                 (Y, psi1[..., k]))  # Y^T * Psi1: [N, D, M]
        G = tf.vectorized_map(
            matmul_vectorized,
            (tmp3, E))  # Y^T * Psi1 * \inv{Kuu} * M: [N, D, D]
        # for debugging
        # self.pred_Y.append(tf.reshape(tf.vectorized_map(matmul_vectorized, (tf.expand_dims(psi1[..., k], 1), E)), (self.N, self.D)))  # Psi1 * \inv{Kuu} * M: [N, D]

        # compute the lower bound
        # each term added here is length-N vector, each entry representing
        # \sum_{d=1}^D Fdnk for a particular n, k
        Fnk = -0.5 * Yn2 / sigma2
        Fnk += tf.vectorized_map(lambda x: trace_tf(x), G) / sigma2
        Fnk += -0.5 * tf.vectorized_map(
            lambda x: tf.reduce_sum(tf.square(x)), D) / sigma2
        Fnk += 0.5 * self.D * tf.vectorized_map(lambda x: trace_tf(x),
                                                A) / sigma2
        Fnk += -0.5 * tf.vectorized_map(lambda x: sum_d_trace(x), F) / sigma2
        Fq.append(Fnk)
    Fq = tf.stack(Fq, axis=-1)  # [N, K]
    # psi0 is already [N, K]
    Fq += -0.5 * self.D * psi0 / sigma2
    Fq += -0.5 * self.D * tf.math.log(2 * np.pi * sigma2)
    # for debugging
    #self.Fq = Fq
    # self.pred_Y = tf.stack(self.pred_Y, axis=-1)  # [N, D, K]

    # weight each entry by the mixture responsibility, then sum over N, K
    bound = tf.reduce_sum(Fq * self.pi)

    # compute KL
    KL_p = self.kl_mvn(self.Xp_mean, self.Xp_var, self.Xp_prior_mean,
                       self.Xp_prior_var)
    KL_c = self.kl_categorical(self.pi, self.pi_prior)
    KL_u = 0
    prior_Kuu = np.zeros((self.M, self.M))
    if self.split_space:
        KL_s = self.kl_mvn(self.Xs_mean, self.Xs_var, self.Xs_prior_mean,
                           self.Xs_prior_var)
        bound += - KL_s
        prior_Kuu += covariances.Kuu(self.Zs, self.kernel_s,
                                     jitter=default_jitter())
    for k in range(self.K):
        prior_Kuu_k = covariances.Kuu(self.Zp, self.kernel_K[k],
                                      jitter=default_jitter())
        KL_u += kullback_leiblers.gauss_kl(q_mu=self.q_mu[k],
                                           q_sqrt=self.q_sqrt[k],
                                           K=prior_Kuu + prior_Kuu_k)
    bound += - KL_p - KL_u - KL_c
    return bound
def approx_conditional_ldf(
    Xnew,
    inducing_variable,
    kernel,
    f,
    *,
    full_cov=False,
    full_output_cov=False,
    q_sqrt=None,
    white=True,
):
    """
    Approximate GP conditional for Laplacian Dirichlet features (LDF).

    Given the GP represented by the inducing points specified in
    `inducing_variable`, produce the mean and (co-)variance of the GP at the
    points Xnew.

    - Xnew are the points of the data or minibatch, size N x D (tf.array, 2d)
    - inducing_variable is an instance of inducing_variables.InducingVariable
      that provides `Kuu` and `Kuf` methods for Laplacian Dirichlet features;
      this contains the limits of the bounding box and the frequencies.
      Its `.remainder` attribute is another instance of
      inducing_variables.InducingVariable that specifies the high frequency
      components not selected in inducing_variable.
    - f is the value (or mean value) of the features (i.e. the weights)
    - q_sqrt (default None) is the Cholesky factor of the uncertainty about f
      (to be propagated through the conditional as per the GPflow
      inducing-point implementation)
    - white (defaults True) specifies whether the whitening has been applied.
      LDF works a lot better, when using vanilla gradients, if whitening has
      been applied, so it's the default option.

    Shapes:
        Xnew   :: N x D
        Kuu    :: M x M
        Kuf    :: M x N
        f      :: M x K, K = 1
        q_sqrt :: K x M x M, with K = 1

    :return: (fmean, fvar) — mean N x K, variance N x K (or N x N x K when
        `full_cov=True`).
    :raises NotImplementedError: full output covariances are not supported.
    :raises ValueError: if q_sqrt is neither 2- nor 3-dimensional.
    """
    if full_output_cov:
        raise NotImplementedError

    # num_data = tf.shape(Xnew)[0]  # M
    num_func = tf.shape(f)[1]  # K

    Λ = cov.Kuu(inducing_variable, kernel)  # this is now a LinearOperator
    Φ = cov.Kuf(inducing_variable, kernel, Xnew)  # still a Tensor
    # Same quantities for the high-frequency remainder features.
    Λr = cov.Kuu(inducing_variable.remainder, kernel)
    Φr = cov.Kuf(inducing_variable.remainder, kernel, Xnew)

    # compute the covariance due to the conditioning: the remainder features
    # contribute Φr^T Λr^{-1} Φr of prior variance not explained by the
    # selected features
    if full_cov:
        fvar = tf.matmul(Φr, Λr.solve(Φr), transpose_a=True)
        shape = (num_func, 1, 1)
    else:
        # diagonal only: sum over the feature axis instead of a full matmul
        fvar = tf.reduce_sum(Φr * Λr.solve(Φr), -2)
        shape = (num_func, 1)
    # broadcast the same conditioning variance across all K output functions
    fvar = tf.expand_dims(fvar, 0) * tf.ones(
        shape, dtype=gpflow.default_float())  # K x N x N or K x N

    # another backsubstitution in the unwhitened case
    if white:
        A = Λ.cholesky().solve(Φ)
    else:
        A = Λ.solve(Φ)

    # construct the conditional mean
    fmean = tf.matmul(A, f, transpose_a=True)

    if q_sqrt is not None:
        if q_sqrt.shape.ndims == 2:
            # case for q_diag = True: q_sqrt holds per-feature std devs
            LTA = Diag(q_sqrt) @ A  # K x M x N
        elif q_sqrt.shape.ndims == 3:
            # q_sqrt is already the Cholesky factor; form L^T A
            LTA = tf.matmul(q_sqrt, A, transpose_a=True)
        else:
            raise ValueError("Bad dimension for q_sqrt: %s" %
                             str(q_sqrt.get_shape().ndims))
        # propagate the posterior uncertainty over f through the conditional
        if full_cov:
            fvar = fvar + tf.matmul(LTA, LTA, transpose_a=True)  # K x N x N
        else:
            fvar = fvar + tf.reduce_sum(tf.square(LTA), 1)  # K x N

    fvar = tf.transpose(fvar)  # N x K or N x N x K

    return fmean, fvar
def prior_kl_ldf(inducing_variable, kernel, q_mu, q_sqrt, whiten=False):
    """Return KL[q(u) || p(u)] for LDF inducing variables.

    With `whiten=True` the prior is taken to be whitened, so no explicit
    prior covariance is supplied to `gauss_kl_ldf`; otherwise the prior
    covariance Kuu is built from the kernel and inducing variables.
    """
    prior_cov = None if whiten else cov.Kuu(inducing_variable, kernel)
    return gauss_kl_ldf(q_mu, q_sqrt, prior_cov)
def elbo(self) -> tf.Tensor:
    """
    Construct a tensorflow function to compute the bound on the marginal
    likelihood.

    The variational distribution q(X) is produced by `self.encoder` from the
    observed data, the Psi statistics are taken under that distribution, and
    the collapsed SGPR-style bound is assembled, minus KL[q(x) || p(x)].
    Both the KL and the bound are also appended (as numpy scalars) to
    `self.loss_placeholder` for logging.

    :return: scalar ELBO tensor.
    """
    Y_data = self.data

    # amortized variational posterior over the latents: mean and variance
    X_data_mean, X_data_var = self.encoder(Y_data)
    pX = DiagonalGaussian(X_data_mean, X_data_var)

    num_inducing = self.inducing_variable.num_inducing
    # Psi statistics: expectations of the kernel under q(X)
    psi0 = tf.reduce_sum(expectation(pX, self.kernel))
    psi1 = expectation(pX, (self.kernel, self.inducing_variable))
    # psi2 summed over data points -> [M, M]
    psi2 = tf.reduce_sum(
        expectation(pX, (self.kernel, self.inducing_variable),
                    (self.kernel, self.inducing_variable)),
        axis=0)
    cov_uu = covariances.Kuu(self.inducing_variable, self.kernel,
                             jitter=default_jitter())
    L = tf.linalg.cholesky(cov_uu)
    sigma2 = self.likelihood.variance
    sigma = tf.sqrt(sigma2)

    # Compute intermediate matrices
    A = tf.linalg.triangular_solve(L, tf.transpose(psi1), lower=True) / sigma
    tmp = tf.linalg.triangular_solve(L, psi2, lower=True)
    # AAT = L^{-1} Psi2 L^{-T} / sigma^2
    AAT = tf.linalg.triangular_solve(L, tf.transpose(tmp), lower=True) / sigma2
    B = AAT + tf.eye(num_inducing, dtype=default_float())
    LB = tf.linalg.cholesky(B)
    # log|B| from the Cholesky diagonal
    log_det_B = 2.0 * tf.reduce_sum(tf.math.log(tf.linalg.diag_part(LB)))
    c = tf.linalg.triangular_solve(
        LB, tf.linalg.matmul(A, Y_data), lower=True) / sigma

    # KL[q(x) || p(x)]
    # accept either a diagonal [N, Q] variance or a full [N, Q, Q] one
    dX_data_var = (X_data_var if X_data_var.shape.ndims == 2 else
                   tf.linalg.diag_part(X_data_var))
    NQ = to_default_float(tf.size(X_data_mean))
    D = to_default_float(tf.shape(Y_data)[1])
    KL = -0.5 * tf.reduce_sum(tf.math.log(dX_data_var))
    KL += 0.5 * tf.reduce_sum(tf.math.log(self.X_prior_var))
    KL -= 0.5 * NQ
    KL += 0.5 * tf.reduce_sum(
        (tf.square(X_data_mean - self.X_prior_mean) + dX_data_var) /
        self.X_prior_var)
    self.loss_placeholder["KL_x"].append(KL.numpy())

    # compute log marginal bound
    ND = to_default_float(tf.size(Y_data))
    bound = -0.5 * ND * tf.math.log(2 * np.pi * sigma2)
    bound += -0.5 * D * log_det_B
    bound += -0.5 * tf.reduce_sum(tf.square(Y_data)) / sigma2
    bound += 0.5 * tf.reduce_sum(tf.square(c))
    # trace terms: psi0 is already a scalar here, so reduce_sum is a no-op
    bound += -0.5 * D * (tf.reduce_sum(psi0) / sigma2 -
                         tf.reduce_sum(tf.linalg.diag_part(AAT)))
    bound -= KL
    self.loss_placeholder["ELBO"].append(bound.numpy())
    return bound