def test_multi_scale_inducing_equivalence_inducing_points(N, M, D):
    # Multiscale must be equivalent to inducing points when variance is zero
    Xnew, Z = np.random.randn(N, D), np.random.randn(M, D)
    rbf = gpflow.kernels.SquaredExponential(1.3441, lengthscale=np.random.uniform(0.5, 3., D))
    inducing_variable_zero_lengthscale = Multiscale(Z, scales=np.zeros(Z.shape))
    inducing_variable_inducing_point = InducingPoints(Z)

    multi_scale_Kuf = Kuf(inducing_variable_zero_lengthscale, rbf, Xnew)
    inducing_point_Kuf = Kuf(inducing_variable_inducing_point, rbf, Xnew)
    deviation_percent_Kuf = np.max(
        np.abs(multi_scale_Kuf - inducing_point_Kuf) / inducing_point_Kuf * 100)
    assert deviation_percent_Kuf < 0.1

    multi_scale_Kuu = Kuu(inducing_variable_zero_lengthscale, rbf)
    inducing_point_Kuu = Kuu(inducing_variable_inducing_point, rbf)
    deviation_percent_Kuu = np.max(
        np.abs(multi_scale_Kuu - inducing_point_Kuu) / inducing_point_Kuu * 100)
    assert deviation_percent_Kuu < 0.1
def test_inducing_variables_psd_schur(input_dim, inducing_variable, kernel):
    # Conditional variance must be PSD.
    X = np.random.randn(5, input_dim)
    Kuf_values = Kuf(inducing_variable, kernel, X)
    Kuu_values = Kuu(inducing_variable, kernel, jitter=default_jitter())
    Kff_values = kernel(X)
    Qff_values = Kuf_values.numpy().T @ np.linalg.solve(Kuu_values, Kuf_values)
    assert np.all(np.linalg.eig(Kff_values - Qff_values)[0] > 0.0)
def upper_bound(self) -> tf.Tensor:
    """
    Upper bound for the sparse GP regression marginal likelihood.  Note that
    the same inducing points are used for calculating the upper bound, as are
    used for computing the likelihood approximation. This may not lead to the
    best upper bound. The upper bound can be tightened by optimising Z, just
    like the lower bound. This is especially important in FITC, as FITC is
    known to produce poor inducing point locations. An optimisable upper bound
    can be found in https://github.com/markvdw/gp_upper.

    The key reference is ::

      @misc{titsias_2014,
        title={Variational Inference for Gaussian and Determinantal Point Processes},
        url={http://www2.aueb.gr/users/mtitsias/papers/titsiasNipsVar14.pdf},
        publisher={Workshop on Advances in Variational Inference (NIPS 2014)},
        author={Titsias, Michalis K.},
        year={2014},
        month={Dec}
      }

    The key quantity, the trace term, can be computed via

    >>> _, v = conditionals.conditional(X, model.inducing_variable.Z, model.kernel,
    ...                                 np.zeros((len(model.inducing_variable), 1)))

    which computes each individual element of the trace term.
    """
    X_data, Y_data = self.data
    num_data = to_default_float(tf.shape(Y_data)[0])

    Kdiag = self.kernel(X_data, full_cov=False)
    kuu = Kuu(self.inducing_variable, self.kernel, jitter=self.jitter_variance)
    kuf = Kuf(self.inducing_variable, self.kernel, X_data)

    I = tf.eye(tf.shape(kuu)[0], dtype=default_float())

    L = tf.linalg.cholesky(kuu)
    A = tf.linalg.triangular_solve(L, kuf, lower=True)
    AAT = tf.linalg.matmul(A, A, transpose_b=True)
    B = I + AAT / self.likelihood.variance
    LB = tf.linalg.cholesky(B)

    # Using the Trace bound, from Titsias' presentation
    c = tf.maximum(tf.reduce_sum(Kdiag) - tf.reduce_sum(tf.square(A)), 0)

    # Alternative bound on max eigenval:
    corrected_noise = self.likelihood.variance + c

    const = -0.5 * num_data * tf.math.log(2 * np.pi * self.likelihood.variance)
    logdet = -tf.reduce_sum(tf.math.log(tf.linalg.diag_part(LB)))

    LC = tf.linalg.cholesky(I + AAT / corrected_noise)
    v = tf.linalg.triangular_solve(
        LC, tf.linalg.matmul(A, Y_data) / corrected_noise, lower=True)
    quad = -0.5 * tf.reduce_sum(tf.square(Y_data)) / corrected_noise + 0.5 * tf.reduce_sum(tf.square(v))

    return const + logdet + quad
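# A minimal usage sketch of this kind of bound, assuming GPflow's stock
# gpflow.models.SGPR (whose elbo()/upper_bound() methods are analogous to the
# method above, which belongs to a variant with an explicit jitter_variance).
# Data, kernel, and inducing points below are purely illustrative.
import numpy as np
import gpflow

X = np.random.rand(100, 1)
Y = np.sin(10 * X) + 0.1 * np.random.randn(100, 1)
Z = X[::10].copy()  # a handful of inducing points taken from the data

model = gpflow.models.SGPR((X, Y), kernel=gpflow.kernels.SquaredExponential(),
                           inducing_variable=Z)

# The exact log marginal likelihood is sandwiched between these two quantities.
lower = model.elbo()
upper = model.upper_bound()
assert lower <= upper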
def conditional_ND(self, X, full_cov=False):
    # X is [S,N,D]
    Kmm = Kuu(self.inducing_points, self.kernel, jitter=default_jitter())
    Lmm = tf.linalg.cholesky(Kmm)
    Kmm_tiled = tf.tile(tf.expand_dims(Kmm, 0), (self.num_outputs, 1, 1))
    Lmm_tiled = tf.tile(tf.expand_dims(Lmm, 0), (self.num_outputs, 1, 1))

    Kmn = Kuf(self.inducing_points, self.kernel, X)  # K(Z,X)

    # alpha(X) = k(Z,Z)^{-1} k(Z,X) = L^{-T} L^{-1} k(Z,X)
    A = tf.linalg.triangular_solve(Lmm, Kmn, lower=True)  # L^{-1} k(Z,X)
    if not self.white:
        # L^{-T} L^{-1} K(Z,X) is [M,N]
        A = tf.linalg.triangular_solve(tf.transpose(Lmm), A, lower=False)

    # m = alpha(X)^T (q_mu - m(Z)) = alpha(X)^T q_mu if zero mean function.
    mean = tf.matmul(A, self.q_mu, transpose_a=True)  # [N, D_out]

    # [D_out,M,N]
    A_tiled = tf.tile(A[None, :, :], [self.num_outputs, 1, 1])
    I = tf.eye(self.num_inducing, dtype=default_float())[None, :, :]

    # var = k(X,X) - alpha(X)^T (k(Z,Z) - q_sqrt q_sqrt^T) alpha(X)
    if self.white:
        SK = -I
    else:
        # -k(Z,Z)
        SK = -Kmm_tiled  # [D_out,M,M]

    if self.q_sqrt is not None:
        # SK = -k(Z,Z) + q_sqrt q_sqrt^T
        SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True)  # [D_out,M,M]

    # B = -(k(Z,Z) - q_sqrt q_sqrt^T) alpha(X)
    B = tf.matmul(SK, A_tiled)  # [D_out,M,N]

    if full_cov:
        # delta_cov = -alpha(X)^T (k(Z,Z) - q_sqrt q_sqrt^T) alpha(X)
        delta_cov = tf.matmul(A_tiled, B, transpose_a=True)  # [D_out,N,N]
        # Knn = k(X,X)
        Knn = self.kernel.K(X)
    else:
        # Summing over the inducing dimension gives the diagonal of
        # alpha(X)^T (...) alpha(X) without forming the full [N,N] matrix:
        # delta_cov = tf.linalg.diag_part(tf.matmul(A_tiled, B, transpose_a=True))
        delta_cov = tf.reduce_sum(A_tiled * B, 1)  # [D_out,N]
        Knn = self.kernel.K_diag(X)  # [N]

    var = tf.expand_dims(Knn, 0) + delta_cov  # [D_out,N]
    var = tf.transpose(var)

    return mean + self.mean_function(X), var
def predict_f(self, Xnew: InputData, full_cov=False, full_output_cov=False) -> MeanAndVariance:
    """
    Compute the mean and variance of the latent function at some new points Xnew.
    For a derivation of the terms in here, see the associated SGPR notebook.
    """
    X_data, Y_data = self.data
    num_inducing = len(self.inducing_variable)
    err = Y_data - self.mean_function(X_data)

    kuf = Kuf(self.inducing_variable, self.kernel, X_data)
    kuu = Kuu(self.inducing_variable, self.kernel, jitter=self.jitter_variance)
    Kus = Kuf(self.inducing_variable, self.kernel, Xnew)

    sigma = tf.sqrt(self.likelihood.variance)
    L = tf.linalg.cholesky(kuu)
    A = tf.linalg.triangular_solve(L, kuf, lower=True) / sigma
    B = tf.linalg.matmul(A, A, transpose_b=True) + tf.eye(num_inducing, dtype=default_float())
    LB = tf.linalg.cholesky(B)
    Aerr = tf.linalg.matmul(A, err)
    c = tf.linalg.triangular_solve(LB, Aerr, lower=True) / sigma

    tmp1 = tf.linalg.triangular_solve(L, Kus, lower=True)
    tmp2 = tf.linalg.triangular_solve(LB, tmp1, lower=True)
    mean = tf.linalg.matmul(tmp2, c, transpose_a=True)

    if full_cov:
        var = (
            self.kernel(Xnew)
            + tf.linalg.matmul(tmp2, tmp2, transpose_a=True)
            - tf.linalg.matmul(tmp1, tmp1, transpose_a=True)
        )
        var = tf.tile(var[None, ...], [self.num_latent_gps, 1, 1])  # [P, N, N]
    else:
        var = (
            self.kernel(Xnew, full_cov=False)
            + tf.reduce_sum(tf.square(tmp2), 0)
            - tf.reduce_sum(tf.square(tmp1), 0)
        )
        var = tf.tile(var[:, None], [1, self.num_latent_gps])

    return mean + self.mean_function(Xnew), var
def _conditional_train(
    Xnew: tf.Tensor,
    inducing_variable: InducingVariables,
    kernel: Kernel,
    f: tf.Tensor,
    *,
    full_cov=False,
    full_output_cov=False,
    q_sqrt=None,
    white=False,
):
    """
    Single-output GP conditional.

    The covariance matrices used to calculate the conditional have the following shape:
    - Kuu: [M, M]
    - Kuf: [M, N]
    - Kff: [N, N]

    Further reference
    -----------------
    - See `gpflow.conditionals._conditional` (below) for a detailed explanation of
      the conditional in the single-output case.
    - See the multioutput notebook for more information about the multioutput framework.

    Parameters
    ----------
    :param Xnew: data matrix, size [N, D].
    :param f: data matrix, [M, R]
    :param full_cov: return the covariance between the datapoints
    :param full_output_cov: return the covariance between the outputs.
        NOTE: as we are using a single-output kernel with repetitions
        these covariances will be zero.
    :param q_sqrt: matrix of standard-deviations or Cholesky matrices,
        size [M, R] or [R, M, M].
    :param white: boolean of whether to use the whitened representation
    :return:
        - mean:     [N, R]
        - variance: [N, R], [R, N, N], [N, R, R] or [N, R, N, R]
        Please see `gpflow.conditional._expand_independent_outputs` for more
        information about the shape of the variance, depending on `full_cov`
        and `full_output_cov`.
    """
    Kmm = Kuu(inducing_variable, kernel, jitter=default_jitter())  # [M, M]
    Kmn = Kuf(inducing_variable, kernel, Xnew)  # [M, N]
    Knn = kernel.diag_tr()  # uses an optimized function to calculate the covariances
    fmean, fvar = base_conditional(
        Kmn, Kmm, Knn, f, full_cov=full_cov, q_sqrt=q_sqrt, white=white
    )  # [N, R], [R, N, N] or [N, R]
    return fmean, expand_independent_outputs(fvar, full_cov, full_output_cov)
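# Since the dispatch above only assembles Kmm, Kmn, Knn and defers to
# base_conditional, here is a hedged numpy sketch of the mathematics that call
# evaluates in the simplest case (non-whitened, q_sqrt=None, full covariance).
# The helper name base_conditional_np is illustrative, not part of the library.
import numpy as np

def base_conditional_np(Kmn, Kmm, Knn, f):
    """Gaussian conditional of f(Xnew) given u = f at the inducing points:
    mean = Kmn^T Kmm^{-1} f,  cov = Knn - Kmn^T Kmm^{-1} Kmn."""
    L = np.linalg.cholesky(Kmm)          # [M, M]
    A = np.linalg.solve(L, Kmn)          # [M, N] = L^{-1} Kmn
    mean = A.T @ np.linalg.solve(L, f)   # [N, R]
    cov = Knn - A.T @ A                  # [N, N]
    return mean, cov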
def test_multi_scale_inducing_equivalence_inducing_points(N, M, D):
    # Multiscale must be equivalent to inducing points when variance is zero
    Xnew, Z = np.random.randn(N, D), np.random.randn(M, D)
    rbf = gpflow.kernels.SquaredExponential(1.3441, lengthscales=np.random.uniform(0.5, 3.0, D))
    inducing_variable_zero_lengthscales = Multiscale(Z, scales=np.zeros(Z.shape) + 1e-10)
    inducing_variable_inducing_point = InducingPoints(Z)

    multi_scale_Kuf = Kuf(inducing_variable_zero_lengthscales, rbf, Xnew)
    inducing_point_Kuf = Kuf(inducing_variable_inducing_point, rbf, Xnew)
    relative_error_Kuf = np.abs(multi_scale_Kuf - inducing_point_Kuf) / inducing_point_Kuf
    assert np.max(relative_error_Kuf) < 0.1e-2  # 0.1 %

    multi_scale_Kuu = Kuu(inducing_variable_zero_lengthscales, rbf)
    inducing_point_Kuu = Kuu(inducing_variable_inducing_point, rbf)
    relative_error_Kuu = np.abs(multi_scale_Kuu - inducing_point_Kuu) / inducing_point_Kuu
    assert np.max(relative_error_Kuu) < 0.1e-2  # 0.1 %
def __call__(self, X: TensorType) -> tf.Tensor:
    """
    :param X: evaluation points [N, D]
    :return: function value of sample [N, P]
    """
    N = tf.shape(X)[0]
    phi_X = kernel.feature_functions(X)  # [N, L]
    weight_space_prior_X = phi_X @ prior_weights  # [N, 1]
    Knm = tf.linalg.matrix_transpose(Kuf(inducing_variable, kernel, X))  # [N, M]
    function_space_update_X = Knm @ v  # [N, P]

    tf.debugging.assert_equal(tf.shape(weight_space_prior_X), [N, 1])
    tf.debugging.assert_equal(tf.shape(function_space_update_X), [N, P])

    return weight_space_prior_X + function_space_update_X  # [N, P]
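# The closure above only consumes prior_weights and v from its enclosing scope.
# Below is a hedged, self-contained numpy sketch of how such quantities are
# commonly formed for pathwise (Matheron's-rule) sampling, in the style of
# Wilson et al. (2020). Every name here is an illustrative assumption, not
# taken from the surrounding (unseen) code.
import numpy as np

M, L_feat, P = 16, 256, 1
phi_Z = np.random.randn(M, L_feat)            # feature functions evaluated at Z, [M, L]
prior_weights = np.random.randn(L_feat, P)    # weight-space prior sample, [L, P]
u = np.random.randn(M, P)                     # sample of the inducing outputs from q(u)
Kzz = np.eye(M)                               # placeholder stand-in for Kuu(Z, Z)

# Matheron's-rule update: v = Kuu^{-1} (u - phi(Z) @ prior_weights), so that the
# returned sample interpolates u at Z and follows the weight-space prior elsewhere.
v = np.linalg.solve(Kzz, u - phi_Z @ prior_weights)  # [M, P]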
def conditional(self, X, full_cov=False):
    # X is [N,D] or [S*N,D]
    Kmm = Kuu(self.inducing_points, self.kernel, jitter=default_jitter())  # [M,M]
    Lmm = tf.linalg.cholesky(Kmm)
    Kmn = Kuf(self.inducing_points, self.kernel, X)  # [M,N]

    # alpha(X) = k(Z,Z)^{-1} k(Z,X) = L^{-T} L^{-1} k(Z,X)
    A = tf.linalg.triangular_solve(Lmm, Kmn, lower=True)  # L^{-1} k(Z,X)
    if not self.white:
        # L^{-T} L^{-1} K(Z,X) is [M,N]
        A = tf.linalg.triangular_solve(tf.transpose(Lmm), A, lower=False)

    # m = alpha(X)^T (q_mu - m(Z))
    mean = tf.matmul(A, self.q_mu - self.mean_function(self.inducing_points.Z),
                     transpose_a=True)  # [N,1]

    I = tf.eye(self.num_inducing, dtype=default_float())

    # var = k(X,X) - alpha(X)^T (k(Z,Z) - q_sqrt q_sqrt^T) alpha(X)
    if self.white:
        SK = -I
    else:
        SK = -Kmm

    if self.q_sqrt is not None:
        # SK = -k(Z,Z) + q_sqrt q_sqrt^T
        SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True)

    # B = -(k(Z,Z) - q_sqrt q_sqrt^T) alpha(X)
    B = tf.matmul(SK, A)  # [M,N]

    if full_cov:
        # delta_cov = -alpha(X)^T (k(Z,Z) - q_sqrt q_sqrt^T) alpha(X)
        delta_cov = tf.matmul(A, B, transpose_a=True)  # [N,N]
        Knn = self.kernel(X, full_cov=True, presliced=False)
    else:
        delta_cov = tf.reduce_sum(A * B, 0)
        Knn = self.kernel(X, full_cov=False, presliced=False)

    var = Knn + delta_cov
    var = tf.transpose(var)

    return mean + self.mean_function(X), var
def _compute_robust_maximum_log_likelihood_objective(self) -> tf.Tensor:
    """
    Construct a tensorflow function to compute the bound on the marginal likelihood.
    For a derivation of the terms in here, see the associated SGPR notebook.
    """
    X_data, Y_data = self.data
    num_inducing = len(self.inducing_variable)
    num_data = to_default_float(tf.shape(Y_data)[0])
    output_dim = to_default_float(tf.shape(Y_data)[1])

    err = Y_data - self.mean_function(X_data)
    Kdiag = self.kernel(X_data, full_cov=False)
    kuf = Kuf(self.inducing_variable, self.kernel, X_data)
    kuu = Kuu(self.inducing_variable, self.kernel, jitter=self.jitter_variance)
    L = tf.linalg.cholesky(kuu)
    sigma = tf.sqrt(self.likelihood.variance)

    # Compute intermediate matrices
    A = tf.linalg.triangular_solve(L, kuf, lower=True) / sigma
    AAT = tf.linalg.matmul(A, A, transpose_b=True)
    B = AAT + tf.eye(num_inducing, dtype=default_float())
    LB = tf.linalg.cholesky(B)
    Aerr = tf.linalg.matmul(A, err)
    c = tf.linalg.triangular_solve(LB, Aerr, lower=True) / sigma

    trace_term = 0.5 * output_dim * tf.reduce_sum(Kdiag) / self.likelihood.variance
    trace_term -= 0.5 * output_dim * tf.reduce_sum(tf.linalg.diag_part(AAT))

    # tr(Kff - Qff) should be positive, numerical issues can arise here
    assert trace_term > 0.0, f"Trace term negative, should be positive ({trace_term:.4e})."

    # compute log marginal bound
    bound = -0.5 * num_data * output_dim * np.log(2 * np.pi)
    bound += tf.negative(output_dim) * tf.reduce_sum(tf.math.log(tf.linalg.diag_part(LB)))
    bound -= 0.5 * num_data * output_dim * tf.math.log(self.likelihood.variance)
    bound += -0.5 * tf.reduce_sum(tf.square(err)) / self.likelihood.variance
    bound += 0.5 * tf.reduce_sum(tf.square(c))
    bound -= trace_term

    return bound
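# For intuition about the objective evaluated above: it is the collapsed
# Titsias (2009) bound, computed via Cholesky factors. Below is a hedged numpy
# cross-check of that bound in the zero-mean, single-output case; the kernel,
# data, and helper names are illustrative and scipy is assumed to be available.
import numpy as np
from scipy.stats import multivariate_normal

def rbf_np(A, B, variance=1.0, lengthscale=1.0):
    # Dense squared-exponential kernel in numpy (illustrative stand-in).
    d2 = np.sum((A[:, None, :] - B[None, :, :]) ** 2, axis=-1)
    return variance * np.exp(-0.5 * d2 / lengthscale ** 2)

N, M, D, noise = 50, 10, 2, 0.1
X = np.random.randn(N, D)
y = np.random.randn(N)
Z = X[:M].copy()

Kff = rbf_np(X, X)
Kuf = rbf_np(Z, X)
Kuu = rbf_np(Z, Z) + 1e-8 * np.eye(M)
Qff = Kuf.T @ np.linalg.solve(Kuu, Kuf)

# Collapsed bound: log N(y | 0, Qff + sigma^2 I) - tr(Kff - Qff) / (2 sigma^2)
fit = multivariate_normal.logpdf(y, mean=np.zeros(N), cov=Qff + noise * np.eye(N))
bound = fit - 0.5 * np.trace(Kff - Qff) / noise

# The bound never exceeds the exact log marginal likelihood of the dense GP.
exact = multivariate_normal.logpdf(y, mean=np.zeros(N), cov=Kff + noise * np.eye(N))
assert bound <= exact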
def conditional_ND(self, X, full_cov=False):
    # X is [S,N,D]
    Kmm_tiled = tf.convert_to_tensor([
        Kuu(self.inducing_points[i], self.kernels[i], jitter=default_jitter())
        for i in range(self.num_outputs)
    ])
    Lmm_tiled = tf.convert_to_tensor([
        tf.linalg.cholesky(Kmm_tiled[i]) for i in range(self.num_outputs)
    ])

    A_tiled = []
    mean_tiled = []
    for i in range(self.num_outputs):
        Kmn = Kuf(self.inducing_points[i], self.kernels[i], X)
        Lmm = Lmm_tiled[i]
        A = tf.linalg.triangular_solve(Lmm, Kmn, lower=True)  # L^{-1} k(Z,X)
        if not self.white:
            # L^{-T} L^{-1} K(Z,X) is [M,N]
            A = tf.linalg.triangular_solve(tf.transpose(Lmm), A, lower=False)
        # m = alpha(X)^T (q_mu - m(Z)) = alpha(X)^T q_mu if zero mean function.
        mean = tf.linalg.matvec(A, self.q_mu[:, i], transpose_a=True)  # [N]
        A_tiled.append(A)
        mean_tiled.append(mean)

    A_tiled = tf.convert_to_tensor(A_tiled)
    mean_tiled = tf.transpose(tf.convert_to_tensor(mean_tiled))

    I = tf.eye(self.num_inducing, dtype=default_float())[None, :, :]

    # var = k(X,X) - alpha(X)^T (k(Z,Z) - q_sqrt q_sqrt^T) alpha(X)
    if self.white:
        SK = -I
    else:
        # -k(Z,Z)
        SK = -Kmm_tiled  # [D_out,M,M]

    if self.q_sqrt is not None:
        # SK = -k(Z,Z) + q_sqrt q_sqrt^T
        SK += tf.matmul(self.q_sqrt, self.q_sqrt, transpose_b=True)  # [D_out,M,M]

    # B = -(k(Z,Z) - q_sqrt q_sqrt^T) alpha(X)
    B = tf.matmul(SK, A_tiled)  # [D_out,M,N]

    if full_cov:
        # delta_cov = -alpha(X)^T (k(Z,Z) - q_sqrt q_sqrt^T) alpha(X)
        delta_cov = tf.matmul(A_tiled, B, transpose_a=True)  # [D_out,N,N]
        # Knn = k(X,X)
        Knn = tf.convert_to_tensor(
            [self.kernels[i].K(X) for i in range(self.num_outputs)])
    else:
        # Summing over the inducing dimension gives the diagonal of
        # alpha(X)^T (...) alpha(X) without forming the full [N,N] matrix:
        # delta_cov = tf.linalg.diag_part(tf.matmul(A_tiled, B, transpose_a=True))
        delta_cov = tf.reduce_sum(A_tiled * B, 1)  # [D_out,N]
        Knn = tf.convert_to_tensor(
            [self.kernels[i].K_diag(X) for i in range(self.num_outputs)])

    var = Knn + delta_cov  # [D_out,N]
    var = tf.transpose(var)

    return mean_tiled + self.mean_function(X), var