def compute_variance_of_points(self, points_to_sample):
    r"""Compute the variance (matrix) of this GP at each point of ``Xs`` (``points_to_sample``).

    .. Warning:: ``points_to_sample`` should not contain duplicate points.

    The variance matrix is symmetric although we currently return the full representation.

    .. Note:: Comments are copied from
      :meth:`moe.optimal_learning.python.interfaces.gaussian_process_interface.GaussianProcessInterface.compute_variance_of_points`

    :param points_to_sample: num_to_sample points (in dim dimensions) being sampled from the GP
    :type points_to_sample: array of float64 with shape (num_to_sample, dim)
    :return: var_star: variance matrix of this GP
    :rtype: array of float64 with shape (num_to_sample, num_to_sample)

    """
    var_star = python_utils.build_covariance_matrix(self._covariance, points_to_sample)  # this is K_star_star
    if self.num_sampled == 0:
        return numpy.diag(numpy.diag(var_star))

    K_star = python_utils.build_mix_covariance_matrix(
        self._covariance,
        self._points_sampled,
        points_to_sample,
    )
    V = scipy.linalg.solve_triangular(
        self._K_chol[0],
        K_star,
        lower=self._K_chol[1],
        overwrite_b=True,
    )

    # cheaper to go through scipy.linalg.get_blas_funcs(), which can compute A = alpha*B*C + beta*A in one pass
    var_star -= numpy.dot(V.T, V)
    return var_star
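# --- Illustrative sketch (not part of the class above) ---
# The update ``var_star -= numpy.dot(V.T, V)`` implements the standard GP
# posterior variance identity ``K_ss - K_s^T K^{-1} K_s`` via a triangular
# solve against the Cholesky factor of K. A minimal standalone version,
# assuming a squared-exponential kernel as a stand-in for self._covariance:
import numpy
import scipy.linalg


def _sq_exp(x, y, length=1.0):
    """Hypothetical squared-exponential covariance, used only for this demo."""
    return numpy.exp(-0.5 * numpy.sum((x - y) ** 2) / length ** 2)


def posterior_variance(points_sampled, points_to_sample, noise=1e-8):
    """Return K_ss - K_s^T K^{-1} K_s computed via a triangular solve."""
    K = numpy.array([[_sq_exp(a, b) for b in points_sampled] for a in points_sampled])
    K += noise * numpy.eye(points_sampled.shape[0])  # jitter for numerical stability
    K_ss = numpy.array([[_sq_exp(a, b) for b in points_to_sample] for a in points_to_sample])
    K_star = numpy.array([[_sq_exp(a, b) for b in points_to_sample] for a in points_sampled])

    K_chol = scipy.linalg.cho_factor(K, lower=True)  # (factor, lower) tuple, like self._K_chol
    V = scipy.linalg.solve_triangular(K_chol[0], K_star, lower=K_chol[1])
    return K_ss - numpy.dot(V.T, V)


# Usage: variance is small near the sampled points, near the prior far away.
print(posterior_variance(numpy.array([[0.0], [0.5], [1.0]]),
                         numpy.array([[0.25], [2.0]])))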
def _build_precomputed_data(self):
    """Set up precomputed data (cholesky factorization of K and K^-1 * y)."""
    if self.num_sampled == 0:
        self._K_chol = numpy.array([])
        self._K_inv_y = numpy.array([])
    else:
        covariance_matrix = python_utils.build_covariance_matrix(
            self._covariance,
            self._points_sampled,
            noise_variance=self._points_sampled_noise_variance,
        )
        self._K_chol = scipy.linalg.cho_factor(covariance_matrix, lower=True, overwrite_a=True)
        self._K_inv_y = scipy.linalg.cho_solve(self._K_chol, self._points_sampled_value)
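# --- Illustrative sketch (not part of the class above) ---
# Why precompute: cho_factor is the O(n^3) step, and every later cho_solve
# against the stored factorization costs only O(n^2). A toy SPD system
# standing in for the GP covariance matrix K:
import numpy
import scipy.linalg

rng = numpy.random.RandomState(0)
A = rng.randn(5, 5)
K = numpy.dot(A, A.T) + 5.0 * numpy.eye(5)  # SPD by construction
y = rng.randn(5)

K_chol = scipy.linalg.cho_factor(K, lower=True)  # factor once
K_inv_y = scipy.linalg.cho_solve(K_chol, y)      # reuse per right-hand side

print(numpy.allclose(numpy.dot(K, K_inv_y), y))  # True: K * (K^-1 * y) == y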
def compute_grad_log_likelihood(self):
    r"""Compute the gradient (wrt hyperparameters) of the _log_likelihood_type measure at the specified hyperparameters.

    .. NOTE:: These comments are copied from LogMarginalLikelihoodEvaluator::ComputeGradLogLikelihood in gpp_model_selection.cpp.

    Computes ``\pderiv{log(p(y | X, \theta))}{\theta_k} = \frac{1}{2} * \alpha_i * \pderiv{K_{ij}}{\theta_k} * \alpha_j - \frac{1}{2}``
    ``* trace(K^{-1}_{ij} \pderiv{K_{ij}}{\theta_k})``

    Or equivalently, ``= \frac{1}{2} * trace([\alpha_i \alpha_j - K^{-1}_{ij}] * \pderiv{K_{ij}}{\theta_k})``,
    where ``\alpha_i = K^{-1}_{ij} * y_j``.

    :return: grad_log_likelihood: i-th entry is ``\pderiv{LL(y | X, \theta)}{\theta_i}``
    :rtype: array of float64 with shape (num_hyperparameters)

    """
    covariance_matrix = python_utils.build_covariance_matrix(
        self._covariance,
        self._points_sampled,
        noise_variance=self._points_sampled_noise_variance,
    )
    K_chol = scipy.linalg.cho_factor(covariance_matrix, lower=True, overwrite_a=True)
    K_inv_y = scipy.linalg.cho_solve(K_chol, self._points_sampled_value)
    grad_hyperparameter_cov_matrix = python_utils.build_hyperparameter_grad_covariance_matrix(
        self._covariance,
        self._points_sampled,
    )

    grad_log_marginal = numpy.empty(self.num_hyperparameters)
    for k in xrange(self.num_hyperparameters):
        grad_cov_block = grad_hyperparameter_cov_matrix[..., k]

        # compute 0.5 * \alpha^T * \pderiv{K}{\theta_k} * \alpha, where \alpha = K^-1 * y (aka K_inv_y)
        # temp_vec := \pderiv{K}{\theta_k} * K_inv_y
        temp_vec = numpy.dot(grad_cov_block, K_inv_y)
        # 0.5 * K_inv_y^T * temp_vec
        grad_log_marginal[k] = 0.5 * numpy.dot(K_inv_y, temp_vec)

        # compute -0.5 * tr(K^-1 * \pderiv{K}{\theta_k})
        temp = scipy.linalg.cho_solve(K_chol, grad_cov_block, overwrite_b=True)
        grad_log_marginal[k] -= 0.5 * temp.trace()
        # TODO(GH-180): this can be much faster if we form K^-1 explicitly (see below), but that is less accurate
        # grad_log_marginal[k] -= 0.5 * numpy.einsum('ij,ji', K_inv, grad_cov_block)

    return grad_log_marginal
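# --- Illustrative sketch (not part of the class above) ---
# A concrete instance of the gradient loop above for a kernel with a single
# hyperparameter (the length scale of a squared-exponential kernel -- an
# assumption for this demo), checked against a central finite difference of
# the log marginal likelihood:
import numpy
import scipy.linalg


def _kernel_and_grad(X, length):
    """Return K (plus jitter) and \\pderiv{K_{ij}}{length} for k = exp(-0.5 * d^2 / length^2)."""
    diff = X[:, None] - X[None, :]
    K = numpy.exp(-0.5 * diff ** 2 / length ** 2)
    dK = K * diff ** 2 / length ** 3  # jitter is constant in length, so it has zero gradient
    return K + 1e-10 * numpy.eye(X.size), dK


def log_likelihood(X, y, length):
    K, _ = _kernel_and_grad(X, length)
    K_chol = scipy.linalg.cho_factor(K, lower=True)
    K_inv_y = scipy.linalg.cho_solve(K_chol, y)
    return (-0.5 * numpy.inner(y, K_inv_y)
            - numpy.log(K_chol[0].diagonal()).sum()
            - 0.5 * X.size * numpy.log(2.0 * numpy.pi))


def grad_log_likelihood(X, y, length):
    K, dK = _kernel_and_grad(X, length)
    K_chol = scipy.linalg.cho_factor(K, lower=True)
    alpha = scipy.linalg.cho_solve(K_chol, y)  # \alpha = K^-1 * y
    # 0.5 * \alpha^T * dK * \alpha - 0.5 * tr(K^-1 * dK)
    return (0.5 * numpy.dot(alpha, numpy.dot(dK, alpha))
            - 0.5 * scipy.linalg.cho_solve(K_chol, dK).trace())


X = numpy.array([0.0, 0.4, 1.0, 1.7])
y = numpy.sin(X)
h = 1e-6
print(grad_log_likelihood(X, y, 0.8))  # analytic gradient
print((log_likelihood(X, y, 0.8 + h) - log_likelihood(X, y, 0.8 - h)) / (2 * h))  # should agree closely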
def _build_precomputed_data(self):
    """Set up precomputed data: Cholesky factorization of K, K^-1 * y, K^-1 itself,
    the elementwise product K_C = C * K^-1, and the marginal mean matrices.
    """
    if self.num_sampled == 0:
        self._K_chol = numpy.array([])
        self._K_inv_y = numpy.array([])
    else:
        covariance_matrix = python_utils.build_covariance_matrix(
            self._covariance,
            self._points_sampled,
            noise_variance=self._points_sampled_noise_variance,
        )
        C = self._build_integrated_term_maxtrix(self._covariance, self._points_sampled)
        # the explicit inverse is needed for the elementwise product below; it must be
        # computed before cho_factor destroys covariance_matrix (overwrite_a=True)
        self._K_Inv = numpy.linalg.inv(covariance_matrix)
        # elementwise (Hadamard) product, not a matrix product
        self._K_C = numpy.multiply(C, self._K_Inv)
        self._K_chol = scipy.linalg.cho_factor(covariance_matrix, lower=True, overwrite_a=True)
        self._K_inv_y = scipy.linalg.cho_solve(self._K_chol, self._points_sampled_value)
        self._marginal_mean_mat = self._build_marginal_matrix_mean()
        self._marginal_mean_mat_gradient = self._build_marginal_matrix_mean_gradient()
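# --- Illustrative sketch (not part of the class above) ---
# ``numpy.multiply`` is the elementwise (Hadamard) product, so
# ``self._K_C[i, j] == C[i, j] * K_Inv[i, j]``. A small demo, which also shows
# that K^-1 can be reproduced from the Cholesky factorization (generally the
# better-conditioned route when only K^-1 * b products are needed):
import numpy
import scipy.linalg

rng = numpy.random.RandomState(1)
A = rng.randn(4, 4)
K = numpy.dot(A, A.T) + 4.0 * numpy.eye(4)  # SPD by construction
C = rng.randn(4, 4)

K_inv = numpy.linalg.inv(K)
K_C = numpy.multiply(C, K_inv)
print(numpy.allclose(K_C, C * K_inv))  # True: `*` on arrays is also elementwise

K_chol = scipy.linalg.cho_factor(K, lower=True)
print(numpy.allclose(K_inv, scipy.linalg.cho_solve(K_chol, numpy.eye(4))))  # True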
def compute_log_likelihood(self):
    r"""Compute the _log_likelihood_type measure at the specified hyperparameters.

    .. NOTE:: These comments are copied from LogMarginalLikelihoodEvaluator::ComputeLogLikelihood in gpp_model_selection.cpp.

    ``log p(y | X, \theta) = -\frac{1}{2} * y^T * K^-1 * y - \frac{1}{2} * \log(det(K)) - \frac{n}{2} * \log(2*pi)``

    where n is ``num_sampled``, ``\theta`` are the hyperparameters, and ``\log`` is the natural logarithm. In the following,

    ``term1 = -\frac{1}{2} * y^T * K^-1 * y``

    ``term2 = -\frac{1}{2} * \log(det(K))``

    ``term3 = -\frac{n}{2} * \log(2*pi)``

    For an SPD matrix ``K = L * L^T``, ``det(K) = \Pi_i L_ii^2``. We could compute this directly and then take a
    logarithm. But we also know ``\log(det(K)) = 2 * \sum_i \log(L_ii)``. The latter method is (currently) preferred
    for computing ``\log(det(K))`` due to reduced chance of overflow and (possibly) better numerical conditioning.

    :return: value of log_likelihood evaluated at hyperparameters (``LL(y | X, \theta)``)
    :rtype: float64

    """
    covariance_matrix = python_utils.build_covariance_matrix(
        self._covariance,
        self._points_sampled,
        noise_variance=self._points_sampled_noise_variance,
    )
    K_chol = scipy.linalg.cho_factor(covariance_matrix, lower=True, overwrite_a=True)
    log_marginal_term2 = -numpy.log(K_chol[0].diagonal()).sum()

    K_inv_y = scipy.linalg.cho_solve(K_chol, self._points_sampled_value)
    log_marginal_term1 = -0.5 * numpy.inner(self._points_sampled_value, K_inv_y)

    log_marginal_term3 = -0.5 * numpy.float64(self._points_sampled_value.size) * numpy.log(2.0 * numpy.pi)
    return log_marginal_term1 + log_marginal_term2 + log_marginal_term3
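# --- Illustrative sketch (not part of the class above) ---
# Why term2 is computed from the Cholesky diagonal: for large, well-scaled K
# the determinant itself overflows float64, while
# ``\log(det(K)) = 2 * \sum_i \log(L_ii)`` stays finite.
import numpy

n = 500
K = 10.0 * numpy.eye(n) + numpy.full((n, n), 0.01)  # SPD; det(K) = 15 * 10^499

print(numpy.linalg.det(K))  # inf: the determinant overflows float64

L = numpy.linalg.cholesky(K)
print(2.0 * numpy.log(L.diagonal()).sum())  # finite log-determinant, ~1151.7

# numpy.linalg.slogdet uses the same idea and agrees:
print(numpy.linalg.slogdet(K))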