def __call__(self, X, Y=None, eval_gradient=False):
    """Return the kernel k(X, Y) and optionally its gradient.

    Parameters
    ----------
    X : array, shape (n_samples_X, n_features)
        Left argument of the returned kernel k(X, Y)

    Y : array, shape (n_samples_Y, n_features), (optional, default=None)
        Right argument of the returned kernel k(X, Y). If None, k(X, X)
        is evaluated instead.

    eval_gradient : bool (optional, default=False)
        Determines whether the gradient with respect to the kernel
        hyperparameter is computed. Only supported when Y is None.

    Returns
    -------
    K : array, shape (n_samples_X, n_samples_Y)
        Kernel k(X, Y)

    K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
        The gradient of the kernel k(X, X) with respect to the
        hyperparameter of the kernel. Only returned when eval_gradient
        is True.
    """
    prototypes_std = self.prototypes.std(0)
    n_prototypes = self.prototypes.shape[0]
    n_gradient_dim = \
        n_prototypes + (0 if self.hyperparameter_gamma.fixed else 1)

    X = np.atleast_2d(X)
    if Y is not None and eval_gradient:
        raise ValueError("Gradient can only be evaluated when Y is None.")

    if Y is None:
        K = np.eye(X.shape[0]) * self.diag(X)
        if eval_gradient:
            K_gradient = np.zeros((K.shape[0], K.shape[0], n_gradient_dim))
            # Analytic gradient of the diagonal entries w.r.t. sigma_2[i]
            K_pairwise = pairwise_kernels(
                self.prototypes / prototypes_std,
                X / prototypes_std,
                metric="rbf", gamma=self.gamma)
            for i in range(n_prototypes):
                for j in range(K.shape[0]):
                    K_gradient[j, j, i] = (self.sigma_2[i] * K_pairwise[i, j]
                                           / K_pairwise[:, j].sum())
            if not self.hyperparameter_gamma.fixed:
                # XXX: Analytic expression for gradient?
                def f(gamma):  # helper function
                    theta = self.theta.copy()
                    theta[-1] = gamma[0]
                    return self.clone_with_theta(theta)(X, Y)
                K_gradient[:, :, -1] = \
                    _approx_fprime([self.theta[-1]], f, 1e-5)[:, :, 0]
            return K, K_gradient
        else:
            return K
    else:
        K = np.zeros((X.shape[0], Y.shape[0]))
        return K  # XXX: similar entries?
def __call__(self, XX1, XX2=None, eval_gradient=False):
    """Return the kernel k(XX1, XX2) and optionally its gradient.

    Parameters
    ----------
    XX1 : array, shape (n_samples_XX1, n_features)
        Left argument of the returned kernel k(XX1, XX2)

    XX2 : array, shape (n_samples_XX2, n_features), (optional, default=None)
        Right argument of the returned kernel k(XX1, XX2). If None,
        k(XX1, XX1) is evaluated instead.

    eval_gradient : bool (optional, default=False)
        Determines whether the gradient with respect to the kernel
        hyperparameter is computed. Only supported when XX2 is None.

    Returns
    -------
    K : array, shape (n_samples_XX1, n_samples_XX2)
        Kernel k(XX1, XX2)

    K_gradient : array (opt.), shape (n_samples_XX1, n_samples_XX1, n_dims)
        The gradient of the kernel k(XX1, XX1) with respect to the
        hyperparameter of the kernel. Only returned when eval_gradient
        is True.
    """
    XX1 = np.atleast_2d(XX1)
    length_scale = _check_length_scale(XX1, self.length_scale)
    if XX2 is None:
        K = full_kernel(XX1, length_scale, self.n_XX_func,
                        return_code=self.return_code)
    else:
        if eval_gradient:
            raise ValueError(
                "Gradient can only be evaluated when XX2 is None.")
        K = full_kernel(XX1, length_scale, self.n_XX_func, XX2,
                        self.return_code)

    if not eval_gradient:
        return K

    if self.hyperparameter_length_scale.fixed:
        # Hyperparameter l kept fixed
        length_scale_gradient = np.empty((K.shape[0], K.shape[1], 0))
    else:
        # approximate gradient numerically
        def f(gamma):  # helper function
            return full_kernel(XX1, gamma, self.n_XX_func,
                               return_code=self.return_code)
        length_scale = np.atleast_1d(length_scale)
        length_scale_gradient = _approx_fprime(length_scale, f, 1e-8)
    return K, length_scale_gradient
def __call__(self, X, Y=None, eval_gradient=False):
    l_train = 10**self.gp_l.predict(X)

    # Prepare distances and length-scale information for every pair of
    # datapoints whose correlation is to be computed
    if Y is not None:
        # Get pairwise component-wise L1-differences to the input
        # training set
        d = Y[:, np.newaxis, :] - X[np.newaxis, :, :]
        d = d.reshape((-1, Y.shape[1]))
        # Predict length scales for query datapoints
        l_query = 10**self.gp_l.predict(Y)
        l = np.transpose([np.tile(l_train, len(l_query)),
                          np.repeat(l_query, len(l_train))])
    else:
        # No external datapoints given; auto-correlation of training set
        # is used instead
        d = X[:, np.newaxis, :] - X[np.newaxis, :, :]
        d = d.reshape((-1, X.shape[1]))
        l = np.transpose([np.tile(l_train, len(l_train)),
                          np.repeat(l_train, len(l_train))])  # XXX: check

    # Compute general Matern kernel
    if d.ndim > 1 and self.theta_gp.size == d.ndim:
        activation = \
            np.sum(self.theta_gp.reshape(1, d.ndim) * d**2, axis=1)
    else:
        activation = self.theta_gp[0] * np.sum(d**2, axis=1)
    tmp = 0.5 * (l**2).sum(1)
    tmp2 = np.maximum(2 * np.sqrt(self.nu * activation / tmp), 1e-5)
    k = np.sqrt(l[:, 0]) * np.sqrt(l[:, 1]) \
        / (gamma(self.nu) * 2**(self.nu - 1))
    k /= np.sqrt(tmp)
    k *= tmp2**self.nu * kv(self.nu, tmp2)

    # Convert correlations to 2d matrix
    if Y is not None:
        return k.reshape(-1, X.shape[0]).T
    else:  # exploit symmetry of auto-correlation
        K = k.reshape(X.shape[0], X.shape[0])
        if not eval_gradient:
            return K
        else:
            # approximate gradient numerically
            # XXX: compute gradient analytically?
            def f(theta):  # helper function
                return self.clone_with_theta(theta)(X, Y)
            return K, _approx_fprime(self.weights, f, 1e-7)
def __call__(self, X, Y=None, eval_gradient=False):
    l_train = 10**self.gp_l.predict(X)

    # Prepare distances and length-scale information for every pair of
    # datapoints whose correlation is to be computed
    if Y is not None:
        # Get pairwise component-wise L1-differences to the input
        # training set
        d = Y[:, np.newaxis, :] - X[np.newaxis, :, :]
        d = d.reshape((-1, Y.shape[1]))
        # Predict length scales for query datapoints
        l_query = 10**self.gp_l.predict(Y)
        l = np.transpose([np.tile(l_train, len(l_query)),
                          np.repeat(l_query, len(l_train))])
    else:
        # No external datapoints given; auto-correlation of training set
        # is used instead
        d = X[:, np.newaxis, :] - X[np.newaxis, :, :]
        d = d.reshape((-1, X.shape[1]))
        l = np.transpose([np.tile(l_train, len(l_train)),
                          np.repeat(l_train, len(l_train))])  # XXX: check

    # Compute general Matern kernel
    if d.ndim > 1 and self.theta_gp.size == d.ndim:
        activation = \
            np.sum(self.theta_gp.reshape(1, d.ndim) * d**2, axis=1)
    else:
        activation = self.theta_gp[0] * np.sum(d**2, axis=1)
    tmp = 0.5 * (l**2).sum(1)
    tmp2 = np.maximum(2 * np.sqrt(self.nu * activation / tmp), 1e-5)
    k = np.sqrt(l[:, 0]) * np.sqrt(l[:, 1]) \
        / (gamma(self.nu) * 2**(self.nu - 1))
    k /= np.sqrt(tmp)
    k *= tmp2**self.nu * kv(self.nu, tmp2)

    # Convert correlations to 2d matrix
    if Y is not None:
        return k.reshape(-1, X.shape[0]).T
    else:  # exploit symmetry of auto-correlation
        K = k.reshape(X.shape[0], X.shape[0])
        if not eval_gradient:
            return K
        else:
            # approximate gradient numerically
            import copy  # XXX: Avoid deepcopy

            def f(theta):  # helper function
                kernel = copy.deepcopy(self)
                kernel.theta = theta
                return kernel(X)
            return K, _approx_fprime(self.params, f, 1e-5)
def __call__(self, XX1, XX2=None, eval_gradient=False):
    """Return the kernel k(XX1, XX2) and optionally its gradient.

    Parameters
    ----------
    XX1 : array, shape (n_samples_XX1, n_features)
        Left argument of the returned kernel k(XX1, XX2)

    XX2 : array, shape (n_samples_XX2, n_features), (optional, default=None)
        Right argument of the returned kernel k(XX1, XX2). If None,
        k(XX1, XX1) is evaluated instead.

    eval_gradient : bool (optional, default=False)
        Determines whether the gradient with respect to the kernel
        hyperparameter is computed. Only supported when XX2 is None.

    Returns
    -------
    K : array, shape (n_samples_XX1, n_samples_XX2)
        Kernel k(XX1, XX2)

    K_gradient : array (opt.), shape (n_samples_XX1, n_samples_XX1, n_dims)
        The gradient of the kernel k(XX1, XX1) with respect to the
        hyperparameter of the kernel. Only returned when eval_gradient
        is True.
    """
    XX1 = np.atleast_2d(XX1)
    hyperparams = np.squeeze(self.length_scale).astype(float)
    if XX2 is None:
        K = full_multilevel_kernel(XX1, hyperparams,
                                   self.nsamples_per_model,
                                   self.return_code != 'full')
    else:
        if eval_gradient:
            raise ValueError(
                "Gradient can only be evaluated when XX2 is None.")
        K = full_multilevel_kernel_for_prediction(XX1, XX2, hyperparams,
                                                  self.nsamples_per_model)

    if not eval_gradient:
        return K

    if self.hyperparameter_length_scale.fixed:
        # Hyperparameter l kept fixed
        length_scale_gradient = np.empty((K.shape[0], K.shape[1], 0))
    else:
        # approximate gradient numerically
        def f(gamma):  # helper function
            return full_multilevel_kernel(XX1, gamma,
                                          self.nsamples_per_model)
        length_scale = np.atleast_1d(self.length_scale)
        length_scale_gradient = _approx_fprime(length_scale, f, 1e-8)
    return K, length_scale_gradient
def test_kernel_gradient(kernel):
    # Compare analytic and numeric gradient of kernels.
    K, K_gradient = kernel(X, eval_gradient=True)

    assert_equal(K_gradient.shape[0], X.shape[0])
    assert_equal(K_gradient.shape[1], X.shape[0])
    assert_equal(K_gradient.shape[2], kernel.theta.shape[0])

    def eval_kernel_for_theta(theta):
        kernel_clone = kernel.clone_with_theta(theta)
        K = kernel_clone(X, eval_gradient=False)
        return K

    K_gradient_approx = \
        _approx_fprime(kernel.theta, eval_kernel_for_theta, 1e-10)

    assert_almost_equal(K_gradient, K_gradient_approx, 4)
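# For reference, a forward-difference helper consistent with how
# _approx_fprime is called throughout these snippets. This is a sketch
# modeled on the matrix-valued helper used in scikit-learn's
# Gaussian-process tests; the exact implementation imported by the
# snippets above may differ.
import numpy as np


def _approx_fprime(xk, f, epsilon, args=()):
    # f maps a parameter vector xk to a matrix; the result stacks one
    # forward difference (f(xk + eps * e_k) - f(xk)) / eps per
    # parameter along a trailing axis.
    xk = np.asarray(xk, dtype=float)
    f0 = f(*((xk,) + args))
    grad = np.zeros(f0.shape + (len(xk),), float)
    ei = np.zeros((len(xk),), float)
    for k in range(len(xk)):
        ei[k] = 1.0
        d = epsilon * ei
        grad[..., k] = (f(*((xk + d,) + args)) - f0) / d[k]
        ei[k] = 0.0
    return grad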
def __call__(self, X, Y=None, eval_gradient=False):
    X_nn = self._project_manifold(X)
    if Y is None:
        K = self.base_kernel(X_nn)
        if not eval_gradient:
            return K
        else:
            # approximate gradient numerically
            # XXX: Analytic expression for gradient based on chain rule
            #      and backpropagation?
            def f(theta):  # helper function
                return self.clone_with_theta(theta)(X, Y)
            return K, _approx_fprime(self.theta, f, 1e-5)
    else:
        if eval_gradient:
            raise ValueError(
                "Gradient can only be evaluated when Y is None.")
        Y_nn = self._project_manifold(Y)
        return self.base_kernel(X_nn, Y_nn)
def test_fd_kernel_1d(self):
    num_pts = 4
    length_scale = 1
    Xf = np.linspace(0., 1., num_pts)
    Y = Xf.copy()
    K_ff = kernel_ff(Xf[:, np.newaxis], Xf[:, np.newaxis], length_scale)
    # Reconstruct the function-derivative kernel by finite differences:
    # perturb one input point at a time and accumulate the resulting
    # row of d k(x_i, y_j) / d x_i.
    K_fd_fd = np.zeros_like(K_ff)
    for ii in range(num_pts):
        def f(x):
            Xf[ii] = x[0]
            return kernel_ff(Xf[:, np.newaxis], Y[:, np.newaxis],
                             length_scale)
        grad_ii = _approx_fprime([Xf[ii]], f, 1e-8)
        K_fd_fd += grad_ii.reshape(K_ff.shape)
    K_fd = kernel_fd(Xf[:, np.newaxis], Xf[:, np.newaxis], length_scale, 0)
    assert np.allclose(K_fd, K_fd_fd)
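# The helpers exercised above are not shown; the following is an assumed
# sketch for illustration only: kernel_ff is taken to be a
# squared-exponential covariance between function values, and kernel_fd
# its derivative with respect to the dim-th coordinate of the first
# argument, which is what the finite-difference loop in the test
# reconstructs.
import numpy as np


def kernel_ff(X1, X2, length_scale):
    # k(x, y) = exp(-||x - y||^2 / (2 * l^2))
    sq_dists = ((X1[:, np.newaxis, :] - X2[np.newaxis, :, :])**2).sum(-1)
    return np.exp(-0.5 * sq_dists / length_scale**2)


def kernel_fd(X1, X2, length_scale, dim):
    # d/dx_dim k(x, y) = -(x_dim - y_dim) / l^2 * k(x, y)
    diff = X1[:, np.newaxis, dim] - X2[np.newaxis, :, dim]
    return -diff / length_scale**2 * kernel_ff(X1, X2, length_scale)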
def test_weighted_white_kernel_gradient():
    # Compare analytic and numeric gradient of the kernel:
    N = 3
    X = np.random.RandomState(0).normal(0, 1, (N, 1))
    weight = np.exp(np.random.RandomState(0).normal(0, 1, N))
    kernel = WeightedWhiteKernel(noise_weight=1. / weight, noise_level=0.1)
    K, K_gradient = kernel(X, eval_gradient=True)

    assert_equal(K_gradient.shape[0], X.shape[0])
    assert_equal(K_gradient.shape[1], X.shape[0])
    assert_equal(K_gradient.shape[2], kernel.theta.shape[0])

    def eval_kernel_for_theta(theta):
        kernel_clone = kernel.clone_with_theta(theta)
        K = kernel_clone(X, eval_gradient=False)
        return K

    K_gradient_approx = \
        _approx_fprime(kernel.theta, eval_kernel_for_theta, 1e-10)

    assert_almost_equal(K_gradient, K_gradient_approx, 4)
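# The class under test is not shown here; below is a minimal sketch of
# a WeightedWhiteKernel consistent with the test above (an assumption,
# not the project's actual implementation). scikit-learn optimizes
# theta = log(noise_level), so the chain rule gives
# dK/dtheta = noise_level * dK/d(noise_level) = K, i.e. the analytic
# gradient is K itself along a trailing axis.
import numpy as np
from sklearn.gaussian_process.kernels import Hyperparameter, Kernel


class WeightedWhiteKernel(Kernel):
    def __init__(self, noise_weight, noise_level=1.0,
                 noise_level_bounds=(1e-5, 1e5)):
        self.noise_weight = noise_weight
        self.noise_level = noise_level
        self.noise_level_bounds = noise_level_bounds

    @property
    def hyperparameter_noise_level(self):
        return Hyperparameter("noise_level", "numeric",
                              self.noise_level_bounds)

    def __call__(self, X, Y=None, eval_gradient=False):
        X = np.atleast_2d(X)
        if Y is not None:
            if eval_gradient:
                raise ValueError(
                    "Gradient can only be evaluated when Y is None.")
            # white noise is uncorrelated across distinct inputs
            return np.zeros((X.shape[0], Y.shape[0]))
        K = self.noise_level * np.diag(self.noise_weight)
        if eval_gradient:
            if self.hyperparameter_noise_level.fixed:
                return K, np.empty((X.shape[0], X.shape[0], 0))
            return K, K[:, :, np.newaxis]  # dK/d log(noise_level) = K
        return K

    def diag(self, X):
        return self.noise_level * np.asarray(self.noise_weight)

    def is_stationary(self):
        return False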
def __call__(self, X, Y=None, eval_gradient=False):
    """Return the kernel k(X, Y) and optionally its gradient.

    Parameters
    ----------
    X : array, shape (n_samples_X, n_features)
        Left argument of the returned kernel k(X, Y)

    Y : array, shape (n_samples_Y, n_features), (optional, default=None)
        Right argument of the returned kernel k(X, Y). If None, k(X, X)
        is evaluated instead.

    eval_gradient : bool (optional, default=False)
        Determines whether the gradient with respect to the kernel
        hyperparameter is computed. Only supported when Y is None.

    Returns
    -------
    K : array, shape (n_samples_X, n_samples_Y)
        Kernel k(X, Y)

    K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
        The gradient of the kernel k(X, X) with respect to the
        hyperparameter of the kernel. Only returned when eval_gradient
        is True.
    """
    X_values = X[:, 0].reshape(-1, 1)
    length_scale = _check_length_scale(X_values, self.length_scale)
    if Y is None:
        dists = pdist(X_values / length_scale, metric='euclidean')
    else:
        if eval_gradient:
            raise ValueError(
                "Gradient can only be evaluated when Y is None.")
        Y_values = Y[:, 0].reshape(-1, 1)
        dists = cdist(X_values / length_scale, Y_values / length_scale,
                      metric='euclidean')

    if self.nu == 0.5:
        K = np.exp(-dists)
    elif self.nu == 1.5:
        K = dists * math.sqrt(3)
        K = (1. + K) * np.exp(-K)
    elif self.nu == 2.5:
        K = dists * math.sqrt(5)
        K = (1. + K + K**2 / 3.0) * np.exp(-K)
    else:  # general case; expensive to evaluate
        K = dists
        K[K == 0.0] += np.finfo(float).eps  # strict zeros result in nan
        tmp = (math.sqrt(2 * self.nu) * K)
        K.fill((2**(1. - self.nu)) / gamma(self.nu))
        K *= tmp**self.nu
        K *= kv(self.nu, tmp)

    if Y is None:
        # convert from upper-triangular matrix to square matrix
        K = squareform(K)
        np.fill_diagonal(K, 1)

    if eval_gradient:
        if self.hyperparameter_length_scale.fixed:
            # Hyperparameter l kept fixed
            K_gradient = np.empty(
                (X_values.shape[0], X_values.shape[0], 0))
            return K, K_gradient

        # We need to recompute the pairwise dimension-wise distances
        if self.anisotropic:
            D = (X_values[:, np.newaxis, :]
                 - X_values[np.newaxis, :, :])**2 / (length_scale**2)
        else:
            D = squareform(dists**2)[:, :, np.newaxis]

        if self.nu == 0.5:
            K_gradient = K[..., np.newaxis] * D \
                / np.sqrt(D.sum(2))[:, :, np.newaxis]
            K_gradient[~np.isfinite(K_gradient)] = 0
        elif self.nu == 1.5:
            K_gradient = \
                3 * D * np.exp(-np.sqrt(3 * D.sum(-1)))[..., np.newaxis]
        elif self.nu == 2.5:
            tmp = np.sqrt(5 * D.sum(-1))[..., np.newaxis]
            K_gradient = 5.0 / 3.0 * D * (tmp + 1) * np.exp(-tmp)
        else:
            # approximate gradient numerically; Y is None in this
            # branch, so evaluate the clone on X_values alone
            def f(theta):  # helper function
                return self.clone_with_theta(theta)(X_values)
            return K, _approx_fprime(self.theta, f, 1e-10)

        if not self.anisotropic:
            return K, K_gradient[:, :].sum(-1)[:, :, np.newaxis]
        else:
            return K, K_gradient
    else:
        return K
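# Usage sketch: the analytic nu = 0.5 / 1.5 / 2.5 branches above can be
# cross-checked against the numeric fallback in the same way as
# test_kernel_gradient. Illustrated here with scikit-learn's public
# Matern kernel (the variant above differs in that it only uses the
# first input column):
import numpy as np
from sklearn.gaussian_process.kernels import Matern

X = np.random.RandomState(0).normal(size=(5, 2))
for nu in [0.5, 1.5, 2.5]:
    kernel = Matern(length_scale=1.0, nu=nu)
    K, K_gradient = kernel(X, eval_gradient=True)
    # one hyperparameter (isotropic length scale) -> trailing axis of 1
    assert K_gradient.shape == (5, 5, 1)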