def get_cond_params(learned_params: dict, x: Array, y: Array, jitter: float = 1e-5) -> dict:
    params = deepcopy(learned_params)
    n_samples = x.shape[0]

    # calculate the cholesky factorization
    Kuu = rbf_kernel(params["x_u"], params["x_u"], params["variance"], params["length_scale"])
    Kuu = add_to_diagonal(Kuu, jitter)
    Luu = cholesky(Kuu, lower=True)

    Kuf = rbf_kernel(params["x_u"], x, params["variance"], params["length_scale"])

    W = solve_triangular(Luu, Kuf, lower=True)
    D = np.ones(n_samples) * params["obs_noise"]
    W_Dinv = W / D
    K = W_Dinv @ W.T
    K = add_to_diagonal(K, 1.0)
    L = cholesky(K, lower=True)

    # mean function
    y_residual = y
    y_2D = y_residual.reshape(-1, n_samples).T
    W_Dinv_y = W_Dinv @ y_2D

    return {"Luu": Luu, "W_Dinv_y": W_Dinv_y, "L": L}

def model(self, batch):
    XL, XH = batch['XL'], batch['XH']
    y = batch['y']
    NL, NH = XL.shape[0], XH.shape[0]
    D = XH.shape[1]

    # set uninformative log-normal priors for low-fidelity kernel
    var_L = sample('kernel_var_L', dist.LogNormal(0.0, 1.0), sample_shape=(1,))
    length_L = sample('kernel_length_L', dist.LogNormal(0.0, 1.0), sample_shape=(D,))
    theta_L = np.concatenate([var_L, length_L])

    # set uninformative log-normal priors for high-fidelity kernel
    var_H = sample('kernel_var_H', dist.LogNormal(0.0, 1.0), sample_shape=(1,))
    length_H = sample('kernel_length_H', dist.LogNormal(0.0, 1.0), sample_shape=(D,))
    theta_H = np.concatenate([var_H, length_H])

    # prior for rho
    rho = sample('rho', dist.Normal(0.0, 10.0), sample_shape=(1,))

    # Compute kernels
    K_LL = self.kernel(XL, XL, theta_L) + np.eye(NL) * 1e-8
    K_LH = rho * self.kernel(XL, XH, theta_L)
    K_HH = rho**2 * self.kernel(XH, XH, theta_L) + \
           self.kernel(XH, XH, theta_H) + np.eye(NH) * 1e-8
    K = np.vstack((np.hstack((K_LL, K_LH)),
                   np.hstack((K_LH.T, K_HH))))
    L = cholesky(K, lower=True)

    # Generate latent function
    beta_L = sample('beta_L', dist.Normal(0.0, 1.0))
    beta_H = sample('beta_H', dist.Normal(0.0, 1.0))
    eta_L = sample('eta_L', dist.Normal(0.0, 1.0), sample_shape=(NL,))
    eta_H = sample('eta_H', dist.Normal(0.0, 1.0), sample_shape=(NH,))
    beta = np.concatenate([beta_L * np.ones(NL), beta_H * np.ones(NH)])
    eta = np.concatenate([eta_L, eta_H])
    f = np.matmul(L, eta) + beta

    # Bernoulli likelihood
    sample('y', dist.Bernoulli(logits=f), obs=y)

def posterior_sample(self, key, sample, X_star, **kwargs):
    # Fetch training data
    batch = kwargs['batch']
    X = batch['X']

    # Fetch params
    var = sample['kernel_var']
    length = sample['kernel_length']
    beta = sample['beta']
    eta = sample['eta']
    theta = np.concatenate([var, length])

    # Compute kernels
    K_xx = self.kernel(X, X, theta) + np.eye(X.shape[0]) * 1e-8
    k_pp = self.kernel(X_star, X_star, theta) + np.eye(X_star.shape[0]) * 1e-8
    k_pX = self.kernel(X_star, X, theta)
    L = cholesky(K_xx, lower=True)

    f = np.matmul(L, eta) + beta
    tmp_1 = solve_triangular(L.T, solve_triangular(L, f, lower=True))
    tmp_2 = solve_triangular(L.T, solve_triangular(L, k_pX.T, lower=True))

    # Compute predictive mean
    mu = np.matmul(k_pX, tmp_1)
    cov = k_pp - np.matmul(k_pX, tmp_2)
    std = np.sqrt(np.clip(np.diag(cov), a_min=0.))
    sample = mu + std * random.normal(key, mu.shape)
    return mu, sample

def get_cond_params(
    kernel, params: dict, x: Array, y: Array, jitter: float = 1e-5
) -> dict:
    params = deepcopy(params)
    x_u = params.pop("x_u")
    obs_noise = params.pop("obs_noise")
    kernel = kernel(**params)
    n_samples = x.shape[0]

    # calculate the cholesky factorization
    Luu, W, D = vfe_precompute(x, x_u, obs_noise, kernel, jitter=jitter)

    W_Dinv = W.T / D
    K = W_Dinv @ W
    K = add_to_diagonal(K, 1.0)
    L = cholesky(K, lower=True)

    # mean function
    y_residual = y
    y_2D = y_residual.reshape(-1, n_samples).T
    W_Dinv_y = W_Dinv @ y_2D

    return {
        "X": x,
        "y": y,
        "Luu": Luu,
        "L": L,
        "W_Dinv_y": W_Dinv_y,
        "x_u": x_u,
        "kernel_params": params,
        "obs_noise": obs_noise,
        "kernel": kernel,
    }

def _gaussian_kernel_eval(in_log, points, values, xi, precision):
    points, values, xi, precision = _promote_dtypes_inexact(
        points, values, xi, precision)
    d = points.shape[1]

    if xi.shape[1] != d:
        raise ValueError("points and xi must have same trailing dim")
    if precision.shape != (d, d):
        raise ValueError("precision matrix must match data dims")

    whitening = linalg.cholesky(precision, lower=True)
    points = jnp.dot(points, whitening)
    xi = jnp.dot(xi, whitening)
    log_norm = jnp.sum(jnp.log(
        jnp.diag(whitening))) - 0.5 * d * jnp.log(2 * np.pi)

    def kernel(x_test, x_train, y_train):
        arg = log_norm - 0.5 * jnp.sum(jnp.square(x_train - x_test))
        if in_log:
            return jnp.log(y_train) + arg
        else:
            return y_train * jnp.exp(arg)

    reduce = special.logsumexp if in_log else jnp.sum
    reduced_kernel = lambda x: reduce(vmap(kernel, in_axes=(None, 0, 0))
                                      (x, points, values), axis=0)
    mapped_kernel = vmap(reduced_kernel)

    return mapped_kernel(xi)

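# Usage sketch (illustrative, not from the source above): this helper appears to
# implement the whitened-Cholesky kernel evaluation behind
# jax.scipy.stats.gaussian_kde; the public API can be exercised roughly as follows.
import jax
import jax.numpy as jnp
from jax.scipy.stats import gaussian_kde

key = jax.random.PRNGKey(0)
data = jax.random.normal(key, (200,))      # toy 1-D dataset
kde = gaussian_kde(data)
grid = jnp.linspace(-3.0, 3.0, 101)
density = kde(grid)                        # evaluate the density on a grid
log_density = kde.logpdf(grid)             # log-density (the in_log code path)
print(density.shape, log_density.shape)
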
def variational_expectation(self, y, post_mean, post_cov, cubature=None):
    """
    Compute the expected log-likelihood E[log p(yₙ|fₙ)] and its derivatives
    w.r.t. the posterior mean and covariance via Gauss-Hermite cubature.
    """
    num_components = int(post_mean.shape[0] / 2)
    if cubature is None:
        x, w = gauss_hermite(num_components, 20)  # Gauss-Hermite sigma points and weights
    else:
        x, w = cubature(num_components)
    # subband_mean, modulator_mean = post_mean[:num_components], self.link_fn(post_mean[num_components:])
    subband_mean, modulator_mean = post_mean[:num_components], post_mean[num_components:]  # TODO: CHECK
    subband_cov = post_cov[:num_components, :num_components]
    modulator_cov = post_cov[num_components:, num_components:]
    sigma_points = cholesky(modulator_cov) @ x + modulator_mean
    modulator_var = np.diag(subband_cov)[..., None]

    mu = (self.link_fn(sigma_points).T @ subband_mean)[:, 0]
    lognormpdf = -0.5 * np.log(2 * np.pi * self.variance) - 0.5 * (y - mu) ** 2 / self.variance
    const = -0.5 / self.variance * (self.link_fn(sigma_points).T ** 2 @ modulator_var)[:, 0]
    exp_log_lik = np.sum(w * (lognormpdf + const))

    dE1 = np.sum(w * self.link_fn(sigma_points) * (y - mu) / self.variance, axis=-1)
    dE2 = np.sum(w * (sigma_points - modulator_mean) * modulator_var ** -1
                 * (lognormpdf + const), axis=-1)
    dE_dm = np.block([dE1, dE2])[..., None]

    d2E1 = np.sum(w * -0.5 * self.link_fn(sigma_points) ** 2 / self.variance, axis=-1)
    d2E2 = np.sum(w * 0.5 * (
        ((sigma_points - modulator_mean) * modulator_var ** -1) ** 2 - modulator_var ** -1
    ) * (lognormpdf + const), axis=-1)
    dE_dv = np.diag(np.block([d2E1, d2E2]))

    return exp_log_lik, dE_dm, dE_dv

def mll(ds: Dataset):
    x, y = ds.X, ds.y

    params = {}
    for iname, iparam in numpyro_params.items():
        if iparam["param_type"] == "prior":
            params[iname] = numpyro.sample(name=iname, fn=iparam["prior"])
        else:
            params[iname] = numpyro.param(
                name=iname,
                init_value=iparam["init_value"],
                constraint=iparam["constraint"],
            )

    # get mean function
    mu = gp.prior.mean_function(x)

    # covariance function
    gram_matrix = gram(gp.prior.kernel, x, params)
    gram_matrix += params["obs_noise"] * I(x.shape[0])

    # lower-triangular Cholesky factor, used as the scale_tril
    L = cholesky(gram_matrix, lower=True)

    return numpyro.sample(
        "y",
        dist.MultivariateNormal(loc=mu, scale_tril=L),
        obs=y.squeeze(),
    )

def GP(X, y):
    X = numpyro.deterministic("X", X)

    # Set informative priors on kernel hyperparameters.
    η = numpyro.sample("variance", dist.HalfCauchy(scale=5.0))
    ℓ = numpyro.sample("length_scale", dist.Gamma(2.0, 1.0))
    σ = numpyro.sample("obs_noise", dist.HalfCauchy(scale=5.0))

    # Compute kernel
    K = rbf_kernel(X, X, η, ℓ)
    K = add_to_diagonal(K, σ)
    K = add_to_diagonal(K, wandb.config.jitter)

    # cholesky decomposition
    Lff = numpyro.deterministic("Lff", cholesky(K, lower=True))

    # Sample y according to the standard gaussian process formula
    return numpyro.sample(
        "y",
        dist.MultivariateNormal(loc=jnp.zeros(X.shape[0]), scale_tril=Lff)
        .expand_by(y.shape[:-1])  # for multioutput scenarios
        .to_event(y.ndim - 1),
        obs=y,
    )

def _multivariate_normal(key, mean, cov, shape, dtype): """Sample multivariate normal random values with given shape, mean, and covariance. Args: key: a PRNGKey used as the random key. mean: optional, a scalar or array of mean values along each dimension cov: optional, a scalar (isotropic), vector (diagonal covariance matrix), or full covariance matrix shape: optional, a tuple of nonnegative integers representing the shape. Returns: A random array with latent dimension of (max(asarray(mean).ndim, asarray(cov).ndim)),) """ _check_shape("multivariate_normal", shape) if hasattr(mean, "shape") and mean.ndim > 1: raise ValueError("Mean cannot have more than 1 dimension.") if hasattr(cov, "shape") and cov.ndim > 0: if cov.ndim > 2: raise ValueError( "Covariance matrix cannot have more than 2 dimensions.") shape = shape + cov.shape[:1] normal_samples = normal(key, shape, dtype) if cov.ndim == 2: samples = np.tensordot(normal_samples, cholesky(cov), axes=1) else: samples = normal_samples * np.sqrt(cov) else: if hasattr(mean, "shape") and mean.ndim > 0: shape = shape + mean.shape[:1] normal_samples = normal(key, shape, dtype) samples = np.sqrt(cov) * normal_samples return samples + mean
def evaluate(self):
    K = self.model.kernel.function(self.model.X, self.model.parameters) \
        + jnp.eye(self.N) * (self.model.parameters["noise"] + 1e-8)
    self.L = cholesky(K, lower=True)
    self.alpha = solve_triangular(
        self.L.T, solve_triangular(self.L, self.model.y, lower=True))

def observation_model(self, f, sigma): """ The implicit observation model is: h(fₙ,rₙ) = E[yₙ|fₙ] + √Cov[yₙ|fₙ] σₙ """ conditional_expectation, conditional_covariance = self.conditional_moments(f) obs_model = conditional_expectation + cholesky(conditional_covariance) @ sigma return np.squeeze(obs_model)
def precompute(X, obs_noise, kernel, jitter):
    # Kernel
    Kff = kernel.gram(X)
    Kff = add_to_diagonal(Kff, obs_noise)
    Kff = add_to_diagonal(Kff, jitter)
    Lff = cholesky(Kff, lower=True)
    return Lff

def observation_model(self, f, sigma): """ TODO: sort out broadcasting so we don't need this additional function (only difference is the transpose) The implicit observation model is: h(fₙ,rₙ) = E[yₙ|fₙ] + √Cov[yₙ|fₙ] σₙ """ conditional_expectation, conditional_covariance = self.conditional_moments(f) obs_model = conditional_expectation + cholesky(conditional_covariance.T) @ sigma return np.squeeze(obs_model)
def moment_match(self, y, cav_mean, cav_cov, hyp=None, power=1.0, cubature_func=None):
    """
    Power-EP moment matching via Gauss-Hermite cubature: returns the log
    normaliser log Zₙ and the approximate likelihood (site) mean and covariance.
    """
    num_components = int(cav_mean.shape[0] / 2)
    if cubature_func is None:
        x, w = gauss_hermite(num_components, 20)  # Gauss-Hermite sigma points and weights
    else:
        x, w = cubature_func(num_components)

    subband_mean = cav_mean[:num_components]
    modulator_mean = self.link_fn(cav_mean[num_components:])
    subband_cov = cav_cov[:num_components, :num_components]
    modulator_cov = cav_cov[num_components:, num_components:]
    sigma_points = cholesky(modulator_cov) @ x + modulator_mean

    const = power ** -0.5 * (2 * pi * hyp) ** (0.5 - 0.5 * power)
    mu = (self.link_fn(sigma_points).T @ subband_mean)[:, 0]
    var = hyp / power + (self.link_fn(sigma_points).T ** 2 @ np.diag(subband_cov)[..., None])[:, 0]
    normpdf = const * (2 * pi * var) ** -0.5 * np.exp(-0.5 * (y - mu) ** 2 / var)
    Z = np.sum(w * normpdf)
    Zinv = 1. / (Z + 1e-8)
    lZ = np.log(Z + 1e-8)

    dZ1 = np.sum(w * self.link_fn(sigma_points) * (y - mu) / var * normpdf, axis=-1)
    dZ2 = np.sum(w * (sigma_points - modulator_mean)
                 * np.diag(modulator_cov)[..., None] ** -1 * normpdf, axis=-1)
    dlZ = Zinv * np.block([dZ1, dZ2])

    d2Z1 = np.sum(w * self.link_fn(sigma_points) ** 2
                  * (((y - mu) / var) ** 2 - var ** -1) * normpdf, axis=-1)
    d2Z2 = np.sum(w * (((sigma_points - modulator_mean) * np.diag(modulator_cov)[..., None] ** -1) ** 2
                       - np.diag(modulator_cov)[..., None] ** -1) * normpdf, axis=-1)
    d2lZ = np.diag(-dlZ ** 2 + Zinv * np.block([d2Z1, d2Z2]))

    id2lZ = inv(ensure_positive_precision(-d2lZ) - 1e-10 * np.eye(d2lZ.shape[0]))
    site_mean = cav_mean + id2lZ @ dlZ[..., None]  # approx. likelihood (site) mean (see Rasmussen & Williams p75)
    site_cov = power * (-cav_cov + id2lZ)  # approx. likelihood (site) variance
    return lZ, site_mean, site_cov

def compute_cholesky(self, params, batch):
    X = batch['X']
    N, D = X.shape

    # Fetch params
    sigma_n = params[-1]
    theta = params[:-1]

    # Compute kernel
    K = self.kernel(X, X, theta) + np.eye(N) * (sigma_n + 1e-8)
    L = cholesky(K, lower=True)
    return L

def vfe_precompute(X, X_u, obs_noise, kernel, jitter: float = 1e-5):
    # Kernel
    Kuu = kernel.gram(X_u)
    Kuu = add_to_diagonal(Kuu, jitter)
    Kuf = kernel.cross_covariance(X_u, X)

    # calculate cholesky
    Luu = cholesky(Kuu, lower=True)

    # compute W
    W = solve_triangular(Luu, Kuf, lower=True).T

    # compute D
    D = jnp.ones(Kuf.shape[1]) * obs_noise

    return Luu, W, D

def fitc_precompute(X, X_u, obs_noise, kernel, jitter: float = 1e-5):
    # Kernel
    Kuu = kernel.gram(X_u)
    Kuu = add_to_diagonal(Kuu, jitter)
    Kuf = kernel.cross_covariance(X_u, X)

    # calculate cholesky
    Luu = cholesky(Kuu, lower=True)

    # compute W
    W = solve_triangular(Luu, Kuf, lower=True).T

    # compute D: FITC keeps the exact diagonal of Kff via the correction diag(Kff) - diag(Qff)
    Kffdiag = kernel.diag(X)
    Qffdiag = jnp.power(W, 2).sum(axis=1)
    D = Kffdiag - Qffdiag + obs_noise

    return Luu, W, D

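# Illustrative sketch (not from the original source): the two precompute helpers
# above differ only in D — VFE uses a constant noise vector, while FITC adds the
# diagonal correction diag(Kff) - diag(Qff). Rewritten here self-contained with
# an inline toy RBF kernel so the difference can be inspected directly.
import jax.numpy as jnp
from jax.scipy.linalg import cholesky, solve_triangular

def rbf(X1, X2, lengthscale=0.2, variance=1.0):
    sqdist = jnp.sum((X1[:, None, :] - X2[None, :, :]) ** 2, axis=-1)
    return variance * jnp.exp(-0.5 * sqdist / lengthscale ** 2)

X = jnp.linspace(0, 1, 50).reshape(-1, 1)    # training inputs
X_u = jnp.linspace(0, 1, 10).reshape(-1, 1)  # inducing inputs
obs_noise, jitter = 0.1, 1e-5

Kuu = rbf(X_u, X_u) + jitter * jnp.eye(X_u.shape[0])
Kuf = rbf(X_u, X)
Luu = cholesky(Kuu, lower=True)
W = solve_triangular(Luu, Kuf, lower=True).T

D_vfe = jnp.ones(X.shape[0]) * obs_noise                            # VFE
D_fitc = jnp.diag(rbf(X, X)) - (W ** 2).sum(axis=1) + obs_noise     # FITC
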
def variational_expectation_cubature(self, y, post_mean, post_cov, hyp=None, cubature_func=None):
    """
    Computes the "variational expectation" via cubature, i.e. the expected
    log-likelihood, and its derivatives w.r.t. the posterior mean
        E[log p(yₙ|fₙ)] = ∫ log p(yₙ|fₙ) 𝓝(fₙ|mₙ,vₙ) dfₙ
    with EP power a.
    :param y: observed data (yₙ) [scalar]
    :param post_mean: posterior mean (mₙ) [scalar]
    :param post_cov: posterior variance (vₙ) [scalar]
    :param hyp: likelihood hyperparameter [scalar]
    :param cubature_func: the function to compute sigma points and weights to use during cubature
    :return:
        exp_log_lik: the expected log likelihood, E[log p(yₙ|fₙ)] [scalar]
        dE_dm: derivative of E[log p(yₙ|fₙ)] w.r.t. mₙ [scalar]
        dE_dv: derivative of E[log p(yₙ|fₙ)] w.r.t. vₙ [scalar]
    """
    if cubature_func is None:
        x, w = gauss_hermite(post_mean.shape[0], 20)  # Gauss-Hermite sigma points and weights
    else:
        x, w = cubature_func(post_mean.shape[0])
    # fsigᵢ=xᵢ√(vₙ) + mₙ: scale locations according to cavity dist.
    sigma_points = cholesky(post_cov) @ np.atleast_2d(x) + post_mean
    # pre-compute wᵢ log p(yₙ|fsigᵢ)
    weighted_log_likelihood_eval = w * self.evaluate_log_likelihood(y, sigma_points, hyp)
    # Compute expected log likelihood via cubature:
    # E[log p(yₙ|fₙ)] = ∫ log p(yₙ|fₙ) 𝓝(fₙ|mₙ,vₙ) dfₙ
    #                 ≈ ∑ᵢ wᵢ log p(yₙ|fsigᵢ)
    exp_log_lik = np.sum(weighted_log_likelihood_eval)
    # Compute first derivative via cubature:
    # dE[log p(yₙ|fₙ)]/dmₙ = ∫ (fₙ-mₙ) vₙ⁻¹ log p(yₙ|fₙ) 𝓝(fₙ|mₙ,vₙ) dfₙ
    #                      ≈ ∑ᵢ wᵢ (fsigᵢ-mₙ) vₙ⁻¹ log p(yₙ|fsigᵢ)
    invv = np.diag(post_cov)[:, None] ** -1
    dE_dm = np.sum(invv * (sigma_points - post_mean)
                   * weighted_log_likelihood_eval, axis=-1)[:, None]
    # Compute second derivative via cubature (deriv. w.r.t. var = 0.5 * 2nd deriv. w.r.t. mean):
    # dE[log p(yₙ|fₙ)]/dvₙ = ∫ [(fₙ-mₙ)² vₙ⁻² - vₙ⁻¹]/2 log p(yₙ|fₙ) 𝓝(fₙ|mₙ,vₙ) dfₙ
    #                      ≈ ∑ᵢ wᵢ [(fsigᵢ-mₙ)² vₙ⁻² - vₙ⁻¹]/2 log p(yₙ|fsigᵢ)
    dE_dv = np.sum(
        (0.5 * (invv ** 2 * (sigma_points - post_mean) ** 2) - 0.5 * invv)
        * weighted_log_likelihood_eval, axis=-1)
    dE_dv = np.diag(dE_dv)
    return exp_log_lik, dE_dm, dE_dv

def log_likelihood(self, params):
    self.model.set_parameters(params)
    kx = self.model.kernel.function(self.model.X, params) \
        + jnp.eye(self.N) * (params["noise"] + 1e-8)
    L = cholesky(kx, lower=True)
    alpha = solve_triangular(L.T, solve_triangular(L, self.model.y, lower=True))
    W_logdet = 2. * jnp.sum(jnp.log(jnp.diag(L)))
    log_marginal = 0.5 * (-self.model.y.size * log_2_pi
                          - self.model.y.shape[1] * W_logdet
                          - jnp.sum(alpha * self.model.y))
    return log_marginal

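# Standalone sketch (not part of the original source): the same Cholesky-based
# pattern used by `evaluate` and `log_likelihood` above, written as a single
# function on toy data with an inline RBF kernel so it runs with only jax installed.
import jax.numpy as jnp
from jax.scipy.linalg import cholesky, solve_triangular

def toy_log_marginal(X, y, lengthscale=1.0, variance=1.0, noise=0.1):
    # K = k(X, X) + (noise + jitter) I
    sqdist = jnp.sum((X[:, None, :] - X[None, :, :]) ** 2, axis=-1)
    K = variance * jnp.exp(-0.5 * sqdist / lengthscale ** 2) \
        + jnp.eye(X.shape[0]) * (noise + 1e-8)
    L = cholesky(K, lower=True)
    # alpha = K^{-1} y via two triangular solves
    alpha = solve_triangular(L.T, solve_triangular(L, y, lower=True))
    # log |K| = 2 * sum(log(diag(L)))
    logdet = 2.0 * jnp.sum(jnp.log(jnp.diag(L)))
    return -0.5 * (y @ alpha + logdet + y.size * jnp.log(2 * jnp.pi))

X = jnp.linspace(0, 1, 20).reshape(-1, 1)
y = jnp.sin(3 * X).squeeze()
print(toy_log_marginal(X, y))
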
def statistical_linear_regression(self, cav_mean, cav_cov, hyp=None, cubature_func=None):
    """
    This gives the same result as above - delete
    """
    num_components = int(cav_mean.shape[0] / 2)
    if cubature_func is None:
        x, w = gauss_hermite(num_components, 20)  # Gauss-Hermite sigma points and weights
    else:
        x, w = cubature_func(num_components)

    subband_mean = cav_mean[:num_components]
    modulator_mean = self.link_fn(cav_mean[num_components:])
    subband_cov = cav_cov[:num_components, :num_components]
    modulator_cov = cav_cov[num_components:, num_components:]
    sigma_points = cholesky(modulator_cov) @ x + modulator_mean
    lik_expectation = (self.link_fn(sigma_points).T @ subband_mean).T
    lik_covariance = hyp

    # Compute zₙ via cubature:
    # muₙ = ∫ E[yₙ|fₙ] 𝓝(fₙ|mₙ,vₙ) dfₙ
    #     ≈ ∑ᵢ wᵢ E[yₙ|fsigᵢ]
    mu = np.sum(w * lik_expectation, axis=-1)[:, None]
    # Compute variance S via cubature:
    # S = ∫ [(E[yₙ|fₙ]-zₙ) (E[yₙ|fₙ]-zₙ)' + Cov[yₙ|fₙ]] 𝓝(fₙ|mₙ,vₙ) dfₙ
    #   ≈ ∑ᵢ wᵢ [(E[yₙ|fsigᵢ]-zₙ) (E[yₙ|fsigᵢ]-zₙ)' + Cov[yₙ|fₙ]]
    S = np.sum(w * ((lik_expectation - mu) * (lik_expectation - mu) + lik_covariance),
               axis=-1)[:, None]
    # Compute cross covariance C via cubature:
    # C = ∫ (fₙ-mₙ) (E[yₙ|fₙ]-zₙ)' 𝓝(fₙ|mₙ,vₙ) dfₙ
    #   ≈ ∑ᵢ wᵢ (fsigᵢ -mₙ) (E[yₙ|fsigᵢ]-zₙ)'
    C = np.sum(w * np.block([[self.link_fn(sigma_points) * np.diag(subband_cov)[..., None]],
                             [sigma_points - modulator_mean]])
               * (lik_expectation - mu), axis=-1)[:, None]
    # Compute derivative of mu via cubature:
    omega = np.sum(w * np.block([[self.link_fn(sigma_points)],
                                 [np.diag(modulator_cov)[..., None] ** -1
                                  * (sigma_points - modulator_mean) * lik_expectation]]),
                   axis=-1)[None, :]
    return mu, S, C, omega

def expected_log_likelihood(self, y, m, v, cubature=None):
    """
    Compute the expected log-likelihood E[log p(yₙ|fₙ)] via cubature under a
    2-D Gaussian posterior with mean m and covariance v.
    """
    if cubature is None:
        x, w = gauss_hermite(2, 20)  # Gauss-Hermite sigma points and weights
    else:
        x, w = cubature(2)
    # symmetrise the covariance before factorising
    v = (v + v.T) / 2
    sigma_points = cholesky(v) @ x + m  # fsigᵢ=xᵢ√(vₙ) + mₙ: scale locations according to cavity dist.
    # Compute expected log likelihood via cubature:
    # E[log p(yₙ|fₙ)] = ∫ log p(yₙ|fₙ) 𝓝(fₙ|mₙ,vₙ) dfₙ
    #                 ≈ ∑ᵢ wᵢ log p(yₙ|fsigᵢ)
    exp_log_lik = np.sum(w * self.evaluate_log_likelihood(y, sigma_points))
    return exp_log_lik

def model(self, batch):
    X = batch['X']
    y = batch['y']
    N, D = X.shape

    # set uninformative log-normal priors
    var = sample('kernel_var', dist.LogNormal(0.0, 1.0), sample_shape=(1,))
    length = sample('kernel_length', dist.LogNormal(0.0, 1.0), sample_shape=(D,))
    theta = np.concatenate([var, length])

    # compute kernel
    K = self.kernel(X, X, theta) + np.eye(N) * 1e-8
    L = cholesky(K, lower=True)

    # Generate latent function
    beta = sample('beta', dist.Normal(0.0, 1.0))
    eta = sample('eta', dist.Normal(0.0, 1.0), sample_shape=(N,))
    f = np.matmul(L, eta) + beta

    # Bernoulli likelihood
    sample('y', dist.Bernoulli(logits=f), obs=y)

def posterior_sample(self, key, sample, X_star, **kwargs):
    # Fetch training data
    batch = kwargs['batch']
    XL, XH = batch['XL'], batch['XH']
    NL, NH = XL.shape[0], XH.shape[0]

    # Fetch params
    var_L = sample['kernel_var_L']
    var_H = sample['kernel_var_H']
    length_L = sample['kernel_length_L']
    length_H = sample['kernel_length_H']
    beta_L = sample['beta_L']
    beta_H = sample['beta_H']
    eta_L = sample['eta_L']
    eta_H = sample['eta_H']
    rho = sample['rho']
    theta_L = np.concatenate([var_L, length_L])
    theta_H = np.concatenate([var_H, length_H])
    beta = np.concatenate([beta_L * np.ones(NL), beta_H * np.ones(NH)])
    eta = np.concatenate([eta_L, eta_H])

    # Compute kernels
    k_pp = rho**2 * self.kernel(X_star, X_star, theta_L) + \
           self.kernel(X_star, X_star, theta_H) + \
           np.eye(X_star.shape[0]) * 1e-8
    psi1 = rho * self.kernel(X_star, XL, theta_L)
    psi2 = rho**2 * self.kernel(X_star, XH, theta_L) + \
           self.kernel(X_star, XH, theta_H)
    k_pX = np.hstack((psi1, psi2))

    # Compute K_xx
    K_LL = self.kernel(XL, XL, theta_L) + np.eye(NL) * 1e-8
    K_LH = rho * self.kernel(XL, XH, theta_L)
    K_HH = rho**2 * self.kernel(XH, XH, theta_L) + \
           self.kernel(XH, XH, theta_H) + np.eye(NH) * 1e-8
    K_xx = np.vstack((np.hstack((K_LL, K_LH)),
                      np.hstack((K_LH.T, K_HH))))
    L = cholesky(K_xx, lower=True)

    # Sample latent function
    f = np.matmul(L, eta) + beta
    tmp_1 = solve_triangular(L.T, solve_triangular(L, f, lower=True))
    tmp_2 = solve_triangular(L.T, solve_triangular(L, k_pX.T, lower=True))

    # Compute predictive mean
    mu = np.matmul(k_pX, tmp_1)
    cov = k_pp - np.matmul(k_pX, tmp_2)
    std = np.sqrt(np.clip(np.diag(cov), a_min=0.))
    sample = mu + std * random.normal(key, mu.shape)
    return mu, sample

def _build_prior(self, Xs, **kwargs):
    self.N = np.prod([len(X) for X in Xs])
    mu = self.mean_func(cartesian(Xs))

    chols = []
    for i, (cov, X) in enumerate(zip(self.cov_funcs, Xs)):
        Kxx = npy.deterministic(f"{self.name}_Kxx_{i}", cov(X))
        chol = cholesky(stabilize(Kxx), lower=True)
        chols.append(chol)

    # remove reparameterization option
    v = npy.sample(
        f"{self.name}_rotated",
        dist.Normal(loc=jnp.zeros(self.N), scale=jnp.ones(self.N), **kwargs),
    )
    f = npy.deterministic(self.name, mu + (kron_dot(chols, v)).reshape(-1))
    return f

def statistical_linear_regression_cubature(self, cav_mean, cav_cov, hyp=None, cubature_func=None):
    """
    Perform statistical linear regression (SLR) using cubature.
    We aim to find a likelihood approximation p(yₙ|fₙ) ≈ 𝓝(yₙ|Afₙ+b,Ω+Var[yₙ|fₙ]).
    TODO: this currently assumes an additive noise model (ok for our current
        applications), make more general
    """
    if cubature_func is None:
        x, w = gauss_hermite(cav_mean.shape[0], 20)  # Gauss-Hermite sigma points and weights
    else:
        x, w = cubature_func(cav_mean.shape[0])
    # fsigᵢ=xᵢ√(vₙ) + mₙ: scale locations according to cavity dist.
    sigma_points = cholesky(cav_cov) @ np.atleast_2d(x) + cav_mean
    lik_expectation, lik_covariance = self.conditional_moments(sigma_points, hyp)
    # Compute zₙ via cubature:
    # zₙ = ∫ E[yₙ|fₙ] 𝓝(fₙ|mₙ,vₙ) dfₙ
    #    ≈ ∑ᵢ wᵢ E[yₙ|fsigᵢ]
    mu = np.sum(w * lik_expectation, axis=-1)[:, None]
    # Compute variance S via cubature:
    # S = ∫ [(E[yₙ|fₙ]-zₙ) (E[yₙ|fₙ]-zₙ)' + Cov[yₙ|fₙ]] 𝓝(fₙ|mₙ,vₙ) dfₙ
    #   ≈ ∑ᵢ wᵢ [(E[yₙ|fsigᵢ]-zₙ) (E[yₙ|fsigᵢ]-zₙ)' + Cov[yₙ|fₙ]]
    # TODO: allow for multi-dim cubature
    S = np.sum(w * ((lik_expectation - mu) * (lik_expectation - mu) + lik_covariance),
               axis=-1)[:, None]
    # Compute cross covariance C via cubature:
    # C = ∫ (fₙ-mₙ) (E[yₙ|fₙ]-zₙ)' 𝓝(fₙ|mₙ,vₙ) dfₙ
    #   ≈ ∑ᵢ wᵢ (fsigᵢ -mₙ) (E[yₙ|fsigᵢ]-zₙ)'
    C = np.sum(w * (sigma_points - cav_mean) * (lik_expectation - mu), axis=-1)[:, None]
    # Compute derivative of z via cubature:
    # omega = ∫ E[yₙ|fₙ] vₙ⁻¹ (fₙ-mₙ) 𝓝(fₙ|mₙ,vₙ) dfₙ
    #       ≈ ∑ᵢ wᵢ E[yₙ|fsigᵢ] vₙ⁻¹ (fsigᵢ-mₙ)
    omega = np.sum(w * lik_expectation * (inv(cav_cov) @ (sigma_points - cav_mean)),
                   axis=-1)[None, :]
    return mu, S, C, omega

def log_density(self, y, mean, cov, cubature=None):
    """
    Compute the log marginal density log Zₙ ≈ log ∫ p(yₙ|fₙ) 𝓝(fₙ|mean,cov) dfₙ
    via Gauss-Hermite cubature.
    """
    num_components = int(mean.shape[0] / 2)
    if cubature is None:
        x, w = gauss_hermite(num_components, 20)  # Gauss-Hermite sigma points and weights
    else:
        x, w = cubature(num_components)
    # subband_mean, modulator_mean = mean[:num_components], self.link_fn(mean[num_components:])
    subband_mean, modulator_mean = mean[:num_components], mean[num_components:]  # TODO: CHECK
    subband_cov = cov[:num_components, :num_components]
    modulator_cov = cov[num_components:, num_components:]
    sigma_points = cholesky(modulator_cov) @ x + modulator_mean

    mu = (self.link_fn(sigma_points).T @ subband_mean)[:, 0]
    var = self.variance + (self.link_fn(sigma_points).T ** 2 @ np.diag(subband_cov)[..., None])[:, 0]
    normpdf = (2 * np.pi * var) ** -0.5 * np.exp(-0.5 * (y - mu) ** 2 / var)
    Z = np.sum(w * normpdf)
    lZ = np.log(Z + 1e-8)
    return lZ

def _multivariate_normal(key, mean, cov, shape, dtype):
    if not onp.ndim(mean) >= 1:
        msg = "multivariate_normal requires mean.ndim >= 1, got mean.ndim == {}"
        raise ValueError(msg.format(onp.ndim(mean)))
    if not onp.ndim(cov) >= 2:
        msg = "multivariate_normal requires cov.ndim >= 2, got cov.ndim == {}"
        raise ValueError(msg.format(onp.ndim(cov)))
    n = mean.shape[-1]
    if onp.shape(cov)[-2:] != (n, n):
        msg = ("multivariate_normal requires cov.shape == (..., n, n) for n={n}, "
               "but got cov.shape == {shape}.")
        raise ValueError(msg.format(n=n, shape=onp.shape(cov)))

    if shape is None:
        shape = lax.broadcast_shapes(mean.shape[:-1], cov.shape[:-2])
    else:
        _check_shape("normal", shape, mean.shape[:-1], cov.shape[:-2])

    chol_factor = cholesky(cov)
    normal_samples = normal(key, shape + mean.shape[-1:], dtype)
    return mean + np.tensordot(normal_samples, chol_factor, [-1, 1])

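# Usage sketch (illustrative, not from the source above): this private helper
# appears to back jax.random.multivariate_normal, which draws correlated samples
# by multiplying standard normals with the Cholesky factor of the covariance.
import jax
import jax.numpy as jnp

key = jax.random.PRNGKey(0)
mean = jnp.zeros(3)
cov = jnp.array([[2.0, 0.5, 0.0],
                 [0.5, 1.0, 0.3],
                 [0.0, 0.3, 1.5]])
samples = jax.random.multivariate_normal(key, mean, cov, shape=(1000,))
print(samples.shape)  # (1000, 3)
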
def _newton_iteration(y_train, K, f):
    # the "Line" comments appear to refer to Algorithm 3.1 in Rasmussen & Williams (GPML)
    pi = expit(f)
    W = pi * (1 - pi)  # Line 5
    W_sr = np.sqrt(W)
    W_sr_K = W_sr[:, np.newaxis] * K
    B = np.eye(W.shape[0]) + W_sr_K * W_sr
    L = cholesky(B, lower=True)  # Line 6
    b = W * f + (y_train - pi)  # Line 7
    a = b - W_sr * cho_solve((L, True), W_sr_K.dot(b))  # Line 8
    f = K.dot(a)

    # Line 10: Compute log marginal likelihood in loop and use as
    #          convergence criterion
    lml = -0.5 * a.T.dot(f) \
        - np.log1p(np.exp(-(y_train * 2 - 1) * f)).sum() \
        - np.log(np.diag(L)).sum()

    return lml, f, (pi, W_sr, L, b, a)

def _conditional(self, params, *args, **kwargs):
    delta = params[self.gp].squeeze() - params[self.mean].squeeze()
    chols = [
        cholesky(stabilize(params[Kxx].squeeze()), lower=True)
        for _, Kxx in sorted(self.Kxx.items())
    ]
    cholTs = [chol.T for chol in chols]

    Kss = params[self.Kss].squeeze()
    Ksx = params[self.Ksx].squeeze()
    Kxs = Ksx.T

    alpha = kron_solve_lower(chols, delta)
    alpha = kron_solve_upper(cholTs, alpha)
    mu = jnp.dot(Ksx, alpha).ravel() + params[self.cond].squeeze()

    A = kron_solve_lower(chols, Kxs)
    cov = stabilize(Kss - jnp.dot(A.T, A))
    return mu, cov

def _build_conditional(self, params, pred_noise=False, diag=False):
    Kxx = params[self.Kxx].squeeze() + params[self.Knx].squeeze()
    Kxs = params[self.Ksx].T.squeeze()
    Knx = params[self.Knx].squeeze()
    rxx = params[self.y].squeeze() - params[self.mean].squeeze()

    L = cholesky(stabilize(Kxx) + Knx, lower=True)
    A = solve_lower(L, Kxs)
    v = solve_lower(L, rxx)
    mu = params[self.cond].squeeze() + A.T @ v

    if diag:
        Kss = jnp.diag(jnp.diag(params[self.Kss].squeeze()))
        var = Kss - jnp.sum(jnp.square(A), 0)
        if pred_noise:
            var += jnp.diag(jnp.diag(params[self.Kns].squeeze()))
        return mu, var
    else:
        Kss = params[self.Kss].squeeze()
        cov = Kss - A.T @ A
        if pred_noise:
            cov += params[self.Kns].squeeze()
        return mu, cov if pred_noise else stabilize(cov)