def __init__(self, train_x, train_y, likelihood, kernel='rbf', nu=2.5):
    super(GPModel, self).__init__(train_x, train_y, likelihood)
    grid_size = gpytorch.utils.grid.choose_grid_size(train_x)
    self.mean_module = gpytorch.means.ConstantMean()
    if kernel == 'rbf':
        self.covar_module = SKI(
            ScaleKernel(RBFKernel(ard_num_dims=2)),
            grid_size=grid_size, num_dims=2,
        )
    elif kernel == 'matern':
        self.covar_module = SKI(
            ScaleKernel(MaternKernel(nu=nu)),
            grid_size=grid_size, num_dims=2,
        )
def test_solve_vector(self):
    size = 100
    train_x = torch.linspace(0, 1, size)
    covar_matrix = RBFKernel()(train_x, train_x).evaluate()
    piv_chol = pivoted_cholesky.pivoted_cholesky(covar_matrix, 10)
    woodbury_factor = pivoted_cholesky.woodbury_factor(piv_chol, torch.ones(100))
    rhs_vector = torch.randn(100)
    shifted_covar_matrix = covar_matrix + torch.eye(size)
    real_solve = shifted_covar_matrix.inverse().matmul(rhs_vector)
    approx_solve = pivoted_cholesky.woodbury_solve(rhs_vector, piv_chol, woodbury_factor, torch.ones(100))
    self.assertTrue(approx_equal(approx_solve, real_solve, 2e-4))
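# The test above checks pivoted_cholesky.woodbury_solve against a dense solve.
# For reference, a minimal torch-only sketch of the identity being exercised
# (woodbury_solve_sketch is illustrative, not the gpytorch API): with D
# diagonal and L low-rank,
#   (D + L L^T)^{-1} b = D^{-1} b - D^{-1} L (I + L^T D^{-1} L)^{-1} L^T D^{-1} b.
def woodbury_solve_sketch(b, L, diag):
    Dinv_b = b / diag                                              # D^{-1} b
    Dinv_L = L / diag.unsqueeze(-1)                                # D^{-1} L
    cap = torch.eye(L.size(-1)) + L.transpose(-1, -2) @ Dinv_L     # capacitance matrix
    correction = Dinv_L @ (cap.inverse() @ (L.transpose(-1, -2) @ Dinv_b))
    return Dinv_b - correction

# e.g. woodbury_solve_sketch(rhs_vector, piv_chol, torch.ones(100)) should
# approximately reproduce real_solve above.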
def __init__(self, inducing_points):
    '''
    By default, we use the VariationalStrategy class with a
    CholeskyVariationalDistribution. The CholeskyVariationalDistribution
    class allows S to be any positive semidefinite matrix. This is the most
    general/expressive option for approximate GPs.
    '''
    variational_distribution = CholeskyVariationalDistribution(inducing_points.size(-2))
    variational_strategy = VariationalStrategy(
        self, inducing_points, variational_distribution,
        learn_inducing_locations=True,
    )
    super().__init__(variational_strategy)
    self.mean = ConstantMean()
    self.covar = ScaleKernel(RBFKernel())
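# A minimal training sketch for the approximate GP above (assumes
# train_x/train_y tensors, a forward() returning a MultivariateNormal, and an
# illustrative class name ApproxGPModel; lr and step count are arbitrary):
model = ApproxGPModel(inducing_points=train_x[:50])
likelihood = gpytorch.likelihoods.GaussianLikelihood()
mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))
optimizer = torch.optim.Adam(list(model.parameters()) + list(likelihood.parameters()), lr=0.01)
for _ in range(200):
    optimizer.zero_grad()
    loss = -mll(model(train_x), train_y)
    loss.backward()
    optimizer.step()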
def __init__(self, inducing_points):
    '''
    A more extreme way to reduce the number of parameters is to get rid of S
    entirely. This corresponds to learning a delta distribution u = m rather
    than a multivariate Normal distribution for u. In other words, it
    corresponds to performing MAP estimation rather than variational
    inference.
    '''
    variational_distribution = DeltaVariationalDistribution(inducing_points.size(-2))
    variational_strategy = VariationalStrategy(
        self, inducing_points, variational_distribution,
        learn_inducing_locations=True,
    )
    super().__init__(variational_strategy)
    self.mean = ConstantMean()
    self.covar = ScaleKernel(RBFKernel())
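# Even with the point-mass q(u) above, the predictive q(f | x*) is still a
# Gaussian: its uncertainty comes from the conditional prior p(f | u = m)
# rather than from S. A quick check (sketch; delta_model and test_x are
# assumed, not defined here):
delta_model.eval()
with torch.no_grad():
    pred = delta_model(test_x)
    print(pred.mean.shape, pred.variance.min().item())  # variance does not collapse to zero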
def __init__(self, train_inputs, train_targets, likelihood, batch_size=1):
    super(ExactGPModel, self).__init__(train_inputs, train_targets, likelihood)
    self.mean_module = ConstantMean(
        batch_size=batch_size,
        prior=gpytorch.priors.SmoothedBoxPrior(-1, 1),
    )
    self.covar_module = ScaleKernel(
        RBFKernel(
            batch_size=batch_size,
            lengthscale_prior=gpytorch.priors.NormalPrior(
                loc=torch.zeros(batch_size, 1, 1),
                scale=torch.ones(batch_size, 1, 1),
            ),
        ),
        batch_size=batch_size,
        outputscale_prior=gpytorch.priors.SmoothedBoxPrior(-2, 2),
    )
def test_subset_active_compute_radial_basis_function(self):
    a = torch.Tensor([4, 2, 8]).view(3, 1)
    a_p = torch.Tensor([1, 2, 3]).view(3, 1)
    a = torch.cat((a, a_p), 1)
    b = torch.Tensor([0, 2]).view(2, 1)
    lengthscale = 2
    kernel = RBFKernel(active_dims=[0])
    kernel.initialize(log_lengthscale=math.log(lengthscale))
    kernel.eval()
    actual = torch.Tensor([[16, 4], [4, 0], [64, 36]]).mul_(-0.5).div_(lengthscale ** 2).exp()
    res = kernel(a, b).evaluate()
    self.assertLess(torch.norm(res - actual), 1e-5)
def __init__(self, grid_size=6, grid_bounds=[(-0.33, 1.33), (-0.33, 1.33)]):
    variational_distribution = gpytorch.variational.CholeskyVariationalDistribution(
        num_inducing_points=int(pow(grid_size, len(grid_bounds)))
    )
    variational_strategy = gpytorch.variational.GridInterpolationVariationalStrategy(
        self,
        grid_size=grid_size,
        grid_bounds=grid_bounds,
        variational_distribution=variational_distribution,
    )
    super(GPClassificationModel, self).__init__(variational_strategy)
    self.mean_module = ConstantMean(prior=SmoothedBoxPrior(-1e-5, 1e-5))
    self.covar_module = ScaleKernel(
        RBFKernel(
            ard_num_dims=2,
            lengthscale_prior=SmoothedBoxPrior(exp(-2.5), exp(3), sigma=0.1),
        )
    )
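# Training sketch for the grid-interpolation classification model above
# (assumes 2-d train_x within the grid bounds and binary train_y; the
# learning rate and step count are illustrative):
model = GPClassificationModel()
likelihood = gpytorch.likelihoods.BernoulliLikelihood()
mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.numel())
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
model.train()
likelihood.train()
for _ in range(100):
    optimizer.zero_grad()
    loss = -mll(model(train_x), train_y)
    loss.backward()
    optimizer.step()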
def test_computes_radial_basis_function_gradient():
    a = torch.Tensor([4, 2, 8]).view(3, 1)
    b = torch.Tensor([0, 2, 2]).view(3, 1)
    lengthscale = 2
    kernel = RBFKernel().initialize(log_lengthscale=math.log(lengthscale))
    param = Variable(torch.Tensor(3, 3).fill_(math.log(lengthscale)), requires_grad=True)
    diffs = Variable(a.expand(3, 3) - b.expand(3, 3).transpose(0, 1))
    actual_output = (-(diffs ** 2) / param.exp()).exp()
    actual_output.backward(torch.eye(3))
    actual_param_grad = param.grad.data.sum()
    output = kernel(Variable(a), Variable(b))
    output.backward(gradient=torch.eye(3))
    res = kernel.log_lengthscale.grad.data
    assert torch.norm(res - actual_param_grad) < 1e-5
def test_computes_radial_basis_function():
    a = torch.Tensor([4, 2, 8]).view(3, 1)
    b = torch.Tensor([0, 2, 2]).view(3, 1)
    lengthscale = 2
    kernel = RBFKernel()
    actual = torch.Tensor([
        [16, 4, 4],
        [4, 0, 0],
        [64, 36, 36],
    ]).mul_(-1).div_(lengthscale).exp()
    res = kernel(
        Variable(a), Variable(b),
        log_lengthscale=Variable(torch.Tensor([math.log(lengthscale)])),
    ).data
    assert torch.norm(res - actual) < 1e-5
def __init__(self, grid_size=16, grid_bounds=([-1, 1],)):
    variational_distribution = CholeskyVariationalDistribution(
        num_inducing_points=16, batch_shape=torch.Size([2])
    )
    variational_strategy = AdditiveGridInterpolationVariationalStrategy(
        self,
        grid_size=grid_size,
        grid_bounds=grid_bounds,
        num_dim=2,
        variational_distribution=variational_distribution,
    )
    super(GPClassificationModel, self).__init__(variational_strategy)
    self.mean_module = ConstantMean(prior=SmoothedBoxPrior(-1e-5, 1e-5))
    self.covar_module = ScaleKernel(
        RBFKernel(ard_num_dims=1, lengthscale_prior=SmoothedBoxPrior(exp(-5), exp(6), sigma=0.1)),
        outputscale_prior=SmoothedBoxPrior(exp(-5), exp(6), sigma=0.1),
    )
def __init__(self, inducing_points):
    r'''
    One way to reduce the number of parameters is to restrict $\mathbf S$ to
    be diagonal. This is less expressive, but the number of parameters is now
    linear in $m$ instead of quadratic. All we have to do is take the
    previous example and swap CholeskyVariationalDistribution for
    MeanFieldVariationalDistribution.
    '''
    variational_distribution = MeanFieldVariationalDistribution(inducing_points.size(-2))
    variational_strategy = VariationalStrategy(
        self, inducing_points, variational_distribution,
        learn_inducing_locations=True,
    )
    super().__init__(variational_strategy)
    self.mean = ConstantMean()
    self.covar = ScaleKernel(RBFKernel())
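# The diagonal restriction shows up directly in the stored parameter shapes:
# CholeskyVariationalDistribution keeps a full m x m factor, while
# MeanFieldVariationalDistribution keeps only m entries (sketch; m = 16 is
# arbitrary, and the printed parameter names are whatever gpytorch registers):
m = 16
for name, p in CholeskyVariationalDistribution(m).named_parameters():
    print(name, tuple(p.shape))  # expect a (16,) mean and a (16, 16) Cholesky factor
for name, p in MeanFieldVariationalDistribution(m).named_parameters():
    print(name, tuple(p.shape))  # expect a (16,) mean and a (16,) diagonal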
def test_computes_radial_basis_function(self):
    a = torch.tensor([4, 2, 8], dtype=torch.float).view(3, 1)
    b = torch.tensor([0, 2, 4], dtype=torch.float).view(3, 1)
    lengthscale = 2
    kernel = RBFKernel().initialize(log_lengthscale=math.log(lengthscale))
    kernel.eval()
    actual = torch.tensor([[16, 4, 0], [4, 0, 4], [64, 36, 16]], dtype=torch.float)
    actual.mul_(-0.5).div_(lengthscale ** 2).exp_()
    res = kernel(a, b).evaluate()
    self.assertLess(torch.norm(res - actual), 1e-5)
    # diag
    res = kernel(a, b).diag()
    actual = actual.diag()
    self.assertLess(torch.norm(res - actual), 1e-5)
def test_solve(self):
    size = 100
    train_x = torch.linspace(0, 1, size)
    covar_matrix = RBFKernel()(train_x, train_x).evaluate()
    piv_chol = pivoted_cholesky.pivoted_cholesky(covar_matrix, 10)
    woodbury_factor, inv_scale, logdet = woodbury.woodbury_factor(
        piv_chol, piv_chol, torch.ones(100), logdet=True
    )
    self.assertTrue(approx_equal(
        logdet,
        (piv_chol @ piv_chol.transpose(-1, -2) + torch.eye(100)).logdet(),
        2e-4,
    ))
    rhs_vector = torch.randn(100, 50)
    shifted_covar_matrix = covar_matrix + torch.eye(size)
    real_solve = shifted_covar_matrix.inverse().matmul(rhs_vector)
    scaled_inv_diag = (inv_scale / torch.ones(100)).unsqueeze(-1)
    approx_solve = woodbury.woodbury_solve(
        rhs_vector, piv_chol * scaled_inv_diag, woodbury_factor, scaled_inv_diag, inv_scale
    )
    self.assertTrue(approx_equal(approx_solve, real_solve, 2e-4))
def __init__(self, input_dims, output_dims, num_inducing=128, mean_type='constant'):
    if output_dims is None:
        inducing_points = torch.randn(num_inducing, input_dims)
        batch_shape = torch.Size([])
    else:
        inducing_points = torch.randn(output_dims, num_inducing, input_dims)
        batch_shape = torch.Size([output_dims])
    variational_distribution = CholeskyVariationalDistribution(
        num_inducing_points=num_inducing, batch_shape=batch_shape
    )
    variational_strategy = VariationalStrategy(
        self, inducing_points, variational_distribution,
        learn_inducing_locations=True,
    )
    super(DGPHiddenLayer, self).__init__(variational_strategy, input_dims, output_dims)
    if mean_type == 'constant':
        self.mean_module = ConstantMean(batch_shape=batch_shape)
    else:  # 'linear'
        self.mean_module = LinearMean(input_dims)
    # A constraint such as gpytorch.constraints.Interval(0.0001, 10.0) could be
    # used instead of (or alongside) the priors below; bounds must be floats.
    lengthscale_prior = gpytorch.priors.NormalPrior(0.1, 2.0)
    outputscale_prior = gpytorch.priors.NormalPrior(1.0, 3.0)
    lengthscale_constraint = None
    self.covar_module = ScaleKernel(
        RBFKernel(
            batch_shape=batch_shape,
            ard_num_dims=input_dims,
            lengthscale_constraint=lengthscale_constraint,
            lengthscale_prior=lengthscale_prior,
        ),
        outputscale_prior=outputscale_prior,
        batch_shape=batch_shape,
        ard_num_dims=input_dims,
    )
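# Layers like the one above are typically composed into a deep GP following
# the gpytorch DeepGP pattern. A two-layer sketch (class name and widths are
# illustrative; DGPHiddenLayer.forward is assumed to return a
# MultivariateNormal):
class TwoLayerDGP(gpytorch.models.deep_gps.DeepGP):
    def __init__(self, input_dims, hidden_dims=2):
        hidden_layer = DGPHiddenLayer(input_dims, hidden_dims, mean_type='linear')
        last_layer = DGPHiddenLayer(hidden_dims, None, mean_type='constant')
        super().__init__()
        self.hidden_layer = hidden_layer
        self.last_layer = last_layer
        self.likelihood = gpytorch.likelihoods.GaussianLikelihood()

    def forward(self, inputs):
        return self.last_layer(self.hidden_layer(inputs))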
def __init__(self, input_size, device='cpu'):
    if device == 'gpu' and torch.cuda.is_available():
        self.device = torch.device('cuda:0')
    else:
        self.device = torch.device('cpu')
    self.input_size = input_size
    _likelihood = GaussianLikelihood()
    super(GPRegressor, self).__init__(train_inputs=None, train_targets=None, likelihood=_likelihood)
    self.mean_module = ZeroMean()
    self.covar_module = ScaleKernel(RBFKernel())
    self.input_trans = None
    self.target_trans = None
def __init__(self, inducing_points, kernel=None):
    # q(u)
    variational_distribution = CholeskyVariationalDistribution(inducing_points.size(-1))
    # q(f|x) = ∫ q(f, u) du = ∫ q(f|u, x) q(u) du
    variational_strategy = VariationalStrategy(
        self, inducing_points, variational_distribution,
        learn_inducing_locations=True,
    )
    super().__init__(variational_strategy)
    self.mean_module = ConstantMean()
    if kernel is None:
        kernel = RBFKernel()
    self.covar_module = ScaleKernel(kernel)
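# Usage sketch: model(test_x) returns the marginal q(f | x*) from the comment
# above, and the likelihood maps it to observation space (assumes a trained
# model/likelihood pair; names are illustrative). Note that
# inducing_points.size(-1) above counts the inducing points only when the
# inputs are 1-d; for (m, d) inducing points, size(-2) is the usual choice.
model.eval()
likelihood.eval()
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    latent = model(test_x)           # q(f | x*)
    predictive = likelihood(latent)  # p(y* | x*)
    lower, upper = predictive.confidence_region()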
def __init__(self, train_inputs, train_targets, likelihood, batch_size=1):
    super(ExactGPModel, self).__init__(train_inputs, train_targets, likelihood)
    self.mean_module = MultitaskMean(
        ConstantMean(batch_size=batch_size, prior=gpytorch.priors.SmoothedBoxPrior(-1, 1)),
        num_tasks=2,
    )
    self.covar_module = MultitaskKernel(
        RBFKernel(
            batch_size=batch_size,
            log_lengthscale_prior=gpytorch.priors.NormalPrior(
                loc=torch.zeros(batch_size, 1, 1),
                scale=torch.ones(batch_size, 1, 1),
                log_transform=True,
            ),
        ),
        num_tasks=2,
        rank=1,
    )
def __init__(self, NN_num_inputs, pg_estimator, fisher_num_inputs=None, gp_likelihood=None):
    # fisher_num_inputs equals svd_low_rank, because the Fisher kernel is
    # linearly approximated through FastSVD.
    if pg_estimator == 'MC':
        nn.Module.__init__(self)
    else:
        gpytorch.models.ExactGP.__init__(self, None, None, gp_likelihood)
    self.NN_num_inputs = NN_num_inputs
    NN_num_outputs = 10
    self.feature_extractor = FeatureExtractor(NN_num_inputs, NN_num_outputs)
    # value_head computes the state-value function approximation V(s) and,
    # subsequently, GAE estimates.
    self.value_head = nn.Linear(NN_num_outputs, 1)
    self.value_head.weight.data.mul_(0.1)
    self.value_head.bias.data.mul_(0.0)
    if pg_estimator == 'BQ':
        grid_size = 128
        # Like value_head, the following constructs the GP head for the
        # action-value function approximation Q(s, a). Note that both V(s)
        # and Q(s, a) share the same feature extractor for the states "s".
        self.mean_module = gpytorch.means.ConstantMean()
        # The first NN_num_outputs indices of the GP input feed the state kernel;
        # indices [NN_num_outputs, GP_input.shape[1] - 1] feed the Fisher kernel.
        state_kernel_active_dims = torch.tensor(list(range(NN_num_outputs)))
        fisher_kernel_active_dims = torch.tensor(
            list(range(fisher_num_inputs + NN_num_outputs))[NN_num_outputs:]
        )
        self.covar_module_2 = LinearKernel(active_dims=fisher_kernel_active_dims)
        self.covar_module_1 = gpytorch.kernels.AdditiveStructureKernel(
            gpytorch.kernels.ScaleKernel(
                gpytorch.kernels.GridInterpolationKernel(
                    RBFKernel(ard_num_dims=1), grid_size=grid_size, num_dims=1
                )
            ),
            num_dims=NN_num_outputs,
            active_dims=state_kernel_active_dims,
        )
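# The comments above describe a GP input whose first NN_num_outputs columns
# feed the state kernel and whose remaining columns feed the linear Fisher
# kernel. A plausible forward for this layout (a sketch, not the original
# code) simply adds the two covariances:
def forward(self, x):
    mean_x = self.mean_module(x)
    covar_x = self.covar_module_1(x) + self.covar_module_2(x)
    return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)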
def test_ard_separate_batch(self):
    a = torch.tensor([[[1, 2, 3], [2, 4, 0]], [[-1, 1, 2], [2, 1, 4]]], dtype=torch.float)
    b = torch.tensor([[[1, 3, 1]], [[2, -1, 0]]], dtype=torch.float).repeat(1, 2, 1)
    lengthscales = torch.tensor([[[1, 2, 1]], [[2, 1, 0.5]]], dtype=torch.float)
    kernel = RBFKernel(batch_size=2, ard_num_dims=3)
    kernel.initialize(log_lengthscale=lengthscales.log())
    kernel.eval()
    scaled_a = a.div(lengthscales)
    scaled_b = b.div(lengthscales)
    actual = (scaled_a.unsqueeze(-2) - scaled_b.unsqueeze(-3)).pow(2).sum(dim=-1).mul_(-0.5).exp()
    res = kernel(a, b).evaluate()
    self.assertLess(torch.norm(res - actual), 1e-5)
    # diag
    res = kernel(a, b).diag()
    actual = torch.cat([actual[i].diag().unsqueeze(0) for i in range(actual.size(0))])
    self.assertLess(torch.norm(res - actual), 1e-5)
def __init__(self, stem, init_x, init_y, lr, **kwargs):
    super().__init__()
    self.stem = stem.to(init_x.device)
    if init_y.t().shape[0] != 1:
        _batch_shape = init_y.t().shape[:-1]
    else:
        _batch_shape = torch.Size()
    features = self.stem(init_x)
    self.gp = SingleTaskGP(
        features, init_y,
        covar_module=ScaleKernel(
            RBFKernel(batch_shape=_batch_shape, ard_num_dims=stem.output_dim),
            batch_shape=_batch_shape,
        ),
    )
    self.mll = ExactMarginalLogLikelihood(self.gp.likelihood, self.gp)
    self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
    self._raw_inputs = [init_x]
    self._target_batch_shape = _batch_shape
    self.target_dim = init_y.size(-1)
def __init__(self, input_size, target_size, device='cpu'):
    if device == 'gpu' and torch.cuda.is_available():
        self.device = torch.device('cuda:0')
    else:
        self.device = torch.device('cpu')
    self.input_size = input_size
    self.target_size = target_size
    _likelihood = MultitaskGaussianLikelihood(num_tasks=self.target_size)
    super(MultiTaskGPRegressor, self).__init__(train_inputs=None, train_targets=None, likelihood=_likelihood)
    self.mean_module = MultitaskMean(ZeroMean(), num_tasks=self.target_size)
    self.covar_module = MultitaskKernel(RBFKernel(), num_tasks=self.target_size, rank=1)
    self.input_trans = None
    self.target_trans = None
def create_bayesian_quadrature_iso_gauss():
    x1 = torch.from_numpy(np.array([[-1, 1], [0, 0], [-2, 0.1]]))
    x2 = torch.from_numpy(np.array([[-1, 1], [0, 0], [-2, 0.1], [-3, 3]]))
    M1 = x1.size()[0]
    M2 = x2.size()[0]
    D = x1.size()[1]
    prior_mean = torch.from_numpy(np.arange(D))[None, :]
    prior_variance = 2.
    rbf = RBFKernel()
    rbf.lengthscale = 1.
    kernel = ScaleKernel(rbf)
    kernel.outputscale = 1.
    bqkernel = QuadratureRBFGaussPrior(kernel, prior_mean, prior_variance)
    return bqkernel, x1, x2, M1, M2, D
def test_solve(self):
    size = 100
    train_x = Variable(torch.cat([
        torch.linspace(0, 1, size).unsqueeze(0),
        torch.linspace(0, 0.5, size).unsqueeze(0),
    ], 0)).unsqueeze(-1)
    covar_matrix = RBFKernel()(train_x, train_x).data
    piv_chol = pivoted_cholesky.pivoted_cholesky(covar_matrix, 10)
    woodbury_factor = pivoted_cholesky.woodbury_factor(piv_chol, 1)
    rhs_vector = torch.randn(2, 100, 5)
    shifted_covar_matrix = covar_matrix + torch.eye(size)
    real_solve = torch.cat([
        shifted_covar_matrix[0].inverse().matmul(rhs_vector[0]).unsqueeze(0),
        shifted_covar_matrix[1].inverse().matmul(rhs_vector[1]).unsqueeze(0),
    ], 0)
    approx_solve = pivoted_cholesky.woodbury_solve(rhs_vector, piv_chol, woodbury_factor, 1)
    self.assertTrue(approx_equal(approx_solve, real_solve))
def __init__(self, stem, init_x, init_y, alpha_eps, lr, **kwargs):
    stem = stem.to(init_x.device)
    transformed_y, _, sigma2_i = self._transform_targets(init_y, alpha_eps)
    if transformed_y.t().shape[0] != 1:
        _batch_shape = transformed_y.t().shape[:-1]
    else:
        _batch_shape = torch.Size()
    features = stem(init_x)
    gp = FixedNoiseGP(
        features, transformed_y, sigma2_i,
        covar_module=ScaleKernel(
            RBFKernel(batch_shape=_batch_shape, ard_num_dims=stem.output_dim),
            batch_shape=_batch_shape,
        ),
    )
    mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
    super().__init__(stem, gp, mll, alpha_eps, lr)
    self._raw_inputs = [init_x]
    self._target_batch_shape = _batch_shape
def test_solve_qr_constant_noise(self, dtype=torch.float64, tol=1e-8):
    size = 50
    X = torch.rand((size, 2)).to(dtype=dtype)
    y = torch.sin(torch.sum(X, 1)).unsqueeze(-1).to(dtype=dtype)
    noise = 1e-2 * torch.ones(size, dtype=dtype)
    lazy_tsr = RBFKernel().to(dtype=dtype)(X).evaluate_kernel().add_diag(noise)
    precondition_qr, _, logdet_qr = lazy_tsr._preconditioner()
    F = lazy_tsr._piv_chol_self
    M = noise.diag() + F.matmul(F.t())
    x_exact = torch.solve(y, M)[0]
    x_qr = precondition_qr(y)
    self.assertTrue(approx_equal(x_exact, x_qr, tol))
    logdet = 2 * torch.cholesky(M).diag().log().sum(-1)
    self.assertTrue(approx_equal(logdet, logdet_qr, tol))
def test_computes_radial_basis_function_gradient(self):
    a = torch.Tensor([4, 2, 8]).view(3, 1)
    b = torch.Tensor([0, 2, 2]).view(3, 1)
    lengthscale = 2
    kernel = RBFKernel().initialize(log_lengthscale=math.log(lengthscale))
    kernel.eval()
    param = math.log(lengthscale) * torch.ones(3, 3)
    param.requires_grad_()
    diffs = a.expand(3, 3) - b.expand(3, 3).transpose(0, 1)
    actual_output = (-0.5 * (diffs / param.exp()) ** 2).exp()
    actual_output.backward(gradient=torch.eye(3))
    actual_param_grad = param.grad.sum()
    output = kernel(a, b).evaluate()
    output.backward(gradient=torch.eye(3))
    res = kernel.log_lengthscale.grad
    self.assertLess(torch.norm(res - actual_param_grad), 1e-5)
def test_subset_active_compute_radial_basis_function(self):
    a = torch.tensor([4, 2, 8], dtype=torch.float).view(3, 1)
    a_p = torch.tensor([1, 2, 3], dtype=torch.float).view(3, 1)
    a = torch.cat((a, a_p), 1)
    b = torch.tensor([0, 2, 4], dtype=torch.float).view(3, 1)
    lengthscale = 2
    kernel = RBFKernel(active_dims=[0])
    kernel.initialize(lengthscale=lengthscale)
    kernel.eval()
    actual = torch.tensor([[16, 4, 0], [4, 0, 4], [64, 36, 16]], dtype=torch.float)
    actual.mul_(-0.5).div_(lengthscale ** 2).exp_()
    res = kernel(a, b).evaluate()
    self.assertLess(torch.norm(res - actual), 1e-5)
    # diag
    res = kernel(a, b).diag()
    actual = actual.diag()
    self.assertLess(torch.norm(res - actual), 1e-5)
def test_standard(self):
    base_kernel = RBFKernel()
    kernel = GridInterpolationKernel(base_kernel, num_dims=2, grid_size=128, grid_bounds=[(-1.2, 1.2)] * 2)
    xs = torch.randn(5, 2).clamp(-1, 1)
    interp_covar = kernel(xs, xs).evaluate_kernel()
    self.assertIsInstance(interp_covar, InterpolatedLazyTensor)
    xs = torch.randn(5, 2).clamp(-1, 1)
    grid_eval = kernel(xs, xs).evaluate()
    actual_eval = base_kernel(xs, xs).evaluate()
    self.assertLess(torch.norm(grid_eval - actual_eval), 2e-5)
    xs = torch.randn(3, 5, 2).clamp(-1, 1)
    grid_eval = kernel(xs, xs).evaluate()
    actual_eval = base_kernel(xs, xs).evaluate()
    self.assertLess(torch.norm(grid_eval - actual_eval), 2e-5)
def __init__(self, x_train, y_train, likelihood):
    """
    Takes training data and a likelihood and constructs the objects necessary
    for the 'forward' module, commonly a mean module and a kernel module.
    """
    super(GP, self).__init__(x_train, y_train, likelihood)
    # Alternative: gpytorch.means.ConstantMean() with NormalPrior(3, 3) /
    # NormalPrior(4, 3) priors on the lengthscale / outputscale.
    self.mean_module = LinearMean(2)
    lengthscale_prior = gpytorch.priors.NormalPrior(0.1, 2.0)
    outputscale_prior = gpytorch.priors.NormalPrior(1.0, 3.0)
    lengthscale_constraint = None
    self.covar_module = ScaleKernel(
        RBFKernel(
            lengthscale_constraint=lengthscale_constraint,
            lengthscale_prior=lengthscale_prior,
        ),
        outputscale_prior=outputscale_prior,
    )
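# Standard exact-GP training loop for the model above (sketch; assumes
# x_train/y_train tensors and an arbitrary learning rate and step count):
likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = GP(x_train, y_train, likelihood)
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
model.train()
likelihood.train()
for _ in range(100):
    optimizer.zero_grad()
    loss = -mll(model(x_train), y_train)
    loss.backward()
    optimizer.step()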
def __init__(self, input_size, target_size, device='cpu'):
    if device == 'gpu' and torch.cuda.is_available():
        self.device = torch.device('cuda:0')
    else:
        self.device = torch.device('cpu')
    self.input_size = input_size
    self.target_size = target_size
    _likelihood = MultitaskGaussianLikelihood(num_tasks=self.target_size)
    super(GPListRegressor, self).__init__(train_inputs=None, train_targets=None, likelihood=_likelihood)
    self.mean_module = ConstantMean(batch_shape=torch.Size([self.target_size]))
    self.covar_module = ScaleKernel(
        RBFKernel(batch_shape=torch.Size([self.target_size])),
        batch_shape=torch.Size([self.target_size]),
    )
    self.input_trans = None
    self.target_trans = None