# Forward check for InterpolatedToeplitzGPMarginalLogLikelihood: densifies the
# interpolated Toeplitz covariance as W_left T W_right^T + 1e-4 * I, computes the exact
# Gaussian marginal log likelihood (quadratic form via dense inverse, log-det via
# Cholesky / torch.potrf), and requires the sampled estimate (num_samples=1000) to be
# within 5% relative error of the exact value.
# NOTE(review): legacy torch API (Variable, torch.potrf) — predates torch 1.0.
def test_interpolated_toeplitz_gp_marginal_log_likelihood_forward(): x = Variable(torch.linspace(0, 1, 5)) y = torch.randn(5) noise = torch.Tensor([1e-4]) rbf_covar = RBFKernel() rbf_covar.initialize(log_lengthscale=-4) covar_module = GridInterpolationKernel(rbf_covar) covar_module.initialize_interpolation_grid(10, grid_bounds=(0, 1)) covar_x = covar_module.forward(x.unsqueeze(1), x.unsqueeze(1)) c = covar_x.c.data T = utils.toeplitz.sym_toeplitz(c) W_left = index_coef_to_sparse(covar_x.J_left, covar_x.C_left, len(c)) W_right = index_coef_to_sparse(covar_x.J_right, covar_x.C_right, len(c)) W_left_dense = W_left.to_dense() W_right_dense = W_right.to_dense() WTW = W_left_dense.matmul(T.matmul(W_right_dense.t())) + torch.eye(len(x)) * 1e-4 quad_form_actual = y.dot(WTW.inverse().matmul(y)) chol_T = torch.potrf(WTW) log_det_actual = chol_T.diag().log().sum() * 2 actual = -0.5 * (log_det_actual + quad_form_actual + math.log(2 * math.pi) * len(y)) res = InterpolatedToeplitzGPMarginalLogLikelihood(W_left, W_right, num_samples=1000)(Variable(c), Variable(y), Variable(noise)).data assert all(torch.abs((res - actual) / actual) < 0.05)
def test_ard_batch(self):
    """Batched ARD RBF kernel matches a hand-computed squared-exponential."""
    x1 = torch.tensor([[[1, 2, 3], [2, 4, 0]], [[-1, 1, 2], [2, 1, 4]]], dtype=torch.float)
    x2 = torch.tensor([[[1, 3, 1]], [[2, -1, 0]]], dtype=torch.float).repeat(1, 2, 1)
    lengthscales = torch.tensor([[[1, 2, 1]]], dtype=torch.float)

    kernel = RBFKernel(batch_shape=torch.Size([2]), ard_num_dims=3)
    kernel.initialize(lengthscale=lengthscales)
    kernel.eval()

    x1_scaled = x1.div(lengthscales)
    x2_scaled = x2.div(lengthscales)
    expected = (x1_scaled.unsqueeze(-2) - x2_scaled.unsqueeze(-3)).pow(2).sum(dim=-1).mul_(-0.5).exp()
    self.assertLess(torch.norm(kernel(x1, x2).evaluate() - expected), 1e-5)

    # diag
    expected_diag = expected.diagonal(dim1=-1, dim2=-2)
    self.assertLess(torch.norm(kernel(x1, x2).diag() - expected_diag), 1e-5)

    # last_dim_is_batch: each input dimension becomes its own batch entry
    expected = (x1_scaled.transpose(-1, -2).unsqueeze(-1) - x2_scaled.transpose(-1, -2).unsqueeze(-2)).pow(2).mul_(-0.5).exp()
    self.assertLess(torch.norm(kernel(x1, x2, last_dim_is_batch=True).evaluate() - expected), 1e-5)

    # last_dim_is_batch and diag
    expected_diag = expected.diagonal(dim1=-2, dim2=-1)
    self.assertLess(torch.norm(kernel(x1, x2, last_dim_is_batch=True).diag() - expected_diag), 1e-5)
def test_ard_batch(self):
    """Batched ARD RBF kernel (legacy batch_size / log_lengthscale API) matches a manual computation."""
    x1 = torch.tensor([[[1, 2, 3], [2, 4, 0]], [[-1, 1, 2], [2, 1, 4]]], dtype=torch.float)
    x2 = torch.tensor([[[1, 3, 1]], [[2, -1, 0]]], dtype=torch.float).repeat(1, 2, 1)
    lengthscales = torch.tensor([[[1, 2, 1]]], dtype=torch.float)

    kernel = RBFKernel(batch_size=2, ard_num_dims=3)
    kernel.initialize(log_lengthscale=lengthscales.log())
    kernel.eval()

    x1_scaled = x1.div(lengthscales)
    x2_scaled = x2.div(lengthscales)
    expected = (x1_scaled.unsqueeze(-2) - x2_scaled.unsqueeze(-3)).pow(2).sum(dim=-1).mul_(-0.5).exp()
    self.assertLess(torch.norm(kernel(x1, x2).evaluate() - expected), 1e-5)

    # diag
    expected_diag = torch.stack([expected[i].diag() for i in range(expected.size(0))])
    self.assertLess(torch.norm(kernel(x1, x2).diag() - expected_diag), 1e-5)

    # batch_dims: flatten the per-dimension batches into the leading dimension
    expected = (x1_scaled.transpose(-1, -2).unsqueeze(-1) - x2_scaled.transpose(-1, -2).unsqueeze(-2)).pow(2)
    expected = expected.mul_(-0.5).exp().view(6, 2, 2)
    self.assertLess(torch.norm(kernel(x1, x2, batch_dims=(0, 2)).evaluate() - expected), 1e-5)

    # batch_dims and diag
    expected_diag = torch.stack([expected[i].diag() for i in range(expected.size(0))])
    self.assertLess(torch.norm(kernel(x1, x2, batch_dims=(0, 2)).diag() - expected_diag), 1e-5)
def test_ard(self):
    """ScaleKernel wrapping an ARD RBF base matches outputscale * manual RBF."""
    x1 = torch.tensor([[[1, 2], [2, 4]]], dtype=torch.float).repeat(2, 1, 1)
    x2 = torch.tensor([[[1, 3], [0, 4]]], dtype=torch.float).repeat(2, 1, 1)
    lengthscales = torch.tensor([1, 2], dtype=torch.float).view(1, 1, 2)

    base_kernel = RBFKernel(ard_num_dims=2)
    base_kernel.initialize(lengthscale=lengthscales)
    kernel = ScaleKernel(base_kernel)
    kernel.initialize(outputscale=torch.tensor([3], dtype=torch.float))
    kernel.eval()

    x1_scaled = x1.div(lengthscales)
    x2_scaled = x2.div(lengthscales)
    expected = (x1_scaled.unsqueeze(-2) - x2_scaled.unsqueeze(-3)).pow(2).sum(dim=-1).mul_(-0.5).exp()
    expected.mul_(3)
    self.assertLess(torch.norm(kernel(x1, x2).evaluate() - expected), 1e-5)

    # Diag
    expected_diag = torch.stack([expected[i].diag() for i in range(expected.size(0))])
    self.assertLess(torch.norm(kernel(x1, x2).diag() - expected_diag), 1e-5)

    # batch_dims
    expected = (x1_scaled.transpose(-1, -2).unsqueeze(-1) - x2_scaled.transpose(-1, -2).unsqueeze(-2)).pow(2)
    expected = expected.mul_(-0.5).exp().view(4, 2, 2)
    expected.mul_(3)
    self.assertLess(torch.norm(kernel(x1, x2, batch_dims=(0, 2)).evaluate() - expected), 1e-5)

    # batch_dims and diag
    expected_diag = torch.stack([expected[i].diag() for i in range(expected.size(0))])
    self.assertLess(torch.norm(kernel(x1, x2, batch_dims=(0, 2)).diag() - expected_diag), 1e-5)
def test_toeplitz_mvn_kl_divergence_forward():
    """Sampled KL divergence on a Toeplitz lazy covariance agrees with the dense version."""
    x = Variable(torch.linspace(0, 1, 5))
    rbf_covar = RBFKernel()
    rbf_covar.initialize(log_lengthscale=-4)
    covar_module = GridInterpolationKernel(rbf_covar)
    covar_module.initialize_interpolation_grid(10, grid_bounds=(0, 1))
    covar_x = covar_module.forward(x.unsqueeze(1), x.unsqueeze(1))

    c = Variable(covar_x.c.data, requires_grad=True)
    mu1 = Variable(torch.randn(10), requires_grad=True)
    mu2 = Variable(torch.randn(10), requires_grad=True)

    # Materialize the Toeplitz covariance entry by entry.
    T = Variable(torch.zeros(len(c), len(c)))
    for row in range(len(c)):
        for col in range(len(c)):
            T[row, col] = utils.toeplitz.toeplitz_getitem(c, c, row, col)

    # Random upper-triangular factor whose diagonal signs are normalized.
    U = torch.randn(10, 10).triu()
    U = Variable(U.mul(U.diag().sign().unsqueeze(1).expand_as(U).triu()), requires_grad=True)

    actual = gpytorch.mvn_kl_divergence(mu1, U, mu2, T, num_samples=1000)
    res = gpytorch.mvn_kl_divergence(mu1, U, mu2, covar_x, num_samples=1000)
    assert all(torch.abs((res.data - actual.data) / actual.data) < 0.15)
def test_ard(self):
    """ARD RBF kernel on plain (non-batch) inputs matches a manual computation."""
    x1 = torch.tensor([[1, 2], [2, 4]], dtype=torch.float)
    x2 = torch.tensor([[1, 3], [0, 4]], dtype=torch.float)
    lengthscales = torch.tensor([1, 2], dtype=torch.float).view(1, 2)

    kernel = RBFKernel(ard_num_dims=2)
    kernel.initialize(lengthscale=lengthscales)
    kernel.eval()

    x1_scaled = x1.div(lengthscales)
    x2_scaled = x2.div(lengthscales)
    expected = (x1_scaled.unsqueeze(-2) - x2_scaled.unsqueeze(-3)).pow(2).sum(dim=-1).mul_(-0.5).exp()
    self.assertLess(torch.norm(kernel(x1, x2).evaluate() - expected), 1e-5)

    # Diag
    self.assertLess(torch.norm(kernel(x1, x2).diag() - expected.diag()), 1e-5)

    # last_dim_is_batch
    expected = (x1_scaled.transpose(-1, -2).unsqueeze(-1) - x2_scaled.transpose(-1, -2).unsqueeze(-2)).pow(2).mul_(-0.5).exp()
    self.assertLess(torch.norm(kernel(x1, x2, last_dim_is_batch=True).evaluate() - expected), 1e-5)

    # last_dim_is_batch and diag
    expected_diag = expected.diagonal(dim1=-1, dim2=-2)
    self.assertLess(torch.norm(kernel(x1, x2, last_dim_is_batch=True).diag() - expected_diag), 1e-5)
# Backward check for the interpolated-Toeplitz marginal log likelihood: builds the dense
# W_left T W_right^T + noise*I covariance (T assembled elementwise via
# sym_toeplitz_getitem), backprops the exact MLL to get reference gradients for c, y and
# noise, zeroes the grads, then reruns through ToeplitzLazyVariable's
# exact_gp_marginal_log_likelihood and compares gradients (5% tolerance on c, 1e-3 on
# y / noise, reflecting the stochastic trace estimation on c).
# NOTE(review): legacy torch API (Variable, .grad.data) — predates torch 1.0.
def test_interpolated_toeplitz_gp_marginal_log_likelihood_backward(): x = Variable(torch.linspace(0, 1, 5)) y = Variable(torch.randn(5), requires_grad=True) noise = Variable(torch.Tensor([1e-4]), requires_grad=True) rbf_covar = RBFKernel() rbf_covar.initialize(log_lengthscale=-4) covar_module = GridInterpolationKernel(rbf_covar) covar_module.eval() covar_module.initialize_interpolation_grid(10, [(0, 1)]) covar_x = covar_module.forward(x.unsqueeze(1), x.unsqueeze(1)) c = Variable(covar_x.c.data, requires_grad=True) W_left = index_coef_to_sparse(covar_x.J_left, covar_x.C_left, len(c)) W_right = index_coef_to_sparse(covar_x.J_right, covar_x.C_right, len(c)) W_left_dense = Variable(W_left.to_dense()) W_right_dense = Variable(W_right.to_dense()) T = Variable(torch.zeros(len(c), len(c))) for i in range(len(c)): for j in range(len(c)): T[i, j] = utils.toeplitz.sym_toeplitz_getitem(c, i, j) WTW = W_left_dense.matmul(T.matmul( W_right_dense.t())) + Variable(torch.eye(len(x))) * noise quad_form_actual = y.dot(WTW.inverse().matmul(y)) log_det_actual = _det(WTW).log() actual_nll = -0.5 * (log_det_actual + quad_form_actual + math.log(2 * math.pi) * len(y)) actual_nll.backward() actual_c_grad = c.grad.data.clone() actual_y_grad = y.grad.data.clone() actual_noise_grad = noise.grad.data.clone() c.grad.data.fill_(0) y.grad.data.fill_(0) noise.grad.data.fill_(0) covar_x = gpytorch.lazy.ToeplitzLazyVariable(c, covar_x.J_left, covar_x.C_left, covar_x.J_right, covar_x.C_right, noise) res = covar_x.exact_gp_marginal_log_likelihood(y) res.backward() res_c_grad = covar_x.c.grad.data res_y_grad = y.grad.data res_noise_grad = noise.grad.data assert (actual_c_grad - res_c_grad).norm() / res_c_grad.norm() < 0.05 assert (actual_y_grad - res_y_grad).norm() / res_y_grad.norm() < 1e-3 assert (actual_noise_grad - res_noise_grad).norm() / res_noise_grad.norm() < 1e-3
# Backward check for mvn_kl_divergence against a Toeplitz lazy covariance: computes the
# closed-form Gaussian KL on the densely assembled T, backprops it for reference
# gradients on c, mu1, mu2 and U, zeroes the grads, then backprops the sampled
# estimator (num_samples=1000) and compares gradient ratios (looser 1e-1 tolerance on c,
# which goes through stochastic trace estimation; tight tolerances on the means).
# NOTE(review): legacy torch API (Variable, .grad.data) — predates torch 1.0.
def test_toeplitz_mvn_kl_divergence_backward(): x = Variable(torch.linspace(0, 1, 5)) rbf_covar = RBFKernel() rbf_covar.initialize(log_lengthscale=-4) covar_module = GridInterpolationKernel(rbf_covar) covar_module.initialize_interpolation_grid(4, grid_bounds=(0, 1)) covar_x = covar_module.forward(x.unsqueeze(1), x.unsqueeze(1)) covar_x.c = Variable(covar_x.c.data, requires_grad=True) c = covar_x.c mu1 = Variable(torch.randn(4), requires_grad=True) mu2 = Variable(torch.randn(4), requires_grad=True) mu_diff = mu2 - mu1 T = Variable(torch.zeros(len(c), len(c))) for i in range(len(c)): for j in range(len(c)): T[i, j] = utils.toeplitz.toeplitz_getitem(c, c, i, j) U = torch.randn(4, 4).triu() U = Variable(U.mul(U.diag().sign().unsqueeze(1).expand_as(U).triu()), requires_grad=True) actual = 0.5 * (_det(T).log() + mu_diff.dot(T.inverse().mv(mu_diff)) + T.inverse().mm(U.t().mm(U)).trace() - U.diag().log().sum(0) * 2 - len(mu_diff)) actual.backward() actual_c_grad = c.grad.data.clone() actual_mu1_grad = mu1.grad.data.clone() actual_mu2_grad = mu2.grad.data.clone() actual_U_grad = U.grad.data.clone() c.grad.data.fill_(0) mu1.grad.data.fill_(0) mu2.grad.data.fill_(0) U.grad.data.fill_(0) res = gpytorch.mvn_kl_divergence(mu1, U, mu2, covar_x, num_samples=1000) res.backward() res_c_grad = c.grad.data res_mu1_grad = mu1.grad.data res_mu2_grad = mu2.grad.data res_U_grad = U.grad.data assert torch.abs( (res_c_grad - actual_c_grad)).sum() / actual_c_grad.abs().sum() < 1e-1 assert torch.abs( (res_mu1_grad - actual_mu1_grad)).sum() / actual_mu1_grad.abs().sum() < 1e-5 assert torch.abs( (res_mu2_grad - actual_mu2_grad)).sum() / actual_mu2_grad.abs().sum() < 1e-5 assert torch.abs( (res_U_grad - actual_U_grad)).sum() / actual_U_grad.abs().sum() < 1e-2
def test_inherit_active_dims(self):
    """ScaleKernel exposes the same active_dims as the base kernel it wraps."""
    base_kernel = RBFKernel(active_dims=(1, 2), ard_num_dims=2)
    base_kernel.initialize(lengthscale=torch.tensor([1, 1], dtype=torch.float))
    kernel = ScaleKernel(base_kernel)
    kernel.initialize(outputscale=torch.tensor([3], dtype=torch.float))
    kernel.eval()
    self.assertTrue(torch.all(kernel.active_dims == base_kernel.active_dims))
# Checks trace_logdet_quad_form_factory forward and backward: computes
# logdet(T) + mu' T^{-1} mu + tr(T^{-1} U'U) densely on an elementwise-assembled
# Toeplitz matrix, records reference gradients, then evaluates the factory-built
# stochastic version (num_samples=1000) driven by sym_toeplitz_mm and
# sym_toeplitz_derivative_quadratic_form closures and compares value (15% relative)
# and gradients (utils.approx_equal).
# NOTE(review): legacy torch API (Variable, .grad.data) — predates torch 1.0.
def test_trace_logdet_quad_form_factory(): x = Variable(torch.linspace(0, 1, 10)) rbf_covar = RBFKernel() rbf_covar.initialize(log_lengthscale=-4) covar_module = GridInterpolationKernel(rbf_covar) covar_module.initialize_interpolation_grid(4, grid_bounds=(0, 1)) c = Variable(covar_module.forward(x.unsqueeze(1), x.unsqueeze(1)).c.data, requires_grad=True) T = Variable(torch.zeros(4, 4)) for i in range(4): for j in range(4): T[i, j] = utils.toeplitz.toeplitz_getitem(c, c, i, j) U = torch.randn(4, 4).triu() U = Variable(U.mul(U.diag().sign().unsqueeze(1).expand_as(U).triu()), requires_grad=True) mu_diff = Variable(torch.randn(4), requires_grad=True) actual = _det(T).log() + mu_diff.dot( T.inverse().mv(mu_diff)) + T.inverse().mm(U.t().mm(U)).trace() actual.backward() actual_c_grad = c.grad.data actual_mu_diff_grad = mu_diff.grad.data actual_U_grad = U.grad.data c.grad.data.fill_(0) mu_diff.grad.data.fill_(0) U.grad.data.fill_(0) def _mm_closure_factory(*args): c, = args return lambda mat2: utils.toeplitz.sym_toeplitz_mm(c, mat2) def _derivative_quadratic_form_factory(*args): return lambda left_vector, right_vector: ( sym_toeplitz_derivative_quadratic_form(left_vector, right_vector ), ) covar_args = (c, ) res = trace_logdet_quad_form_factory( _mm_closure_factory, _derivative_quadratic_form_factory)(num_samples=1000)(mu_diff, U, *covar_args) res.backward() res_c_grad = c.grad.data res_mu_diff_grad = mu_diff.grad.data res_U_grad = U.grad.data assert all(torch.abs((res.data - actual.data) / actual.data) < 0.15) assert utils.approx_equal(res_c_grad, actual_c_grad) assert utils.approx_equal(res_mu_diff_grad, actual_mu_diff_grad) assert utils.approx_equal(res_U_grad, actual_U_grad)
def test_ard(self):
    """ARD RBF kernel (legacy log_lengthscale API) against a directly computed value."""
    x1 = torch.Tensor([[1, 2], [2, 4]])
    x2 = torch.Tensor([1, 3]).view(1, 1, 2)
    lengthscales = torch.Tensor([1, 2]).view(1, 1, 2)

    kernel = RBFKernel(ard_num_dims=2)
    kernel.initialize(log_lengthscale=lengthscales.log())
    kernel.eval()

    expected = (x1 - x2).div_(lengthscales).pow(2).sum(dim=-1).mul_(-0.5).exp()
    output = kernel(x1, x2).evaluate()
    self.assertLess(torch.norm(output - expected.unsqueeze(-1)), 1e-5)
def test_ard_batch(self):
    """Batched ARD RBF kernel (legacy API) against a directly computed value."""
    x1 = torch.tensor([[[1, 2, 3], [2, 4, 0]], [[-1, 1, 2], [2, 1, 4]]], dtype=torch.float)
    x2 = torch.tensor([[[1, 3, 1]], [[2, -1, 0]]], dtype=torch.float)
    lengthscales = torch.tensor([[[1, 2, 1]]], dtype=torch.float)

    kernel = RBFKernel(batch_size=2, ard_num_dims=3)
    kernel.initialize(log_lengthscale=lengthscales.log())
    kernel.eval()

    expected = (x1 - x2).div_(lengthscales).pow(2).sum(dim=-1).mul_(-0.5).exp()
    output = kernel(x1, x2).evaluate()
    self.assertLess(torch.norm(output - expected.unsqueeze(-1)), 1e-5)
def test_subset_active_compute_radial_basis_function(self):
    """RBF restricted to active dim 0 ignores the appended extra column."""
    col0 = torch.Tensor([4, 2, 8]).view(3, 1)
    extra = torch.Tensor([1, 2, 3]).view(3, 1)
    x1 = torch.cat((col0, extra), 1)
    x2 = torch.Tensor([0, 2]).view(2, 1)
    lengthscale = 2

    kernel = RBFKernel(active_dims=[0])
    kernel.initialize(log_lengthscale=math.log(lengthscale))
    kernel.eval()

    # Squared distances between column 0 of x1 and x2, through exp(-d^2 / (2 l^2)).
    expected = torch.Tensor([[16, 4], [4, 0], [64, 36]]).mul_(-0.5).div_(lengthscale ** 2).exp()
    self.assertLess(torch.norm(kernel(x1, x2).evaluate() - expected), 1e-5)
def test_kp_toeplitz_gp_marginal_log_likelihood_forward():
    """Kronecker-product lazy MLL agrees with the dense marginal log likelihood."""
    x = torch.cat([Variable(torch.linspace(0, 1, 2)).unsqueeze(1)] * 3, 1)
    y = torch.randn(2)

    rbf_module = RBFKernel()
    rbf_module.initialize(log_lengthscale=-2)
    covar_module = GridInterpolationKernel(rbf_module)
    covar_module.eval()
    covar_module.initialize_interpolation_grid(5, [(0, 1), (0, 1), (0, 1)])

    kronecker_var = covar_module.forward(x, x)
    dense_covar = kronecker_var.evaluate()
    res = kronecker_var.exact_gp_marginal_log_likelihood(Variable(y)).data
    actual = gpytorch.exact_gp_marginal_log_likelihood(dense_covar, Variable(y)).data
    assert all(torch.abs((res - actual) / actual) < 0.05)
# Backward check for mvn_kl_divergence on a plain dense covariance K: backprops the
# closed-form Gaussian KL for reference gradients on K, mu1, mu2 and U, zeroes the
# grads, then backprops the sampled estimator (num_samples=10000) and compares
# gradient ratios — 1e-1 tolerance on K (stochastic trace estimate), tight tolerances
# on the means and 1e-2 on the factor U.
# NOTE(review): legacy torch API (Variable, .grad.data) — predates torch 1.0.
def test_mvn_kl_divergence_backward(): x = Variable(torch.linspace(0, 1, 4)) rbf_covar = RBFKernel() rbf_covar.initialize(log_lengthscale=-4) K = Variable(rbf_covar.forward(x.unsqueeze(1), x.unsqueeze(1)).data, requires_grad=True) mu1 = Variable(torch.randn(4), requires_grad=True) mu2 = Variable(torch.randn(4), requires_grad=True) U = torch.randn(4, 4).triu() U = Variable(U.mul(U.diag().sign().unsqueeze(1).expand_as(U).triu()), requires_grad=True) mu_diff = mu2 - mu1 actual = 0.5 * (_det(K).log() + mu_diff.dot(K.inverse().mv(mu_diff)) + K.inverse().mm(U.t().mm(U)).trace() - U.diag().log().sum(0) * 2 - len(mu_diff)) actual.backward() actual_K_grad = K.grad.data.clone() actual_mu1_grad = mu1.grad.data.clone() actual_mu2_grad = mu2.grad.data.clone() actual_U_grad = U.grad.data.clone() K.grad.data.fill_(0) mu1.grad.data.fill_(0) mu2.grad.data.fill_(0) U.grad.data.fill_(0) res = gpytorch.mvn_kl_divergence(mu1, U, mu2, K, num_samples=10000) res.backward() res_K_grad = K.grad.data res_mu1_grad = mu1.grad.data res_mu2_grad = mu2.grad.data res_U_grad = U.grad.data assert torch.abs( (res_K_grad - actual_K_grad)).sum() / actual_K_grad.abs().sum() < 1e-1 assert torch.abs( (res_mu1_grad - actual_mu1_grad)).sum() / actual_mu1_grad.abs().sum() < 1e-5 assert torch.abs( (res_mu2_grad - actual_mu2_grad)).sum() / actual_mu2_grad.abs().sum() < 1e-5 assert torch.abs( (res_U_grad - actual_U_grad)).sum() / actual_U_grad.abs().sum() < 1e-2
def test_ard(self):
    """Degree-1 Newton-Girard additive kernel equals an explicit sum of per-dim RBFs."""
    base_k = RBFKernel(ard_num_dims=3)
    base_k.initialize(lengthscale=[1., 2., 3.])
    additive = NewtonGirardAdditiveKernel(base_k, 3, max_degree=1)

    testvals = torch.tensor([[1, 2, 3], [7, 5, 2]], dtype=torch.float)
    add_k_val = additive(testvals, testvals).evaluate()

    # Reference: one RBF per dimension (matching lengthscales), summed and scaled by 1.
    per_dim_kernels = []
    for dim in range(3):
        k = RBFKernel(active_dims=dim)
        k.initialize(lengthscale=dim + 1)
        per_dim_kernels.append(k)
    manual_k = ScaleKernel(AdditiveKernel(*per_dim_kernels))
    manual_k.initialize(outputscale=1.)
    manual_add_k_val = manual_k(testvals, testvals).evaluate()

    self.assertTrue(torch.allclose(add_k_val, manual_add_k_val, atol=1e-5))
def test_mvn_kl_divergence_forward():
    """Sampled mvn_kl_divergence roughly matches the closed-form Gaussian KL."""
    x = Variable(torch.linspace(0, 1, 4))
    rbf_covar = RBFKernel()
    rbf_covar.initialize(log_lengthscale=-4)
    K = rbf_covar.forward(x.unsqueeze(1), x.unsqueeze(1))

    mu1 = Variable(torch.randn(4), requires_grad=True)
    mu2 = Variable(torch.randn(4), requires_grad=True)
    U = torch.randn(4, 4).triu()
    U = Variable(U.mul(U.diag().sign().unsqueeze(1).expand_as(U).triu()), requires_grad=True)
    mu_diff = mu2 - mu1

    # Closed-form Gaussian KL divergence assembled term by term.
    actual = 0.5 * (
        _det(K).log()
        + mu_diff.dot(K.inverse().mv(mu_diff))
        + K.inverse().mm(U.t().mm(U)).trace()
        - U.diag().log().sum(0) * 2
        - len(mu_diff)
    )
    res = gpytorch.mvn_kl_divergence(mu1, U, mu2, K, num_samples=1000)
    assert all(torch.abs((res.data - actual.data) / actual.data) < 0.15)
def test_subset_active_compute_radial_basis_function(self):
    """RBF on active dim 0 only; the extra column in x1 must be ignored."""
    col0 = torch.tensor([4, 2, 8], dtype=torch.float).view(3, 1)
    extra = torch.tensor([1, 2, 3], dtype=torch.float).view(3, 1)
    x1 = torch.cat((col0, extra), 1)
    x2 = torch.tensor([0, 2, 4], dtype=torch.float).view(3, 1)
    lengthscale = 2

    kernel = RBFKernel(active_dims=[0])
    kernel.initialize(lengthscale=lengthscale)
    kernel.eval()

    # Squared distances between column 0 of x1 and x2, through exp(-d^2 / (2 l^2)).
    expected = torch.tensor([[16, 4, 0], [4, 0, 4], [64, 36, 16]], dtype=torch.float)
    expected.mul_(-0.5).div_(lengthscale ** 2).exp_()
    self.assertLess(torch.norm(kernel(x1, x2).evaluate() - expected), 1e-5)

    # diag
    self.assertLess(torch.norm(kernel(x1, x2).diag() - expected.diag()), 1e-5)
def test_subset_active_computes_radial_basis_function_gradient(self):
    """Lengthscale gradient of an active-dims RBF matches a manual autograd reference."""
    col0 = torch.Tensor([4, 2, 8]).view(3, 1)
    extra = torch.Tensor([1, 2, 3]).view(3, 1)
    x1 = torch.cat((col0, extra), 1)
    x2 = torch.Tensor([0, 2, 2]).view(3, 1)
    lengthscale = 2

    # Reference: differentiate exp(-0.5 * (d / exp(p))^2) w.r.t. the log-lengthscale p.
    param = math.log(lengthscale) * torch.ones(3, 3)
    param.requires_grad_()
    diffs = col0.expand(3, 3) - x2.expand(3, 3).transpose(0, 1)
    expected_output = (-0.5 * (diffs / param.exp()) ** 2).exp()
    expected_output.backward(torch.eye(3))
    expected_param_grad = param.grad.sum()

    kernel = RBFKernel(active_dims=[0])
    kernel.initialize(log_lengthscale=math.log(lengthscale))
    kernel.eval()
    output = kernel(x1, x2).evaluate()
    output.backward(gradient=torch.eye(3))
    self.assertLess(torch.norm(kernel.log_lengthscale.grad - expected_param_grad), 1e-5)
def test_postscale(self):
    """Identity projection + post-projection ARD scaling collapses the additive kernel
    to 3x the RBF on the first column (the columns of x are multiples of each other)."""
    x = torch.tensor([[1., 2., 3.], [1.1, 2.2, 3.3]])

    kbase = RBFKernel()
    kbase.initialize(lengthscale=torch.tensor([1.]))
    base_kernel = AdditiveStructureKernel(kbase, 3)

    proj_module = torch.nn.Linear(3, 3, bias=False)
    proj_module.weight.data = torch.eye(3, dtype=torch.float)
    proj_kernel = ScaledProjectionKernel(proj_module, base_kernel, prescale=False, ard_num_dims=3)
    proj_kernel.initialize(lengthscale=torch.tensor([1., 2., 3.]))
    with torch.no_grad():
        proj_K = proj_kernel(x, x).evaluate()

    # Dividing column i by lengthscale i+1 makes all three additive components
    # identical to an RBF on column 0, so the sum is 3x that single kernel.
    ref_kernel = RBFKernel()
    ref_kernel.initialize(lengthscale=torch.tensor([1.]))
    with torch.no_grad():
        ref_K = 3 * ref_kernel(x[:, 0:1], x[:, 0:1]).evaluate()
    np.testing.assert_allclose(proj_K.numpy(), ref_K.numpy())
def test_subset_active_computes_radial_basis_function_gradient(self):
    """Legacy-API gradient check for an RBF restricted to active dim 0."""
    col0 = torch.Tensor([4, 2, 8]).view(3, 1)
    extra = torch.Tensor([1, 2, 3]).view(3, 1)
    x1 = torch.cat((col0, extra), 1)
    x2 = torch.Tensor([0, 2, 2]).view(3, 1)
    lengthscale = 2

    kernel = RBFKernel(active_dims=[0])
    kernel.initialize(log_lengthscale=math.log(lengthscale))
    kernel.eval()

    param = Variable(torch.Tensor(3, 3).fill_(math.log(lengthscale)), requires_grad=True)
    output = kernel(Variable(x1), Variable(x2))
    output.backward(gradient=torch.eye(3))
    res = kernel.log_lengthscale.grad.data

    # Manual reference differentiated by autograd.
    diffs = Variable(col0.expand(3, 3) - x2.expand(3, 3).transpose(0, 1))
    expected_output = (-(diffs ** 2) / (param.exp())).exp()
    expected_output.backward(torch.eye(3))
    expected_param_grad = param.grad.data.sum()
    self.assertLess(torch.norm(res - expected_param_grad), 1e-5)
# Checks which parameters of ScaledProjectionKernel receive gradient updates after one
# Adam step on an exact-GP marginal log likelihood. First pass (learn_proj default):
# the inner base-kernel lengthscale and the projection weights must be UNCHANGED
# (still 1.0 / identity), while the outer ARD lengthscales must have moved. Second pass
# (learn_proj=True): the projection weights must also have moved, and the outer
# lengthscales again change; the inner base-kernel lengthscale stays fixed.
def test_gradients(self): x = torch.tensor([[1., 2., 3.], [1.1, 2.2, 3.3]]) y = torch.sin(x).sum(dim=1) kbase = RBFKernel() kbase.initialize(lengthscale=torch.tensor([1.])) base_kernel = AdditiveStructureKernel(kbase, 3) proj_module = torch.nn.Linear(3, 3, bias=False) proj_module.weight.data = torch.eye(3, dtype=torch.float) proj_kernel = ScaledProjectionKernel(proj_module, base_kernel, prescale=True, ard_num_dims=3) proj_kernel.initialize(lengthscale=torch.tensor([1., 2., 3.])) model = ExactGPModel(x, y, gpytorch.likelihoods.GaussianLikelihood(), proj_kernel) mll = gpytorch.mlls.ExactMarginalLogLikelihood(model.likelihood, model) optimizer_ = torch.optim.Adam(model.parameters(), lr=0.1) optimizer_.zero_grad() pred = model(x) loss = -mll(pred, y) loss.backward() optimizer_.step() np.testing.assert_allclose( proj_kernel.base_kernel.base_kernel.lengthscale.numpy(), torch.tensor([[1.]]).numpy()) np.testing.assert_allclose( proj_kernel.projection_module.weight.numpy(), torch.eye(3, dtype=torch.float).numpy()) self.assertFalse( np.allclose(proj_kernel.lengthscale.detach().numpy(), torch.tensor([1., 2., 3.]).numpy())) proj_module = torch.nn.Linear(3, 3, bias=False) proj_module.weight.data = torch.eye(3, dtype=torch.float) proj_kernel2 = ScaledProjectionKernel(proj_module, base_kernel, prescale=True, ard_num_dims=3, learn_proj=True) proj_kernel2.initialize(lengthscale=torch.tensor([1., 2., 3.])) model = ExactGPModel(x, y, gpytorch.likelihoods.GaussianLikelihood(), proj_kernel2) mll = gpytorch.mlls.ExactMarginalLogLikelihood(model.likelihood, model) optimizer_ = torch.optim.Adam(model.parameters(), lr=0.1) optimizer_.zero_grad() pred = model(x) loss = -mll(pred, y) loss.backward() optimizer_.step() np.testing.assert_allclose( proj_kernel2.base_kernel.base_kernel.lengthscale.numpy(), torch.tensor([[1.]]).numpy()) self.assertFalse( np.allclose(proj_kernel2.projection_module.weight.detach().numpy(), torch.eye(3, dtype=torch.float).numpy())) self.assertFalse( 
np.allclose(proj_kernel2.lengthscale.detach().numpy(), torch.tensor([1., 2., 3.]).numpy()))
# Disabled test (renamed with a `foo_` prefix so the runner skips it): backward check
# for the Kronecker-product Toeplitz marginal log likelihood. Builds the dense
# W_left (T1 (x) T2 (x) T3) W_right^T covariance from per-dimension Toeplitz columns,
# backprops the exact MLL for reference gradients, then compares against
# kroneckerProductLazyVariable's exact_gp_marginal_log_likelihood, once with 100 trace
# samples (5% tolerance) and once with gpytorch.functions.fastest = False (1e-3).
# NOTE(review): `res_cs_grad = covar_x.cs.grad.data` reads `cs` off the last
# per-dimension forward result rather than the top-level `cs` Variable that actually
# requires grad — presumably it should be `cs.grad.data`; confirm before re-enabling.
# NOTE(review): legacy torch API (Variable, .grad.data) — predates torch 1.0.
def foo_kp_toeplitz_gp_marginal_log_likelihood_backward(): x = torch.cat([Variable(torch.linspace(0, 1, 2)).unsqueeze(1)] * 3, 1) y = Variable(torch.randn(2), requires_grad=True) rbf_module = RBFKernel() rbf_module.initialize(log_lengthscale=-2) covar_module = GridInterpolationKernel(rbf_module) covar_module.eval() covar_module.initialize_interpolation_grid(5, [(0, 1), (0, 1), (0, 1)]) kronecker_var = covar_module.forward(x, x) cs = Variable(torch.zeros(3, 5), requires_grad=True) J_lefts = [] C_lefts = [] J_rights = [] C_rights = [] Ts = [] for i in range(3): covar_x = covar_module.forward(x[:, i].unsqueeze(1), x[:, i].unsqueeze(1)) cs.data[i] = covar_x.c.data J_lefts.append(covar_x.J_left) C_lefts.append(covar_x.C_left) J_rights.append(covar_x.J_right) C_rights.append(covar_x.C_right) T = Variable(torch.zeros(len(cs[i].data), len(cs[i].data))) for k in range(len(cs[i].data)): for j in range(len(cs[i].data)): T[k, j] = utils.toeplitz.toeplitz_getitem(cs[i], cs[i], k, j) Ts.append(T) W_left = list_of_indices_and_values_to_sparse(J_lefts, C_lefts, cs) W_right = list_of_indices_and_values_to_sparse(J_rights, C_rights, cs) W_left_dense = Variable(W_left.to_dense()) W_right_dense = Variable(W_right.to_dense()) K = kronecker_product(Ts) WKW = W_left_dense.matmul(K.matmul(W_right_dense.t())) quad_form_actual = y.dot(WKW.inverse().matmul(y)) log_det_actual = _det(WKW).log() actual_nll = -0.5 * (log_det_actual + quad_form_actual + math.log(2 * math.pi) * len(y)) actual_nll.backward() actual_cs_grad = cs.grad.data.clone() actual_y_grad = y.grad.data.clone() y.grad.data.fill_(0) cs.grad.data.fill_(0) kronecker_var = gpytorch.lazy.kroneckerProductLazyVariable( cs, kronecker_var.J_lefts, kronecker_var.C_lefts, kronecker_var.J_rights, kronecker_var.C_rights) gpytorch.functions.num_trace_samples = 100 res = kronecker_var.exact_gp_marginal_log_likelihood(y) res.backward() res_cs_grad = covar_x.cs.grad.data res_y_grad = y.grad.data assert (actual_cs_grad - res_cs_grad).norm() / 
res_cs_grad.norm() < 0.05 assert (actual_y_grad - res_y_grad).norm() / res_y_grad.norm() < 1e-3 y.grad.data.fill_(0) cs.grad.data.fill_(0) gpytorch.functions.fastest = False res = kronecker_var.exact_gp_marginal_log_likelihood(y) res.backward() res_cs_grad = covar_x.cs.grad.data res_y_grad = y.grad.data assert (actual_cs_grad - res_cs_grad).norm() / res_cs_grad.norm() < 1e-3 assert (actual_y_grad - res_y_grad).norm() / res_y_grad.norm() < 1e-3
def test_initialize_lengthscale_batch(self):
    """Batched lengthscale initialization is stored (up to reshape) in kernel.lengthscale."""
    kernel = RBFKernel(batch_size=2)
    init_value = torch.tensor([3.14, 4.13])
    kernel.initialize(lengthscale=init_value)
    expected = init_value.view_as(kernel.lengthscale)
    self.assertLess(torch.norm(kernel.lengthscale - expected), 1e-5)
def test_initialize_lengthscale(self):
    """Scalar lengthscale initialization is stored in kernel.lengthscale."""
    kernel = RBFKernel()
    kernel.initialize(lengthscale=3.14)
    expected = torch.tensor(3.14).view_as(kernel.lengthscale)
    self.assertLess(torch.norm(kernel.lengthscale - expected), 1e-5)