    def test_fixed_noise_gaussian_likelihood(self, cuda=False):
        device = torch.device("cuda") if cuda else torch.device("cpu")
        for dtype in (torch.float, torch.double):
            noise = 0.1 + torch.rand(4, device=device, dtype=dtype)
            lkhd = FixedNoiseGaussianLikelihood(noise=noise)
            # test basics
            self.assertIsInstance(lkhd.noise_covar, FixedGaussianNoise)
            self.assertTrue(torch.equal(noise, lkhd.noise))
            new_noise = 0.1 + torch.rand(4, device=device, dtype=dtype)
            lkhd.noise = new_noise
            self.assertTrue(torch.equal(lkhd.noise, new_noise))
            # test __call__
            mean = torch.zeros(4, device=device, dtype=dtype)
            covar = DiagLazyTensor(torch.ones(4, device=device, dtype=dtype))
            mvn = MultivariateNormal(mean, covar)
            out = lkhd(mvn)
            self.assertTrue(torch.allclose(out.variance, 1 + new_noise))
            # a shape mismatch between the input distribution and the stored noise should trigger a warning
            mean = torch.zeros(5, device=device, dtype=dtype)
            covar = DiagLazyTensor(torch.ones(5, device=device, dtype=dtype))
            mvn = MultivariateNormal(mean, covar)
            with self.assertWarns(UserWarning):
                lkhd(mvn)
            # test __call__ w/ observation noise
            obs_noise = 0.1 + torch.rand(5, device=device, dtype=dtype)
            out = lkhd(mvn, noise=obs_noise)
            self.assertTrue(torch.allclose(out.variance, 1 + obs_noise))
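
The test above exercises the FixedNoiseGaussianLikelihood API in isolation. Below is a minimal standalone sketch of the same behaviour, assuming only torch and gpytorch are installed; a dense diagonal covariance stands in for the DiagLazyTensor used in the test:

# Minimal sketch: a fixed-noise likelihood adds the supplied per-point noise
# variance to the covariance of the latent-function distribution.
import torch
from gpytorch.distributions import MultivariateNormal
from gpytorch.likelihoods import FixedNoiseGaussianLikelihood

noise = 0.1 + torch.rand(4)                      # known observation noise (variances)
lkhd = FixedNoiseGaussianLikelihood(noise=noise)

mvn = MultivariateNormal(torch.zeros(4), torch.eye(4))  # unit prior variance
out = lkhd(mvn)                                  # marginal predictive distribution
print(torch.allclose(out.variance, 1 + noise))   # expected: True

# Per-query observation noise can also be passed at call time:
obs_noise = 0.1 + torch.rand(4)
out = lkhd(mvn, noise=obs_noise)
print(torch.allclose(out.variance, 1 + obs_noise))  # expected: True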
Example #2
    def test_posterior_latent_gp_and_likelihood_with_optimization(
            self, cuda=False):
        # This test throws a warning because the fixed noise likelihood gets the wrong input
        warnings.simplefilter("ignore", GPInputWarning)

        train_x, test_x, train_y, test_y = self._get_data(cuda=cuda)
        # We're manually going to set the hyperparameters to something they shouldn't be
        likelihood = FixedNoiseGaussianLikelihood(torch.ones(11) * 0.001)
        gp_model = ExactGPModel(train_x, train_y, likelihood)
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)
        gp_model.rbf_covar_module.initialize(lengthscale=exp(1))
        gp_model.mean_module.initialize(constant=0)

        if cuda:
            gp_model.cuda()
            likelihood.cuda()

        # Find optimal model hyperparameters
        gp_model.train()
        likelihood.train()

        optimizer = optim.Adam(list(gp_model.parameters()) +
                               list(likelihood.parameters()),
                               lr=0.1)
        optimizer.n_iter = 0
        with gpytorch.settings.debug(False):
            for _ in range(75):
                optimizer.zero_grad()
                output = gp_model(train_x)
                loss = -mll(output, train_y)
                loss.backward()
                optimizer.n_iter += 1
                optimizer.step()

            for param in gp_model.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)
            for param in likelihood.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)
            optimizer.step()

            # Test the model
            gp_model.eval()
            likelihood.eval()
            test_function_predictions = likelihood(gp_model(test_x))
            mean_abs_error = torch.mean(
                torch.abs(test_y - test_function_predictions.mean))

        self.assertLess(mean_abs_error.squeeze().item(), 0.05)
Example #3
    def setUp(self, batched=False, learnable=False):
        torch.set_default_tensor_type(torch.DoubleTensor)
        torch.random.manual_seed(10)

        train_x = torch.rand(10, 2)
        train_y = torch.sin(2 * train_x[:, 0] + 3 * train_x[:, 1]).unsqueeze(-1)
        train_y_var = 0.1 * torch.ones_like(train_y)
        if batched:
            train_y = torch.cat(
                (
                    train_y, 
                    train_y + 0.3 * torch.randn_like(train_y),
                    train_y + 0.3 * torch.randn_like(train_y),
                ),
                dim=1
            )
            train_y_var = train_y_var.repeat(1, 3)

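        # Build the online SKI model and a reference exact SingleTaskGP that share the same
        # fixed-noise likelihood settings and (copied) kernel, so the tests can compare them.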
        model = FixedNoiseOnlineSKIGP(
            train_inputs=train_x,
            train_targets=train_y,
            train_noise_term=train_y_var,
            grid_bounds=torch.tensor([[0.0, 1.0], [0.0, 1.0]]),
            grid_size=5,
            learn_additional_noise=learnable
        )
        equivalent_model = SingleTaskGP(
            train_X=train_x, 
            train_Y=train_y, 
            likelihood=FixedNoiseGaussianLikelihood(train_y_var.t(), learn_additional_noise=learnable),
            covar_module=deepcopy(model.covar_module)
        )
        equivalent_model.mean_module = ZeroMean()

        return model, equivalent_model, train_x, train_y
Example #4
    def create_model(self, fixed_noise=False):
        data = TestExactGP.create_test_data(self)
        likelihood, labels = TestExactGP.create_likelihood_and_labels(self)
        if fixed_noise:
            noise = 0.1 + 0.2 * torch.rand_like(labels)
            likelihood = FixedNoiseGaussianLikelihood(noise)
        return TestExactGP.create_model(self, data, labels, likelihood)
Example #5
    def test_fixed_noise_fantasy_updates_batch(self, cuda=False):
        train_x, test_x, train_y, test_y = self._get_data(cuda=cuda)
        noise = torch.full_like(train_y, 2e-4)
        test_noise = torch.full_like(test_y, 3e-4)

        likelihood = FixedNoiseGaussianLikelihood(noise)
        gp_model = ExactGPModel(train_x, train_y, likelihood)
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)
        gp_model.covar_module.base_kernel.initialize(lengthscale=exp(1))
        gp_model.mean_module.initialize(constant=0)

        if cuda:
            gp_model.cuda()
            likelihood.cuda()

        # Find optimal model hyperparameters
        gp_model.train()
        likelihood.train()
        optimizer = optim.Adam(list(gp_model.parameters()) + list(likelihood.parameters()), lr=0.15)
        for _ in range(50):
            optimizer.zero_grad()
            with gpytorch.settings.debug(False):
                output = gp_model(train_x)
            loss = -mll(output, train_y)
            loss.backward()
            optimizer.step()

        for param in gp_model.parameters():
            self.assertTrue(param.grad is not None)
            self.assertGreater(param.grad.norm().item(), 0)
        optimizer.step()

        with gpytorch.settings.fast_pred_var():
            # Test the model
            gp_model.eval()
            likelihood.eval()
            test_function_predictions = likelihood(gp_model(test_x), noise=test_noise)

            # Cut data down, and then add back via the fantasy interface
            gp_model.set_train_data(train_x[:5], train_y[:5], strict=False)
            gp_model.likelihood.noise_covar = FixedGaussianNoise(noise=noise[:5])
            likelihood(gp_model(test_x), noise=test_noise)

            fantasy_x = train_x[5:].clone().unsqueeze(0).unsqueeze(-1).repeat(3, 1, 1).requires_grad_(True)
            fantasy_y = train_y[5:].unsqueeze(0).repeat(3, 1)
            fant_model = gp_model.get_fantasy_model(fantasy_x, fantasy_y, noise=noise[5:].unsqueeze(0).repeat(3, 1))
            fant_function_predictions = likelihood(fant_model(test_x), noise=test_noise)

            self.assertAllClose(test_function_predictions.mean, fant_function_predictions.mean[0], atol=1e-4)

            fant_function_predictions.mean.sum().backward()
            self.assertTrue(fantasy_x.grad is not None)
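
The fantasy test above depends on the project's ExactGPModel and test data. The following is a self-contained sketch of the same get_fantasy_model pattern with a fixed-noise likelihood; the minimal ExactGP subclass and toy data are illustrative assumptions, not the test's definitions:

# A fixed-noise exact GP conditioned on extra observations via get_fantasy_model.
import torch
import gpytorch
from gpytorch.likelihoods import FixedNoiseGaussianLikelihood

class ToyExactGP(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        return gpytorch.distributions.MultivariateNormal(self.mean_module(x), self.covar_module(x))

train_x = torch.linspace(0, 1, 10)
train_y = torch.sin(6 * train_x)
noise = torch.full_like(train_y, 1e-3)

likelihood = FixedNoiseGaussianLikelihood(noise=noise)
model = ToyExactGP(train_x, train_y, likelihood)

model.eval()
likelihood.eval()
_ = model(torch.linspace(0, 1, 5))  # populate prediction caches; required before fantasizing

# Condition on two new points without re-fitting; the fixed-noise likelihood needs the
# noise of the new observations passed through the `noise` keyword.
new_x = torch.tensor([1.1, 1.2])
new_y = torch.sin(6 * new_x)
fant_model = model.get_fantasy_model(new_x, new_y, noise=torch.full_like(new_y, 1e-3))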
Example #6
    def test_kissgp_gp_fast_pred_var(self):
        with gpytorch.settings.fast_pred_var(), gpytorch.settings.debug(False):
            train_x, train_y, test_x, test_y = make_data()
            likelihood = FixedNoiseGaussianLikelihood(torch.ones(100) * 0.001)
            gp_model = GPRegressionModel(train_x, train_y, likelihood)
            mll = gpytorch.mlls.ExactMarginalLogLikelihood(
                likelihood, gp_model)

            # Optimize the model
            gp_model.train()
            likelihood.train()

            optimizer = optim.Adam(list(gp_model.parameters()) +
                                   list(likelihood.parameters()),
                                   lr=0.1)
            optimizer.n_iter = 0
            for _ in range(25):
                optimizer.zero_grad()
                output = gp_model(train_x)
                loss = -mll(output, train_y)
                loss.backward()
                optimizer.n_iter += 1
                optimizer.step()

            for param in gp_model.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)
            for param in likelihood.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)

            # Test the model
            gp_model.eval()
            likelihood.eval()
            # Set the cache
            test_function_predictions = likelihood(gp_model(train_x))

            # Now bump up the likelihood to something huge
            # This will make it easy to calculate the variance
            likelihood.initialize(noise=3.)
            test_function_predictions = likelihood(gp_model(train_x))

            noise = likelihood.noise
            var_diff = (test_function_predictions.variance - noise).abs()
            self.assertLess(torch.max(var_diff / noise), 0.05)
Example #7
    def test_posterior_latent_gp_and_likelihood_without_optimization(self, cuda=False):
        train_x, test_x, train_y, test_y = self._get_data(cuda=cuda)
        with gpytorch.settings.debug(False):
            # We're manually going to set the hyperparameters to be ridiculous
            likelihood = FixedNoiseGaussianLikelihood(torch.ones(11) * 1e-8)
            gp_model = ExactGPModel(train_x, train_y, likelihood)
            # Update lengthscale prior to accommodate extreme parameters
            gp_model.rbf_covar_module.initialize(lengthscale=exp(-6))
            gp_model.mean_module.initialize(constant=0)

            if cuda:
                gp_model.cuda()
                likelihood.cuda()

            # Compute posterior distribution
            gp_model.eval()
            likelihood.eval()

            # Let's see how our model does, conditioned with weird hyperparams
            # The posterior should fit all the data
            function_predictions = likelihood(gp_model(train_x))

            self.assertLess(torch.norm(function_predictions.mean - train_y), 1e-3)
            self.assertLess(torch.norm(function_predictions.variance), 5e-3)

            # It shouldn't fit much else though
            test_function_predictions = gp_model(torch.tensor([1.1]).type_as(test_x))

            self.assertLess(torch.norm(test_function_predictions.mean - 0), 1e-4)
            self.assertLess(torch.norm(test_function_predictions.variance - gp_model.covar_module.outputscale), 1e-4)
Example #8
    def test_kissgp_gp_mean_abs_error_cuda(self):
        if not torch.cuda.is_available():
            return
        with least_used_cuda_device():
            train_x, train_y, test_x, test_y = make_data(cuda=True)
            likelihood = FixedNoiseGaussianLikelihood(torch.ones(100) *
                                                      0.001).cuda()
            gp_model = GPRegressionModel(train_x, train_y, likelihood).cuda()
            mll = gpytorch.mlls.ExactMarginalLogLikelihood(
                likelihood, gp_model)

            # Optimize the model
            gp_model.train()
            likelihood.train()

            optimizer = optim.Adam(list(gp_model.parameters()) +
                                   list(likelihood.parameters()),
                                   lr=0.1)
            optimizer.n_iter = 0
            with gpytorch.settings.debug(False):
                for _ in range(25):
                    optimizer.zero_grad()
                    output = gp_model(train_x)
                    loss = -mll(output, train_y)
                    loss.backward()
                    optimizer.n_iter += 1
                    optimizer.step()

                for param in gp_model.parameters():
                    self.assertTrue(param.grad is not None)
                    self.assertGreater(param.grad.norm().item(), 0)
                for param in likelihood.parameters():
                    self.assertTrue(param.grad is not None)
                    self.assertGreater(param.grad.norm().item(), 0)

                # Test the model
                gp_model.eval()
                likelihood.eval()
                test_preds = likelihood(gp_model(test_x)).mean
                mean_abs_error = torch.mean(torch.abs(test_y - test_preds))

            self.assertLess(mean_abs_error.squeeze().item(), 0.02)
Example #9
    def test_kissgp_gp_mean_abs_error(self):
        # This test throws a warning because the fixed noise likelihood gets the wrong input
        warnings.simplefilter("ignore", GPInputWarning)

        train_x, train_y, test_x, test_y = make_data()
        likelihood = FixedNoiseGaussianLikelihood(torch.ones(100) * 0.001)
        gp_model = GPRegressionModel(train_x, train_y, likelihood)
        mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, gp_model)

        # Optimize the model
        gp_model.train()
        likelihood.train()

        optimizer = optim.Adam(list(gp_model.parameters()) +
                               list(likelihood.parameters()),
                               lr=0.1)
        optimizer.n_iter = 0
        with gpytorch.settings.debug(False):
            for _ in range(25):
                optimizer.zero_grad()
                output = gp_model(train_x)
                loss = -mll(output, train_y)
                loss.backward()
                optimizer.n_iter += 1
                optimizer.step()

            for param in gp_model.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)
            for param in likelihood.parameters():
                self.assertTrue(param.grad is not None)
                self.assertGreater(param.grad.norm().item(), 0)

            # Test the model
            gp_model.eval()
            likelihood.eval()

            test_preds = likelihood(gp_model(test_x)).mean
            mean_abs_error = torch.mean(torch.abs(test_y - test_preds))

        self.assertLess(mean_abs_error.squeeze().item(), 0.05)
Example #10
	def __init__(self, train_X: Tensor, train_Y: Tensor, options: dict, which_type: Optional[str] = "obj") -> None:
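		# Exact GP with fixed, user-specified observation noise (noise_std from `options`), a zero
		# mean, and an RBF-ARD kernel whose lengthscale/outputscale hyperpriors come from `options`.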

		# Error checking:
		assert train_Y.dim() == 1, "train_Y is required to be 1D"
		self._validate_tensor_args(X=train_X, Y=train_Y[:,None]) # Only for this function, train_Y must be 2D (this must be a bug in botorch)

		# Dimensionality of the input space:
		self.dim = train_X.shape[-1]

		# Model identity:
		self.iden = "GP_model_{0:s}".format(which_type)

		# Likelihood:
		noise_std = options["noise_std_obj"]
		lik = FixedNoiseGaussianLikelihood(noise=torch.full_like(train_Y, noise_std**2))

		# Initialize parent class:
		super().__init__(train_X, train_Y, lik)

		# Obtain hyperprior for lengthscale and outputscale:
		# NOTE: The mean (zero) and the model noise are fixed
		lengthscale_prior, outputscale_prior = extract_prior(options,which_type)

		# Initialize prior mean:
		# self.mean_module = ConstantMean()
		self.mean_module = ZeroMean()

		# Initialize covariance function:
		# base_kernel = RBFKernel(ard_num_dims=train_X.shape[-1],lengthscale_prior=GammaPrior(3.0, 6.0)) # original
		# self.covar_module = ScaleKernel(base_kernel=base_kernel,outputscale_prior=GammaPrior(2.0, 0.15)) # original
		base_kernel = RBFKernel(ard_num_dims=self.dim,lengthscale_prior=lengthscale_prior,lengthscale_constraint=GreaterThan(1e-2))
		self.covar_module = ScaleKernel(base_kernel=base_kernel,outputscale_prior=outputscale_prior)

		# Make sure we're on the right device/dtype
		self.to(train_X)

		# Instantiate the gradient model:
		self.model_grad = GPmodelWithGrad(dim=self.dim)
Example #11
    def create_likelihood(self):
        noise = 0.1 + torch.rand(2, 3, 5)
        return FixedNoiseGaussianLikelihood(noise=noise)
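
create_likelihood above builds the likelihood with a batched noise tensor. A minimal sketch of what that implies, assuming gpytorch broadcasts the (2, 3)-batched noise over a matching batched MultivariateNormal:

# Batched fixed noise: leading dimensions are batch dimensions, the trailing one indexes points.
import torch
from gpytorch.distributions import MultivariateNormal
from gpytorch.likelihoods import FixedNoiseGaussianLikelihood

noise = 0.1 + torch.rand(2, 3, 5)                 # batch shape (2, 3), 5 points per batch
lkhd = FixedNoiseGaussianLikelihood(noise=noise)

mean = torch.zeros(2, 3, 5)
covar = torch.eye(5).repeat(2, 3, 1, 1)           # unit prior variance in every batch
mvn = MultivariateNormal(mean, covar)

out = lkhd(mvn)
print(out.variance.shape)                         # torch.Size([2, 3, 5])
print(torch.allclose(out.variance, 1 + noise))    # expected: True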
Example #12
def run(obs, params_true, device='cpu'):
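    # Bayesian-optimisation loop: seed with Latin-hypercube samples of the PDE parameters
    # (a, b, k), score each by simulating the Cahn-Hilliard model against the observations,
    # then repeatedly fit a fixed-noise exact GP to the losses and pick the next candidate
    # by maximising the acquisition function over a dense evaluation grid.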
    device = safe_cast(torch.device, device)

    dx, NX = PARAM_DX, PARAM_MESH_RES_SPACE

    ts = torch.arange(PARAM_MESH_RES_TIME, device=device)

    priors_uniform = priors()

    y = torch.tensor(obs['Ss'], device=device)

    def simulate(params):
        _theta = {'a':params[0], 'b':params[1], 'k':params[2]}
        sim_pde = LandauCahnHilliard(
            params = _theta,
            M      = PARAM_DT,
            dx     = dx,
            device = device
        )
        loss_fn = Evaluator(sim_pde, loss)

        return loss_fn

    pgd = lhs(3, samples=PARAM_INIT_EVAL) # calculate initial samples from latin hypercube
    xs, ys = [],[]

    for j in range(PARAM_INIT_EVAL):
        xk = torch.stack(
            [
                (priors_uniform[k][1] - priors_uniform[k][0])
                * torch.tensor(pgd[j, i], device=device, dtype=torch.float32)
                + priors_uniform[k][0]
                for i, k in enumerate(('a', 'b', 'k'))
            ],
            0,
        )
        xs.append(xk)
#    ell, params = simulate(params)

    phi0 = (0.2 * torch.rand((NX, NX), device=device)).view(-1,1,NX,NX)

    with torch.no_grad():
        for j in range(PARAM_INIT_EVAL):
            params = xs[j]
            loss_fn = simulate(params)
            ys.append(loss_fn(phi0, ts, y, dx))

    x_init, y_init = torch.stack(xs), torch.stack(ys)
    print(y_init)
    N = PARAM_SEARCH_RES
    x_eval = torch.cat([x.reshape(-1,1) for x in torch.meshgrid(
        *[torch.linspace(priors_uniform[k][0], priors_uniform[k][1], N)\
            for k in priors_uniform]
    )],1)

    x_train = x_init
    y_train = y_init

    for i in range(PARAM_MAX_EVAL - PARAM_INIT_EVAL):
        for ntry in range(5):
            model = ExactGPModel(
                x_train, y_train,
                FixedNoiseGaussianLikelihood(
                    noise=1e-2*torch.ones(len(x_train))
                )
            )
            try:
                optimise(model, method='adam', max_iter=1000)
                break
            except Exception as err:
                print('attempt %d failed' % ntry)
                if ntry == 4:
                    raise err


        u = acq(y_train.min(), model, x_eval)
        xn = x_eval[u.argmax(),:]
        x_eval = torch.cat([x_eval[0:u.argmax(),:], x_eval[u.argmax()+1:,:]])
        # print(x_eval.shape)
        loss_fn = simulate(xn)
        yn = loss_fn(phi0, ts, y, dx)
        x_train = torch.cat([x_train, xn.reshape(1,-1)])
        y_train = torch.stack([*y_train, yn.detach()])
        print(i)

    return (x_train, y_train)
Example #13
    def __init__(self,
                 dim: int,
                 train_X: Tensor,
                 train_Y: Tensor,
                 options: dict,
                 which_type: Optional[str] = "obj") -> None:
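        # Exact GP with fixed (user-specified) evaluation noise, a zero mean, and a Matern-5/2
        # ARD kernel; hyperparameters are optimised with nlopt (LN_BOBYQA by default) under
        # hand-specified hyperpriors instead of gpytorch's usual gradient-based training loop.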

        self.dim = dim

        if len(train_Y) == 0:  # No data case
            train_X = None
            train_Y = None
        else:
            # Error checking:
            assert train_Y.dim() == 1, "train_Y is required to be 1D"
            self._validate_tensor_args(
                X=train_X, Y=train_Y[:, None]
            )  # Only for this function, train_Y must be 2D (this must be a bug in botorch)

        print("\n")
        logger.info("### Initializing GP model for objective f(x) ###")

        # Likelihood:
        noise_std = options.hyperpars.noise_std.value
        if train_Y is not None:
            lik = FixedNoiseGaussianLikelihood(
                noise=torch.full_like(train_Y, noise_std**2))
        else:
            lik = FixedNoiseGaussianLikelihood(
                noise=torch.tensor([noise_std**2], device=device, dtype=dtype))

        # Initialize parent class:
        super().__init__(train_X, train_Y, lik)

        # # Obtain hyperprior for lengthscale and outputscale:
        # # NOTE: The mean (zero) and the model noise are fixed
        # lengthscale_prior, outputscale_prior = extract_prior(options.hyperpriors)

        # Initialize hyperpriors using scipy because gpytorch's gamma and beta distributions do not have the inverse CDF
        hyperpriors = dict(
            lengthscales=eval(options.hyperpars.lenthscales.prior),
            outputscale=eval(options.hyperpars.outputscale.prior))

        # Index hyperparameters:
        self.idx_hyperpars = dict(lengthscales=list(range(0, self.dim)),
                                  outputscale=[self.dim])
        self.dim_hyperpars = sum(
            [len(val) for val in self.idx_hyperpars.values()])

        # Get bounds:
        self.hyperpars_bounds = self._get_hyperparameters_bounds(hyperpriors)
        logger.info("hyperpars_bounds:" + str(self.hyperpars_bounds))

        # Initialize prior mean:
        # self.mean_module = ConstantMean()
        self.mean_module = ZeroMean()

        # Initialize covariance function:
        # base_kernel = RBFKernel(ard_num_dims=train_X.shape[-1],lengthscale_prior=GammaPrior(3.0, 6.0)) # original
        # self.covar_module = ScaleKernel(base_kernel=base_kernel,outputscale_prior=GammaPrior(2.0, 0.15)) # original
        # base_kernel = RBFKernel(ard_num_dims=self.dim,lengthscale_prior=lengthscale_prior,lengthscale_constraint=GreaterThan(1e-2))
        base_kernel = MaternKernel(nu=2.5,
                                   ard_num_dims=self.dim,
                                   lengthscale=0.1 * torch.ones(self.dim))
        self.covar_module = ScaleKernel(base_kernel=base_kernel)

        self.disp_info_scipy_opti = True
        # self.method = "L-BFGS-B"
        self.method = "LN_BOBYQA"
        # self.method = 'trust-constr'

        # Get a hyperparameter sample within bounds (not the same as sampling from the corresponding priors):
        hyperpars_sample = self._sample_hyperparameters_within_bounds(
            Nsamples=1).squeeze(0)
        self.covar_module.outputscale = hyperpars_sample[
            self.idx_hyperpars["outputscale"]]
        self.covar_module.base_kernel.lengthscale = hyperpars_sample[
            self.idx_hyperpars["lengthscales"]]
        self.noise_std = options.hyperpars.noise_std.value  # The evaluation noise is fixed, and given by the user

        # Initialize marginal log likelihood for the GPCR model.
        # mll_objective is callable
        # MLLGPCR can internally modify the model hyperparameters, and will do so throughout the optimization routine
        self.mll_objective = MLLGP(model_gp=self,
                                   likelihood_gp=self.likelihood,
                                   hyperpriors=hyperpriors)

        # Define nlopt optimizer:
        self.opti_hyperpars = OptimizationNonLinear(
            dim=self.dim_hyperpars,
            fun_obj=self.mll_objective,
            algo_str=self.method,
            tol_x=1e-4,
            Neval_max_local_optis=options.hyperpars.optimization.Nmax_evals,
            bounds=self.hyperpars_bounds,
            what2optimize_str="GP hyperparameters")

        # Make sure we're on the right device/dtype
        if train_Y is not None:
            self.to(train_X)

        self.Nrestarts = options.hyperpars.optimization.Nrestarts

        self._update_hyperparameters()

        self.eval()
Example #14
def main(args):
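    # Active-learning loop with a streaming variational GP and fixed observation noise:
    # fit on the initial pool, then at each step acquire a batch of points (max posterior
    # variance, max test-set variance, or random), rebuild the FixedNoiseGaussianLikelihood
    # with the noise of all points queried so far, refit the model, and log the test RMSE.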
    if args.cuda and torch.cuda.is_available():
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")

    init_dict, train_dict, test_dict = prepare_data(args.data_loc,
                                                    args.num_init,
                                                    args.num_total,
                                                    test_is_year=False,
                                                    seed=args.seed)
    init_x, init_y, init_y_var = (
        init_dict["x"].to(device),
        init_dict["y"].to(device),
        init_dict["y_var"].to(device),
    )
    train_x, train_y, train_y_var = (
        train_dict["x"].to(device),
        train_dict["y"].to(device),
        train_dict["y_var"].to(device),
    )
    test_x, test_y, test_y_var = (
        test_dict["x"].to(device),
        test_dict["y"].to(device),
        test_dict["y_var"].to(device),
    )

    likelihood = FixedNoiseGaussianLikelihood(noise=init_y_var)
    grid_pts = create_grid(grid_sizes=[30, 30],
                           grid_bounds=torch.tensor([[0., 1.], [0., 1.]]))
    induc_points = torch.cat(
        [x.reshape(-1, 1) for x in torch.meshgrid(grid_pts)], dim=-1)

    model = VariationalGPModel(
        inducing_points=induc_points,
        mean_module=gpytorch.means.ZeroMean(),
        covar_module=ScaleKernel(
            MaternKernel(
                ard_num_dims=2,
                nu=0.5,
                lengthscale_prior=GammaPrior(3.0, 6.0),
            ),
            outputscale_prior=GammaPrior(2.0, 0.15),
        ),
        streaming=True,
        likelihood=likelihood,
        beta=args.beta,
        learn_inducing_locations=args.learn_inducing,
    ).to(device)
    mll = VariationalELBO(model.likelihood,
                          model,
                          beta=args.beta,
                          num_data=args.num_init)

    print("---- Fitting initial model ----")
    start = time.time()
    model.train()
    model.zero_grad()
    optimizer = torch.optim.Adam(model.parameters(), lr=10 * args.lr_init)
    model, loss = fit_variational_model(mll,
                                        model,
                                        optimizer,
                                        init_x,
                                        init_y,
                                        maxiter=1000)
    end = time.time()
    print("Elapsed fitting time: ", end - start)

    print("--- Now computing initial RMSE")
    model.eval()
    with gpytorch.settings.skip_posterior_variances(True):
        test_pred = model(test_x)
        pred_rmse = ((test_pred.mean - test_y)**2).mean().sqrt()

    print("---- Initial RMSE: ", pred_rmse.item())

    all_outputs = []
    start_ind = init_x.shape[0]
    end_ind = int(start_ind + args.batch_size)

    current_x = init_x
    current_y = init_y
    current_y_var = init_y_var

    for step in range(args.num_steps):
        if step > 0 and step % 25 == 0:
            print("Beginning step ", step)

        total_time_step_start = time.time()

        if step > 0:
            print("---- Fitting model ----")
            start = time.time()
            model.train()
            model.zero_grad()
            model.likelihood = FixedNoiseGaussianLikelihood(current_y_var)
            mll = VariationalELBO(model.likelihood,
                                  model,
                                  beta=args.beta,
                                  num_data=args.num_init)
            optimizer = torch.optim.Adam(model.parameters(),
                                         lr=args.lr_init * 0.99**step)
            model, loss = fit_variational_model(mll,
                                                model,
                                                optimizer,
                                                current_x,
                                                current_y,
                                                maxiter=300)

            model.zero_grad()
            end = time.time()
            print("Elapsed fitting time: ", end - start)
            # print("Named parameters: ", list(model.named_parameters()))

        if args.acqf == "max_post_var" and not args.random:
            candidates, acq_value = generate_candidates(model,
                                                        args.batch_size,
                                                        device,
                                                        maxiter=300)
        elif args.acqf == "max_test_var" and not args.random:
            model.eval()
            vals, inds = model(test_x).variance.sort()
            acq_value = vals[-args.batch_size:].mean().detach()
            candidates = test_x[inds[-args.batch_size:]]
        else:
            candidates = torch.rand(args.batch_size,
                                    train_x.shape[-1],
                                    device=device,
                                    dtype=train_x.dtype)
            acq_value = torch.zeros(1)
            model.eval()
            _ = model(test_x[:10])  # to init caches

        print("---- Finished optimizing; now querying dataset ---- ")
        with torch.no_grad():
            covar_dists = model.covar_module(candidates, train_x)
            nearest_points = covar_dists.evaluate().argmax(dim=-1)
            new_x = train_x[nearest_points]
            new_y = train_y[nearest_points]
            new_y_var = train_y_var[nearest_points]

            todrop = torch.tensor(
                [x in nearest_points for x in range(train_x.shape[0])])
            train_x, train_y, train_y_var = train_x[~todrop], train_y[
                ~todrop], train_y_var[~todrop]
            print("New train_x shape", train_x.shape)
            print("--- Now updating model with simulator ----")
            current_x = torch.cat((current_x, new_x), dim=0)
            current_y = torch.cat((current_y, new_y), dim=0)
            current_y_var = torch.cat((current_y_var, new_y_var), dim=0)

        print("--- Now computing updated RMSE")
        model.eval()
        test_pred = model(test_x)
        pred_rmse = ((test_pred.mean.view(-1) -
                      test_y.view(-1))**2).mean().sqrt()
        pred_avg_variance = test_pred.variance.mean()

        total_time_step_elapsed_time = time.time() - total_time_step_start
        step_output_list = [
            total_time_step_elapsed_time,
            acq_value.item(),
            pred_rmse.item(),
            pred_avg_variance.item(),
            loss.item()
        ]
        print("Step RMSE: ", pred_rmse)
        all_outputs.append(step_output_list)

        start_ind = end_ind
        end_ind = int(end_ind + args.batch_size)

    output_dict = {
        "model_state_dict": model.cpu().state_dict(),
        "queried_points": {
            'x': current_x,
            'y': current_y
        },
        "results": DataFrame(all_outputs)
    }
    torch.save(output_dict, args.output)