def __init__(self, beta_min, beta_prior=None, **kwargs): """ Initialisation. Parameters ---------- :param beta_min: minimum value of the inverse square lengthscale parameter beta Optional parameters ------------------- :param beta_prior: prior on the parameter beta :param kwargs: additional arguments """ super(SphereGaussianKernel, self).__init__(has_lengthscale=False, **kwargs) self.beta_min = beta_min # Add beta parameter, corresponding to the inverse of the lengthscale parameter. beta_num_dims = 1 self.register_parameter(name="raw_beta", parameter=torch.nn.Parameter( torch.zeros(*self.batch_shape, 1, beta_num_dims))) if beta_prior is not None: self.register_prior("beta_prior", beta_prior, lambda: self.beta, lambda v: self._set_beta(v)) # A GreaterThan constraint is defined on the lengthscale parameter to guarantee positive-definiteness. # The value of beta_min can be determined e.g. experimentally. self.register_constraint("raw_beta", GreaterThan(self.beta_min))
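The `beta_prior` registration above references `self.beta` and `self._set_beta`, which are not shown in this snippet. A minimal sketch of what those accessors typically look like in GPyTorch kernels that follow the raw-parameter/constraint pattern (names inferred from the snippet, not copied from the original source):

    @property
    def beta(self):
        # Apply the registered constraint to map the raw parameter to beta > beta_min.
        return self.raw_beta_constraint.transform(self.raw_beta)

    @beta.setter
    def beta(self, value):
        self._set_beta(value)

    def _set_beta(self, value):
        if not torch.is_tensor(value):
            value = torch.as_tensor(value).to(self.raw_beta)
        # Store the inverse-transformed value so beta round-trips through the constraint.
        self.initialize(raw_beta=self.raw_beta_constraint.inverse_transform(value))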
def __init__(self, test_data, args): # data buffer, only store training data, test_data will only be stored in GP model before the model is trained self.n = 0 self.data = None self.index_list = [] self.norm = 1.0 self.previous_loss = CUDA(torch.tensor(np.inf)) self.trigger_training = CUDA(torch.tensor(1e-3)) self.lr = args.lr self.state_dim = args.state_dim self.action_dim = args.action_dim self.input_dim = self.state_dim + self.action_dim self.gp_iter = args.gp_iter # prior of the kernel parameters # [NOTE] these prior parameters should be similar to the estimated parameters of real data # if lengthscale is too large, it will be too difficult to create new components # if lengthscale is too small, it will be too easy to create new components # if noise_covar is too large, the prediction will be inaccurate # if noise_covar is too small, the conjugate gradient will not converge, causing numerical problems in the prediction self.param = [ 1e-5, # noise_covar initialize and constraint 0.0, # constant initialize 0.7, # outputscale initialize 1.0, # [lengthscale initialize] 100.0, # lengthscale_constraint 0.0001 # outputscale_constraint ] self.param = CUDA(torch.tensor(self.param)) # initialize model and likelihood model_list = [] likelihood_list = [] for m_i in range(self.state_dim): likelihood = CUDA( gpytorch.likelihoods.GaussianLikelihood( noise_constraint=GreaterThan(self.param[0]))) model = CUDA( ExactGPR(None, None, likelihood, self.input_dim, self.param)) model.reset_parameters() likelihood_list.append(model.likelihood) model_list.append(model) # initialize model list self.model = gpytorch.models.IndependentModelList(*model_list) self.likelihood = gpytorch.likelihoods.LikelihoodList(*likelihood_list) # initialize optimizer self.optimizer = torch.optim.Adam([{ 'params': self.model.parameters() }], lr=self.lr) self.mll = gpytorch.mlls.SumMarginalLogLikelihood( self.likelihood, self.model) # change the flag self.model.eval() self.likelihood.eval()
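The constructor above only builds the model list, likelihood, optimizer, and MLL. A minimal sketch of the optimization step this setup is typically paired with, assuming training data has already been assigned to the sub-models (e.g. via `set_train_data`); this follows the standard GPyTorch `IndependentModelList` pattern and is not copied from the original source:

    def train_step(self):
        self.model.train()
        self.likelihood.train()
        for _ in range(self.gp_iter):
            self.optimizer.zero_grad()
            # IndependentModelList takes one input set per sub-model and returns a list of MVNs.
            output = self.model(*self.model.train_inputs)
            loss = -self.mll(output, self.model.train_targets)
            loss.backward()
            self.optimizer.step()
        self.model.eval()
        self.likelihood.eval()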
def load_mcmc_samples(self, mcmc_samples: Dict[str, Tensor]) -> None: r"""Load the MCMC hyperparameter samples into the model. This method will be called by `fit_fully_bayesian_model_nuts` when the model has been fitted in order to create a batched SingleTaskGP model. """ tkwargs = {"device": self.train_X.device, "dtype": self.train_X.dtype} num_mcmc_samples = len(mcmc_samples["mean"]) batch_shape = torch.Size([num_mcmc_samples]) self.train_X = self.train_X.unsqueeze(0).expand( num_mcmc_samples, self.train_X.shape[0], -1 ) self.mean_module = ConstantMean(batch_shape=batch_shape).to(**tkwargs) self.covar_module = ScaleKernel( base_kernel=MaternKernel( ard_num_dims=self.train_X.shape[-1], batch_shape=batch_shape, ), batch_shape=batch_shape, ).to(**tkwargs) if self.train_Yvar is not None: self.likelihood = FixedNoiseGaussianLikelihood( noise=self.train_Yvar, batch_shape=batch_shape ).to(**tkwargs) else: self.likelihood = GaussianLikelihood( batch_shape=batch_shape, noise_constraint=GreaterThan(MIN_INFERRED_NOISE_LEVEL), ).to(**tkwargs) self.likelihood.noise_covar.noise = ( mcmc_samples["noise"] .detach() .clone() .view(self.likelihood.noise_covar.noise.shape) .clamp_min(MIN_INFERRED_NOISE_LEVEL) .to(**tkwargs) ) self.covar_module.base_kernel.lengthscale = ( mcmc_samples["lengthscale"] .detach() .clone() .view(self.covar_module.base_kernel.lengthscale.shape) .to(**tkwargs) ) self.covar_module.outputscale = ( mcmc_samples["outputscale"] .detach() .clone() .view(self.covar_module.outputscale.shape) .to(**tkwargs) ) self.mean_module.constant.data = ( mcmc_samples["mean"] .detach() .clone() .view(self.mean_module.constant.shape) .to(**tkwargs) )
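For context, a hedged usage sketch of the fitting path mentioned in the docstring, assuming this method lives on a fully Bayesian model such as BoTorch's `SaasFullyBayesianSingleTaskGP` (the tensors and NUTS settings below are placeholders, not taken from the snippet):

    from botorch.fit import fit_fully_bayesian_model_nuts
    from botorch.models import SaasFullyBayesianSingleTaskGP

    model = SaasFullyBayesianSingleTaskGP(train_X, train_Y)
    # Runs NUTS, then calls model.load_mcmc_samples(...) with the drawn hyperparameters.
    fit_fully_bayesian_model_nuts(model, warmup_steps=256, num_samples=128, thinning=16)
    posterior = model.posterior(test_X)  # batched over the retained MCMC samples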
def test_module_bounds(self, cuda=False): device = torch.device("cuda") if cuda else torch.device("cpu") for dtype in (torch.float, torch.double): # get a test module train_x = torch.tensor([[1.0, 2.0, 3.0]], device=device, dtype=dtype) train_y = torch.tensor([4.0], device=device, dtype=dtype) likelihood = GaussianLikelihood( noise_constraint=GreaterThan(1e-5, transform=None) ) model = ExactGP(train_x, train_y, likelihood) model.covar_module = RBFKernel(ard_num_dims=3) model.mean_module = ConstantMean() model.to(device=device, dtype=dtype) mll = ExactMarginalLogLikelihood(likelihood, model) # test the basic case x, pdict, bounds = module_to_array( module=mll, bounds={"model.covar_module.raw_lengthscale": (0.1, None)} ) self.assertTrue(np.array_equal(x, np.zeros(5))) expected_sizes = { "likelihood.noise_covar.raw_noise": torch.Size([1]), "model.covar_module.raw_lengthscale": torch.Size([1, 3]), "model.mean_module.constant": torch.Size([1]), } self.assertEqual(set(pdict.keys()), set(expected_sizes.keys())) for pname, val in pdict.items(): self.assertEqual(val.dtype, dtype) self.assertEqual(val.shape, expected_sizes[pname]) self.assertEqual(val.device.type, device.type) lower_exp = np.full_like(x, 0.1) lower_exp[_get_index(pdict, "model.mean_module.constant")] = -np.inf lower_exp[_get_index(pdict, "likelihood.noise_covar.raw_noise")] = 1e-5 self.assertTrue(np.allclose(bounds[0], lower_exp)) self.assertTrue(np.equal(bounds[1], np.full_like(x, np.inf)).all())
def test_fit_gpytorch_model_singular(self): options = {"disp": False, "maxiter": 5} for dtype in (torch.float, torch.double): X_train = torch.ones(2, 2, device=self.device, dtype=dtype) Y_train = torch.zeros(2, 1, device=self.device, dtype=dtype) test_likelihood = GaussianLikelihood( noise_constraint=GreaterThan(-1e-7, transform=None, initial_value=0.0) ) gp = SingleTaskGP(X_train, Y_train, likelihood=test_likelihood) mll = ExactMarginalLogLikelihood(gp.likelihood, gp) mll.to(device=self.device, dtype=dtype) # this will do multiple retries (and emit warnings, which is desired) with warnings.catch_warnings(record=True) as ws, settings.debug(True): fit_gpytorch_model(mll, options=options, max_retries=2) self.assertTrue( any(issubclass(w.category, NumericalWarning) for w in ws) ) # ensure that we fail if noise ensures that jitter does not help gp.likelihood = GaussianLikelihood( noise_constraint=Interval(-2, -1, transform=None, initial_value=-1.5) ) mll = ExactMarginalLogLikelihood(gp.likelihood, gp) mll.to(device=self.device, dtype=dtype) with self.assertRaises(NotPSDError): fit_gpytorch_model(mll, options=options, max_retries=2) # ensure we can handle NaNErrors in the optimizer with mock.patch.object(SingleTaskGP, "__call__", side_effect=NanError): gp = SingleTaskGP(X_train, Y_train, likelihood=test_likelihood) mll = ExactMarginalLogLikelihood(gp.likelihood, gp) mll.to(device=self.device, dtype=dtype) fit_gpytorch_model( mll, options={"disp": False, "maxiter": 1}, max_retries=1 )
def test_fit_gpytorch_model_singular(self): options = {"disp": False, "maxiter": 5} for dtype in (torch.float, torch.double): X_train = torch.ones(2, 2, device=self.device, dtype=dtype) Y_train = torch.zeros(2, 1, device=self.device, dtype=dtype) test_likelihood = GaussianLikelihood( noise_constraint=GreaterThan(-1e-7, transform=None, initial_value=0.0) ) gp = SingleTaskGP(X_train, Y_train, likelihood=test_likelihood) mll = ExactMarginalLogLikelihood(gp.likelihood, gp) mll.to(device=self.device, dtype=dtype) # this will do multiple retries (and emit warnings, which is desired) with warnings.catch_warnings(record=True) as ws, settings.debug(True): fit_gpytorch_model(mll, options=options, max_retries=2) self.assertTrue( any(issubclass(w.category, NumericalWarning) for w in ws) ) # ensure that we fail if noise ensures that jitter does not help gp.likelihood = GaussianLikelihood( noise_constraint=Interval(-2, -1, transform=None, initial_value=-1.5) ) mll = ExactMarginalLogLikelihood(gp.likelihood, gp) mll.to(device=self.device, dtype=dtype) with self.assertLogs(level="DEBUG") as logs: fit_gpytorch_model(mll, options=options, max_retries=2) self.assertTrue(any("NotPSDError" in log for log in logs.output)) # ensure we can handle NaNErrors in the optimizer with mock.patch.object(SingleTaskGP, "__call__", side_effect=NanError): gp = SingleTaskGP(X_train, Y_train, likelihood=test_likelihood) mll = ExactMarginalLogLikelihood(gp.likelihood, gp) mll.to(device=self.device, dtype=dtype) fit_gpytorch_model( mll, options={"disp": False, "maxiter": 1}, max_retries=1 ) # ensure we catch NotPSDErrors with mock.patch.object(SingleTaskGP, "__call__", side_effect=NotPSDError): mll = self._getModel() with self.assertLogs(level="DEBUG") as logs: fit_gpytorch_model(mll, max_retries=2) for retry in [1, 2]: self.assertTrue( any( f"Fitting failed on try {retry} due to a NotPSDError." in log for log in logs.output ) ) # Failure due to optimization warning def optimize_w_warning(mll, **kwargs): warnings.warn("Dummy warning.", OptimizationWarning) return mll, None mll = self._getModel() with self.assertLogs(level="DEBUG") as logs, settings.debug(True): fit_gpytorch_model(mll, optimizer=optimize_w_warning, max_retries=2) self.assertTrue( any("Fitting failed on try 1." in log for log in logs.output) )
def argmax_posterior_mean(cands: to.Tensor, cands_values: to.Tensor, ddp_space: BoxSpace, num_restarts: int, num_samples: int) -> to.Tensor: """ Compute the GP input with the maximal posterior mean. :param cands: candidates a.k.a. x :param cands_values: observed values a.k.a. y :param ddp_space: space of the domain distribution parameters, indicates the lower and upper bound :param num_restarts: number of restarts for the optimization of the acquisition function :param num_samples: number of samples for the optimization of the acquisition function :return: un-normalized candidate with maximum posterior value a.k.a. x """ if not isinstance(cands, to.Tensor): raise pyrado.TypeErr(given=cands, expected_type=to.Tensor) if not isinstance(cands_values, to.Tensor): raise pyrado.TypeErr(given=cands_values, expected_type=to.Tensor) if not isinstance(ddp_space, BoxSpace): raise pyrado.TypeErr(given=ddp_space, expected_type=BoxSpace) # Normalize the input data and standardize the output data uc_projector = UnitCubeProjector( to.from_numpy(ddp_space.bound_lo).to(dtype=to.get_default_dtype()), to.from_numpy(ddp_space.bound_up).to(dtype=to.get_default_dtype()), ) cands_norm = uc_projector.project_to(cands) cands_values_stdized = standardize(cands_values) if cands_norm.shape[0] > cands_values.shape[0]: print_cbt( f"There are {cands.shape[0]} candidates but only {cands_values.shape[0]} evaluations. Ignoring " f"the candidates without evaluation for computing the argmax.", "y", ) cands_norm = cands_norm[:cands_values.shape[0], :] # Create and fit the GP model gp = SingleTaskGP(cands_norm, cands_values_stdized) gp.likelihood.noise_covar.register_constraint("raw_noise", GreaterThan(1e-5)) mll = ExactMarginalLogLikelihood(gp.likelihood, gp) fit_gpytorch_model(mll) # Find position with maximal posterior mean cand_norm, _ = optimize_acqf( acq_function=PosteriorMean(gp), bounds=to.stack( [to.zeros(ddp_space.flat_dim), to.ones(ddp_space.flat_dim)]).to(dtype=to.float32), q=1, num_restarts=num_restarts, raw_samples=num_samples, ) cand_norm = cand_norm.to(dtype=to.get_default_dtype()) cand = uc_projector.project_back(cand_norm.detach()) print_cbt(f"Converged to argmax of the posterior mean: {cand.numpy()}", "g", bright=True) return cand
def __init__(self, train_x, train_y, likelihood, input_dim, params): super(ExactGPR, self).__init__(train_x, train_y, likelihood) self.mean_module = gpytorch.means.ConstantMean() self.covar_module = gpytorch.kernels.ScaleKernel( gpytorch.kernels.RBFKernel(ard_num_dims=input_dim, lengthscale_constraint=LessThan( params[0])), outputscale_constraint=GreaterThan(params[1]))
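Only the constructor is shown above; a `gpytorch.models.ExactGP` subclass also needs a `forward` returning a `MultivariateNormal`. A minimal sketch of the method `ExactGPR` presumably defines (not copied from the original source):

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)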
def fit_model(self): """ If no state_dict exists, fits the model and saves the state_dict. Otherwise, constructs the model but uses the fit given by the state_dict. """ # read the data data_list = list() for i in range(1, 31): data_file = os.path.join(script_dir, "port_evals", "port_n=100_seed=%d" % i) data_list.append(torch.load(data_file)) # join the data together X = torch.cat([data_list[i]["X"] for i in range(len(data_list))], dim=0).squeeze(-2) Y = torch.cat([data_list[i]["Y"] for i in range(len(data_list))], dim=0).squeeze(-2) # fit GP noise_prior = GammaPrior(1.1, 0.5) noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate likelihood = GaussianLikelihood( noise_prior=noise_prior, batch_shape=[], noise_constraint=GreaterThan( 0.000005, # minimum observation noise assumed in the GP model transform=None, initial_value=noise_prior_mode, ), ) # We save the state dict to avoid fitting the GP every time which takes ~3 mins try: state_dict = torch.load( os.path.join(script_dir, "portfolio_surrogate_state_dict.pt")) model = SingleTaskGP(X, Y, likelihood, outcome_transform=Standardize(m=1)) model.load_state_dict(state_dict) except FileNotFoundError: model = SingleTaskGP(X, Y, likelihood, outcome_transform=Standardize(m=1)) mll = ExactMarginalLogLikelihood(model.likelihood, model) from time import time start = time() fit_gpytorch_model(mll) print("fitting took %s seconds" % (time() - start)) torch.save( model.state_dict(), os.path.join(script_dir, "portfolio_surrogate_state_dict.pt"), ) self.model = model
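A quick check of the prior-mode arithmetic used above for the initial noise value (the mode of a Gamma distribution with concentration > 1 is (concentration - 1) / rate):

    from gpytorch.priors import GammaPrior

    noise_prior = GammaPrior(1.1, 0.5)
    noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate  # (1.1 - 1) / 0.5 = 0.2
    assert abs(float(noise_prior_mode) - 0.2) < 1e-6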
def __init__(self, input_dim, feature_dim, label_dim, hidden_width, hidden_depth, n_inducing, batch_size, max_epochs_since_update, **kwargs): """ Args: input_dim (int) feature_dim (int): dimension of deep kernel features label_dim (int) hidden_depth (int) hidden_width (int or list) n_inducing (int): number of inducing points for variational approximation batch_size (int) max_epochs_since_update (int) """ params = locals() del params['self'] self.__dict__ = params super().__init__() noise_constraint = GreaterThan(1e-4) self.likelihood = GaussianLikelihood(batch_shape=torch.Size( [label_dim]), noise_constraint=noise_constraint) self.nn = FCNet(input_dim, output_dim=label_dim, hidden_width=hidden_width, hidden_depth=hidden_depth, batch_norm=True) self.batch_norm = torch.nn.BatchNorm1d(feature_dim) self.mean_module = ConstantMean(batch_shape=torch.Size([label_dim])) base_kernel = RBFKernel(batch_shape=torch.Size([label_dim]), ard_num_dims=feature_dim) self.covar_module = ScaleKernel(base_kernel, batch_shape=torch.Size([label_dim])) variational_dist = MeanFieldVariationalDistribution( num_inducing_points=n_inducing, batch_shape=torch.Size([label_dim])) inducing_points = torch.randn(n_inducing, feature_dim) self.variational_strategy = VariationalStrategy( self, inducing_points, variational_dist, learn_inducing_locations=True) # initialize preprocessors self.register_buffer("input_mean", torch.zeros(input_dim)) self.register_buffer("input_std", torch.ones(input_dim)) self.register_buffer("label_mean", torch.zeros(label_dim)) self.register_buffer("label_std", torch.ones(label_dim)) self._train_ckpt = deepcopy(self.state_dict()) self._eval_ckpt = deepcopy(self.state_dict())
def fit(self, x_train, y_train): # normalize parameter (=input) data x_train_norm = self.param_normalizer.project_to(x_train) # normalize the data y_train_norm = self.data_normalizer.standardize(y_train) self.gp = SingleTaskGP(x_train_norm, y_train_norm) self.gp.likelihood.noise_covar.register_constraint( "raw_noise", GreaterThan(1e-5)) mll = ExactMarginalLogLikelihood(self.gp.likelihood, self.gp) fit_gpytorch_model(mll) return self.gp
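A hedged companion sketch of prediction with the fitted GP, mirroring the normalization applied in `fit` (attribute names are taken from `fit` above; this method is not in the original class):

    def predict(self, x_test):
        x_test_norm = self.param_normalizer.project_to(x_test)
        self.gp.eval()
        with torch.no_grad():
            posterior = self.gp.posterior(x_test_norm)
        # Mean and variance are still in the standardized output space; undo the
        # standardization with self.data_normalizer if raw-scale values are needed.
        return posterior.mean, posterior.variance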
def cont_kernel_factory( batch_shape: torch.Size, ard_num_dims: int, active_dims: List[int], ) -> MaternKernel: return MaternKernel( nu=2.5, batch_shape=batch_shape, ard_num_dims=ard_num_dims, active_dims=active_dims, lengthscale_constraint=GreaterThan(1e-04), )
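A hedged usage sketch: such a factory is typically passed to BoTorch's `MixedSingleTaskGP`, which calls it with `batch_shape`, `ard_num_dims`, and `active_dims` for the continuous columns (the training tensors below are placeholders):

    import torch
    from botorch.models import MixedSingleTaskGP

    train_X = torch.cat(
        [torch.rand(20, 2, dtype=torch.double), torch.randint(3, (20, 1)).double()], dim=-1
    )
    train_Y = torch.randn(20, 1, dtype=torch.double)
    model = MixedSingleTaskGP(
        train_X, train_Y, cat_dims=[-1], cont_kernel_factory=cont_kernel_factory
    )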
def test_fit_gpytorch_model_singular(self, cuda=False): options = {"disp": False, "maxiter": 5} device = torch.device("cuda") if cuda else torch.device("cpu") for dtype in (torch.float, torch.double): X_train = torch.rand(2, 2, device=device, dtype=dtype) Y_train = torch.zeros(2, device=device, dtype=dtype) test_likelihood = GaussianLikelihood(noise_constraint=GreaterThan( -1.0, transform=None, initial_value=0.0)) gp = SingleTaskGP(X_train, Y_train, likelihood=test_likelihood) mll = ExactMarginalLogLikelihood(gp.likelihood, gp) mll.to(device=device, dtype=dtype) # this will do multiple retries (and emit warnings, which is desired) fit_gpytorch_model(mll, options=options, max_retries=2)
def __init__(self, train_x, train_y, likelihood, input_dim, params): super(SparseGPR, self).__init__(train_x, train_y, likelihood) self.mean_module = gpytorch.means.ConstantMean() self.covar_module = gpytorch.kernels.ScaleKernel( gpytorch.kernels.RBFKernel(ard_num_dims=input_dim, lengthscale_constraint=LessThan( params[0])), outputscale_constraint=GreaterThan(params[1])) # use some training data to initialize the inducing_module if train_x is None: train_x = CUDA(torch.zeros((1, input_dim))) self.inducing_module = gpytorch.kernels.InducingPointKernel( self.covar_module, inducing_points=train_x, likelihood=likelihood)
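As with the exact model above, only the constructor is shown; with an `InducingPointKernel` the `forward` pass typically routes the covariance through the inducing module rather than the base kernel (standard GPyTorch SGPR pattern; a sketch, not copied from the original class):

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.inducing_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)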
def __init__(self, dim, latent_dim, beta_min, beta_prior=None, **kwargs): """ Initialisation. Parameters ---------- :param dim: dimension of the ambient high-dimensional sphere manifold :param latent_dim: dimension of the latent low-dimensional sphere manifold :param beta_min: minimum value of the inverse square lengthscale parameter beta Optional parameters ------------------- :param beta_prior: prior on the parameter beta :param kwargs: additional arguments """ super(NestedSphereGaussianKernel, self).__init__(has_lengthscale=False, **kwargs) self.beta_min = beta_min self.dim = dim self.latent_dim = latent_dim # Add beta parameter, corresponding to the inverse of the lengthscale parameter. beta_num_dims = 1 self.register_parameter(name="raw_beta", parameter=torch.nn.Parameter(torch.zeros(*self.batch_shape, 1, beta_num_dims))) if beta_prior is not None: self.register_prior("beta_prior", beta_prior, lambda: self.beta, lambda v: self._set_beta(v)) # A GreaterThan constraint is defined on the lengthscale parameter to guarantee positive-definiteness. # The value of beta_min can be determined e.g. experimentally. self.register_constraint("raw_beta", GreaterThan(self.beta_min)) # Add projection parameters for d in range(self.dim, self.latent_dim, -1): # Axes parameters # Register axis_name = "raw_axis_S" + str(d) # axis = torch.zeros(1, d) # axis[:, 0] = 1 axis = torch.randn(1, d) axis = axis / torch.norm(axis) axis = axis.repeat(*self.batch_shape, 1, 1) self.register_parameter(name=axis_name, parameter=torch.nn.Parameter(axis)) # Corresponding manifold axis_manifold_name = "raw_axis_S" + str(d) + "_manifold" setattr(self, axis_manifold_name, pyman_man.Sphere(d)) # Distance to axis (constant), fixed at pi/2 self.distances_to_axis = [np.pi/2 *torch.ones(1, 1) for d in range(self.dim, self.latent_dim, -1)]
def _sample(self, candidates: Optional[np.array] = None) -> np.array: if len(self.X_observed) < self.num_initial_random_draws: return self.initial_sampler.sample(candidates=candidates) else: z_observed = torch.Tensor(self.transform_outputs(self.y_observed.numpy())) with torch.no_grad(): # both (n, 1) #mu_pred, sigma_pred = self.thompson_sampling.prior(self.X_observed) mu_pred, sigma_pred = self.initial_sampler.prior.predict(self.X_observed) mu_pred = torch.Tensor(mu_pred) sigma_pred = torch.Tensor(sigma_pred) # (n, 1) r_observed = residual_transform(z_observed, mu_pred, sigma_pred) # build and fit GP on residuals gp = SingleTaskGP( train_X=self.X_observed, train_Y=r_observed, likelihood=GaussianLikelihood(noise_constraint=GreaterThan(1e-3)), ) mll = ExactMarginalLogLikelihood(gp.likelihood, gp) fit_gpytorch_model(mll) acq = ShiftedExpectedImprovement( model=gp, best_f=z_observed.min(dim=0).values, mean_std_predictor=self.initial_sampler.prior.predict, maximize=False, ) if candidates is None: candidate, acq_value = optimize_acqf( acq, bounds=self.bounds_tensor, q=1, num_restarts=5, raw_samples=100, ) # import matplotlib.pyplot as plt # x = torch.linspace(-1, 1).unsqueeze(dim=-1) # x = torch.cat((x, x * 0), dim=1) # plt.plot(x[:, 0].flatten().tolist(), acq(x.unsqueeze(dim=1)).tolist()) # plt.show() return candidate[0] else: # (N,) ei = acq(torch.Tensor(candidates).unsqueeze(dim=-2)) return torch.Tensor(candidates[ei.argmax()])
def initialize_model(x, z, state_dict=None): n = z.shape[-1] gp_models = [] for i in range(n): y = z[..., i].unsqueeze(-1) gp_model = SingleTaskGP(train_X=x, train_Y=y) gp_model.likelihood.noise_covar.register_constraint( "raw_noise", GreaterThan(1e-5)) gp_models.append(gp_model) model_list = ModelListGP(*gp_models) mll = SumMarginalLogLikelihood(model_list.likelihood, model_list) if state_dict is not None: model_list.load_state_dict(state_dict) return mll, model_list
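A hedged usage sketch: fit the returned MLL with BoTorch, then warm-start a later refit from the previous hyperparameters via the `state_dict` argument (`x`, `z`, `x_new`, `z_new` are placeholders):

    from botorch import fit_gpytorch_model

    mll, model_list = initialize_model(x, z)
    fit_gpytorch_model(mll)
    # When new observations arrive, reuse the old hyperparameters as a warm start.
    mll, model_list = initialize_model(x_new, z_new, state_dict=model_list.state_dict())
    fit_gpytorch_model(mll)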
def __init__(self, train_x, train_y, likelihood, input_dim, params): super(ExactGPR, self).__init__(train_x, train_y, likelihood) self.params = params self.input_dim = input_dim self.lengthscale_prior = None #gpytorch.priors.GammaPrior(3.0, 6.0) self.outputscale_prior = None #gpytorch.priors.GammaPrior(2.0, 0.15) self.mean_module = gpytorch.means.ConstantMean() self.covar_module = gpytorch.kernels.ScaleKernel( gpytorch.kernels.RBFKernel( ard_num_dims=input_dim, lengthscale_prior=self.lengthscale_prior, lengthscale_constraint=LessThan(self.params[4])), outputscale_prior=self.outputscale_prior, outputscale_constraint=GreaterThan(self.params[5]))
def test_fit_gpytorch_model_singular(self): options = {"disp": False, "maxiter": 5} for dtype in (torch.float, torch.double): X_train = torch.rand(2, 2, device=self.device, dtype=dtype) Y_train = torch.zeros(2, 1, device=self.device, dtype=dtype) test_likelihood = GaussianLikelihood( noise_constraint=GreaterThan(-1.0, transform=None, initial_value=0.0) ) gp = SingleTaskGP(X_train, Y_train, likelihood=test_likelihood) mll = ExactMarginalLogLikelihood(gp.likelihood, gp) mll.to(device=self.device, dtype=dtype) # this will do multiple retries (and emit warnings, which is desired) with warnings.catch_warnings(record=True) as ws, settings.debug(True): fit_gpytorch_model(mll, options=options, max_retries=2) self.assertTrue( any(issubclass(w.category, OptimizationWarning) for w in ws) )
def __init__(self, dim, latent_dim, beta_min, beta_prior=None, **kwargs): """ Initialisation. Parameters ---------- :param dim: dimension of the ambient high-dimensional SPD manifold :param latent_dim: dimension of the latent low-dimensional SPD manifold :param beta_min: minimum value of the inverse square lengthscale parameter beta :param beta_prior: prior on the parameter beta :param kwargs: additional arguments """ super(NestedSpdAffineInvariantGaussianKernel, self).__init__(has_lengthscale=False, **kwargs) self.beta_min = beta_min self.dim = dim self.latent_dim = latent_dim # Add beta parameter, corresponding to the inverse of the lengthscale parameter. beta_num_dims = 1 self.register_parameter(name="raw_beta", parameter=torch.nn.Parameter( torch.zeros(*self.batch_shape, 1, beta_num_dims))) if beta_prior is not None: self.register_prior("beta_prior", beta_prior, lambda: self.beta, lambda v: self._set_beta(v)) # A GreaterThan constraint is defined on the lengthscale parameter to guarantee the positive-definiteness of the # kernel. # The value of beta_min can be determined e.g. experimentally. self.register_constraint("raw_beta", GreaterThan(self.beta_min)) # Add projection parameters self.raw_projection_matrix_manifold = pyman_man.Grassmann( self.dim, self.latent_dim) self.register_parameter( name="raw_projection_matrix", parameter=torch.nn.Parameter( torch.Tensor( self.raw_projection_matrix_manifold.rand()).repeat( *self.batch_shape, 1, 1)))
def argmax_posterior_mean(cands: to.Tensor, cands_values: to.Tensor, uc_normalizer: UnitCubeProjector, num_restarts: int, num_samples: int) -> to.Tensor: """ Compute the GP input with the maximal posterior mean. :param cands: candidates a.k.a. x :param cands_values: observed values a.k.a. y :param uc_normalizer: unit cube normalizer used during the experiments (can be recovered from the bounds) :param num_restarts: number of restarts for the optimization of the acquisition function :param num_samples: number of samples for the optimization of the acquisition function :return: un-normalized candidate with maximum posterior value a.k.a. x """ # Normalize the input data and standardize the output data cands_norm = uc_normalizer.project_to(cands) cands_values_stdized = standardize(cands_values) # Create and fit the GP model gp = SingleTaskGP(cands_norm, cands_values_stdized) gp.likelihood.noise_covar.register_constraint('raw_noise', GreaterThan(1e-5)) mll = ExactMarginalLogLikelihood(gp.likelihood, gp) fit_gpytorch_model(mll) # Find position with maximal posterior mean cand_norm, acq_value = optimize_acqf( acq_function=PosteriorMean(gp), bounds=to.stack([ to.zeros_like(uc_normalizer.bound_lo), to.ones_like(uc_normalizer.bound_up) ]), q=1, num_restarts=num_restarts, raw_samples=num_samples) cand = uc_normalizer.project_back(cand_norm.detach()) print_cbt(f'Converged to argmax of the posterior mean\n{cand.numpy()}', 'g', bright=True) return cand
def _sample(self, candidates: Optional[np.array] = None) -> np.array: if len(self.X_observed) < self.num_initial_random_draws: return self.initial_sampler.sample(candidates=candidates) else: z_observed = torch.Tensor( self.transform_outputs(self.y_observed.numpy())) # build and fit GP gp = SingleTaskGP( train_X=self.X_observed, train_Y=z_observed, # special likelihood for numerical Cholesky errors, following advice from # https://www.gitmemory.com/issue/pytorch/botorch/179/506276521 likelihood=GaussianLikelihood( noise_constraint=GreaterThan(1e-3)), ) mll = ExactMarginalLogLikelihood(gp.likelihood, gp) fit_gpytorch_model(mll) acq = self.expected_improvement( model=gp, best_f=z_observed.min(dim=0).values, ) if candidates is None: candidate, acq_value = optimize_acqf( acq, bounds=self.bounds_tensor, q=1, num_restarts=5, raw_samples=100, ) return candidate[0] else: # (N,) ei = acq(torch.Tensor(candidates).unsqueeze(dim=-2)) return torch.Tensor(candidates[ei.argmax()])
def __init__( self, datapoints: Tensor, comparisons: Tensor, covar_module: Optional[Module] = None, noise_module: Optional[HomoskedasticNoise] = None, **kwargs, ) -> None: super().__init__() r"""A probit-likelihood GP with Laplace approximation model. A probit-likelihood GP with Laplace approximation model that learns via pairwise comparison data. By default it uses a scaled-RBF kernel. Args: datapoints: A `batch_shape x n x d` tensor of training features. comparisons: A `batch_shape x m x 2` training comparisons; comparisons[i] is a noisy indicator suggesting the utility value of comparisons[i, 0]-th is greater than comparisons[i, 1]-th. covar_module: Covariance module noise_module: Noise module """ # Compatibility variables with fit_gpytorch_*: Dummy likelihood # Likelihood is tightly tied with this model and # it doesn't make much sense to keep it separate self.likelihood = None # TODO: remove these variables from `state_dict()` so that when calling # `load_state_dict()`, only the hyperparameters are copied over self.register_buffer("datapoints", None) self.register_buffer("comparisons", None) self.register_buffer("utility", None) self.register_buffer("covar_chol", None) self.register_buffer("likelihood_hess", None) self.register_buffer("hlcov_eye", None) self.register_buffer("covar", None) self.register_buffer("covar_inv", None) self.train_inputs = [] self.train_targets = None self.pred_cov_fac_need_update = True self._input_batch_shape = torch.Size() self.dim = None # will be set to match datapoints' dtype and device # since scipy.optimize.fsolve only works on cpu, it'd be the # fastest to fit the model on cpu and take samples on gpu to avoid # overhead of moving data back and forth during fitting time self.tkwargs = {} # See set_train_data for additional compatibility variables self.set_train_data(datapoints, comparisons, update_model=False) # Set optional parameters # jitter to add for numerical stability self._jitter = kwargs.get("jitter", 1e-6) # Clamping z lim for better numerical stability. See self._calc_z for detail # norm_cdf(z=3) ~= 0.999, top 0.1% percent self._zlim = kwargs.get("zlim", 3) # Stopping criteria in scipy.optimize.fsolve used to find f_map in _update() # If None, set to 1e-6 by default in _update self._xtol = kwargs.get("xtol") # The maximum number of calls to the function in scipy.optimize.fsolve # If None, set to 100 by default in _update # If zero, then 100*(N+1) is used by default by fsolve; self._maxfev = kwargs.get("maxfev") # Set hyperparameters # Do not set the batch_shape explicitly so mean_module can operate in both modes; # once fsolve used in _update can run in batch mode, we should explicitly set # the batch shape here self.mean_module = ConstantMean() # Do not optimize constant mean prior for param in self.mean_module.parameters(): param.requires_grad = False # set noise module if noise_module is None: noise_module = HomoskedasticNoise( noise_prior=SmoothedBoxPrior(-5, 5, 0.5, transform=torch.log), noise_constraint=GreaterThan(1e-4), # if None, 1e-4 by default batch_shape=self._input_batch_shape, ) self.noise_module = noise_module # set covariance module if covar_module is None: ls_prior = GammaPrior(1.2, 0.5) ls_prior_mode = (ls_prior.concentration - 1) / ls_prior.rate covar_module = RBFKernel( batch_shape=self._input_batch_shape, ard_num_dims=self.dim, lengthscale_prior=ls_prior, lengthscale_constraint=Positive(transform=None, initial_value=ls_prior_mode), ) self.covar_module = covar_module self._x0 = None # will store temporary results for warm-starting if self.datapoints is not None and self.comparisons is not None: self.to(dtype=self.datapoints.dtype, device=self.datapoints.device) self._update() # Find f_map for initial parameters self.to(self.datapoints)
def __init__( self, train_X: Tensor, train_Y: Tensor, likelihood: Optional[MultitaskGaussianLikelihood] = None, data_covar_module: Optional[Module] = None, task_covar_prior: Optional[Prior] = None, rank: Optional[int] = None, input_transform: Optional[InputTransform] = None, outcome_transform: Optional[OutcomeTransform] = None, **kwargs: Any, ) -> None: r"""Multi-task GP with Kronecker structure, using a simple ICM kernel. Args: train_X: A `batch_shape x n x d` tensor of training features. train_Y: A `batch_shape x n x m` tensor of training observations. likelihood: A `MultitaskGaussianLikelihood`. If omitted, uses a `MultitaskGaussianLikelihood` with a `GammaPrior(1.1, 0.05)` noise prior. data_covar_module: The module computing the covariance (Kernel) matrix in data space. If omitted, use a `MaternKernel`. task_covar_prior : A Prior on the task covariance matrix. Must operate on p.s.d. matrices. A common prior for this is the `LKJ` prior. If omitted, uses `LKJCovariancePrior` with `eta` parameter as specified in the keyword arguments (if not specified, use `eta=1.5`). rank: The rank of the ICM kernel. If omitted, use a full rank kernel. kwargs: Additional arguments to override default settings of priors, including: - eta: The eta parameter on the default LKJ task_covar_prior. A value of 1.0 is uninformative, values <1.0 favor stronger correlations (in magnitude), correlations vanish as eta -> inf. - sd_prior: A scalar prior over nonnegative numbers, which is used for the default LKJCovariancePrior task_covar_prior. - likelihood_rank: The rank of the task covariance matrix to fit. Defaults to 0 (which corresponds to a diagonal covariance matrix). Example: >>> train_X = torch.rand(10, 2) >>> train_Y = torch.cat([f_1(X), f_2(X)], dim=-1) >>> model = KroneckerMultiTaskGP(train_X, train_Y) """ with torch.no_grad(): transformed_X = self.transform_inputs( X=train_X, input_transform=input_transform) if outcome_transform is not None: train_Y, _ = outcome_transform(train_Y) self._validate_tensor_args(X=transformed_X, Y=train_Y) self._num_outputs = train_Y.shape[-1] batch_shape, ard_num_dims = train_X.shape[:-2], train_X.shape[-1] num_tasks = train_Y.shape[-1] if rank is None: rank = num_tasks if likelihood is None: noise_prior = GammaPrior(1.1, 0.05) noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate likelihood = MultitaskGaussianLikelihood( num_tasks=num_tasks, batch_shape=batch_shape, noise_prior=noise_prior, noise_constraint=GreaterThan( MIN_INFERRED_NOISE_LEVEL, transform=None, initial_value=noise_prior_mode, ), rank=kwargs.get("likelihood_rank", 0), ) if task_covar_prior is None: task_covar_prior = LKJCovariancePrior( n=num_tasks, eta=torch.tensor(kwargs.get("eta", 1.5)).to(train_X), sd_prior=kwargs.get( "sd_prior", SmoothedBoxPrior(math.exp(-6), math.exp(1.25), 0.05), ), ) super().__init__(train_X, train_Y, likelihood) self.mean_module = MultitaskMean( base_means=ConstantMean(batch_shape=batch_shape), num_tasks=num_tasks) if data_covar_module is None: data_covar_module = MaternKernel( nu=2.5, ard_num_dims=ard_num_dims, lengthscale_prior=GammaPrior(3.0, 6.0), batch_shape=batch_shape, ) else: data_covar_module = data_covar_module self.covar_module = MultitaskKernel( data_covar_module=data_covar_module, num_tasks=num_tasks, rank=rank, batch_shape=batch_shape, task_covar_prior=task_covar_prior, ) if outcome_transform is not None: self.outcome_transform = outcome_transform if input_transform is not None: self.input_transform = input_transform self.to(train_X)
def __init__( self, indices: List[int], transform_on_train: bool = True, transform_on_eval: bool = True, transform_on_fantasize: bool = True, reverse: bool = False, eps: float = 1e-7, concentration1_prior: Optional[Prior] = None, concentration0_prior: Optional[Prior] = None, batch_shape: Optional[torch.Size] = None, ) -> None: r"""Initialize transform. Args: indices: The indices of the inputs to warp. transform_on_train: A boolean indicating whether to apply the transforms in train() mode. Default: True. transform_on_eval: A boolean indicating whether to apply the transform in eval() mode. Default: True. transform_on_fantasize: A boolean indicating whether to apply the transform when called from within a `fantasize` call. Default: True. reverse: A boolean indicating whether the forward pass should untransform the inputs. eps: A small value used to clip values to be in the interval (0, 1). concentration1_prior: A prior distribution on the concentration1 parameter of the Kumaraswamy distribution. concentration0_prior: A prior distribution on the concentration0 parameter of the Kumaraswamy distribution. batch_shape: The batch shape. """ super().__init__() self.register_buffer("indices", torch.tensor(indices, dtype=torch.long)) self.transform_on_train = transform_on_train self.transform_on_eval = transform_on_eval self.transform_on_fantasize = transform_on_fantasize self.reverse = reverse self.batch_shape = batch_shape or torch.Size([]) self._X_min = eps self._X_range = 1 - 2 * eps if len(self.batch_shape) > 0: # Note: this follows the gpytorch shape convention for lengthscales # There is ongoing discussion about the extra `1`. # TODO: update to follow new gpytorch convention resulting from # https://github.com/cornellius-gp/gpytorch/issues/1317 batch_shape = self.batch_shape + torch.Size([1]) else: batch_shape = self.batch_shape for i in (0, 1): p_name = f"concentration{i}" self.register_parameter( p_name, nn.Parameter(torch.full(batch_shape + self.indices.shape, 1.0)), ) if concentration0_prior is not None: self.register_prior( "concentration0_prior", concentration0_prior, lambda m: m.concentration0, lambda m, v: m._set_concentration(i=0, value=v), ) if concentration1_prior is not None: self.register_prior( "concentration1_prior", concentration1_prior, lambda m: m.concentration1, lambda m, v: m._set_concentration(i=1, value=v), ) for i in (0, 1): p_name = f"concentration{i}" constraint = GreaterThan( self._min_concentration_level, transform=None, # set the initial value to be the identity transformation initial_value=1.0, ) self.register_constraint(param_name=p_name, constraint=constraint)
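The prior-setting closures above call `m._set_concentration(i, value)`, which is not shown here; a minimal sketch of how such a setter is typically implemented for this transform (an assumption, modeled on the usual raw-parameter initialization pattern):

    def _set_concentration(self, i: int, value) -> None:
        if not torch.is_tensor(value):
            value = torch.as_tensor(value).to(self.concentration0)
        self.initialize(**{f"concentration{i}": value})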
def step(self, snapshot_mode: str = 'latest', meta_info: dict = None): # Save snapshot to save the correct iteration count self.save_snapshot() if self.curr_checkpoint == -2: # Train the initial policies in the source domain self.train_init_policies() self.reached_checkpoint() # setting counter to -1 if self.curr_checkpoint == -1: # Evaluate the initial policies in the target domain self.eval_init_policies() self.reached_checkpoint() # setting counter to 0 if self.curr_checkpoint == 0: # Normalize the input data and standardize the output data cands_norm = self.ddp_projector.project_to(self.cands) cands_values_stdized = standardize(self.cands_values).unsqueeze(1) # Create and fit the GP model gp = SingleTaskGP(cands_norm, cands_values_stdized) gp.likelihood.noise_covar.register_constraint('raw_noise', GreaterThan(1e-5)) mll = ExactMarginalLogLikelihood(gp.likelihood, gp) fit_gpytorch_model(mll) print_cbt('Fitted the GP.', 'g') # Acquisition functions if self.acq_fcn_type == 'UCB': acq_fcn = UpperConfidenceBound(gp, beta=self.acq_param.get('beta', 0.1), maximize=True) elif self.acq_fcn_type == 'EI': acq_fcn = ExpectedImprovement(gp, best_f=cands_values_stdized.max().item(), maximize=True) elif self.acq_fcn_type == 'PI': acq_fcn = ProbabilityOfImprovement(gp, best_f=cands_values_stdized.max().item(), maximize=True) else: raise pyrado.ValueErr(given=self.acq_fcn_type, eq_constraint="'UCB', 'EI', 'PI'") # Optimize acquisition function and get new candidate point cand_norm, acq_value = optimize_acqf( acq_function=acq_fcn, bounds=to.stack([to.zeros(self.ddp_space.flat_dim), to.ones(self.ddp_space.flat_dim)]), q=1, num_restarts=self.acq_restarts, raw_samples=self.acq_samples ) next_cand = self.ddp_projector.project_back(cand_norm) print_cbt(f'Found the next candidate: {next_cand.numpy()}', 'g') self.cands = to.cat([self.cands, next_cand], dim=0) pyrado.save(self.cands, 'candidates', 'pt', self.save_dir, meta_info) self.reached_checkpoint() # setting counter to 1 if self.curr_checkpoint == 1: # Train and evaluate a new policy, repeat if the resulting policy did not exceed the success threshold wrapped_trn_fcn = until_thold_exceeded( self.thold_succ_subrtn.item(), self.max_subrtn_rep )(self.train_policy_sim) wrapped_trn_fcn(self.cands[-1, :], prefix=f'iter_{self._curr_iter}') self.reached_checkpoint() # setting counter to 2 if self.curr_checkpoint == 2: # Evaluate the current policy in the target domain policy = pyrado.load(self.policy, 'policy', 'pt', self.save_dir, meta_info=dict(prefix=f'iter_{self._curr_iter}')) self.curr_cand_value = self.eval_policy( self.save_dir, self._env_real, policy, self.mc_estimator, f'iter_{self._curr_iter}', self.num_eval_rollouts_real ) self.cands_values = to.cat([self.cands_values, self.curr_cand_value.view(1)], dim=0) pyrado.save(self.cands_values, 'candidates_values', 'pt', self.save_dir, meta_info) # Store the argmax after training and evaluating curr_argmax_cand = BayRn.argmax_posterior_mean( self.cands, self.cands_values.unsqueeze(1), self.ddp_space, self.acq_restarts, self.acq_samples ) self.argmax_cand = to.cat([self.argmax_cand, curr_argmax_cand], dim=0) pyrado.save(self.argmax_cand, 'candidates_argmax', 'pt', self.save_dir, meta_info) self.reached_checkpoint() # setting counter to 0
def __init__( self, train_X: Tensor, train_Y: Tensor, cat_dims: List[int], cont_kernel_factory: Optional[Callable[[int, List[int]], Kernel]] = None, likelihood: Optional[Likelihood] = None, outcome_transform: Optional[OutcomeTransform] = None, # TODO input_transform: Optional[InputTransform] = None, # TODO ) -> None: r"""A single-task exact GP model supporting categorical parameters. Args: train_X: A `batch_shape x n x d` tensor of training features. train_Y: A `batch_shape x n x m` tensor of training observations. cat_dims: A list of indices corresponding to the columns of the input `X` that should be considered categorical features. cont_kernel_factory: A method that accepts `batch_shape`, `ard_num_dims`, and `active_dims` arguments and returns an instantiated GPyTorch `Kernel` object to be used as the base kernel for the continuous dimensions. If omitted, this model uses a Matern-2.5 kernel as the kernel for the ordinal parameters. likelihood: A likelihood. If omitted, use a standard GaussianLikelihood with inferred noise level. # outcome_transform: An outcome transform that is applied to the # training data during instantiation and to the posterior during # inference (that is, the `Posterior` obtained by calling # `.posterior` on the model will be on the original scale). # input_transform: An input transform that is applied in the model's # forward pass. Example: >>> train_X = torch.cat( [torch.rand(20, 2), torch.randint(3, (20, 1))], dim=-1 ) >>> train_Y = ( torch.sin(train_X[..., :-1]).sum(dim=1, keepdim=True) + train_X[..., -1:] ) >>> model = MixedSingleTaskGP(train_X, train_Y, cat_dims=[-1]) """ if outcome_transform is not None: raise UnsupportedError("outcome transforms not yet supported") if input_transform is not None: raise UnsupportedError("input transforms not yet supported") if len(cat_dims) == 0: raise ValueError( "Must specify categorical dimensions for MixedSingleTaskGP" ) input_batch_shape, aug_batch_shape = self.get_batch_dimensions( train_X=train_X, train_Y=train_Y ) if cont_kernel_factory is None: def cont_kernel_factory( batch_shape: torch.Size, ard_num_dims: int, active_dims: List[int] ) -> MaternKernel: return MaternKernel( nu=2.5, batch_shape=batch_shape, ard_num_dims=ard_num_dims, active_dims=active_dims, ) if likelihood is None: # This Gamma prior is quite close to the Horseshoe prior min_noise = 1e-5 if train_X.dtype == torch.float else 1e-6 likelihood = GaussianLikelihood( batch_shape=aug_batch_shape, noise_constraint=GreaterThan( min_noise, transform=None, initial_value=1e-3 ), noise_prior=GammaPrior(0.9, 10.0), ) d = train_X.shape[-1] cat_dims = normalize_indices(indices=cat_dims, d=d) ord_dims = sorted(set(range(d)) - set(cat_dims)) if len(ord_dims) == 0: covar_module = ScaleKernel( CategoricalKernel( batch_shape=aug_batch_shape, ard_num_dims=len(cat_dims), ) ) else: sum_kernel = ScaleKernel( cont_kernel_factory( batch_shape=aug_batch_shape, ard_num_dims=len(ord_dims), active_dims=ord_dims, ) + ScaleKernel( CategoricalKernel( batch_shape=aug_batch_shape, ard_num_dims=len(cat_dims), active_dims=cat_dims, ) ) ) prod_kernel = ScaleKernel( cont_kernel_factory( batch_shape=aug_batch_shape, ard_num_dims=len(ord_dims), active_dims=ord_dims, ) * CategoricalKernel( batch_shape=aug_batch_shape, ard_num_dims=len(cat_dims), active_dims=cat_dims, ) ) covar_module = sum_kernel + prod_kernel super().__init__( train_X=train_X, train_Y=train_Y, likelihood=likelihood, covar_module=covar_module, outcome_transform=outcome_transform, input_transform=input_transform, )
def __init__( self, train_X: Tensor, train_Y: Tensor, likelihood: Optional[Likelihood] = None, covar_modules: Optional[List[Kernel]] = None, num_latent_dims: Optional[List[int]] = None, learn_latent_pars: bool = True, latent_init: str = "default", outcome_transform: Optional[OutcomeTransform] = None, input_transform: Optional[InputTransform] = None, ): r"""A HigherOrderGP model for high-dim output regression. Args: train_X: A `batch_shape x n x d`-dim tensor of training inputs. train_Y: A `batch_shape x n x output_shape`-dim tensor of training targets. likelihood: Gaussian likelihood for the model. covar_modules: List of kernels for each output structure. num_latent_dims: Sizes for the latent dimensions. learn_latent_pars: If true, learn the latent parameters. latent_init: [default or gp] how to initialize the latent parameters. """ if input_transform is not None: input_transform.to(train_X) # infer the dimension of `output_shape`. num_output_dims = train_Y.dim() - train_X.dim() + 1 batch_shape = train_X.shape[:-2] if len(batch_shape) > 1: raise NotImplementedError( "HigherOrderGP currently only supports 1-dim `batch_shape`." ) if outcome_transform is not None: if isinstance(outcome_transform, Standardize) and not isinstance( outcome_transform, FlattenedStandardize ): warnings.warn( "HigherOrderGP does not support the outcome_transform " "`Standardize`! Using `FlattenedStandardize` with `output_shape=" f"{train_Y.shape[- num_output_dims:]} and batch_shape=" f"{batch_shape} instead.", RuntimeWarning, ) outcome_transform = FlattenedStandardize( output_shape=train_Y.shape[-num_output_dims:], batch_shape=batch_shape, ) train_Y, _ = outcome_transform(train_Y) self._aug_batch_shape = batch_shape self._num_dimensions = num_output_dims + 1 self._num_outputs = train_Y.shape[0] if batch_shape else 1 self.target_shape = train_Y.shape[-num_output_dims:] self._input_batch_shape = batch_shape if likelihood is None: noise_prior = GammaPrior(1.1, 0.05) noise_prior_mode = (noise_prior.concentration - 1) / noise_prior.rate likelihood = GaussianLikelihood( noise_prior=noise_prior, batch_shape=self._aug_batch_shape, noise_constraint=GreaterThan( MIN_INFERRED_NOISE_LEVEL, transform=None, initial_value=noise_prior_mode, ), ) else: self._is_custom_likelihood = True super().__init__( train_X, train_Y.view(*self._aug_batch_shape, -1), likelihood=likelihood, ) if covar_modules is not None: self.covar_modules = ModuleList(covar_modules) else: self.covar_modules = ModuleList( [ MaternKernel( nu=2.5, lengthscale_prior=GammaPrior(3.0, 6.0), batch_shape=self._aug_batch_shape, ard_num_dims=1 if dim > 0 else train_X.shape[-1], ) for dim in range(self._num_dimensions) ] ) if num_latent_dims is None: num_latent_dims = [1] * (self._num_dimensions - 1) self.to(train_X.device) self._initialize_latents( latent_init=latent_init, num_latent_dims=num_latent_dims, learn_latent_pars=learn_latent_pars, device=train_Y.device, dtype=train_Y.dtype, ) if outcome_transform is not None: self.outcome_transform = outcome_transform if input_transform is not None: self.input_transform = input_transform
def step(self, snapshot_mode: str, meta_info: dict = None): if not self.initialized: # Start initialization phase self.train_init_policies() self.eval_init_policies() self.initialized = True # Normalize the input data and standardize the output data cands_norm = self.uc_normalizer.project_to(self.cands) cands_values_stdized = standardize(self.cands_values).unsqueeze(1) # Create and fit the GP model gp = SingleTaskGP(cands_norm, cands_values_stdized) gp.likelihood.noise_covar.register_constraint('raw_noise', GreaterThan(1e-5)) mll = ExactMarginalLogLikelihood(gp.likelihood, gp) fit_gpytorch_model(mll) print_cbt('Fitted the GP.', 'g') # Acquisition functions if self.acq_fcn_type == 'UCB': acq_fcn = UpperConfidenceBound(gp, beta=self.acq_param.get( 'beta', 0.1), maximize=True) elif self.acq_fcn_type == 'EI': acq_fcn = ExpectedImprovement( gp, best_f=cands_values_stdized.max().item(), maximize=True) elif self.acq_fcn_type == 'PI': acq_fcn = ProbabilityOfImprovement( gp, best_f=cands_values_stdized.max().item(), maximize=True) else: raise pyrado.ValueErr(given=self.acq_fcn_type, eq_constraint="'UCB', 'EI', 'PI'") # Optimize acquisition function and get new candidate point cand, acq_value = optimize_acqf( acq_function=acq_fcn, bounds=to.stack([to.zeros(self.cand_dim), to.ones(self.cand_dim)]), q=1, num_restarts=self.acq_restarts, raw_samples=self.acq_samples) next_cand = self.uc_normalizer.project_back(cand) print_cbt(f'Found the next candidate: {next_cand.numpy()}', 'g') self.cands = to.cat([self.cands, next_cand], dim=0) to.save(self.cands, osp.join(self._save_dir, 'candidates.pt')) # Train and valuate the new candidate (saves to iter_{self._curr_iter}_policy.pt) prefix = f'iter_{self._curr_iter}' wrapped_trn_fcn = until_thold_exceeded( self.thold_succ_subroutine.item(), max_iter=self.max_subroutine_rep)(self.train_policy_sim) wrapped_trn_fcn(cand, prefix) # Evaluate the current policy on the target domain policy = to.load(osp.join(self._save_dir, f'{prefix}_policy.pt')) self.curr_cand_value = self.eval_policy(self._save_dir, self._env_real, policy, self.montecarlo_estimator, prefix, self.num_eval_rollouts_real) self.cands_values = to.cat( [self.cands_values, self.curr_cand_value.view(1)], dim=0) to.save(self.cands_values, osp.join(self._save_dir, 'candidates_values.pt')) # Store the argmax after training and evaluating curr_argmax_cand = BayRn.argmax_posterior_mean( self.cands, self.cands_values.unsqueeze(1), self.uc_normalizer, self.acq_restarts, self.acq_samples) self.argmax_cand = to.cat([self.argmax_cand, curr_argmax_cand], dim=0) to.save(self.argmax_cand, osp.join(self._save_dir, 'candidates_argmax.pt')) self.make_snapshot(snapshot_mode, float(to.mean(self.cands_values)), meta_info)
def __init__(self, test_data, args): # data buffer, only store training data, test_data will only be stored in GP model before the model is trained self.n = 0 self.data = None self.index_list = [] self.previous_loss = CUDA(torch.tensor(np.inf)) self.trigger_training = CUDA(torch.tensor(1e-4)) self.lr = args.lr self.state_dim = args.state_dim self.action_dim = args.action_dim self.input_dim = self.state_dim + self.action_dim self.gp_iter = args.gp_iter self.normalize_trigger = 1 self.eps = CUDA(torch.tensor(1e-10)) self.mu_x = CUDA(torch.zeros((self.input_dim))) self.sigma_x = CUDA(torch.ones((self.input_dim))) #self.sigma_x[9:12] = CUDA(torch.tensor(10.0)) #self.sigma_x[12:18] = CUDA(torch.tensor(10.0)) self.mu_y = CUDA(torch.zeros((self.state_dim))) self.sigma_y = CUDA(torch.ones((self.state_dim))) #self.sigma_y[9:12] = CUDA(torch.tensor(10.0)) #self.sigma_y[12:18] = CUDA(torch.tensor(10.0)) # parameters for inducing GP self.max_inducing_point = args.max_inducing_point self.trigger_induce = args.trigger_induce self.sample_number = args.sample_number # prior of the kernel parameters # [NOTE] these prior parameters should be similar to the estimated parameters of real data # if lengthscale is too large, it will be too difficult to create new components # if lengthscale is too small, it will be too easy to create new components # if noise_covar is too large, the prediction will be inaccurate # if noise_covar is too small, the covariance will be very small, causing some numerical problems self.param = CUDA(torch.tensor(args.param)) # initialize model and likelihood model_list = [] likelihood_list = [] for m_i in range(self.state_dim): likelihood = CUDA( gpytorch.likelihoods.GaussianLikelihood( noise_constraint=GreaterThan(self.param[1]))) model = CUDA( SampleGPR(None, None, likelihood, self.input_dim, self.param)) model.reset_parameters() likelihood_list.append(model.likelihood) model_list.append(model) # initialize model list self.model = gpytorch.models.IndependentModelList(*model_list) self.likelihood = gpytorch.likelihoods.LikelihoodList(*likelihood_list) # initialize optimizer self.optimizer = torch.optim.Adam([{ 'params': self.model.parameters() }], lr=self.lr) self.mll = gpytorch.mlls.SumMarginalLogLikelihood( self.likelihood, self.model) # change the flag self.model.eval() self.likelihood.eval()