def fit_gpytorch_torch(
    mll: MarginalLogLikelihood,
    bounds: Optional[ParameterBounds] = None,
    optimizer_cls: Optimizer = Adam,
    options: Optional[Dict[str, Any]] = None,
    track_iterations: bool = True,
    approx_mll: bool = True,
) -> Tuple[
    MarginalLogLikelihood, Dict[str, Union[float, List[OptimizationIteration]]]
]:
    r"""Fit a gpytorch model by maximizing MLL with a torch optimizer.

    The model and likelihood in mll must already be in train mode.
    Note: this method requires that the model has `train_inputs` and
    `train_targets`.

    Args:
        mll: MarginalLogLikelihood to be maximized.
        bounds: A ParameterBounds dictionary mapping parameter names to tuples
            of lower and upper bounds. Bounds specified here take precedence
            over bounds on the same parameters specified in the constraints
            registered with the module.
        optimizer_cls: Torch optimizer to use. Must not require a closure.
        options: Options for model fitting. Relevant options will be passed to
            the `optimizer_cls`. Additionally, options can include: "disp" to
            specify whether to display model fitting diagnostics and "maxiter"
            to specify the maximum number of iterations.
        track_iterations: Track the function values and wall time for each
            iteration.
        approx_mll: If True, use gpytorch's approximate MLL computation
            (according to the gpytorch defaults based on the training data
            size). Unlike for the deterministic algorithms used in
            `fit_gpytorch_scipy`, the resulting stochasticity is not an issue
            for stochastic optimizers.

    Returns:
        2-element tuple containing
        - mll with parameters optimized in-place.
        - Dictionary with the following key/values:
            "fopt": Best mll value.
            "wall_time": Wall time of fitting.
            "iterations": List of OptimizationIteration objects with information
                on each iteration. If track_iterations is False, will be empty.

    Example:
        >>> gp = SingleTaskGP(train_X, train_Y)
        >>> mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
        >>> mll.train()
        >>> fit_gpytorch_torch(mll)
        >>> mll.eval()
    """
    optim_options = {"maxiter": 100, "disp": True, "lr": 0.05}
    optim_options.update(options or {})
    exclude = optim_options.pop("exclude", None)
    if exclude is not None:
        mll_params = [
            t for p_name, t in mll.named_parameters() if p_name not in exclude
        ]
    else:
        mll_params = list(mll.parameters())
    optimizer = optimizer_cls(
        params=[{"params": mll_params}],
        **_filter_kwargs(optimizer_cls, **optim_options),
    )

    # Get bounds specified in the model (if any)
    bounds_: ParameterBounds = {}
    if hasattr(mll, "named_parameters_and_constraints"):
        for param_name, _, constraint in mll.named_parameters_and_constraints():
            if constraint is not None and not constraint.enforced:
                bounds_[param_name] = constraint.lower_bound, constraint.upper_bound
    # Update with user-supplied bounds (overwrites if a bound already exists)
    if bounds is not None:
        bounds_.update(bounds)

    iterations = []
    t1 = time.time()
    param_trajectory: Dict[str, List[Tensor]] = {
        name: [] for name, param in mll.named_parameters()
    }
    loss_trajectory: List[float] = []
    i = 0
    stop = False
    stopping_criterion = ExpMAStoppingCriterion(
        **_filter_kwargs(ExpMAStoppingCriterion, **optim_options)
    )
    train_inputs, train_targets = mll.model.train_inputs, mll.model.train_targets
    while not stop:
        optimizer.zero_grad()
        with gpt_settings.fast_computations(log_prob=approx_mll):
            output = mll.model(*train_inputs)
            # we sum here to support batch mode
            args = [output, train_targets] + _get_extra_mll_args(mll)
            loss = -mll(*args).sum()
            loss.backward()
        loss_trajectory.append(loss.item())
        for name, param in mll.named_parameters():
            param_trajectory[name].append(param.detach().clone())
        if optim_options["disp"] and (
            (i + 1) % 10 == 0 or i == (optim_options["maxiter"] - 1)
        ):
            print(f"Iter {i + 1}/{optim_options['maxiter']}: {loss.item()}")
        if track_iterations:
            iterations.append(OptimizationIteration(i, loss.item(), time.time() - t1))
        optimizer.step()
        # Project onto bounds
        if bounds_:
            for pname, param in mll.named_parameters():
                if pname in bounds_:
                    param.data = param.data.clamp(*bounds_[pname])
        i += 1
        stop = stopping_criterion.evaluate(fvals=loss.detach())
    info_dict = {
        "fopt": loss_trajectory[-1],
        "wall_time": time.time() - t1,
        "iterations": iterations,
    }
    return mll, info_dict
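# --- Usage sketch (illustrative, not part of the library) --------------------
# A minimal sketch of calling fit_gpytorch_torch with custom optimizer options
# and a bound on a parameter. The SingleTaskGP setup, the parameter name, and
# the option values below are assumptions chosen for the example, not
# requirements of the function.
def _example_fit_gpytorch_torch(train_X, train_Y):
    from botorch.models import SingleTaskGP
    from gpytorch.mlls import ExactMarginalLogLikelihood

    gp = SingleTaskGP(train_X, train_Y)
    mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
    mll.train()
    # "maxiter", "disp" and "lr" are handled by fit_gpytorch_torch; bounds map
    # parameter names to (lower, upper) and are enforced by clamping.
    mll, info = fit_gpytorch_torch(
        mll,
        bounds={"likelihood.noise_covar.raw_noise": (-5.0, 5.0)},
        options={"maxiter": 250, "lr": 0.01, "disp": False},
    )
    mll.eval()
    # info contains "fopt", "wall_time" and the per-iteration trace.
    return mll, info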
def fit_gpytorch_scipy(
    mll: MarginalLogLikelihood,
    bounds: Optional[ParameterBounds] = None,
    method: str = "L-BFGS-B",
    options: Optional[Dict[str, Any]] = None,
    track_iterations: bool = True,
    approx_mll: bool = False,
    scipy_objective: TScipyObjective = _scipy_objective_and_grad,
    module_to_array_func: TModToArray = module_to_array,
    module_from_array_func: TArrayToMod = set_params_with_array,
) -> Tuple[
    MarginalLogLikelihood, Dict[str, Union[float, List[OptimizationIteration]]]
]:
    r"""Fit a gpytorch model by maximizing MLL with a scipy optimizer.

    The model and likelihood in mll must already be in train mode.
    This method requires that the model has `train_inputs` and `train_targets`.

    Args:
        mll: MarginalLogLikelihood to be maximized.
        bounds: A dictionary mapping parameter names to tuples of lower and
            upper bounds.
        method: Solver type, passed along to scipy.minimize.
        options: Dictionary of solver options, passed along to scipy.minimize.
        track_iterations: Track the function values and wall time for each
            iteration.
        approx_mll: If True, use gpytorch's approximate MLL computation. This is
            disabled by default since the stochasticity is an issue for
            deterministic optimizers. Enabling this is only recommended when
            working with large training data sets (n > 2000).

    Returns:
        2-element tuple containing
        - MarginalLogLikelihood with parameters optimized in-place.
        - Dictionary with the following key/values:
            "fopt": Best mll value.
            "wall_time": Wall time of fitting.
            "iterations": List of OptimizationIteration objects with information
                on each iteration. If track_iterations is False, will be empty.
            "OptimizeResult": The result returned by `scipy.optimize.minimize`.

    Example:
        >>> gp = SingleTaskGP(train_X, train_Y)
        >>> mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
        >>> mll.train()
        >>> fit_gpytorch_scipy(mll)
        >>> mll.eval()
    """
    options = options or {}
    x0, property_dict, bounds = module_to_array_func(
        module=mll, bounds=bounds, exclude=options.pop("exclude", None)
    )
    x0 = x0.astype(np.float64)
    if bounds is not None:
        bounds = Bounds(lb=bounds[0], ub=bounds[1], keep_feasible=True)

    xs = []
    ts = []
    t1 = time.time()

    def store_iteration(xk):
        xs.append(xk.copy())
        ts.append(time.time() - t1)

    cb = store_iteration if track_iterations else None

    with gpt_settings.fast_computations(log_prob=approx_mll):
        res = minimize(
            scipy_objective,
            x0,
            args=(mll, property_dict),
            bounds=bounds,
            method=method,
            jac=True,
            options=options,
            callback=cb,
        )

    iterations = []
    if track_iterations:
        for i, xk in enumerate(xs):
            obj, _ = scipy_objective(x=xk, mll=mll, property_dict=property_dict)
            iterations.append(OptimizationIteration(i, obj, ts[i]))

    # Construct info dict
    info_dict = {
        "fopt": float(res.fun),
        "wall_time": time.time() - t1,
        "iterations": iterations,
        "OptimizeResult": res,
    }
    if not res.success:
        try:
            # Some res.message are bytes
            msg = res.message.decode("ascii")
        except AttributeError:
            # Others are str
            msg = res.message
        warnings.warn(
            f"Fitting failed with the optimizer reporting '{msg}'",
            OptimizationWarning,
        )
    # Set to optimum
    mll = module_from_array_func(mll, res.x, property_dict)
    return mll, info_dict
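# --- Usage sketch (illustrative, not part of the library) --------------------
# A minimal sketch of fit_gpytorch_scipy with an iteration budget and a
# parameter excluded from optimization via options["exclude"]. The model setup
# and the excluded parameter name are assumptions chosen for the example.
def _example_fit_gpytorch_scipy(train_X, train_Y):
    from botorch.models import SingleTaskGP
    from gpytorch.mlls import ExactMarginalLogLikelihood

    gp = SingleTaskGP(train_X, train_Y)
    mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
    mll.train()
    # "exclude" is popped and forwarded to module_to_array_func; the remaining
    # options are passed to scipy.optimize.minimize.
    mll, info = fit_gpytorch_scipy(
        mll,
        options={"maxiter": 100, "exclude": ["likelihood.noise_covar.raw_noise"]},
    )
    mll.eval()
    # info["OptimizeResult"] is the raw scipy result; info["iterations"] holds
    # per-iteration objective values when track_iterations=True.
    return mll, info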
def test_GPyTorchPosterior(self):
    for dtype in (torch.float, torch.double):
        n = 3
        mean = torch.rand(n, dtype=dtype, device=self.device)
        variance = 1 + torch.rand(n, dtype=dtype, device=self.device)
        covar = variance.diag()
        mvn = MultivariateNormal(mean, lazify(covar))
        posterior = GPyTorchPosterior(mvn=mvn)
        # basics
        self.assertEqual(posterior.device.type, self.device.type)
        self.assertTrue(posterior.dtype == dtype)
        self.assertEqual(posterior.event_shape, torch.Size([n, 1]))
        self.assertTrue(torch.equal(posterior.mean, mean.unsqueeze(-1)))
        self.assertTrue(torch.equal(posterior.variance, variance.unsqueeze(-1)))
        # rsample
        samples = posterior.rsample()
        self.assertEqual(samples.shape, torch.Size([1, n, 1]))
        for sample_shape in ([4], [4, 2]):
            samples = posterior.rsample(sample_shape=torch.Size(sample_shape))
            self.assertEqual(samples.shape, torch.Size(sample_shape + [n, 1]))
        # check enabling of approximate root decomposition
        with ExitStack() as es:
            mock_func = es.enter_context(
                mock.patch(
                    ROOT_DECOMP_PATH, return_value=torch.linalg.cholesky(covar)
                )
            )
            es.enter_context(gpt_settings.max_cholesky_size(0))
            es.enter_context(
                gpt_settings.fast_computations(covar_root_decomposition=True)
            )
            # need to clear cache, cannot re-use previous objects
            mvn = MultivariateNormal(mean, lazify(covar))
            posterior = GPyTorchPosterior(mvn=mvn)
            posterior.rsample(sample_shape=torch.Size([4]))
            mock_func.assert_called_once()
        # rsample w/ base samples
        base_samples = torch.randn(4, 3, 1, device=self.device, dtype=dtype)
        # incompatible shapes
        with self.assertRaises(RuntimeError):
            posterior.rsample(
                sample_shape=torch.Size([3]), base_samples=base_samples
            )
        # ensure consistent result
        for sample_shape in ([4], [4, 2]):
            base_samples = torch.randn(
                *sample_shape, 3, 1, device=self.device, dtype=dtype
            )
            samples = [
                posterior.rsample(
                    sample_shape=torch.Size(sample_shape), base_samples=base_samples
                )
                for _ in range(2)
            ]
            self.assertTrue(torch.allclose(*samples))
        # collapse_batch_dims
        b_mean = torch.rand(2, 3, dtype=dtype, device=self.device)
        b_variance = 1 + torch.rand(2, 3, dtype=dtype, device=self.device)
        b_covar = torch.diag_embed(b_variance)
        b_mvn = MultivariateNormal(b_mean, lazify(b_covar))
        b_posterior = GPyTorchPosterior(mvn=b_mvn)
        b_base_samples = torch.randn(4, 1, 3, 1, device=self.device, dtype=dtype)
        b_samples = b_posterior.rsample(
            sample_shape=torch.Size([4]), base_samples=b_base_samples
        )
        self.assertEqual(b_samples.shape, torch.Size([4, 2, 3, 1]))
def benchmark_on_n_pts(n_pts, create_model_func, target_func, ho_x, ho_y, fit=True,
                       repeats=3, max_iter=1000, return_model=False, verbose=0,
                       checkpoint=True, print_freq=1, use_chol=False, **kwargs):
    dims = ho_x.shape[1]
    # if n_pts > 20:
    #     ho_x = ho_x.to(device)
    #     ho_y = ho_y.to(device)
    rep_mses = []
    models = []
    mlls = []
    for i in range(repeats):
        # Don't edit the master copies of the hold-out dataset
        test_ho_x = torch.empty_like(ho_x).copy_(ho_x)
        test_ho_y = torch.empty_like(ho_y).copy_(ho_y)
        # test_ho_x = ho_x.copy_()
        # test_ho_y = ho_y.copy_()

        # Create the training data
        data = torch.rand(n_pts, dims) * 4 - 2
        y = target_func(data) + torch.randn(n_pts) * 0.01

        # Normalize by the TEST set statistics in this case for all methods,
        # for a more accurate comparison
        m = ho_x.mean(dim=0)
        s = ho_x.std(dim=0)
        data = (data - m) / s
        test_ho_x = (test_ho_x - m) / s
        # Do the same for the Ys
        m = ho_y.mean()
        s = ho_y.std()
        y = (y - m) / s
        test_ho_y = (test_ho_y - m) / s

        # Create the model now
        model = create_model_func(data, y, **kwargs)

        # Put things on the GPU if necessary
        if n_pts > 20:
            test_ho_x = test_ho_x.to(device)
            test_ho_y = test_ho_y.to(device)
            model = model.to(device)
            data = data.to(device)
            y = y.to(device)

        fast = not use_chol
        with gp_set.fast_computations(fast, fast, fast), \
                gp_set.max_cg_iterations(10_000):
            with gp_set.cg_tolerance(0.001), gp_set.eval_cg_tolerance(0.0005), \
                    gp_set.memory_efficient(True):
                if fit:
                    mll = ExactMarginalLogLikelihood(model.likelihood, model)
                    train_to_convergence(
                        model, data, y, torch.optim.Adam, objective=mll,
                        checkpoint=checkpoint, max_iter=max_iter,
                        print_freq=print_freq, verbose=verbose,
                    )

                model.eval()
                with torch.no_grad():
                    mse = mean_squared_error(model(test_ho_x).mean, test_ho_y)
                print(i, mse)
                rep_mses.append(mse)

        if return_model:
            models.append(model)
            mlls.append(mll)
        else:
            del mll
            del model
            del data
            del y
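# --- Usage sketch (illustrative assumptions) ----------------------------------
# A minimal sketch of calling benchmark_on_n_pts. The exact GP model factory and
# the target function below are assumptions chosen for the example; the function
# also relies on module-level helpers (device, gp_set, train_to_convergence,
# mean_squared_error) defined elsewhere in this file. n_pts is kept at 20 so the
# GPU branch (n_pts > 20) is not taken.
def _example_benchmark():
    import torch
    import gpytorch

    def create_model(train_x, train_y):
        class SimpleExactGP(gpytorch.models.ExactGP):
            def __init__(self, x, y):
                likelihood = gpytorch.likelihoods.GaussianLikelihood()
                super().__init__(x, y, likelihood)
                self.mean_module = gpytorch.means.ConstantMean()
                self.covar_module = gpytorch.kernels.ScaleKernel(
                    gpytorch.kernels.RBFKernel()
                )

            def forward(self, x):
                return gpytorch.distributions.MultivariateNormal(
                    self.mean_module(x), self.covar_module(x)
                )

        return SimpleExactGP(train_x, train_y)

    def target(x):
        return torch.sin(x).sum(dim=-1)

    # Hold-out set in the same [-2, 2] box used for the training data
    ho_x = torch.rand(500, 2) * 4 - 2
    ho_y = target(ho_x)
    benchmark_on_n_pts(
        20, create_model, target, ho_x, ho_y, repeats=2, max_iter=200, verbose=0
    )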
def fit_gpytorch_manifold(
    mll: MarginalLogLikelihood,
    bounds: Optional[ParameterBounds] = None,
    solver: Solver = pyman_solvers.ConjugateGradient(maxiter=500),
    nb_init_candidates: int = 200,
    last_x_as_candidate_prob: float = 0.9,
    options: Optional[Dict[str, Any]] = None,
    track_iterations: bool = True,
    approx_mll: bool = False,
    module_to_array_func: TModToArray = module_to_list_of_array,
    module_from_array_func: TArrayToMod = set_params_with_list_of_array,
) -> Tuple[
    MarginalLogLikelihood, Dict[str, Union[float, List[OptimizationIteration]]]
]:
    r"""Fit a gpytorch model by maximizing MLL with a pymanopt optimizer.

    The model and likelihood in mll must already be in train mode.
    This method requires that the model has `train_inputs` and `train_targets`.

    Args:
        mll: MarginalLogLikelihood to be maximized.
        bounds: A dictionary mapping parameter names to tuples of lower and
            upper bounds.
        solver: Pymanopt solver.
        nb_init_candidates: Number of random initial candidates for the GP
            parameters.
        last_x_as_candidate_prob: Probability that the last set of parameters
            is among the initial candidates.
        options: Dictionary of solver options. The "exclude" entry can be used
            to exclude parameters from optimization.
        track_iterations: Track the function values and wall time for each
            iteration.
        approx_mll: If True, use gpytorch's approximate MLL computation. This is
            disabled by default since the stochasticity is an issue for
            deterministic optimizers. Enabling this is only recommended when
            working with large training data sets (n > 2000).

    Returns:
        2-element tuple containing
        - MarginalLogLikelihood with parameters optimized in-place.
        - Dictionary with the following key/values:
            "fopt": Best mll value.
            "wall_time": Wall time of fitting.
            "iterations": List of OptimizationIteration objects with information
                on each iteration. If track_iterations is False, will be empty.

    Example:
        >>> gp = SingleTaskGP(train_X, train_Y)
        >>> mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
        >>> mll.train()
        >>> fit_gpytorch_manifold(mll)
        >>> mll.eval()
    """
    options = options or {}
    # Current parameters
    x0, property_dict, bounds = module_to_array_func(
        module=mll, bounds=bounds, exclude=options.pop("exclude", None)
    )
    x0 = [x0i.astype(np.float64) for x0i in x0]
    if bounds is not None:
        warnings.warn("Bounds handling not supported yet in fit_gpytorch_manifold")
        # bounds = Bounds(lb=bounds[0], ub=bounds[1], keep_feasible=True)

    t1 = time.time()

    # Define the cost function
    def cost(x):
        param_dict = OrderedDict(mll.named_parameters())
        idx = 0
        for p_name, attrs in property_dict.items():
            # Construct the new tensor
            if len(attrs.shape) == 0:  # deal with scalar tensors
                new_data = torch.tensor(
                    x[idx][0], dtype=attrs.dtype, device=attrs.device
                )
            else:
                new_data = torch.tensor(
                    x[idx], dtype=attrs.dtype, device=attrs.device
                ).view(*attrs.shape)
            param_dict[p_name].data = new_data
            idx += 1
        train_inputs, train_targets = mll.model.train_inputs, mll.model.train_targets
        mll.zero_grad()
        output = mll.model(*train_inputs)
        args = [output, train_targets] + _get_extra_mll_args(mll)
        loss = -mll(*args).sum()
        return loss

    # Define the Euclidean gradient of the cost function
    def egrad(x):
        loss = cost(x)
        loss.backward()
        param_dict = OrderedDict(mll.named_parameters())
        grad = []
        for p_name in property_dict:
            t = param_dict[p_name].grad
            if t is None:
                # This deals with parameters that do not affect the loss;
                # np.zeros already matches the parameter shape (matrix or vector).
                grad.append(np.zeros(property_dict[p_name].shape))
            else:
                if t.ndim > 1 and t.shape[0] > 1:
                    # If the variable is a matrix, keep its shape
                    grad.append(t.detach().cpu().double().clone().numpy())
                else:
                    # Vector case
                    grad.append(t.detach().view(-1).cpu().double().clone().numpy())
        return grad

    # Define the manifold (product of manifolds)
    manifolds_list = []
    for p_name, t in mll.named_parameters():
        try:
            # If a manifold is given, add it
            manifolds_list.append(attrgetter(p_name + "_manifold")(mll))
        except AttributeError:
            # Otherwise, default to Euclidean
            manifolds_list.append(
                Euclidean(int(np.prod(property_dict[p_name].shape)))
            )
    # Product of manifolds
    manifold = Product(manifolds_list)

    # Instantiate the problem on the manifold
    if track_iterations:
        verbosity = 2
    else:
        verbosity = 0
    problem = Problem(
        manifold=manifold,
        cost=cost,
        egrad=egrad,
        verbosity=verbosity,
        arg=torch.Tensor(),
    )  # , precon=precon)
    # For cases where the Hessian is hard/long to compute, we approximate it with
    # finite differences of the gradient. Typical cases: the Hessian can be hard
    # to compute due to the 2nd derivative of the eigenvalue decomposition, e.g.
    # in the SPD affine-invariant distance.
    problem._hess = types.MethodType(get_hessianfd, problem)

    # Choose initial parameters.
    # Do not always consider x0, to encourage variations of the parameters.
    if np.random.rand() < last_x_as_candidate_prob:
        x0_candidates = [x0]
        x0_candidates += [manifold.rand() for _ in range(nb_init_candidates - 1)]
    else:
        x0_candidates = []
        x0_candidates += [manifold.rand() for _ in range(nb_init_candidates)]
        for i in range(int(3 * nb_init_candidates / 4)):
            x0_candidates[i][0:4] = x0[0:4]  # TODO remove hard-coding

    y0_candidates = [cost(x0_candidates[i]) for i in range(nb_init_candidates)]
    y_init, x_init_idx = torch.Tensor(y0_candidates).min(0)
    x_init = x0_candidates[x_init_idx]

    with gpt_settings.fast_computations(log_prob=approx_mll):
        # Set the log verbosity of the solver to 1
        solver._logverbosity = 1
        # Solve
        opt_x, opt_log = solver.solve(problem, x=x_init)

    # Construct info dict
    info_dict = {
        "fopt": float(cost(opt_x).detach().numpy()),
        "wall_time": time.time() - t1,
        "opt_log": opt_log,
    }
    # if not res.success:  # TODO update
    #     try:
    #         # Some res.message are bytes
    #         msg = res.message.decode("ascii")
    #     except AttributeError:
    #         # Others are str
    #         msg = res.message
    #     warnings.warn(
    #         f"Fitting failed with the optimizer reporting '{msg}'",
    #         OptimizationWarning,
    #     )

    # Set to optimum
    mll = module_from_array_func(mll, opt_x, property_dict)
    return mll, info_dict
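# --- Usage sketch (illustrative, not part of the library) ---------------------
# fit_gpytorch_manifold looks up an attribute named "<parameter name>_manifold"
# on the mll (via attrgetter); parameters without one default to a Euclidean
# manifold. The Sphere manifold and the parameter it is attached to below are
# assumptions chosen only to illustrate this attribute convention.
def _example_fit_gpytorch_manifold(train_X, train_Y):
    from botorch.models import SingleTaskGP
    from gpytorch.mlls import ExactMarginalLogLikelihood
    from pymanopt.manifolds import Sphere

    gp = SingleTaskGP(train_X, train_Y)
    # Attach a manifold next to the parameter it refers to: for the parameter
    # "model.covar_module.base_kernel.raw_lengthscale", the function looks up
    # mll.model.covar_module.base_kernel.raw_lengthscale_manifold.
    d = train_X.shape[-1]
    gp.covar_module.base_kernel.raw_lengthscale_manifold = Sphere(d)

    mll = ExactMarginalLogLikelihood(gp.likelihood, gp)
    mll.train()
    mll, info = fit_gpytorch_manifold(mll, nb_init_candidates=50)
    mll.eval()
    return mll, info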