def __max_gaussians_1d(self, means1: torch.Tensor, vars1: torch.Tensor,
                       means2: torch.Tensor, vars2: torch.Tensor):
    """
    Computes the max of Gaussians over a single dimension. Each element of the
    parameters corresponds to one of the parameters of one of the Gaussians.

    :param means1: a vector of Gaussian means
    :param vars1: a vector of Gaussian variances
    :param means2: a vector of Gaussian means
    :param vars2: a vector of Gaussian variances
    :return: tuple (means, vars) with the means and variances, respectively, as tensors
    """
    alpha = torch.sqrt(vars1 + vars2 + self._epsilon)
    beta = (means1 - means2) / alpha
    n = Normal(0, 1)
    cdf_beta = n.cdf(beta)
    cdf_neg_beta = n.cdf(-beta)
    pdf_beta = torch.exp(n.log_prob(beta))
    mean_max = means1 * cdf_beta + means2 * cdf_neg_beta + alpha * pdf_beta
    # The way the variance is calculated here may cause it to become negative.
    # Epsilon is added to try to avoid that.
    var_max = (vars1 + torch.pow(means1, 2)) * cdf_beta + \
        (vars2 + torch.pow(means2, 2)) * cdf_neg_beta + \
        (means1 + means2) * alpha * pdf_beta - torch.pow(mean_max, 2) + self._epsilon
    if torch.any(var_max < 0):
        raise ValueError("Pooling layer: variance is negative. Epsilon should be increased")
    return mean_max, var_max

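A quick way to sanity-check the moment-matching above is to compare it against a Monte Carlo estimate of `max(X, Y)`. This is a minimal sketch, not part of the original module; the tolerances are illustrative.

import torch

# X ~ N(0, 1), Y ~ N(1, 4); estimate E[max(X, Y)] and Var[max(X, Y)] empirically.
x = torch.randn(200_000)
y = torch.randn(200_000) * 2.0 + 1.0
z = torch.maximum(x, y)
print(z.mean().item(), z.var().item())  # should be close to the analytic mean_max, var_max
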
def log_prior(self, theta):
    theta = self.scale_theta(theta)
    if self.prior == 'uniform':
        return 0
    else:
        prior = Normal(loc=self.prior_loc, scale=self.prior_scale)
        if self.num_bits is None:
            return sum_except_batch(prior.log_prob(theta))
        else:
            return sum_except_batch(torch.log(prior.cdf(theta + 1) - prior.cdf(theta) + 1e-12))

def sample_truncated_normal_perturbations(
    X: Tensor,
    n_discrete_points: int,
    sigma: float,
    bounds: Tensor,
    qmc: bool = True,
) -> Tensor:
    r"""Sample points around `X`.

    Sample perturbed points around `X` such that the added perturbations
    are sampled from N(0, sigma^2 I) and truncated to be within [0, 1]^d.

    Args:
        X: A `n x d`-dim tensor of starting points.
        n_discrete_points: The number of points to sample.
        sigma: The standard deviation of the additive Gaussian noise for
            perturbing the points.
        bounds: A `2 x d`-dim tensor containing the bounds.
        qmc: A boolean indicating whether to use qmc.

    Returns:
        A `n_discrete_points x d`-dim tensor containing the sampled points.
    """
    X = normalize(X, bounds=bounds)
    d = X.shape[1]
    # sample points from N(X_center, sigma^2 I), truncated to be within [0, 1]^d.
    if X.shape[0] > 1:
        rand_indices = torch.randint(X.shape[0], (n_discrete_points,), device=X.device)
        X = X[rand_indices]
    if qmc:
        std_bounds = torch.zeros(2, d, dtype=X.dtype, device=X.device)
        std_bounds[1] = 1
        u = draw_sobol_samples(bounds=std_bounds, n=n_discrete_points, q=1).squeeze(1)
    else:
        u = torch.rand((n_discrete_points, d), dtype=X.dtype, device=X.device)
    # compute bounds to sample from
    a = -X
    b = 1 - X
    # compute z-score of bounds
    alpha = a / sigma
    beta = b / sigma
    normal = Normal(0, 1)
    cdf_alpha = normal.cdf(alpha)
    # use inverse transform
    perturbation = normal.icdf(cdf_alpha + u * (normal.cdf(beta) - cdf_alpha)) * sigma
    # add perturbation and clip points that are still outside
    perturbed_X = (X + perturbation).clamp(0.0, 1.0)
    return unnormalize(perturbed_X, bounds=bounds)

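The inverse-transform step is the key trick here: drawing u ~ U(0, 1) and mapping it through icdf(cdf(alpha) + u * (cdf(beta) - cdf(alpha))) yields a sample from the standard normal truncated to [alpha, beta]. A minimal standalone sketch of just that step (the names below are illustrative, not from the original module):

import torch
from torch.distributions import Normal

def truncated_standard_normal(u, alpha, beta):
    # u ~ U(0, 1); returns samples from N(0, 1) truncated to [alpha, beta]
    normal = Normal(0.0, 1.0)
    cdf_alpha = normal.cdf(alpha)
    return normal.icdf(cdf_alpha + u * (normal.cdf(beta) - cdf_alpha))

u = torch.rand(5)
samples = truncated_standard_normal(u, alpha=torch.tensor(-1.0), beta=torch.tensor(2.0))
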
def log_prob(self, x):
    log_prob = torch.zeros(x.shape[0], self.num_mix)
    for d in range(self.num_dims):
        xd_low = 2 * (x[:, d] / 2**self.num_bits) - 1
        xd_high = 2 * ((x[:, d] + 1.0) / 2**self.num_bits) - 1
        xd_low[x[:, d] == 0] = -1e16
        xd_high[x[:, d] == 2**self.num_bits - 1] = +1e16
        for m in range(self.num_mix):
            dm = Normal(self.loc[m, d], self.log_scale[m, d].exp())
            prob_dm = dm.cdf(xd_high) - dm.cdf(xd_low)
            log_prob[:, m] += torch.log(prob_dm + self.eps)
    return torch.logsumexp(log_prob + torch.log_softmax(self.logit_pi, dim=-1), dim=-1)

def marginal_calibration(pred_y_mean, pred_y_var, y_true, ax):
    mean = torch.Tensor(pred_y_mean).cpu().squeeze()
    var = torch.Tensor(pred_y_var).cpu().squeeze()
    y = torch.Tensor(y_true).cpu().squeeze()
    dist = Normal(mean, var.sqrt())
    # calc and display the difference between the empirical cdf and the avg predictive cdf (as in
    # "Gneiting, T., Balabdaoui, F., & Raftery, A. E. (2007). Probabilistic forecasts, calibration and sharpness.
    # Journal of the Royal Statistical Society: Series B (Statistical Methodology), 69(2), 243-268.")
    emp_cdf = lambda x: (y <= x.unsqueeze(-1)).double().mean(-1)
    avg_pred_cdf = lambda x: dist.cdf(x.unsqueeze(-1)).mean(-1)
    min_x = y.min()
    max_x = y.max()
    eps = np.abs(max_x - min_x) * 0.05
    min_x = min_x - eps
    max_x = max_x + eps
    step = (max_x - min_x) / 1000
    plt_x = torch.arange(min_x, max_x, step)
    pcdf = avg_pred_cdf(plt_x)
    ecdf = emp_cdf(plt_x)
    dif = pcdf - ecdf
    ax.plot(plt_x, dif, color='lightblue')
    ax.plot(plt_x, np.repeat(0, plt_x.shape), color='lightgray', linestyle="--", alpha=0.75)

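A marginally calibrated forecaster should produce a difference curve that hovers near zero. A minimal, self-contained way to exercise the plot with synthetic data (the well-specified Gaussian predictions here are only illustrative, and this assumes the function above is importable as `marginal_calibration`):

import numpy as np
import torch
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
mu = rng.normal(size=500)                          # predictive means
y = mu + rng.normal(scale=1.0, size=500)           # targets drawn with unit noise
fig, ax = plt.subplots()
marginal_calibration(mu, np.ones_like(mu), y, ax)  # variance matches the noise, so the curve stays near 0
plt.show()
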
def forward(self, X: Tensor) -> Tensor:
    r"""Evaluate Expected Improvement on the candidate set X.

    Args:
        X: A `b1 x ... bk x 1 x d`-dim batched tensor of `d`-dim design points.
            Expected Improvement is computed for each point individually,
            i.e., what is considered are the marginal posteriors, not the joint.

    Returns:
        A `b1 x ... bk`-dim tensor of Expected Improvement values at the given
        design points `X`.
    """
    self.best_f = self.best_f.to(X)
    posterior = self.model.posterior(X)
    self._validate_single_output_posterior(posterior)
    mean = posterior.mean
    # deal with batch evaluation and broadcasting
    view_shape = mean.shape[:-2] if mean.dim() >= X.dim() else X.shape[:-2]
    mean = mean.view(view_shape)
    sigma = posterior.variance.clamp_min(1e-9).sqrt().view(view_shape)
    u = (mean - self.best_f.expand_as(mean)) / sigma
    if not self.maximize:
        u = -u
    normal = Normal(torch.zeros_like(u), torch.ones_like(u))
    ucdf = normal.cdf(u)
    updf = torch.exp(normal.log_prob(u))
    ei = sigma * (updf + u * ucdf)
    return ei

def mean(self, context, feedback=None):
    # get mean of truncated normal
    mean, std = self.cond_dist_params(context, feedback=feedback)
    std_normal = Normal(torch.zeros(mean.shape, device=mean.device),
                        torch.ones(std.shape, device=std.device))
    adjusted_a = (0 - mean) / std
    additional = std * std_normal.log_prob(adjusted_a).exp() / (1 - std_normal.cdf(adjusted_a))
    return mean + additional

def EI_fstar_known(model, x, fstar, min=True):
    '''
    This function calculates the Expected Improvement (EI) acquisition function.

    INPUT:
        model : a fitted model from which the posterior is estimated
        x     : Float - an x value to evaluate
        fstar : Float - the known optimal objective value f*
        min   : Boolean - determines if the function is a minimisation or maximisation problem

    OUTPUT:
        output : Float - returns the estimated improvement
    '''
    x = torch.from_numpy(np.array([[x]]).reshape(-1, xdims))
    # x = x.float()
    x = x.double()
    model.eval()
    posterior = model.posterior(x)
    mean = posterior.mean
    sigma = posterior.variance.clamp_min(1e-9).sqrt()
    val = mean - fstar
    u = val / sigma
    if min == True:
        u = -u
    normal = Normal(torch.zeros_like(u), torch.ones_like(u))
    ucdf = normal.cdf(u)
    updf = torch.exp(normal.log_prob(u))
    ei = sigma * (updf + u * ucdf)
    return (ei.item(), mean.item())

def generate_parameters(self, parameter_id, **kwargs):
    if not self._model_initialized:
        return _random_config(self.searchspace_json, self.random_state)
    else:
        # random samples and pick best with model
        candidate_x = [
            _random_config(self.searchspace_json, self.random_state)
            for _ in range(self.sample_size)
        ]
        x_test = np.array([np.array(list(xi.values())) for xi in candidate_x])
        m, v = self.model.predict(x_test)
        mean = torch.Tensor(m)
        sigma = torch.Tensor(v)
        u = (mean - torch.Tensor([0.95]).expand_as(mean)) / sigma
        normal = Normal(torch.zeros_like(u), torch.ones_like(u))
        ucdf = normal.cdf(u)
        updf = torch.exp(normal.log_prob(u))
        ei = sigma * (updf + u * ucdf)
        if self.optimize_mode == 'maximize':
            ind = torch.argmax(ei)
        else:
            ind = torch.argmin(ei)
        new_x = candidate_x[ind]
        return new_x

def eval(self, x: torch.FloatTensor, xe: torch.LongTensor) -> torch.FloatTensor:
    """minimize (-1 * EI, -1 * PI, lcb)"""
    with torch.no_grad():
        py, ps2 = self.model.predict(x, xe)
        noise = np.sqrt(2.0) * self.model.noise.sqrt()
        ps = ps2.sqrt()
        lcb = (py + noise * torch.randn(py.shape)) - self.kappa * ps
        normed = ((self.tau - self.eps - py - noise * torch.randn(py.shape)) / ps)
        dist = Normal(0., 1.)
        log_phi = dist.log_prob(normed)
        Phi = dist.cdf(normed)
        PI = Phi
        EI = ps * (Phi * normed + log_phi.exp())
        logEIapp = ps.log() - 0.5 * normed**2 - (normed**2 - 1).log()
        logPIapp = -0.5 * normed**2 - torch.log(-1 * normed) - torch.log(
            torch.sqrt(torch.tensor(2 * np.pi)))
        use_app = ~((normed > -6) & torch.isfinite(EI.log()) &
                    torch.isfinite(PI.log())).reshape(-1)
        out = torch.zeros(x.shape[0], 3)
        out[:, 0] = lcb.reshape(-1)
        out[:, 1][use_app] = -1 * logEIapp[use_app].reshape(-1)
        out[:, 2][use_app] = -1 * logPIapp[use_app].reshape(-1)
        out[:, 1][~use_app] = -1 * EI[~use_app].log().reshape(-1)
        out[:, 2][~use_app] = -1 * PI[~use_app].log().reshape(-1)
        return out

def forward(self, X: Tensor) -> Tensor:
    r"""Evaluate Expected Improvement on the candidate set X.

    Args:
        X: A `(b1 x ... bk) x 1 x d`-dim batched tensor of `d`-dim design points.
            Expected Improvement is computed for each point individually,
            i.e., what is considered are the marginal posteriors, not the joint.

    Returns:
        A `(b1 x ... bk)`-dim tensor of Expected Improvement values at the
        given design points `X`.
    """
    self.best_f = self.best_f.to(X)
    posterior = self.model.posterior(X=X, posterior_transform=self.posterior_transform)
    mean = posterior.mean
    # deal with batch evaluation and broadcasting
    view_shape = mean.shape[:-2] if mean.shape[-2] == 1 else mean.shape[:-1]
    mean = mean.view(view_shape)
    sigma = posterior.variance.clamp_min(1e-9).sqrt().view(view_shape)
    u = (mean - self.best_f.expand_as(mean)) / sigma
    if not self.maximize:
        u = -u
    normal = Normal(torch.zeros_like(u), torch.ones_like(u))
    ucdf = normal.cdf(u)
    updf = torch.exp(normal.log_prob(u))
    ei = sigma * (updf + u * ucdf)
    return ei

def forward(self, X: Tensor) -> Tensor:
    r"""Evaluate Constrained Expected Improvement on the candidate set X.

    Args:
        X: A `(b) x 1 x d`-dim Tensor of `(b)` t-batches of `d`-dim design
            points each.

    Returns:
        A `(b)`-dim Tensor of Expected Improvement values at the given
        design points `X`.
    """
    posterior = self.model.posterior(X)
    means = posterior.mean.squeeze(dim=-2)  # (b) x t
    sigmas = posterior.variance.squeeze(dim=-2).sqrt().clamp_min(1e-9)  # (b) x t
    # (b) x 1
    mean_obj = means[..., [self.objective_index]]
    sigma_obj = sigmas[..., [self.objective_index]]
    u = (mean_obj - self.best_f.expand_as(mean_obj)) / sigma_obj
    if not self.maximize:
        u = -u
    normal = Normal(
        torch.zeros(1, device=u.device, dtype=u.dtype),
        torch.ones(1, device=u.device, dtype=u.dtype),
    )
    ei_pdf = torch.exp(normal.log_prob(u))  # (b) x 1
    ei_cdf = normal.cdf(u)
    ei = sigma_obj * (ei_pdf + u * ei_cdf)
    prob_feas = self._compute_prob_feas(X=X, means=means, sigmas=sigmas)
    ei = ei.mul(prob_feas)
    return ei.squeeze(dim=-1)

def forward(self, X: Tensor) -> Tensor:
    r"""Evaluate Constrained Expected Improvement on the candidate set X.

    Args:
        X: A `(b) x 1 x d`-dim Tensor of `(b)` t-batches of `d`-dim design
            points each.

    Returns:
        A `(b)`-dim Tensor of Expected Improvement values at the given
        design points `X`.
    """
    self.best_f = self.best_f.to(X)
    posterior = self.model.posterior(X=X, posterior_transform=self.posterior_transform)
    means = posterior.mean.squeeze(dim=-2)  # (b) x m
    sigmas = posterior.variance.squeeze(dim=-2).sqrt().clamp_min(1e-9)  # (b) x m
    # (b) x 1
    oi = self.objective_index
    mean_obj = means[..., oi:oi + 1]
    sigma_obj = sigmas[..., oi:oi + 1]
    u = (mean_obj - self.best_f.expand_as(mean_obj)) / sigma_obj
    if not self.maximize:
        u = -u
    normal = Normal(
        torch.zeros(1, device=u.device, dtype=u.dtype),
        torch.ones(1, device=u.device, dtype=u.dtype),
    )
    ei_pdf = torch.exp(normal.log_prob(u))  # (b) x 1
    ei_cdf = normal.cdf(u)
    ei = sigma_obj * (ei_pdf + u * ei_cdf)
    prob_feas = self._compute_prob_feas(X=X, means=means, sigmas=sigmas)
    ei = ei.mul(prob_feas)
    return ei.squeeze(dim=-1)

def forward(self, X: Tensor) -> Tensor:
    r"""
    Approximates E_n[CVaR[F]] as described in ApxCVaRKG.

    :param X: The decision variable `x` and the `\beta` value.
        Shape: batch x num_fantasies x num_starting_sols x 1 x (dim_x + 1) (see below)
    :return: -E_n[CVaR[F(x, W)]].
        Shape: batch x num_fantasies x num_starting_sols
        Note that the return value is negated since the optimizers we use do maximization.
    """
    if X.requires_grad:
        torch.set_grad_enabled(True)
    # ensure X has the correct dtype and device
    X = X.to(self.w_samples)
    # make sure X has proper shape, 4 dimensional to match the batch shape of rhoKG
    assert X.shape[-1] == self.dim_x + 1
    if X.dim() < 4:
        X = X.reshape(-1, *self.model._input_batch_shape, 1, self.dim_x + 1)
    X_fant = X[..., :self.dim_x]  # batch x num_fantasies x n x 1 x dim_x
    beta = X[..., -1:]  # batch x num_fantasies x n x 1 x 1
    # Join X_fant with w_samples
    z_fant = torch.cat(
        [
            X_fant.repeat(*[1] * (X_fant.dim() - 2), self.num_samples, 1),
            self.w_samples.repeat(*X_fant.shape[:-2], 1, 1),
        ],
        dim=-1,
    )
    # get posterior mean and std dev
    with settings.propagate_grads(True):
        posterior = self.model.posterior(z_fant)
        mu = posterior.mean
        sigma = torch.sqrt(posterior.variance)
    # Calculate `E_f[[f(x) - \beta]^+]`
    u = (mu - beta.expand_as(mu)) / sigma
    # this is from EI
    normal = Normal(torch.zeros_like(u), torch.ones_like(u))
    ucdf = normal.cdf(u)
    updf = torch.exp(normal.log_prob(u))
    values = sigma * (updf + u * ucdf)
    # take the expectation over W
    if getattr(self, "weights", None) is None:
        values = torch.mean(values, dim=-2)
    else:
        # Get the expectation with weights
        values = values * self.weights.unsqueeze(-1)
        values = torch.sum(values, dim=-2)
    # add beta and divide by 1 - alpha
    values = beta.view_as(values) + values / (1 - self.alpha)
    # return with last dim squeezed
    # negated since CVaR is being minimized
    return -values.squeeze(-1)

def sample(self, context, feedback=None):
    mean, std = self.cond_dist_params(context, feedback=feedback)
    std_normal = Normal(torch.zeros(mean.shape, device=mean.device),
                        torch.ones(std.shape, device=std.device))
    mu = self.mean(context)
    alpha = (0 - mean) / std
    z = 1 - std_normal.cdf(alpha)
    phi_alpha = std_normal.log_prob(alpha).exp()
    sigma = std * torch.sqrt(1 + alpha * phi_alpha / z - (phi_alpha / z) ** 2)
    dist = Normal(mu, sigma)
    return dist.rsample().abs()

def pit_calc(means, vars, targets):
    mean = torch.Tensor(means).cpu().squeeze()
    var = torch.Tensor(vars).cpu().squeeze()
    y = torch.Tensor(targets).cpu().squeeze()
    dist = Normal(mean, var.sqrt())
    pt = dist.cdf(y)
    pt = pt.squeeze().numpy()
    return pt

def interval_coverage(pred_y_mean, pred_y_var, y_true, interval):
    # "the proportion of the time that the interval contains the true value of interest"
    mean = torch.Tensor(pred_y_mean).cpu()
    var = torch.Tensor(pred_y_var).cpu()
    y = torch.Tensor(y_true).cpu()
    dist = Normal(mean, var.sqrt())
    cov = dist.cdf(y) <= interval
    cov = cov.sum() / float(cov.shape[0])
    return cov.numpy()

def log_prob(self, x, context, should_sum=True, feedback=None):
    mean, std = self.cond_dist_params(context, feedback=feedback)
    dist = Normal(torch.zeros(mean.shape, device=mean.device),
                  torch.ones(std.shape, device=std.device))
    adjusted_x = (x - mean) / std
    adjusted_a = (0 - mean) / std
    log_gx = dist.log_prob(adjusted_x)
    log_c = ((1 - dist.cdf(adjusted_a)) * std).log()
    log_prob = log_gx - log_c
    # return sum_except_batch(dist.log_prob((x - mean).abs()))
    '''
    # Folded normal distribution
    mean, std = self.cond_dist_params(context)
    dist1 = Normal(mean, std)
    dist2 = Normal(-mean, std)
    log_prob = (dist1.log_prob(x).exp() + dist2.log_prob(x).exp()).log()
    '''
    if should_sum:
        return sum_except_batch(log_prob)
    else:
        return log_prob

def crps_torch(mean, std, target):
    # CRPS:
    # Gneiting, T., Raftery, A. E., Westveld III, A. H., & Goldman, T. (2005).
    # Calibrated probabilistic forecasting using ensemble model output statistics and minimum CRPS estimation.
    # Monthly Weather Review, 133(5), 1098-1118.
    # Formula 5
    sx = (target - mean) / std
    normal = Normal(torch.Tensor([0]).to(sx.device), torch.Tensor([1]).to(sx.device))
    pdf = normal.log_prob(sx).exp()
    cdf = normal.cdf(sx)
    assert pdf.shape == cdf.shape == sx.shape == target.shape
    crps = std * (sx * (2 * cdf - 1) + 2 * pdf - crps_const.to(sx.device))
    assert crps.shape == target.shape
    return crps.mean(0)

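`crps_const` is defined outside this snippet; in Formula 5 of the cited paper the constant is 1/sqrt(pi). A minimal self-contained sketch, assuming that value:

import math
import torch
from torch.distributions import Normal

crps_const = torch.tensor([1.0 / math.sqrt(math.pi)])

def crps_gaussian(mean, std, target):
    # CRPS of a Gaussian forecast N(mean, std^2) against the observed `target`
    sx = (target - mean) / std
    normal = Normal(torch.zeros_like(sx), torch.ones_like(sx))
    pdf = normal.log_prob(sx).exp()
    cdf = normal.cdf(sx)
    return std * (sx * (2 * cdf - 1) + 2 * pdf - crps_const.to(sx.device))

# Example: a standard normal forecast scored on an observation at its mean
print(crps_gaussian(torch.tensor([0.0]), torch.tensor([1.0]), torch.tensor([0.0])))  # ~0.2337
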
def forward(self, X: Tensor) -> Tensor:
    r"""Evaluate the Probability of Improvement on the candidate set X.

    Args:
        X: A `(b) x 1 x d`-dim Tensor of `(b)` t-batches of `d`-dim design
            points each.

    Returns:
        A `(b)`-dim tensor of Probability of Improvement values at the given
        design points `X`.
    """
    self.best_f = self.best_f.to(X)
    posterior = self._get_posterior(X=X)
    batch_shape = X.shape[:-2]
    mean = posterior.mean.view(batch_shape)
    sigma = posterior.variance.sqrt().clamp_min(1e-9).view(batch_shape)
    u = (mean - self.best_f.expand_as(mean)) / sigma
    if not self.maximize:
        u = -u
    normal = Normal(torch.zeros_like(u), torch.ones_like(u))
    return normal.cdf(u)

def forward(self, X: Tensor) -> Tensor:
    r"""Evaluate the Probability of Improvement on the candidate set X.

    Args:
        X: A `(b1 x ... bk) x 1 x d`-dim batched tensor of `d`-dim design points.

    Returns:
        A `(b1 x ... bk)`-dim tensor of Probability of Improvement values at
        the given design points `X`.
    """
    self.best_f = self.best_f.to(X)
    posterior = self.model.posterior(X=X, posterior_transform=self.posterior_transform)
    mean, sigma = posterior.mean, posterior.variance.sqrt().clamp_min(1e-9)
    view_shape = mean.shape[:-2] if mean.shape[-2] == 1 else mean.shape[:-1]
    mean = mean.view(view_shape)
    sigma = sigma.view(view_shape)
    u = (mean - self.best_f.expand_as(mean)) / sigma
    if not self.maximize:
        u = -u
    normal = Normal(torch.zeros_like(u), torch.ones_like(u))
    return normal.cdf(u)

def forward(self, X: Tensor) -> Tensor:
    """
    :param X: A (..., 1, input_dim) batched tensor of input_dim design points.
        Expected Improvement is computed for each point individually, i.e.,
        what is considered are the marginal posteriors, not the joint.
    :return: A (...) tensor of Expected Improvement values at the given
        design points `X`.
    """
    with torch.no_grad():
        # X_features: (..., input_dim); mu_est and sigma_est: both (..., 1,)
        X_features = X.detach().numpy().squeeze(1)
        mu_est, sigma_est = self.mean_std_predictor(X_features)
        # both (..., 1, 1)
        mu_est = torch.Tensor(mu_est).unsqueeze(1)
        sigma_est = torch.Tensor(sigma_est).unsqueeze(1)
    posterior = self._get_posterior(X=X)
    mean, sigma = scale_posterior(
        mu_posterior=posterior.mean,
        sigma_posterior=posterior.variance.clamp_min(1e-6).sqrt(),
        mu_est=mu_est,
        sigma_est=sigma_est,
    )
    u = (mean - self.best_f.expand_as(mean)) / sigma
    if not self.maximize:
        u = -u
    normal = Normal(torch.zeros_like(u), torch.ones_like(u))
    ucdf = normal.cdf(u)
    updf = torch.exp(normal.log_prob(u))
    ei = sigma * (updf + u * ucdf)
    return ei.squeeze(dim=-1).squeeze(dim=-1)

def forward(self, X: Tensor) -> Tensor:
    r"""Evaluate the Probability of Improvement on the candidate set X.

    Args:
        X: A `(b) x 1 x d`-dim Tensor of `(b)` t-batches of `d`-dim design
            points each.

    Returns:
        A `(b)`-dim tensor of Probability of Improvement values at the given
        design points `X`.
    """
    self.best_f = self.best_f.to(X)
    batch_shape = X.shape[:-2]
    posterior = self.model.posterior(X)
    self._validate_single_output_posterior(posterior)
    mean = posterior.mean.view(batch_shape)
    sigma = posterior.variance.sqrt().clamp_min(1e-9).view(batch_shape)
    u = (mean - self.best_f.expand_as(mean)) / sigma
    if not self.maximize:
        u = -u
    normal = Normal(torch.zeros_like(u), torch.ones_like(u))
    return normal.cdf(u)

def generate_parameters(self, parameter_id, **kwargs):
    if not self._model_initialized:
        return _random_config(self.searchspace_json, self.random_state)
    else:
        # random samples and pick best with model
        candidate_x = [
            _random_config(self.searchspace_json, self.random_state)
            for _ in range(self.sample_size)
        ]
        # The model has NaN issue when all the candidates are same
        # Also we can save the predict time when this happens
        if all(x == candidate_x[0] for x in candidate_x):
            return candidate_x[0]
        x_test = np.array([np.array(list(xi.values())) for xi in candidate_x])
        m, v = self.model.predict(x_test)
        # The model has NaN issue when all the candidates are very close
        if np.isnan(m).any() or np.isnan(v).any():
            return candidate_x[0]
        mean = torch.Tensor(m)
        sigma = torch.Tensor(v)
        u = (mean - torch.Tensor([0.95]).expand_as(mean)) / sigma
        normal = Normal(torch.zeros_like(u), torch.ones_like(u))
        ucdf = normal.cdf(u)
        updf = torch.exp(normal.log_prob(u))
        ei = sigma * (updf + u * ucdf)
        if self.optimize_mode == 'maximize':
            ind = torch.argmax(ei)
        else:
            ind = torch.argmin(ei)
        new_x = candidate_x[ind]
        return new_x

def eval(self, x: nx.Graph, asscalar=False):
    """Return the negative expected improvement at the query point x2"""
    from torch.distributions import Normal
    try:
        mu, cov = self.gp.predict(x)
    except:
        return -1.  # in case of error, return an EI of -1
    std = torch.sqrt(torch.diag(cov))
    mu_star = self._get_incumbent()
    gauss = Normal(torch.zeros(1, device=mu.device), torch.ones(1, device=mu.device))
    u = (mu - mu_star - self.xi) / std
    ucdf = gauss.cdf(u)
    updf = torch.exp(gauss.log_prob(u))
    ei = std * updf + (mu - mu_star - self.xi) * ucdf
    if self.augmented_ei:
        sigma_n = self.gp.likelihood
        ei *= (1. - torch.sqrt(torch.tensor(sigma_n, device=mu.device)) /
               torch.sqrt(sigma_n + torch.diag(cov)))
    if asscalar:
        ei = ei.detach().numpy().item()
    return ei

class ExpectedHypervolumeImprovement(MultiObjectiveAnalyticAcquisitionFunction):

    def __init__(
        self,
        model: Model,
        ref_point: List[float],
        partitioning: NondominatedPartitioning,
        objective: Optional[AnalyticMultiOutputObjective] = None,
    ) -> None:
        r"""Expected Hypervolume Improvement supporting m>=2 outcomes.

        This computes EHVI using the algorithm from [Yang2019]_, but additionally
        computes gradients via auto-differentiation as proposed by [Daulton2020]_.

        Note: this is currently inefficient in two ways due to the binary partitioning
        algorithm that we use for the box decomposition:
            - We have more boxes in our decomposition
            - If we used a box decomposition that used `inf` as the upper bound for
              the last dimension *in all hypercells*, then we could reduce the number
              of terms we need to compute from 2^m to 2^(m-1). [Yang2019]_ do this by
              using DKLV17 and LKF17 for the box decomposition.

        TODO: Use DKLV17 and LKF17 for the box decomposition as in [Yang2019]_ for
        greater efficiency.

        TODO: Add support for outcome constraints.

        Example:
            >>> model = SingleTaskGP(train_X, train_Y)
            >>> ref_point = [0.0, 0.0]
            >>> EHVI = ExpectedHypervolumeImprovement(model, ref_point, partitioning)
            >>> ehvi = EHVI(test_X)

        Args:
            model: A fitted model.
            ref_point: A list with `m` elements representing the reference point
                (in the outcome space) w.r.t. which to compute the hypervolume.
                This is a reference point for the objective values (i.e. after
                applying `objective` to the samples).
            partitioning: A `NondominatedPartitioning` module that provides the
                non-dominated front and a partitioning of the non-dominated space
                in hyper-rectangles.
            objective: An `AnalyticMultiOutputObjective`.
        """
        # TODO: we could refactor this __init__ logic into a
        # HypervolumeAcquisitionFunction Mixin
        if len(ref_point) != partitioning.num_outcomes:
            raise ValueError(
                "The length of the reference point must match the number of outcomes. "
                f"Got ref_point with {len(ref_point)} elements, but expected "
                f"{partitioning.num_outcomes}."
            )
        ref_point = torch.tensor(
            ref_point,
            dtype=partitioning.pareto_Y.dtype,
            device=partitioning.pareto_Y.device,
        )
        better_than_ref = (partitioning.pareto_Y > ref_point).all(dim=1)
        if not better_than_ref.any() and partitioning.pareto_Y.shape[0] > 0:
            raise ValueError(
                "At least one pareto point must be better than the reference point."
            )
        super().__init__(model=model, objective=objective)
        self.register_buffer("ref_point", ref_point)
        self.partitioning = partitioning
        cell_bounds = self.partitioning.get_hypercell_bounds(ref_point=self.ref_point)
        self.register_buffer("cell_lower_bounds", cell_bounds[0])
        self.register_buffer("cell_upper_bounds", cell_bounds[1])
        # create indexing tensor of shape `2^m x m`
        self._cross_product_indices = torch.tensor(
            list(product(*[[0, 1] for _ in range(ref_point.shape[0])])),
            dtype=torch.long,
            device=ref_point.device,
        )
        self.normal = Normal(0, 1)

    def psi(self, lower: Tensor, upper: Tensor, mu: Tensor, sigma: Tensor) -> Tensor:
        r"""Compute Psi function.

        For each cell i and outcome k:

            Psi(lower_{i,k}, upper_{i,k}, mu_k, sigma_k) =
                sigma_k * PDF((upper_{i,k} - mu_k) / sigma_k)
                + (mu_k - lower_{i,k}) * (1 - CDF((upper_{i,k} - mu_k) / sigma_k))

        See Equation 19 in [Yang2019]_ for more details.

        Args:
            lower: A `num_cells x m`-dim tensor of lower cell bounds
            upper: A `num_cells x m`-dim tensor of upper cell bounds
            mu: A `batch_shape x 1 x m`-dim tensor of means
            sigma: A `batch_shape x 1 x m`-dim tensor of standard deviations (clamped).

        Returns:
            A `batch_shape x num_cells x m`-dim tensor of values.
        """
        u = (upper - mu) / sigma
        return sigma * self.normal.log_prob(u).exp() + (mu - lower) * (
            1 - self.normal.cdf(u)
        )

    def nu(self, lower: Tensor, upper: Tensor, mu: Tensor, sigma: Tensor) -> Tensor:
        r"""Compute Nu function.

        For each cell i and outcome k:

            nu(lower_{i,k}, upper_{i,k}, mu_k, sigma_k) =
                (upper_{i,k} - lower_{i,k}) * (1 - CDF((upper_{i,k} - mu_k) / sigma_k))

        See Equation 25 in [Yang2019]_ for more details.

        Args:
            lower: A `num_cells x m`-dim tensor of lower cell bounds
            upper: A `num_cells x m`-dim tensor of upper cell bounds
            mu: A `batch_shape x 1 x m`-dim tensor of means
            sigma: A `batch_shape x 1 x m`-dim tensor of standard deviations (clamped).

        Returns:
            A `batch_shape x num_cells x m`-dim tensor of values.
        """
        return (upper - lower) * (1 - self.normal.cdf((upper - mu) / sigma))

    @t_batch_mode_transform()
    def forward(self, X: Tensor) -> Tensor:
        posterior = self.objective(self.model.posterior(X))
        mu = posterior.mean
        sigma = posterior.variance.clamp_min(1e-9).sqrt()
        # clamp here, since upper_bounds will contain `inf`s, which
        # are not differentiable
        cell_upper_bounds = self.cell_upper_bounds.clamp_max(
            1e10 if X.dtype == torch.double else 1e8
        )
        # Compute psi(lower_i, upper_i, mu_i, sigma_i) for i=0, ... m-2
        psi_lu = self.psi(
            lower=self.cell_lower_bounds, upper=cell_upper_bounds, mu=mu, sigma=sigma
        )
        # Compute psi(lower_m, lower_m, mu_m, sigma_m)
        psi_ll = self.psi(
            lower=self.cell_lower_bounds,
            upper=self.cell_lower_bounds,
            mu=mu,
            sigma=sigma,
        )
        # Compute nu(lower_m, upper_m, mu_m, sigma_m)
        nu = self.nu(
            lower=self.cell_lower_bounds, upper=cell_upper_bounds, mu=mu, sigma=sigma
        )
        # compute the difference psi_ll - psi_lu
        psi_diff = psi_ll - psi_lu
        # this is batch_shape x num_cells x 2 x (m-1)
        stacked_factors = torch.stack([psi_diff, nu], dim=-2)
        # Take the cross product of psi_diff and nu across all outcomes
        # e.g. for m = 2, for each batch and cell, compute
        # [psi_diff_0, psi_diff_1]
        # [nu_0, psi_diff_1]
        # [psi_diff_0, nu_1]
        # [nu_0, nu_1]
        # this tensor has shape: `batch_shape x num_cells x 2^m x m`
        all_factors_up_to_last = stacked_factors.gather(
            dim=-2,
            index=self._cross_product_indices.expand(
                stacked_factors.shape[:-2] + self._cross_product_indices.shape
            ),
        )
        # compute product for all 2^m terms,
        # sum across all terms and hypercells
        return all_factors_up_to_last.prod(dim=-1).sum(dim=-1).sum(dim=-1)

      normal.log_prob(value=torch.Tensor([-1, 0, .5])), "\n")
print("log-likelihood given value with (2,3):\n",
      normal.log_prob(value=torch.Tensor([[-1, 0, .5], [-2, 1, 3]])))

print("log-probability given value with shape ():\n",
      binomial.log_prob(value=torch.Tensor([5])), "\n")
print("log-probability given value with (3,):\n",
      binomial.log_prob(value=torch.Tensor([5, 3, 7])), "\n")
print("log-probability given value with (2,3):\n",
      binomial.log_prob(value=torch.Tensor([[5, 3, 7], [2, 0, 10]])))

Given an upper-bound value, the normal distribution's `.cdf()` can be used to compute the cumulative probability at that value:

print("cumulative probability given value with shape ():\n",
      normal.cdf(value=torch.Tensor([0])), "\n")
print("cumulative probability given value with (3,):\n",
      normal.cdf(value=torch.Tensor([-1, 0, .5])), "\n")
print("cumulative probability given value with (2,3):\n",
      normal.cdf(value=torch.Tensor([[-1, 0, .5], [-2, 1, 3]])))

Note, however, that the binomial distribution has no `cdf()` method for evaluating cumulative probabilities.

### Shapes of distribution objects

The design of `pytorch` distribution objects follows the `tensorflow_probability` package. A distribution object involves three kinds of shape:

1. Sample shape: describes independent, identically distributed random draws; the `sample_shape` set when drawing random samples earlier is the sample shape.
2. Batch shape: describes random draws that are independent but not identically distributed; it is determined by the shapes of the distribution's parameters.
3. Event shape: describes a multivariate distribution whose component variables need not be statistically independent.

For the normal distribution created earlier, both `batch_shape` and `event_shape` are scalar, i.e. those of a 0-d tensor.
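A short sketch illustrating the three shapes with `torch.distributions` (the parameter values here are only illustrative):

import torch
from torch.distributions import Normal, MultivariateNormal

# Batch shape comes from the parameter shapes: two independent, non-identical normals.
batched = Normal(loc=torch.tensor([0.0, 1.0]), scale=torch.tensor([1.0, 2.0]))
print(batched.batch_shape, batched.event_shape)   # torch.Size([2]) torch.Size([])

# Event shape describes a multivariate distribution; a 3-d MVN has event_shape (3,).
mvn = MultivariateNormal(torch.zeros(3), torch.eye(3))
print(mvn.batch_shape, mvn.event_shape)           # torch.Size([]) torch.Size([3])

# Sample shape is chosen at sampling time and is prepended to batch/event shapes.
print(batched.sample(sample_shape=torch.Size([5])).shape)  # torch.Size([5, 2])
print(mvn.sample(sample_shape=torch.Size([5])).shape)      # torch.Size([5, 3])
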
def normal_cdf(loc, sd):
    """normal cdf(0)"""
    # it is not jit-able
    d = Normal(loc, sd)
    return d.cdf(0)

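If a scriptable alternative is needed, the same quantity can be computed directly from `torch.erf`, avoiding the distribution object altogether. This is a hedged sketch, not part of the original code:

import math
import torch

@torch.jit.script
def normal_cdf_at_zero(loc: torch.Tensor, sd: torch.Tensor) -> torch.Tensor:
    # P(X <= 0) for X ~ N(loc, sd^2), written with torch.erf so it can be scripted
    return 0.5 * (1.0 + torch.erf(-loc / (sd * math.sqrt(2.0))))
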
def plot(self, axes=None, block=False, Ndiv=100, legend=True, title="GPgrad",
         plotting=True, plotCDF=False, clear_axes=False, Nsamples=None, ylabel=None,
         ylim=None, pause=None, showtickslabels_x=True, xlabel=None, labelsize=None,
         showtickslabels=None, showticks=None, linewidth=None, color=None, prob=False):
    '''
    This function hardcodes the plotting limits between zero and one for now
    '''
    if plotting == False or self.dim > 1:
        return

    pp = PlotProbability()
    xpred_vec = torch.linspace(0.0, 1.0, Ndiv)[:, None]
    # xpred_vec = xpred_vec.unsqueeze(0)  # Ndiv batches of [q=1 x self.dim] dimensions each

    # Compute one by one:
    logger.info("Computing posterior while plotting ... (!!)")
    post_batch = False
    if post_batch:
        # Predict:
        posterior = self.posterior(
            X=xpred_vec, observation_noise=False
        )  # observation_noise MUST be always false; this class is not prepared otherwise
        # Internally, self.posterior(xpred_vec) calls self(xpred_vec), which calls self.predictive(xpred_vec)
        # pdb.set_trace()
        # Get upper and lower confidence bounds (2 standard deviations from the mean):
        lower_ci, upper_ci = posterior.mvn.confidence_region()
        # Posterior mean:
        mean_vec = posterior.mean
        std_vec = posterior.variance.sqrt()
    else:
        lower_ci = torch.zeros((Ndiv))
        upper_ci = torch.zeros((Ndiv))
        mean_vec = torch.zeros((Ndiv))
        std_vec = torch.zeros((Ndiv))
        for k in range(Ndiv):
            mvn = self.predictive(xpred_vec[k, :].view(-1, self.dim))
            lower_ci[k], upper_ci[k] = mvn.confidence_region()
            mean_vec[k] = mvn.mean
            std_vec[k] = mvn.variance.sqrt()

    if self.dim == 1:
        if prob == False:
            axes = pp.plot_GP_1D(
                xpred_vec=xpred_vec.squeeze().cpu().numpy(),
                fpred_mode_vec=mean_vec.squeeze().detach().cpu().numpy(),
                fpred_quan_minus=lower_ci.squeeze().detach().cpu().numpy(),
                fpred_quan_plus=upper_ci.squeeze().detach().cpu().numpy(),
                X_uns=self.train_xu.detach().cpu().numpy(),
                X_sta=self.train_xs.detach().cpu().numpy(),
                Y_sta=self.train_ys.detach().cpu().numpy(),
                title=title,
                axes=axes,
                block=block,
                legend=legend,
                clear_axes=True,
                xlabel=None,
                ylabel=ylabel,
                xlim=np.array([0., 1.]),
                ylim=ylim,
                labelsize="x-large",
                legend_loc="upper left",
                colormap="paper",
                showtickslabels_x=showtickslabels_x)
        else:
            normal = Normal(
                loc=mean_vec.squeeze(),
                # scale=posterior.variance.sqrt().squeeze())
                scale=std_vec.squeeze())
            ei_cdf = normal.cdf(self.threshold)
            # pdb.set_trace()
            axes = pp.plot_acquisition_function(
                var_vec=ei_cdf,
                xpred_vec=xpred_vec.cpu().numpy(),
                xlabel=xlabel,
                ylabel=ylabel,
                title=title,
                legend=legend,
                axes=axes,
                clear_axes=True,
                xlim=np.array([0., 1.]),
                block=block,
                labelsize=labelsize,
                showtickslabels=showtickslabels,
                showticks=showticks,
                what2plot="",
                color=color,
                ylim=np.array([0., 1.1]),
                linewidth=linewidth)

        if Nsamples is not None:
            f_sample = posterior.sample(sample_shape=torch.Size([Nsamples]))
            for k in range(Nsamples):
                axes.plot(xpred_vec.squeeze().detach().cpu().numpy(),
                          f_sample[k, :, 0],
                          linestyle="--", linewidth=1.0, color="sienna")
    elif self.dim == 2:
        pass

    plt.show(block=block)
    if pause is not None:
        plt.pause(pause)

    return axes

ub = 4
lower_bound = torch.zeros(n_samples) + torch.Tensor([lb])
upper_bound = torch.zeros(n_samples) + torch.Tensor([ub])

samples = trandn((lower_bound - mus) / stds, (upper_bound - mus) / stds)
samples = samples * stds + mus
mean = samples.mean()

norm = Normal(0, 1)
alpha = -mus / stds

t = time.time()
for _ in range(300):
    alpha_log_pdf = norm.log_prob(alpha)
print("log_prob time:", time.time() - t)

alpha_pdf = torch.exp(alpha_log_pdf)
Z = 1 - norm.cdf(alpha)
theoretical_mean = mu + std * (alpha_pdf / Z)

t = time.time()
for _ in range(1):
    logZhat, Zhat, muHat, sigmaHat, entropy = moments(
        lower_bound, upper_bound,
        torch.Tensor([mu]).expand_as(lower_bound),
        torch.Tensor([std**2]).expand_as(lower_bound))
print("Robust time:", time.time() - t)

print("=============================\n\n")
print(
    f"Estimated mean: {mean}\nTheoretical mean: {theoretical_mean[0]}\nRobust evaluation of mean: {muHat.tolist()[0]}"
)
print("=============================\n\n")

class NormalUniform(Distribution):
    """
    A mixture of a Normal distribution and a Uniform distribution, defined over
    the interval -1 to 1. Whatever probability mass is left over from the Normal
    distribution (outside -1 to 1) is converted into a Uniform.
    """
    arg_constraints = {'loc': constraints.real, 'scale': constraints.positive}
    support = constraints.interval(-1., 1.)
    has_rsample = False

    def __init__(self, loc, scale, validate_args=None):
        loc = torch.tanh(loc)
        self.loc, self.scale = broadcast_all(loc, scale)
        if isinstance(loc, Number) and isinstance(scale, Number):
            batch_shape = torch.Size()
        else:
            batch_shape = self.loc.size()
        super(NormalUniform, self).__init__(batch_shape, validate_args=validate_args)
        self.normal = Normal(self.loc, self.scale)
        dev = self.normal.loc.device
        self.low = -torch.ones(batch_shape, device=dev)
        self.high = torch.ones(batch_shape, device=dev)
        self.uniform = Uniform(self.low, self.high)
        normal_prob = self.normal.cdf(torch.ones(batch_shape, device=dev)) - \
            self.normal.cdf(-torch.ones(batch_shape, device=dev))
        self.uniform_factor = 1 - normal_prob

    def log_prob(self, value):
        normal_prob = self.normal.log_prob(value).exp()
        uniform_prob = self.uniform_factor * self.uniform.log_prob(value).exp()
        return (normal_prob + uniform_prob + torch.finfo(value.dtype).eps).log()

    def sample(self, sample_shape=torch.Size()):
        shape = self._extended_shape(sample_shape)
        normal_sample = self.normal.sample(sample_shape)
        uniform_sample = self.uniform.sample(sample_shape)
        # check for places where the normal sample is outside [-1, 1]
        # and replace with a uniform sample
        dist_flag = ((normal_sample > 1) + (normal_sample < -1)).float()
        sample = (1. - dist_flag) * normal_sample + dist_flag * uniform_sample
        return sample

    def expand(self, batch_shape, _instance=None):
        new = self._get_checked_instance(NormalUniform, _instance)
        batch_shape = torch.Size(batch_shape)
        new.loc = self.loc.expand(batch_shape)
        new.scale = self.scale.expand(batch_shape)
        new.normal = Normal(new.loc, new.scale)
        new.low = self.low.expand(batch_shape)
        new.high = self.high.expand(batch_shape)
        new.uniform = Uniform(new.low, new.high)
        dev = new.normal.loc.device
        normal_prob = new.normal.cdf(torch.ones(batch_shape, device=dev)) - \
            new.normal.cdf(-torch.ones(batch_shape, device=dev))
        new.uniform_factor = 1 - normal_prob
        super(NormalUniform, new).__init__(batch_shape, validate_args=False)
        new._validate_args = self._validate_args
        return new

    def cdf(self, value):
        raise NotImplementedError

    def icdf(self, value):
        raise NotImplementedError

    def entropy(self):
        raise NotImplementedError

class MoE(nn.Module):
    """Call a Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

    Args:
        input_size: integer - size of the input
        output_size: integer - size of the output
        num_experts: an integer - number of experts
        hidden_size: an integer - hidden size of the experts
        noisy_gating: a boolean
        k: an integer - how many experts to use for each batch element
    """

    def __init__(self, input_size, hidden_size, latent_dim, output_size, num_experts,
                 num_blocks=3, noisy_gating=True, k=4):
        super(MoE, self).__init__()
        self.noisy_gating = noisy_gating
        self.num_experts = num_experts
        self.output_size = output_size
        self.input_size = input_size
        self.latent_size = latent_dim
        self.hidden_size = hidden_size
        self.k = k
        action_size = output_size
        # Remove the action masking from the input to match sizes properly
        input_size = input_size - action_size
        self.encoder = ResNet(input_size=input_size, hidden_size=hidden_size,
                              output_size=latent_dim, num_blocks=num_blocks)
        # instantiate experts
        self.experts = nn.ModuleList([
            ResNet(input_size=latent_dim, hidden_size=hidden_size,
                   output_size=output_size, num_blocks=num_blocks)
            for i in range(self.num_experts)
        ])
        self.value = ResNet(input_size=input_size, hidden_size=hidden_size,
                            output_size=1, num_blocks=num_blocks)
        self.w_gate = nn.Parameter(torch.zeros(latent_dim, num_experts), requires_grad=True)
        self.w_noise = nn.Parameter(torch.zeros(latent_dim, num_experts), requires_grad=True)
        self.softplus = nn.Softplus()
        self.softmax = nn.Softmax(1)
        self.normal = Normal(torch.tensor([0.0]), torch.tensor([1.0]))
        assert (self.k <= self.num_experts)

    def cv_squared(self, x):
        """The squared coefficient of variation of a sample.

        Useful as a loss to encourage a positive distribution to be more uniform.
        Epsilons added for numerical stability. Returns 0 for an empty Tensor.

        Args:
            x: a `Tensor`.
        Returns:
            a `Scalar`.
        """
        eps = 1e-10
        # if only num_experts = 1
        if x.shape[0] == 1:
            return torch.Tensor([0])
        return x.float().var() / (x.float().mean()**2 + eps)

    def _gates_to_load(self, gates):
        """Compute the true load per expert, given the gates.

        The load is the number of examples for which the corresponding gate is > 0.

        Args:
            gates: a `Tensor` of shape [batch_size, n]
        Returns:
            a float32 `Tensor` of shape [n]
        """
        return (gates > 0).sum(0)

    def _prob_in_top_k(self, clean_values, noisy_values, noise_stddev, noisy_top_values):
        """Helper function to NoisyTopKGating.

        Computes the probability that value is in top k, given different random noise.
        This gives us a way of backpropagating from a loss that balances the number of
        times each expert is in the top k experts per example.
        In the case of no noise, pass in None for noise_stddev, and the result will
        not be differentiable.

        Args:
            clean_values: a `Tensor` of shape [batch, n].
            noisy_values: a `Tensor` of shape [batch, n]. Equal to clean values plus
                normally distributed noise with standard deviation noise_stddev.
            noise_stddev: a `Tensor` of shape [batch, n], or None
            noisy_top_values: a `Tensor` of shape [batch, m].
                "values" output of tf.top_k(noisy_top_values, m). m >= k+1
        Returns:
            a `Tensor` of shape [batch, n].
        """
        batch = clean_values.size(0)
        m = noisy_top_values.size(1)
        top_values_flat = noisy_top_values.flatten()
        threshold_positions_if_in = torch.arange(batch) * m + self.k
        threshold_if_in = torch.unsqueeze(
            torch.gather(top_values_flat, 0, threshold_positions_if_in), 1)
        is_in = torch.gt(noisy_values, threshold_if_in)
        threshold_positions_if_out = threshold_positions_if_in - 1
        threshold_if_out = torch.unsqueeze(
            torch.gather(top_values_flat, 0, threshold_positions_if_out), 1)
        # is each value currently in the top k.
        prob_if_in = self.normal.cdf((clean_values - threshold_if_in) / noise_stddev)
        prob_if_out = self.normal.cdf((clean_values - threshold_if_out) / noise_stddev)
        prob = torch.where(is_in, prob_if_in, prob_if_out)
        return prob

    def noisy_top_k_gating(self, x, train, noise_epsilon=1e-2):
        """Noisy top-k gating.

        See paper: https://arxiv.org/abs/1701.06538.

        Args:
            x: input Tensor with shape [batch_size, input_size]
            train: a boolean - we only add noise at training time.
            noise_epsilon: a float
        Returns:
            gates: a Tensor with shape [batch_size, num_experts]
            load: a Tensor with shape [num_experts]
        """
        clean_logits = x @ self.w_gate
        if self.noisy_gating:
            raw_noise_stddev = x @ self.w_noise
            noise_stddev = ((self.softplus(raw_noise_stddev) + noise_epsilon) * train)
            noisy_logits = clean_logits + (torch.randn_like(clean_logits) * noise_stddev)
            logits = noisy_logits
        else:
            logits = clean_logits
        # calculate topk + 1 that will be needed for the noisy gates
        top_logits, top_indices = logits.topk(min(self.k + 1, self.num_experts), dim=1)
        top_k_logits = top_logits[:, :self.k]
        top_k_indices = top_indices[:, :self.k]
        top_k_gates = self.softmax(top_k_logits)
        zeros = torch.zeros_like(logits, requires_grad=True)
        gates = zeros.scatter(1, top_k_indices, top_k_gates)
        if self.noisy_gating and self.k < self.num_experts:
            load = (self._prob_in_top_k(clean_logits, noisy_logits, noise_stddev,
                                        top_logits)).sum(0)
        else:
            load = self._gates_to_load(gates)
        return gates, load

    def forward(self, observation, prev_action, prev_reward):
        """Args:
            x: tensor shape [batch_size, input_size]
            train: a boolean scalar.
            loss_coef: a scalar - multiplier on load-balancing losses

        Returns:
            y: a tensor with shape [batch_size, output_size].
            extra_training_loss: a scalar. This should be added into the overall
                training loss of the model. The backpropagation of this loss
                encourages all experts to be approximately equally used across a batch.
        """
        train = self.training
        observation = observation.float()
        # Infer (presence of) leading dimensions: [T,B], [B], or [].
        lead_dim, T, B, obs_shape = infer_leading_dims(observation, 1)
        observation = observation.view(T * B, *obs_shape)
        action_mask = observation[:, -19:].type(torch.bool)
        observation = observation[:, :-19]
        z = self.encoder(observation)
        gates, load = self.noisy_top_k_gating(z, train)
        dispatcher = SparseDispatcher(self.num_experts, gates)
        expert_inputs = dispatcher.dispatch(z)
        gates = dispatcher.expert_to_gates()
        expert_outputs = [
            self.experts[i](expert_inputs[i]) for i in range(self.num_experts)
        ]
        y = dispatcher.combine(expert_outputs)
        value = self.value(observation).squeeze(-1)
        y[~action_mask] = -1e24
        y = nn.functional.softmax(y, dim=-1)
        y, value = restore_leading_dims((y, value), lead_dim, T, B)
        return y, value

    def loss(self, observation, prev_action, prev_reward, loss_coef=1e-1):
        train = self.training
        observation = observation.float()
        lead_dim, T, B, obs_shape = infer_leading_dims(observation, 1)
        observation = observation.view(T * B, *obs_shape)
        action_mask = observation[:, -19:].type(torch.bool)
        observation = observation[:, :-19]
        z = self.encoder(observation)
        gates, load = self.noisy_top_k_gating(z, train)
        # calculate importance loss
        importance = gates.sum(0)
        loss = self.cv_squared(importance) + self.cv_squared(load)
        loss *= loss_coef
        return loss

class TanhNormal(Distribution):
    """
    Represent distribution of X where
        X ~ tanh(Z)
        Z ~ N(mean, std)

    Note: this is not very numerically stable.
    """

    def __init__(self, normal_mean, normal_std, epsilon=1e-6):
        """
        Args:
            normal_mean (Tensor): Mean of the normal distribution
            normal_std (Tensor): Std of the normal distribution
            epsilon (Double): Numerical stability epsilon when computing log-prob.
        """
        super(TanhNormal, self).__init__()
        self._normal_mean = normal_mean
        self._normal_std = normal_std
        self._normal = Normal(normal_mean, normal_std)
        self._epsilon = epsilon

    @property
    def mean(self):
        return self._normal.mean

    @property
    def variance(self):
        return self._normal.variance

    @property
    def stddev(self):
        return self._normal.stddev

    @property
    def epsilon(self):
        return self._epsilon

    def sample(self, return_pretanh_value=False):
        # z = self._normal.sample()
        z = self._normal.sample().detach()
        if return_pretanh_value:
            return torch.tanh(z), z
        else:
            return torch.tanh(z)

    def rsample(self, return_pretanh_value=False):
        z = self._normal.rsample()
        # z = (
        #     self._normal_mean +
        #     self._normal_std *
        #     Normal(
        #         ptu.zeros(self._normal_mean.size()),
        #         ptu.ones(self._normal_std.size()),
        #     ).sample()
        # )
        if return_pretanh_value:
            return torch.tanh(z), z
        else:
            return torch.tanh(z)

    def sample_n(self, n, return_pre_tanh_value=False):
        z = self._normal.sample_n(n)
        if return_pre_tanh_value:
            return torch.tanh(z), z
        else:
            return torch.tanh(z)

    def log_prob(self, value, pre_tanh_value=None):
        """
        Returns the log of the probability density function evaluated at `value`.

        Args:
            value (Tensor):
            pre_tanh_value (Tensor): arctanh(value)
        Returns:
            log_prob (Tensor)
        """
        if pre_tanh_value is None:
            pre_tanh_value = torch.log((1 + value) / (1 - value)) / 2
        return self._normal.log_prob(pre_tanh_value) - \
            torch.log(1. - value * value + self._epsilon)
        # return self.normal.log_prob(pre_tanh_value) - \
        #     torch.log(1. - torch.tanh(pre_tanh_value)**2 + self._epsilon)

    def cdf(self, value, pre_tanh_value=None):
        if pre_tanh_value is None:
            pre_tanh_value = torch.log((1 + value) / (1 - value)) / 2
        return self._normal.cdf(pre_tanh_value)

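A short usage sketch: since tanh is monotone, the CDF of X = tanh(Z) at x is simply the normal CDF evaluated at arctanh(x), which is what `cdf` above computes. The check below is illustrative only.

import torch

dist = TanhNormal(torch.tensor([0.0]), torch.tensor([1.0]))
x, z = dist.rsample(return_pretanh_value=True)   # x = tanh(z), z ~ N(0, 1)
print(dist.log_prob(x, pre_tanh_value=z))        # change-of-variables log-density
print(dist.cdf(torch.tensor([0.5])))             # equals Normal(0, 1).cdf(atanh(0.5))
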