def gaussian_frobenius(policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor], q: Tuple[ch.Tensor, ch.Tensor],
                       scale_prec: bool = False, return_cov: bool = False) \
        -> Union[Tuple[ch.Tensor, ch.Tensor], Tuple[ch.Tensor, ch.Tensor, ch.Tensor, ch.Tensor]]:
    """
    Compute (mu_p - mu_q) (L_o L_o^T)^-1 (mu_p - mu_q)^T + |L L^T - L_o L_o^T|_F^2
    for p ~ N(mu_p, L L^T) and q ~ N(mu_q, L_o L_o^T).

    Args:
        policy: current policy
        p: mean and chol of gaussian p
        q: mean and chol of gaussian q
        return_cov: return cov matrices for further computations
        scale_prec: scale objective with precision matrix

    Returns:
        mahalanobis distance, squared frobenius norm
    """
    mean, chol = p
    mean_other, chol_other = q

    mean_part = mean_distance(policy, mean, mean_other, chol_other, scale_prec)

    # frob objective for cov
    cov_other = policy.covariance(chol_other)
    cov = policy.covariance(chol)
    diff = cov_other - cov
    # Matrix is real symmetric, therefore |A|^2_F = tr{A^T @ A} = tr{A @ A}
    cov_part = torch_batched_trace(diff @ diff)

    if return_cov:
        return mean_part, cov_part, cov, cov_other

    return mean_part, cov_part

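# `torch_batched_trace` is used throughout this module but not defined in this excerpt.
# A minimal sketch consistent with its usage here (trace over the last two dimensions of
# a batch of square matrices); the actual library helper may differ:
def torch_batched_trace(x: ch.Tensor) -> ch.Tensor:
    """Sum the diagonal of the last two dimensions, keeping all leading batch dimensions."""
    return ch.diagonal(x, dim1=-2, dim2=-1).sum(-1)
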
def gaussian_kl(policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor],
                q: Tuple[ch.Tensor, ch.Tensor]) -> Tuple[ch.Tensor, ch.Tensor]:
    """
    Get the expected KL divergence between two sets of Gaussians over states -
    calculates E KL(p||q): E[sum p(x) log(p(x)/q(x))] in closed form for Gaussians.

    Args:
        policy: policy instance
        p: first distribution tuple (mean, chol/std)
        q: second distribution tuple (mean, chol/std)

    Returns:
        mean part and covariance part of the KL divergence
    """
    mean, std = p
    mean_other, std_other = q
    k = mean.shape[-1]

    det_term = policy.log_determinant(std)
    det_term_other = policy.log_determinant(std_other)

    cov = policy.covariance(std)
    prec_other = policy.precision(std_other)

    maha_part = .5 * policy.maha(mean, mean_other, std_other)
    # trace_part = (var * precision_other).sum([-1, -2])
    trace_part = torch_batched_trace(prec_other @ cov)
    cov_part = .5 * (trace_part - k + det_term_other - det_term)

    return maha_part, cov_part

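# Illustration only (not part of the library): for diagonal Gaussians, the sum of the mean
# part and covariance part above should reproduce the analytic KL divergence. A small,
# self-contained check against torch.distributions under that assumption:
import torch as ch  # the surrounding module aliases torch as ch


def _diag_gauss_kl_parts(mean, std, mean_other, std_other):
    # std holds per-dimension standard deviations here, shape (batch, k)
    k = mean.shape[-1]
    maha_part = 0.5 * (((mean - mean_other) / std_other) ** 2).sum(-1)
    trace_part = (std ** 2 / std_other ** 2).sum(-1)
    det_part = 2 * (std_other.log().sum(-1) - std.log().sum(-1))
    return maha_part, 0.5 * (trace_part - k + det_part)


def _check_diag_gauss_kl(batch: int = 4, k: int = 3) -> None:
    mean, std = ch.randn(batch, k), ch.rand(batch, k) + 0.1
    mean_other, std_other = ch.randn(batch, k), ch.rand(batch, k) + 0.1
    m_part, c_part = _diag_gauss_kl_parts(mean, std, mean_other, std_other)
    p_dist = ch.distributions.Independent(ch.distributions.Normal(mean, std), 1)
    q_dist = ch.distributions.Independent(ch.distributions.Normal(mean_other, std_other), 1)
    assert ch.allclose(m_part + c_part, ch.distributions.kl_divergence(p_dist, q_dist), atol=1e-5)
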
def gaussian_wasserstein_non_commutative(policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor],
                                         q: Tuple[ch.Tensor, ch.Tensor], scale_prec=False, return_eig=False) -> \
        Union[Tuple[ch.Tensor, ch.Tensor], Tuple[ch.Tensor, ch.Tensor, ch.Tensor, ch.Tensor]]:
    """
    Compute mean part and cov part of W_2(p || q) with p,q ~ N(y, SS).
    This version DOES NOT assume commutativity of both distributions, i.e. of their covariance matrices.
    This is more general and does not make any assumptions.
    When scale_prec is true, scale both distributions with the old precision matrix.

    Args:
        policy: current policy
        p: mean and sqrt of gaussian p
        q: mean and sqrt of gaussian q
        scale_prec: scale objective by old precision matrix.
                    This penalizes directions based on old uncertainty/covariance.
        return_eig: return eigendecomposition for further computation

    Returns:
        mean part of W2, cov part of W2
    """
    mean, sqrt = p
    mean_other, sqrt_other = q
    batch_dim, dim = mean.shape

    mean_part = mean_distance(policy, mean, mean_other, sqrt_other, scale_prec)

    cov = policy.covariance(sqrt)

    if scale_prec:
        # cov constraint scaled with precision of old dist
        # W2 objective for cov assuming normal W2 objective for mean
        identity = ch.eye(dim, dtype=sqrt.dtype, device=sqrt.device)
        sqrt_inv_other = ch.solve(identity, sqrt_other)[0]
        c = sqrt_inv_other @ cov @ sqrt_inv_other

        # compute inner parenthesis of trace in W2,
        # only consider lower triangular parts, given cov/sqrt(cov) is symmetric PSD.
        eigvals, eigvecs = ch.symeig(c, eigenvectors=return_eig, upper=False)
        # make use of the following property to compute the trace of the root:
        # A^2 x = A(A x) = A lambda x = lambda (A x) = lambda^2 x
        cov_part = torch_batched_trace(identity + c) - 2 * eigvals.sqrt().sum(1)

    else:
        # W2 objective for cov assuming normal W2 objective for mean
        cov_other = policy.covariance(sqrt_other)

        # compute inner parenthesis of trace in W2,
        # only consider lower triangular parts, given cov/sqrt(cov) is symmetric PSD.
        eigvals, eigvecs = ch.symeig(cov @ cov_other, eigenvectors=return_eig, upper=False)
        # make use of the following property to compute the trace of the root:
        # A^2 x = A(A x) = A lambda x = lambda (A x) = lambda^2 x
        cov_part = torch_batched_trace(cov_other + cov) - 2 * eigvals.sqrt().sum(1)

    if return_eig:
        return mean_part, cov_part, eigvals, eigvecs

    return mean_part, cov_part

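# Illustration only: the W2 covariance term needs tr((Sigma_q^{1/2} Sigma_p Sigma_q^{1/2})^{1/2}),
# which equals the sum of the square roots of the eigenvalues of Sigma_p @ Sigma_q, because the two
# matrices are similar and share their spectrum. A small numerical check of that identity,
# assuming scipy is available as an independent reference for the matrix square root:
import numpy as np
from scipy.linalg import sqrtm


def _check_w2_trace_identity(dim: int = 4, seed: int = 0) -> None:
    rng = np.random.default_rng(seed)
    a, b = rng.standard_normal((dim, dim)), rng.standard_normal((dim, dim))
    cov_p, cov_q = a @ a.T + np.eye(dim), b @ b.T + np.eye(dim)
    # sum of sqrt of eigenvalues of the (generally non-symmetric) product
    via_eigvals = np.sqrt(np.linalg.eigvals(cov_p @ cov_q).real).sum()
    # trace of the symmetric matrix square root form
    via_sqrtm = np.trace(sqrtm(sqrtm(cov_q) @ cov_p @ sqrtm(cov_q))).real
    assert np.isclose(via_eigvals, via_sqrtm)
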
def constraint_values(proj_type, policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor],
                      q: Tuple[ch.Tensor, ch.Tensor], scale_prec: bool = True):
    """
    Computes the relevant metrics for a given batch of predictions.

    Args:
        proj_type: type of projection to compute the metrics for
        policy: current policy
        p: mean and std of gaussian p
        q: mean and std of gaussian q
        scale_prec: for W2 projection, use version scaled with precision matrix

    Returns:
        entropy, mean_part, cov_part, kl
    """
    if proj_type == "w2":
        mean_part, cov_part = gaussian_wasserstein_commutative(policy, p, q, scale_prec=scale_prec)
    elif proj_type == "w2_non_com":
        # For this case only the sum is relevant, no individual projections for mean and std make sense
        mean_part, cov_part = gaussian_wasserstein_non_commutative(policy, p, q, scale_prec=scale_prec)
    elif proj_type == "frob":
        mean_part, cov_part = gaussian_frobenius(policy, p, q, scale_prec=scale_prec)
    else:
        # we assume kl projection as default (this is also true for PPO)
        mean_part, cov_part = gaussian_kl(policy, p, q)

    entropy = policy.entropy(p)
    mean_kl, cov_kl = gaussian_kl(policy, p, q)
    kl = mean_kl + cov_kl

    return entropy, mean_part, cov_part, kl

def entropy_inequality_projection(policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor],
                                  beta: Union[float, ch.Tensor]):
    """
    Projects std to satisfy an entropy INEQUALITY constraint.

    Args:
        policy: policy instance
        p: current distribution
        beta: target entropy for EACH std or general bound for all stds

    Returns:
        projected std that satisfies the entropy bound
    """
    mean, std = p
    k = std.shape[-1]
    batch_shape = std.shape[:-2]

    ent = policy.entropy(p)
    mask = ent < beta

    # if nothing has to be projected skip computation
    if (~mask).all():
        return p

    alpha = ch.ones(batch_shape, dtype=std.dtype, device=std.device)
    alpha[mask] = ch.exp((beta[mask] - ent[mask]) / k)

    proj_std = ch.einsum('ijk,i->ijk', std, alpha)
    return mean, ch.where(mask[..., None, None], proj_std, std)

def evaluate_policy(self, policy: AbstractGaussianPolicy, render: bool = False, deterministic: bool = True):
    """
    Evaluate a given policy.

    Args:
        policy: policy to evaluate
        render: render policy behavior
        deterministic: whether to choose deterministic actions

    Returns:
        Dict with performance metrics.
    """
    if self.n_test_envs == 0:
        return

    n_runs = 1
    ep_rewards = np.zeros((n_runs, self.n_test_envs))
    ep_lengths = np.zeros((n_runs, self.n_test_envs))

    for i in range(n_runs):
        not_dones = np.ones((self.n_test_envs,), bool)
        obs = self.envs.reset_test()

        while np.any(not_dones):
            ep_lengths[i, not_dones] += 1
            if render:
                self.envs.render_test(mode="human")
            with ch.no_grad():
                p = policy(tensorize(obs, self.cpu, self.dtype))
                actions = p[0] if deterministic else policy.sample(p)
                actions = policy.squash(actions)
            obs, rews, dones, infos = self.envs.step_test(get_numpy(actions))
            ep_rewards[i, not_dones] += rews[not_dones]
            # only set to False when env has never terminated before, otherwise we favor earlier terminating envs.
            not_dones = np.logical_and(~dones, not_dones)

    return self.get_reward_dict(ep_rewards, ep_lengths)

def _trust_region_projection(self, policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor],
                             q: Tuple[ch.Tensor, ch.Tensor], eps: ch.Tensor, eps_cov: ch.Tensor, **kwargs):
    """
    Runs KL projection layer and constructs cholesky of covariance.

    Args:
        policy: policy instance
        p: current distribution
        q: old distribution
        eps: (modified) kl bound/kl bound for mean part
        eps_cov: (modified) kl bound for cov part
        **kwargs:

    Returns:
        projected mean, projected cov cholesky
    """
    mean, std = p
    old_mean, old_std = q

    if not policy.contextual_std:
        # only project first one to reduce number of numerical optimizations
        std = std[:1]
        old_std = old_std[:1]

    ################################################################################################################
    # project mean with closed form
    mean_part, _ = gaussian_kl(policy, p, q)
    proj_mean = mean_projection(mean, old_mean, mean_part, eps)

    cov = policy.covariance(std)
    old_cov = policy.covariance(old_std)

    if policy.is_diag:
        proj_cov = KLProjectionGradFunctionDiagCovOnly.apply(cov.diagonal(dim1=-2, dim2=-1),
                                                             old_cov.diagonal(dim1=-2, dim2=-1),
                                                             eps_cov)
        proj_std = proj_cov.sqrt().diag_embed()
    else:
        raise NotImplementedError("The KL projection currently does not support full covariance matrices.")

    if not policy.contextual_std:
        # expand first std back to batch size
        proj_std = proj_std.expand(mean.shape[0], -1, -1)

    return proj_mean, proj_std

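# `mean_projection` is referenced above but not defined in this excerpt. A plausible sketch of the
# closed-form mean projection, assuming the mean is interpolated towards the old mean whenever its
# Mahalanobis term exceeds the bound eps, so the projected mean lies exactly on the trust-region
# boundary; the actual library routine may differ in details (e.g. numerical guards):
def mean_projection(mean: ch.Tensor, old_mean: ch.Tensor, maha: ch.Tensor, eps: ch.Tensor) -> ch.Tensor:
    mask = maha > eps
    if mask.any():
        # omega chosen such that maha(projected mean, old mean) == eps
        omega = ch.ones(maha.shape, dtype=mean.dtype, device=mean.device)
        omega[mask] = ch.sqrt(maha[mask] / eps) - 1.
        proj = (mean + omega[..., None] * old_mean) / (1. + omega[..., None])
        mean = ch.where(mask[..., None], proj, mean)
    return mean
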
def gaussian_wasserstein_commutative(policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor],
                                     q: Tuple[ch.Tensor, ch.Tensor], scale_prec=False) -> Tuple[ch.Tensor, ch.Tensor]:
    """
    Compute mean part and cov part of W_2(p || q) with p,q ~ N(y, SS).
    This version DOES assume commutativity of both distributions, i.e. of their covariance matrices.
    This is less general and assumes both distributions are somewhat close together.
    When scale_prec is true, scale both distributions with the old precision matrix.

    Args:
        policy: current policy
        p: mean and sqrt of gaussian p
        q: mean and sqrt of gaussian q
        scale_prec: scale objective by old precision matrix.
                    This penalizes directions based on old uncertainty/covariance.

    Returns:
        mean part of W2, cov part of W2
    """
    mean, sqrt = p
    mean_other, sqrt_other = q

    mean_part = mean_distance(policy, mean, mean_other, sqrt_other, scale_prec)

    cov = policy.covariance(sqrt)

    if scale_prec:
        # cov constraint scaled with precision of old dist
        batch_dim, dim = mean.shape
        identity = ch.eye(dim, dtype=sqrt.dtype, device=sqrt.device)
        sqrt_inv_other = ch.solve(identity, sqrt_other)[0]
        c = sqrt_inv_other @ cov @ sqrt_inv_other

        cov_part = torch_batched_trace(identity + c - 2 * sqrt_inv_other @ sqrt)
    else:
        # W2 objective for cov assuming normal W2 objective for mean
        cov_other = policy.covariance(sqrt_other)
        cov_part = torch_batched_trace(cov_other + cov - 2 * sqrt_other @ sqrt)

    return mean_part, cov_part

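# `mean_distance` is used by the Frobenius and Wasserstein objectives above but not defined in this
# excerpt. Judging from its call signature, a sketch could be: the Mahalanobis distance w.r.t. the
# old covariance when scale_prec is set, and the squared Euclidean distance otherwise. The actual
# helper may differ, e.g. in scaling factors:
def mean_distance(policy: AbstractGaussianPolicy, mean: ch.Tensor, mean_other: ch.Tensor,
                  std_other: ch.Tensor, scale_prec: bool = False) -> ch.Tensor:
    if scale_prec:
        # mean part scaled with the old precision matrix
        return policy.maha(mean, mean_other, std_other)
    # plain squared euclidean distance for the mean
    return ((mean_other - mean) ** 2).sum(-1)
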
def entropy_equality_projection(policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor],
                                beta: Union[float, ch.Tensor]):
    """
    Projects std to satisfy an entropy EQUALITY constraint.

    Args:
        policy: policy instance
        p: current distribution
        beta: target entropy for EACH std or general bound for all stds

    Returns:
        projected std that satisfies the entropy bound
    """
    mean, std = p
    k = std.shape[-1]

    ent = policy.entropy(p)
    alpha = ch.exp((beta - ent) / k)

    proj_std = ch.einsum('ijk,i->ijk', std, alpha)
    return mean, proj_std

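# Why the scaling factor works (illustration only, not part of the library): rescaling the Cholesky
# factor by alpha = exp((beta - H) / k) shifts a k-dimensional Gaussian's entropy by k * log(alpha)
# = beta - H, i.e. exactly onto the target entropy beta. A self-contained check with torch.distributions:
import torch as ch


def _check_entropy_scaling(k: int = 3, beta: float = 2.5, seed: int = 0) -> None:
    ch.manual_seed(seed)
    a = ch.randn(k, k)
    chol = ch.linalg.cholesky(a @ a.T + ch.eye(k))
    dist = ch.distributions.MultivariateNormal(ch.zeros(k), scale_tril=chol)
    alpha = ch.exp((beta - dist.entropy()) / k)
    proj = ch.distributions.MultivariateNormal(ch.zeros(k), scale_tril=alpha * chol)
    assert ch.allclose(proj.entropy(), ch.tensor(beta), atol=1e-5)
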
def run(self, rollout_steps, policy: AbstractGaussianPolicy, vf_model: Union[VFNet, None] = None,
        reset_envs: bool = False) -> TrajectoryOnPolicyRaw:
    """
    Generate trajectories of the environment.

    Args:
        rollout_steps: Number of steps to generate
        policy: Policy model to generate samples for
        vf_model: vf model to generate value estimate for all states.
        reset_envs: Whether to reset all envs in the beginning.

    Returns:
        Trajectory with the respective data as torch tensors.
    """
    # Here, we init the buffers that will contain the minibatch of experiences
    num_envs = self.n_envs

    base_shape = (rollout_steps, num_envs)
    base_shape_p1 = (rollout_steps + 1, num_envs)
    base_action_shape = base_shape + self.envs.action_space.shape

    mb_obs = ch.zeros(base_shape_p1 + self.envs.observation_space.shape, dtype=self.dtype)
    mb_actions = ch.zeros(base_action_shape, dtype=self.dtype)
    mb_rewards = ch.zeros(base_shape, dtype=self.dtype)
    mb_dones = ch.zeros(base_shape, dtype=ch.bool)

    ep_infos = []
    mb_time_limit_dones = ch.zeros(base_shape, dtype=ch.bool)

    mb_means = ch.zeros(base_action_shape, dtype=self.dtype)
    mb_stds = ch.zeros(base_action_shape + self.envs.action_space.shape, dtype=self.dtype)

    # continue from last state
    # Before first step we already have self.obs because env calls self.obs = env.reset() on init
    obs = self.envs.reset() if reset_envs else self.envs.last_obs
    obs = tensorize(obs, self.cpu, self.dtype)

    # For n in range number of steps
    for i in range(rollout_steps):
        # Given observations, get actions from the policy
        pds = policy(obs, train=False)
        actions = policy.sample(pds)
        squashed_actions = policy.squash(actions)

        mb_obs[i] = obs
        mb_actions[i] = squashed_actions

        obs, rewards, dones, infos = self.envs.step(squashed_actions.cpu().numpy())
        obs = tensorize(obs, self.cpu, self.dtype)

        mb_means[i] = pds[0]
        mb_stds[i] = pds[1]

        mb_time_limit_dones[i] = tensorize(infos["horizon"], self.cpu, ch.bool)

        if infos.get("done"):
            ep_infos.extend(infos.get("done"))

        mb_rewards[i] = tensorize(rewards, self.cpu, self.dtype)
        mb_dones[i] = tensorize(dones, self.cpu, ch.bool)

    # need value prediction for last obs in rollout to estimate loss
    mb_obs[-1] = obs

    # compute all logpacs and value estimates at once --> less computation
    mb_logpacs = policy.log_probability((mb_means, mb_stds), mb_actions)
    mb_values = (vf_model if vf_model else policy.get_value)(mb_obs, train=False)

    out = (mb_obs[:-1], mb_actions, mb_logpacs, mb_rewards, mb_values, mb_dones, mb_time_limit_dones,
           mb_means, mb_stds)

    if not self.cpu:
        out = tuple(map(to_gpu, out))

    if ep_infos:
        ep_infos = np.array(ep_infos)
        ep_length, ep_reward = ep_infos[:, 0], ep_infos[:, 1]
        self.total_rewards.extend(ep_reward)
        self.total_steps.extend(ep_length)

    return TrajectoryOnPolicyRaw(*out)

def _trust_region_projection(self, policy: AbstractGaussianPolicy, p: Tuple[ch.Tensor, ch.Tensor],
                             q: Tuple[ch.Tensor, ch.Tensor], eps: Union[ch.Tensor, float],
                             eps_cov: Union[ch.Tensor, float], **kwargs):
    """
    Runs PAPI projection layer and constructs sqrt of covariance.

    Args:
        policy: policy instance
        p: current distribution
        q: old distribution
        eps: (modified) kl bound/kl bound for mean part
        eps_cov: (modified) kl bound for cov part
        **kwargs:

    Returns:
        mean interpolation factor eta, projected cov cholesky
    """
    mean, chol = p
    old_mean, old_chol = q
    intermed_mean = kwargs.get('intermed_mean')

    dtype = mean.dtype
    device = mean.device

    dim = mean.shape[-1]

    ################################################################################################################
    # Precompute basic matrices

    # Joint bound
    eps += eps_cov

    I = ch.eye(dim, dtype=dtype, device=device)
    old_precision = ch.cholesky_solve(I, old_chol)[0]
    logdet_old = policy.log_determinant(old_chol)
    cov = policy.covariance(chol)

    ################################################################################################################
    # compute expected KL
    maha_part, cov_part = gaussian_kl(policy, p, q)
    maha_part = maha_part.mean()
    cov_part = cov_part.mean()

    if intermed_mean is not None:
        maha_intermediate = 0.5 * policy.maha(intermed_mean, old_mean, old_chol).mean()
        mm = ch.min(maha_part, maha_intermediate)

    ################################################################################################################
    # matrix rotation/rescaling projection
    if maha_part + cov_part > eps + 1e-6:
        old_cov = policy.covariance(old_chol)

        maha_delta = eps if intermed_mean is None else (eps - mm)
        eta_rot = maha_delta / ch.max(maha_part + cov_part, ch.tensor(1e-16, dtype=dtype, device=device))
        new_cov = (1 - eta_rot) * old_cov + eta_rot * cov
        proj_chol = ch.cholesky(new_cov)

        # recompute covariance part of KL for new chol
        trace_term = 0.5 * (torch_batched_trace(old_precision @ new_cov) - dim).mean()  # rotation difference
        entropy_diff = 0.5 * (logdet_old - policy.log_determinant(proj_chol)).mean()

        cov_part = trace_term + entropy_diff

    else:
        proj_chol = chol

    ################################################################################################################
    # mean interpolation projection
    if maha_part + cov_part > eps + 1e-6:

        if intermed_mean is not None:
            a = 0.5 * policy.maha(mean, intermed_mean, old_chol).mean()
            b = 0.5 * ((mean - intermed_mean) @ old_precision @ (intermed_mean - old_mean).T).mean()
            c = maha_intermediate - ch.max(eps - cov_part, ch.tensor(0., dtype=dtype, device=device))
            eta_mean = (-b + ch.sqrt(ch.max(b * b - a * c, ch.tensor(1e-16, dtype=dtype, device=device)))) / \
                       ch.max(a, ch.tensor(1e-16, dtype=dtype, device=device))
        else:
            eta_mean = ch.sqrt(ch.max(eps - cov_part, ch.tensor(1e-16, dtype=dtype, device=device)) /
                               ch.max(maha_part, ch.tensor(1e-16, dtype=dtype, device=device)))
    else:
        eta_mean = ch.tensor(1., dtype=dtype, device=device)

    return eta_mean, proj_chol

def _papi_steps(self, policy: AbstractGaussianPolicy, q: Tuple[ch.Tensor, ch.Tensor], obs: ch.Tensor,
                lr_schedule, lr_schedule_vf=None):
    """
    Take PAPI steps after PPO finished its steps. Policy parameters are updated in-place.

    Args:
        policy: policy instance
        q: old distribution
        obs: collected observations from trajectories
        lr_schedule: lr schedule for policy
        lr_schedule_vf: lr schedule for vf

    Returns:

    """
    assert not policy.contextual_std

    # save latest policy in history
    self.last_policies.append(copy.deepcopy(policy))

    ################################################################################################################
    # policy backtracking: out of the last n policies and the current one, find one that satisfies the kl constraint

    intermed_policy = None
    n_backtracks = 0

    for i, pi in enumerate(reversed(self.last_policies)):
        p_prime = pi(obs)
        mean_part, cov_part = pi.kl_divergence(p_prime, q)
        if (mean_part + cov_part).mean() <= self.mean_bound + self.cov_bound:
            intermed_policy = pi
            n_backtracks = i
            break

    ################################################################################################################
    # LR update

    # reduce learning rate when appropriate policy not within the last 4 epochs
    if n_backtracks >= 4 or intermed_policy is None:
        # Linear learning rate annealing
        lr_schedule.step()
        if lr_schedule_vf:
            lr_schedule_vf.step()

    if intermed_policy is None:
        # pop last policy and make it the current one, as the updated one was poor
        # do not keep last policy in history, otherwise we could stack the same policy multiple times.
        if len(self.last_policies) >= 1:
            policy.load_state_dict(self.last_policies.pop().state_dict())
        logger.warning(f"No suitable policy found in backtracking of {len(self.last_policies)} policies.")
        return

    ################################################################################################################
    # PAPI iterations

    # We assume only non contextual covariances here, therefore we only need to project for one
    q = (q[0], q[1][:1])  # (means, covs[:1])

    # This is A from Alg. 2 [Akrour et al., 2019]
    intermed_weight = intermed_policy.get_last_layer().detach().clone()
    # This is A @ phi(s)
    intermed_mean = p_prime[0].detach().clone()

    entropy = policy.entropy(q)
    entropy_bound = obs.new_tensor([-np.inf]) if entropy / self.initial_entropy > 0.5 \
        else entropy - (self.mean_bound + self.cov_bound)

    for _ in range(20):
        eta, proj_chol = self._projection(intermed_policy, (p_prime[0], p_prime[1][:1]), q,
                                          self.mean_bound, self.cov_bound, entropy_bound,
                                          intermed_mean=intermed_mean)
        intermed_policy.papi_weight_update(eta, intermed_weight)
        intermed_policy.set_std(proj_chol[0])
        p_prime = intermed_policy(obs)

    policy.load_state_dict(intermed_policy.state_dict())

def trust_region_regression(self, policy: AbstractGaussianPolicy, obs: ch.Tensor, q: Tuple[ch.Tensor, ch.Tensor],
                            n_minibatches: int, global_steps: int):
    """
    Take additional regression steps to match projection output and policy output.
    The policy parameters are updated in-place.

    Args:
        policy: policy instance
        obs: collected observations from trajectories
        q: old distributions
        n_minibatches: split the rollouts into n_minibatches.
        global_steps: current number of steps, required for projection

    Returns:
        dict with mean of regression loss
    """
    if not self.do_regression:
        return {}

    policy_unprojected = copy.deepcopy(policy)
    optim_reg = get_optimizer(self.optimizer_type_reg, policy_unprojected.parameters(), learning_rate=self.lr_reg)
    optim_reg.reset()

    reg_losses = obs.new_tensor(0.)

    # get current projected values --> targets for regression
    p_flat = policy(obs)
    p_target = self(policy, p_flat, q, global_steps)

    for _ in range(self.regression_iters):
        batch_indices = generate_minibatches(obs.shape[0], n_minibatches)

        # Minibatch SGD
        for indices in batch_indices:
            batch = select_batch(indices, obs, p_target[0], p_target[1])
            b_obs, b_target_mean, b_target_std = batch
            proj_p = (b_target_mean.detach(), b_target_std.detach())

            p = policy_unprojected(b_obs)

            # invert scaling with coeff here as we do not have to balance with other losses
            loss = self.get_trust_region_loss(policy, p, proj_p) / self.trust_region_coeff

            optim_reg.zero_grad()
            loss.backward()
            optim_reg.step()
            reg_losses += loss.detach()

    policy.load_state_dict(policy_unprojected.state_dict())

    if not policy.contextual_std:
        # set policy with projection value.
        # In non-contextual cases we have only one cov, so the projection is the same.
        policy.set_std(p_target[1][0])

    steps = self.regression_iters * (math.ceil(obs.shape[0] / n_minibatches))
    return {"regression_loss": (reg_losses / steps).detach()}

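# `generate_minibatches` and `select_batch` are referenced above but not defined in this excerpt.
# Minimal sketches consistent with how they are called (shuffle all sample indices, split them into
# n_minibatches chunks, then index every passed tensor with a chunk); the actual utilities may differ.
import numpy as np


def generate_minibatches(n: int, n_minibatches: int):
    state_indices = np.arange(n)
    np.random.shuffle(state_indices)
    return np.array_split(state_indices, n_minibatches)


def select_batch(indices, *args):
    return [arg[indices] for arg in args]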