class PoWER(ParameterExploring):
    """
    Return-based variant of Policy learning by Weighting Exploration with the Returns (PoWER)

    .. note::
        PoWER was designed for linear policies.
        PoWER must use positive reward functions (improper probability distribution) [1, p.10].
        The original implementation is tailored to movement primitives like DMPs.

    .. seealso::
        [1] J. Kober and J. Peters, "Policy Search for Motor Primitives in Robotics", Machine Learning, 2011
    """

    name: str = 'power'

    def __init__(self, save_dir: str, env: Env, policy: Policy, max_iter: int, pop_size: Optional[int],
                 num_rollouts: int, num_is_samples: int, expl_std_init: float, expl_std_min: float = 0.01,
                 symm_sampling: bool = False, num_workers: int = 4, logger: Optional[StepLogger] = None):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(policy, LinearPolicy):
            print_cbt_once('PoWER was designed for linear policies.', 'y')

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Initialize memory for importance sampling
        self.num_is_samples = min(pop_size, num_is_samples)
        self.is_mem_ret = 1e-6 * to.ones(self.num_is_samples)  # has to be initialized > 0 due to first covariance update
        self.is_mem_params = to.zeros(self.num_is_samples, self._policy.num_param)
        self.is_mem_W = to.zeros(self.num_is_samples, self._policy.num_param, self._policy.num_param)

    def reset(self, seed: int = None):
        # Reset the exploration strategy, internal variables and the random seeds
        super().reset(seed)

        # Reset memory for importance sampling
        self.is_mem_ret = 1e-6 * to.ones(self.num_is_samples)  # has to be initialized > 0 due to first covariance update
        self.is_mem_params = to.zeros(self.num_is_samples, self._policy.num_param)
        self.is_mem_W = to.zeros(self.num_is_samples, self._policy.num_param, self._policy.num_param)

    @to.no_grad()
    def update(self, param_results: ParameterSamplingResult, ret_avg_curr: float = None):
        # Average the return values over the rollouts
        rets_avg_ros = to.tensor(param_results.mean_returns)

        if any(rets_avg_ros < 0):
            rets_avg_ros[rets_avg_ros < 0] = 1e-3
            print_cbt('PoWER must use positive reward functions (improper probability distribution)!', 'r')

        # We do the simplification from the original implementation, which is only valid for the return-based variant
        W = to.inverse(self._expl_strat.noise.cov)

        # For importance sampling we select the best rollouts
        self.is_mem_ret = to.cat([self.is_mem_ret, rets_avg_ros], dim=0)
        self.is_mem_params = to.cat([self.is_mem_params, param_results.parameters], dim=0)
        self.is_mem_W = to.cat([self.is_mem_W, W.repeat(self.pop_size + 1, 1, 1)], dim=0)  # same cov for all rollouts

        # Descending sort according to return values
        idcs_dcs = to.argsort(self.is_mem_ret, descending=True)
        self.is_mem_ret = self.is_mem_ret[idcs_dcs]
        self.is_mem_params = self.is_mem_params[idcs_dcs, :]
        self.is_mem_W = self.is_mem_W[idcs_dcs, :, :]

        # Update the exploration covariance (see [1, p.32]). We use all rollouts to avoid rapid convergence to 0.
        eps = self.is_mem_params - self._policy.param_values  # policy parameter perturbations
        cov_num = to.einsum('nj,nk,n->jk', eps, eps, self.is_mem_ret)  # weighted outer product
        cov_dnom = sum(self.is_mem_ret)
        self._expl_strat.adapt(cov=cov_num / (cov_dnom + 1e-8))

        # Only memorize the best parameter sets & returns (importance sampling)
        self.is_mem_ret = self.is_mem_ret[:self.num_is_samples]
        self.is_mem_params = self.is_mem_params[:self.num_is_samples, :]
        self.is_mem_W = self.is_mem_W[:self.num_is_samples, :, :]

        # Update the policy mean (see [1, p.10])
        eps = eps[:self.num_is_samples, :]
        mean_num = to.einsum('njk,nj,n->k', self.is_mem_W, eps, self.is_mem_ret)  # weighted dot product
        mean_dnom = to.einsum('njk,n->jk', self.is_mem_W, self.is_mem_ret)  # weighted sum
        inv_dnom = to.inverse(mean_dnom + 1e-8)
        self._policy.param_values += to.matmul(inv_dnom, mean_num)

        # Logging
        self.logger.add_value('min expl strat std', to.min(self._expl_strat.std), 4)
        self.logger.add_value('avg expl strat std', to.mean(self._expl_strat.std), 4)
        self.logger.add_value('max expl strat std', to.max(self._expl_strat.std), 4)
        self.logger.add_value('expl strat entropy', self._expl_strat.get_entropy(), 4)
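
# Illustrative sketch, not part of the library: a minimal, self-contained example of PoWER's
# importance-weighted mean update from [1, p.10] under the simplification used above (a single
# precision matrix W shared by all samples). All names and dimensions below are hypothetical and
# only demonstrate the einsum-based bookkeeping of the update() method.
def _example_power_mean_update():
    import torch as to

    num_samples, num_param = 5, 3
    W = to.eye(num_param).repeat(num_samples, 1, 1)  # shared precision matrix, one copy per sample
    eps = to.randn(num_samples, num_param)  # perturbations of the policy parameters
    rets = to.rand(num_samples) + 1e-6  # PoWER requires strictly positive returns

    mean_num = to.einsum('njk,nj,n->k', W, eps, rets)  # return-weighted, precision-scaled perturbations
    mean_dnom = to.einsum('njk,n->jk', W, rets)  # return-weighted sum of the precision matrices
    delta_mean = to.matmul(to.inverse(mean_dnom + 1e-8), mean_num)
    return delta_mean  # this increment would be added to the current policy parameters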

class PEPG(ParameterExploring):
    """
    Parameter-Exploring Policy Gradients (PEPG)

    .. seealso::
        [1] F. Sehnke, C. Osendorfer, T. Rueckstiess, A. Graves, J. Peters, J. Schmidhuber,
            "Parameter-exploring Policy Gradients", Neural Networks, 2010
    """

    name: str = 'pepg'

    def __init__(self, save_dir: str, env: Env, policy: Policy, max_iter: int, num_rollouts: int,
                 expl_std_init: float, expl_std_min: float = 0.01, pop_size: Optional[int] = None,
                 clip_ratio_std: float = 0.05, normalize_update: bool = False, transform_returns: bool = True,
                 lr: float = 5e-4, num_workers: int = 4, logger: Optional[StepLogger] = None):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param clip_ratio_std: maximal ratio for the change of the exploration strategy's standard deviation
        :param normalize_update: use the normalized update rule from equation (15) in [1]
        :param transform_returns: use a rank-transformation of the returns to update the policy
        :param lr: learning rate
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        # Store the inputs
        self.clip_ratio_std = clip_ratio_std
        self.normalize_update = normalize_update
        self.transform_returns = transform_returns
        self.lr = lr

        # Exploration strategy based on symmetrical normally distributed noise
        if self.pop_size % 2 != 0:
            # Symmetric buffer needs to have an even number of samples
            self.pop_size += 1
        self._expl_strat = SymmParamExplStrat(
            NormalParamNoise(
                self._policy.num_param,
                std_init=expl_std_init,
                std_min=expl_std_min,
            )
        )

        self.optim = to.optim.SGD([{'params': self._policy.parameters()}], lr=lr, momentum=0.8, dampening=0.1)

    @to.no_grad()
    def update(self, param_results: ParameterSamplingResult, ret_avg_curr: float = None):
        # Average the return values over the rollouts
        rets_avg_ros = param_results[1:].mean_returns

        # Rank policy parameters by return (a.k.a. fitness)
        rets = rank_transform(rets_avg_ros) if self.transform_returns else rets_avg_ros

        # Move to PyTorch
        rets = to.from_numpy(rets).to(to.get_default_dtype())
        rets_max = to.max(rets)
        rets_avg_symm = (rets[:len(param_results) // 2] + rets[len(param_results) // 2:]) / 2.
        baseline = to.mean(rets)  # zero if centered

        # Compute finite differences for the average return of each solution
        rets_fds = rets[:len(param_results) // 2] - rets[len(param_results) // 2:]

        # Get the perturbations (select the first half since they are symmetric)
        epsilon = param_results.parameters[:len(param_results) // 2, :] - self._policy.param_values

        if self.normalize_update:
            # See equation (15, top) in [1]
            delta_mean = (rets_fds / (2 * rets_max - rets_fds + 1e-6)) @ epsilon  # epsilon = T from [1]
        else:
            # See equation (13) in [1]
            delta_mean = 0.5 * rets_fds @ epsilon  # epsilon = T from [1]

        # Update the mean
        self.optim.zero_grad()
        self._policy.param_grad = -delta_mean  # PyTorch optimizers are minimizers
        self.optim.step()
        # Old version without PyTorch optimizer: self._expl_strat.policy.param_values += delta_mean * self.lr

        # Update the std
        S = (epsilon**2 - self._expl_strat.std**2) / self._expl_strat.std
        if self.normalize_update:
            # See equation (15, bottom) in [1]
            delta_std = (rets_avg_symm - baseline) @ S
        else:
            # See equation (14) in [1]
            delta_std = ((rets_avg_symm - baseline) / (rets_max - baseline + 1e-6)) @ S

        # Bound the change on the exploration standard deviation (i.e. the entropy)
        delta_std *= self.lr
        delta_std = clamp_symm(delta_std, self.clip_ratio_std * self._expl_strat.std)
        new_std = self._expl_strat.std + delta_std

        self._expl_strat.adapt(std=new_std)

        # Logging
        self.logger.add_value('policy param', self._policy.param_values, 4)
        self.logger.add_value('delta policy param', delta_mean * self.lr, 4)
        self.logger.add_value('expl strat std', self._expl_strat.std, 4)
        self.logger.add_value('expl strat entropy', self._expl_strat.get_entropy(), 4)
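
# Illustrative sketch, not part of the library: PEPG's symmetric finite-difference update from
# equations (13) and (14) in [1] on toy data. The population consists of mirrored pairs
# (theta + epsilon and theta - epsilon), so the difference of the two return halves is a
# finite-difference estimate of the gradient w.r.t. the mean. All names and numbers are hypothetical.
def _example_pepg_update(lr: float = 5e-4):
    import torch as to

    half, num_param = 4, 3
    std = to.full((num_param,), 0.5)  # current exploration standard deviation
    epsilon = to.randn(half, num_param) * std  # perturbations of the positive half
    rets_pos = to.rand(half)  # returns of theta + epsilon
    rets_neg = to.rand(half)  # returns of theta - epsilon
    rets_max = to.max(to.cat([rets_pos, rets_neg]))
    baseline = to.mean(to.cat([rets_pos, rets_neg]))

    rets_fds = rets_pos - rets_neg  # finite differences of the returns
    delta_mean = 0.5 * rets_fds @ epsilon  # eq. (13)

    S = (epsilon**2 - std**2) / std
    rets_avg_symm = (rets_pos + rets_neg) / 2.
    delta_std = ((rets_avg_symm - baseline) / (rets_max - baseline + 1e-6)) @ S  # eq. (14)
    return lr * delta_mean, lr * delta_std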

class CEM(ParameterExploring):
    r"""
    Cross-Entropy Method (CEM)

    This implementation is basically Algorithm 3.3 in [1] with the addition of decreasing noise [2].
    CEM is closely related to PoWER. The most significant differences are that the importance samples are not kept
    over iterations and that the covariance matrix is not scaled with the returns, thus allowing for negative returns.

    .. seealso::
        [1] P.T. de Boer, D.P. Kroese, S. Mannor, R.Y. Rubinstein, "A Tutorial on the Cross-Entropy Method",
            Annals of Operations Research, 2005
        [2] I. Szita, A. Lőrincz, "Learning Tetris Using the Noisy Cross-Entropy Method", Neural Computation, 2006
    """

    name: str = 'cem'

    def __init__(self, save_dir: str, env: Env, policy: Policy, max_iter: int, pop_size: Optional[int],
                 num_rollouts: int, num_is_samples: int, expl_std_init: float, expl_std_min: float = 0.01,
                 extra_expl_std_init: float = 0., extra_expl_decay_iter: int = 10, full_cov: bool = False,
                 symm_sampling: bool = False, num_workers: int = 4, logger: Optional[StepLogger] = None):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling,
                               indirectly specifies the performance quantile $1 - \rho$ [1]
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param extra_expl_std_init: additional standard deviation for the parameter exploration added to the diagonal
                                    entries of the covariance matrix, set to 0 to disable this functionality
        :param extra_expl_decay_iter: limit for the linear decay of the additional standard deviation, i.e. last
                                      iteration in which the additional exploration noise is applied
        :param full_cov: pass `True` to compute a full covariance matrix for sampling the next policy parameter
                         values, else a diagonal covariance is used
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not extra_expl_std_init >= 0:
            raise pyrado.ValueErr(given=extra_expl_std_init, ge_constraint='0')
        if not extra_expl_decay_iter > 0:
            raise pyrado.ValueErr(given=extra_expl_decay_iter, g_constraint='0')

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        if not num_is_samples <= pop_size:
            raise pyrado.ValueErr(given=num_is_samples, le_constraint=pop_size)
        self.num_is_samples = int(num_is_samples)

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=full_cov,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Optionally add additional entropy
        self.extra_expl_decay_iter = extra_expl_decay_iter
        if isinstance(self._expl_strat.noise, DiagNormalNoise):
            self.extra_expl_std_init = to.ones_like(self._policy.param_values) * extra_expl_std_init
        elif isinstance(self._expl_strat.noise, FullNormalNoise):
            self.extra_expl_std_init = to.eye(self._policy.num_param) * extra_expl_std_init
        else:
            raise pyrado.TypeErr(msg='Additional exploration entropy is only implemented for Gaussian distributions, '
                                     'i.e. DiagNormalNoise and FullNormalNoise')

    @to.no_grad()
    def update(self, param_results: ParameterSamplingResult, ret_avg_curr: float = None):
        # Average the return values over the rollouts
        rets_avg_ros = to.tensor(param_results.mean_returns)

        # Descending sort according to return values and select the importance samples a.k.a. elites (see [1, p.12])
        idcs_dcs = to.argsort(rets_avg_ros, descending=True)
        idcs_dcs = idcs_dcs[:self.num_is_samples]
        rets_avg_is = rets_avg_ros[idcs_dcs]
        params_is = param_results.parameters[idcs_dcs, :]

        # Update the policy parameters from the mean of the importance samples
        self._policy.param_values = to.mean(params_is, dim=0)

        # Update the exploration covariance from the empirical variance of the importance samples
        if isinstance(self._expl_strat.noise, DiagNormalNoise):
            std_is = to.std(params_is, dim=0)
            extra_expl_std = self.extra_expl_std_init * max(
                1. - self._curr_iter / self.extra_expl_decay_iter, 0  # see [2, p.4]
            )
            self._expl_strat.noise.adapt(std=std_is + extra_expl_std)
        elif isinstance(self._expl_strat.noise, FullNormalNoise):
            cov_is = cov(params_is, data_along_rows=True)
            extra_expl_cov = to.pow(self.extra_expl_std_init, 2) * max(
                1. - self._curr_iter / self.extra_expl_decay_iter, 0  # see [2, p.4]
            )
            self._expl_strat.noise.adapt(cov=cov_is + extra_expl_cov)

        # Logging
        self.logger.add_value('median imp samp return', to.median(rets_avg_is), 4)
        self.logger.add_value('min imp samp return', to.min(rets_avg_is), 4)
        self.logger.add_value('min expl strat std', to.min(self._expl_strat.std), 4)
        self.logger.add_value('avg expl strat std', to.mean(self._expl_strat.std), 4)
        self.logger.add_value('max expl strat std', to.max(self._expl_strat.std), 4)
        self.logger.add_value('expl strat entropy', self._expl_strat.get_entropy(), 4)
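
# Illustrative sketch, not part of the library: the core CEM step, i.e. selecting the elite
# parameter sets and refitting the (diagonal) search distribution to them. All names and
# dimensions are hypothetical.
def _example_cem_elite_refit():
    import torch as to

    pop_size, num_param, num_is_samples = 20, 3, 5
    params = to.randn(pop_size, num_param)  # sampled policy parameter sets
    rets = to.rand(pop_size)  # average return per parameter set

    # Keep the elites a.k.a. importance samples, i.e. the best-performing parameter sets
    idcs = to.argsort(rets, descending=True)[:num_is_samples]
    elites = params[idcs, :]

    # Refit the search distribution to the elites; a decaying extra std as in [2] could be added here
    new_mean = to.mean(elites, dim=0)
    new_std = to.std(elites, dim=0)
    return new_mean, new_std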

class NES(ParameterExploring):
    """
    Simplified variant of Natural Evolution Strategies (NES)

    .. seealso::
        [1] D. Wierstra, T. Schaul, T. Glasmachers, Y. Sun, J. Peters, J. Schmidhuber, "Natural Evolution Strategies",
            JMLR, 2014
        [2] This implementation was inspired by
            https://github.com/pybrain/pybrain/blob/master/pybrain/optimization/distributionbased/snes.py
    """

    name: str = 'nes'

    def __init__(self, save_dir: str, env: Env, policy: Policy, max_iter: int, num_rollouts: int,
                 expl_std_init: float, expl_std_min: float = 0.01, pop_size: int = None, eta_mean: float = 1.,
                 eta_std: float = None, symm_sampling: bool = False, transform_returns: bool = True,
                 num_workers: int = 4, logger: Optional[StepLogger] = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param pop_size: number of solutions in the population
        :param eta_mean: step size factor for the mean
        :param eta_std: step size factor for the standard deviation
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param transform_returns: use a rank-transformation of the returns to update the policy
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        # Store the inputs
        self.transform_returns = transform_returns

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            # Symmetric buffer needs to have an even number of samples
            if self.pop_size % 2 != 0:
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Utility coefficients (ignored for transform_returns = False)
        # Use pop_size + 1 since we are also considering the current policy
        eta_std = eta_std if eta_std is not None else (3 + np.log(policy.num_param)) / np.sqrt(self.pop_size + 1) / 5.
        self.eta_mean_util, self.eta_std_util = self.compute_utilities(self.pop_size + 1, eta_mean, eta_std)

        # Learning rates [2]
        # Use pop_size + 1 since we are also considering the current policy
        self.lr_mean = 1. if transform_returns else 1e-2
        self.lr_std = 0.6 * (3 + np.log(self.pop_size + 1)) / 3. / np.sqrt(self.pop_size + 1)

    @staticmethod
    def compute_utilities(pop_size: Optional[int], eta_mean: float, eta_std: float):
        """
        Compute the utilities as described in section 3.1 of [1] (a.k.a. Hansen ranking with uniform baseline)

        :param pop_size: number of solutions in the population
        :param eta_mean: step size factor for the mean
        :param eta_std: step size factor for the standard deviation
        :return: utility coefficient for the mean, and utility coefficient for the standard deviation
        """
        # Compute common utility vector
        log_half = np.log(pop_size / 2. + 1)
        log_k = np.log(np.arange(1, pop_size + 1))
        num = np.maximum(0, log_half - log_k)
        utils = num / np.sum(num) - 1. / pop_size

        # Convert to PyTorch tensors
        eta_mean_util = to.from_numpy(eta_mean * utils).to(to.get_default_dtype())
        eta_std_util = to.from_numpy(eta_std / 2. * utils).to(to.get_default_dtype())
        return eta_mean_util, eta_std_util

    def update(self, param_results: ParameterSamplingResult, ret_avg_curr: float = None):
        # Average the return values over the rollouts
        rets_avg_ros = param_results.mean_returns

        # Get the perturbations (deltas from the current policy parameters)
        s = param_results.parameters - self._policy.param_values
        # also divide by the standard deviation to fully standardize
        s /= self._expl_strat.std

        if self.transform_returns:
            # Descending sort according to return values, so the best sample receives the largest utility
            idcs_acs = np.argsort(rets_avg_ros)[::-1]
            s_asc = s[list(idcs_acs), :]

            # Update the mean (see [1, 2])
            delta_mean = self._expl_strat.std * (self.eta_mean_util @ s_asc)
            self._policy.param_values += self.lr_mean * delta_mean

            # Update the std (see [1, 2])
            grad_std = self.eta_std_util @ (s_asc**2 - 1.)
            new_std = self._expl_strat.std * to.exp(self.lr_std * grad_std / 2.)
            self._expl_strat.adapt(std=new_std)
        else:
            # Standardize averaged returns over all pop_size rollouts
            rets_stdized = standardize(rets_avg_ros)
            rets_stdized = to.from_numpy(rets_stdized).to(to.get_default_dtype())

            # delta_mean = 1./len(param_results) * (rets_stdized @ s)
            delta_mean = 1. / (self._expl_strat.std * len(param_results)) * (rets_stdized @ s)
            self._policy.param_values += self.lr_mean * delta_mean

            # Update the std (monotonous exponential decay)
            new_std = self._expl_strat.std * 0.999**self._curr_iter
            self._expl_strat.adapt(std=new_std)

        self.logger.add_value('min expl strat std', to.min(self._expl_strat.std), 4)
        self.logger.add_value('avg expl strat std', to.mean(self._expl_strat.std), 4)
        self.logger.add_value('max expl strat std', to.max(self._expl_strat.std), 4)
        self.logger.add_value('expl strat entropy', self._expl_strat.get_entropy(), 4)
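
# Illustrative sketch, not part of the library: the utility coefficients computed by
# NES.compute_utilities() (Hansen ranking with uniform baseline, see section 3.1 of [1]).
# The pop_size value is hypothetical.
def _example_nes_utilities(pop_size: int = 8):
    import numpy as np

    log_half = np.log(pop_size / 2. + 1)
    log_k = np.log(np.arange(1, pop_size + 1))
    num = np.maximum(0, log_half - log_k)
    utils = num / np.sum(num) - 1. / pop_size
    # The utilities are fixed rank-based weights: large and positive for the best-ranked samples,
    # -1/pop_size for the worst ones, and they sum to zero.
    return utils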

class REPS(ParameterExploring):
    """
    Episodic variant of Relative Entropy Policy Search (REPS)

    .. seealso::
        [1] J. Peters, K. Mülling, Y. Altün, "Relative Entropy Policy Search", AAAI, 2010
        [2] This implementation was inspired by https://github.com/hanyas/rl/blob/master/rl/ereps/ereps.py
    """

    name: str = 'reps'

    def __init__(self, save_dir: str, env: Env, policy: Policy, max_iter: int, eps: float, gamma: float,
                 num_rollouts: int, pop_size: int, expl_std_init: float, expl_std_min: float = 0.01,
                 symm_sampling: bool = False, num_sampler_envs: int = 4, num_epoch_dual: int = 1000,
                 use_map: bool = False, grad_free_optim: bool = False, lr_dual: float = 5e-4,
                 base_seed: int = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param eps: bound on the KL divergence between policy updates, e.g. 0.1
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param gamma: temporal discount factor; equal to 1 - reset probability
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_sampler_envs: number of environments for parallel sampling
        :param num_epoch_dual: number of epochs for the minimization of the dual function
        :param use_map: use maximum a-posteriori likelihood (`True`) or maximum likelihood (`False`) update rule
        :param grad_free_optim: use a derivative free optimizer (e.g. golden section search) or a SGD-based optimizer
        :param lr_dual: learning rate for the dual's optimizer (ignored if `grad_free_optim = True`)
        :param base_seed: seed added to all other seeds in order to make the experiments distinct but repeatable
        """
        if not isinstance(policy, LinearPolicy):
            warn('REPS is designed for linear policies only!', UserWarning)

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            base_seed=base_seed,
            num_sampler_envs=num_sampler_envs,
        )

        # Store the inputs
        self.eps = eps
        self.gamma = gamma
        self.base_seed = base_seed
        self.use_map = use_map

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        self.kappa = to.tensor([0.], requires_grad=True)  # eta = exp(kappa)
        self._exp_min = -700.
        self._exp_max = 700.

        # Dual specific
        if grad_free_optim:
            self.optim_dual = GSS(
                [{'params': self.kappa}],
                param_min=to.log(to.tensor([1e-4])),
                param_max=to.log(to.tensor([1e4]))
            )
        else:
            self.optim_dual = to.optim.Adam([{'params': self.kappa}], lr=lr_dual, eps=1e-5)
            # self.optim_dual = to.optim.SGD([{'params': self.kappa}], lr=lr_dual, momentum=0.7, weight_decay=1e-4)
        self.num_epoch_dual = num_epoch_dual

    @property
    def eta(self) -> to.Tensor:
        r"""Get $\eta = e^{\kappa}$."""
        return to.exp(self.kappa)

    def weights(self, rets: to.Tensor) -> to.Tensor:
        """
        Compute the weights which are used to weight the policy samples by their return

        :param rets: return values per policy sample after averaging over multiple rollouts using the same policy
        :return: weights of the policy parameter samples
        """
        shifted_rets = rets - to.max(rets)
        return to.exp(to.clamp(shifted_rets / self.eta, self._exp_min, self._exp_max))

    def dual(self, rets: to.Tensor) -> to.Tensor:
        """
        Compute the REPS dual function value.

        :param rets: return values per policy sample after averaging over multiple rollouts using the same policy
        :return: dual loss value
        """
        w = self.weights(rets)
        return self.eta * self.eps + to.max(rets) + self.eta * to.log(to.mean(w))

    def policy_dual(self, param_samples: to.Tensor, w: to.Tensor) -> to.Tensor:
        """
        Compute the REPS policy-dual function value.

        :param param_samples: all sampled policy parameters
        :param w: sample weights
        :return: dual loss value
        """
        distr_old = MultivariateNormal(self._policy.param_values, self._expl_strat.cov)

        self.wml(param_samples, w, eta=self.eta)

        distr_new = MultivariateNormal(self._policy.param_values, self._expl_strat.cov)
        logprobs = distr_new.log_prob(param_samples)
        kl_e = kl_divergence(distr_new, distr_old)  # mode seeking a.k.a. exclusive KL

        return w @ logprobs + self.eta * (self.eps - kl_e)

    def minimize(self, loss_fcn: Callable, rets: to.Tensor = None, param_samples: to.Tensor = None,
                 w: to.Tensor = None):
        """
        Minimize the given dual function. Iterate num_epoch_dual times.

        :param loss_fcn: function to minimize
        :param rets: return values per policy sample after averaging over multiple rollouts using the same policy
        :param param_samples: all sampled policy parameters
        :param w: sample weights
        """
        if isinstance(self.optim_dual, GSS):
            self.optim_dual.reset()

        for _ in tqdm(range(self.num_epoch_dual), total=self.num_epoch_dual, desc='Minimizing dual', unit='epochs',
                      file=sys.stdout, leave=False):
            if not isinstance(self.optim_dual, GSS):
                # Reset the gradients
                self.optim_dual.zero_grad()

            # Compute value function loss
            if rets is not None and param_samples is None and w is None:
                loss = loss_fcn(rets)  # dual
            elif rets is None and param_samples is not None and w is not None:
                loss = loss_fcn(param_samples, w)  # policy dual
            else:
                raise NotImplementedError

            # Update the parameter
            if isinstance(self.optim_dual, GSS):
                if rets is not None and param_samples is None and w is None:
                    self.optim_dual.step(closure=functools.partial(loss_fcn, rets=rets))
                elif rets is None and param_samples is not None and w is not None:
                    self.optim_dual.step(closure=functools.partial(loss_fcn, param_samples=param_samples, w=w))
                else:
                    raise NotImplementedError
            else:
                loss.backward()
                self.optim_dual.step()

        if to.isnan(self.kappa):
            raise RuntimeError("The dual's optimization parameter kappa became NaN!")

    def wml(self, param_samples: to.Tensor, w: to.Tensor, eta: to.Tensor = to.tensor([0.])):
        """
        Weighted maximum likelihood update of the policy's mean and the exploration strategy's covariance

        :param param_samples: all sampled policy parameters
        :param w: sample weights
        :param eta: dual parameters
        """
        mean_old = self._policy.param_values.clone()
        cov_old = self._expl_strat.cov.clone()

        # Update the mean
        self._policy.param_values = (eta * mean_old + to.sum(w.view(-1, 1) * param_samples, dim=0)) / (to.sum(w) + eta)
        param_values_delta = self._policy.param_values - mean_old

        # Difference between all sampled policy parameters and the updated policy
        diff = param_samples - self._policy.param_values
        w_diff = to.einsum('nk,n,nh->kh', diff, w, diff)  # outer product of scaled diff, then sum over all samples

        # Update the covariance
        cov_new = (w_diff + eta * cov_old
                   + eta * to.einsum('k,h->kh', param_values_delta, param_values_delta)) / (to.sum(w) + eta)
        self._expl_strat.adapt(cov=cov_new)

    def wmap(self, param_samples: to.Tensor, w: to.Tensor):
        """
        Weighted maximum a-posteriori likelihood update of the policy's mean and the exploration strategy's covariance

        :param param_samples: all sampled policy parameters
        :param w: sample weights
        """
        # Optimize for eta
        self.minimize(self.policy_dual, param_samples=param_samples, w=w.detach())

        # Update policy parameters
        self.wml(param_samples, w.detach(), eta=self.eta)

    def update(self, param_results: ParameterSamplingResult, ret_avg_curr: float = None):
        # Average the return values over the rollouts
        rets_avg_ros = param_results.mean_returns
        rets_avg_ros = to.from_numpy(rets_avg_ros)

        # Reset dual's parameter
        self.kappa.data.fill_(0.)

        # Dual
        with to.no_grad():
            distr_old = MultivariateNormal(self._policy.param_values, self._expl_strat.cov)
            loss = self.dual(rets_avg_ros)
            self.logger.add_value('dual loss before', loss.item())

        self.minimize(self.dual, rets=rets_avg_ros)

        with to.no_grad():
            loss = self.dual(rets_avg_ros)
            self.logger.add_value('dual loss after', loss.item())
            self.logger.add_value('eta', self.eta.item())

        # Compute the weights using the optimized eta
        w = self.weights(rets_avg_ros)

        # Update the policy's mean and the exploration strategy's covariance
        if self.use_map:
            self.wmap(param_results.parameters, w)
        else:
            self.wml(param_results.parameters, w)

        # Logging
        distr_new = MultivariateNormal(self._policy.param_values, self._expl_strat.cov)
        kl_e = kl_divergence(distr_new, distr_old)  # mode seeking a.k.a. exclusive KL
        kl_i = kl_divergence(distr_old, distr_new)  # mean seeking a.k.a. inclusive KL

        self.logger.add_value('min expl strat std', to.min(self._expl_strat.std))
        self.logger.add_value('avg expl strat std', to.mean(self._expl_strat.std.data).detach().numpy())
        self.logger.add_value('max expl strat std', to.max(self._expl_strat.std))
        self.logger.add_value('expl strat entropy', self._expl_strat.get_entropy().item())
        self.logger.add_value('KL(new_old)', kl_e.item())
        self.logger.add_value('KL(old_new)', kl_i.item())
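
# Illustrative sketch, not part of the library: the exponential weight transform used by
# REPS.weights() on toy returns. All numbers are hypothetical.
def _example_reps_weights(eta: float = 1.):
    import torch as to

    rets = to.randn(10)  # returns per policy parameter sample
    shifted_rets = rets - to.max(rets)  # shift for numerical stability; cancels after normalization
    w = to.exp(shifted_rets / eta)
    # A small eta weights the best samples greedily, a large eta yields nearly uniform weights.
    return w / to.sum(w)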

class REPS(ParameterExploring):
    """
    Episodic variant of Relative Entropy Policy Search (REPS)

    .. note::
        REPS was designed for linear policies.

    .. seealso::
        [1] J. Peters, K. Mülling, Y. Altün, "Relative Entropy Policy Search", AAAI, 2010
        [2] A. Abdolmaleki, J.T. Springenberg, J. Degrave, S. Bohez, Y. Tassa, D. Belov, N. Heess, M. Riedmiller,
            "Relative Entropy Regularized Policy Iteration", arXiv, 2018
        [3] This implementation is inspired by the work of H. Abdulsamad
            https://github.com/hanyas/rl/blob/master/rl/ereps/ereps.py
    """

    name: Optional[str] = "reps"

    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env: Env,
        policy: Policy,
        max_iter: int,
        eps: float,
        num_init_states_per_domain: int,
        pop_size: Optional[int],
        expl_std_init: float,
        expl_std_min: float = 0.01,
        num_domains: int = 1,
        symm_sampling: bool = False,
        softmax_transform: bool = False,
        use_map: bool = True,
        optim_mode: Optional[str] = "scipy",
        num_epoch_dual: int = 1000,
        lr_dual: float = 5e-4,
        num_workers: int = 4,
        logger: Optional[StepLogger] = None,
    ):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param eps: bound on the KL divergence between policy updates, e.g. 0.1
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_init_states_per_domain: number of rollouts to cover the variance over initial states
        :param num_domains: number of rollouts due to the variance over domain parameters
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param softmax_transform: pass `True` to use a softmax to transform the returns, else use a shifted exponential
        :param use_map: use maximum a-posteriori likelihood (`True`) or maximum likelihood (`False`) update rule
        :param optim_mode: choose the type of optimizer: 'torch' for a SGD-based optimizer or 'scipy' for the SLSQP
                           optimizer from scipy (recommended)
        :param num_epoch_dual: number of epochs for the minimization of the dual functions, ignored if
                               `optim_mode = 'scipy'`
        :param lr_dual: learning rate for the dual's optimizer, ignored if `optim_mode = 'scipy'`
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(policy, (LinearPolicy, DomainDistrParamPolicy)):
            print_cbt_once("REPS was designed for linear policies.", "y")

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir=save_dir,
            env=env,
            policy=policy,
            max_iter=max_iter,
            num_init_states_per_domain=num_init_states_per_domain,
            num_domains=num_domains,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        # Store the inputs
        self.eps = eps
        self.softmax_transform = softmax_transform
        self.use_map = use_map

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
            use_cuda=self._policy.device != "cpu",
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Dual optimization
        self.num_epoch_dual = num_epoch_dual
        self._log_eta = to.tensor([0.0], requires_grad=True)
        self.optim_mode = optim_mode.lower()
        if self.optim_mode == "scipy":
            pass
        elif self.optim_mode == "torch":
            self.optim_dual = to.optim.SGD([{"params": self._log_eta}], lr=lr_dual, momentum=0.8, weight_decay=1e-4)
            # self.optim_dual = to.optim.Adam([{'params': self._log_eta}], lr=lr_dual, eps=1e-5)  # used in [2], but unstable here
        else:
            raise pyrado.ValueErr(given=optim_mode, eq_constraint=["scipy", "torch"])

    @property
    def eta(self) -> to.Tensor:
        r"""Get the Lagrange multiplier $\eta$. In [2], $\eta$ is called $\alpha$."""
        return to.exp(self._log_eta)

    def weights(self, rets: to.Tensor) -> to.Tensor:
        """
        Compute the weights which are used to weight the policy samples by their return.
        As stated in [2, sec 4.1], we could calculate the weights using any rank preserving transformation.

        :param rets: return values per policy sample after averaging over multiple rollouts using the same policy
        :return: weights of the policy parameter samples
        """
        if self.softmax_transform:
            # Do softmax transform (softmax from PyTorch is already numerically stable)
            return to.softmax(rets / self.eta, dim=0)
        else:
            # Do numerically stabilized exp transform
            return to.exp(to.clamp((rets - to.max(rets)) / self.eta, min=-700.0))

    def dual_evaluation(
        self, eta: Union[to.Tensor, np.ndarray], rets: Union[to.Tensor, np.ndarray]
    ) -> Union[to.Tensor, np.ndarray]:
        """
        Compute the REPS dual function value for policy evaluation.

        :param eta: Lagrangian multiplier (optimization variable of the dual)
        :param rets: return values per policy sample after averaging over multiple rollouts using the same policy
        :return: dual loss value
        """
        if not (
            isinstance(eta, to.Tensor)
            and isinstance(rets, to.Tensor)
            or isinstance(eta, np.ndarray)
            and isinstance(rets, np.ndarray)
        ):
            raise pyrado.TypeErr(msg="eta and rets must either both be PyTorch tensors or both be numpy arrays!")
        return eta * self.eps + eta * logmeanexp(rets / eta)

    def dual_improvement(
        self, eta: Union[to.Tensor, np.ndarray], param_samples: to.Tensor, w: to.Tensor
    ) -> Union[to.Tensor, np.ndarray]:
        """
        Compute the REPS dual function value for policy improvement.

        :param eta: Lagrangian multiplier (optimization variable of the dual)
        :param param_samples: all sampled policy parameters
        :param w: weights of the policy parameter samples
        :return: dual loss value
        """
        # The sample weights have been computed by minimizing dual_evaluation, don't track the gradient twice
        assert w.requires_grad is False

        with to.no_grad():
            distr_old = MultivariateNormal(self._policy.param_values, self._expl_strat.cov.data)

            if self.optim_mode == "scipy" and not isinstance(eta, to.Tensor):
                # We can arrive here during the 'normal' REPS routine, but also when computing the gradient (jac) for
                # the scipy optimizer. In the latter case, eta is already a tensor.
                eta = to.from_numpy(eta).to(to.get_default_dtype())
            self.wml(eta, param_samples, w)

            distr_new = MultivariateNormal(self._policy.param_values, self._expl_strat.cov.data)
            logprobs = distr_new.log_prob(param_samples)
            kl = kl_divergence(distr_new, distr_old)  # mode seeking a.k.a. exclusive KL

        if self.optim_mode == "scipy":
            loss = w.numpy() @ logprobs.numpy() + eta * (self.eps - kl.numpy())
        else:
            loss = w @ logprobs + eta * (self.eps - kl)
        return loss

    def minimize(
        self, loss_fcn: Callable, rets: to.Tensor = None, param_samples: to.Tensor = None, w: to.Tensor = None
    ):
        """
        Minimize the given dual function. This function can be called for the dual evaluation loss or the dual
        improvement loss.

        :param loss_fcn: function to minimize, different for `wml()` and `wmap()`
        :param rets: return values per policy sample after averaging over multiple rollouts using the same policy
        :param param_samples: all sampled policy parameters
        :param w: weights of the policy parameter samples
        """
        if self.optim_mode == "scipy":
            # Use scipy optimizers
            if loss_fcn == self.dual_evaluation:
                res = optimize.minimize(
                    partial(self.dual_evaluation, rets=rets.numpy()),
                    jac=partial(get_grad_via_torch, fcn_to=partial(self.dual_evaluation, rets=rets)),
                    x0=np.array([1.0]),
                    method="SLSQP",
                    bounds=((1e-8, 1e8),),
                )
            elif loss_fcn == self.dual_improvement:
                res = optimize.minimize(
                    partial(self.dual_improvement, param_samples=param_samples, w=w),
                    jac=partial(
                        get_grad_via_torch, fcn_to=partial(self.dual_improvement, param_samples=param_samples, w=w)
                    ),
                    x0=np.array([1.0]),
                    method="SLSQP",
                    bounds=((1e-8, 1e8),),
                )
            else:
                raise pyrado.TypeErr(msg="Received an improper loss function in REPS.minimize()!")
            eta = to.from_numpy(res["x"]).to(to.get_default_dtype())
            self._log_eta = to.log(eta)

        else:
            for _ in tqdm(
                range(self.num_epoch_dual),
                total=self.num_epoch_dual,
                desc="Minimizing dual",
                unit="epochs",
                file=sys.stdout,
                leave=False,
            ):
                # Use the PyTorch optimizer
                self.optim_dual.zero_grad()
                if loss_fcn == self.dual_evaluation:
                    loss = self.dual_evaluation(self.eta, rets)
                elif loss_fcn == self.dual_improvement:
                    loss = self.dual_improvement(self.eta, param_samples, w)
                else:
                    raise pyrado.TypeErr(msg="Received an improper loss function in REPS.minimize()!")
                loss.backward()
                self.optim_dual.step()

            if to.isnan(self._log_eta):
                raise RuntimeError("The dual's optimization parameter _log_eta became NaN!")

    def wml(self, eta: to.Tensor, param_samples: to.Tensor, w: to.Tensor):
        """
        Weighted maximum likelihood update of the policy's mean and the exploration strategy's covariance

        :param eta: Lagrangian multiplier (optimization variable of the dual)
        :param param_samples: all sampled policy parameters
        :param w: weights of the policy parameter samples
        """
        mean_old = self._policy.param_values.clone()
        cov_old = self._expl_strat.cov.clone()

        # Update the mean
        w_sum_param_samples = to.einsum("k,kh->h", w, param_samples)
        self._policy.param_values = (eta * mean_old + w_sum_param_samples) / (to.sum(w) + eta)
        param_values_delta = self._policy.param_values - mean_old

        # Difference between all sampled policy parameters and the updated policy
        diff = param_samples - self._policy.param_values
        w_diff = to.einsum("nk,n,nh->kh", diff, w, diff)  # outer product of scaled diff, then sum over all samples

        # Update the covariance
        cov_new = (w_diff + eta * cov_old + eta * to.einsum("k,h->kh", param_values_delta, param_values_delta)) / (
            to.sum(w) + eta
        )
        self._expl_strat.adapt(cov=cov_new)

    def wmap(self, param_samples: to.Tensor, w: to.Tensor):
        """
        Weighted maximum a-posteriori likelihood update of the policy's mean and the exploration strategy's covariance

        :param param_samples: all sampled policy parameters
        :param w: weights of the policy parameter samples
        """
        # Optimize eta according to the policy's dual function to satisfy the KL constraint
        self.minimize(self.dual_improvement, param_samples=param_samples, w=w.detach())

        # Update the policy's and exploration strategy's parameters
        self.wml(self.eta, param_samples, w.detach())

    def update(self, param_results: ParameterSamplingResult, ret_avg_curr: Optional[float] = None):
        # Average the return values over the rollouts
        rets_avg_ros = param_results.mean_returns
        rets_avg_ros = to.from_numpy(rets_avg_ros).to(to.get_default_dtype())

        with to.no_grad():
            distr_old = MultivariateNormal(self._policy.param_values, self._expl_strat.cov.data)
            loss = self.dual_evaluation(self.eta, rets_avg_ros)
            self.logger.add_value("dual loss before", loss, 4)

        # Reset dual's parameter
        self._log_eta.data.fill_(0.0)

        # Optimize eta
        self.minimize(self.dual_evaluation, rets=rets_avg_ros)

        with to.no_grad():
            loss = self.dual_evaluation(self.eta, rets_avg_ros)
            self.logger.add_value("dual loss after", loss, 4)
            self.logger.add_value("eta", self.eta, 4)

        # Compute the weights using the optimized eta
        w = self.weights(rets_avg_ros)

        # Update the policy's mean and the exploration strategy's covariance
        if self.use_map:
            self.wmap(param_results.parameters, w)  # internally calls self.wml() with the optimized eta
        else:
            self.wml(self.eta, param_results.parameters, w)

        # Logging
        distr_new = MultivariateNormal(self._policy.param_values, self._expl_strat.cov.data)
        kl_e = kl_divergence(distr_new, distr_old)  # mode seeking a.k.a. exclusive KL
        kl_i = kl_divergence(distr_old, distr_new)  # mean seeking a.k.a. inclusive KL

        self.logger.add_value("min expl strat std", to.min(self._expl_strat.std), 4)
        self.logger.add_value("avg expl strat std", to.mean(self._expl_strat.std), 4)
        self.logger.add_value("max expl strat std", to.max(self._expl_strat.std), 4)
        self.logger.add_value("expl strat entropy", self._expl_strat.get_entropy(), 4)
        self.logger.add_value("KL(new_old)", kl_e, 6)
        self.logger.add_value("KL(old_new)", kl_i, 6)
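
# Illustrative sketch, not part of the library: minimizing the REPS evaluation dual
# g(eta) = eta * eps + eta * log(mean(exp(rets / eta))) with scipy's SLSQP optimizer, analogous to
# the optim_mode == 'scipy' branch of REPS.minimize(). The returns and the KL bound are hypothetical,
# and logmeanexp is re-implemented inline in a numerically stable way to keep the example self-contained.
def _example_reps_dual_minimization(eps_kl: float = 0.1):
    import numpy as np
    from scipy import optimize

    rets = np.random.randn(20)  # returns per policy parameter sample

    def dual(eta: np.ndarray) -> float:
        eta = float(np.atleast_1d(eta)[0])
        shifted = rets / eta - np.max(rets / eta)  # log-mean-exp with the maximum factored out
        return eta * eps_kl + np.max(rets) + eta * np.log(np.mean(np.exp(shifted)))

    res = optimize.minimize(dual, x0=np.array([1.0]), method="SLSQP", bounds=((1e-8, 1e8),))
    return float(res.x[0])  # the optimized temperature eta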