class PoWER(ParameterExploring):
    """
    Return-based variant of Policy learning by Weighting Exploration with the Returns (PoWER)

    .. note::
        PoWER was designed for linear policies.
        PoWER must use positive reward functions (improper probability distribution) [1, p.10].
        The original implementation is tailored to movement primitives like DMPs.

    .. seealso::
        [1] J. Kober and J. Peters, "Policy Search for Motor Primitives in Robotics", Machine Learning, 2011
    """

    name: str = 'power'

    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 pop_size: Optional[int],
                 num_rollouts: int,
                 num_is_samples: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 symm_sampling: bool = False,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param pop_size: number of solutions in the population
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_rollouts: number of rollouts per policy sample
        :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(policy, LinearPolicy):
            print_cbt_once('PoWER was designed for linear policies.', 'y')

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Initialize memory for importance sampling
        self.num_is_samples = min(pop_size, num_is_samples)
        self.is_mem_ret = 1e-6 * to.ones(self.num_is_samples)  # has to be initialized > 0 due to first covariance update
        self.is_mem_params = to.zeros(self.num_is_samples, self._policy.num_param)
        self.is_mem_W = to.zeros(self.num_is_samples, self._policy.num_param, self._policy.num_param)

    def reset(self, seed: int = None):
        # Reset the exploration strategy, internal variables and the random seeds
        super().reset(seed)

        # Reset memory for importance sampling
        self.is_mem_ret = 1e-6 * to.ones(self.num_is_samples)  # has to be initialized > 0 due to first covariance update
        self.is_mem_params = to.zeros(self.num_is_samples, self._policy.num_param)
        self.is_mem_W = to.zeros(self.num_is_samples, self._policy.num_param, self._policy.num_param)

    @to.no_grad()
    def update(self, param_results: ParameterSamplingResult, ret_avg_curr: float = None):
        # Average the return values over the rollouts
        rets_avg_ros = to.tensor(param_results.mean_returns)
        if any(rets_avg_ros < 0):
            rets_avg_ros[rets_avg_ros < 0] = 1e-3
            print_cbt('PoWER must use positive reward functions (improper probability distribution)!', 'r')

        # We do the simplification from the original implementation, which is only valid for the return-based variant
        W = to.inverse(self._expl_strat.noise.cov)

        # For importance sampling we select the best rollouts
        self.is_mem_ret = to.cat([self.is_mem_ret, rets_avg_ros], dim=0)
        self.is_mem_params = to.cat([self.is_mem_params, param_results.parameters], dim=0)
        self.is_mem_W = to.cat([self.is_mem_W, W.repeat(self.pop_size + 1, 1, 1)], dim=0)  # same cov for all rollouts

        # Descending sort according to return values
        idcs_dcs = to.argsort(self.is_mem_ret, descending=True)
        self.is_mem_ret = self.is_mem_ret[idcs_dcs]
        self.is_mem_params = self.is_mem_params[idcs_dcs, :]
        self.is_mem_W = self.is_mem_W[idcs_dcs, :, :]

        # Update the exploration covariance (see [1, p.32]). We use all rollouts to avoid rapid convergence to 0.
        eps = self.is_mem_params - self._policy.param_values  # policy parameter perturbations
        cov_num = to.einsum('nj,nk,n->jk', eps, eps, self.is_mem_ret)  # weighted outer product
        cov_dnom = sum(self.is_mem_ret)
        self._expl_strat.adapt(cov=cov_num / (cov_dnom + 1e-8))

        # Only memorize the best parameter sets & returns (importance sampling)
        self.is_mem_ret = self.is_mem_ret[:self.num_is_samples]
        self.is_mem_params = self.is_mem_params[:self.num_is_samples, :]
        self.is_mem_W = self.is_mem_W[:self.num_is_samples, :, :]

        # Update the policy mean (see [1, p.10])
        eps = eps[:self.num_is_samples, :]
        mean_num = to.einsum('njk,nj,n->k', self.is_mem_W, eps, self.is_mem_ret)  # weighted dot product
        mean_dnom = to.einsum('njk,n->jk', self.is_mem_W, self.is_mem_ret)  # weighted sum
        inv_dnom = to.inverse(mean_dnom + 1e-8)
        self._policy.param_values += to.matmul(inv_dnom, mean_num)

        # Logging
        self.logger.add_value('min expl strat std', to.min(self._expl_strat.std), 4)
        self.logger.add_value('avg expl strat std', to.mean(self._expl_strat.std), 4)
        self.logger.add_value('max expl strat std', to.max(self._expl_strat.std), 4)
        self.logger.add_value('expl strat entropy', self._expl_strat.get_entropy(), 4)
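# --- Standalone toy sketch (not part of the original module) -----------------
# Illustrates the return-weighted mean update above: because every sample shares
# the same precision matrix W, inv(sum_n W*R_n) @ (sum_n W*R_n*eps_n) reduces to
# a plain return-weighted average of the perturbations. The quadratic "return"
# and all hyperparameter values below are illustrative assumptions.
import torch as to

to.manual_seed(0)
theta = to.zeros(5)            # current policy parameters
theta_star = to.ones(5)        # unknown optimum of the toy problem
std = 0.5 * to.ones(5)         # fixed exploration std (the class adapts it instead)

for it in range(50):
    eps = std * to.randn(20, 5)                                    # parameter perturbations
    rets = to.exp(-((theta + eps - theta_star) ** 2).sum(dim=1))   # strictly positive returns
    theta = theta + (rets @ eps) / (rets.sum() + 1e-8)             # PoWER-style mean update
print(theta)  # should move close to theta_star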
class PEPG(ParameterExploring): """ Parameter-Exploring Policy Gradients (PEPG) .. seealso:: [1] F. Sehnke, C. Osendorfer, T. Rueckstiess, A. Graves, J. Peters, J. Schmidhuber, "Parameter-exploring Policy Gradients", Neural Networks, 2010 """ name: str = 'pepg' def __init__(self, save_dir: str, env: Env, policy: Policy, max_iter: int, num_rollouts: int, expl_std_init: float, expl_std_min: float = 0.01, pop_size: Optional[int] = None, clip_ratio_std: float = 0.05, normalize_update: bool = False, transform_returns: bool = True, lr: float = 5e-4, num_workers: int = 4, logger: Optional[StepLogger] = None): r""" Constructor :param save_dir: directory to save the snapshots i.e. the results in :param env: the environment which the policy operates :param policy: policy to be updated :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs :param pop_size: number of solutions in the population :param num_rollouts: number of rollouts per policy sample :param expl_std_init: initial standard deviation for the exploration strategy :param expl_std_min: minimal standard deviation for the exploration strategy :param clip_ratio_std: maximal ratio for the change of the exploration strategy's standard deviation :param transform_returns: use a rank-transformation of the returns to update the policy :param lr: learning rate :param num_workers: number of environments for parallel sampling :param logger: logger for every step of the algorithm, if `None` the default logger will be created """ # Call ParameterExploring's constructor super().__init__(save_dir, env, policy, max_iter, num_rollouts, pop_size=pop_size, num_workers=num_workers, logger=logger) # Store the inputs self.clip_ratio_std = clip_ratio_std self.normalize_update = normalize_update self.transform_returns = transform_returns self.lr = lr # Exploration strategy based on symmetrical normally distributed noise if self.pop_size % 2 != 0: # Symmetric buffer needs to have an even number of samples self.pop_size += 1 self._expl_strat = SymmParamExplStrat( NormalParamNoise( self._policy.num_param, std_init=expl_std_init, std_min=expl_std_min, )) self.optim = to.optim.SGD([{ 'params': self._policy.parameters() }], lr=lr, momentum=0.8, dampening=0.1) @to.no_grad() def update(self, param_results: ParameterSamplingResult, ret_avg_curr: float = None): # Average the return values over the rollouts rets_avg_ros = param_results[1:].mean_returns # Rank policy parameters by return (a.k.a. fitness) rets = rank_transform( rets_avg_ros) if self.transform_returns else rets_avg_ros # Move to PyTorch rets = to.from_numpy(rets).to(to.get_default_dtype()) rets_max = to.max(rets) rets_avg_symm = (rets[:len(param_results) // 2] + rets[len(param_results) // 2:]) / 2. 
baseline = to.mean(rets) # zero if centered # Compute finite differences for the average return of each solution rets_fds = rets[:len(param_results) // 2] - rets[len(param_results) // 2:] # Get the perturbations (select the first half since they are symmetric) epsilon = param_results.parameters[:len(param_results) // 2, :] - self._policy.param_values if self.normalize_update: # See equation (15, top) in [1] delta_mean = (rets_fds / (2 * rets_max - rets_fds + 1e-6) ) @ epsilon # epsilon = T from [1] else: # See equation (13) in [1] delta_mean = 0.5 * rets_fds @ epsilon # epsilon = T from [1] # Update the mean self.optim.zero_grad() self._policy.param_grad = -delta_mean # PyTorch optimizers are minimizers self.optim.step() # Old version without PyTorch optimizer: self._expl_strat.policy.param_values += delta_mean * self.lr # Update the std S = (epsilon**2 - self._expl_strat.std**2) / self._expl_strat.std if self.normalize_update: # See equation (15, bottom) in [1] delta_std = (rets_avg_symm - baseline) @ S else: # See equation (14) in [1] delta_std = ((rets_avg_symm - baseline) / (rets_max - baseline + 1e-6)) @ S # Bound the change on the exploration standard deviation (i.e. the entropy) delta_std *= self.lr delta_std = clamp_symm(delta_std, self.clip_ratio_std * self._expl_strat.std) new_std = self._expl_strat.std + delta_std self._expl_strat.adapt(std=new_std) # Logging self.logger.add_value('policy param', self._policy.param_values, 4) self.logger.add_value('delta policy param', delta_mean * self.lr, 4) self.logger.add_value('expl strat std', self._expl_strat.std, 4) self.logger.add_value('expl strat entropy', self._expl_strat.get_entropy(), 4)
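# --- Standalone toy sketch (not part of the original module) -----------------
# Illustrates PEPG's symmetric sampling used above: each perturbation +eps is
# paired with -eps, and the mean is updated with the finite difference of the
# paired returns, 0.5 * (R+ - R-) @ eps, cf. equation (13) in [1]. The quadratic
# "return" and all hyperparameter values below are illustrative assumptions.
import torch as to

to.manual_seed(0)
theta = to.zeros(4)
theta_star = to.tensor([1., -1., 0.5, 2.])
std, lr = 0.5, 0.1

def toy_return(params):
    # higher is better; maximum at theta_star
    return -((params - theta_star) ** 2).sum(dim=-1)

for it in range(200):
    eps = std * to.randn(10, 4)                                   # one half of the symmetric population
    rets_fds = toy_return(theta + eps) - toy_return(theta - eps)  # finite differences of paired returns
    theta = theta + lr * 0.5 * rets_fds @ eps                     # cf. equation (13) in [1]
print(theta)  # should move close to theta_star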
def __init__( self, save_dir: pyrado.PathLike, env: Env, policy: Policy, max_iter: int, pop_size: Optional[int], num_init_states_per_domain: int, num_is_samples: int, expl_std_init: float, expl_std_min: float = 0.01, num_domains: int = 1, symm_sampling: bool = False, num_workers: int = 4, logger: Optional[StepLogger] = None, ): r""" Constructor :param save_dir: directory to save the snapshots i.e. the results in :param env: the environment which the policy operates :param policy: policy to be updated :param pop_size: number of solutions in the population :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs :param num_init_states_per_domain: number of rollouts to cover the variance over initial states :param num_domains: number of rollouts due to the variance over domain parameters :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling :param expl_std_init: initial standard deviation for the exploration strategy :param expl_std_min: minimal standard deviation for the exploration strategy :param symm_sampling: use an exploration strategy which samples symmetric populations :param num_workers: number of environments for parallel sampling :param logger: logger for every step of the algorithm, if `None` the default logger will be created """ # Call ParameterExploring's constructor super().__init__( save_dir=save_dir, env=env, policy=policy, max_iter=max_iter, num_init_states_per_domain=num_init_states_per_domain, num_domains=num_domains, pop_size=pop_size, num_workers=num_workers, logger=logger, ) # Explore using normal noise self._expl_strat = NormalParamNoise( self._policy.num_param, full_cov=True, std_init=expl_std_init, std_min=expl_std_min, use_cuda=policy.device != "cpu", ) if symm_sampling: # Exploration strategy based on symmetrical normally distributed noise if self.pop_size % 2 != 0: # Symmetric buffer needs to have an even number of samples self.pop_size += 1 self._expl_strat = SymmParamExplStrat(self._expl_strat) # Initialize memory for importance sampling self._bound_lo_ret = 1e-3 # the returns must not be negative, clip them to this value if so self.num_is_samples = min(pop_size, num_is_samples) self.is_mem_ret = self._bound_lo_ret * to.ones( self.num_is_samples ) # has to be initialized > 0 due to first covariance update self.is_mem_params = to.zeros(self.num_is_samples, self._policy.num_param) self.is_mem_W = to.zeros(self.num_is_samples, self._policy.num_param, self._policy.num_param)
class EMVD(ParameterExploring): """ Episodic Measure-Valued Derivatives (E-MVD) """ name: str = 'mvd' def __init__(self, save_dir: str, env: Env, policy: Policy, distribution, max_iter: int, num_rollouts: int, expl_std_init: float, expl_std_min: float = 0.01, pop_size: int = None, clip_ratio_std: float = 0.05, normalize_update: bool = False, transform_returns: bool = True, num_sampler_envs: int = 4, n_mc_samples_gradient=1, coupling=True, real_env=False, lr: float = 5e-4, optim: str = 'SGD', base_seed: int = None): """ Constructor :param save_dir: directory to save the snapshots i.e. the results in :param env: the environment which the policy operates :param policy: policy to be updated :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs :param pop_size: number of solutions in the population :param num_rollouts: number of rollouts per policy sample :param expl_std_init: initial standard deviation for the exploration strategy :param expl_std_min: minimal standard deviation for the exploration strategy :param clip_ratio_std: maximal ratio for the change of the exploration strategy's standard deviation :param transform_returns: use a rank-transformation of the returns to update the policy :param base_seed: seed added to all other seeds in order to make the experiments distinct but repeatable """ # Call ParameterExploring's constructor super().__init__( save_dir, env, policy, max_iter, num_rollouts, pop_size=pop_size, base_seed=base_seed, num_sampler_envs=num_sampler_envs, ) self._distribution = distribution self._dims = distribution.get_number_of_dims() self._n_mc_samples_gradient = n_mc_samples_gradient self._coupling = coupling self._real_env = real_env # Store the inputs self.clip_ratio_std = clip_ratio_std self.normalize_update = normalize_update self.transform_returns = transform_returns self.lr = lr # Exploration strategy based on symmetrical normally distributed noise if self.pop_size % 2 != 0: # Symmetric buffer needs to have an even number of samples self.pop_size += 1 self._expl_strat = SymmParamExplStrat( NormalParamNoise( self._policy.num_param, std_init=expl_std_init, std_min=expl_std_min, )) if optim == 'SGD': self.optim = to.optim.SGD([{ 'params': self._policy.parameters() }], lr=lr, momentum=0.8, dampening=0.1) elif optim == 'Adam': # self.optim = to.optim.Adam([{'params': self._policy.parameters()}], lr=lr) self.optim = to.optim.Adam( [{ 'params': self._distribution.get_params() }], lr=lr) else: raise NotImplementedError self._iter = 0 def _optimize_distribution_parameters(self, loss): self.optim.zero_grad() loss.backward() self.optim.step() def step(self, snapshot_mode: str, meta_info: dict = None): if not self._real_env: # Sample new policy parameters paramsets = self._expl_strat.sample_param_sets( self._policy.param_values, num_samples=10, # If you do not want to include the current policy parameters, be aware that you also have to do follow-up # changes in the update() functions in all subclasses of ParameterExploring # include_nominal_params=True include_nominal_params=True) with to.no_grad(): # Sample rollouts using these parameters param_samp_res = self.sampler.sample(paramsets) # Evaluate the current policy (first one in list if include_nominal_params is True) ret_avg_curr = param_samp_res[0].mean_undiscounted_return # Store the average return for the stopping criterion self.ret_avg_stack = np.delete(self.ret_avg_stack, 0) self.ret_avg_stack = np.append(self.ret_avg_stack, ret_avg_curr) all_rets = param_samp_res.mean_returns 
all_lengths = np.array( [len(ro) for pss in param_samp_res for ro in pss.rollouts]) # Log metrics computed from the old policy (before the update) self.logger.add_value('curr policy return', ret_avg_curr) self.logger.add_value('max return', float(np.max(all_rets))) self.logger.add_value('median return', float(np.median(all_rets))) self.logger.add_value('avg return', float(np.mean(all_rets))) self.logger.add_value('std return', float(np.std(all_rets))) self.logger.add_value('avg rollout len', float(np.mean(all_lengths))) # self.logger.add_value('min mag policy param', # self._policy.param_values[to.argmin(abs(self._policy.param_values))]) # self.logger.add_value('max mag policy param', # self._policy.param_values[to.argmax(abs(self._policy.param_values))]) # Logging self.logger.add_value('policy param', self._policy.param_values.detach().numpy()) self.logger.add_value( 'expl strat mean', deepcopy(self._distribution.get_mean(tensor=False))) self.logger.add_value( 'expl strat cov', deepcopy(self._distribution.get_cov(tensor=False))) # Extract the best policy parameter sample for saving it later self.best_policy_param = param_samp_res.parameters[np.argmax( param_samp_res.mean_returns)].clone() # Save snapshot data self.make_snapshot(snapshot_mode, float(np.max(param_samp_res.mean_returns)), meta_info) # Update the policy self.update(param_samp_res) else: input("Change video and Press ENTER") print('policy param: ', self._policy.param_values.detach().numpy()) print('expl strat mean: ', self._distribution.get_mean(tensor=False)) print('expl strat std: ', np.sqrt(self._distribution.get_cov(tensor=False))) ro = rollout(self._env, self.policy, eval=True, render_mode=RenderMode(text=False)) r = np.sum(ro.rewards) print('curr policy return: ', r) # Update the policy self.update() def policy_return(self, Ks): with to.no_grad(): r_l = [] for i, K in enumerate(Ks): print("{}/{} - K: {}".format(i, len(K) - 1, K.view(-1)), end=" ") self.policy.param_values = K ro = rollout(self._env, self.policy, eval=True, render_mode=RenderMode(text=False)) r = np.sum(ro.rewards) r_l.append(r) print(" - r: {}".format(r)) return r_l def update(self, param_results: ParameterSamplingResult = None, ret_avg_curr: float = None): loss = -self._mvd_gaussian_diag_covariance_surrogate_loss().mean() self._optimize_distribution_parameters(loss) # Update the policy parameters to the mean of the search distribution self._policy.param_values = self._distribution.get_mean( tensor=True).view(-1) def _mvd_gaussian_diag_covariance_surrogate_loss(self): """ Builds the loss function for gradient computation with measure value derivatives. The gradient is taken wrt the distributional parameters (mean and covariance) of a Multivariate Gaussian with Diagonal Covariance. """ mean, std = self._distribution.get_mean_and_std() diag_std = std dist_samples = self._distribution.sample( (self._n_mc_samples_gradient, )) # Compute gradient wrt mean grad_mean = self._mvd_grad_mean_gaussian_diagonal_covariance( dist_samples) # Compute gradient wrt std grad_cov = self._mvd_grad_covariance_gaussian_diagonal_covariance( dist_samples) # Construct the surrogate loss. # Here we still backpropagate through the mean and covariance, because they can themselves be parametrized surrogate_loss = grad_mean.detach() * mean surrogate_loss += grad_cov.detach() * diag_std # The total derivative is the sum of the partial derivatives wrt each parameter. 
loss = surrogate_loss.sum(dim=-1) return loss def _mvd_grad_mean_gaussian_diagonal_covariance(self, dist_samples): """ Computes the measure valued gradient wrt the mean of the multivariate Gaussian with diagonal Covariance. """ print("----Grad mean", flush=True) mean, std = self._distribution.get_mean_and_std() diag_std = std # Replicate the second to last dimension # (B, D, D) multiples = [1, self._dims, 1] base_samples = to.unsqueeze(dist_samples, -2).repeat(*multiples) # Sample (B, D) samples from the positive and negative Univariate Weibull distributions weibull = torchdist.weibull.Weibull(scale=np.sqrt(2.), concentration=2.) pos_samples_weibull = weibull.sample(dist_samples.shape) if self._coupling: neg_samples_weibull = pos_samples_weibull else: neg_samples_weibull = weibull.sample(dist_samples.shape) # Build the (B, D) positive and negative diagonals of the MVD decomposition positive_diag = mean + diag_std * pos_samples_weibull assert positive_diag.shape == dist_samples.shape negative_diag = mean - diag_std * neg_samples_weibull assert negative_diag.shape == dist_samples.shape # Set the positive and negative points where to evaluate the Q function. # (B, D, D) # Replace the ith dimension of the actions with the ith entry of the constructed diagonals. # Mohamed. S, 2019, Monte Carlo Gradient Estimation in Machine Learning, Ch. 6.2 positive_samples = base_samples.clone() positive_samples.diagonal(dim1=-2, dim2=-1).copy_(positive_diag) negative_samples = base_samples.clone() negative_samples.diagonal(dim1=-2, dim2=-1).copy_(negative_diag) # MVD constant term # (B, D) c = np.sqrt(2 * np.pi) * diag_std # Evaluate the function # pos_f_samples = self._func.eval(positive_samples.reshape(self._n_mc_samples_gradient * self._dims, self._dims)) # neg_f_samples = self._func.eval(negative_samples.reshape(self._n_mc_samples_gradient * self._dims, self._dims)) if not self._real_env: pos_paramsets = positive_samples.reshape( self._n_mc_samples_gradient * self._dims, self._dims) pos_f_samples_param_samp_res = self.sampler.sample( pos_paramsets.detach()) r_l = [] for i in range(len(pos_f_samples_param_samp_res)): r_l.append( pos_f_samples_param_samp_res[i].mean_undiscounted_return) pos_f_samples = to.tensor(r_l) neg_paramsets = negative_samples.reshape( self._n_mc_samples_gradient * self._dims, self._dims) neg_f_samples_param_samp_res = self.sampler.sample( neg_paramsets.detach()) r_l = [] for i in range(len(neg_f_samples_param_samp_res)): r_l.append( neg_f_samples_param_samp_res[i].mean_undiscounted_return) neg_f_samples = to.tensor(r_l) else: pos_f_samples = to.tensor( self.policy_return( positive_samples.reshape( self._n_mc_samples_gradient * self._dims, self._dims))) neg_f_samples = to.tensor( self.policy_return( negative_samples.reshape( self._n_mc_samples_gradient * self._dims, self._dims))) # Gradient batch # (B, D) delta_f = pos_f_samples - neg_f_samples grad = delta_f.reshape(dist_samples.shape[0], self._dims) / c assert grad.shape == dist_samples.shape return grad def _mvd_grad_covariance_gaussian_diagonal_covariance(self, dist_samples): """ Computes the measure valued gradient wrt the covariance of the multivariate Gaussian with diagonal covariance. 
""" print("----Grad covariance", flush=True) mean, std = self._distribution.get_mean_and_std() diag_std = std # Replicate the second to last dimension of actions # (B, D, D) multiples = [1, self._dims, 1] base_actions = to.unsqueeze(dist_samples, -2).repeat(*multiples) # Sample (NxBxDa, Da) samples from the positive and negative Univariate distributions of the decomposition. # The positive part is a Double-sided Maxwell M(mu, sigma^2). # M(x; mu, sigma^2) = 1/(sigma*sqrt(2*pi)) * ((x-mu)/sigma)^2 * exp(-1/2*((x-mu)/sigma)^2) # To sample Y from the Double-sided Maxwell M(mu, sigma^2) we can do # X ~ M(0, 1) -> Y = mu + sigma * X # The negative part is a Gaussian distribution N(mu, sigma^2). # To sample Y from the Gaussian N(mu, sigma^2) we can do # X ~ N(0, 1) -> Y = mu + sigma * X double_sided_maxwell_standard = DoubleSidedStandardMaxwell() pos_samples_double_sided_maxwell_standard = double_sided_maxwell_standard.sample( dist_samples.shape) if self._coupling: # Construct standard Gaussian samples from standard Double-sided Maxwell samples neg_samples_gaussian_standard = std_gaussian_from_std_dsmaxwell( pos_samples_double_sided_maxwell_standard) else: gaussian_standard = torchdist.normal.Normal(loc=0., scale=1.) neg_samples_gaussian_standard = gaussian_standard.sample( dist_samples.shape) pos_samples_double_sided_maxwell_standard = pos_samples_double_sided_maxwell_standard # Build the (B, D) positive and negative diagonals of the MVD decomposition positive_diag = mean + diag_std * pos_samples_double_sided_maxwell_standard assert positive_diag.shape == dist_samples.shape negative_diag = mean + diag_std * neg_samples_gaussian_standard assert negative_diag.shape == dist_samples.shape # Set the positive and negative points where to evaluate the Q function. # (B, D, D) # In multivariate Gaussians with diagonal covariance, the univariates are independent. # Hence we can replace the ith dimension of the sampled actions with the ith entry of the constructed diagonals. # Mohamed. S, 2019, Monte Carlo Gradient Estimation in Machine Learning, Ch. 
6.2 positive_samples = base_actions.clone() positive_samples.diagonal(dim1=-2, dim2=-1).copy_(positive_diag) negative_samples = base_actions.clone() negative_samples.diagonal(dim1=-2, dim2=-1).copy_(negative_diag) # MVD constant term # (B, D) c = diag_std # Evaluate the function # pos_f_samples = self._func.eval(positive_samples.reshape(self._n_mc_samples_gradient * self._dims, self._dims)) # neg_f_samples = self._func.eval(negative_samples.reshape(self._n_mc_samples_gradient * self._dims, self._dims)) if not self._real_env: pos_paramsets = positive_samples.reshape( self._n_mc_samples_gradient * self._dims, self._dims) pos_f_samples_param_samp_res = self.sampler.sample( pos_paramsets.detach()) r_l = [] for i in range(len(pos_f_samples_param_samp_res)): r_l.append( pos_f_samples_param_samp_res[i].mean_undiscounted_return) pos_f_samples = to.tensor(r_l) neg_paramsets = negative_samples.reshape( self._n_mc_samples_gradient * self._dims, self._dims) neg_f_samples_param_samp_res = self.sampler.sample( neg_paramsets.detach()) r_l = [] for i in range(len(neg_f_samples_param_samp_res)): r_l.append( neg_f_samples_param_samp_res[i].mean_undiscounted_return) neg_f_samples = to.tensor(r_l) else: pos_f_samples = to.tensor( self.policy_return( positive_samples.reshape( self._n_mc_samples_gradient * self._dims, self._dims))) neg_f_samples = to.tensor( self.policy_return( negative_samples.reshape( self._n_mc_samples_gradient * self._dims, self._dims))) # Gradient batch # (B, D) delta_f = pos_f_samples - neg_f_samples grad = delta_f.reshape(dist_samples.shape[0], self._dims) / c assert grad.shape == dist_samples.shape return grad
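# --- Standalone sketch (not part of the original module) ---------------------
# Numerical check of the measure-valued derivative used in
# _mvd_grad_mean_gaussian_diagonal_covariance(): for a 1-D Gaussian N(mu, sigma^2),
# d/dmu E[f(x)] is estimated by (f(mu + sigma*w) - f(mu - sigma*w)) / (sqrt(2*pi)*sigma)
# with w ~ Weibull(scale=sqrt(2), concentration=2) and coupled positive/negative parts.
# For f(x) = x^2 the analytic gradient is 2*mu; all values below are illustrative.
import numpy as np
import torch as to
import torch.distributions as torchdist

to.manual_seed(0)
mu, sigma = to.tensor(1.5), to.tensor(0.7)
weibull = torchdist.weibull.Weibull(scale=np.sqrt(2.), concentration=2.)
w = weibull.sample((100_000,))

f = lambda x: x ** 2                                   # toy objective
grad_est = (f(mu + sigma * w) - f(mu - sigma * w)).mean() / (np.sqrt(2 * np.pi) * sigma)
print(grad_est.item(), (2 * mu).item())                # estimate vs. analytic gradient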
class CEM(ParameterExploring):
    r"""
    Cross-Entropy Method (CEM)

    This implementation is basically Algorithm 3.3. in [1] with the addition of decreasing noise [2].
    CEM is closely related to PoWER. The most significant difference is that the importance samples are not kept over
    iterations and that the covariance matrix is not scaled with the returns, thus allowing for negative returns.

    .. seealso::
        [1] P.T. de Boer, D.P. Kroese, S. Mannor, R.Y. Rubinstein, "A Tutorial on the Cross-Entropy Method",
            Annals OR, 2005
        [2] I. Szita, A. Lőrincz, "Learning Tetris Using the Noisy Cross-Entropy Method", Neural Computation, 2006
    """

    name: str = 'cem'

    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 pop_size: Optional[int],
                 num_rollouts: int,
                 num_is_samples: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 extra_expl_std_init: float = 0.,
                 extra_expl_decay_iter: int = 10,
                 full_cov: bool = False,
                 symm_sampling: bool = False,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling,
                               indirectly specifies the performance quantile $1 - \rho$ [1]
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param extra_expl_std_init: additional standard deviation for the parameter exploration added to the diagonal
                                    entries of the covariance matrix, set to 0 to disable this functionality
        :param extra_expl_decay_iter: limit for the linear decay of the additional standard deviation, i.e. last
                                      iteration in which the additional exploration noise is applied
        :param full_cov: pass `True` to compute a full covariance matrix for sampling the next policy parameter values,
                         else a diagonal covariance is used
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not extra_expl_std_init >= 0:
            raise pyrado.ValueErr(given=extra_expl_std_init, ge_constraint='0')
        if not extra_expl_decay_iter > 0:
            raise pyrado.ValueErr(given=extra_expl_decay_iter, g_constraint='0')

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        if not num_is_samples <= pop_size:
            raise pyrado.ValueErr(given=num_is_samples, le_constraint=pop_size)
        self.num_is_samples = int(num_is_samples)

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=full_cov,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Optionally add additional entropy
        self.extra_expl_decay_iter = extra_expl_decay_iter
        if isinstance(self._expl_strat.noise, DiagNormalNoise):
            self.extra_expl_std_init = to.ones_like(self._policy.param_values) * extra_expl_std_init
        elif isinstance(self._expl_strat.noise, FullNormalNoise):
            self.extra_expl_std_init = to.eye(self._policy.num_param) * extra_expl_std_init
        else:
            raise pyrado.TypeErr(
                msg='Additional exploration entropy is only implemented for Gaussian distributions, '
                    'i.e. DiagNormalNoise and FullNormalNoise')

    @to.no_grad()
    def update(self, param_results: ParameterSamplingResult, ret_avg_curr: float = None):
        # Average the return values over the rollouts
        rets_avg_ros = to.tensor(param_results.mean_returns)

        # Descending sort according to return values and the importance samples a.k.a. elites (see [1, p.12])
        idcs_dcs = to.argsort(rets_avg_ros, descending=True)
        idcs_dcs = idcs_dcs[:self.num_is_samples]
        rets_avg_is = rets_avg_ros[idcs_dcs]
        params_is = param_results.parameters[idcs_dcs, :]

        # Update the policy parameters from the mean importance samples
        self._policy.param_values = to.mean(params_is, dim=0)

        # Update the exploration covariance from the empirical variance of the importance samples
        if isinstance(self._expl_strat.noise, DiagNormalNoise):
            std_is = to.std(params_is, dim=0)
            extra_expl_std = self.extra_expl_std_init * max(
                1. - self._curr_iter / self.extra_expl_decay_iter, 0  # see [2, p.4]
            )
            self._expl_strat.noise.adapt(std=std_is + extra_expl_std)
        elif isinstance(self._expl_strat.noise, FullNormalNoise):
            cov_is = cov(params_is, data_along_rows=True)
            extra_expl_cov = to.pow(self.extra_expl_std_init, 2) * max(
                1. - self._curr_iter / self.extra_expl_decay_iter, 0  # see [2, p.4]
            )
            self._expl_strat.noise.adapt(cov=cov_is + extra_expl_cov)

        # Logging
        self.logger.add_value('median imp samp return', to.median(rets_avg_is), 4)
        self.logger.add_value('min imp samp return', to.min(rets_avg_is), 4)
        self.logger.add_value('min expl strat std', to.min(self._expl_strat.std), 4)
        self.logger.add_value('avg expl strat std', to.mean(self._expl_strat.std), 4)
        self.logger.add_value('max expl strat std', to.max(self._expl_strat.std), 4)
        self.logger.add_value('expl strat entropy', self._expl_strat.get_entropy(), 4)
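# --- Standalone toy sketch (not part of the original module) -----------------
# Illustrates the elite update implemented above: keep the best num_is_samples
# parameter sets, set the new mean to their average, and the new std to their
# empirical std plus a linearly decaying extra-exploration term [2]. The
# quadratic "return" and all hyperparameter values below are illustrative.
import torch as to

to.manual_seed(0)
theta = to.zeros(3)
theta_star = to.tensor([2., -1., 0.5])
std = to.ones(3)
extra_std_init, decay_iter, pop_size, num_is = 0.5, 10, 50, 10

for it in range(30):
    params = theta + std * to.randn(pop_size, 3)
    rets = -((params - theta_star) ** 2).sum(dim=1)              # may be negative, unlike PoWER
    elites = params[to.argsort(rets, descending=True)[:num_is]]  # importance samples a.k.a. elites
    theta = elites.mean(dim=0)
    std = elites.std(dim=0) + extra_std_init * max(1. - it / decay_iter, 0)
print(theta)  # should move close to theta_star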
class NES(ParameterExploring):
    """
    Simplified variant of Natural Evolution Strategies (NES)

    .. seealso::
        [1] D. Wierstra, T. Schaul, T. Glasmachers, Y. Sun, J. Peters, J. Schmidhuber, "Natural Evolution Strategies",
            JMLR, 2014
        [2] This implementation was inspired by
            https://github.com/pybrain/pybrain/blob/master/pybrain/optimization/distributionbased/snes.py
    """

    name: str = 'nes'

    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 num_rollouts: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 pop_size: int = None,
                 eta_mean: float = 1.,
                 eta_std: float = None,
                 symm_sampling: bool = False,
                 transform_returns: bool = True,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param pop_size: number of solutions in the population
        :param eta_mean: step size factor for the mean
        :param eta_std: step size factor for the standard deviation
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param transform_returns: use a rank-transformation of the returns to update the policy
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        # Call ParameterExploring's constructor
        super().__init__(save_dir,
                         env,
                         policy,
                         max_iter,
                         num_rollouts,
                         pop_size=pop_size,
                         num_workers=num_workers,
                         logger=logger)

        # Store the inputs
        self.transform_returns = transform_returns

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            # Symmetric buffer needs to have an even number of samples
            if self.pop_size % 2 != 0:
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Utility coefficients (ignored for transform_returns = False)
        # Use pop_size + 1 since we are also considering the current policy
        eta_std = eta_std if eta_std is not None else (3 + np.log(policy.num_param)) / np.sqrt(self.pop_size + 1) / 5.
        self.eta_mean_util, self.eta_std_util = self.compute_utilities(self.pop_size + 1, eta_mean, eta_std)

        # Learning rates [2]
        # Use pop_size + 1 since we are also considering the current policy
        self.lr_mean = 1. if transform_returns else 1e-2
        self.lr_std = 0.6 * (3 + np.log(self.pop_size + 1)) / 3. / np.sqrt(self.pop_size + 1)

    @staticmethod
    def compute_utilities(pop_size: Optional[int], eta_mean: float, eta_std: float):
        """
        Compute the utilities as described in section 3.1 of [1] (a.k.a. Hansen ranking with uniform baseline)

        :param pop_size: number of solutions in the population
        :param eta_mean: step size factor for the mean
        :param eta_std: step size factor for the standard deviation
        :return: utility coefficient for the mean, and utility coefficient for the standard deviation
        """
        # Compute common utility vector
        log_half = np.log(pop_size / 2. + 1)
        log_k = np.log(np.arange(1, pop_size + 1))
        num = np.maximum(0, log_half - log_k)
        utils = num / np.sum(num) - 1. / pop_size

        # Convert to PyTorch tensors
        eta_mean_util = to.from_numpy(eta_mean * utils).to(to.get_default_dtype())
        eta_std_util = to.from_numpy(eta_std / 2. * utils).to(to.get_default_dtype())
        return eta_mean_util, eta_std_util

    def update(self, param_results: ParameterSamplingResult, ret_avg_curr: float = None):
        # Average the return values over the rollouts
        rets_avg_ros = param_results.mean_returns

        # Get the perturbations (deltas from the current policy parameters)
        s = param_results.parameters - self._policy.param_values
        # also divide by the standard deviation to fully standardize
        s /= self._expl_strat.std

        if self.transform_returns:
            # Descending sort according to return values (the largest utility coefficients come first)
            idcs_dcs = np.argsort(rets_avg_ros)[::-1]
            s_dcs = s[list(idcs_dcs), :]

            # Update the mean (see [1, 2])
            delta_mean = self._expl_strat.std * (self.eta_mean_util @ s_dcs)
            self._policy.param_values += self.lr_mean * delta_mean

            # Update the std (see [1, 2])
            grad_std = self.eta_std_util @ (s_dcs**2 - 1.)
            new_std = self._expl_strat.std * to.exp(self.lr_std * grad_std / 2.)
            self._expl_strat.adapt(std=new_std)

        else:
            # Standardize averaged returns over all pop_size rollouts
            rets_stdized = standardize(rets_avg_ros)
            rets_stdized = to.from_numpy(rets_stdized).to(to.get_default_dtype())

            # delta_mean = 1./len(param_results) * (rets_stdized @ s)
            delta_mean = 1. / (self._expl_strat.std * len(param_results)) * (rets_stdized @ s)
            self._policy.param_values += self.lr_mean * delta_mean

            # Update the std (monotonous exponential decay)
            new_std = self._expl_strat.std * 0.999**self._curr_iter
            self._expl_strat.adapt(std=new_std)

        self.logger.add_value('min expl strat std', to.min(self._expl_strat.std), 4)
        self.logger.add_value('avg expl strat std', to.mean(self._expl_strat.std), 4)
        self.logger.add_value('max expl strat std', to.max(self._expl_strat.std), 4)
        self.logger.add_value('expl strat entropy', self._expl_strat.get_entropy(), 4)
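# --- Standalone toy sketch (not part of the original module) -----------------
# Illustrates the rank-based utilities and the separable NES update used above
# (cf. [1] and the SNES reference in [2]): perturbations are standardized, the
# raw returns are replaced by the utility coefficients, and the std is updated
# multiplicatively. The quadratic "return" and all values below are illustrative.
import numpy as np
import torch as to

to.manual_seed(0)
theta = to.zeros(4)
theta_star = to.tensor([1., 2., -1., 0.5])
std = to.ones(4)
pop_size = 20
lr_mean, lr_std = 1., 0.6 * (3 + np.log(pop_size)) / 3. / np.sqrt(pop_size)

# Hansen-style utilities: the best samples get positive weight, the rest about -1/pop_size
log_half, log_k = np.log(pop_size / 2. + 1), np.log(np.arange(1, pop_size + 1))
num = np.maximum(0, log_half - log_k)
utils = to.from_numpy(num / np.sum(num) - 1. / pop_size).to(to.get_default_dtype())

for it in range(100):
    s = to.randn(pop_size, 4)                                 # standardized perturbations
    rets = -((theta + std * s - theta_star) ** 2).sum(dim=1)
    s_dcs = s[to.argsort(rets, descending=True)]              # best first, matching the utilities
    theta = theta + lr_mean * std * (utils @ s_dcs)
    std = std * to.exp(lr_std * (utils @ (s_dcs ** 2 - 1.)) / 2.)
print(theta)  # should move close to theta_star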
class REPS(ParameterExploring): """ Episodic variant of Relative Entropy Policy Search (REPS) .. seealso:: [1] J. Peters, K. Mülling, Y. Altuen, "Relative Entropy Policy Search", AAAI, 2010 [2] This implementation was inspired by https://github.com/hanyas/rl/blob/master/rl/ereps/ereps.py """ name: str = 'reps' def __init__(self, save_dir: str, env: Env, policy: Policy, max_iter: int, eps: float, gamma: float, num_rollouts: int, pop_size: int, expl_std_init: float, expl_std_min: float = 0.01, symm_sampling: bool = False, num_sampler_envs: int = 4, num_epoch_dual: int = 1000, use_map: bool = False, grad_free_optim: bool = False, lr_dual: float = 5e-4, base_seed: int = None): """ Constructor :param save_dir: directory to save the snapshots i.e. the results in :param env: the environment which the policy operates :param policy: policy to be updated :param eps: bound on the KL divergence between policy updates, e.g. 0.1 :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs :param gamma: temporal discount factor; equal to 1 - reset probability :param pop_size: number of solutions in the population :param num_rollouts: number of rollouts per per policy sample :param expl_std_init: initial standard deviation for the exploration strategy :param expl_std_min: minimal standard deviation for the exploration strategy :param symm_sampling: use an exploration strategy which samples symmetric populations :param num_epoch_dual: number of epochs for the minimization of the dual function :param use_map: use maximum a-posteriori likelihood (`True`) or maximum likelihood (`False`) update rule :param grad_free_optim: use a derivative free optimizer (e.g. golden section search) or a SGD-based optimizer :param lr_dual: learning rate for the dual's optimizer (ignored if `grad_free_optim = True`) :param base_seed: seed added to all other seeds in order to make the experiments distinct but repeatable """ if not isinstance(policy, LinearPolicy): warn('REPS is designed for linear policies only!', UserWarning) # Call ParameterExploring's constructor super().__init__( save_dir, env, policy, max_iter, num_rollouts, pop_size=pop_size, base_seed=base_seed, num_sampler_envs=num_sampler_envs, ) # Store the inputs self.eps = eps self.gamma = gamma self.base_seed = base_seed self.use_map = use_map # Explore using normal noise self._expl_strat = NormalParamNoise( self._policy.num_param, full_cov=True, std_init=expl_std_init, std_min=expl_std_min, ) if symm_sampling: # Exploration strategy based on symmetrical normally distributed noise if self.pop_size % 2 != 0: # Symmetric buffer needs to have an even number of samples self.pop_size += 1 self._expl_strat = SymmParamExplStrat(self._expl_strat) self.kappa = to.tensor([0.], requires_grad=True) # eta = exp(kappa) self._exp_min = -700. self._exp_max = 700. # Dual specific if grad_free_optim: self.optim_dual = GSS( [{'params': self.kappa}], param_min=to.log(to.tensor([1e-4])), param_max=to.log(to.tensor([1e4])) ) else: self.optim_dual = to.optim.Adam([{'params': self.kappa}], lr=lr_dual, eps=1e-5) # self.optim_dual = to.optim.SGD([{'params': self.kappa}], lr=lr_dual, momentum=0.7, weight_decay=1e-4) self.num_epoch_dual = num_epoch_dual @property def eta(self) -> to.Tensor: r""" Get $\eta = e^{\kappa}$. 
""" return to.exp(self.kappa) def weights(self, rets: to.Tensor) -> to.Tensor: """ Compute the wights which are used to weights thy policy samples by their return :param rets: return values per policy sample after averaging over multiple rollouts using the same policy """ shifted_rets = rets - to.max(rets) return to.exp(to.clamp(shifted_rets / self.eta, self._exp_min, self._exp_max)) def dual(self, rets: to.Tensor) -> to.Tensor: """ Compute the REPS dual function value. :param: dual loss value """ w = self.weights(rets) return self.eta * self.eps + to.max(rets) + self.eta * to.log(to.mean(w)) def policy_dual(self, param_samples: to.Tensor, w: to.Tensor) -> to.Tensor: """ Compute the REPS policy-dual function value. :param param_samples: :param w: sample weights :return: dual loss value """ distr_old = MultivariateNormal(self._policy.param_values, self._expl_strat.cov) self.wml(param_samples, w, eta=self.eta) distr_new = MultivariateNormal(self._policy.param_values, self._expl_strat.cov) logprobs = distr_new.log_prob(param_samples) kl_e = kl_divergence(distr_new, distr_old) # mode seeking a.k.a. exclusive KL return w @ logprobs + self.eta * (self.eps - kl_e) def minimize(self, loss_fcn: Callable, rets: to.Tensor = None, param_samples: to.Tensor = None, w: to.Tensor = None): """ Minimize the given dual function. Iterate num_epoch_dual times. :param loss_fcn: function to minimize :param rets: return values per policy sample after averaging over multiple rollouts using the same policy :param param_samples: all sampled policy parameters :param w: sample weights """ if isinstance(self.optim_dual, GSS): self.optim_dual.reset() for _ in tqdm(range(self.num_epoch_dual), total=self.num_epoch_dual, desc=f'Minimizing dual', unit='epochs', file=sys.stdout, leave=False): if not isinstance(self.optim_dual, GSS): # Reset the gradients self.optim_dual.zero_grad() # Compute value function loss if rets is not None and param_samples is None and w is None: loss = loss_fcn(rets) # dual elif rets is None and param_samples is not None and w is not None: loss = loss_fcn(param_samples, w) # policy dual else: raise NotImplementedError # Update the parameter if isinstance(self.optim_dual, GSS): if rets is not None and param_samples is None and w is None: self.optim_dual.step(closure=functools.partial(loss_fcn, rets=rets)) elif rets is None and param_samples is not None and w is not None: self.optim_dual.step(closure=functools.partial(loss_fcn, param_samples=param_samples, w=w)) else: raise NotImplementedError else: loss.backward() self.optim_dual.step() if to.isnan(self.kappa): raise RuntimeError(f"The dual's optimization parameter kappa became NaN!") def wml(self, param_samples: to.Tensor, w: to.Tensor, eta: to.Tensor = to.tensor([0.])): """ Weighted maximum likelihood update of the policy's mean and the exploration strategy's covariance :param param_samples: all sampled policy parameters :param w: sample weights :param eta: dual parameters """ mean_old = self._policy.param_values.clone() cov_old = self._expl_strat.cov.clone() # Update the mean self._policy.param_values = (eta * mean_old + to.sum(w.view(-1, 1) * param_samples, dim=0)) / (to.sum(w) + eta) param_values_delta = self._policy.param_values - mean_old # Difference between all sampled policy parameters and the updated policy diff = param_samples - self._policy.param_values w_diff = to.einsum('nk,n,nh->kh', diff, w, diff) # outer product of scaled diff, then sum over all samples # Update the covariance cov_new = (w_diff + eta * cov_old + eta * 
    def wml(self, param_samples: to.Tensor, w: to.Tensor, eta: to.Tensor = to.tensor([0.])):
        """
        Weighted maximum likelihood update of the policy's mean and the exploration strategy's covariance

        :param param_samples: all sampled policy parameters
        :param w: sample weights
        :param eta: Lagrangian multiplier (dual parameter)
        """
        mean_old = self._policy.param_values.clone()
        cov_old = self._expl_strat.cov.clone()

        # Update the mean
        self._policy.param_values = (eta * mean_old + to.sum(w.view(-1, 1) * param_samples, dim=0)) / (to.sum(w) + eta)
        param_values_delta = self._policy.param_values - mean_old

        # Difference between all sampled policy parameters and the updated policy
        diff = param_samples - self._policy.param_values
        w_diff = to.einsum('nk,n,nh->kh', diff, w, diff)  # outer product of scaled diff, then sum over all samples

        # Update the covariance
        cov_new = (w_diff + eta * cov_old + eta * to.einsum('k,h->kh', param_values_delta, param_values_delta)
                   ) / (to.sum(w) + eta)
        self._expl_strat.adapt(cov=cov_new)

    def wmap(self, param_samples: to.Tensor, w: to.Tensor):
        """
        Weighted maximum a-posteriori likelihood update of the policy's mean and the exploration strategy's covariance

        :param param_samples: all sampled policy parameters
        :param w: sample weights
        """
        # Optimize for eta
        self.minimize(self.policy_dual, param_samples=param_samples, w=w.detach())

        # Update the policy parameters
        self.wml(param_samples, w.detach(), eta=self.eta)

    def update(self, param_results: ParameterSamplingResult, ret_avg_curr: float = None):
        # Average the return values over the rollouts
        rets_avg_ros = param_results.mean_returns
        rets_avg_ros = to.from_numpy(rets_avg_ros)

        # Reset the dual's parameter
        self.kappa.data.fill_(0.)

        # Dual
        with to.no_grad():
            distr_old = MultivariateNormal(self._policy.param_values, self._expl_strat.cov)
            loss = self.dual(rets_avg_ros)
            self.logger.add_value('dual loss before', loss.item())

        self.minimize(self.dual, rets=rets_avg_ros)

        with to.no_grad():
            loss = self.dual(rets_avg_ros)
            self.logger.add_value('dual loss after', loss.item())
            self.logger.add_value('eta', self.eta.item())

        # Compute the weights using the optimized eta
        w = self.weights(rets_avg_ros)

        # Update the policy's mean and the exploration strategy's covariance
        if self.use_map:
            self.wmap(param_results.parameters, w)
        else:
            self.wml(param_results.parameters, w, eta=self.eta)

        # Logging
        distr_new = MultivariateNormal(self._policy.param_values, self._expl_strat.cov)
        kl_e = kl_divergence(distr_new, distr_old)  # mode seeking a.k.a. exclusive KL
        kl_i = kl_divergence(distr_old, distr_new)  # mean seeking a.k.a. inclusive KL

        self.logger.add_value('min expl strat std', to.min(self._expl_strat.std))
        self.logger.add_value('avg expl strat std', to.mean(self._expl_strat.std.data).detach().numpy())
        self.logger.add_value('max expl strat std', to.max(self._expl_strat.std))
        self.logger.add_value('expl strat entropy', self._expl_strat.get_entropy().item())
        self.logger.add_value('KL(new_old)', kl_e.item())
        self.logger.add_value('KL(old_new)', kl_i.item())
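# ------------------------------------------------------------------------------------------------------------------
# Illustrative sketch (added for exposition, not part of the original implementation): a standalone toy example of
# how the temperature eta, obtained by minimizing the dual above, controls the exponential weighting of the sampled
# policy parameters. The helper name `_demo_reps_weighting` and the synthetic returns are assumptions made purely
# for this example.
# ------------------------------------------------------------------------------------------------------------------
def _demo_reps_weighting(eps: float = 0.1):
    import numpy as np
    from scipy import optimize

    rets = np.array([1.0, 2.0, 5.0, 3.0, 0.5])  # synthetic averaged returns, one per sampled parameter set

    def dual(eta_arr: np.ndarray) -> float:
        # g(eta) = eta * eps + max(R) + eta * log(mean(exp((R - max(R)) / eta))), cf. REPS.dual() above
        eta = float(eta_arr[0])
        shifted = (rets - np.max(rets)) / eta
        return eta * eps + np.max(rets) + eta * np.log(np.mean(np.exp(shifted)))

    res = optimize.minimize(dual, x0=np.array([1.0]), method='SLSQP', bounds=((1e-8, 1e8),))
    eta_opt = float(res.x[0])

    # Exponential transform of the shifted returns, cf. REPS.weights() above (normalized here for readability)
    w = np.exp((rets - np.max(rets)) / eta_opt)
    w /= np.sum(w)
    print(f'optimal eta = {eta_opt:.4f}, normalized weights = {w}')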
class REPS(ParameterExploring):
    """
    Episodic variant of Relative Entropy Policy Search (REPS)

    .. note::
        REPS was designed for linear policies.

    .. seealso::
        [1] J. Peters, K. Mülling, Y. Altun, "Relative Entropy Policy Search", AAAI, 2010
        [2] A. Abdolmaleki, J.T. Springenberg, J. Degrave, S. Bohez, Y. Tassa, D. Belov, N. Heess, M. Riedmiller,
            "Relative Entropy Regularized Policy Iteration", arXiv, 2018
        [3] This implementation is inspired by the work of H. Abdulsamad
            https://github.com/hanyas/rl/blob/master/rl/ereps/ereps.py
    """

    name: Optional[str] = "reps"

    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env: Env,
        policy: Policy,
        max_iter: int,
        eps: float,
        num_init_states_per_domain: int,
        pop_size: Optional[int],
        expl_std_init: float,
        expl_std_min: float = 0.01,
        num_domains: int = 1,
        symm_sampling: bool = False,
        softmax_transform: bool = False,
        use_map: bool = True,
        optim_mode: Optional[str] = "scipy",
        num_epoch_dual: int = 1000,
        lr_dual: float = 5e-4,
        num_workers: int = 4,
        logger: Optional[StepLogger] = None,
    ):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param eps: bound on the KL divergence between policy updates, e.g. 0.1
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_init_states_per_domain: number of rollouts to cover the variance over initial states
        :param num_domains: number of rollouts due to the variance over domain parameters
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param softmax_transform: pass `True` to use a softmax to transform the returns, else use a shifted exponential
        :param use_map: use maximum a-posteriori likelihood (`True`) or maximum likelihood (`False`) update rule
        :param optim_mode: choose the type of optimizer: 'torch' for a SGD-based optimizer or 'scipy' for the SLSQP
                           optimizer from scipy (recommended)
        :param num_epoch_dual: number of epochs for the minimization of the dual functions, ignored if
                               `optim_mode = 'scipy'`
        :param lr_dual: learning rate for the dual's optimizer, ignored if `optim_mode = 'scipy'`
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(policy, (LinearPolicy, DomainDistrParamPolicy)):
            print_cbt_once("REPS was designed for linear policies.", "y")

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir=save_dir,
            env=env,
            policy=policy,
            max_iter=max_iter,
            num_init_states_per_domain=num_init_states_per_domain,
            num_domains=num_domains,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        # Store the inputs
        self.eps = eps
        self.softmax_transform = softmax_transform
        self.use_map = use_map

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
            use_cuda=self._policy.device != "cpu",
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Dual optimization
        self.num_epoch_dual = num_epoch_dual
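        # NOTE (added explanation, not part of the original implementation): eta is stored as log(eta) so that the
        # Lagrange multiplier stays strictly positive when the 'torch' optimizer updates it; the 'scipy' path instead
        # optimizes eta directly under box bounds and writes the result back as log(eta), see minimize().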
        self._log_eta = to.tensor([0.0], requires_grad=True)
        self.optim_mode = optim_mode.lower()
        if self.optim_mode == "scipy":
            pass
        elif self.optim_mode == "torch":
            self.optim_dual = to.optim.SGD([{"params": self._log_eta}], lr=lr_dual, momentum=0.8, weight_decay=1e-4)
            # self.optim_dual = to.optim.Adam([{'params': self._log_eta}], lr=lr_dual, eps=1e-5)  # used in [2], but unstable here
        else:
            raise pyrado.ValueErr(given=optim_mode, eq_constraint=["scipy", "torch"])

    @property
    def eta(self) -> to.Tensor:
        r"""Get the Lagrange multiplier $\eta$. In [2], $\eta$ is called $\alpha$."""
        return to.exp(self._log_eta)

    def weights(self, rets: to.Tensor) -> to.Tensor:
        """
        Compute the weights which are used to weight the policy samples by their return.
        As stated in [2, sec 4.1], we could calculate the weights using any rank-preserving transformation.

        :param rets: return values per policy sample after averaging over multiple rollouts using the same policy
        :return: weights of the policy parameter samples
        """
        if self.softmax_transform:
            # Do softmax transform (softmax from PyTorch is already numerically stable)
            return to.softmax(rets / self.eta, dim=0)
        else:
            # Do numerically stabilized exp transform
            return to.exp(to.clamp((rets - to.max(rets)) / self.eta, min=-700.0))

    def dual_evaluation(
        self, eta: Union[to.Tensor, np.ndarray], rets: Union[to.Tensor, np.ndarray]
    ) -> Union[to.Tensor, np.ndarray]:
        """
        Compute the REPS dual function value for policy evaluation.

        :param eta: Lagrangian multiplier (optimization variable of the dual)
        :param rets: return values per policy sample after averaging over multiple rollouts using the same policy
        :return: dual loss value
        """
        if not (
            isinstance(eta, to.Tensor)
            and isinstance(rets, to.Tensor)
            or isinstance(eta, np.ndarray)
            and isinstance(rets, np.ndarray)
        ):
            raise pyrado.TypeErr(msg="Expected eta and rets to be either both PyTorch tensors or both numpy arrays!")
        return eta * self.eps + eta * logmeanexp(rets / eta)

    def dual_improvement(
        self, eta: Union[to.Tensor, np.ndarray], param_samples: to.Tensor, w: to.Tensor
    ) -> Union[to.Tensor, np.ndarray]:
        """
        Compute the REPS dual function value for policy improvement.

        :param eta: Lagrangian multiplier (optimization variable of the dual)
        :param param_samples: all sampled policy parameters
        :param w: weights of the policy parameter samples
        :return: dual loss value
        """
        # The sample weights have been computed by minimizing dual_evaluation, don't track the gradient twice
        assert w.requires_grad is False

        with to.no_grad():
            distr_old = MultivariateNormal(self._policy.param_values, self._expl_strat.cov.data)

        if self.optim_mode == "scipy" and not isinstance(eta, to.Tensor):
            # We can arrive here during the 'normal' REPS routine, but also when computing the gradient (jac) for
            # the scipy optimizer. In the latter case, eta is already a tensor.
            eta = to.from_numpy(eta).to(to.get_default_dtype())

        self.wml(eta, param_samples, w)

        distr_new = MultivariateNormal(self._policy.param_values, self._expl_strat.cov.data)
        logprobs = distr_new.log_prob(param_samples)
        kl = kl_divergence(distr_new, distr_old)  # mode seeking a.k.a. exclusive KL

        if self.optim_mode == "scipy":
            loss = w.numpy() @ logprobs.numpy() + eta * (self.eps - kl.numpy())
        else:
            loss = w @ logprobs + eta * (self.eps - kl)
        return loss
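    # NOTE (added explanation, not part of the original implementation): minimize() below optimizes the temperature
    # eta for either of the two duals above. With optim_mode == 'scipy', eta itself is the optimization variable
    # (SLSQP with box bounds, gradients obtained through PyTorch via get_grad_via_torch); with optim_mode == 'torch',
    # log(eta) is optimized by SGD for num_epoch_dual epochs, which keeps eta positive without explicit constraints.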
    def minimize(
        self, loss_fcn: Callable, rets: to.Tensor = None, param_samples: to.Tensor = None, w: to.Tensor = None
    ):
        """
        Minimize the given dual function. This function can be called for the dual evaluation loss or the dual
        improvement loss.

        :param loss_fcn: function to minimize, different for `wml()` and `wmap()`
        :param rets: return values per policy sample after averaging over multiple rollouts using the same policy
        :param param_samples: all sampled policy parameters
        :param w: weights of the policy parameter samples
        """
        if self.optim_mode == "scipy":
            # Use scipy optimizers
            if loss_fcn == self.dual_evaluation:
                res = optimize.minimize(
                    partial(self.dual_evaluation, rets=rets.numpy()),
                    jac=partial(get_grad_via_torch, fcn_to=partial(self.dual_evaluation, rets=rets)),
                    x0=np.array([1.0]),
                    method="SLSQP",
                    bounds=((1e-8, 1e8),),
                )
            elif loss_fcn == self.dual_improvement:
                res = optimize.minimize(
                    partial(self.dual_improvement, param_samples=param_samples, w=w),
                    jac=partial(
                        get_grad_via_torch, fcn_to=partial(self.dual_improvement, param_samples=param_samples, w=w)
                    ),
                    x0=np.array([1.0]),
                    method="SLSQP",
                    bounds=((1e-8, 1e8),),
                )
            else:
                raise pyrado.TypeErr(msg="Received an improper loss function in REPS.minimize()!")
            eta = to.from_numpy(res["x"]).to(to.get_default_dtype())
            self._log_eta = to.log(eta)

        else:
            for _ in tqdm(
                range(self.num_epoch_dual),
                total=self.num_epoch_dual,
                desc="Minimizing dual",
                unit="epochs",
                file=sys.stdout,
                leave=False,
            ):
                # Use PyTorch optimizers
                self.optim_dual.zero_grad()
                if loss_fcn == self.dual_evaluation:
                    loss = self.dual_evaluation(self.eta, rets)
                elif loss_fcn == self.dual_improvement:
                    loss = self.dual_improvement(self.eta, param_samples, w)
                else:
                    raise pyrado.TypeErr(msg="Received an improper loss function in REPS.minimize()!")
                loss.backward()
                self.optim_dual.step()

            if to.isnan(self._log_eta):
                raise RuntimeError("The dual's optimization parameter _log_eta became NaN!")

    def wml(self, eta: to.Tensor, param_samples: to.Tensor, w: to.Tensor):
        """
        Weighted maximum likelihood update of the policy's mean and the exploration strategy's covariance

        :param eta: Lagrangian multiplier (optimization variable of the dual)
        :param param_samples: all sampled policy parameters
        :param w: weights of the policy parameter samples
        """
        mean_old = self._policy.param_values.clone()
        cov_old = self._expl_strat.cov.clone()

        # Update the mean
        w_sum_param_samples = to.einsum("k,kh->h", w, param_samples)
        self._policy.param_values = (eta * mean_old + w_sum_param_samples) / (to.sum(w) + eta)
        param_values_delta = self._policy.param_values - mean_old

        # Difference between all sampled policy parameters and the updated policy
        diff = param_samples - self._policy.param_values
        w_diff = to.einsum("nk,n,nh->kh", diff, w, diff)  # outer product of scaled diff, then sum over all samples

        # Update the covariance
        cov_new = (w_diff + eta * cov_old + eta * to.einsum("k,h->kh", param_values_delta, param_values_delta)) / (
            to.sum(w) + eta
        )
        self._expl_strat.adapt(cov=cov_new)

    def wmap(self, param_samples: to.Tensor, w: to.Tensor):
        """
        Weighted maximum a-posteriori likelihood update of the policy's mean and the exploration strategy's covariance

        :param param_samples: all sampled policy parameters
        :param w: weights of the policy parameter samples
        """
        # Optimize eta according to the policy's dual function to satisfy the KL constraint
        self.minimize(self.dual_improvement, param_samples=param_samples, w=w.detach())

        # Update the policy's and exploration strategy's parameters
        self.wml(self.eta, param_samples, w.detach())
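    # NOTE (added explanation, not part of the original implementation): one policy update below proceeds as
    # (1) optimize eta on the evaluation dual given the averaged returns, (2) compute the sample weights with this
    # eta, (3) update the mean and covariance either via a single weighted ML step (use_map == False) or, for the
    # MAP variant, by first re-optimizing eta on the improvement dual inside wmap() and then calling wml().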
    def update(self, param_results: ParameterSamplingResult, ret_avg_curr: Optional[float] = None):
        # Average the return values over the rollouts
        rets_avg_ros = param_results.mean_returns
        rets_avg_ros = to.from_numpy(rets_avg_ros).to(to.get_default_dtype())

        with to.no_grad():
            distr_old = MultivariateNormal(self._policy.param_values, self._expl_strat.cov.data)
            loss = self.dual_evaluation(self.eta, rets_avg_ros)
            self.logger.add_value("dual loss before", loss, 4)

        # Reset the dual's parameter
        self._log_eta.data.fill_(0.0)

        # Optimize eta
        self.minimize(self.dual_evaluation, rets=rets_avg_ros)

        with to.no_grad():
            loss = self.dual_evaluation(self.eta, rets_avg_ros)
            self.logger.add_value("dual loss after", loss, 4)
            self.logger.add_value("eta", self.eta, 4)

        # Compute the weights using the optimized eta
        w = self.weights(rets_avg_ros)

        # Update the policy's mean and the exploration strategy's covariance
        if self.use_map:
            self.wmap(param_results.parameters, w)  # internally re-optimizes eta, then calls wml()
        else:
            self.wml(self.eta, param_results.parameters, w)

        # Logging
        distr_new = MultivariateNormal(self._policy.param_values, self._expl_strat.cov.data)
        kl_e = kl_divergence(distr_new, distr_old)  # mode seeking a.k.a. exclusive KL
        kl_i = kl_divergence(distr_old, distr_new)  # mean seeking a.k.a. inclusive KL

        self.logger.add_value("min expl strat std", to.min(self._expl_strat.std), 4)
        self.logger.add_value("avg expl strat std", to.mean(self._expl_strat.std), 4)
        self.logger.add_value("max expl strat std", to.max(self._expl_strat.std), 4)
        self.logger.add_value("expl strat entropy", self._expl_strat.get_entropy(), 4)
        self.logger.add_value("KL(new_old)", kl_e, 6)
        self.logger.add_value("KL(old_new)", kl_i, 6)
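# ------------------------------------------------------------------------------------------------------------------
# Illustrative sketch (added for exposition, not part of the original implementation): a standalone toy example of
# the weighted maximum likelihood update implemented in REPS.wml(). The names and numbers below are assumptions made
# purely for this example; larger values of eta keep the updated mean and covariance closer to the previous search
# distribution.
# ------------------------------------------------------------------------------------------------------------------
if __name__ == "__main__":
    import torch

    torch.manual_seed(0)
    num_samples, num_param = 50, 3
    mean_old = torch.zeros(num_param)
    cov_old = torch.eye(num_param)

    # Sample policy parameters from the old search distribution and assign synthetic returns
    param_samples = torch.distributions.MultivariateNormal(mean_old, cov_old).sample((num_samples,))
    rets = -torch.sum((param_samples - 1.0) ** 2, dim=1)  # toy returns, maximal for parameters equal to one

    for eta in (0.1, 10.0):
        eta_t = torch.tensor(eta)
        w = torch.exp((rets - torch.max(rets)) / eta_t)  # exp transform of the shifted returns, cf. weights()

        # Weighted ML update of the mean, cf. wml()
        mean_new = (eta_t * mean_old + torch.einsum("k,kh->h", w, param_samples)) / (torch.sum(w) + eta_t)

        # Weighted ML update of the covariance, regularized towards the old covariance and the mean shift
        diff = param_samples - mean_new
        w_diff = torch.einsum("nk,n,nh->kh", diff, w, diff)
        delta = mean_new - mean_old
        cov_new = (w_diff + eta_t * cov_old + eta_t * torch.outer(delta, delta)) / (torch.sum(w) + eta_t)

        print(f"eta = {eta:5.1f} -> new mean = {mean_new.numpy()}")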