def __init__(self,
             save_dir: str,
             env: Env,
             policy: Policy,
             max_iter: int,
             pop_size: Optional[int],
             num_rollouts: int,
             num_is_samples: int,
             expl_std_init: float,
             expl_std_min: float = 0.01,
             symm_sampling: bool = False,
             num_workers: int = 4,
             logger: Optional[StepLogger] = None):
    r"""
    Constructor

    :param save_dir: directory to save the snapshots, i.e. the results, in
    :param env: the environment in which the policy operates
    :param policy: policy to be updated
    :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
    :param pop_size: number of solutions in the population
    :param num_rollouts: number of rollouts per policy sample
    :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling
    :param expl_std_init: initial standard deviation for the exploration strategy
    :param expl_std_min: minimal standard deviation for the exploration strategy
    :param symm_sampling: use an exploration strategy which samples symmetric populations
    :param num_workers: number of environments for parallel sampling
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    """
    if not isinstance(policy, LinearPolicy):
        print_cbt_once('PoWER was designed for linear policies.', 'y')

    # Call ParameterExploring's constructor
    super().__init__(
        save_dir,
        env,
        policy,
        max_iter,
        num_rollouts,
        pop_size=pop_size,
        num_workers=num_workers,
        logger=logger,
    )

    # Explore using normal noise
    self._expl_strat = NormalParamNoise(
        self._policy.num_param,
        full_cov=True,
        std_init=expl_std_init,
        std_min=expl_std_min,
    )
    if symm_sampling:
        # Exploration strategy based on symmetrical normally distributed noise
        if self.pop_size % 2 != 0:
            # Symmetric buffer needs to have an even number of samples
            self.pop_size += 1
        self._expl_strat = SymmParamExplStrat(self._expl_strat)

    # Initialize memory for importance sampling
    self.num_is_samples = min(pop_size, num_is_samples)
    self.is_mem_ret = 1e-6 * to.ones(self.num_is_samples)  # has to be initialized > 0 due to first covariance update
    self.is_mem_params = to.zeros(self.num_is_samples, self._policy.num_param)
    self.is_mem_W = to.zeros(self.num_is_samples, self._policy.num_param, self._policy.num_param)
def __init__(
    self,
    save_dir: pyrado.PathLike,
    env: Env,
    policy: Policy,
    max_iter: int,
    pop_size: Optional[int],
    num_init_states_per_domain: int,
    num_is_samples: int,
    expl_std_init: float,
    expl_std_min: float = 0.01,
    num_domains: int = 1,
    symm_sampling: bool = False,
    num_workers: int = 4,
    logger: Optional[StepLogger] = None,
):
    r"""
    Constructor

    :param save_dir: directory to save the snapshots, i.e. the results, in
    :param env: the environment in which the policy operates
    :param policy: policy to be updated
    :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
    :param pop_size: number of solutions in the population
    :param num_init_states_per_domain: number of rollouts to cover the variance over initial states
    :param num_domains: number of rollouts due to the variance over domain parameters
    :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling
    :param expl_std_init: initial standard deviation for the exploration strategy
    :param expl_std_min: minimal standard deviation for the exploration strategy
    :param symm_sampling: use an exploration strategy which samples symmetric populations
    :param num_workers: number of environments for parallel sampling
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    """
    # Call ParameterExploring's constructor
    super().__init__(
        save_dir=save_dir,
        env=env,
        policy=policy,
        max_iter=max_iter,
        num_init_states_per_domain=num_init_states_per_domain,
        num_domains=num_domains,
        pop_size=pop_size,
        num_workers=num_workers,
        logger=logger,
    )

    # Explore using normal noise
    self._expl_strat = NormalParamNoise(
        self._policy.num_param,
        full_cov=True,
        std_init=expl_std_init,
        std_min=expl_std_min,
        use_cuda=policy.device != "cpu",
    )
    if symm_sampling:
        # Exploration strategy based on symmetrical normally distributed noise
        if self.pop_size % 2 != 0:
            # Symmetric buffer needs to have an even number of samples
            self.pop_size += 1
        self._expl_strat = SymmParamExplStrat(self._expl_strat)

    # Initialize memory for importance sampling
    self._bound_lo_ret = 1e-3  # the returns must not be negative, clip them to this value if so
    self.num_is_samples = min(pop_size, num_is_samples)
    self.is_mem_ret = self._bound_lo_ret * to.ones(self.num_is_samples)  # has to be initialized > 0 due to first covariance update
    self.is_mem_params = to.zeros(self.num_is_samples, self._policy.num_param)
    self.is_mem_W = to.zeros(self.num_is_samples, self._policy.num_param, self._policy.num_param)
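# Context: a minimal standalone sketch (an assumption for illustration, not this class's
# actual method) of the episodic PoWER mean update that the importance sampling memory
# above feeds into. The new mean is the return-weighted average of the remembered
# parameter samples; the per-sample weighting matrices stored in is_mem_W are omitted
# here for brevity.
import torch as to

def power_mean_update(mean: to.Tensor, is_params: to.Tensor, is_rets: to.Tensor) -> to.Tensor:
    """
    :param mean: current policy parameter mean, shape (num_param,)
    :param is_params: remembered parameter sets, shape (num_is_samples, num_param)
    :param is_rets: associated returns, clipped to be strictly positive, shape (num_is_samples,)
    :return: updated policy parameter mean
    """
    eps = is_params - mean  # deviations of the remembered samples from the current mean
    # Return-weighted average of the deviations; well-defined since all returns are > 0
    return mean + (is_rets.unsqueeze(1) * eps).sum(dim=0) / is_rets.sum()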
def __init__(self,
             save_dir: str,
             env: Env,
             policy: Policy,
             max_iter: int,
             num_rollouts: int,
             expl_std_init: float,
             expl_std_min: float = 0.01,
             pop_size: Optional[int] = None,
             clip_ratio_std: float = 0.05,
             normalize_update: bool = False,
             transform_returns: bool = True,
             lr: float = 5e-4,
             num_workers: int = 4,
             logger: Optional[StepLogger] = None):
    r"""
    Constructor

    :param save_dir: directory to save the snapshots, i.e. the results, in
    :param env: the environment in which the policy operates
    :param policy: policy to be updated
    :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
    :param num_rollouts: number of rollouts per policy sample
    :param expl_std_init: initial standard deviation for the exploration strategy
    :param expl_std_min: minimal standard deviation for the exploration strategy
    :param pop_size: number of solutions in the population
    :param clip_ratio_std: maximal ratio for the change of the exploration strategy's standard deviation
    :param transform_returns: use a rank-transformation of the returns to update the policy
    :param lr: learning rate
    :param num_workers: number of environments for parallel sampling
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    """
    # Call ParameterExploring's constructor
    super().__init__(save_dir,
                     env,
                     policy,
                     max_iter,
                     num_rollouts,
                     pop_size=pop_size,
                     num_workers=num_workers,
                     logger=logger)

    # Store the inputs
    self.clip_ratio_std = clip_ratio_std
    self.normalize_update = normalize_update
    self.transform_returns = transform_returns
    self.lr = lr

    # Exploration strategy based on symmetrical normally distributed noise
    if self.pop_size % 2 != 0:
        # Symmetric buffer needs to have an even number of samples
        self.pop_size += 1
    self._expl_strat = SymmParamExplStrat(
        NormalParamNoise(
            self._policy.num_param,
            std_init=expl_std_init,
            std_min=expl_std_min,
        ))

    self.optim = to.optim.SGD([{'params': self._policy.parameters()}], lr=lr, momentum=0.8, dampening=0.1)
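# Context: a sketch of a rank transformation as referenced by transform_returns. Ranks
# replace the raw returns, which makes the update invariant to the returns' scale and
# robust to outliers. The linear mapping to [-0.5, 0.5] is one common choice and an
# assumption here, not necessarily the exact transformation used by this implementation.
import torch as to

def rank_transform(rets: to.Tensor) -> to.Tensor:
    ranks = to.argsort(to.argsort(rets)).to(dtype=to.get_default_dtype())  # rank 0 = worst return
    return ranks / (rets.numel() - 1) - 0.5  # map ranks linearly to [-0.5, 0.5]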
def __init__(self,
             save_dir: str,
             env: Env,
             policy: Policy,
             distribution,
             max_iter: int,
             num_rollouts: int,
             expl_std_init: float,
             expl_std_min: float = 0.01,
             pop_size: Optional[int] = None,
             clip_ratio_std: float = 0.05,
             normalize_update: bool = False,
             transform_returns: bool = True,
             num_sampler_envs: int = 4,
             n_mc_samples_gradient: int = 1,
             coupling: bool = True,
             real_env: bool = False,
             lr: float = 5e-4,
             optim: str = 'SGD',
             base_seed: Optional[int] = None):
    """
    Constructor

    :param save_dir: directory to save the snapshots, i.e. the results, in
    :param env: the environment in which the policy operates
    :param policy: policy to be updated
    :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
    :param num_rollouts: number of rollouts per policy sample
    :param expl_std_init: initial standard deviation for the exploration strategy
    :param expl_std_min: minimal standard deviation for the exploration strategy
    :param pop_size: number of solutions in the population
    :param clip_ratio_std: maximal ratio for the change of the exploration strategy's standard deviation
    :param transform_returns: use a rank-transformation of the returns to update the policy
    :param num_sampler_envs: number of environments for parallel sampling
    :param lr: learning rate
    :param optim: name of the optimizer to use, either 'SGD' or 'Adam'
    :param base_seed: seed added to all other seeds in order to make the experiments distinct but repeatable
    """
    # Call ParameterExploring's constructor
    super().__init__(
        save_dir,
        env,
        policy,
        max_iter,
        num_rollouts,
        pop_size=pop_size,
        base_seed=base_seed,
        num_sampler_envs=num_sampler_envs,
    )

    self._distribution = distribution
    self._dims = distribution.get_number_of_dims()
    self._n_mc_samples_gradient = n_mc_samples_gradient
    self._coupling = coupling
    self._real_env = real_env

    # Store the inputs
    self.clip_ratio_std = clip_ratio_std
    self.normalize_update = normalize_update
    self.transform_returns = transform_returns
    self.lr = lr

    # Exploration strategy based on symmetrical normally distributed noise
    if self.pop_size % 2 != 0:
        # Symmetric buffer needs to have an even number of samples
        self.pop_size += 1
    self._expl_strat = SymmParamExplStrat(
        NormalParamNoise(
            self._policy.num_param,
            std_init=expl_std_init,
            std_min=expl_std_min,
        ))

    if optim == 'SGD':
        self.optim = to.optim.SGD([{'params': self._policy.parameters()}], lr=lr, momentum=0.8, dampening=0.1)
    elif optim == 'Adam':
        # self.optim = to.optim.Adam([{'params': self._policy.parameters()}], lr=lr)
        self.optim = to.optim.Adam([{'params': self._distribution.get_params()}], lr=lr)
    else:
        raise NotImplementedError

    self._iter = 0
def __init__(self,
             save_dir: str,
             env: Env,
             policy: Policy,
             max_iter: int,
             pop_size: Optional[int],
             num_rollouts: int,
             num_is_samples: int,
             expl_std_init: float,
             expl_std_min: float = 0.01,
             extra_expl_std_init: float = 0.,
             extra_expl_decay_iter: int = 10,
             full_cov: bool = False,
             symm_sampling: bool = False,
             num_workers: int = 4,
             logger: Optional[StepLogger] = None):
    r"""
    Constructor

    :param save_dir: directory to save the snapshots, i.e. the results, in
    :param env: the environment in which the policy operates
    :param policy: policy to be updated
    :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
    :param pop_size: number of solutions in the population
    :param num_rollouts: number of rollouts per policy sample
    :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling,
                           indirectly specifies the performance quantile $1 - \rho$ [1]
    :param expl_std_init: initial standard deviation for the exploration strategy
    :param expl_std_min: minimal standard deviation for the exploration strategy
    :param extra_expl_std_init: additional standard deviation for the parameter exploration added to the diagonal
                                entries of the covariance matrix, set to 0 to disable this functionality
    :param extra_expl_decay_iter: limit for the linear decay of the additional standard deviation, i.e. last
                                  iteration in which the additional exploration noise is applied
    :param full_cov: pass `True` to compute a full covariance matrix for sampling the next policy parameter values,
                     else a diagonal covariance is used
    :param symm_sampling: use an exploration strategy which samples symmetric populations
    :param num_workers: number of environments for parallel sampling
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    """
    if not extra_expl_std_init >= 0:
        raise pyrado.ValueErr(given=extra_expl_std_init, ge_constraint='0')
    if not extra_expl_decay_iter > 0:
        raise pyrado.ValueErr(given=extra_expl_decay_iter, g_constraint='0')

    # Call ParameterExploring's constructor
    super().__init__(
        save_dir,
        env,
        policy,
        max_iter,
        num_rollouts,
        pop_size=pop_size,
        num_workers=num_workers,
        logger=logger,
    )

    if not num_is_samples <= pop_size:
        raise pyrado.ValueErr(given=num_is_samples, le_constraint=pop_size)
    self.num_is_samples = int(num_is_samples)

    # Explore using normal noise
    self._expl_strat = NormalParamNoise(
        self._policy.num_param,
        full_cov=full_cov,
        std_init=expl_std_init,
        std_min=expl_std_min,
    )
    if symm_sampling:
        # Exploration strategy based on symmetrical normally distributed noise
        if self.pop_size % 2 != 0:
            # Symmetric buffer needs to have an even number of samples
            self.pop_size += 1
        self._expl_strat = SymmParamExplStrat(self._expl_strat)

    # Optionally add additional entropy
    self.extra_expl_decay_iter = extra_expl_decay_iter
    if isinstance(self._expl_strat.noise, DiagNormalNoise):
        self.extra_expl_std_init = to.ones_like(self._policy.param_values) * extra_expl_std_init
    elif isinstance(self._expl_strat.noise, FullNormalNoise):
        self.extra_expl_std_init = to.eye(self._policy.num_param) * extra_expl_std_init
    else:
        raise pyrado.TypeErr(
            msg='Additional exploration entropy is only implemented for Gaussian distributions, '
                'i.e. DiagNormalNoise and FullNormalNoise')
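# Context: a sketch (illustrative names, assumed schedule) of how the additional
# exploration noise configured above can be faded out. The extra standard deviation
# decays linearly with the iteration count and is zero once extra_expl_decay_iter is
# reached, matching the docstring's description of a linear decay with a fixed limit.
def extra_expl_std(curr_iter: int, extra_expl_std_init: float, extra_expl_decay_iter: int) -> float:
    return extra_expl_std_init * max(1.0 - curr_iter / extra_expl_decay_iter, 0.0)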
def __init__(self,
             save_dir: str,
             env: Env,
             policy: Policy,
             max_iter: int,
             num_rollouts: int,
             expl_std_init: float,
             expl_std_min: float = 0.01,
             pop_size: Optional[int] = None,
             eta_mean: float = 1.,
             eta_std: Optional[float] = None,
             symm_sampling: bool = False,
             transform_returns: bool = True,
             num_workers: int = 4,
             logger: Optional[StepLogger] = None):
    """
    Constructor

    :param save_dir: directory to save the snapshots, i.e. the results, in
    :param env: the environment in which the policy operates
    :param policy: policy to be updated
    :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
    :param num_rollouts: number of rollouts per policy sample
    :param expl_std_init: initial standard deviation for the exploration strategy
    :param expl_std_min: minimal standard deviation for the exploration strategy
    :param pop_size: number of solutions in the population
    :param eta_mean: step size factor for the mean
    :param eta_std: step size factor for the standard deviation
    :param symm_sampling: use an exploration strategy which samples symmetric populations
    :param transform_returns: use a rank-transformation of the returns to update the policy
    :param num_workers: number of environments for parallel sampling
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    """
    # Call ParameterExploring's constructor
    super().__init__(save_dir,
                     env,
                     policy,
                     max_iter,
                     num_rollouts,
                     pop_size=pop_size,
                     num_workers=num_workers,
                     logger=logger)

    # Store the inputs
    self.transform_returns = transform_returns

    # Explore using normal noise
    self._expl_strat = NormalParamNoise(
        self._policy.num_param,
        std_init=expl_std_init,
        std_min=expl_std_min,
    )
    if symm_sampling:
        # Exploration strategy based on symmetrical normally distributed noise
        # Symmetric buffer needs to have an even number of samples
        if self.pop_size % 2 != 0:
            self.pop_size += 1
        self._expl_strat = SymmParamExplStrat(self._expl_strat)

    # Utility coefficients (ignored for transform_returns = False)
    # Use pop_size + 1 since we are also considering the current policy
    eta_std = eta_std if eta_std is not None else (3 + np.log(policy.num_param)) / np.sqrt(self.pop_size + 1) / 5.
    self.eta_mean_util, self.eta_std_util = self.compute_utilities(self.pop_size + 1, eta_mean, eta_std)

    # Learning rates [2]
    # Use pop_size + 1 since we are also considering the current policy
    self.lr_mean = 1. if transform_returns else 1e-2
    self.lr_std = 0.6 * (3 + np.log(self.pop_size + 1)) / 3. / np.sqrt(self.pop_size + 1)
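# Context: a sketch of the standard xNES-style utility values that compute_utilities
# presumably produces (an assumption based on the NES literature, not a copy of this
# class's method). The returns are replaced by fixed, zero-mean utilities that depend
# only on the rank, decoupling the step size from the returns' magnitude.
import numpy as np

def xnes_utilities(num_samples: int) -> np.ndarray:
    k = np.arange(1, num_samples + 1)  # ranks, 1 = best sample
    u = np.maximum(0.0, np.log(num_samples / 2 + 1) - np.log(k))
    return u / u.sum() - 1.0 / num_samples  # normalize, then shift to zero mean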
def __init__(self,
             save_dir: str,
             env: Env,
             policy: Policy,
             max_iter: int,
             eps: float,
             gamma: float,
             num_rollouts: int,
             pop_size: int,
             expl_std_init: float,
             expl_std_min: float = 0.01,
             symm_sampling: bool = False,
             num_sampler_envs: int = 4,
             num_epoch_dual: int = 1000,
             use_map: bool = False,
             grad_free_optim: bool = False,
             lr_dual: float = 5e-4,
             base_seed: Optional[int] = None):
    """
    Constructor

    :param save_dir: directory to save the snapshots, i.e. the results, in
    :param env: the environment in which the policy operates
    :param policy: policy to be updated
    :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
    :param eps: bound on the KL divergence between policy updates, e.g. 0.1
    :param gamma: temporal discount factor; equal to 1 - reset probability
    :param num_rollouts: number of rollouts per policy sample
    :param pop_size: number of solutions in the population
    :param expl_std_init: initial standard deviation for the exploration strategy
    :param expl_std_min: minimal standard deviation for the exploration strategy
    :param symm_sampling: use an exploration strategy which samples symmetric populations
    :param num_sampler_envs: number of environments for parallel sampling
    :param num_epoch_dual: number of epochs for the minimization of the dual function
    :param use_map: use maximum a-posteriori likelihood (`True`) or maximum likelihood (`False`) update rule
    :param grad_free_optim: use a derivative-free optimizer (e.g. golden section search) or a SGD-based optimizer
    :param lr_dual: learning rate for the dual's optimizer (ignored if `grad_free_optim = True`)
    :param base_seed: seed added to all other seeds in order to make the experiments distinct but repeatable
    """
    if not isinstance(policy, LinearPolicy):
        warn('REPS is designed for linear policies only!', UserWarning)

    # Call ParameterExploring's constructor
    super().__init__(
        save_dir,
        env,
        policy,
        max_iter,
        num_rollouts,
        pop_size=pop_size,
        base_seed=base_seed,
        num_sampler_envs=num_sampler_envs,
    )

    # Store the inputs
    self.eps = eps
    self.gamma = gamma
    self.base_seed = base_seed
    self.use_map = use_map

    # Explore using normal noise
    self._expl_strat = NormalParamNoise(
        self._policy.num_param,
        full_cov=True,
        std_init=expl_std_init,
        std_min=expl_std_min,
    )
    if symm_sampling:
        # Exploration strategy based on symmetrical normally distributed noise
        if self.pop_size % 2 != 0:
            # Symmetric buffer needs to have an even number of samples
            self.pop_size += 1
        self._expl_strat = SymmParamExplStrat(self._expl_strat)

    self.kappa = to.tensor([0.], requires_grad=True)  # eta = exp(kappa)
    self._exp_min = -700.
    self._exp_max = 700.

    # Dual specific
    if grad_free_optim:
        self.optim_dual = GSS(
            [{'params': self.kappa}],
            param_min=to.log(to.tensor([1e-4])),
            param_max=to.log(to.tensor([1e4]))
        )
    else:
        self.optim_dual = to.optim.Adam([{'params': self.kappa}], lr=lr_dual, eps=1e-5)
        # self.optim_dual = to.optim.SGD([{'params': self.kappa}], lr=lr_dual, momentum=0.7, weight_decay=1e-4)
    self.num_epoch_dual = num_epoch_dual
def __init__(
    self,
    save_dir: pyrado.PathLike,
    env: Env,
    policy: Policy,
    max_iter: int,
    eps: float,
    num_init_states_per_domain: int,
    pop_size: Optional[int],
    expl_std_init: float,
    expl_std_min: float = 0.01,
    num_domains: int = 1,
    symm_sampling: bool = False,
    softmax_transform: bool = False,
    use_map: bool = True,
    optim_mode: Optional[str] = "scipy",
    num_epoch_dual: int = 1000,
    lr_dual: float = 5e-4,
    num_workers: int = 4,
    logger: Optional[StepLogger] = None,
):
    r"""
    Constructor

    :param save_dir: directory to save the snapshots, i.e. the results, in
    :param env: the environment in which the policy operates
    :param policy: policy to be updated
    :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
    :param eps: bound on the KL divergence between policy updates, e.g. 0.1
    :param num_init_states_per_domain: number of rollouts to cover the variance over initial states
    :param pop_size: number of solutions in the population
    :param expl_std_init: initial standard deviation for the exploration strategy
    :param expl_std_min: minimal standard deviation for the exploration strategy
    :param num_domains: number of rollouts due to the variance over domain parameters
    :param symm_sampling: use an exploration strategy which samples symmetric populations
    :param softmax_transform: pass `True` to use a softmax to transform the returns, else use a shifted exponential
    :param use_map: use maximum a-posteriori likelihood (`True`) or maximum likelihood (`False`) update rule
    :param optim_mode: choose the type of optimizer: 'torch' for a SGD-based optimizer or 'scipy' for the SLSQP
                       optimizer from scipy (recommended)
    :param num_epoch_dual: number of epochs for the minimization of the dual functions, ignored if
                           `optim_mode = 'scipy'`
    :param lr_dual: learning rate for the dual's optimizer, ignored if `optim_mode = 'scipy'`
    :param num_workers: number of environments for parallel sampling
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    """
    if not isinstance(policy, (LinearPolicy, DomainDistrParamPolicy)):
        print_cbt_once("REPS was designed for linear policies.", "y")

    # Call ParameterExploring's constructor
    super().__init__(
        save_dir=save_dir,
        env=env,
        policy=policy,
        max_iter=max_iter,
        num_init_states_per_domain=num_init_states_per_domain,
        num_domains=num_domains,
        pop_size=pop_size,
        num_workers=num_workers,
        logger=logger,
    )

    # Store the inputs
    self.eps = eps
    self.softmax_transform = softmax_transform
    self.use_map = use_map

    # Explore using normal noise
    self._expl_strat = NormalParamNoise(
        self._policy.num_param,
        full_cov=True,
        std_init=expl_std_init,
        std_min=expl_std_min,
        use_cuda=self._policy.device != "cpu",
    )
    if symm_sampling:
        # Exploration strategy based on symmetrical normally distributed noise
        if self.pop_size % 2 != 0:
            # Symmetric buffer needs to have an even number of samples
            self.pop_size += 1
        self._expl_strat = SymmParamExplStrat(self._expl_strat)

    # Dual optimization
    self.num_epoch_dual = num_epoch_dual
    self._log_eta = to.tensor([0.0], requires_grad=True)
    self.optim_mode = optim_mode.lower()
    if self.optim_mode == "scipy":
        pass
    elif self.optim_mode == "torch":
        self.optim_dual = to.optim.SGD([{"params": self._log_eta}], lr=lr_dual, momentum=0.8, weight_decay=1e-4)
        # self.optim_dual = to.optim.Adam([{'params': self._log_eta}], lr=lr_dual, eps=1e-5)  # used in [2], but unstable here
    else:
        raise pyrado.ValueErr(given=optim_mode, eq_constraint=["scipy", "torch"])
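# Context: a standalone sketch of the episodic REPS dual that the optimizer above
# minimizes over eta = exp(log_eta): g(eta) = eta * eps + eta * log mean_i exp(R_i / eta).
# The log-sum-exp formulation keeps the exponentials numerically stable; the sample
# weights for the subsequent policy update are then proportional to exp(R_i / eta).
# Names are illustrative, not this class's actual methods.
import math
import torch as to

def reps_dual(log_eta: to.Tensor, rets: to.Tensor, eps: float) -> to.Tensor:
    """
    :param log_eta: log of the Lagrange multiplier (optimized in log-space as above)
    :param rets: returns of the sampled policy parameter sets, shape (pop_size,)
    :param eps: bound on the KL divergence between policy updates
    :return: value of the dual function, differentiable w.r.t. log_eta
    """
    eta = to.exp(log_eta)
    log_mean_exp = to.logsumexp(rets / eta, dim=0) - math.log(rets.numel())  # log mean_i exp(R_i / eta)
    return eta * eps + eta * log_mean_exp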