Example No. 1
class PoWER(ParameterExploring):
    """
    Return-based variant of Policy learning by Weighting Exploration with the Returns (PoWER)

    .. note::
        PoWER was designed for linear policies.
        PoWER must be used with positive reward functions, since it interprets the returns as an (improper) probability distribution [1, p.10].
        The original implementation is tailored to movement primitives like DMPs.

    .. seealso::
        [1] J. Kober and J. Peters, "Policy Search for Motor Primitives in Robotics", Machine Learning, 2011
    """

    name: str = 'power'

    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 pop_size: Optional[int],
                 num_rollouts: int,
                 num_is_samples: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 symm_sampling: bool = False,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        r"""
        Constructor

        :param save_dir: directory in which to save the snapshots, i.e. the results
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param pop_size: number of solutions in the population
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_rollouts: number of rollouts per policy sample
        :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(policy, LinearPolicy):
            print_cbt_once('PoWER was designed for linear policies.', 'y')

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Initialize memory for importance sampling
        self.num_is_samples = min(pop_size, num_is_samples)
        self.is_mem_ret = 1e-6 * to.ones(
            self.num_is_samples
        )  # has to be initialized > 0 due to first covariance update
        self.is_mem_params = to.zeros(self.num_is_samples,
                                      self._policy.num_param)
        self.is_mem_W = to.zeros(self.num_is_samples, self._policy.num_param,
                                 self._policy.num_param)

    def reset(self, seed: int = None):
        # Reset the exploration strategy, internal variables and the random seeds
        super().reset(seed)

        # Reset memory for importance sampling
        self.is_mem_ret = 1e-6 * to.ones(
            self.num_is_samples
        )  # has to be initialized > 0 due to first covariance update
        self.is_mem_params = to.zeros(self.num_is_samples,
                                      self._policy.num_param)
        self.is_mem_W = to.zeros(self.num_is_samples, self._policy.num_param,
                                 self._policy.num_param)

    @to.no_grad()
    def update(self,
               param_results: ParameterSamplingResult,
               ret_avg_curr: float = None):
        # Average the return values over the rollouts
        rets_avg_ros = to.tensor(param_results.mean_returns)
        if any(rets_avg_ros < 0):
            rets_avg_ros[rets_avg_ros < 0] = 1e-3
            print_cbt(
                'PoWER must use positive reward functions (improper probability distribution)!',
                'r')

        # We do the simplification from the original implementation, which is only valid for the return-based variant
        W = to.inverse(self._expl_strat.noise.cov)

        # For importance sampling we select the best rollouts
        self.is_mem_ret = to.cat([self.is_mem_ret, rets_avg_ros], dim=0)
        self.is_mem_params = to.cat(
            [self.is_mem_params, param_results.parameters], dim=0)
        self.is_mem_W = to.cat(
            [self.is_mem_W, W.repeat(self.pop_size + 1, 1, 1)],
            dim=0)  # same cov for all rollouts

        # Descending sort according to return values
        idcs_dcs = to.argsort(self.is_mem_ret, descending=True)
        self.is_mem_ret = self.is_mem_ret[idcs_dcs]
        self.is_mem_params = self.is_mem_params[idcs_dcs, :]
        self.is_mem_W = self.is_mem_W[idcs_dcs, :, :]

        # Update the exploration covariance (see [1, p.32]). We use all rollouts to avoid rapid convergence to 0.
        eps = self.is_mem_params - self._policy.param_values  # policy parameter perturbations
        cov_num = to.einsum('nj,nk,n->jk', eps, eps,
                            self.is_mem_ret)  # weighted outer product
        cov_dnom = sum(self.is_mem_ret)
        self._expl_strat.adapt(cov=cov_num / (cov_dnom + 1e-8))

        # Only memorize the best parameter sets & returns (importance sampling)
        self.is_mem_ret = self.is_mem_ret[:self.num_is_samples]
        self.is_mem_params = self.is_mem_params[:self.num_is_samples, :]
        self.is_mem_W = self.is_mem_W[:self.num_is_samples, :, :]

        # Update the policy mean (see [1, p.10])
        eps = eps[:self.num_is_samples, :]
        mean_num = to.einsum('njk,nj,n->k', self.is_mem_W, eps,
                             self.is_mem_ret)  # weighted dot product
        mean_dnom = to.einsum('njk,n->jk', self.is_mem_W,
                              self.is_mem_ret)  # weighted sum
        inv_dnom = to.inverse(mean_dnom + 1e-8)
        self._policy.param_values += to.matmul(inv_dnom, mean_num)

        # Logging
        self.logger.add_value('min expl strat std',
                              to.min(self._expl_strat.std), 4)
        self.logger.add_value('avg expl strat std',
                              to.mean(self._expl_strat.std), 4)
        self.logger.add_value('max expl strat std',
                              to.max(self._expl_strat.std), 4)
        self.logger.add_value('expl strat entropy',
                              self._expl_strat.get_entropy(), 4)
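
As a reading aid (not part of the source listing): with perturbations $\epsilon_n = \theta_n - \theta$, returns $R_n$ from the importance-sampling memory, and $W_n = \Sigma^{-1}$, the two `einsum` calls in `update()` above implement the return-weighted PoWER updates of [1],

$$\Sigma \leftarrow \frac{\sum_n R_n\, \epsilon_n \epsilon_n^\top}{\sum_n R_n}, \qquad \theta \leftarrow \theta + \Big(\sum_n R_n W_n\Big)^{-1} \sum_n R_n W_n\, \epsilon_n .$$
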
Example No. 2
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 pop_size: Optional[int],
                 num_rollouts: int,
                 num_is_samples: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 symm_sampling: bool = False,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        r"""
        Constructor

        :param save_dir: directory in which to save the snapshots, i.e. the results
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param pop_size: number of solutions in the population
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_rollouts: number of rollouts per policy sample
        :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(policy, LinearPolicy):
            print_cbt_once('PoWER was designed for linear policies.', 'y')

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Initialize memory for importance sampling
        self.num_is_samples = min(pop_size, num_is_samples)
        self.is_mem_ret = 1e-6 * to.ones(
            self.num_is_samples
        )  # has to be initialized > 0 due to first covariance update
        self.is_mem_params = to.zeros(self.num_is_samples,
                                      self._policy.num_param)
        self.is_mem_W = to.zeros(self.num_is_samples, self._policy.num_param,
                                 self._policy.num_param)
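
For orientation, a self-contained sketch of the importance-sampling memory that the `is_mem_*` buffers above realize: merge the new population with the stored samples, sort by return, and keep only the `num_is_samples` best entries (Example No. 1 additionally uses the full sorted memory for the covariance update before truncating). The helper name `update_is_memory` is made up here for illustration.

import torch as to

def update_is_memory(mem_ret, mem_params, new_ret, new_params, num_is_samples):
    # Merge the old memory with the new population, sort by return (descending),
    # and keep only the best num_is_samples returns and parameter sets.
    ret = to.cat([mem_ret, new_ret], dim=0)
    params = to.cat([mem_params, new_params], dim=0)
    idcs = to.argsort(ret, descending=True)[:num_is_samples]
    return ret[idcs], params[idcs]

# Tiny example with 2-dimensional policy parameters
mem_ret, mem_params = 1e-6 * to.ones(3), to.zeros(3, 2)
new_ret, new_params = to.tensor([1.0, 0.5]), to.randn(2, 2)
mem_ret, mem_params = update_is_memory(mem_ret, mem_params, new_ret, new_params, 3)
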
Example No. 3
class PEPG(ParameterExploring):
    """
    Parameter-Exploring Policy Gradients (PEPG)

    .. seealso::
        [1] F. Sehnke, C. Osendorfer, T. Rueckstiess, A. Graves, J. Peters, J. Schmidhuber, "Parameter-exploring
        Policy Gradients", Neural Networks, 2010
    """

    name: str = 'pepg'

    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 num_rollouts: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 pop_size: Optional[int] = None,
                 clip_ratio_std: float = 0.05,
                 normalize_update: bool = False,
                 transform_returns: bool = True,
                 lr: float = 5e-4,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        r"""
        Constructor

        :param save_dir: directory in which to save the snapshots, i.e. the results
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param clip_ratio_std: maximal ratio for the change of the exploration strategy's standard deviation
        :param transform_returns: use a rank-transformation of the returns to update the policy
        :param lr: learning rate
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        # Call ParameterExploring's constructor
        super().__init__(save_dir,
                         env,
                         policy,
                         max_iter,
                         num_rollouts,
                         pop_size=pop_size,
                         num_workers=num_workers,
                         logger=logger)

        # Store the inputs
        self.clip_ratio_std = clip_ratio_std
        self.normalize_update = normalize_update
        self.transform_returns = transform_returns
        self.lr = lr

        # Exploration strategy based on symmetrical normally distributed noise
        if self.pop_size % 2 != 0:
            # Symmetric buffer needs to have an even number of samples
            self.pop_size += 1
        self._expl_strat = SymmParamExplStrat(
            NormalParamNoise(
                self._policy.num_param,
                std_init=expl_std_init,
                std_min=expl_std_min,
            ))

        self.optim = to.optim.SGD([{
            'params': self._policy.parameters()
        }],
                                  lr=lr,
                                  momentum=0.8,
                                  dampening=0.1)

    @to.no_grad()
    def update(self,
               param_results: ParameterSamplingResult,
               ret_avg_curr: float = None):
        # Average the return values over the rollouts
        rets_avg_ros = param_results[1:].mean_returns

        # Rank policy parameters by return (a.k.a. fitness)
        rets = rank_transform(
            rets_avg_ros) if self.transform_returns else rets_avg_ros

        # Move to PyTorch
        rets = to.from_numpy(rets).to(to.get_default_dtype())
        rets_max = to.max(rets)
        rets_avg_symm = (rets[:len(param_results) // 2] +
                         rets[len(param_results) // 2:]) / 2.
        baseline = to.mean(rets)  # zero if centered

        # Compute finite differences for the average return of each solution
        rets_fds = rets[:len(param_results) // 2] - rets[len(param_results) //
                                                         2:]

        # Get the perturbations (select the first half since they are symmetric)
        epsilon = param_results.parameters[:len(param_results) //
                                           2, :] - self._policy.param_values

        if self.normalize_update:
            # See equation (15, top) in [1]
            delta_mean = (rets_fds / (2 * rets_max - rets_fds + 1e-6)
                          ) @ epsilon  # epsilon = T from [1]
        else:
            # See equation (13) in [1]
            delta_mean = 0.5 * rets_fds @ epsilon  # epsilon = T from [1]

        # Update the mean
        self.optim.zero_grad()
        self._policy.param_grad = -delta_mean  # PyTorch optimizers are minimizers
        self.optim.step()
        # Old version without PyTorch optimizer: self._expl_strat.policy.param_values += delta_mean * self.lr

        # Update the std
        S = (epsilon**2 - self._expl_strat.std**2) / self._expl_strat.std

        if self.normalize_update:
            # See equation (15, bottom) in [1]
            delta_std = (rets_avg_symm - baseline) @ S
        else:
            # See equation (14) in [1]
            delta_std = ((rets_avg_symm - baseline) /
                         (rets_max - baseline + 1e-6)) @ S

        # Bound the change on the exploration standard deviation (i.e. the entropy)
        delta_std *= self.lr
        delta_std = clamp_symm(delta_std,
                               self.clip_ratio_std * self._expl_strat.std)
        new_std = self._expl_strat.std + delta_std

        self._expl_strat.adapt(std=new_std)

        # Logging
        self.logger.add_value('policy param', self._policy.param_values, 4)
        self.logger.add_value('delta policy param', delta_mean * self.lr, 4)
        self.logger.add_value('expl strat std', self._expl_strat.std, 4)
        self.logger.add_value('expl strat entropy',
                              self._expl_strat.get_entropy(), 4)
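
As a reading aid (a paraphrase of the `update()` code above, cf. eqs. (13)-(15) in [1]): let $\theta \pm \epsilon_n$ be the symmetric candidates, $r_n^\pm$ their (possibly rank-transformed) returns, $\Delta r_n = r_n^+ - r_n^-$, $\bar r_n = \tfrac{1}{2}(r_n^+ + r_n^-)$, $b$ the mean return, $r_{\max}$ the maximum return, and $S_n = (\epsilon_n^2 - \sigma^2)/\sigma$ element-wise. Then

$$\Delta\mu = \begin{cases} \tfrac{1}{2}\sum_n \Delta r_n\,\epsilon_n & \text{plain} \\ \sum_n \frac{\Delta r_n}{2 r_{\max} - \Delta r_n}\,\epsilon_n & \text{normalized,} \end{cases} \qquad \Delta\sigma = \begin{cases} \sum_n \frac{\bar r_n - b}{r_{\max} - b}\, S_n & \text{plain} \\ \sum_n (\bar r_n - b)\, S_n & \text{normalized,} \end{cases}$$

after which $\Delta\mu$ is applied through the SGD optimizer and $\Delta\sigma$ is scaled by the learning rate and clipped to $\pm\,$`clip_ratio_std`$\,\cdot\,\sigma$ before adapting the exploration strategy.
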
Example No. 4
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 num_rollouts: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 pop_size: Optional[int] = None,
                 clip_ratio_std: float = 0.05,
                 normalize_update: bool = False,
                 transform_returns: bool = True,
                 lr: float = 5e-4,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        r"""
        Constructor

        :param save_dir: directory in which to save the snapshots, i.e. the results
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param clip_ratio_std: maximal ratio for the change of the exploration strategy's standard deviation
        :param transform_returns: use a rank-transformation of the returns to update the policy
        :param lr: learning rate
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        # Call ParameterExploring's constructor
        super().__init__(save_dir,
                         env,
                         policy,
                         max_iter,
                         num_rollouts,
                         pop_size=pop_size,
                         num_workers=num_workers,
                         logger=logger)

        # Store the inputs
        self.clip_ratio_std = clip_ratio_std
        self.normalize_update = normalize_update
        self.transform_returns = transform_returns
        self.lr = lr

        # Exploration strategy based on symmetrical normally distributed noise
        if self.pop_size % 2 != 0:
            # Symmetric buffer needs to have an even number of samples
            self.pop_size += 1
        self._expl_strat = SymmParamExplStrat(
            NormalParamNoise(
                self._policy.num_param,
                std_init=expl_std_init,
                std_min=expl_std_min,
            ))

        self.optim = to.optim.SGD([{
            'params': self._policy.parameters()
        }],
                                  lr=lr,
                                  momentum=0.8,
                                  dampening=0.1)
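
The constructor above pairs the policy parameters with a PyTorch SGD optimizer even though no loss is backpropagated: the `update()` method shown in Example No. 3 assigns the negative search direction as the gradient and then calls `step()`. A minimal, generic sketch of that pattern in plain PyTorch (not Pyrado-specific; the variable names are made up):

import torch as to

params = to.nn.Parameter(to.zeros(3))
optim = to.optim.SGD([params], lr=5e-4, momentum=0.8, dampening=0.1)

ascent_direction = to.ones(3)    # stands in for delta_mean computed from the rollouts
optim.zero_grad()
params.grad = -ascent_direction  # PyTorch optimizers are minimizers, hence the sign flip
optim.step()                     # applies lr, momentum, and dampening to the update
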
Example No. 5
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 distribution,
                 max_iter: int,
                 num_rollouts: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 pop_size: int = None,
                 clip_ratio_std: float = 0.05,
                 normalize_update: bool = False,
                 transform_returns: bool = True,
                 num_sampler_envs: int = 4,
                 n_mc_samples_gradient=1,
                 coupling=True,
                 real_env=False,
                 lr: float = 5e-4,
                 optim: str = 'SGD',
                 base_seed: int = None):
        """
        Constructor

        :param save_dir: directory in which to save the snapshots, i.e. the results
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param clip_ratio_std: maximal ratio for the change of the exploration strategy's standard deviation
        :param transform_returns: use a rank-transformation of the returns to update the policy
        :param base_seed: seed added to all other seeds in order to make the experiments distinct but repeatable
        """
        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            base_seed=base_seed,
            num_sampler_envs=num_sampler_envs,
        )

        self._distribution = distribution
        self._dims = distribution.get_number_of_dims()

        self._n_mc_samples_gradient = n_mc_samples_gradient
        self._coupling = coupling

        self._real_env = real_env

        # Store the inputs
        self.clip_ratio_std = clip_ratio_std
        self.normalize_update = normalize_update
        self.transform_returns = transform_returns
        self.lr = lr

        # Exploration strategy based on symmetrical normally distributed noise
        if self.pop_size % 2 != 0:
            # Symmetric buffer needs to have an even number of samples
            self.pop_size += 1
        self._expl_strat = SymmParamExplStrat(
            NormalParamNoise(
                self._policy.num_param,
                std_init=expl_std_init,
                std_min=expl_std_min,
            ))

        if optim == 'SGD':
            self.optim = to.optim.SGD([{
                'params': self._policy.parameters()
            }],
                                      lr=lr,
                                      momentum=0.8,
                                      dampening=0.1)
        elif optim == 'Adam':
            # self.optim = to.optim.Adam([{'params': self._policy.parameters()}], lr=lr)
            self.optim = to.optim.Adam(
                [{
                    'params': self._distribution.get_params()
                }], lr=lr)
        else:
            raise NotImplementedError

        self._iter = 0
Example No. 6
    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env: Env,
        policy: Policy,
        max_iter: int,
        pop_size: Optional[int],
        num_init_states_per_domain: int,
        num_is_samples: int,
        expl_std_init: float,
        expl_std_min: float = 0.01,
        num_domains: int = 1,
        symm_sampling: bool = False,
        num_workers: int = 4,
        logger: Optional[StepLogger] = None,
    ):
        r"""
        Constructor

        :param save_dir: directory in which to save the snapshots, i.e. the results
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param pop_size: number of solutions in the population
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_init_states_per_domain: number of rollouts to cover the variance over initial states
        :param num_domains: number of rollouts due to the variance over domain parameters
        :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        # Call ParameterExploring's constructor
        super().__init__(
            save_dir=save_dir,
            env=env,
            policy=policy,
            max_iter=max_iter,
            num_init_states_per_domain=num_init_states_per_domain,
            num_domains=num_domains,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
            use_cuda=policy.device != "cpu",
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Initialize memory for importance sampling
        self._bound_lo_ret = 1e-3  # the returns must not be negative, clip them to this value if so
        self.num_is_samples = min(pop_size, num_is_samples)
        self.is_mem_ret = self._bound_lo_ret * to.ones(
            self.num_is_samples
        )  # has to be initialized > 0 due to first covariance update
        self.is_mem_params = to.zeros(self.num_is_samples,
                                      self._policy.num_param)
        self.is_mem_W = to.zeros(self.num_is_samples, self._policy.num_param,
                                 self._policy.num_param)
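
An inference from the parameter names and docstring above (not stated explicitly in the listing): each candidate parameter set is evaluated with $n_\text{init states per domain} \cdot n_\text{domains}$ rollouts, so a single iteration requires on the order of

$$N_\text{rollouts} \approx n_\text{pop} \cdot n_\text{init states per domain} \cdot n_\text{domains}$$

environment rollouts.
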
Example No. 7
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 pop_size: Optional[int],
                 num_rollouts: int,
                 num_is_samples: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 extra_expl_std_init: float = 0.,
                 extra_expl_decay_iter: int = 10,
                 full_cov: bool = False,
                 symm_sampling: bool = False,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        r"""
        Constructor

        :param save_dir: directory in which to save the snapshots, i.e. the results
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling,
                               indirectly specifies the performance quantile $1 - \rho$ [1]
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param extra_expl_std_init: additional standard deviation for the parameter exploration added to the diagonal
                                    entries of the covariance matrix; set to 0 to disable this functionality
        :param extra_expl_decay_iter: limit for the linear decay of the additional standard deviation, i.e. last
                                      iteration in which the additional exploration noise is applied
        :param full_cov: pass `True` to compute a full covariance matrix for sampling the next policy parameter values,
                         else a diagonal covariance is used
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not extra_expl_std_init >= 0:
            raise pyrado.ValueErr(given=extra_expl_std_init, ge_constraint='0')
        if not extra_expl_decay_iter > 0:
            raise pyrado.ValueErr(given=extra_expl_decay_iter,
                                  g_constraint='0')

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        if not num_is_samples <= pop_size:
            raise pyrado.ValueErr(given=num_is_samples, le_constraint=pop_size)
        self.num_is_samples = int(num_is_samples)

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=full_cov,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Optionally add additional entropy
        self.extra_expl_decay_iter = extra_expl_decay_iter
        if isinstance(self._expl_strat.noise, DiagNormalNoise):
            self.extra_expl_std_init = to.ones_like(
                self._policy.param_values) * extra_expl_std_init
        elif isinstance(self._expl_strat.noise, FullNormalNoise):
            self.extra_expl_std_init = to.eye(
                self._policy.num_param) * extra_expl_std_init
        else:
            raise pyrado.TypeErr(
                msg='Additional exploration entropy is only implemented for Gaussian distributions, '
                    'i.e. DiagNormalNoise and FullNormalNoise')
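
The `extra_expl_std_init` / `extra_expl_decay_iter` pair set up above realizes a linearly decaying additional exploration noise; the same schedule appears in the CEM `update()` in Example No. 10. A small self-contained sketch (the function name is made up here for illustration):

def extra_expl_std(curr_iter: int, extra_expl_std_init: float, extra_expl_decay_iter: int) -> float:
    # Linear decay from extra_expl_std_init at iteration 0 down to 0 at extra_expl_decay_iter
    return extra_expl_std_init * max(1.0 - curr_iter / extra_expl_decay_iter, 0.0)

# e.g. extra_expl_std(0, 0.5, 10) == 0.5 and extra_expl_std(10, 0.5, 10) == 0.0
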
Example No. 8
class EMVD(ParameterExploring):
    """
    Episodic Measure-Valued Derivatives (E-MVD)

    """

    name: str = 'mvd'

    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 distribution,
                 max_iter: int,
                 num_rollouts: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 pop_size: int = None,
                 clip_ratio_std: float = 0.05,
                 normalize_update: bool = False,
                 transform_returns: bool = True,
                 num_sampler_envs: int = 4,
                 n_mc_samples_gradient=1,
                 coupling=True,
                 real_env=False,
                 lr: float = 5e-4,
                 optim: str = 'SGD',
                 base_seed: int = None):
        """
        Constructor

        :param save_dir: directory in which to save the snapshots, i.e. the results
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param clip_ratio_std: maximal ratio for the change of the exploration strategy's standard deviation
        :param transform_returns: use a rank-transformation of the returns to update the policy
        :param base_seed: seed added to all other seeds in order to make the experiments distinct but repeatable
        """
        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            base_seed=base_seed,
            num_sampler_envs=num_sampler_envs,
        )

        self._distribution = distribution
        self._dims = distribution.get_number_of_dims()

        self._n_mc_samples_gradient = n_mc_samples_gradient
        self._coupling = coupling

        self._real_env = real_env

        # Store the inputs
        self.clip_ratio_std = clip_ratio_std
        self.normalize_update = normalize_update
        self.transform_returns = transform_returns
        self.lr = lr

        # Exploration strategy based on symmetrical normally distributed noise
        if self.pop_size % 2 != 0:
            # Symmetric buffer needs to have an even number of samples
            self.pop_size += 1
        self._expl_strat = SymmParamExplStrat(
            NormalParamNoise(
                self._policy.num_param,
                std_init=expl_std_init,
                std_min=expl_std_min,
            ))

        if optim == 'SGD':
            self.optim = to.optim.SGD([{
                'params': self._policy.parameters()
            }],
                                      lr=lr,
                                      momentum=0.8,
                                      dampening=0.1)
        elif optim == 'Adam':
            # self.optim = to.optim.Adam([{'params': self._policy.parameters()}], lr=lr)
            self.optim = to.optim.Adam(
                [{
                    'params': self._distribution.get_params()
                }], lr=lr)
        else:
            raise NotImplementedError

        self._iter = 0

    def _optimize_distribution_parameters(self, loss):
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

    def step(self, snapshot_mode: str, meta_info: dict = None):
        if not self._real_env:
            # Sample new policy parameters
            paramsets = self._expl_strat.sample_param_sets(
                self._policy.param_values,
                num_samples=10,
                # If you do not want to include the current policy parameters, be aware that you also have to do follow-up
                # changes in the update() functions in all subclasses of ParameterExploring
                # include_nominal_params=True
                include_nominal_params=True)

            with to.no_grad():
                # Sample rollouts using these parameters
                param_samp_res = self.sampler.sample(paramsets)

            # Evaluate the current policy (first one in list if include_nominal_params is True)
            ret_avg_curr = param_samp_res[0].mean_undiscounted_return

            # Store the average return for the stopping criterion
            self.ret_avg_stack = np.delete(self.ret_avg_stack, 0)
            self.ret_avg_stack = np.append(self.ret_avg_stack, ret_avg_curr)

            all_rets = param_samp_res.mean_returns
            all_lengths = np.array(
                [len(ro) for pss in param_samp_res for ro in pss.rollouts])

            # Log metrics computed from the old policy (before the update)
            self.logger.add_value('curr policy return', ret_avg_curr)
            self.logger.add_value('max return', float(np.max(all_rets)))
            self.logger.add_value('median return', float(np.median(all_rets)))
            self.logger.add_value('avg return', float(np.mean(all_rets)))
            self.logger.add_value('std return', float(np.std(all_rets)))
            self.logger.add_value('avg rollout len',
                                  float(np.mean(all_lengths)))
            # self.logger.add_value('min mag policy param',
            #                       self._policy.param_values[to.argmin(abs(self._policy.param_values))])
            # self.logger.add_value('max mag policy param',
            #                       self._policy.param_values[to.argmax(abs(self._policy.param_values))])

            # Logging
            self.logger.add_value('policy param',
                                  self._policy.param_values.detach().numpy())
            self.logger.add_value(
                'expl strat mean',
                deepcopy(self._distribution.get_mean(tensor=False)))
            self.logger.add_value(
                'expl strat cov',
                deepcopy(self._distribution.get_cov(tensor=False)))

            # Extract the best policy parameter sample for saving it later
            self.best_policy_param = param_samp_res.parameters[np.argmax(
                param_samp_res.mean_returns)].clone()

            # Save snapshot data
            self.make_snapshot(snapshot_mode,
                               float(np.max(param_samp_res.mean_returns)),
                               meta_info)
            # Update the policy
            self.update(param_samp_res)

        else:
            input("Change video and Press ENTER")
            print('policy param: ', self._policy.param_values.detach().numpy())
            print('expl strat mean: ',
                  self._distribution.get_mean(tensor=False))
            print('expl strat std: ',
                  np.sqrt(self._distribution.get_cov(tensor=False)))

            ro = rollout(self._env,
                         self.policy,
                         eval=True,
                         render_mode=RenderMode(text=False))

            r = np.sum(ro.rewards)
            print('curr policy return: ', r)
            # Update the policy
            self.update()

    def policy_return(self, Ks):
        with to.no_grad():
            r_l = []
            for i, K in enumerate(Ks):
                print("{}/{} - K: {}".format(i,
                                             len(K) - 1, K.view(-1)),
                      end=" ")

                self.policy.param_values = K
                ro = rollout(self._env,
                             self.policy,
                             eval=True,
                             render_mode=RenderMode(text=False))
                r = np.sum(ro.rewards)
                r_l.append(r)
                print(" - r: {}".format(r))
            return r_l

    def update(self,
               param_results: ParameterSamplingResult = None,
               ret_avg_curr: float = None):

        loss = -self._mvd_gaussian_diag_covariance_surrogate_loss().mean()

        self._optimize_distribution_parameters(loss)

        # Update the policy parameters to the mean of the search distribution
        self._policy.param_values = self._distribution.get_mean(
            tensor=True).view(-1)

    def _mvd_gaussian_diag_covariance_surrogate_loss(self):
        """
        Builds the loss function for gradient computation with measure value derivatives.
        The gradient is taken wrt the distributional parameters (mean and covariance) of a
        Multivariate Gaussian with Diagonal Covariance.
        """
        mean, std = self._distribution.get_mean_and_std()
        diag_std = std

        dist_samples = self._distribution.sample(
            (self._n_mc_samples_gradient, ))

        # Compute gradient wrt mean
        grad_mean = self._mvd_grad_mean_gaussian_diagonal_covariance(
            dist_samples)

        # Compute gradient wrt std
        grad_cov = self._mvd_grad_covariance_gaussian_diagonal_covariance(
            dist_samples)

        # Construct the surrogate loss.
        # Here we still backpropagate through the mean and covariance, because they can themselves be parametrized
        surrogate_loss = grad_mean.detach() * mean
        surrogate_loss += grad_cov.detach() * diag_std

        # The total derivative is the sum of the partial derivatives wrt each parameter.
        loss = surrogate_loss.sum(dim=-1)

        return loss

    def _mvd_grad_mean_gaussian_diagonal_covariance(self, dist_samples):
        """
        Computes the measure valued gradient wrt the mean of the multivariate Gaussian with diagonal Covariance.
        """
        print("----Grad mean", flush=True)

        mean, std = self._distribution.get_mean_and_std()
        diag_std = std

        # Replicate the second to last dimension
        # (B, D, D)
        multiples = [1, self._dims, 1]
        base_samples = to.unsqueeze(dist_samples, -2).repeat(*multiples)

        # Sample (B, D) samples from the positive and negative Univariate Weibull distributions
        weibull = torchdist.weibull.Weibull(scale=np.sqrt(2.),
                                            concentration=2.)
        pos_samples_weibull = weibull.sample(dist_samples.shape)

        if self._coupling:
            neg_samples_weibull = pos_samples_weibull
        else:
            neg_samples_weibull = weibull.sample(dist_samples.shape)

        # Build the (B, D) positive and negative diagonals of the MVD decomposition
        positive_diag = mean + diag_std * pos_samples_weibull
        assert positive_diag.shape == dist_samples.shape

        negative_diag = mean - diag_std * neg_samples_weibull
        assert negative_diag.shape == dist_samples.shape

        # Set the positive and negative points where to evaluate the Q function.
        # (B, D, D)
        # Replace the ith dimension of the actions with the ith entry of the constructed diagonals.
        # Mohamed. S, 2019, Monte Carlo Gradient Estimation in Machine Learning, Ch. 6.2
        positive_samples = base_samples.clone()
        positive_samples.diagonal(dim1=-2, dim2=-1).copy_(positive_diag)
        negative_samples = base_samples.clone()
        negative_samples.diagonal(dim1=-2, dim2=-1).copy_(negative_diag)

        # MVD constant term
        # (B, D)
        c = np.sqrt(2 * np.pi) * diag_std

        # Evaluate the function
        # pos_f_samples = self._func.eval(positive_samples.reshape(self._n_mc_samples_gradient * self._dims, self._dims))
        # neg_f_samples = self._func.eval(negative_samples.reshape(self._n_mc_samples_gradient * self._dims, self._dims))

        if not self._real_env:
            pos_paramsets = positive_samples.reshape(
                self._n_mc_samples_gradient * self._dims, self._dims)
            pos_f_samples_param_samp_res = self.sampler.sample(
                pos_paramsets.detach())
            r_l = []
            for i in range(len(pos_f_samples_param_samp_res)):
                r_l.append(
                    pos_f_samples_param_samp_res[i].mean_undiscounted_return)
            pos_f_samples = to.tensor(r_l)

            neg_paramsets = negative_samples.reshape(
                self._n_mc_samples_gradient * self._dims, self._dims)
            neg_f_samples_param_samp_res = self.sampler.sample(
                neg_paramsets.detach())
            r_l = []
            for i in range(len(neg_f_samples_param_samp_res)):
                r_l.append(
                    neg_f_samples_param_samp_res[i].mean_undiscounted_return)
            neg_f_samples = to.tensor(r_l)
        else:
            pos_f_samples = to.tensor(
                self.policy_return(
                    positive_samples.reshape(
                        self._n_mc_samples_gradient * self._dims, self._dims)))
            neg_f_samples = to.tensor(
                self.policy_return(
                    negative_samples.reshape(
                        self._n_mc_samples_gradient * self._dims, self._dims)))

        # Gradient batch
        # (B, D)
        delta_f = pos_f_samples - neg_f_samples
        grad = delta_f.reshape(dist_samples.shape[0], self._dims) / c
        assert grad.shape == dist_samples.shape

        return grad

    def _mvd_grad_covariance_gaussian_diagonal_covariance(self, dist_samples):
        """
        Computes the measure valued gradient wrt the covariance of the multivariate Gaussian with diagonal covariance.
        """
        print("----Grad covariance", flush=True)

        mean, std = self._distribution.get_mean_and_std()
        diag_std = std

        # Replicate the second to last dimension of actions
        # (B, D, D)
        multiples = [1, self._dims, 1]
        base_actions = to.unsqueeze(dist_samples, -2).repeat(*multiples)

        # Sample (NxBxDa, Da) samples from the positive and negative Univariate distributions of the decomposition.
        # The positive part is a Double-sided Maxwell M(mu, sigma^2).
        #   M(x; mu, sigma^2) = 1/(sigma*sqrt(2*pi)) * ((x-mu)/sigma)^2 * exp(-1/2*((x-mu)/sigma)^2)
        #   To sample Y from the Double-sided Maxwell M(mu, sigma^2) we can do
        #   X ~ M(0, 1) -> Y = mu + sigma * X
        # The negative part is a Gaussian distribution N(mu, sigma^2).
        #   To sample Y from the Gaussian N(mu, sigma^2) we can do
        #   X ~ N(0, 1) -> Y = mu + sigma * X
        double_sided_maxwell_standard = DoubleSidedStandardMaxwell()
        pos_samples_double_sided_maxwell_standard = double_sided_maxwell_standard.sample(
            dist_samples.shape)

        if self._coupling:
            # Construct standard Gaussian samples from standard Double-sided Maxwell samples
            neg_samples_gaussian_standard = std_gaussian_from_std_dsmaxwell(
                pos_samples_double_sided_maxwell_standard)
        else:
            gaussian_standard = torchdist.normal.Normal(loc=0., scale=1.)
            neg_samples_gaussian_standard = gaussian_standard.sample(
                dist_samples.shape)

        # Build the (B, D) positive and negative diagonals of the MVD decomposition
        positive_diag = mean + diag_std * pos_samples_double_sided_maxwell_standard
        assert positive_diag.shape == dist_samples.shape

        negative_diag = mean + diag_std * neg_samples_gaussian_standard
        assert negative_diag.shape == dist_samples.shape

        # Set the positive and negative points where to evaluate the Q function.
        # (B, D, D)
        # In multivariate Gaussians with diagonal covariance, the univariates are independent.
        # Hence we can replace the ith dimension of the sampled actions with the ith entry of the constructed diagonals.
        # Mohamed. S, 2019, Monte Carlo Gradient Estimation in Machine Learning, Ch. 6.2
        positive_samples = base_actions.clone()
        positive_samples.diagonal(dim1=-2, dim2=-1).copy_(positive_diag)
        negative_samples = base_actions.clone()
        negative_samples.diagonal(dim1=-2, dim2=-1).copy_(negative_diag)

        # MVD constant term
        # (B, D)
        c = diag_std

        # Evaluate the function
        # pos_f_samples = self._func.eval(positive_samples.reshape(self._n_mc_samples_gradient * self._dims, self._dims))
        # neg_f_samples = self._func.eval(negative_samples.reshape(self._n_mc_samples_gradient * self._dims, self._dims))

        if not self._real_env:
            pos_paramsets = positive_samples.reshape(
                self._n_mc_samples_gradient * self._dims, self._dims)
            pos_f_samples_param_samp_res = self.sampler.sample(
                pos_paramsets.detach())
            r_l = []
            for i in range(len(pos_f_samples_param_samp_res)):
                r_l.append(
                    pos_f_samples_param_samp_res[i].mean_undiscounted_return)
            pos_f_samples = to.tensor(r_l)

            neg_paramsets = negative_samples.reshape(
                self._n_mc_samples_gradient * self._dims, self._dims)
            neg_f_samples_param_samp_res = self.sampler.sample(
                neg_paramsets.detach())
            r_l = []
            for i in range(len(neg_f_samples_param_samp_res)):
                r_l.append(
                    neg_f_samples_param_samp_res[i].mean_undiscounted_return)
            neg_f_samples = to.tensor(r_l)
        else:
            pos_f_samples = to.tensor(
                self.policy_return(
                    positive_samples.reshape(
                        self._n_mc_samples_gradient * self._dims, self._dims)))
            neg_f_samples = to.tensor(
                self.policy_return(
                    negative_samples.reshape(
                        self._n_mc_samples_gradient * self._dims, self._dims)))

        # Gradient batch
        # (B, D)
        delta_f = pos_f_samples - neg_f_samples
        grad = delta_f.reshape(dist_samples.shape[0], self._dims) / c
        assert grad.shape == dist_samples.shape

        return grad
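
For orientation, a compact statement of the measure-valued derivative (MVD) decomposition that the two `_mvd_grad_*` helpers above implement for a diagonal Gaussian $\mathcal{N}(\mu, \operatorname{diag}(\sigma^2))$; this is a paraphrase of the code and of Mohamed et al. (2019), Ch. 6.2, which the comments cite:

$$\frac{\partial}{\partial \mu_i}\,\mathbb{E}[f(x)] = \frac{1}{\sqrt{2\pi}\,\sigma_i}\Big(\mathbb{E}\big[f(x^{+i})\big] - \mathbb{E}\big[f(x^{-i})\big]\Big), \qquad x^{\pm i}_i = \mu_i \pm \sigma_i W,\; W \sim \mathrm{Weibull}(\sqrt{2},\, 2),$$

$$\frac{\partial}{\partial \sigma_i}\,\mathbb{E}[f(x)] = \frac{1}{\sigma_i}\Big(\mathbb{E}\big[f(x^{+i})\big] - \mathbb{E}\big[f(x^{-i})\big]\Big), \qquad x^{+i}_i = \mu_i + \sigma_i M,\; M \sim \mathrm{Maxwell}_{\text{2-sided}}(0, 1), \quad x^{-i}_i = \mu_i + \sigma_i Z,\; Z \sim \mathcal{N}(0, 1),$$

where $x^{\pm i}$ equals the base sample $x$ in all coordinates except the $i$-th. The surrogate loss in `_mvd_gaussian_diag_covariance_surrogate_loss()` multiplies these detached gradient estimates with $\mu$ and $\sigma$ so that autograd reproduces them as gradients of the distribution parameters.
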
Example No. 9
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 num_rollouts: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 pop_size: int = None,
                 eta_mean: float = 1.,
                 eta_std: float = None,
                 symm_sampling: bool = False,
                 transform_returns: bool = True,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        """
        Constructor

        :param save_dir: directory in which to save the snapshots, i.e. the results
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param pop_size: number of solutions in the population
        :param eta_mean: step size factor for the mean
        :param eta_std: step size factor for the standard deviation
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param transform_returns: use a rank-transformation of the returns to update the policy
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        # Call ParameterExploring's constructor
        super().__init__(save_dir,
                         env,
                         policy,
                         max_iter,
                         num_rollouts,
                         pop_size=pop_size,
                         num_workers=num_workers,
                         logger=logger)

        # Store the inputs
        self.transform_returns = transform_returns

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            # Symmetric buffer needs to have an even number of samples
            if self.pop_size % 2 != 0:
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Utility coefficients (ignored for transform_returns = False)
        # Use pop_size + 1 since we are also considering the current policy
        eta_std = eta_std if eta_std is not None else (
            3 + np.log(policy.num_param)) / np.sqrt(self.pop_size + 1) / 5.
        self.eta_mean_util, self.eta_std_util = self.compute_utilities(
            self.pop_size + 1, eta_mean, eta_std)

        # Learning rates [2]
        # Use pop_size + 1 since we are also considering the current policy
        self.lr_mean = 1. if transform_returns else 1e-2
        self.lr_std = 0.6 * (
            3 + np.log(self.pop_size + 1)) / 3. / np.sqrt(self.pop_size + 1)
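
For reference, the defaults computed above (with $d$ the number of policy parameters and $n = \text{pop\_size} + 1$ candidates, including the current policy) are

$$\eta_\sigma = \frac{3 + \ln d}{5\sqrt{n}}, \qquad \alpha_\mu = \begin{cases} 1 & \text{rank-transformed returns} \\ 10^{-2} & \text{otherwise,} \end{cases} \qquad \alpha_\sigma = \frac{0.6\,(3 + \ln n)}{3\sqrt{n}}.$$
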
Example No. 10
class CEM(ParameterExploring):
    r"""
    Cross-Entropy Method (CEM)
    This implementation is basically Algorithm 3.3 in [1] with the addition of decreasing noise [2].
    CEM is closely related to PoWER. The most significant difference is that the importance samples are not kept over
    iterations, and that the covariance matrix is not scaled with the returns, thus allowing for negative returns.

    .. seealso::
        [1] P.T. de Boer, D.P. Kroese, S. Mannor, R.Y. Rubinstein, "A Tutorial on the Cross-Entropy Method",
        Annals OR, 2005
        [2] I. Szita, A. Lőrincz, "Learning Tetris Using the Noisy Cross-Entropy Method", Neural Computation, 2006
    """

    name: str = 'cem'

    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 pop_size: Optional[int],
                 num_rollouts: int,
                 num_is_samples: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 extra_expl_std_init: float = 0.,
                 extra_expl_decay_iter: int = 10,
                 full_cov: bool = False,
                 symm_sampling: bool = False,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        r"""
        Constructor

        :param save_dir: directory in which to save the snapshots, i.e. the results
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling,
                               indirectly specifies the performance quantile $1 - \rho$ [1]
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param extra_expl_std_init: additional standard deviation for the parameter exploration added to the diagonal
                                    entries of the covariance matrix; set to 0 to disable this functionality
        :param extra_expl_decay_iter: limit for the linear decay of the additional standard deviation, i.e. last
                                      iteration in which the additional exploration noise is applied
        :param full_cov: pass `True` to compute a full covariance matrix for sampling the next policy parameter values,
                         else a diagonal covariance is used
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not extra_expl_std_init >= 0:
            raise pyrado.ValueErr(given=extra_expl_std_init, ge_constraint='0')
        if not extra_expl_decay_iter > 0:
            raise pyrado.ValueErr(given=extra_expl_decay_iter,
                                  g_constraint='0')

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        if not num_is_samples <= pop_size:
            raise pyrado.ValueErr(given=num_is_samples, le_constraint=pop_size)
        self.num_is_samples = int(num_is_samples)

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=full_cov,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Optionally add additional entropy
        self.extra_expl_decay_iter = extra_expl_decay_iter
        if isinstance(self._expl_strat.noise, DiagNormalNoise):
            self.extra_expl_std_init = to.ones_like(
                self._policy.param_values) * extra_expl_std_init
        elif isinstance(self._expl_strat.noise, FullNormalNoise):
            self.extra_expl_std_init = to.eye(
                self._policy.num_param) * extra_expl_std_init
        else:
            raise pyrado.TypeErr(
                msg='Additional exploration entropy is only implemented for Gaussian distributions, '
                    'i.e. DiagNormalNoise and FullNormalNoise')

    @to.no_grad()
    def update(self,
               param_results: ParameterSamplingResult,
               ret_avg_curr: float = None):
        # Average the return values over the rollouts
        rets_avg_ros = to.tensor(param_results.mean_returns)

        # Sort descending according to the return values and select the importance samples a.k.a. elites (see [1, p.12])
        idcs_dcs = to.argsort(rets_avg_ros, descending=True)
        idcs_dcs = idcs_dcs[:self.num_is_samples]
        rets_avg_is = rets_avg_ros[idcs_dcs]
        params_is = param_results.parameters[idcs_dcs, :]

        # Update the policy parameters to the mean of the importance samples
        self._policy.param_values = to.mean(params_is, dim=0)

        # Update the exploration covariance from the empirical variance of the importance samples
        if isinstance(self._expl_strat.noise, DiagNormalNoise):
            std_is = to.std(params_is, dim=0)
            extra_expl_std = self.extra_expl_std_init * max(
                1. - self._curr_iter / self.extra_expl_decay_iter,
                0  # see [2, p.4]
            )
            self._expl_strat.noise.adapt(std=std_is + extra_expl_std)
        elif isinstance(self._expl_strat.noise, FullNormalNoise):
            cov_is = cov(params_is, data_along_rows=True)
            extra_expl_cov = to.pow(self.extra_expl_std_init, 2) * max(
                1. - self._curr_iter / self.extra_expl_decay_iter,
                0  # see [2, p.4]
            )
            self._expl_strat.noise.adapt(cov=cov_is + extra_expl_cov)

        # Logging
        self.logger.add_value('median imp samp return', to.median(rets_avg_is),
                              4)
        self.logger.add_value('min imp samp return', to.min(rets_avg_is), 4)
        self.logger.add_value('min expl strat std',
                              to.min(self._expl_strat.std), 4)
        self.logger.add_value('avg expl strat std',
                              to.mean(self._expl_strat.std), 4)
        self.logger.add_value('max expl strat std',
                              to.max(self._expl_strat.std), 4)
        self.logger.add_value('expl strat entropy',
                              self._expl_strat.get_entropy(), 4)
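
The update above amounts to keeping the `num_is_samples` best candidates (the elites) and refitting the sampling distribution to them, plus an optional linearly decaying extra exploration term. Below is a minimal, self-contained sketch of that step on random dummy data; every number is made up for illustration and only PyTorch is assumed, so this is not the library's code.

import torch as to

# Dummy population: 20 parameter vectors of dimension 5 with their averaged returns
pop_size, num_param, num_is_samples = 20, 5, 8
params = to.randn(pop_size, num_param)
rets = to.randn(pop_size)

# Keep the elites, i.e. the candidates with the highest returns
idcs_dcs = to.argsort(rets, descending=True)[:num_is_samples]
params_is = params[idcs_dcs, :]

# Refit a diagonal Gaussian to the elites
mean_new = to.mean(params_is, dim=0)
std_new = to.std(params_is, dim=0)

# Extra exploration noise, linearly decayed until extra_expl_decay_iter is reached
curr_iter, extra_expl_decay_iter, extra_expl_std_init = 3, 10, 0.5
extra_expl_std = extra_expl_std_init * max(1. - curr_iter / extra_expl_decay_iter, 0)
std_new = std_new + extra_expl_std

print(mean_new, std_new)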
Example No. 11
class NES(ParameterExploring):
    """
    Simplified variant of Natural Evolution Strategies (NES)

    .. seealso::
        [1] D. Wierstra, T. Schaul, T. Glasmachers, Y. Sun, J. Peters, J. Schmidhuber, "Natural Evolution Strategies",
        JMLR, 2014

        [2] This implementation was inspired by https://github.com/pybrain/pybrain/blob/master/pybrain/optimization/distributionbased/snes.py
    """

    name: str = 'nes'

    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 num_rollouts: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 pop_size: int = None,
                 eta_mean: float = 1.,
                 eta_std: float = None,
                 symm_sampling: bool = False,
                 transform_returns: bool = True,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param pop_size: number of solutions in the population
        :param eta_mean: step size factor for the mean
        :param eta_std: step size factor for the standard deviation
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param transform_returns: use a rank-transformation of the returns to update the policy
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        # Call ParameterExploring's constructor
        super().__init__(save_dir,
                         env,
                         policy,
                         max_iter,
                         num_rollouts,
                         pop_size=pop_size,
                         num_workers=num_workers,
                         logger=logger)

        # Store the inputs
        self.transform_returns = transform_returns

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            # Symmetric buffer needs to have an even number of samples
            if self.pop_size % 2 != 0:
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Utility coefficients (ignored for transform_returns = False)
        # Use pop_size + 1 since we are also considering the current policy
        eta_std = eta_std if eta_std is not None else (
            3 + np.log(policy.num_param)) / np.sqrt(self.pop_size + 1) / 5.
        self.eta_mean_util, self.eta_std_util = self.compute_utilities(
            self.pop_size + 1, eta_mean, eta_std)

        # Learning rates [2]
        # Use pop_size + 1 since we are also considering the current policy
        self.lr_mean = 1. if transform_returns else 1e-2
        self.lr_std = 0.6 * (
            3 + np.log(self.pop_size + 1)) / 3. / np.sqrt(self.pop_size + 1)

    @staticmethod
    def compute_utilities(pop_size: Optional[int], eta_mean: float,
                          eta_std: float):
        """
        Compute the utilities as described in section 3.1 of [1] (a.k.a. Hansen ranking with uniform baseline)

        :param pop_size: number of solutions in the population
        :param eta_mean: step size factor for the mean
        :param eta_std: step size factor for the standard deviation
        :return: utility coefficient for the mean, and utility coefficient for the standard deviation
        """
        # Compute common utility vector
        log_half = np.log(pop_size / 2. + 1)
        log_k = np.log(np.arange(1, pop_size + 1))
        num = np.maximum(0, log_half - log_k)
        utils = num / np.sum(num) - 1. / pop_size

        # Convert to PyTorch tensors
        eta_mean_util = to.from_numpy(eta_mean * utils).to(
            to.get_default_dtype())
        eta_std_util = to.from_numpy(eta_std / 2. * utils).to(
            to.get_default_dtype())
        return eta_mean_util, eta_std_util

    def update(self,
               param_results: ParameterSamplingResult,
               ret_avg_curr: float = None):
        # Average the return values over the rollouts
        rets_avg_ros = param_results.mean_returns

        # Get the perturbations (deltas from the current policy parameters)
        s = param_results.parameters - self._policy.param_values
        # also divide by the standard deviation to fully standardize
        s /= self._expl_strat.std

        if self.transform_returns:
            # Sort descending according to the return values, i.e. the best sample comes first (matching the utilities)
            idcs_acs = np.argsort(rets_avg_ros)[::-1]
            s_asc = s[list(idcs_acs), :]

            # Update the mean (see [1, 2])
            delta_mean = self._expl_strat.std * (self.eta_mean_util @ s_asc)
            self._policy.param_values += self.lr_mean * delta_mean

            # Update the std (see [1, 2])
            grad_std = self.eta_std_util @ (s_asc**2 - 1.)
            new_std = self._expl_strat.std * to.exp(
                self.lr_std * grad_std / 2.)
            self._expl_strat.adapt(std=new_std)

        else:
            # Standardize averaged returns over all pop_size rollouts
            rets_stdized = standardize(rets_avg_ros)
            rets_stdized = to.from_numpy(rets_stdized).to(
                to.get_default_dtype())

            # delta_mean = 1./len(param_results) * (rets_stdized @ s)
            delta_mean = 1. / (self._expl_strat.std *
                               len(param_results)) * (rets_stdized @ s)
            self._policy.param_values += self.lr_mean * delta_mean

            # Update the std (monotonic exponential decay)
            new_std = self._expl_strat.std * 0.999**self._curr_iter
            self._expl_strat.adapt(std=new_std)

        self.logger.add_value('min expl strat std',
                              to.min(self._expl_strat.std), 4)
        self.logger.add_value('avg expl strat std',
                              to.mean(self._expl_strat.std), 4)
        self.logger.add_value('max expl strat std',
                              to.max(self._expl_strat.std), 4)
        self.logger.add_value('expl strat entropy',
                              self._expl_strat.get_entropy(), 4)
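
As a quick illustration of `compute_utilities()` and the rank-based mean update above, here is a standalone sketch on dummy data; the population size, exploration std, and perturbations are arbitrary, only NumPy and PyTorch are assumed, and the snippet is a sketch rather than the library's implementation.

import numpy as np
import torch as to

pop_size, num_param = 8, 4

# Hansen-ranking utilities with uniform baseline (cf. compute_utilities above)
log_half = np.log(pop_size / 2. + 1)
log_k = np.log(np.arange(1, pop_size + 1))
num = np.maximum(0, log_half - log_k)
utils = num / np.sum(num) - 1. / pop_size  # sums to 0, the best rank gets the largest weight

eta_mean = 1.
eta_mean_util = to.from_numpy(eta_mean * utils).to(to.get_default_dtype())

# Dummy standardized perturbations, already sorted from best to worst return
s_sorted = to.randn(pop_size, num_param)

# Mean update: utility-weighted combination of the perturbations, scaled back by the exploration std
expl_std = 0.5 * to.ones(num_param)
delta_mean = expl_std * (eta_mean_util @ s_sorted)
print(delta_mean)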
Example No. 12
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 eps: float,
                 gamma: float,
                 num_rollouts: int,
                 pop_size: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 symm_sampling: bool = False,
                 num_sampler_envs: int = 4,
                 num_epoch_dual: int = 1000,
                 use_map: bool = False,
                 grad_free_optim: bool = False,
                 lr_dual: float = 5e-4,
                 base_seed: int = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param eps: bound on the KL divergence between policy updates, e.g. 0.1
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param gamma: temporal discount factor; equal to 1 - reset probability
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_epoch_dual: number of epochs for the minimization of the dual function
        :param use_map: use maximum a-posteriori likelihood (`True`) or maximum likelihood (`False`) update rule
        :param grad_free_optim: use a derivative free optimizer (e.g. golden section search) or a SGD-based optimizer
        :param lr_dual: learning rate for the dual's optimizer (ignored if `grad_free_optim = True`)
        :param base_seed: seed added to all other seeds in order to make the experiments distinct but repeatable
        """
        if not isinstance(policy, LinearPolicy):
            warn('REPS is designed for linear policies only!', UserWarning)

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            base_seed=base_seed,
            num_sampler_envs=num_sampler_envs,
        )

        # Store the inputs
        self.eps = eps
        self.gamma = gamma
        self.base_seed = base_seed
        self.use_map = use_map

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        self.kappa = to.tensor([0.], requires_grad=True)  # eta = exp(kappa)
        self._exp_min = -700.
        self._exp_max = 700.

        # Dual specific
        if grad_free_optim:
            self.optim_dual = GSS(
                [{'params': self.kappa}], param_min=to.log(to.tensor([1e-4])), param_max=to.log(to.tensor([1e4]))
            )
        else:
            self.optim_dual = to.optim.Adam([{'params': self.kappa}], lr=lr_dual, eps=1e-5)
            # self.optim_dual = to.optim.SGD([{'params': self.kappa}], lr=lr_dual, momentum=0.7, weight_decay=1e-4)
        self.num_epoch_dual = num_epoch_dual
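
The constructor above parameterizes the dual variable as eta = exp(kappa) so that plain gradient steps on kappa can never make eta negative. A toy sketch of this trick follows; the quadratic loss merely stands in for the actual REPS dual and is purely illustrative.

import torch as to

kappa = to.tensor([0.], requires_grad=True)  # eta = exp(kappa) > 0 by construction
optim = to.optim.Adam([{'params': kappa}], lr=5e-2)

for _ in range(200):
    optim.zero_grad()
    eta = to.exp(kappa)
    loss = (eta - 2.)**2  # stand-in for the REPS dual, which is also convex in eta
    loss.backward()
    optim.step()

print(to.exp(kappa).item())  # close to 2, and guaranteed positive throughout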
Example No. 13
class REPS(ParameterExploring):
    """
    Episodic variant of Relative Entropy Policy Search (REPS)

    .. seealso::
        [1] J. Peters, K. Mülling, Y. Altun, "Relative Entropy Policy Search", AAAI, 2010

        [2] This implementation was inspired by https://github.com/hanyas/rl/blob/master/rl/ereps/ereps.py
    """

    name: str = 'reps'

    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 eps: float,
                 gamma: float,
                 num_rollouts: int,
                 pop_size: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 symm_sampling: bool = False,
                 num_sampler_envs: int = 4,
                 num_epoch_dual: int = 1000,
                 use_map: bool = False,
                 grad_free_optim: bool = False,
                 lr_dual: float = 5e-4,
                 base_seed: int = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param eps: bound on the KL divergence between policy updates, e.g. 0.1
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param gamma: temporal discount factor; equal to 1 - reset probability
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_epoch_dual: number of epochs for the minimization of the dual function
        :param use_map: use maximum a-posteriori likelihood (`True`) or maximum likelihood (`False`) update rule
        :param grad_free_optim: use a derivative free optimizer (e.g. golden section search) or a SGD-based optimizer
        :param lr_dual: learning rate for the dual's optimizer (ignored if `grad_free_optim = True`)
        :param base_seed: seed added to all other seeds in order to make the experiments distinct but repeatable
        """
        if not isinstance(policy, LinearPolicy):
            warn('REPS is designed for linear policies only!', UserWarning)

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            base_seed=base_seed,
            num_sampler_envs=num_sampler_envs,
        )

        # Store the inputs
        self.eps = eps
        self.gamma = gamma
        self.base_seed = base_seed
        self.use_map = use_map

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        self.kappa = to.tensor([0.], requires_grad=True)  # eta = exp(kappa)
        self._exp_min = -700.
        self._exp_max = 700.

        # Dual specific
        if grad_free_optim:
            self.optim_dual = GSS(
                [{'params': self.kappa}], param_min=to.log(to.tensor([1e-4])), param_max=to.log(to.tensor([1e4]))
            )
        else:
            self.optim_dual = to.optim.Adam([{'params': self.kappa}], lr=lr_dual, eps=1e-5)
            # self.optim_dual = to.optim.SGD([{'params': self.kappa}], lr=lr_dual, momentum=0.7, weight_decay=1e-4)
        self.num_epoch_dual = num_epoch_dual

    @property
    def eta(self) -> to.Tensor:
        r""" Get $\eta = e^{\kappa}$. """
        return to.exp(self.kappa)

    def weights(self, rets: to.Tensor) -> to.Tensor:
        """
        Compute the weights which are used to weight the policy samples by their return

        :param rets: return values per policy sample after averaging over multiple rollouts using the same policy
        :return: weights of the policy parameter samples
        """
        shifted_rets = rets - to.max(rets)
        return to.exp(to.clamp(shifted_rets / self.eta, self._exp_min, self._exp_max))

    def dual(self, rets: to.Tensor) -> to.Tensor:
        """
        Compute the REPS dual function value.

        :param rets: return values per policy sample after averaging over multiple rollouts using the same policy
        :return: dual loss value
        """
        w = self.weights(rets)
        return self.eta * self.eps + to.max(rets) + self.eta * to.log(to.mean(w))

    def policy_dual(self, param_samples: to.Tensor, w: to.Tensor) -> to.Tensor:
        """
        Compute the REPS policy-dual function value.

        :param param_samples: all sampled policy parameters
        :param w: sample weights
        :return: dual loss value
        """
        distr_old = MultivariateNormal(self._policy.param_values, self._expl_strat.cov)
        self.wml(param_samples, w, eta=self.eta)

        distr_new = MultivariateNormal(self._policy.param_values, self._expl_strat.cov)
        logprobs = distr_new.log_prob(param_samples)
        kl_e = kl_divergence(distr_new, distr_old)  # mode seeking a.k.a. exclusive KL

        return w @ logprobs + self.eta * (self.eps - kl_e)

    def minimize(self,
                 loss_fcn: Callable,
                 rets: to.Tensor = None,
                 param_samples: to.Tensor = None,
                 w: to.Tensor = None):
        """
        Minimize the given dual function. Iterate num_epoch_dual times.

        :param loss_fcn: function to minimize
        :param rets: return values per policy sample after averaging over multiple rollouts using the same policy
        :param param_samples: all sampled policy parameters
        :param w: sample weights
        """
        if isinstance(self.optim_dual, GSS):
            self.optim_dual.reset()

        for _ in tqdm(range(self.num_epoch_dual), total=self.num_epoch_dual, desc=f'Minimizing dual', unit='epochs',
                      file=sys.stdout, leave=False):
            if not isinstance(self.optim_dual, GSS):
                # Reset the gradients
                self.optim_dual.zero_grad()

            # Compute value function loss
            if rets is not None and param_samples is None and w is None:
                loss = loss_fcn(rets)  # dual
            elif rets is None and param_samples is not None and w is not None:
                loss = loss_fcn(param_samples, w)  # policy dual
            else:
                raise NotImplementedError

            # Update the parameter
            if isinstance(self.optim_dual, GSS):
                if rets is not None and param_samples is None and w is None:
                    self.optim_dual.step(closure=functools.partial(loss_fcn, rets=rets))
                elif rets is None and param_samples is not None and w is not None:
                    self.optim_dual.step(closure=functools.partial(loss_fcn, param_samples=param_samples, w=w))
                else:
                    raise NotImplementedError

            else:
                loss.backward()
                self.optim_dual.step()

        if to.isnan(self.kappa):
            raise RuntimeError(f"The dual's optimization parameter kappa became NaN!")

    def wml(self, param_samples: to.Tensor, w: to.Tensor, eta: to.Tensor = to.tensor([0.])):
        """
        Weighted maximum likelihood update of the policy's mean and the exploration strategy's covariance

        :param param_samples: all sampled policy parameters
        :param w: sample weights
        :param eta: dual parameters
        """
        mean_old = self._policy.param_values.clone()
        cov_old = self._expl_strat.cov.clone()

        # Update the mean
        self._policy.param_values = (eta * mean_old + to.sum(w.view(-1, 1) * param_samples, dim=0)) / (to.sum(w) + eta)
        param_values_delta = self._policy.param_values - mean_old

        # Difference between all sampled policy parameters and the updated policy
        diff = param_samples - self._policy.param_values
        w_diff = to.einsum('nk,n,nh->kh', diff, w, diff)  # outer product of scaled diff, then sum over all samples

        # Update the covariance
        cov_new = (w_diff + eta * cov_old + eta * to.einsum('k,h->kh', param_values_delta, param_values_delta)
                   ) / (to.sum(w) + eta)
        self._expl_strat.adapt(cov=cov_new)

    def wmap(self, param_samples: to.Tensor, w: to.Tensor):
        """
        Weighted maximum a-posteriori likelihood update of the policy's mean and the exploration strategy's covariance

        :param param_samples: all sampled policy parameters
        :param w: sample weights
        """
        # Optimize for eta
        self.minimize(self.policy_dual, param_samples=param_samples, w=w.detach())
        # Update policy parameters
        self.wml(param_samples, w.detach(), eta=self.eta)

    def update(self, param_results: ParameterSamplingResult, ret_avg_curr: float = None):
        # Average the return values over the rollouts
        rets_avg_ros = param_results.mean_returns
        rets_avg_ros = to.from_numpy(rets_avg_ros)

        # Reset dual's parameter
        self.kappa.data.fill_(0.)

        # Dual
        with to.no_grad():
            distr_old = MultivariateNormal(self._policy.param_values, self._expl_strat.cov)
            loss = self.dual(rets_avg_ros)
            self.logger.add_value('dual loss before', loss.item())

        self.minimize(self.dual, rets=rets_avg_ros)

        with to.no_grad():
            loss = self.dual(rets_avg_ros)
            self.logger.add_value('dual loss after', loss.item())
            self.logger.add_value('eta', self.eta.item())

        # Compute the weights using the optimized eta
        w = self.weights(rets_avg_ros)

        # Update the policy's mean and the exploration strategy's covariance
        if self.use_map:
            self.wmap(param_results.parameters, w)
        else:
            self.wml(param_results.parameters, w)

        # Logging
        distr_new = MultivariateNormal(self._policy.param_values, self._expl_strat.cov)
        kl_e = kl_divergence(distr_new, distr_old)  # mode seeking a.k.a. exclusive KL
        kl_i = kl_divergence(distr_old, distr_new)  # mean seeking a.k.a. inclusive KL
        self.logger.add_value('min expl strat std', to.min(self._expl_strat.std))
        self.logger.add_value('avg expl strat std', to.mean(self._expl_strat.std.data).detach().numpy())
        self.logger.add_value('max expl strat std', to.max(self._expl_strat.std))
        self.logger.add_value('expl strat entropy', self._expl_strat.get_entropy().item())
        self.logger.add_value('KL(new_old)', kl_e.item())
        self.logger.add_value('KL(old_new)', kl_i.item())
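
The core of wml() above is a weighted mean and a weighted covariance of the parameter samples, both regularized towards the old solution by eta. A compact, self-contained sketch with dummy samples and exponential weights follows; all values are illustrative and only PyTorch is assumed.

import torch as to

num_samples, num_param = 16, 3
param_samples = to.randn(num_samples, num_param)
rets = to.randn(num_samples)
eta = to.tensor(1.)

# Return-based weights as in weights(): shifted exponential, numerically clamped
w = to.exp(to.clamp((rets - to.max(rets)) / eta, -700., 700.))

mean_old = to.zeros(num_param)
cov_old = to.eye(num_param)

# Weighted mean, regularized towards the old mean by eta
mean_new = (eta * mean_old + to.sum(w.view(-1, 1) * param_samples, dim=0)) / (to.sum(w) + eta)

# Weighted covariance of the deviations from the new mean, regularized towards the old covariance
diff = param_samples - mean_new
w_diff = to.einsum('nk,n,nh->kh', diff, w, diff)  # sum of weighted outer products
mean_delta = mean_new - mean_old
cov_new = (w_diff + eta * cov_old + eta * to.einsum('k,h->kh', mean_delta, mean_delta)) / (to.sum(w) + eta)

print(mean_new)
print(cov_new)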
Example No. 14
    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env: Env,
        policy: Policy,
        max_iter: int,
        eps: float,
        num_init_states_per_domain: int,
        pop_size: Optional[int],
        expl_std_init: float,
        expl_std_min: float = 0.01,
        num_domains: int = 1,
        symm_sampling: bool = False,
        softmax_transform: bool = False,
        use_map: bool = True,
        optim_mode: Optional[str] = "scipy",
        num_epoch_dual: int = 1000,
        lr_dual: float = 5e-4,
        num_workers: int = 4,
        logger: Optional[StepLogger] = None,
    ):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param eps: bound on the KL divergence between policy updates, e.g. 0.1
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_init_states_per_domain: number of rollouts to cover the variance over initial states
        :param num_domains: number of rollouts due to the variance over domain parameters
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param softmax_transform: pass `True` to use a softmax to transform the returns, else use a shifted exponential
        :param use_map: use maximum a-posteriori likelihood (`True`) or maximum likelihood (`False`) update rule
        :param optim_mode: choose the type of optimizer: 'torch' for a SGD-based optimizer or 'scipy' for the SLSQP
                           optimizer from scipy (recommended)
        :param num_epoch_dual: number of epochs for the minimization of the dual functions, ignored if
                               `optim_mode = 'scipy'`
        :param lr_dual: learning rate for the dual's optimizer, ignored if `optim_mode = 'scipy'`
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(policy, (LinearPolicy, DomainDistrParamPolicy)):
            print_cbt_once("REPS was designed for linear policies.", "y")

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir=save_dir,
            env=env,
            policy=policy,
            max_iter=max_iter,
            num_init_states_per_domain=num_init_states_per_domain,
            num_domains=num_domains,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        # Store the inputs
        self.eps = eps
        self.softmax_transform = softmax_transform
        self.use_map = use_map

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
            use_cuda=self._policy.device != "cpu",
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Dual optimization
        self.num_epoch_dual = num_epoch_dual
        self._log_eta = to.tensor([0.0], requires_grad=True)
        self.optim_mode = optim_mode.lower()
        if self.optim_mode == "scipy":
            pass
        elif self.optim_mode == "torch":
            self.optim_dual = to.optim.SGD([{"params": self._log_eta}], lr=lr_dual, momentum=0.8, weight_decay=1e-4)
            # self.optim_dual = to.optim.Adam([{'params': self._log_eta}], lr=lr_dual, eps=1e-5)  # used in [2], but unstable here
        else:
            raise pyrado.ValueErr(given=optim_mode, eq_constraint=["scipy", "torch"])
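
For `optim_mode = 'scipy'`, the dual is handed to scipy's SLSQP with a positivity bound on eta, as in REPS.minimize() further below. Here is a minimal sketch with a hand-rolled logmeanexp and dummy returns; SLSQP's finite-difference gradient is used instead of the get_grad_via_torch helper, and all numbers are illustrative.

import numpy as np
from scipy import optimize

eps = 0.1  # KL bound
rets = np.random.randn(20)  # dummy averaged returns

def logmeanexp(x):
    # Numerically stable log(mean(exp(x)))
    m = np.max(x)
    return m + np.log(np.mean(np.exp(x - m)))

def dual_evaluation(eta, rets):
    # REPS evaluation dual: eta * eps + eta * logmeanexp(rets / eta)
    return eta * eps + eta * logmeanexp(rets / eta)

res = optimize.minimize(
    lambda x: dual_evaluation(x[0], rets),
    x0=np.array([1.0]),
    method='SLSQP',
    bounds=((1e-8, 1e8),),
)
print(res['x'])  # optimized eta, guaranteed to respect the positivity bound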
Example No. 15
class REPS(ParameterExploring):
    """
    Episodic variant of Relative Entropy Policy Search (REPS)

    .. note::
        REPS was designed for linear policies.

    .. seealso::
        [1] J. Peters, K. Mülling, Y. Altun, "Relative Entropy Policy Search", AAAI, 2010
        [2] A. Abdolmaleki, J.T. Springenberg, J. Degrave, S. Bohez, Y. Tassa, D. Belov, N. Heess, M. Riedmiller,
            "Relative Entropy Regularized Policy Iteration", arXiv, 2018
        [3] This implementation is inspired by the work of H. Abdulsamad
            https://github.com/hanyas/rl/blob/master/rl/ereps/ereps.py
    """

    name: Optional[str] = "reps"

    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env: Env,
        policy: Policy,
        max_iter: int,
        eps: float,
        num_init_states_per_domain: int,
        pop_size: Optional[int],
        expl_std_init: float,
        expl_std_min: float = 0.01,
        num_domains: int = 1,
        symm_sampling: bool = False,
        softmax_transform: bool = False,
        use_map: bool = True,
        optim_mode: Optional[str] = "scipy",
        num_epoch_dual: int = 1000,
        lr_dual: float = 5e-4,
        num_workers: int = 4,
        logger: Optional[StepLogger] = None,
    ):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots i.e. the results in
        :param env: the environment which the policy operates
        :param policy: policy to be updated
        :param eps: bound on the KL divergence between policy updates, e.g. 0.1
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_init_states_per_domain: number of rollouts to cover the variance over initial states
        :param num_domains: number of rollouts due to the variance over domain parameters
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param softmax_transform: pass `True` to use a softmax to transform the returns, else use a shifted exponential
        :param use_map: use maximum a-posteriori likelihood (`True`) or maximum likelihood (`False`) update rule
        :param optim_mode: choose the type of optimizer: 'torch' for a SGD-based optimizer or 'scipy' for the SLSQP
                           optimizer from scipy (recommended)
        :param num_epoch_dual: number of epochs for the minimization of the dual functions, ignored if
                               `optim_mode = 'scipy'`
        :param lr_dual: learning rate for the dual's optimizer, ignored if `optim_mode = 'scipy'`
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(policy, (LinearPolicy, DomainDistrParamPolicy)):
            print_cbt_once("REPS was designed for linear policies.", "y")

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir=save_dir,
            env=env,
            policy=policy,
            max_iter=max_iter,
            num_init_states_per_domain=num_init_states_per_domain,
            num_domains=num_domains,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        # Store the inputs
        self.eps = eps
        self.softmax_transform = softmax_transform
        self.use_map = use_map

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
            use_cuda=self._policy.device != "cpu",
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Dual optimization
        self.num_epoch_dual = num_epoch_dual
        self._log_eta = to.tensor([0.0], requires_grad=True)
        self.optim_mode = optim_mode.lower()
        if self.optim_mode == "scipy":
            pass
        elif self.optim_mode == "torch":
            self.optim_dual = to.optim.SGD([{"params": self._log_eta}], lr=lr_dual, momentum=0.8, weight_decay=1e-4)
            # self.optim_dual = to.optim.Adam([{'params': self._log_eta}], lr=lr_dual, eps=1e-5)  # used in [2], but unstable here
        else:
            raise pyrado.ValueErr(given=optim_mode, eq_constraint=["scipy", "torch"])

    @property
    def eta(self) -> to.Tensor:
        r"""Get the Lagrange multiplier $\eta$. In [2], $/eta$ is called $/alpha$."""
        return to.exp(self._log_eta)

    def weights(self, rets: to.Tensor) -> to.Tensor:
        """
        Compute the weights which are used to weight the policy samples by their return.
        As stated in [2, sec 4.1], we could calculate weights using any rank preserving transformation.

        :param rets: return values per policy sample after averaging over multiple rollouts using the same policy
        :return: weights of the policy parameter samples
        """
        if self.softmax_transform:
            # Do softmax transform (softmax from PyTorch is already numerically stable)
            return to.softmax(rets / self.eta, dim=0)
        else:
            # Do numerically stabilized exp transform
            return to.exp(to.clamp((rets - to.max(rets)) / self.eta, min=-700.0))

    def dual_evaluation(
        self, eta: Union[to.Tensor, np.ndarray], rets: Union[to.Tensor, np.ndarray]
    ) -> Union[to.Tensor, np.ndarray]:
        """
        Compute the REPS dual function value for policy evaluation.

        :param eta: lagrangian multiplier (optimization variable of the dual)
        :param rets: return values per policy sample after averaging over multiple rollouts using the same policy
        :return: dual loss value
        """
        if not (
            isinstance(eta, to.Tensor)
            and isinstance(rets, to.Tensor)
            or isinstance(eta, np.ndarray)
            and isinstance(rets, np.ndarray)
        ):
            raise pyrado.TypeErr(msg="")
        return eta * self.eps + eta * logmeanexp(rets / eta)

    def dual_improvement(
        self, eta: Union[to.Tensor, np.ndarray], param_samples: to.Tensor, w: to.Tensor
    ) -> Union[to.Tensor, np.ndarray]:
        """
        Compute the REPS dual function value for policy improvement.

        :param eta: lagrangian multiplier (optimization variable of the dual)
        :param param_samples: all sampled policy parameters
        :param w: weights of the policy parameter samples
        :return: dual loss value
        """
        # The sample weights have been computed by minimizing dual_evaluation, don't track the gradient twice
        assert w.requires_grad is False

        with to.no_grad():
            distr_old = MultivariateNormal(self._policy.param_values, self._expl_strat.cov.data)

            if self.optim_mode == "scipy" and not isinstance(eta, to.Tensor):
                # We can arrive here during the 'normal' REPS routine, but also when computing the gradient (jac) for
                # the scipy optimizer. In the latter case, eta is already a tensor.
                eta = to.from_numpy(eta).to(to.get_default_dtype())
            self.wml(eta, param_samples, w)

            distr_new = MultivariateNormal(self._policy.param_values, self._expl_strat.cov.data)
            logprobs = distr_new.log_prob(param_samples)
            kl = kl_divergence(distr_new, distr_old)  # mode seeking a.k.a. exclusive KL

        if self.optim_mode == "scipy":
            loss = w.numpy() @ logprobs.numpy() + eta * (self.eps - kl.numpy())
        else:
            loss = w @ logprobs + eta * (self.eps - kl)
        return loss

    def minimize(
        self, loss_fcn: Callable, rets: to.Tensor = None, param_samples: to.Tensor = None, w: to.Tensor = None
    ):
        """
        Minimize the given dual function. This function can be called for the dual evaluation loss or the dual
        improvement loss.

        :param loss_fcn: function to minimize, different for `wml()` and `wmap()`
        :param rets: return values per policy sample after averaging over multiple rollouts using the same policy
        :param param_samples: all sampled policy parameters
        :param w: weights of the policy parameter samples
        """
        if self.optim_mode == "scipy":
            # Use scipy optimizers
            if loss_fcn == self.dual_evaluation:
                res = optimize.minimize(
                    partial(self.dual_evaluation, rets=rets.numpy()),
                    jac=partial(get_grad_via_torch, fcn_to=partial(self.dual_evaluation, rets=rets)),
                    x0=np.array([1.0]),
                    method="SLSQP",
                    bounds=((1e-8, 1e8),),
                )
            elif loss_fcn == self.dual_improvement:
                res = optimize.minimize(
                    partial(self.dual_improvement, param_samples=param_samples, w=w),
                    jac=partial(
                        get_grad_via_torch, fcn_to=partial(self.dual_improvement, param_samples=param_samples, w=w)
                    ),
                    x0=np.array([1.0]),
                    method="SLSQP",
                    bounds=((1e-8, 1e8),),
                )
            else:
                raise pyrado.TypeErr(msg="Received an improper loss function in REPS.minimize()!")

            eta = to.from_numpy(res["x"]).to(to.get_default_dtype())
            self._log_eta = to.log(eta)

        else:
            for _ in tqdm(
                range(self.num_epoch_dual),
                total=self.num_epoch_dual,
                desc=f"Minimizing dual",
                unit="epochs",
                file=sys.stdout,
                leave=False,
            ):
                # Use PyTorch optimizers
                self.optim_dual.zero_grad()
                if loss_fcn == self.dual_evaluation:
                    loss = self.dual_evaluation(self.eta, rets)
                elif loss_fcn == self.dual_improvement:
                    loss = self.dual_improvement(self.eta, param_samples, w)
                else:
                    raise pyrado.TypeErr(msg="Received an improper loss function in REPS.minimize()!")
                loss.backward()
                self.optim_dual.step()

        if to.isnan(self._log_eta):
            raise RuntimeError(f"The dual's optimization parameter _log_eta became NaN!")

    def wml(self, eta: to.Tensor, param_samples: to.Tensor, w: to.Tensor):
        """
        Weighted maximum likelihood update of the policy's mean and the exploration strategy's covariance

        :param eta: lagrangian multiplier (optimization variable of the dual)
        :param param_samples: all sampled policy parameters
        :param w: weights of the policy parameter samples
        """
        mean_old = self._policy.param_values.clone()
        cov_old = self._expl_strat.cov.clone()

        # Update the mean
        w_sum_param_samples = to.einsum("k,kh->h", w, param_samples)
        self._policy.param_values = (eta * mean_old + w_sum_param_samples) / (to.sum(w) + eta)
        param_values_delta = self._policy.param_values - mean_old

        # Difference between all sampled policy parameters and the updated policy
        diff = param_samples - self._policy.param_values
        w_diff = to.einsum("nk,n,nh->kh", diff, w, diff)  # outer product of scaled diff, then sum over all samples

        # Update the covariance
        cov_new = (w_diff + eta * cov_old + eta * to.einsum("k,h->kh", param_values_delta, param_values_delta)) / (
            to.sum(w) + eta
        )
        self._expl_strat.adapt(cov=cov_new)

    def wmap(self, param_samples: to.Tensor, w: to.Tensor):
        """
        Weighted maximum a-posteriori likelihood update of the policy's mean and the exploration strategy's covariance

        :param param_samples: all sampled policy parameters
        :param w: weights of the policy parameter samples
        """
        # Optimize eta according to the policy's dual function to satisfy the KL constraint
        self.minimize(self.dual_improvement, param_samples=param_samples, w=w.detach())

        # Update the policy's and exploration strategy's parameters
        self.wml(self.eta, param_samples, w.detach())

    def update(self, param_results: ParameterSamplingResult, ret_avg_curr: Optional[float] = None):
        # Average the return values over the rollouts
        rets_avg_ros = param_results.mean_returns
        rets_avg_ros = to.from_numpy(rets_avg_ros).to(to.get_default_dtype())

        with to.no_grad():
            distr_old = MultivariateNormal(self._policy.param_values, self._expl_strat.cov.data)
            loss = self.dual_evaluation(self.eta, rets_avg_ros)
            self.logger.add_value("dual loss before", loss, 4)

        # Reset dual's parameter
        self._log_eta.data.fill_(0.0)

        # Optimize eta
        self.minimize(self.dual_evaluation, rets=rets_avg_ros)

        with to.no_grad():
            loss = self.dual_evaluation(self.eta, rets_avg_ros)
            self.logger.add_value("dual loss after", loss, 4)
            self.logger.add_value("eta", self.eta, 4)

        # Compute the weights using the optimized eta
        w = self.weights(rets_avg_ros)

        # Update the policy's mean and the exploration strategy's covariance
        if self.use_map:
            self.wmap(param_results.parameters, w)  # optimizes eta on the improvement dual, then calls self.wml()
        else:
            self.wml(self.eta, param_results.parameters, w)

        # Logging
        distr_new = MultivariateNormal(self._policy.param_values, self._expl_strat.cov.data)
        kl_e = kl_divergence(distr_new, distr_old)  # mode seeking a.k.a. exclusive KL
        kl_i = kl_divergence(distr_old, distr_new)  # mean seeking a.k.a. inclusive KL
        self.logger.add_value("min expl strat std", to.min(self._expl_strat.std), 4)
        self.logger.add_value("avg expl strat std", to.mean(self._expl_strat.std), 4)
        self.logger.add_value("max expl strat std", to.max(self._expl_strat.std), 4)
        self.logger.add_value("expl strat entropy", self._expl_strat.get_entropy(), 4)
        self.logger.add_value("KL(new_old)", kl_e, 6)
        self.logger.add_value("KL(old_new)", kl_i, 6)