Example no. 1
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 pop_size: Optional[int],
                 num_rollouts: int,
                 num_is_samples: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 symm_sampling: bool = False,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param pop_size: number of solutions in the population
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_rollouts: number of rollouts per policy sample
        :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(policy, LinearPolicy):
            print_cbt_once('PoWER was designed for linear policies.', 'y')

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Initialize memory for importance sampling
        self.num_is_samples = min(pop_size, num_is_samples)
        self.is_mem_ret = 1e-6 * to.ones(
            self.num_is_samples
        )  # has to be initialized > 0 due to first covariance update
        self.is_mem_params = to.zeros(self.num_is_samples,
                                      self._policy.num_param)
        self.is_mem_W = to.zeros(self.num_is_samples, self._policy.num_param,
                                 self._policy.num_param)
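
A minimal usage sketch for the constructor above. The snippet does not name its class; the warning suggests a PoWER implementation, so the class name, the `train()` call, and the pre-built `env` and `policy` objects below are assumptions, not the library's confirmed API.

# Hypothetical usage -- class name, env, and policy are placeholders taken on assumption.
algo = PoWER(                      # whatever class defines the __init__ above
    save_dir="/tmp/power_demo",    # where snapshots are written
    env=env,                       # an existing Env instance
    policy=policy,                 # ideally a LinearPolicy, see the warning in the constructor
    max_iter=50,                   # number of policy updates
    pop_size=100,                  # candidate parameter sets per iteration
    num_rollouts=4,                # rollouts per candidate
    num_is_samples=20,             # samples kept for importance sampling
    expl_std_init=1.0,             # initial exploration standard deviation
    symm_sampling=True,            # mirrored perturbations, pop_size is forced to be even
)
algo.train()                       # assumed entry point provided by the base algorithm class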
Example no. 2
    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env: Env,
        policy: Policy,
        max_iter: int,
        pop_size: Optional[int],
        num_init_states_per_domain: int,
        num_is_samples: int,
        expl_std_init: float,
        expl_std_min: float = 0.01,
        num_domains: int = 1,
        symm_sampling: bool = False,
        num_workers: int = 4,
        logger: Optional[StepLogger] = None,
    ):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param pop_size: number of solutions in the population
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_init_states_per_domain: number of rollouts to cover the variance over initial states
        :param num_domains: number of rollouts due to the variance over domain parameters
        :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        # Call ParameterExploring's constructor
        super().__init__(
            save_dir=save_dir,
            env=env,
            policy=policy,
            max_iter=max_iter,
            num_init_states_per_domain=num_init_states_per_domain,
            num_domains=num_domains,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
            use_cuda=policy.device != "cpu",
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Initialize memory for importance sampling
        self._bound_lo_ret = 1e-3  # the returns must not be negative, clip them to this value if so
        self.num_is_samples = min(pop_size, num_is_samples)
        self.is_mem_ret = self._bound_lo_ret * to.ones(
            self.num_is_samples
        )  # has to be initialized > 0 due to first covariance update
        self.is_mem_params = to.zeros(self.num_is_samples,
                                      self._policy.num_param)
        self.is_mem_W = to.zeros(self.num_is_samples, self._policy.num_param,
                                 self._policy.num_param)
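
The `symm_sampling` branch forces an even population because symmetric exploration draws perturbations in mirrored pairs. A self-contained sketch of that idea, not the actual `SymmParamExplStrat` implementation, looks like this:

import torch as to

def sample_symmetric_population(mean: to.Tensor, std: to.Tensor, pop_size: int) -> to.Tensor:
    """Illustrative mirrored sampling: every noise vector eps is paired with -eps."""
    assert pop_size % 2 == 0, "mirrored sampling needs an even population size"
    half = pop_size // 2
    eps = to.randn(half, mean.numel()) * std    # first half of the perturbations
    eps = to.cat([eps, -eps], dim=0)            # mirror them to obtain the second half
    return mean.unsqueeze(0) + eps              # one candidate parameter set per row

candidates = sample_symmetric_population(to.zeros(8), to.tensor(1.0), pop_size=10)
print(candidates.shape)                         # torch.Size([10, 8])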
Example no. 3
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 num_rollouts: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 pop_size: Optional[int] = None,
                 clip_ratio_std: float = 0.05,
                 normalize_update: bool = False,
                 transform_returns: bool = True,
                 lr: float = 5e-4,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param clip_ratio_std: maximal ratio for the change of the exploration strategy's standard deviation
        :param transform_returns: use a rank-transformation of the returns to update the policy
        :param lr: learning rate
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        # Call ParameterExploring's constructor
        super().__init__(save_dir,
                         env,
                         policy,
                         max_iter,
                         num_rollouts,
                         pop_size=pop_size,
                         num_workers=num_workers,
                         logger=logger)

        # Store the inputs
        self.clip_ratio_std = clip_ratio_std
        self.normalize_update = normalize_update
        self.transform_returns = transform_returns
        self.lr = lr

        # Exploration strategy based on symmetrical normally distributed noise
        if self.pop_size % 2 != 0:
            # Symmetric buffer needs to have an even number of samples
            self.pop_size += 1
        self._expl_strat = SymmParamExplStrat(
            NormalParamNoise(
                self._policy.num_param,
                std_init=expl_std_init,
                std_min=expl_std_min,
            ))

        self.optim = to.optim.SGD(
            [{'params': self._policy.parameters()}], lr=lr, momentum=0.8, dampening=0.1
        )
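
`transform_returns=True` refers to a rank transformation of the rollout returns before the policy update. The transform itself lives elsewhere in the class; a generic version of the idea, given as an assumption rather than the library's code, is:

import torch as to

def rank_transform(returns: to.Tensor) -> to.Tensor:
    """Illustrative rank transform: replace raw returns by their ranks, scaled to [-0.5, 0.5]."""
    n = returns.numel()
    ranks = to.empty(n)
    ranks[returns.argsort()] = to.arange(n, dtype=to.get_default_dtype())
    return ranks / (n - 1) - 0.5               # best return maps to 0.5, worst to -0.5

print(rank_transform(to.tensor([3.0, -1.0, 10.0, 2.0])))   # tensor([ 0.1667, -0.5000,  0.5000, -0.1667])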
Example no. 4
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 distribution,
                 max_iter: int,
                 num_rollouts: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 pop_size: Optional[int] = None,
                 clip_ratio_std: float = 0.05,
                 normalize_update: bool = False,
                 transform_returns: bool = True,
                 num_sampler_envs: int = 4,
                 n_mc_samples_gradient: int = 1,
                 coupling: bool = True,
                 real_env: bool = False,
                 lr: float = 5e-4,
                 optim: str = 'SGD',
                 base_seed: Optional[int] = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param clip_ratio_std: maximal ratio for the change of the exploration strategy's standard deviation
        :param transform_returns: use a rank-transformation of the returns to update the policy
        :param base_seed: seed added to all other seeds in order to make the experiments distinct but repeatable
        """
        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            base_seed=base_seed,
            num_sampler_envs=num_sampler_envs,
        )

        self._distribution = distribution
        self._dims = distribution.get_number_of_dims()

        self._n_mc_samples_gradient = n_mc_samples_gradient
        self._coupling = coupling

        self._real_env = real_env

        # Store the inputs
        self.clip_ratio_std = clip_ratio_std
        self.normalize_update = normalize_update
        self.transform_returns = transform_returns
        self.lr = lr

        # Exploration strategy based on symmetrical normally distributed noise
        if self.pop_size % 2 != 0:
            # Symmetric buffer needs to have an even number of samples
            self.pop_size += 1
        self._expl_strat = SymmParamExplStrat(
            NormalParamNoise(
                self._policy.num_param,
                std_init=expl_std_init,
                std_min=expl_std_min,
            ))

        if optim == 'SGD':
            self.optim = to.optim.SGD(
                [{'params': self._policy.parameters()}], lr=lr, momentum=0.8, dampening=0.1
            )
        elif optim == 'Adam':
            # self.optim = to.optim.Adam([{'params': self._policy.parameters()}], lr=lr)
            self.optim = to.optim.Adam([{'params': self._distribution.get_params()}], lr=lr)
        else:
            raise NotImplementedError

        self._iter = 0
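
A side note on the `optim` argument: the if/elif dispatch above could also be written as a small lookup table, which keeps the error handling in one place and makes adding optimizers a one-line change. This is an illustrative alternative meant to run inside the same constructor, not the snippet's original code:

# Hypothetical drop-in replacement for the if/elif block above.
optim_factories = {
    'SGD': lambda: to.optim.SGD([{'params': self._policy.parameters()}], lr=lr, momentum=0.8, dampening=0.1),
    'Adam': lambda: to.optim.Adam([{'params': self._distribution.get_params()}], lr=lr),
}
if optim not in optim_factories:
    raise NotImplementedError(f"Unknown optimizer '{optim}'")
self.optim = optim_factories[optim]()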
Example no. 5
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 pop_size: Optional[int],
                 num_rollouts: int,
                 num_is_samples: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 extra_expl_std_init: float = 0.,
                 extra_expl_decay_iter: int = 10,
                 full_cov: bool = False,
                 symm_sampling: bool = False,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param num_is_samples: number of samples (policy parameter sets & returns) for importance sampling,
                               indirectly specifies the performance quantile $1 - \rho$ [1]
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param extra_expl_std_init: additional standard deviation for the parameter exploration added to the diagonal
                                    entries of the covariance matrix, set to 0 to disable this functionality
        :param extra_expl_decay_iter: limit for the linear decay of the additional standard deviation, i.e. last
                                      iteration in which the additional exploration noise is applied
        :param full_cov: pass `True` to compute a full covariance matrix for sampling the next policy parameter values,
                         else a diagonal covariance is used
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not extra_expl_std_init >= 0:
            raise pyrado.ValueErr(given=extra_expl_std_init, ge_constraint='0')
        if not extra_expl_decay_iter > 0:
            raise pyrado.ValueErr(given=extra_expl_decay_iter,
                                  g_constraint='0')

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        if not num_is_samples <= pop_size:
            raise pyrado.ValueErr(given=num_is_samples, le_constraint=pop_size)
        self.num_is_samples = int(num_is_samples)

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=full_cov,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Optionally add additional entropy
        self.extra_expl_decay_iter = extra_expl_decay_iter
        if isinstance(self._expl_strat.noise, DiagNormalNoise):
            self.extra_expl_std_init = to.ones_like(
                self._policy.param_values) * extra_expl_std_init
        elif isinstance(self._expl_strat.noise, FullNormalNoise):
            self.extra_expl_std_init = to.eye(
                self._policy.num_param) * extra_expl_std_init
        else:
            raise pyrado.TypeErr(
                msg='Additional exploration entropy is only implemented for Gaussian distributions, '
                    'i.e. DiagNormalNoise and FullNormalNoise')
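
The docstring describes `extra_expl_decay_iter` as the last iteration in which the additional exploration noise is applied, decaying linearly. The decay itself happens in the update step, which is not part of this snippet; one plausible reading of that schedule, written as a stand-alone helper and labeled as an assumption, is:

import torch as to

def extra_expl_std(curr_iter: int, extra_expl_std_init: to.Tensor, extra_expl_decay_iter: int) -> to.Tensor:
    """Illustrative linear decay: full extra noise at iteration 0, none from extra_expl_decay_iter onwards."""
    frac = max(1.0 - curr_iter / extra_expl_decay_iter, 0.0)
    return extra_expl_std_init * frac

init = to.ones(5) * 0.5              # e.g. the diagonal case constructed above
print(extra_expl_std(0, init, 10))   # full additional exploration
print(extra_expl_std(10, init, 10))  # decayed to zero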
Example no. 6
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 num_rollouts: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 pop_size: Optional[int] = None,
                 eta_mean: float = 1.,
                 eta_std: Optional[float] = None,
                 symm_sampling: bool = False,
                 transform_returns: bool = True,
                 num_workers: int = 4,
                 logger: Optional[StepLogger] = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param pop_size: number of solutions in the population
        :param eta_mean: step size factor for the mean
        :param eta_std: step size factor for the standard deviation
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param transform_returns: use a rank-transformation of the returns to update the policy
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        # Call ParameterExploring's constructor
        super().__init__(save_dir,
                         env,
                         policy,
                         max_iter,
                         num_rollouts,
                         pop_size=pop_size,
                         num_workers=num_workers,
                         logger=logger)

        # Store the inputs
        self.transform_returns = transform_returns

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            # Symmetric buffer needs to have an even number of samples
            if self.pop_size % 2 != 0:
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Utility coefficients (ignored for transform_returns = False)
        # Use pop_size + 1 since we are also considering the current policy
        eta_std = eta_std if eta_std is not None else (
            3 + np.log(policy.num_param)) / np.sqrt(self.pop_size + 1) / 5.
        self.eta_mean_util, self.eta_std_util = self.compute_utilities(
            self.pop_size + 1, eta_mean, eta_std)

        # Learning rates [2]
        # Use pop_size + 1 since we are also considering the current policy
        self.lr_mean = 1. if transform_returns else 1e-2
        self.lr_std = 0.6 * (
            3 + np.log(self.pop_size + 1)) / 3. / np.sqrt(self.pop_size + 1)
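
`compute_utilities` is defined elsewhere in the class. For orientation, the classic NES fitness-shaping utilities from the natural evolution strategies literature look like the sketch below; whether the class uses exactly this formula is an assumption:

import numpy as np

def nes_utilities(pop_size: int) -> np.ndarray:
    """Classic rank-based NES utilities: positive weight on the best samples, zero mean overall."""
    ranks = np.arange(1, pop_size + 1)                           # 1 = best sample
    utils = np.maximum(0.0, np.log(pop_size / 2 + 1) - np.log(ranks))
    return utils / utils.sum() - 1.0 / pop_size

u = nes_utilities(11)   # pop_size + 1, since the current policy is evaluated as well
print(u.sum())          # ~0: the utilities form a zero-mean weighting of the ranked returns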
Example no. 7
    def __init__(self,
                 save_dir: str,
                 env: Env,
                 policy: Policy,
                 max_iter: int,
                 eps: float,
                 gamma: float,
                 num_rollouts: int,
                 pop_size: int,
                 expl_std_init: float,
                 expl_std_min: float = 0.01,
                 symm_sampling: bool = False,
                 num_sampler_envs: int = 4,
                 num_epoch_dual: int = 1000,
                 use_map: bool = False,
                 grad_free_optim: bool = False,
                 lr_dual: float = 5e-4,
                 base_seed: Optional[int] = None):
        """
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param eps: bound on the KL divergence between policy updates, e.g. 0.1
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param gamma: temporal discount factor; equal to 1 - reset probability
        :param pop_size: number of solutions in the population
        :param num_rollouts: number of rollouts per policy sample
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param num_epoch_dual: number of epochs for the minimization of the dual function
        :param use_map: use maximum a-posteriori likelihood (`True`) or maximum likelihood (`False`) update rule
        :param grad_free_optim: use a derivative-free optimizer (e.g. golden section search) or an SGD-based optimizer
        :param lr_dual: learning rate for the dual's optimizer (ignored if `grad_free_optim = True`)
        :param base_seed: seed added to all other seeds in order to make the experiments distinct but repeatable
        """
        if not isinstance(policy, LinearPolicy):
            warn('REPS is designed for linear policies only!', UserWarning)

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir,
            env,
            policy,
            max_iter,
            num_rollouts,
            pop_size=pop_size,
            base_seed=base_seed,
            num_sampler_envs=num_sampler_envs,
        )

        # Store the inputs
        self.eps = eps
        self.gamma = gamma
        self.base_seed = base_seed
        self.use_map = use_map

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        self.kappa = to.tensor([0.], requires_grad=True)  # eta = exp(kappa)
        self._exp_min = -700.
        self._exp_max = 700.

        # Dual specific
        if grad_free_optim:
            self.optim_dual = GSS(
                [{'params': self.kappa}], param_min=to.log(to.tensor([1e-4])), param_max=to.log(to.tensor([1e4]))
            )
        else:
            self.optim_dual = to.optim.Adam([{'params': self.kappa}], lr=lr_dual, eps=1e-5)
            # self.optim_dual = to.optim.SGD([{'params': self.kappa}], lr=lr_dual, momentum=0.7, weight_decay=1e-4)
        self.num_epoch_dual = num_epoch_dual
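
The `kappa` tensor parameterizes the REPS temperature as eta = exp(kappa), which keeps eta positive during the dual minimization. The dual function itself is not shown in this snippet; the textbook episodic-REPS dual, written with the same clipping bounds and offered as a sketch rather than the class's actual method, is:

import torch as to

def reps_dual(kappa: to.Tensor, returns: to.Tensor, eps: float,
              exp_min: float = -700.0, exp_max: float = 700.0) -> to.Tensor:
    """Illustrative episodic REPS dual g(eta) = eta*eps + eta*log(mean(exp(R/eta))) with eta = exp(kappa)."""
    eta = to.exp(kappa)
    exponent = to.clamp(returns / eta, exp_min, exp_max)   # guard against overflow in exp()
    return eta * eps + eta * to.log(to.mean(to.exp(exponent)))

kappa = to.tensor([0.0], requires_grad=True)
loss = reps_dual(kappa, returns=to.randn(50), eps=0.1)
loss.backward()                                            # gradient w.r.t. kappa for the dual optimizer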
Example no. 8
    def __init__(
        self,
        save_dir: pyrado.PathLike,
        env: Env,
        policy: Policy,
        max_iter: int,
        eps: float,
        num_init_states_per_domain: int,
        pop_size: Optional[int],
        expl_std_init: float,
        expl_std_min: float = 0.01,
        num_domains: int = 1,
        symm_sampling: bool = False,
        softmax_transform: bool = False,
        use_map: bool = True,
        optim_mode: Optional[str] = "scipy",
        num_epoch_dual: int = 1000,
        lr_dual: float = 5e-4,
        num_workers: int = 4,
        logger: Optional[StepLogger] = None,
    ):
        r"""
        Constructor

        :param save_dir: directory to save the snapshots, i.e. the results, in
        :param env: the environment in which the policy operates
        :param policy: policy to be updated
        :param eps: bound on the KL divergence between policy updates, e.g. 0.1
        :param max_iter: maximum number of iterations (i.e. policy updates) that this algorithm runs
        :param pop_size: number of solutions in the population
        :param num_init_states_per_domain: number of rollouts to cover the variance over initial states
        :param num_domains: number of rollouts due to the variance over domain parameters
        :param expl_std_init: initial standard deviation for the exploration strategy
        :param expl_std_min: minimal standard deviation for the exploration strategy
        :param symm_sampling: use an exploration strategy which samples symmetric populations
        :param softmax_transform: pass `True` to use a softmax to transform the returns, else use a shifted exponential
        :param use_map: use maximum a-posteriori likelihood (`True`) or maximum likelihood (`False`) update rule
        :param optim_mode: choose the type of optimizer: 'torch' for an SGD-based optimizer or 'scipy' for the SLSQP
                           optimizer from scipy (recommended)
        :param num_epoch_dual: number of epochs for the minimization of the dual functions, ignored if
                               `optim_mode = 'scipy'`
        :param lr_dual: learning rate for the dual's optimizer, ignored if `optim_mode = 'scipy'`
        :param num_workers: number of environments for parallel sampling
        :param logger: logger for every step of the algorithm, if `None` the default logger will be created
        """
        if not isinstance(policy, (LinearPolicy, DomainDistrParamPolicy)):
            print_cbt_once("REPS was designed for linear policies.", "y")

        # Call ParameterExploring's constructor
        super().__init__(
            save_dir=save_dir,
            env=env,
            policy=policy,
            max_iter=max_iter,
            num_init_states_per_domain=num_init_states_per_domain,
            num_domains=num_domains,
            pop_size=pop_size,
            num_workers=num_workers,
            logger=logger,
        )

        # Store the inputs
        self.eps = eps
        self.softmax_transform = softmax_transform
        self.use_map = use_map

        # Explore using normal noise
        self._expl_strat = NormalParamNoise(
            self._policy.num_param,
            full_cov=True,
            std_init=expl_std_init,
            std_min=expl_std_min,
            use_cuda=self._policy.device != "cpu",
        )
        if symm_sampling:
            # Exploration strategy based on symmetrical normally distributed noise
            if self.pop_size % 2 != 0:
                # Symmetric buffer needs to have an even number of samples
                self.pop_size += 1
            self._expl_strat = SymmParamExplStrat(self._expl_strat)

        # Dual optimization
        self.num_epoch_dual = num_epoch_dual
        self._log_eta = to.tensor([0.0], requires_grad=True)
        self.optim_mode = optim_mode.lower()
        if self.optim_mode == "scipy":
            pass
        elif self.optim_mode == "torch":
            self.optim_dual = to.optim.SGD([{"params": self._log_eta}], lr=lr_dual, momentum=0.8, weight_decay=1e-4)
            # self.optim_dual = to.optim.Adam([{'params': self._log_eta}], lr=lr_dual, eps=1e-5)  # used in [2], but unstable here
        else:
            raise pyrado.ValueErr(given=optim_mode, eq_constraint=["scipy", "torch"])