Example No. 1
    def __init__(self,
                 mdp_info,
                 distribution,
                 policy,
                 eps,
                 kappa,
                 features=None):
        """
        Constructor.

        Args:
            eps ([float, Parameter]): the maximum admissible value for the Kullback-Leibler
                divergence between the new distribution and the previous one at
                each update step;
            kappa ([float, Parameter]): the maximum admissible value for the entropy decrease
                between the new distribution and the previous one at each
                update step.

        """
        self._eps = to_parameter(eps)
        self._kappa = to_parameter(kappa)

        self._add_save_attr(_eps='mushroom')
        self._add_save_attr(_kappa='mushroom')

        super().__init__(mdp_info, distribution, policy, features)
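
For intuition, the two bounds can be checked explicitly when the search distribution is a diagonal Gaussian. A minimal numpy sketch of such a check (hypothetical helper functions, not part of the class):

import numpy as np

def gaussian_kl(mu_new, var_new, mu_old, var_old):
    # KL(new || old) between diagonal Gaussians
    return 0.5 * np.sum(np.log(var_old / var_new)
                        + (var_new + (mu_new - mu_old) ** 2) / var_old - 1.0)

def gaussian_entropy(var):
    # Differential entropy of a diagonal Gaussian
    return 0.5 * np.sum(np.log(2.0 * np.pi * np.e * var))

eps, kappa = 0.05, 0.01
mu_old, var_old = np.zeros(2), np.ones(2)
mu_new, var_new = np.array([0.1, -0.05]), np.array([0.9, 0.8])

kl_ok = gaussian_kl(mu_new, var_new, mu_old, var_old) <= eps
entropy_ok = gaussian_entropy(var_old) - gaussian_entropy(var_new) <= kappa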
Example No. 2
    def __init__(self,
                 mdp_info,
                 policy,
                 actor_optimizer,
                 critic_params,
                 n_epochs_policy,
                 batch_size,
                 eps_ppo,
                 lam,
                 ent_coeff=0.0,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy (TorchPolicy): torch policy to be learned by the algorithm;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            n_epochs_policy ([int, Parameter]): number of policy updates for every dataset;
            batch_size ([int, Parameter]): size of minibatches for every optimization step;
            eps_ppo ([float, Parameter]): value for probability ratio clipping;
            lam ([float, Parameter]): lambda coefficient used by generalized
                advantage estimation;
            ent_coeff ([float, Parameter], 0.): coefficient for the entropy regularization term;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict(
            n_epochs=10) if critic_fit_params is None else critic_fit_params

        self._n_epochs_policy = to_parameter(n_epochs_policy)
        self._batch_size = to_parameter(batch_size)
        self._eps_ppo = to_parameter(eps_ppo)

        self._optimizer = actor_optimizer['class'](policy.parameters(),
                                                   **actor_optimizer['params'])

        self._lambda = to_parameter(lam)
        self._ent_coeff = to_parameter(ent_coeff)

        self._V = Regressor(TorchApproximator, **critic_params)

        self._iter = 1

        self._add_save_attr(_critic_fit_params='pickle',
                            _n_epochs_policy='mushroom',
                            _batch_size='mushroom',
                            _eps_ppo='mushroom',
                            _ent_coeff='mushroom',
                            _optimizer='torch',
                            _lambda='mushroom',
                            _V='mushroom',
                            _iter='primitive')

        super().__init__(mdp_info, policy, None)
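
eps_ppo clips the probability ratio in the PPO surrogate objective. A minimal PyTorch sketch of that loss (illustrative tensor names, not the internals of this class):

import torch

def ppo_clipped_loss(new_log_prob, old_log_prob, advantage, eps_ppo=0.2):
    ratio = torch.exp(new_log_prob - old_log_prob)
    clipped_ratio = torch.clamp(ratio, 1.0 - eps_ppo, 1.0 + eps_ppo)
    # The surrogate objective is maximized, so its negation is returned as a loss
    return -torch.min(ratio * advantage, clipped_ratio * advantage).mean()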
Example No. 3
    def __init__(self,
                 mdp_info,
                 policy_class,
                 policy_params,
                 actor_params,
                 actor_optimizer,
                 critic_params,
                 batch_size,
                 initial_replay_size,
                 max_replay_size,
                 tau,
                 policy_delay=2,
                 noise_std=.2,
                 noise_clip=.5,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy_class (Policy): class of the policy;
            policy_params (dict): parameters of the policy to build;
            actor_params (dict): parameters of the actor approximator to
                build;
            actor_optimizer (dict): parameters to specify the actor
                optimizer algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            batch_size ((int, Parameter)): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            tau ((float, Parameter)): value of coefficient for soft updates;
            policy_delay ((int, Parameter), 2): the number of updates of the critic after
                which an actor update is performed;
            noise_std ((float, Parameter), .2): standard deviation of the noise used for
                policy smoothing;
            noise_clip ((float, Parameter), .5): maximum absolute value for policy smoothing
                noise;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._noise_std = to_parameter(noise_std)
        self._noise_clip = to_parameter(noise_clip)

        if 'n_models' in critic_params.keys():
            assert (critic_params['n_models'] >= 2)
        else:
            critic_params['n_models'] = 2

        self._add_save_attr(_noise_std='mushroom', _noise_clip='mushroom')

        super().__init__(mdp_info, policy_class, policy_params, actor_params,
                         actor_optimizer, critic_params, batch_size,
                         initial_replay_size, max_replay_size, tau,
                         policy_delay, critic_fit_params)
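
noise_std and noise_clip implement target-policy smoothing, the regularization TD3 applies to the target action. A hedged numpy sketch of that step (array shapes and names are assumptions):

import numpy as np

def smoothed_target_action(target_action, low, high, noise_std=.2, noise_clip=.5):
    # Clipped Gaussian noise added to the target policy's action
    noise = np.clip(np.random.normal(scale=noise_std, size=target_action.shape),
                    -noise_clip, noise_clip)
    return np.clip(target_action + noise, low, high)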
Example No. 4
    def __init__(self,
                 mdp_info,
                 policy,
                 mu,
                 alpha_theta,
                 alpha_omega,
                 alpha_v,
                 value_function_features=None,
                 policy_features=None):
        """
        Constructor.

        Args:
            mu (Regressor): regressor that describes the deterministic policy to be
                learned, i.e. the deterministic mapping between state and action;
            alpha_theta ([float, Parameter]): learning rate for policy update;
            alpha_omega ([float, Parameter]): learning rate for the advantage function;
            alpha_v ([float, Parameter]): learning rate for the value function;
            value_function_features (Features, None): features used by the value
                function approximator;
            policy_features (Features, None): features used by the policy.

        """
        self._mu = mu
        self._psi = value_function_features

        self._alpha_theta = to_parameter(alpha_theta)
        self._alpha_omega = to_parameter(alpha_omega)
        self._alpha_v = to_parameter(alpha_v)

        if self._psi is not None:
            input_shape = (self._psi.size, )
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator,
                            input_shape=input_shape,
                            output_shape=(1, ))

        self._A = Regressor(LinearApproximator,
                            input_shape=(self._mu.weights_size, ),
                            output_shape=(1, ))

        self._add_save_attr(_mu='mushroom',
                            _psi='pickle',
                            _alpha_theta='mushroom',
                            _alpha_omega='mushroom',
                            _alpha_v='mushroom',
                            _V='mushroom',
                            _A='mushroom')

        super().__init__(mdp_info, policy, policy_features)
Example No. 5
    def __init__(self,
                 mdp_info,
                 policy,
                 alpha_theta,
                 alpha_v,
                 lambda_par=.9,
                 value_function_features=None,
                 policy_features=None):
        """
        Constructor.

        Args:
            alpha_theta ([float, Parameter]): learning rate for policy update;
            alpha_v ([float, Parameter]): learning rate for the value function;
            lambda_par ([float, Parameter], .9): trace decay parameter;
            value_function_features (Features, None): features used by the
                value function approximator;
            policy_features (Features, None): features used by the policy.

        """
        self._psi = value_function_features

        self._alpha_theta = to_parameter(alpha_theta)
        self._alpha_v = to_parameter(alpha_v)

        self._lambda = to_parameter(lambda_par)

        super().__init__(mdp_info, policy, policy_features)

        if self._psi is not None:
            input_shape = (self._psi.size, )
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator,
                            input_shape=input_shape,
                            output_shape=(1, ))

        self._e_v = np.zeros(self._V.weights_size)
        self._e_theta = np.zeros(self.policy.weights_size)

        self._add_save_attr(_psi='pickle',
                            _alpha_theta='mushroom',
                            _alpha_v='mushroom',
                            _lambda='mushroom',
                            _V='mushroom',
                            _e_v='numpy',
                            _e_theta='numpy')
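
lambda_par sets the decay of the eligibility traces _e_v and _e_theta between updates. A small illustrative sketch of an accumulating trace update for the value function (names are assumptions, not this class's update rule):

import numpy as np

def update_value_trace(e_v, grad_v, gamma=.99, lambda_par=.9):
    # Decay the old trace and accumulate the new value-function gradient
    return gamma * lambda_par * e_v + grad_v

e_v = np.zeros(4)
e_v = update_value_trace(e_v, grad_v=np.array([1., 0., .5, 0.]))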
Example No. 6
    def __init__(self, initial_size, max_size, alpha, beta, epsilon=.01):
        """
        Constructor.

        Args:
            initial_size (int): initial number of elements in the replay
                memory;
            max_size (int): maximum number of elements that the replay memory
                can contain;
            alpha (float): prioritization coefficient;
            beta ([float, Parameter]): importance sampling coefficient;
            epsilon (float, .01): small value to avoid zero probabilities.

        """
        self._initial_size = initial_size
        self._max_size = max_size
        self._alpha = alpha
        self._beta = to_parameter(beta)
        self._epsilon = epsilon

        self._tree = SumTree(max_size)

        self._add_save_attr(_initial_size='primitive',
                            _max_size='primitive',
                            _alpha='primitive',
                            _beta='primitive',
                            _epsilon='primitive',
                            _tree='pickle!')
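
For context, alpha, beta and epsilon interact as in standard prioritized experience replay: priorities grow with the absolute TD error, and importance-sampling weights correct the bias introduced by non-uniform sampling. An illustrative numpy sketch of those formulas (not this class's internals):

import numpy as np

def per_priorities(td_errors, alpha=.6, epsilon=.01):
    # Larger TD errors get sampled more often; epsilon avoids zero probability
    return (np.abs(td_errors) + epsilon) ** alpha

def importance_weights(priorities, beta=.4):
    probs = priorities / priorities.sum()
    weights = (len(priorities) * probs) ** (-beta)
    return weights / weights.max()   # normalized for stability

p = per_priorities(np.array([.5, .1, 2.]))
w = importance_weights(p, beta=.4)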
Example No. 7
    def __init__(self,
                 mdp_info,
                 policy,
                 approximator,
                 learning_rate,
                 lambda_coeff,
                 features,
                 approximator_params=None):
        """
        Constructor.

        Args:
            lambda_coeff ([float, Parameter]): eligibility trace coefficient.

        """
        approximator_params = dict() if approximator_params is None else \
            approximator_params

        Q = Regressor(approximator, **approximator_params)
        self.e = np.zeros(Q.weights_size)
        self._lambda = to_parameter(lambda_coeff)

        self._add_save_attr(_lambda='primitive', e='numpy')

        super().__init__(mdp_info, policy, Q, learning_rate, features)
Example No. 8
    def __init__(self, network, input_shape, output_shape, beta, use_cuda=False, **params):
        """
        Constructor.

        Args:
            network (object): the network class used to implement the mean
                regressor;
            input_shape (tuple): the shape of the state space;
            output_shape (tuple): the shape of the action space;
            beta ((float, Parameter)): the inverse of the temperature of the distribution.
                As the temperature approaches infinity, the policy becomes more
                and more random. As the temperature approaches 0.0, the policy
                becomes more and more greedy;
            params (dict): parameters used by the network constructor.

        """
        super().__init__(use_cuda)

        self._action_dim = output_shape[0]

        self._logits = Regressor(TorchApproximator, input_shape, output_shape,
                                 network=network, use_cuda=use_cuda, **params)
        self._beta = to_parameter(beta)

        self._add_save_attr(
            _action_dim='primitive',
            _beta='mushroom',
            _logits='mushroom'
        )
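
The effect of beta is easiest to see on a plain softmax over action values: a large beta concentrates the probability mass on the greedy action, a small beta spreads it out. A small numpy sketch (illustrative only):

import numpy as np

def boltzmann_probs(q_values, beta=1.):
    logits = beta * q_values
    logits -= logits.max()           # numerical stability
    exp_logits = np.exp(logits)
    return exp_logits / exp_logits.sum()

print(boltzmann_probs(np.array([1., 2., .5]), beta=10.))   # almost greedy
print(boltzmann_probs(np.array([1., 2., .5]), beta=.01))   # almost uniform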
Example No. 9
    def __init__(self,
                 mdp_info,
                 policy,
                 approximator,
                 n_iterations,
                 approximator_params=None,
                 fit_params=None,
                 quiet=False):
        """
        Constructor.

        Args:
            n_iterations ((int, Parameter)): number of iterations to perform for training;
            quiet (bool, False): whether to show the progress bar or not.

        """
        self._n_iterations = to_parameter(n_iterations)
        self._quiet = quiet
        self._target = None

        self._add_save_attr(_n_iterations='mushroom',
                            _quiet='primitive',
                            _target='pickle')

        super().__init__(mdp_info, policy, approximator, approximator_params,
                         fit_params)
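
Each of the n_iterations refits the approximator against bootstrapped targets in the usual fitted Q-iteration fashion. A hedged numpy sketch of the target computation (array names are assumptions):

import numpy as np

def fqi_targets(reward, next_q, absorbing, gamma=.99):
    # next_q: (n_samples, n_actions) predictions from the previously fitted model
    return reward + gamma * (1 - absorbing) * next_q.max(axis=1)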
Example No. 10
    def __init__(self,
                 mdp_info,
                 policy,
                 approximator_params=None,
                 epsilon=1e-2,
                 fit_params=None,
                 features=None):
        """
        Constructor.

        Args:
            epsilon ((float, Parameter), 1e-2): termination coefficient.

        """
        self._epsilon = to_parameter(epsilon)

        k = features.size * mdp_info.action_space.n
        self._A = np.zeros((k, k))
        self._b = np.zeros((k, 1))

        self._add_save_attr(_epsilon='mushroom',
                            _A='primitive',
                            _b='primitive')

        super().__init__(mdp_info, policy, LinearApproximator,
                         approximator_params, fit_params, features)
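
The matrices _A and _b accumulate LSTD-Q style statistics over the k = features.size * n_actions joint state-action features, and epsilon bounds the change of the solved weights between policy-iteration steps. A minimal numpy sketch of one solve (illustrative, not this class's fit):

import numpy as np

def lstdq_solve(phi, phi_next, reward, gamma=.99):
    # phi, phi_next: (n_samples, k) features of (s, a) and (s', pi(s'))
    A = phi.T @ (phi - gamma * phi_next)
    b = phi.T @ reward[:, None]
    # Small ridge term for numerical stability
    return np.linalg.solve(A + 1e-8 * np.eye(A.shape[0]), b)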
Example No. 11
    def set_beta(self, beta):
        """
        Setter.

        Args:
            beta ((float, Parameter)): the inverse of the temperature of the distribution.

        """
        self._beta = to_parameter(beta)
Example No. 12
    def set_epsilon(self, epsilon):
        """
        Setter.

        Args:
            epsilon ([float, Parameter]): the exploration coefficient. It indicates the
                probability of performing a random action in the current step.

        """
        self._epsilon = to_parameter(epsilon)
Example No. 13
    def __init__(self, mu_approximator, sigma_approximator, min_a, max_a,
                 log_std_min, log_std_max):
        """
        Constructor.

        Args:
            mu_approximator (Regressor): a regressor computing the mean given a
                state;
            sigma_approximator (Regressor): a regressor computing the variance
                given a state;
            min_a (np.ndarray): a vector specifying the minimum action value
                for each component;
            max_a (np.ndarray): a vector specifying the maximum action value
                for each component;
            log_std_min ([float, Parameter]): min value for the policy log std;
            log_std_max ([float, Parameter]): max value for the policy log std.

        """
        self._mu_approximator = mu_approximator
        self._sigma_approximator = sigma_approximator

        self._delta_a = to_float_tensor(.5 * (max_a - min_a), self.use_cuda)
        self._central_a = to_float_tensor(.5 * (max_a + min_a), self.use_cuda)

        self._log_std_min = to_parameter(log_std_min)
        self._log_std_max = to_parameter(log_std_max)

        self._eps_log_prob = 1e-6

        use_cuda = self._mu_approximator.model.use_cuda

        if use_cuda:
            self._delta_a = self._delta_a.cuda()
            self._central_a = self._central_a.cuda()

        self._add_save_attr(_mu_approximator='mushroom',
                            _sigma_approximator='mushroom',
                            _delta_a='torch',
                            _central_a='torch',
                            _log_std_min='mushroom',
                            _log_std_max='mushroom',
                            _eps_log_prob='primitive')
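
_delta_a, _central_a and _eps_log_prob correspond to the usual tanh squashing of a Gaussian sample into the action bounds, with a log-density correction. A hedged PyTorch sketch (the constant term from the affine rescaling is omitted; names are assumptions):

import torch

def squash(raw_action, log_prob, central_a, delta_a, eps_log_prob=1e-6):
    squashed = torch.tanh(raw_action)
    action = central_a + delta_a * squashed
    # Change-of-variables correction for the tanh transformation
    log_prob = log_prob - torch.log(1. - squashed ** 2 + eps_log_prob).sum(dim=-1)
    return action, log_prob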
Example No. 14
    def __init__(self,
                 mdp_info,
                 policy,
                 learning_rate,
                 off_policy=False,
                 beta=None,
                 delta=None):
        """
        Constructor.

        Args:
            off_policy (bool, False): whether to use the off-policy setting or
                the on-policy one;
            beta ([float, Parameter], None): beta coefficient;
            delta ([float, Parameter], None): delta coefficient.

        """
        self.off_policy = off_policy
        if delta is not None and beta is None:
            self.delta = to_parameter(delta)
            self.beta = None
        elif delta is None and beta is not None:
            self.delta = None
            self.beta = to_parameter(beta)
        else:
            raise ValueError('delta or beta parameters needed.')

        Q = Table(mdp_info.size)
        self.Q_tilde = Table(mdp_info.size)
        self.R_tilde = Table(mdp_info.size)

        self._add_save_attr(off_policy='primitive',
                            delta='mushroom',
                            beta='mushroom',
                            Q_tilde='mushroom',
                            R_tilde='mushroom')

        super().__init__(mdp_info, policy, Q, learning_rate)
Example No. 15
    def __init__(self, mdp_info, distribution, policy, beta, features=None):
        """
        Constructor.

        Args:
            beta ([float, Parameter]): the temperature for the exponential reward
                transformation.

        """
        self._beta = to_parameter(beta)

        self._add_save_attr(_beta='mushroom')

        super().__init__(mdp_info, distribution, policy, features)
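
beta scales the episodic returns before they are exponentiated into non-negative weights, as in reward-weighted update schemes. A tiny numpy sketch of that transformation (illustrative, not this class's fit):

import numpy as np

def exponential_weights(returns, beta=1.):
    # Shift by the maximum to avoid overflow; weights are defined up to a constant
    return np.exp(beta * (returns - returns.max()))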
Example No. 16
    def __init__(self, mdp_info, policy, learning_rate, beta):
        """
        Constructor.

        Args:
            beta ((float, Parameter)): beta coefficient.

        """
        Q = Table(mdp_info.size)
        self._rho = 0.
        self._beta = to_parameter(beta)

        self._add_save_attr(_rho='primitive', _beta='mushroom')

        super().__init__(mdp_info, policy, Q, learning_rate)
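
beta acts as the step size of the average-reward estimate _rho in classic R-learning. A hedged sketch of that style of update (textbook form, not necessarily this class's exact rule):

import numpy as np

def r_learning_update(Q, rho, s, a, r, s_next, alpha=.1, beta=.01):
    delta = r - rho + Q[s_next].max() - Q[s, a]
    Q[s, a] += alpha * delta
    if Q[s, a] == Q[s].max():
        # Adjust the average reward only when the greedy action was taken
        rho += beta * (r - rho + Q[s_next].max() - Q[s].max())
    return Q, rho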
Example No. 17
    def __init__(self, epsilon):
        """
        Constructor.

        Args:
            epsilon ([float, Parameter]): the exploration coefficient. It indicates
                the probability of performing a random action in the current
                step.

        """
        super().__init__()

        self._epsilon = to_parameter(epsilon)

        self._add_save_attr(_epsilon='mushroom')
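
epsilon governs the usual exploration/exploitation trade-off. A minimal sketch of epsilon-greedy action selection (illustrative, not this class's draw_action):

import numpy as np

def eps_greedy_action(q_values, epsilon=.1):
    if np.random.uniform() < epsilon:
        return np.random.randint(len(q_values))   # explore
    return int(np.argmax(q_values))               # exploit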
Example No. 18
    def __init__(self, beta):
        """
        Constructor.

        Args:
            beta ([float, Parameter]): the inverse of the temperature of the distribution.
                As the temperature approaches infinity, the policy becomes more
                and more random. As the temperature approaches 0.0, the policy
                becomes more and more greedy.

        """
        super().__init__()
        self._beta = to_parameter(beta)

        self._add_save_attr(_beta='mushroom')
Example No. 19
    def __init__(self, mdp_info, policy, alpha_theta, alpha_v, alpha_r,
                 lambda_par=.9, value_function_features=None,
                 policy_features=None):
        """
        Constructor.

        Args:
            alpha_r (Parameter): learning rate for the reward trace.

        """
        super().__init__(mdp_info, policy, alpha_theta, alpha_v, lambda_par,
                         value_function_features, policy_features)

        self._alpha_r = to_parameter(alpha_r)
        self._r_bar = 0

        self._add_save_attr(_alpha_r='mushroom', _r_bar='primitive')
Example No. 20
    def __init__(self,
                 mdp_info,
                 policy,
                 actor_optimizer,
                 critic_params,
                 ent_coeff,
                 max_grad_norm=None,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy (TorchPolicy): torch policy to be learned by the algorithm;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            ent_coeff ([float, Parameter]): coefficient for the entropy penalty;
            max_grad_norm (float, None): maximum norm for gradient clipping.
                If None, no clipping will be performed, unless specified
                otherwise in actor_optimizer;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict(
        ) if critic_fit_params is None else critic_fit_params

        self._entropy_coeff = to_parameter(ent_coeff)

        self._V = Regressor(TorchApproximator, **critic_params)

        if 'clipping' not in actor_optimizer and max_grad_norm is not None:
            actor_optimizer = deepcopy(actor_optimizer)
            clipping_params = dict(max_norm=max_grad_norm, norm_type=2)
            actor_optimizer['clipping'] = dict(
                method=torch.nn.utils.clip_grad_norm_, params=clipping_params)

        self._add_save_attr(_critic_fit_params='pickle',
                            _entropy_coeff='mushroom',
                            _V='mushroom')

        super().__init__(mdp_info, policy, actor_optimizer,
                         policy.parameters())
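
When max_grad_norm is given and no clipping is already configured, the actor optimizer is set up to clip gradient norms with torch.nn.utils.clip_grad_norm_. A self-contained sketch of what that call does on an arbitrary model (illustrative model, standard PyTorch API):

import torch

model = torch.nn.Linear(4, 2)
loss = model(torch.randn(8, 4)).pow(2).mean()
loss.backward()
# Rescale all gradients so their global L2 norm does not exceed max_norm
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=.5, norm_type=2)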
Example No. 21
    def __init__(self, mdp_info, policy, learning_rate, lambda_coeff,
                 trace='replacing'):
        """
        Constructor.

        Args:
            lambda_coeff ((float, Parameter)): eligibility trace coefficient;
            trace (str, 'replacing'): type of eligibility trace to use.

        """
        Q = Table(mdp_info.size)
        self._lambda = to_parameter(lambda_coeff)

        self.e = EligibilityTrace(Q.shape, trace)
        self._add_save_attr(
            _lambda='mushroom',
            e='mushroom'
        )

        super().__init__(mdp_info, policy, Q, learning_rate)
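
The trace argument selects between replacing and accumulating eligibility traces. A small numpy sketch of the difference (illustrative, not the EligibilityTrace class):

import numpy as np

def update_trace(e, s, a, gamma=.99, lambda_coeff=.9, trace='replacing'):
    e *= gamma * lambda_coeff        # decay every entry
    if trace == 'replacing':
        e[s, a] = 1.                 # reset the visited pair
    else:
        e[s, a] += 1.                # accumulate visits
    return e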
Example No. 22
    def __init__(self,
                 mdp_info,
                 policy,
                 approximator_params=None,
                 epsilon=1e-2,
                 fit_params=None,
                 features=None):
        """
        Constructor.

        Args:
            epsilon ([float, Parameter], 1e-2): termination coefficient.

        """
        self._epsilon = to_parameter(epsilon)

        self._add_save_attr(_epsilon='mushroom')

        super().__init__(mdp_info, policy, LinearApproximator,
                         approximator_params, fit_params, features)
Example No. 23
    def set_beta(self, beta):
        self._beta = to_parameter(beta)
Example No. 24
    def __init__(self,
                 mdp_info,
                 policy_class,
                 policy_params,
                 actor_params,
                 actor_optimizer,
                 critic_params,
                 batch_size,
                 initial_replay_size,
                 max_replay_size,
                 tau,
                 policy_delay=1,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy_class (Policy): class of the policy;
            policy_params (dict): parameters of the policy to build;
            actor_params (dict): parameters of the actor approximator to
                build;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            batch_size ((int, Parameter)): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            tau ((float, Parameter)): value of coefficient for soft updates;
            policy_delay ((int, Parameter), 1): the number of updates of the critic after
                which an actor update is performed;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict(
        ) if critic_fit_params is None else critic_fit_params

        self._batch_size = to_parameter(batch_size)
        self._tau = to_parameter(tau)
        self._policy_delay = to_parameter(policy_delay)
        self._fit_count = 0

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(TorchApproximator, **actor_params)
        self._target_actor_approximator = Regressor(TorchApproximator,
                                                    **target_actor_params)

        self._init_target(self._critic_approximator,
                          self._target_critic_approximator)
        self._init_target(self._actor_approximator,
                          self._target_actor_approximator)

        policy = policy_class(self._actor_approximator, **policy_params)

        policy_parameters = self._actor_approximator.model.network.parameters()

        self._add_save_attr(_critic_fit_params='pickle',
                            _batch_size='mushroom',
                            _tau='mushroom',
                            _policy_delay='mushroom',
                            _fit_count='primitive',
                            _replay_memory='mushroom',
                            _critic_approximator='mushroom',
                            _target_critic_approximator='mushroom',
                            _target_actor_approximator='mushroom')

        super().__init__(mdp_info, policy, actor_optimizer, policy_parameters)
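
tau controls the soft (Polyak) update that slowly moves the target networks towards the online ones. A minimal PyTorch sketch of such an update (illustrative helper, not this class's internals):

import torch

def soft_update(target_net, online_net, tau=.005):
    with torch.no_grad():
        for t_param, param in zip(target_net.parameters(), online_net.parameters()):
            t_param.mul_(1. - tau).add_(tau * param)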
Example No. 25
    def __init__(self,
                 mdp_info,
                 actor_mu_params,
                 actor_sigma_params,
                 actor_optimizer,
                 critic_params,
                 batch_size,
                 initial_replay_size,
                 max_replay_size,
                 warmup_transitions,
                 tau,
                 lr_alpha,
                 log_std_min=-20,
                 log_std_max=2,
                 target_entropy=None,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            actor_mu_params (dict): parameters of the actor mean approximator
                to build;
            actor_sigma_params (dict): parameters of the actor sigma
                approximator to build;
            actor_optimizer (dict): parameters to specify the actor
                optimizer algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            batch_size ((int, Parameter)): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            warmup_transitions ([int, Parameter]): number of samples to accumulate in the
                replay memory to start the policy fitting;
            tau ([float, Parameter]): value of coefficient for soft updates;
            lr_alpha ([float, Parameter]): learning rate for the entropy coefficient;
            log_std_min ([float, Parameter]): min value for the policy log std;
            log_std_max ([float, Parameter]): max value for the policy log std;
            target_entropy (float, None): target entropy for the policy; if
                None, a default value is computed;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict(
        ) if critic_fit_params is None else critic_fit_params

        self._batch_size = to_parameter(batch_size)
        self._warmup_transitions = to_parameter(warmup_transitions)
        self._tau = to_parameter(tau)

        if target_entropy is None:
            self._target_entropy = -np.prod(
                mdp_info.action_space.shape).astype(np.float32)
        else:
            self._target_entropy = target_entropy

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        if 'n_models' in critic_params.keys():
            assert critic_params['n_models'] == 2
        else:
            critic_params['n_models'] = 2

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        actor_mu_approximator = Regressor(TorchApproximator, **actor_mu_params)
        actor_sigma_approximator = Regressor(TorchApproximator,
                                             **actor_sigma_params)

        policy = SACPolicy(actor_mu_approximator, actor_sigma_approximator,
                           mdp_info.action_space.low,
                           mdp_info.action_space.high, log_std_min,
                           log_std_max)

        self._init_target(self._critic_approximator,
                          self._target_critic_approximator)

        self._log_alpha = torch.tensor(0., dtype=torch.float32)

        if policy.use_cuda:
            self._log_alpha = self._log_alpha.cuda().requires_grad_()
        else:
            self._log_alpha.requires_grad_()

        self._alpha_optim = optim.Adam([self._log_alpha], lr=lr_alpha)

        policy_parameters = chain(
            actor_mu_approximator.model.network.parameters(),
            actor_sigma_approximator.model.network.parameters())

        self._add_save_attr(_critic_fit_params='pickle',
                            _batch_size='mushroom',
                            _warmup_transitions='mushroom',
                            _tau='mushroom',
                            _target_entropy='primitive',
                            _replay_memory='mushroom',
                            _critic_approximator='mushroom',
                            _target_critic_approximator='mushroom',
                            _log_alpha='torch',
                            _alpha_optim='torch')

        super().__init__(mdp_info, policy, actor_optimizer, policy_parameters)
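
lr_alpha drives the Adam optimizer on _log_alpha, which is adapted so that the policy entropy tracks target_entropy, as in the standard SAC temperature loss. A hedged sketch of that update (illustrative names):

import torch
from torch import optim

log_alpha = torch.zeros(1, requires_grad=True)
alpha_optim = optim.Adam([log_alpha], lr=3e-4)

def update_alpha(log_prob, target_entropy):
    # Raise alpha when the policy is less entropic than the target, and vice versa
    alpha_loss = -(log_alpha * (log_prob + target_entropy).detach()).mean()
    alpha_optim.zero_grad()
    alpha_loss.backward()
    alpha_optim.step()
    return log_alpha.exp().item()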
Example No. 26
    def __init__(self, mdp_info, policy, approximator, approximator_params,
                 batch_size, target_update_frequency,
                 replay_memory=None, initial_replay_size=500,
                 max_replay_size=5000, fit_params=None, clip_reward=False):
        """
        Constructor.

        Args:
            approximator (object): the approximator to use to fit the
               Q-function;
            approximator_params (dict): parameters of the approximator to
                build;
            batch_size ((int, Parameter)): the number of samples in a batch;
            target_update_frequency (int): the number of samples collected
                between each update of the target network;
            replay_memory ([ReplayMemory, PrioritizedReplayMemory], None): the
                object of the replay memory to use; if None, a default replay
                memory is created;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            fit_params (dict, None): parameters of the fitting algorithm of the
                approximator;
            clip_reward (bool, False): whether to clip the reward or not.

        """
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = to_parameter(batch_size)
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency

        if replay_memory is not None:
            self._replay_memory = replay_memory
            if isinstance(replay_memory, PrioritizedReplayMemory):
                self._fit = self._fit_prioritized
            else:
                self._fit = self._fit_standard
        else:
            self._replay_memory = ReplayMemory(initial_replay_size,
                                               max_replay_size)
            self._fit = self._fit_standard

        self._n_updates = 0

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_target = deepcopy(approximator_params)

        self._initialize_regressors(approximator, apprx_params_train,
                                    apprx_params_target)
        policy.set_q(self.approximator)

        self._add_save_attr(
            _fit_params='pickle',
            _batch_size='mushroom',
            _n_approximators='primitive',
            _clip_reward='primitive',
            _target_update_frequency='primitive',
            _replay_memory='mushroom',
            _n_updates='primitive',
            approximator='mushroom',
            target_approximator='mushroom'
        )

        super().__init__(mdp_info, policy)
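
clip_reward and the target network enter the learning target in the usual DQN way. A minimal numpy sketch of the target computation (array names are assumptions, not this class's fit):

import numpy as np

def dqn_targets(reward, next_q_target, absorbing, gamma=.99, clip_reward=False):
    if clip_reward:
        reward = np.clip(reward, -1., 1.)
    # next_q_target: (n_samples, n_actions) predictions of the target network
    return reward + gamma * (1 - absorbing) * next_q_target.max(axis=1)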
Example No. 27
    def __init__(self,
                 mdp_info,
                 policy,
                 critic_params,
                 ent_coeff=0.,
                 max_kl=.001,
                 lam=1.,
                 n_epochs_line_search=10,
                 n_epochs_cg=10,
                 cg_damping=1e-2,
                 cg_residual_tol=1e-10,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy (TorchPolicy): torch policy to be learned by the algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            ent_coeff ([float, Parameter], 0): coefficient for the entropy penalty;
            max_kl ([float, Parameter], .001): maximum KL divergence allowed for every
                policy update;
            lam ([float, Parameter], 1.): lambda coefficient used by generalized
                advantage estimation;
            n_epochs_line_search ([int, Parameter], 10): maximum number of iterations
                of the line search algorithm;
            n_epochs_cg ([int, Parameter], 10): maximum number of iterations of the
                conjugate gradient algorithm;
            cg_damping ([float, Parameter], 1e-2): damping factor for the conjugate
                gradient algorithm;
            cg_residual_tol ([float, Parameter], 1e-10): conjugate gradient residual
                tolerance;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict(
            n_epochs=5) if critic_fit_params is None else critic_fit_params

        self._n_epochs_line_search = to_parameter(n_epochs_line_search)
        self._n_epochs_cg = to_parameter(n_epochs_cg)
        self._cg_damping = to_parameter(cg_damping)
        self._cg_residual_tol = to_parameter(cg_residual_tol)

        self._max_kl = to_parameter(max_kl)
        self._ent_coeff = to_parameter(ent_coeff)

        self._lambda = to_parameter(lam)

        self._V = Regressor(TorchApproximator, **critic_params)

        self._iter = 1

        self._old_policy = None

        self._add_save_attr(_critic_fit_params='pickle',
                            _n_epochs_line_search='mushroom',
                            _n_epochs_cg='mushroom',
                            _cg_damping='mushroom',
                            _cg_residual_tol='mushroom',
                            _max_kl='mushroom',
                            _ent_coeff='mushroom',
                            _lambda='mushroom',
                            _V='mushroom',
                            _old_policy='mushroom',
                            _iter='primitive')

        super().__init__(mdp_info, policy, None)
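
n_epochs_cg, cg_damping and cg_residual_tol parameterize a conjugate-gradient solve of F x = g against the damped Fisher matrix, which TRPO uses to obtain the natural-gradient step. A hedged numpy sketch of such a solver (illustrative, not this class's implementation):

import numpy as np

def conjugate_gradient(fisher_vector_product, g, n_epochs_cg=10,
                       cg_damping=1e-2, cg_residual_tol=1e-10):
    x = np.zeros_like(g)
    r = g.copy()                      # residual g - F x, with x = 0
    p = g.copy()                      # search direction
    r_dot = r @ r
    for _ in range(n_epochs_cg):
        Fp = fisher_vector_product(p) + cg_damping * p
        step = r_dot / (p @ Fp)
        x += step * p
        r -= step * Fp
        new_r_dot = r @ r
        if new_r_dot < cg_residual_tol:
            break
        p = r + (new_r_dot / r_dot) * p
        r_dot = new_r_dot
    return x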