Example #1
    def __init__(self, network, input_shape, output_shape, std_0=1.,
                 use_cuda=False, **params):
        """
        Constructor.

        Args:
            network (object): the network class used to implement the mean regressor;
            input_shape (tuple): the shape of the state space;
            output_shape (tuple): the shape of the action space;
            std_0 (float, 1.): initial standard deviation;
            use_cuda (bool, False): whether to use cuda or not;
            params (dict): parameters used by the network constructor.
        """
        super().__init__(use_cuda)

        self._action_dim = output_shape[0]

        self._mu = Regressor(TorchApproximator, input_shape, output_shape,
                             network=network, use_cuda=use_cuda, **params)

        log_sigma_init = (torch.ones(self._action_dim) * np.log(std_0)).float()

        if self._use_cuda:
            log_sigma_init = log_sigma_init.cuda()

        self._log_sigma = nn.Parameter(log_sigma_init)
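A minimal usage sketch of the same pattern (an illustration, not part of the source; MeanNetwork and the shapes are hypothetical, assuming mushroom_rl's Regressor and TorchApproximator):

import torch
import torch.nn as nn
from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import TorchApproximator

class MeanNetwork(nn.Module):
    # Hypothetical network; mushroom-rl forwards input_shape and output_shape
    # to the network constructor, matching the **params convention above.
    def __init__(self, input_shape, output_shape, **kwargs):
        super().__init__()
        self._h = nn.Linear(input_shape[0], output_shape[0])

    def forward(self, state, **kwargs):
        return self._h(state.float())

mu = Regressor(TorchApproximator, input_shape=(4,), output_shape=(2,),
               network=MeanNetwork)
print(mu.predict(torch.randn(1, 4).numpy()).shape)  # (1, 2)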
Example #2
def learn(alg, alg_params):
    mdp = LQR.generate(dimensions=1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape,
                      params=approximator_params)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return policy
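A hypothetical invocation of the helper above (assuming mushroom_rl's REINFORCE and AdaptiveOptimizer; the parameter values are illustrative, not from the source):

from mushroom_rl.algorithms.policy_search import REINFORCE
from mushroom_rl.utils.optimizers import AdaptiveOptimizer

# REINFORCE takes an `optimizer` argument, which `learn` forwards via
# **alg_params when constructing the agent.
policy = learn(REINFORCE, dict(optimizer=AdaptiveOptimizer(eps=0.01)))
print(policy.get_weights())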
Example #3
    def __init__(self, mdp_info, policy, learning_rate, lambda_coeff,
                 features, approximator_params=None):
        """
        Constructor.

        Args:
            lambda_coeff (float): eligibility trace coefficient.

        """
        self._approximator_params = dict() if approximator_params is None else \
            approximator_params

        self.Q = Regressor(LinearApproximator, **self._approximator_params)
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = lambda_coeff
        self._q_old = None

        self._add_save_attr(
            _approximator_params='pickle',
            Q='pickle',
            _q_old='pickle',
            _lambda='numpy',
            e='numpy'
        )

        super().__init__(mdp_info, policy, self.Q, learning_rate, features)
Example #4
    def __init__(self, network, input_shape, output_shape, beta, use_cuda=False, **params):
        """
        Constructor.

        Args:
            network (object): the network class used to implement the mean
                regressor;
            input_shape (tuple): the shape of the state space;
            output_shape (tuple): the shape of the action space;
            beta ([float, Parameter]): the inverse temperature of the softmax
                distribution. As the temperature approaches infinity, the
                policy becomes more and more random; as the temperature
                approaches 0, the policy becomes more and more greedy;
            use_cuda (bool, False): whether to use cuda or not;
            params (dict): parameters used by the network constructor.

        """
        super().__init__(use_cuda)

        self._action_dim = output_shape[0]

        self._logits = Regressor(TorchApproximator, input_shape, output_shape,
                                 network=network, use_cuda=use_cuda, **params)
        self._beta = to_parameter(beta)

        self._add_save_attr(
            _action_dim='primitive',
            _beta='mushroom',
            _logits='mushroom'
        )
Example #5
    def __init__(self, mdp_info, policy, actor_optimizer, critic_params,
                 ent_coeff, max_grad_norm=None, critic_fit_params=None):
        """
        Constructor.

        Args:
            policy (TorchPolicy): torch policy to be learned by the algorithm;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            ent_coeff (float, 0): coefficient for the entropy penalty;
            max_grad_norm (float, None): maximum norm for gradient clipping.
                If None, no clipping will be performed, unless specified
                otherwise in actor_optimizer;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._entropy_coeff = ent_coeff

        self._V = Regressor(TorchApproximator, **critic_params)

        if 'clipping' not in actor_optimizer and max_grad_norm is not None:
            actor_optimizer = deepcopy(actor_optimizer)
            clipping_params = dict(max_norm=max_grad_norm, norm_type=2)
            actor_optimizer['clipping'] = dict(
                method=torch.nn.utils.clip_grad_norm_, params=clipping_params)

        super().__init__(mdp_info, policy, actor_optimizer, policy.parameters())
Example #6
    def __init__(self,
                 mdp_info,
                 policy,
                 actor_optimizer,
                 critic_params,
                 n_epochs_policy,
                 batch_size,
                 eps_ppo,
                 lam,
                 quiet=True,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy (TorchPolicy): torch policy to be learned by the algorithm;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            n_epochs_policy (int): number of policy updates for every dataset;
            batch_size (int): size of minibatches for every optimization step;
            eps_ppo (float): value for probability ratio clipping;
            lam (float, 1.): lambda coefficient used by generalized
                advantage estimation;
            quiet (bool, True): if True, the algorithm will not print debug
                information;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict(
            n_epochs=10) if critic_fit_params is None else critic_fit_params

        self._n_epochs_policy = n_epochs_policy
        self._batch_size = batch_size
        self._eps_ppo = eps_ppo

        self._optimizer = actor_optimizer['class'](policy.parameters(),
                                                   **actor_optimizer['params'])

        self._lambda = lam

        self._V = Regressor(TorchApproximator, **critic_params)

        self._quiet = quiet
        self._iter = 1

        self._add_save_attr(_critic_fit_params='pickle',
                            _n_epochs_policy='primitive',
                            _batch_size='primitive',
                            _eps_ppo='primitive',
                            _optimizer='torch',
                            _lambda='primitive',
                            _V='mushroom',
                            _quiet='primitive',
                            _iter='primitive')

        super().__init__(mdp_info, policy, None)
Example #7
    def __init__(self,
                 mdp_info,
                 policy,
                 actor_optimizer,
                 critic_params,
                 n_epochs_policy,
                 batch_size,
                 eps_ppo,
                 lam,
                 ent_coeff=0.0,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy (TorchPolicy): torch policy to be learned by the algorithm;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            n_epochs_policy ([int, Parameter]): number of policy updates for every dataset;
            batch_size ([int, Parameter]): size of minibatches for every optimization step;
            eps_ppo ([float, Parameter]): value for probability ratio clipping;
            lam ([float, Parameter], 1.): lambda coefficient used by generalized
                advantage estimation;
            ent_coeff ([float, Parameter], 0.): coefficient for the entropy regularization term;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict(
            n_epochs=10) if critic_fit_params is None else critic_fit_params

        self._n_epochs_policy = to_parameter(n_epochs_policy)
        self._batch_size = to_parameter(batch_size)
        self._eps_ppo = to_parameter(eps_ppo)

        self._optimizer = actor_optimizer['class'](policy.parameters(),
                                                   **actor_optimizer['params'])

        self._lambda = to_parameter(lam)
        self._ent_coeff = to_parameter(ent_coeff)

        self._V = Regressor(TorchApproximator, **critic_params)

        self._iter = 1

        self._add_save_attr(_critic_fit_params='pickle',
                            _n_epochs_policy='mushroom',
                            _batch_size='mushroom',
                            _eps_ppo='mushroom',
                            _ent_coeff='mushroom',
                            _optimizer='torch',
                            _lambda='mushroom',
                            _V='mushroom',
                            _iter='primitive')

        super().__init__(mdp_info, policy, None)
Example #8
    def __init__(self,
                 mdp_info,
                 policy,
                 mu,
                 alpha_theta,
                 alpha_omega,
                 alpha_v,
                 value_function_features=None,
                 policy_features=None):
        """
        Constructor.

        Args:
            mu (Regressor): regressor that describes the deterministic policy
                to be learned, i.e. the deterministic mapping between state and
                action;
            alpha_theta ([float, Parameter]): learning rate for policy update;
            alpha_omega ([float, Parameter]): learning rate for the advantage function;
            alpha_v ([float, Parameter]): learning rate for the value function;
            value_function_features (Features, None): features used by the value
                function approximator;
            policy_features (Features, None): features used by the policy.

        """
        self._mu = mu
        self._psi = value_function_features

        self._alpha_theta = to_parameter(alpha_theta)
        self._alpha_omega = to_parameter(alpha_omega)
        self._alpha_v = to_parameter(alpha_v)

        if self._psi is not None:
            input_shape = (self._psi.size, )
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator,
                            input_shape=input_shape,
                            output_shape=(1, ))

        self._A = Regressor(LinearApproximator,
                            input_shape=(self._mu.weights_size, ),
                            output_shape=(1, ))

        self._add_save_attr(_mu='mushroom',
                            _psi='pickle',
                            _alpha_theta='mushroom',
                            _alpha_omega='mushroom',
                            _alpha_v='mushroom',
                            _V='mushroom',
                            _A='mushroom')

        super().__init__(mdp_info, policy, policy_features)
Example #9
File: trpo.py, Project: yanxg/mushroom-rl
    def __init__(self, mdp_info, policy, critic_params,
                 ent_coeff=0., max_kl=.001, lam=1.,
                 n_epochs_line_search=10, n_epochs_cg=10,
                 cg_damping=1e-2, cg_residual_tol=1e-10, quiet=True,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy (TorchPolicy): torch policy to be learned by the algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            ent_coeff (float, 0): coefficient for the entropy penalty;
            max_kl (float, .001): maximum KL divergence allowed for every
                policy update;
            lam (float, 1.): lambda coefficient used by generalized
                advantage estimation;
            n_epochs_line_search (int, 10): maximum number of iterations
                of the line search algorithm;
            n_epochs_cg (int, 10): maximum number of iterations of the
                conjugate gradient algorithm;
            cg_damping (float, 1e-2): damping factor for the conjugate
                gradient algorithm;
            cg_residual_tol (float, 1e-10): conjugate gradient residual
                tolerance;
            quiet (bool, True): if True, the algorithm will not print debug
                information;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict(n_epochs=3) if critic_fit_params is None else critic_fit_params

        self._n_epochs_line_search = n_epochs_line_search
        self._n_epochs_cg = n_epochs_cg
        self._cg_damping = cg_damping
        self._cg_residual_tol = cg_residual_tol

        self._max_kl = max_kl
        self._ent_coeff = ent_coeff

        self._lambda = lam

        self._V = Regressor(TorchApproximator, **critic_params)

        self._iter = 1
        self._quiet = quiet

        self._old_policy = None

        super().__init__(mdp_info, policy, None)
Example #10
    def __init__(self,
                 mdp_info,
                 policy,
                 alpha_theta,
                 alpha_v,
                 lambda_par=.9,
                 value_function_features=None,
                 policy_features=None):
        """
        Constructor.

        Args:
            alpha_theta (Parameter): learning rate for policy update;
            alpha_v (Parameter): learning rate for the value function;
            lambda_par (float, .9): trace decay parameter;
            value_function_features (Features, None): features used by the
                value function approximator;
            policy_features (Features, None): features used by the policy.

        """
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_v = alpha_v

        self._lambda = lambda_par

        super().__init__(mdp_info, policy, policy_features)

        if self._psi is not None:
            input_shape = (self._psi.size, )
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator,
                            input_shape=input_shape,
                            output_shape=(1, ))

        self._e_v = np.zeros(self._V.weights_size)
        self._e_theta = np.zeros(self.policy.weights_size)

        self._add_save_attr(_psi='pickle',
                            _alpha_theta='pickle',
                            _alpha_v='pickle',
                            _lambda='primitive',
                            _V='mushroom',
                            _e_v='numpy',
                            _e_theta='numpy')
Example #11
    def __init__(self,
                 mdp_info,
                 policy,
                 approximator,
                 approximator_params=None,
                 fit_params=None,
                 features=None):
        """
        Constructor.

        Args:
            approximator (object): approximator used by the algorithm and the
                policy;
            approximator_params (dict, None): parameters of the approximator to
                build;
            fit_params (dict, None): parameters of the fitting algorithm of the
                approximator.

        """
        approximator_params = dict() if approximator_params is None else\
            approximator_params
        self._fit_params = dict() if fit_params is None else fit_params

        self.approximator = Regressor(approximator, **approximator_params)
        policy.set_q(self.approximator)

        self._add_save_attr(approximator='mushroom', _fit_params='pickle')

        super().__init__(mdp_info, policy, features)
Example #12
    def __init__(self,
                 mdp_info,
                 policy,
                 approximator,
                 learning_rate,
                 lambda_coeff,
                 features,
                 approximator_params=None):
        """
        Constructor.

        Args:
            lambda_coeff ([float, Parameter]): eligibility trace coefficient.

        """
        approximator_params = dict() if approximator_params is None else \
            approximator_params

        Q = Regressor(approximator, **approximator_params)
        self.e = np.zeros(Q.weights_size)
        self._lambda = to_parameter(lambda_coeff)

        self._add_save_attr(_lambda='mushroom', e='numpy')

        super().__init__(mdp_info, policy, Q, learning_rate, features)
Example #13
def test_ornstein_uhlenbeck_policy():
    np.random.seed(88)

    mu = Regressor(LinearApproximator, input_shape=(5,), output_shape=(2,))
    pi = OrnsteinUhlenbeckPolicy(mu, sigma=np.ones(1) * .2, theta=.15, dt=1e-2)

    w = np.random.randn(pi.weights_size)
    pi.set_weights(w)
    assert np.array_equal(pi.get_weights(), w)

    state = np.random.randn(5)

    action = pi.draw_action(state)
    action_test = np.array([-1.95896171,  1.91292747])
    assert np.allclose(action, action_test)

    pi.reset()
    action = pi.draw_action(state)
    action_test = np.array([-1.94161061,  1.92233358])
    assert np.allclose(action, action_test)

    try:
        pi(state, action)
    except NotImplementedError:
        pass
    else:
        assert False
Example #14
def experiment(n_epochs, n_episodes):
    np.random.seed()

    logger = Logger(COPDAC_Q.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + COPDAC_Q.__name__)

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size, )

    mu = Regressor(LinearApproximator,
                   input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(mdp.info,
                     policy,
                     mu,
                     alpha_theta,
                     alpha_omega,
                     alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high, phi, phi)
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        logger.epoch_info(i + 1, R_mean=np.sum(J) / n_steps / n_episodes)

    logger.info('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
Example #15
    def __init__(self,
                 mdp_info,
                 policy_class,
                 policy_params,
                 actor_params,
                 actor_optimizer,
                 critic_params,
                 batch_size,
                 replay_memory,
                 tau,
                 optimization_steps,
                 comm,
                 policy_delay=1,
                 critic_fit_params=None):
        self._critic_fit_params = dict(
        ) if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._tau = tau
        self._optimization_steps = optimization_steps
        self._comm = comm
        self._policy_delay = policy_delay
        self._fit_count = 0

        if comm.Get_rank() == 0:
            self._replay_memory = replay_memory

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(TorchApproximator, **actor_params)
        self._target_actor_approximator = Regressor(TorchApproximator,
                                                    **target_actor_params)

        self._init_target(self._critic_approximator,
                          self._target_critic_approximator)
        self._init_target(self._actor_approximator,
                          self._target_actor_approximator)

        policy = policy_class(self._actor_approximator, **policy_params)

        policy_parameters = self._actor_approximator.model.network.parameters()

        self._add_save_attr(_critic_fit_params='pickle',
                            _batch_size='numpy',
                            _tau='numpy',
                            _policy_delay='numpy',
                            _fit_count='numpy',
                            _replay_memory='pickle',
                            _critic_approximator='pickle',
                            _target_critic_approximator='pickle',
                            _actor_approximator='pickle',
                            _target_actor_approximator='pickle')

        super().__init__(mdp_info, policy, actor_optimizer, policy_parameters)
Example #16
class SARSALambdaContinuous(TD):
    """
    Continuous version of SARSA(lambda) algorithm.

    """
    def __init__(self,
                 mdp_info,
                 policy,
                 approximator,
                 learning_rate,
                 lambda_coeff,
                 features,
                 approximator_params=None):
        """
        Constructor.

        Args:
            lambda_coeff (float): eligibility trace coefficient.

        """
        self._approximator_params = dict() if approximator_params is None else \
            approximator_params

        self.Q = Regressor(approximator, **self._approximator_params)
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = lambda_coeff

        self._add_save_attr(_approximator_params='pickle',
                            Q='pickle',
                            _lambda='numpy',
                            e='numpy')

        super().__init__(mdp_info, policy, self.Q, learning_rate, features)

    def _update(self, state, action, reward, next_state, absorbing):
        phi_state = self.phi(state)
        q_current = self.Q.predict(phi_state, action)

        alpha = self.alpha(state, action)

        self.e = self.mdp_info.gamma * self._lambda * self.e + self.Q.diff(
            phi_state, action)

        self.next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self.next_action) if not absorbing else 0.

        delta = reward + self.mdp_info.gamma * q_next - q_current

        theta = self.Q.get_weights()
        theta += alpha * delta * self.e
        self.Q.set_weights(theta)

    def episode_start(self):
        self.e = np.zeros(self.Q.weights_size)

        super().episode_start()
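The `_update` above is the standard accumulating-trace recursion, e <- gamma * lambda * e + dQ/dtheta followed by theta <- theta + alpha * delta * e. A self-contained numpy sketch of one such step for a linear Q, with made-up numbers:

import numpy as np

gamma, lam, alpha = 0.99, 0.9, 0.1
theta = np.zeros(3)  # weights of a linear Q: Q(s, a) = phi(s, a).dot(theta)
e = np.zeros(3)      # eligibility trace

phi_sa = np.array([1., 0., 2.])    # features of the current state-action
phi_next = np.array([0., 1., 1.])  # features of the next state-action
reward = 1.0

delta = reward + gamma * phi_next.dot(theta) - phi_sa.dot(theta)  # TD error
e = gamma * lam * e + phi_sa  # for a linear Q, Q.diff() is just the features
theta += alpha * delta * e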
Example #17
    def __init__(self,
                 mdp_info,
                 policy,
                 approximator,
                 learning_rate,
                 lambda_coeff,
                 features,
                 approximator_params=None):
        """
        Constructor.

        Args:
            lambda_coeff (float): eligibility trace coefficient.

        """
        self._approximator_params = dict() if approximator_params is None else \
            approximator_params

        self.Q = Regressor(approximator, **self._approximator_params)
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = lambda_coeff

        super().__init__(mdp_info, policy, self.Q, learning_rate, features)
Example #18
def test_copdac_q():
    n_steps = 50
    mdp = InvertedPendulum(horizon=n_steps)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Agent
    n_tilings = 1
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [2, 2], mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size, )

    mu = Regressor(LinearApproximator,
                   input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(mdp.info,
                     policy,
                     mu,
                     alpha_theta,
                     alpha_omega,
                     alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    core = Core(agent, mdp)

    core.learn(n_episodes=2, n_episodes_per_fit=1)

    w = agent.policy.get_weights()
    w_test = np.array([0, -6.62180045e-7, 0, -4.23972882e-2])

    assert np.allclose(w, w_test)
Example #19
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(mdp.info, dist, policy, **params)

    # Train
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit,
                   render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        logger.epoch_info(i + 1,
                          J=np.mean(J),
                          mu=p[:n_weights],
                          sigma=p[n_weights:])

    logger.info('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
Example #20
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(mdp.info, dist, policy, **params)

    # Train
    print(alg.__name__)
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit,
                   render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        print('mu:    ', p[:n_weights])
        print('sigma: ', p[n_weights:])
        print('Reward at iteration ' + str(i) + ': ' + str(np.mean(J)))

    print('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
Example #21
def learn(alg, **alg_params):
    np.random.seed(1)
    torch.manual_seed(1)

    # MDP
    mdp = LQR.generate(dimensions=2)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(mdp.info, distribution, policy, **alg_params)
    core = Core(agent, mdp)

    core.learn(n_episodes=5, n_episodes_per_fit=5)

    return agent
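As with the earlier helper, a hypothetical call (assuming mushroom_rl's episodic REPS, whose constructor takes an `eps` bound on the KL divergence between successive distributions; the value is illustrative, not from the source):

from mushroom_rl.algorithms.policy_search import REPS

agent = learn(REPS, eps=0.5)
print(agent.distribution.get_parameters())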
Example #22
class PPO(Agent):
    """
    Proximal Policy Optimization algorithm.
    "Proximal Policy Optimization Algorithms".
    Schulman J. et al.. 2017.

    """
    def __init__(self, mdp_info, policy, actor_optimizer, critic_params,
                 n_epochs_policy, batch_size, eps_ppo, lam, quiet=True,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy (TorchPolicy): torch policy to be learned by the algorithm;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            n_epochs_policy (int): number of policy updates for every dataset;
            batch_size (int): size of minibatches for every optimization step;
            eps_ppo (float): value for probability ratio clipping;
            lam (float, 1.): lambda coefficient used by generalized
                advantage estimation;
            quiet (bool, True): if True, the algorithm will not print debug
                information;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict(n_epochs=10) if critic_fit_params is None else critic_fit_params

        self._n_epochs_policy = n_epochs_policy
        self._batch_size = batch_size
        self._eps_ppo = eps_ppo

        self._optimizer = actor_optimizer['class'](policy.parameters(), **actor_optimizer['params'])

        self._lambda = lam

        self._V = Regressor(TorchApproximator, **critic_params)

        self._quiet = quiet
        self._iter = 1

        self._add_save_attr(
            _critic_fit_params='pickle', 
            _n_epochs_policy='primitive',
            _batch_size='primitive',
            _eps_ppo='primitive',
            _optimizer='torch',
            _lambda='primitive',
            _V='mushroom',
            _quiet='primitive',
            _iter='primitive'
        )

        super().__init__(mdp_info, policy, None)

    def fit(self, dataset):
        if not self._quiet:
            tqdm.write('Iteration ' + str(self._iter))

        x, u, r, xn, absorbing, last = parse_dataset(dataset)
        x = x.astype(np.float32)
        u = u.astype(np.float32)
        r = r.astype(np.float32)
        xn = xn.astype(np.float32)

        obs = to_float_tensor(x, self.policy.use_cuda)
        act = to_float_tensor(u, self.policy.use_cuda)
        v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last, self.mdp_info.gamma, self._lambda)
        np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
        adv = to_float_tensor(np_adv, self.policy.use_cuda)

        old_pol_dist = self.policy.distribution_t(obs)
        old_log_p = old_pol_dist.log_prob(act)[:, None].detach()

        self._V.fit(x, v_target, **self._critic_fit_params)

        self._update_policy(obs, act, adv, old_log_p)

        # Print fit information
        self._print_fit_info(dataset, x, v_target, old_pol_dist)
        self._iter += 1

    def _update_policy(self, obs, act, adv, old_log_p):
        for epoch in range(self._n_epochs_policy):
            for obs_i, act_i, adv_i, old_log_p_i in minibatch_generator(
                    self._batch_size, obs, act, adv, old_log_p):
                self._optimizer.zero_grad()
                prob_ratio = torch.exp(
                    self.policy.log_prob_t(obs_i, act_i) - old_log_p_i
                )
                clipped_ratio = torch.clamp(prob_ratio, 1 - self._eps_ppo,
                                            1 + self._eps_ppo)
                loss = -torch.mean(torch.min(prob_ratio * adv_i,
                                             clipped_ratio * adv_i))
                loss.backward()
                self._optimizer.step()

    def _print_fit_info(self, dataset, x, v_target, old_pol_dist):
        if not self._quiet:
            logging_verr = []
            torch_v_targets = torch.tensor(v_target, dtype=torch.float)
            for idx in range(len(self._V)):
                v_pred = torch.tensor(self._V(x, idx=idx), dtype=torch.float)
                v_err = F.mse_loss(v_pred, torch_v_targets)
                logging_verr.append(v_err.item())

            logging_ent = self.policy.entropy(x)
            new_pol_dist = self.policy.distribution(x)
            logging_kl = torch.mean(torch.distributions.kl.kl_divergence(
                new_pol_dist, old_pol_dist))
            avg_rwd = np.mean(compute_J(dataset))
            tqdm.write("Iterations Results:\n\trewards {} vf_loss {}\n\tentropy {}  kl {}".format(
                avg_rwd, logging_verr, logging_ent, logging_kl))
            tqdm.write(
                '--------------------------------------------------------------------------------------------------')

    def _post_load(self):
        if self._optimizer is not None:
            update_optimizer_parameters(self._optimizer, list(self.policy.parameters()))
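To isolate the clipped surrogate used in `_update_policy`, here is a tiny standalone sketch with made-up tensors (an illustration, not from the source):

import torch

eps = 0.2
adv = torch.tensor([1.0, -0.5])                             # advantages
log_p_old = torch.tensor([-1.0, -1.0])                      # old log-probs
log_p_new = torch.tensor([-0.9, -1.2], requires_grad=True)  # new log-probs

ratio = torch.exp(log_p_new - log_p_old)
clipped = torch.clamp(ratio, 1 - eps, 1 + eps)
loss = -torch.mean(torch.min(ratio * adv, clipped * adv))
loss.backward()  # gradients flow only where the min is not clipped flat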
Example #23
File: ddpg.py, Project: yanxg/mushroom-rl
    def __init__(self,
                 mdp_info,
                 policy_class,
                 policy_params,
                 actor_params,
                 actor_optimizer,
                 critic_params,
                 batch_size,
                 initial_replay_size,
                 max_replay_size,
                 tau,
                 policy_delay=1,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy_class (Policy): class of the policy;
            policy_params (dict): parameters of the policy to build;
            actor_params (dict): parameters of the actor approximator to
                build;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            tau (float): value of coefficient for soft updates;
            policy_delay (int, 1): the number of updates of the critic after
                which an actor update is implemented;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator;

        """
        self._critic_fit_params = dict(
        ) if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._tau = tau
        self._policy_delay = policy_delay
        self._fit_count = 0

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(TorchApproximator, **actor_params)
        self._target_actor_approximator = Regressor(TorchApproximator,
                                                    **target_actor_params)

        self._init_target(self._critic_approximator,
                          self._target_critic_approximator)
        self._init_target(self._actor_approximator,
                          self._target_actor_approximator)

        policy = policy_class(self._actor_approximator, **policy_params)

        policy_parameters = self._actor_approximator.model.network.parameters()

        super().__init__(mdp_info, policy, actor_optimizer, policy_parameters)
Example #24
File: ddpg.py, Project: yanxg/mushroom-rl
class DDPG(DeepAC):
    """
    Deep Deterministic Policy Gradient algorithm.
    "Continuous Control with Deep Reinforcement Learning".
    Lillicrap T. P. et al.. 2016.

    """
    def __init__(self,
                 mdp_info,
                 policy_class,
                 policy_params,
                 actor_params,
                 actor_optimizer,
                 critic_params,
                 batch_size,
                 initial_replay_size,
                 max_replay_size,
                 tau,
                 policy_delay=1,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy_class (Policy): class of the policy;
            policy_params (dict): parameters of the policy to build;
            actor_params (dict): parameters of the actor approximator to
                build;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            tau (float): value of coefficient for soft updates;
            policy_delay (int, 1): the number of updates of the critic after
                which an actor update is implemented;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator;

        """
        self._critic_fit_params = dict(
        ) if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._tau = tau
        self._policy_delay = policy_delay
        self._fit_count = 0

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(TorchApproximator, **actor_params)
        self._target_actor_approximator = Regressor(TorchApproximator,
                                                    **target_actor_params)

        self._init_target(self._critic_approximator,
                          self._target_critic_approximator)
        self._init_target(self._actor_approximator,
                          self._target_actor_approximator)

        policy = policy_class(self._actor_approximator, **policy_params)

        policy_parameters = self._actor_approximator.model.network.parameters()

        super().__init__(mdp_info, policy, actor_optimizer, policy_parameters)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ =\
                self._replay_memory.get(self._batch_size)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self._critic_approximator.fit(state, action, q,
                                          **self._critic_fit_params)

            if self._fit_count % self._policy_delay == 0:
                loss = self._loss(state)
                self._optimize_actor_parameters(loss)

            self._update_target(self._critic_approximator,
                                self._target_critic_approximator)
            self._update_target(self._actor_approximator,
                                self._target_actor_approximator)

            self._fit_count += 1

    def _loss(self, state):
        action = self._actor_approximator(state, output_tensor=True)
        q = self._critic_approximator(state, action, output_tensor=True)

        return -q.mean()

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Action-values returned by the critic for ``next_state`` and the
            action returned by the actor.

        """
        a = self._target_actor_approximator(next_state)

        q = self._target_critic_approximator.predict(next_state, a)
        q *= 1 - absorbing

        return q
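`_init_target` and `_update_target` are inherited from DeepAC and not shown in this excerpt. Based on the `tau` docstring above, they perform the conventional Polyak soft update; a sketch of that rule using the Regressor weight interface (an assumption, not the library's actual code):

def soft_update(target, source, tau):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    new_weights = tau * source.get_weights() + (1 - tau) * target.get_weights()
    target.set_weights(new_weights)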
Example #25
class A2C(DeepAC):
    """
    Advantage Actor Critic algorithm (A2C).
    Synchronous version of the A3C algorithm.
    "Asynchronous Methods for Deep Reinforcement Learning".
    Mnih V. et. al.. 2016.

    """
    def __init__(self,
                 mdp_info,
                 policy,
                 actor_optimizer,
                 critic_params,
                 ent_coeff,
                 max_grad_norm=None,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy (TorchPolicy): torch policy to be learned by the algorithm;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            ent_coeff ([float, Parameter], 0): coefficient for the entropy penalty;
            max_grad_norm (float, None): maximum norm for gradient clipping.
                If None, no clipping will be performed, unless specified
                otherwise in actor_optimizer;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict(
        ) if critic_fit_params is None else critic_fit_params

        self._entropy_coeff = to_parameter(ent_coeff)

        self._V = Regressor(TorchApproximator, **critic_params)

        if 'clipping' not in actor_optimizer and max_grad_norm is not None:
            actor_optimizer = deepcopy(actor_optimizer)
            clipping_params = dict(max_norm=max_grad_norm, norm_type=2)
            actor_optimizer['clipping'] = dict(
                method=torch.nn.utils.clip_grad_norm_, params=clipping_params)

        self._add_save_attr(_critic_fit_params='pickle',
                            _entropy_coeff='mushroom',
                            _V='mushroom')

        super().__init__(mdp_info, policy, actor_optimizer,
                         policy.parameters())

    def fit(self, dataset):
        state, action, reward, next_state, absorbing, _ = parse_dataset(
            dataset)

        v, adv = compute_advantage_montecarlo(self._V, state, next_state,
                                              reward, absorbing,
                                              self.mdp_info.gamma)
        self._V.fit(state, v, **self._critic_fit_params)

        loss = self._loss(state, action, adv)
        self._optimize_actor_parameters(loss)

    def _loss(self, state, action, adv):
        use_cuda = self.policy.use_cuda

        s = to_float_tensor(state, use_cuda)
        a = to_float_tensor(action, use_cuda)

        adv_t = to_float_tensor(adv, use_cuda)

        gradient_loss = -torch.mean(self.policy.log_prob_t(s, a) * adv_t)
        entropy_loss = -self.policy.entropy_t(s)

        return gradient_loss + self._entropy_coeff() * entropy_loss

    def _post_load(self):
        self._update_optimizer_parameters(self.policy.parameters())
Example #26
    def __init__(self, mdp_info, actor_mu_params, actor_sigma_params,
                 actor_optimizer, critic_params, batch_size,
                 initial_replay_size, max_replay_size, warmup_transitions, tau,
                 lr_alpha, target_entropy=None, critic_fit_params=None):
        """
        Constructor.

        Args:
            actor_mu_params (dict): parameters of the actor mean approximator
                to build;
            actor_sigma_params (dict): parameters of the actor sigma
                approximator to build;
            actor_optimizer (dict): parameters to specify the actor
                optimizer algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            warmup_transitions (int): number of samples to accumulate in the
                replay memory to start the policy fitting;
            tau (float): value of coefficient for soft updates;
            lr_alpha (float): learning rate for the entropy coefficient;
            target_entropy (float, None): target entropy for the policy; if
                None, a default value is computed;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._warmup_transitions = warmup_transitions
        self._tau = tau

        if target_entropy is None:
            self._target_entropy = -np.prod(mdp_info.action_space.shape).astype(np.float32)
        else:
            self._target_entropy = target_entropy

        self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size)

        if 'n_models' in critic_params.keys():
            assert critic_params['n_models'] == 2
        else:
            critic_params['n_models'] = 2

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        actor_mu_approximator = Regressor(TorchApproximator,
                                          **actor_mu_params)
        actor_sigma_approximator = Regressor(TorchApproximator,
                                             **actor_sigma_params)

        policy = SACPolicy(actor_mu_approximator,
                           actor_sigma_approximator,
                           mdp_info.action_space.low,
                           mdp_info.action_space.high)

        self._init_target(self._critic_approximator,
                          self._target_critic_approximator)

        self._log_alpha = torch.tensor(0., dtype=torch.float32)

        if policy.use_cuda:
            self._log_alpha = self._log_alpha.cuda().requires_grad_()
        else:
            self._log_alpha.requires_grad_()

        self._alpha_optim = optim.Adam([self._log_alpha], lr=lr_alpha)

        policy_parameters = chain(actor_mu_approximator.model.network.parameters(),
                                  actor_sigma_approximator.model.network.parameters())

        self._add_save_attr(
            _critic_fit_params='pickle',
            _batch_size='numpy',
            _warmup_transitions='numpy',
            _tau='numpy',
            _target_entropy='numpy',
            _replay_memory='pickle',
            _critic_approximator='pickle',
            _target_critic_approximator='pickle',
            _log_alpha='pickle',
            _alpha_optim='pickle'
        )

        super().__init__(mdp_info, policy, actor_optimizer, policy_parameters)
Example #27
class SAC(DeepAC):
    """
    Soft Actor-Critic algorithm.
    "Soft Actor-Critic Algorithms and Applications".
    Haarnoja T. et al.. 2019.

    """
    def __init__(self, mdp_info, actor_mu_params, actor_sigma_params,
                 actor_optimizer, critic_params, batch_size,
                 initial_replay_size, max_replay_size, warmup_transitions, tau,
                 lr_alpha, target_entropy=None, critic_fit_params=None):
        """
        Constructor.

        Args:
            actor_mu_params (dict): parameters of the actor mean approximator
                to build;
            actor_sigma_params (dict): parameters of the actor sigma
                approximator to build;
            actor_optimizer (dict): parameters to specify the actor
                optimizer algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            warmup_transitions (int): number of samples to accumulate in the
                replay memory to start the policy fitting;
            tau (float): value of coefficient for soft updates;
            lr_alpha (float): learning rate for the entropy coefficient;
            target_entropy (float, None): target entropy for the policy; if
                None, a default value is computed;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._warmup_transitions = warmup_transitions
        self._tau = tau

        if target_entropy is None:
            self._target_entropy = -np.prod(mdp_info.action_space.shape).astype(np.float32)
        else:
            self._target_entropy = target_entropy

        self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size)

        if 'n_models' in critic_params.keys():
            assert critic_params['n_models'] == 2
        else:
            critic_params['n_models'] = 2

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        actor_mu_approximator = Regressor(TorchApproximator,
                                          **actor_mu_params)
        actor_sigma_approximator = Regressor(TorchApproximator,
                                             **actor_sigma_params)

        policy = SACPolicy(actor_mu_approximator,
                           actor_sigma_approximator,
                           mdp_info.action_space.low,
                           mdp_info.action_space.high)

        self._init_target(self._critic_approximator,
                          self._target_critic_approximator)

        self._log_alpha = torch.tensor(0., dtype=torch.float32)

        if policy.use_cuda:
            self._log_alpha = self._log_alpha.cuda().requires_grad_()
        else:
            self._log_alpha.requires_grad_()

        self._alpha_optim = optim.Adam([self._log_alpha], lr=lr_alpha)

        policy_parameters = chain(actor_mu_approximator.model.network.parameters(),
                                  actor_sigma_approximator.model.network.parameters())

        self._add_save_attr(
            _critic_fit_params='pickle',
            _batch_size='numpy',
            _warmup_transitions='numpy',
            _tau='numpy',
            _target_entropy='numpy',
            _replay_memory='pickle',
            _critic_approximator='pickle',
            _target_critic_approximator='pickle',
            _log_alpha='pickle',
            _alpha_optim='pickle'
        )

        super().__init__(mdp_info, policy, actor_optimizer, policy_parameters)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ = \
                self._replay_memory.get(self._batch_size)

            if self._replay_memory.size > self._warmup_transitions:
                action_new, log_prob = self.policy.compute_action_and_log_prob_t(state)
                loss = self._loss(state, action_new, log_prob)
                self._optimize_actor_parameters(loss)
                self._update_alpha(log_prob.detach())

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self._critic_approximator.fit(state, action, q,
                                          **self._critic_fit_params)

            self._update_target(self._critic_approximator,
                                self._target_critic_approximator)

    def _loss(self, state, action_new, log_prob):
        q_0 = self._critic_approximator(state, action_new,
                                        output_tensor=True, idx=0)
        q_1 = self._critic_approximator(state, action_new,
                                        output_tensor=True, idx=1)

        q = torch.min(q_0, q_1)

        return (self._alpha * log_prob - q).mean()

    def _update_alpha(self, log_prob):
        alpha_loss = - (self._log_alpha * (log_prob + self._target_entropy)).mean()
        self._alpha_optim.zero_grad()
        alpha_loss.backward()
        self._alpha_optim.step()

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Action-values returned by the critic for ``next_state`` and the
            action returned by the actor.

        """
        a, log_prob_next = self.policy.compute_action_and_log_prob(next_state)

        q = self._target_critic_approximator.predict(
            next_state, a, prediction='min') - self._alpha_np * log_prob_next
        q *= 1 - absorbing

        return q

    def _post_load(self):
        if self._optimizer is not None:
            self._parameters = list(
                chain(self.policy._mu_approximator.model.network.parameters(),
                      self.policy._sigma_approximator.model.network.parameters()
                )
            )

    @property
    def _alpha(self):
        return self._log_alpha.exp()

    @property
    def _alpha_np(self):
        return self._alpha.detach().cpu().numpy()
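
For reference, _update_alpha above takes one gradient step on the temperature
objective J(alpha) = E[-log(alpha) * (log pi(a|s) + H_target)], so alpha grows
when the policy entropy falls below the target and shrinks otherwise. Below is
a minimal standalone sketch of that step; the log-probabilities, learning rate,
and target entropy are hypothetical placeholders, not values from the original
code.

import torch
from torch import optim

# Hypothetical placeholders: detached actor log-probs, target entropy of -1
log_prob = torch.tensor([-.5, -1.2, -.8])
target_entropy = -1.

log_alpha = torch.tensor(0., requires_grad=True)
alpha_optim = optim.Adam([log_alpha], lr=3e-4)

# Same loss as in _update_alpha above
alpha_loss = -(log_alpha * (log_prob + target_entropy)).mean()
alpha_optim.zero_grad()
alpha_loss.backward()
alpha_optim.step()

print(log_alpha.exp())  # the temperature alpha used in the actor loss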
Example #28
class COPDAC_Q(Agent):
    """
    Compatible off-policy deterministic actor-critic algorithm.
    "Deterministic Policy Gradient Algorithms".
    Silver D. et al., 2014.

    """

    def __init__(self, mdp_info, policy, mu, alpha_theta, alpha_omega, alpha_v,
                 value_function_features=None, policy_features=None):
        """
        Constructor.

        Args:
            mu (Regressor): regressor that describes the deterministic policy
                to be learned, i.e. the deterministic mapping between state
                and action;
            alpha_theta (Parameter): learning rate for policy update;
            alpha_omega (Parameter): learning rate for the advantage function;
            alpha_v (Parameter): learning rate for the value function;
            value_function_features (Features, None): features used by the value
                function approximator;
            policy_features (Features, None): features used by the policy.

        """
        self._mu = mu
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_omega = alpha_omega
        self._alpha_v = alpha_v

        if self._psi is not None:
            input_shape = (self._psi.size,)
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator, input_shape=input_shape,
                            output_shape=(1,))

        self._A = Regressor(LinearApproximator,
                            input_shape=(self._mu.weights_size,),
                            output_shape=(1,))

        self._add_save_attr(
            _mu='mushroom',
            _psi='pickle',
            _alpha_theta='pickle',
            _alpha_omega='pickle',
            _alpha_v='pickle',
            _V='mushroom',
            _A='mushroom'
        )

        super().__init__(mdp_info, policy, policy_features)

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            s_phi = self.phi(s) if self.phi is not None else s
            s_psi = self._psi(s) if self._psi is not None else s
            ss_psi = self._psi(ss) if self._psi is not None else ss

            q_next = self._V(ss_psi).item() if not absorbing else 0

            grad_mu_s = np.atleast_2d(self._mu.diff(s_phi))
            omega = self._A.get_weights()

            delta = r + self.mdp_info.gamma * q_next - self._Q(s, a)
            delta_theta = self._alpha_theta(s, a) * \
                          omega.dot(grad_mu_s.T).dot(grad_mu_s)
            delta_omega = self._alpha_omega(s, a) * delta * self._nu(s, a)
            delta_v = self._alpha_v(s, a) * delta * s_psi

            theta_new = self._mu.get_weights() + delta_theta
            self._mu.set_weights(theta_new)

            omega_new = omega + delta_omega
            self._A.set_weights(omega_new)

            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

    def _Q(self, state, action):
        state_psi = self._psi(state) if self._psi is not None else state

        return self._V(state_psi).item() + self._A(self._nu(state,
                                                            action)).item()

    def _nu(self, state, action):
        state_phi = self.phi(state) if self.phi is not None else state
        grad_mu = np.atleast_2d(self._mu.diff(state_phi))
        delta = action - self._mu(state_phi)

        return delta.dot(grad_mu)
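
To make the compatible features used by _Q and _nu concrete, the following
standalone sketch evaluates nu(s, a) = (a - mu(s)) . dmu/dtheta(s) for a
one-dimensional linear deterministic policy; the weight, state, and action
values are hypothetical.

import numpy as np
from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator

# Linear deterministic policy mu(s) = w * s, with hypothetical weight w = .5
mu = Regressor(LinearApproximator, input_shape=(1,), output_shape=(1,))
mu.set_weights(np.array([.5]))

s = np.array([2.])
a = np.array([1.5])

grad_mu = np.atleast_2d(mu.diff(s))  # dmu/dtheta evaluated at s
delta = a - mu(s)                    # deviation of a from the policy mean
nu = delta.dot(grad_mu)              # compatible features

print(nu)  # the advantage is then A(s, a) = nu.dot(omega)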
Example #29
class StochasticAC(Agent):
    """
    Stochastic actor-critic in the episodic setting, as presented in:
    "Model-Free Reinforcement Learning with Continuous Action in Practice".
    Degris T. et al., 2012.

    """
    def __init__(self,
                 mdp_info,
                 policy,
                 alpha_theta,
                 alpha_v,
                 lambda_par=.9,
                 value_function_features=None,
                 policy_features=None):
        """
        Constructor.

        Args:
            alpha_theta (Parameter): learning rate for policy update;
            alpha_v (Parameter): learning rate for the value function;
            lambda_par (float, .9): trace decay parameter;
            value_function_features (Features, None): features used by the
                value function approximator;
            policy_features (Features, None): features used by the policy.

        """
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_v = alpha_v

        self._lambda = lambda_par

        super().__init__(mdp_info, policy, policy_features)

        if self._psi is not None:
            input_shape = (self._psi.size, )
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator,
                            input_shape=input_shape,
                            output_shape=(1, ))

        self._e_v = np.zeros(self._V.weights_size)
        self._e_theta = np.zeros(self.policy.weights_size)

        self._add_save_attr(_psi='pickle',
                            _alpha_theta='pickle',
                            _alpha_v='pickle',
                            _lambda='primitive',
                            _V='mushroom',
                            _e_v='numpy',
                            _e_theta='numpy')

    def episode_start(self):
        self._e_v = np.zeros(self._V.weights_size)
        self._e_theta = np.zeros(self.policy.weights_size)

        super().episode_start()

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            s_phi = self.phi(s) if self.phi is not None else s
            s_psi = self._psi(s) if self._psi is not None else s
            ss_psi = self._psi(ss) if self._psi is not None else ss

            v_next = self._V(ss_psi) if not absorbing else 0

            delta = self._compute_td_n_traces(a, r, v_next, s_psi, s_phi)

            # Update value function
            delta_v = self._alpha_v(s, a) * delta * self._e_v
            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

            # Update policy
            delta_theta = self._alpha_theta(s, a) * delta * self._e_theta
            theta_new = self.policy.get_weights() + delta_theta
            self.policy.set_weights(theta_new)

    def _compute_td_n_traces(self, a, r, v_next, s_psi, s_phi):
        # Compute TD error
        delta = r + self.mdp_info.gamma * v_next - self._V(s_psi)

        # Update traces
        self._e_v = self.mdp_info.gamma * self._lambda * self._e_v + s_psi
        self._e_theta = self.mdp_info.gamma * self._lambda * \
            self._e_theta + self.policy.diff_log(s_phi, a)

        return delta
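
The accumulating traces above decay by gamma * lambda at every step before the
current features are added, so a single TD error also credits the features of
recently visited states. A minimal numeric sketch with hypothetical features
and decay rates:

import numpy as np

gamma, lambda_ = .99, .9  # hypothetical discount and trace decay

e_v = np.zeros(3)
s_psi = np.array([1., 0., 2.])  # hypothetical value-function features

# e_v <- gamma * lambda * e_v + psi(s), as in _compute_td_n_traces
for _ in range(2):
    e_v = gamma * lambda_ * e_v + s_psi

print(e_v)  # [1.891, 0., 3.782]: the first visit decayed by gamma * lambda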
Example #30
import numpy as np
from matplotlib import pyplot as plt

from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator

# Noisy samples from the line y = 2 * x + 10
x = np.arange(10).reshape(-1, 1)

intercept = 10
noise = np.random.randn(10, 1)
y = 2 * x + intercept + noise

# Features: a constant bias column plus the raw input
phi = np.concatenate((np.ones(10).reshape(-1, 1), x), axis=1)

regressor = Regressor(LinearApproximator,
                      input_shape=(2, ),
                      output_shape=(1, ))

regressor.fit(phi, y)

print('Weights: ' + str(regressor.get_weights()))
# The gradient of a linear model w.r.t. its weights is its feature vector,
# so it is evaluated at a full feature vector [bias, x]
print('Gradient: ' + str(regressor.diff(np.array([1., 5.]))))

plt.scatter(x, y)
plt.plot(x, regressor.predict(phi))
plt.show()
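
As a sanity check not present in the original script: the model is linear in
phi, so the fitted weights should approximate the ordinary least-squares
solution, i.e. roughly [intercept, slope] = [10, 2] up to the injected noise.
This snippet continues the script above, reusing its phi and y.

# Compare against the closed-form least-squares fit of the same data
w_ls, *_ = np.linalg.lstsq(phi, y, rcond=None)
print('Least squares: ' + str(w_ls.ravel()))  # close to [10, 2]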