def __init__(self, network, input_shape, output_shape, std_0=1.,
             use_cuda=False, **params):
    """
    Constructor.

    Args:
        network (object): the network class used to implement the mean
            regressor;
        input_shape (tuple): the shape of the state space;
        output_shape (tuple): the shape of the action space;
        std_0 (float, 1.): initial standard deviation;
        params (dict): parameters used by the network constructor.

    """
    super().__init__(use_cuda)

    self._action_dim = output_shape[0]

    self._mu = Regressor(TorchApproximator, input_shape, output_shape,
                         network=network, use_cuda=use_cuda, **params)

    log_sigma_init = (torch.ones(self._action_dim) * np.log(std_0)).float()

    if self._use_cuda:
        log_sigma_init = log_sigma_init.cuda()

    self._log_sigma = nn.Parameter(log_sigma_init)
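# A standalone sketch of the state-independent standard deviation used in the
# constructor above: the log of std_0 is stored as a learnable parameter, and
# exponentiating it recovers the (always positive) standard deviation. The
# action dimension below is an arbitrary example value.
import numpy as np
import torch
import torch.nn as nn

std_0, action_dim = 1.5, 3
log_sigma = nn.Parameter((torch.ones(action_dim) * np.log(std_0)).float())
sigma = torch.exp(log_sigma)   # equals std_0 for every action dimension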
def learn(alg, alg_params):
    mdp = LQR.generate(dimensions=1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape,
                      params=approximator_params)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return policy
def __init__(self, mdp_info, policy, learning_rate, lambda_coeff,
             features, approximator_params=None):
    """
    Constructor.

    Args:
        lambda_coeff (float): eligibility trace coefficient.

    """
    self._approximator_params = dict() if approximator_params is None else \
        approximator_params

    self.Q = Regressor(LinearApproximator, **self._approximator_params)
    self.e = np.zeros(self.Q.weights_size)
    self._lambda = lambda_coeff
    self._q_old = None

    self._add_save_attr(
        _approximator_params='pickle',
        Q='pickle',
        _q_old='pickle',
        _lambda='numpy',
        e='numpy'
    )

    super().__init__(mdp_info, policy, self.Q, learning_rate, features)
def __init__(self, network, input_shape, output_shape, beta,
             use_cuda=False, **params):
    """
    Constructor.

    Args:
        network (object): the network class used to implement the mean
            regressor;
        input_shape (tuple): the shape of the state space;
        output_shape (tuple): the shape of the action space;
        beta ((float, Parameter)): the inverse of the temperature
            distribution. As the temperature approaches infinity, the
            policy becomes more and more random. As the temperature
            approaches 0.0, the policy becomes more and more greedy.
        params (dict): parameters used by the network constructor.

    """
    super().__init__(use_cuda)

    self._action_dim = output_shape[0]

    self._logits = Regressor(TorchApproximator, input_shape, output_shape,
                             network=network, use_cuda=use_cuda, **params)
    self._beta = to_parameter(beta)

    self._add_save_attr(
        _action_dim='primitive',
        _beta='mushroom',
        _logits='mushroom'
    )
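# A small torch sketch of how an inverse temperature beta shapes a Boltzmann
# (softmax) distribution over action logits, as described in the docstring
# above: larger beta (lower temperature) concentrates probability mass on the
# best action, smaller beta flattens the distribution. The logits are
# arbitrary example values, not taken from the original code.
import torch

logits = torch.tensor([1.0, 2.0, 0.5])
for beta in (0.1, 1.0, 10.0):
    probs = torch.softmax(beta * logits, dim=0)
    print(beta, probs)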
def __init__(self, mdp_info, policy, actor_optimizer, critic_params,
             ent_coeff, max_grad_norm=None, critic_fit_params=None):
    """
    Constructor.

    Args:
        policy (TorchPolicy): torch policy to be learned by the algorithm;
        actor_optimizer (dict): parameters to specify the actor optimizer
            algorithm;
        critic_params (dict): parameters of the critic approximator to
            build;
        ent_coeff (float, 0): coefficient for the entropy penalty;
        max_grad_norm (float, None): maximum norm for gradient clipping.
            If None, no clipping will be performed, unless specified
            otherwise in actor_optimizer;
        critic_fit_params (dict, None): parameters of the fitting algorithm
            of the critic approximator.

    """
    self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params
    self._entropy_coeff = ent_coeff

    self._V = Regressor(TorchApproximator, **critic_params)

    if 'clipping' not in actor_optimizer and max_grad_norm is not None:
        actor_optimizer = deepcopy(actor_optimizer)
        clipping_params = dict(max_norm=max_grad_norm, norm_type=2)
        actor_optimizer['clipping'] = dict(
            method=torch.nn.utils.clip_grad_norm_,
            params=clipping_params)

    super().__init__(mdp_info, policy, actor_optimizer, policy.parameters())
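# A minimal, standalone sketch (not part of the class above) of how the
# 'clipping' entry built in the constructor is typically consumed: the stored
# method is torch.nn.utils.clip_grad_norm_, applied to the actor parameters
# between backward() and step(). The toy model and optimizer below are
# illustrative assumptions, not taken from the original code.
import torch

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
clipping = dict(method=torch.nn.utils.clip_grad_norm_,
                params=dict(max_norm=0.5, norm_type=2))

loss = model(torch.randn(8, 4)).pow(2).mean()
optimizer.zero_grad()
loss.backward()
# Rescale gradients so their global norm does not exceed max_norm
clipping['method'](model.parameters(), **clipping['params'])
optimizer.step()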
def __init__(self, mdp_info, policy, actor_optimizer, critic_params,
             n_epochs_policy, batch_size, eps_ppo, lam, quiet=True,
             critic_fit_params=None):
    """
    Constructor.

    Args:
        policy (TorchPolicy): torch policy to be learned by the algorithm;
        actor_optimizer (dict): parameters to specify the actor optimizer
            algorithm;
        critic_params (dict): parameters of the critic approximator to
            build;
        n_epochs_policy (int): number of policy updates for every dataset;
        batch_size (int): size of minibatches for every optimization step;
        eps_ppo (float): value for probability ratio clipping;
        lam (float, 1.): lambda coefficient used by generalized advantage
            estimation;
        quiet (bool, True): if False, the algorithm will print debug
            information;
        critic_fit_params (dict, None): parameters of the fitting algorithm
            of the critic approximator.

    """
    self._critic_fit_params = dict(
        n_epochs=10) if critic_fit_params is None else critic_fit_params

    self._n_epochs_policy = n_epochs_policy
    self._batch_size = batch_size
    self._eps_ppo = eps_ppo

    self._optimizer = actor_optimizer['class'](policy.parameters(),
                                               **actor_optimizer['params'])

    self._lambda = lam

    self._V = Regressor(TorchApproximator, **critic_params)

    self._quiet = quiet
    self._iter = 1

    self._add_save_attr(_critic_fit_params='pickle',
                        _n_epochs_policy='primitive',
                        _batch_size='primitive',
                        _eps_ppo='primitive',
                        _optimizer='torch',
                        _lambda='primitive',
                        _V='mushroom',
                        _quiet='primitive',
                        _iter='primitive')

    super().__init__(mdp_info, policy, None)
def __init__(self, mdp_info, policy, actor_optimizer, critic_params,
             n_epochs_policy, batch_size, eps_ppo, lam, ent_coeff=0.0,
             critic_fit_params=None):
    """
    Constructor.

    Args:
        policy (TorchPolicy): torch policy to be learned by the algorithm;
        actor_optimizer (dict): parameters to specify the actor optimizer
            algorithm;
        critic_params (dict): parameters of the critic approximator to
            build;
        n_epochs_policy ([int, Parameter]): number of policy updates for
            every dataset;
        batch_size ([int, Parameter]): size of minibatches for every
            optimization step;
        eps_ppo ([float, Parameter]): value for probability ratio clipping;
        lam ([float, Parameter], 1.): lambda coefficient used by generalized
            advantage estimation;
        ent_coeff ([float, Parameter], 0.): coefficient for the entropy
            regularization term;
        critic_fit_params (dict, None): parameters of the fitting algorithm
            of the critic approximator.

    """
    self._critic_fit_params = dict(
        n_epochs=10) if critic_fit_params is None else critic_fit_params

    self._n_epochs_policy = to_parameter(n_epochs_policy)
    self._batch_size = to_parameter(batch_size)
    self._eps_ppo = to_parameter(eps_ppo)

    self._optimizer = actor_optimizer['class'](policy.parameters(),
                                               **actor_optimizer['params'])

    self._lambda = to_parameter(lam)
    self._ent_coeff = to_parameter(ent_coeff)

    self._V = Regressor(TorchApproximator, **critic_params)

    self._iter = 1

    self._add_save_attr(_critic_fit_params='pickle',
                        _n_epochs_policy='mushroom',
                        _batch_size='mushroom',
                        _eps_ppo='mushroom',
                        _ent_coeff='mushroom',
                        _optimizer='torch',
                        _lambda='mushroom',
                        _V='mushroom',
                        _iter='primitive')

    super().__init__(mdp_info, policy, None)
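# A minimal, self-contained sketch (not the class method itself) of the
# clipped surrogate objective that eps_ppo controls: the probability ratio
# between the new and old policy is clamped to [1 - eps, 1 + eps], and the
# pessimistic minimum of the clipped and unclipped advantage-weighted terms
# is maximized. The random tensors below stand in for real log-probabilities
# and advantages.
import torch

eps = 0.2
log_p_new = torch.randn(32, 1, requires_grad=True)
log_p_old = log_p_new.detach() + 0.1 * torch.randn(32, 1)
adv = torch.randn(32, 1)

ratio = torch.exp(log_p_new - log_p_old)
clipped = torch.clamp(ratio, 1 - eps, 1 + eps)
loss = -torch.mean(torch.min(ratio * adv, clipped * adv))
loss.backward()  # gradient w.r.t. the new policy's log-probabilities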
def __init__(self, mdp_info, policy, mu, alpha_theta, alpha_omega, alpha_v,
             value_function_features=None, policy_features=None):
    """
    Constructor.

    Args:
        mu (Regressor): regressor that describes the deterministic policy
            to be learned, i.e., the deterministic mapping between state
            and action;
        alpha_theta ([float, Parameter]): learning rate for policy update;
        alpha_omega ([float, Parameter]): learning rate for the advantage
            function;
        alpha_v ([float, Parameter]): learning rate for the value function;
        value_function_features (Features, None): features used by the
            value function approximator;
        policy_features (Features, None): features used by the policy.

    """
    self._mu = mu
    self._psi = value_function_features

    self._alpha_theta = to_parameter(alpha_theta)
    self._alpha_omega = to_parameter(alpha_omega)
    self._alpha_v = to_parameter(alpha_v)

    if self._psi is not None:
        input_shape = (self._psi.size,)
    else:
        input_shape = mdp_info.observation_space.shape

    self._V = Regressor(LinearApproximator, input_shape=input_shape,
                        output_shape=(1,))

    self._A = Regressor(LinearApproximator,
                        input_shape=(self._mu.weights_size,),
                        output_shape=(1,))

    self._add_save_attr(_mu='mushroom',
                        _psi='pickle',
                        _alpha_theta='mushroom',
                        _alpha_omega='mushroom',
                        _alpha_v='mushroom',
                        _V='mushroom',
                        _A='mushroom')

    super().__init__(mdp_info, policy, policy_features)
def __init__(self, mdp_info, policy, critic_params, ent_coeff=0.,
             max_kl=.001, lam=1., n_epochs_line_search=10, n_epochs_cg=10,
             cg_damping=1e-2, cg_residual_tol=1e-10, quiet=True,
             critic_fit_params=None):
    """
    Constructor.

    Args:
        policy (TorchPolicy): torch policy to be learned by the algorithm;
        critic_params (dict): parameters of the critic approximator to
            build;
        ent_coeff (float, 0): coefficient for the entropy penalty;
        max_kl (float, .001): maximum kl allowed for every policy update;
        lam (float, 1.): lambda coefficient used by generalized advantage
            estimation;
        n_epochs_line_search (int, 10): maximum number of iterations of the
            line search algorithm;
        n_epochs_cg (int, 10): maximum number of iterations of the
            conjugate gradient algorithm;
        cg_damping (float, 1e-2): damping factor for the conjugate gradient
            algorithm;
        cg_residual_tol (float, 1e-10): conjugate gradient residual
            tolerance;
        quiet (bool, True): if False, the algorithm will print debug
            information;
        critic_fit_params (dict, None): parameters of the fitting algorithm
            of the critic approximator.

    """
    self._critic_fit_params = dict(
        n_epochs=3) if critic_fit_params is None else critic_fit_params

    self._n_epochs_line_search = n_epochs_line_search
    self._n_epochs_cg = n_epochs_cg
    self._cg_damping = cg_damping
    self._cg_residual_tol = cg_residual_tol

    self._max_kl = max_kl
    self._ent_coeff = ent_coeff

    self._lambda = lam

    self._V = Regressor(TorchApproximator, **critic_params)

    self._iter = 1
    self._quiet = quiet

    self._old_policy = None

    super().__init__(mdp_info, policy, None)
def __init__(self, mdp_info, policy, alpha_theta, alpha_v, lambda_par=.9,
             value_function_features=None, policy_features=None):
    """
    Constructor.

    Args:
        alpha_theta (Parameter): learning rate for policy update;
        alpha_v (Parameter): learning rate for the value function;
        lambda_par (float, .9): trace decay parameter;
        value_function_features (Features, None): features used by the
            value function approximator;
        policy_features (Features, None): features used by the policy.

    """
    self._psi = value_function_features

    self._alpha_theta = alpha_theta
    self._alpha_v = alpha_v

    self._lambda = lambda_par

    super().__init__(mdp_info, policy, policy_features)

    if self._psi is not None:
        input_shape = (self._psi.size,)
    else:
        input_shape = mdp_info.observation_space.shape

    self._V = Regressor(LinearApproximator, input_shape=input_shape,
                        output_shape=(1,))

    self._e_v = np.zeros(self._V.weights_size)
    self._e_theta = np.zeros(self.policy.weights_size)

    self._add_save_attr(_psi='pickle',
                        _alpha_theta='pickle',
                        _alpha_v='pickle',
                        _lambda='primitive',
                        _V='mushroom',
                        _e_v='numpy',
                        _e_theta='numpy')
def __init__(self, mdp_info, policy, approximator, approximator_params=None,
             fit_params=None, features=None):
    """
    Constructor.

    Args:
        approximator (object): approximator used by the algorithm and the
            policy;
        approximator_params (dict, None): parameters of the approximator to
            build;
        fit_params (dict, None): parameters of the fitting algorithm of the
            approximator.

    """
    approximator_params = dict() if approximator_params is None else \
        approximator_params

    self._fit_params = dict() if fit_params is None else fit_params

    self.approximator = Regressor(approximator, **approximator_params)
    policy.set_q(self.approximator)

    self._add_save_attr(approximator='mushroom', _fit_params='pickle')

    super().__init__(mdp_info, policy, features)
def __init__(self, mdp_info, policy, approximator, learning_rate,
             lambda_coeff, features, approximator_params=None):
    """
    Constructor.

    Args:
        lambda_coeff ([float, Parameter]): eligibility trace coefficient.

    """
    approximator_params = dict() if approximator_params is None else \
        approximator_params

    Q = Regressor(approximator, **approximator_params)
    self.e = np.zeros(Q.weights_size)
    self._lambda = to_parameter(lambda_coeff)

    # _lambda is a Parameter here, so it is saved as a mushroom object,
    # consistently with the other Parameter attributes in this section.
    self._add_save_attr(_lambda='mushroom', e='numpy')

    super().__init__(mdp_info, policy, Q, learning_rate, features)
def test_ornstein_uhlenbeck_policy():
    np.random.seed(88)

    mu = Regressor(LinearApproximator, input_shape=(5,), output_shape=(2,))

    pi = OrnsteinUhlenbeckPolicy(mu, sigma=np.ones(1) * .2, theta=.15,
                                 dt=1e-2)

    w = np.random.randn(pi.weights_size)
    pi.set_weights(w)
    assert np.array_equal(pi.get_weights(), w)

    state = np.random.randn(5)

    action = pi.draw_action(state)
    action_test = np.array([-1.95896171, 1.91292747])
    assert np.allclose(action, action_test)

    pi.reset()

    action = pi.draw_action(state)
    action_test = np.array([-1.94161061, 1.92233358])
    assert np.allclose(action, action_test)

    try:
        pi(state, action)
    except NotImplementedError:
        pass
    else:
        assert False
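# A standalone numpy sketch of the Ornstein-Uhlenbeck noise process that the
# policy above adds to the deterministic action mu(s): the noise is pulled
# back towards zero at rate theta and perturbed by Gaussian increments scaled
# by sigma * sqrt(dt). The step count and constants below are illustrative.
import numpy as np

theta, sigma, dt = .15, .2, 1e-2
x = np.zeros(2)  # noise state, one component per action dimension
for _ in range(100):
    x = x + theta * (0. - x) * dt + sigma * np.sqrt(dt) * np.random.randn(2)
# x would be added to the mean action returned by the regressor mu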
def experiment(n_epochs, n_episodes):
    np.random.seed()

    logger = Logger(COPDAC_Q.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + COPDAC_Q.__name__)

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(mdp.info, policy, mu, alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi, policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high,
                                     phi, phi)
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        logger.epoch_info(i + 1, R_mean=np.sum(J) / n_steps / n_episodes)

    logger.info('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
def __init__(self, mdp_info, policy_class, policy_params, actor_params,
             actor_optimizer, critic_params, batch_size, replay_memory,
             tau, optimization_steps, comm, policy_delay=1,
             critic_fit_params=None):
    self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

    self._batch_size = batch_size
    self._tau = tau
    self._optimization_steps = optimization_steps
    self._comm = comm
    self._policy_delay = policy_delay
    self._fit_count = 0

    if comm.Get_rank() == 0:
        self._replay_memory = replay_memory

    target_critic_params = deepcopy(critic_params)
    self._critic_approximator = Regressor(TorchApproximator,
                                          **critic_params)
    self._target_critic_approximator = Regressor(TorchApproximator,
                                                 **target_critic_params)

    target_actor_params = deepcopy(actor_params)
    self._actor_approximator = Regressor(TorchApproximator,
                                         **actor_params)
    self._target_actor_approximator = Regressor(TorchApproximator,
                                                **target_actor_params)

    self._init_target(self._critic_approximator,
                      self._target_critic_approximator)
    self._init_target(self._actor_approximator,
                      self._target_actor_approximator)

    policy = policy_class(self._actor_approximator, **policy_params)

    policy_parameters = self._actor_approximator.model.network.parameters()

    self._add_save_attr(_critic_fit_params='pickle',
                        _batch_size='numpy',
                        _tau='numpy',
                        _policy_delay='numpy',
                        _fit_count='numpy',
                        _replay_memory='pickle',
                        _critic_approximator='pickle',
                        _target_critic_approximator='pickle',
                        _actor_approximator='pickle',
                        _target_actor_approximator='pickle')

    super().__init__(mdp_info, policy, actor_optimizer, policy_parameters)
class SARSALambdaContinuous(TD):
    """
    Continuous version of SARSA(lambda) algorithm.

    """
    def __init__(self, mdp_info, policy, approximator, learning_rate,
                 lambda_coeff, features, approximator_params=None):
        """
        Constructor.

        Args:
            lambda_coeff (float): eligibility trace coefficient.

        """
        self._approximator_params = dict() if approximator_params is None else \
            approximator_params

        self.Q = Regressor(approximator, **self._approximator_params)
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = lambda_coeff

        self._add_save_attr(_approximator_params='pickle',
                            Q='pickle',
                            _lambda='numpy',
                            e='numpy')

        super().__init__(mdp_info, policy, self.Q, learning_rate, features)

    def _update(self, state, action, reward, next_state, absorbing):
        phi_state = self.phi(state)
        q_current = self.Q.predict(phi_state, action)

        alpha = self.alpha(state, action)

        self.e = self.mdp_info.gamma * self._lambda * self.e + self.Q.diff(
            phi_state, action)

        self.next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self.next_action) if not absorbing else 0.

        delta = reward + self.mdp_info.gamma * q_next - q_current

        theta = self.Q.get_weights()
        theta += alpha * delta * self.e
        self.Q.set_weights(theta)

    def episode_start(self):
        self.e = np.zeros(self.Q.weights_size)

        super().episode_start()
def __init__(self, mdp_info, policy, approximator, learning_rate,
             lambda_coeff, features, approximator_params=None):
    """
    Constructor.

    Args:
        lambda_coeff (float): eligibility trace coefficient.

    """
    self._approximator_params = dict() if approximator_params is None else \
        approximator_params

    self.Q = Regressor(approximator, **self._approximator_params)
    self.e = np.zeros(self.Q.weights_size)
    self._lambda = lambda_coeff

    super().__init__(mdp_info, policy, self.Q, learning_rate, features)
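# A plain-numpy sketch of the eligibility-trace update performed by the
# _update method of the SARSA(lambda) class above: the trace decays by
# gamma * lambda, accumulates the gradient of Q(s, a) w.r.t. the weights
# (the features, for a linear Q), and the TD error is applied along the
# trace. All quantities below are toy stand-ins.
import numpy as np

gamma, lam, alpha = 0.99, 0.9, 0.1
theta = np.zeros(4)                      # linear Q weights
e = np.zeros(4)                          # eligibility trace
phi_sa = np.array([1., 0., .5, 0.])      # features of (s, a)
phi_next = np.array([0., 1., 0., .5])    # features of (s', a')
reward = 1.

q = theta @ phi_sa
q_next = theta @ phi_next
e = gamma * lam * e + phi_sa             # accumulating trace
delta = reward + gamma * q_next - q      # TD error
theta = theta + alpha * delta * e        # weight update along the trace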
def test_copdac_q():
    n_steps = 50
    mdp = InvertedPendulum(horizon=n_steps)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Agent
    n_tilings = 1
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(mdp.info, policy, mu, alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi, policy_features=phi)

    # Train
    core = Core(agent, mdp)
    core.learn(n_episodes=2, n_episodes_per_fit=1)

    w = agent.policy.get_weights()
    w_test = np.array([0, -6.62180045e-7, 0, -4.23972882e-2])

    assert np.allclose(w, w_test)
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(mdp.info, dist, policy, **params)

    # Train
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit, render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        logger.epoch_info(i + 1, J=np.mean(J),
                          mu=p[:n_weights],
                          sigma=p[n_weights:])

    logger.info('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(mdp.info, dist, policy, **params)

    # Train
    print(alg.__name__)
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit, render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        print('mu: ', p[:n_weights])
        print('sigma: ', p[n_weights:])
        print('Reward at iteration ' + str(i) + ': ' + str(np.mean(J)))

    print('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
def learn(alg, **alg_params):
    np.random.seed(1)
    torch.manual_seed(1)

    # MDP
    mdp = LQR.generate(dimensions=2)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(mdp.info, distribution, policy, **alg_params)

    core = Core(agent, mdp)

    core.learn(n_episodes=5, n_episodes_per_fit=5)

    return agent
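# A small numpy sketch of the black-box search distribution used in the
# snippets above: each policy evaluation draws a full weight vector for the
# deterministic linear policy from a diagonal Gaussian, and the action is
# just the linear map of the state. Shapes below are illustrative; in the
# snippets the weight size comes from Regressor.weights_size.
import numpy as np

n_state, n_action = 4, 2
n_weights = n_state * n_action
mu = np.zeros(n_weights)
sigma = 1e-3 * np.ones(n_weights)

w = mu + sigma * np.random.randn(n_weights)   # sample one candidate policy
W = w.reshape(n_action, n_state)              # linear policy weights
state = np.random.randn(n_state)
action = W @ state                            # deterministic action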
class PPO(Agent):
    """
    Proximal Policy Optimization algorithm.
    "Proximal Policy Optimization Algorithms".
    Schulman J. et al.. 2017.

    """
    def __init__(self, mdp_info, policy, actor_optimizer, critic_params,
                 n_epochs_policy, batch_size, eps_ppo, lam, quiet=True,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy (TorchPolicy): torch policy to be learned by the
                algorithm;
            actor_optimizer (dict): parameters to specify the actor
                optimizer algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            n_epochs_policy (int): number of policy updates for every
                dataset;
            batch_size (int): size of minibatches for every optimization
                step;
            eps_ppo (float): value for probability ratio clipping;
            lam (float, 1.): lambda coefficient used by generalized
                advantage estimation;
            quiet (bool, True): if False, the algorithm will print debug
                information;
            critic_fit_params (dict, None): parameters of the fitting
                algorithm of the critic approximator.

        """
        self._critic_fit_params = dict(
            n_epochs=10) if critic_fit_params is None else critic_fit_params

        self._n_epochs_policy = n_epochs_policy
        self._batch_size = batch_size
        self._eps_ppo = eps_ppo

        self._optimizer = actor_optimizer['class'](
            policy.parameters(), **actor_optimizer['params'])

        self._lambda = lam

        self._V = Regressor(TorchApproximator, **critic_params)

        self._quiet = quiet
        self._iter = 1

        self._add_save_attr(
            _critic_fit_params='pickle',
            _n_epochs_policy='primitive',
            _batch_size='primitive',
            _eps_ppo='primitive',
            _optimizer='torch',
            _lambda='primitive',
            _V='mushroom',
            _quiet='primitive',
            _iter='primitive'
        )

        super().__init__(mdp_info, policy, None)

    def fit(self, dataset):
        if not self._quiet:
            tqdm.write('Iteration ' + str(self._iter))

        x, u, r, xn, absorbing, last = parse_dataset(dataset)
        x = x.astype(np.float32)
        u = u.astype(np.float32)
        r = r.astype(np.float32)
        xn = xn.astype(np.float32)

        obs = to_float_tensor(x, self.policy.use_cuda)
        act = to_float_tensor(u, self.policy.use_cuda)
        v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                       self.mdp_info.gamma, self._lambda)
        np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
        adv = to_float_tensor(np_adv, self.policy.use_cuda)

        old_pol_dist = self.policy.distribution_t(obs)
        old_log_p = old_pol_dist.log_prob(act)[:, None].detach()

        self._V.fit(x, v_target, **self._critic_fit_params)

        self._update_policy(obs, act, adv, old_log_p)

        # Print fit information
        self._print_fit_info(dataset, x, v_target, old_pol_dist)
        self._iter += 1

    def _update_policy(self, obs, act, adv, old_log_p):
        for epoch in range(self._n_epochs_policy):
            for obs_i, act_i, adv_i, old_log_p_i in minibatch_generator(
                    self._batch_size, obs, act, adv, old_log_p):
                self._optimizer.zero_grad()
                prob_ratio = torch.exp(
                    self.policy.log_prob_t(obs_i, act_i) - old_log_p_i
                )
                clipped_ratio = torch.clamp(prob_ratio, 1 - self._eps_ppo,
                                            1 + self._eps_ppo)
                loss = -torch.mean(torch.min(prob_ratio * adv_i,
                                             clipped_ratio * adv_i))
                loss.backward()
                self._optimizer.step()

    def _print_fit_info(self, dataset, x, v_target, old_pol_dist):
        if not self._quiet:
            logging_verr = []
            torch_v_targets = torch.tensor(v_target, dtype=torch.float)
            for idx in range(len(self._V)):
                v_pred = torch.tensor(self._V(x, idx=idx),
                                      dtype=torch.float)
                v_err = F.mse_loss(v_pred, torch_v_targets)
                logging_verr.append(v_err.item())

            logging_ent = self.policy.entropy(x)
            new_pol_dist = self.policy.distribution(x)
            logging_kl = torch.mean(torch.distributions.kl.kl_divergence(
                new_pol_dist, old_pol_dist))
            avg_rwd = np.mean(compute_J(dataset))
            tqdm.write("Iterations Results:\n\trewards {} vf_loss {}\n\t"
                       "entropy {} kl {}".format(avg_rwd, logging_verr,
                                                 logging_ent, logging_kl))
            tqdm.write(
                '--------------------------------------------------------------------------------------------------')

    def _post_load(self):
        if self._optimizer is not None:
            update_optimizer_parameters(self._optimizer,
                                        list(self.policy.parameters()))
def __init__(self, mdp_info, policy_class, policy_params, actor_params,
             actor_optimizer, critic_params, batch_size,
             initial_replay_size, max_replay_size, tau, policy_delay=1,
             critic_fit_params=None):
    """
    Constructor.

    Args:
        policy_class (Policy): class of the policy;
        policy_params (dict): parameters of the policy to build;
        actor_params (dict): parameters of the actor approximator to build;
        actor_optimizer (dict): parameters to specify the actor optimizer
            algorithm;
        critic_params (dict): parameters of the critic approximator to
            build;
        batch_size (int): the number of samples in a batch;
        initial_replay_size (int): the number of samples to collect before
            starting the learning;
        max_replay_size (int): the maximum number of samples in the replay
            memory;
        tau (float): value of coefficient for soft updates;
        policy_delay (int, 1): the number of updates of the critic after
            which an actor update is implemented;
        critic_fit_params (dict, None): parameters of the fitting algorithm
            of the critic approximator.

    """
    self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

    self._batch_size = batch_size
    self._tau = tau
    self._policy_delay = policy_delay
    self._fit_count = 0

    self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size)

    target_critic_params = deepcopy(critic_params)
    self._critic_approximator = Regressor(TorchApproximator,
                                          **critic_params)
    self._target_critic_approximator = Regressor(TorchApproximator,
                                                 **target_critic_params)

    target_actor_params = deepcopy(actor_params)
    self._actor_approximator = Regressor(TorchApproximator,
                                         **actor_params)
    self._target_actor_approximator = Regressor(TorchApproximator,
                                                **target_actor_params)

    self._init_target(self._critic_approximator,
                      self._target_critic_approximator)
    self._init_target(self._actor_approximator,
                      self._target_actor_approximator)

    policy = policy_class(self._actor_approximator, **policy_params)

    policy_parameters = self._actor_approximator.model.network.parameters()

    super().__init__(mdp_info, policy, actor_optimizer, policy_parameters)
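# A short numpy sketch (not the class method itself) of the soft target
# update that tau controls in the constructors above: after each critic fit,
# the target network weights are moved a small step towards the online
# weights, theta_target <- tau * theta + (1 - tau) * theta_target. The
# vectors below are toy stand-ins for network weights.
import numpy as np

tau = 5e-3
theta = np.random.randn(8)     # online approximator weights
theta_target = np.zeros(8)     # target approximator weights

theta_target = tau * theta + (1 - tau) * theta_target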
class DDPG(DeepAC):
    """
    Deep Deterministic Policy Gradient algorithm.
    "Continuous Control with Deep Reinforcement Learning".
    Lillicrap T. P. et al.. 2016.

    """
    def __init__(self, mdp_info, policy_class, policy_params, actor_params,
                 actor_optimizer, critic_params, batch_size,
                 initial_replay_size, max_replay_size, tau, policy_delay=1,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy_class (Policy): class of the policy;
            policy_params (dict): parameters of the policy to build;
            actor_params (dict): parameters of the actor approximator to
                build;
            actor_optimizer (dict): parameters to specify the actor
                optimizer algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect
                before starting the learning;
            max_replay_size (int): the maximum number of samples in the
                replay memory;
            tau (float): value of coefficient for soft updates;
            policy_delay (int, 1): the number of updates of the critic
                after which an actor update is implemented;
            critic_fit_params (dict, None): parameters of the fitting
                algorithm of the critic approximator.

        """
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._tau = tau
        self._policy_delay = policy_delay
        self._fit_count = 0

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(TorchApproximator,
                                             **actor_params)
        self._target_actor_approximator = Regressor(TorchApproximator,
                                                    **target_actor_params)

        self._init_target(self._critic_approximator,
                          self._target_critic_approximator)
        self._init_target(self._actor_approximator,
                          self._target_actor_approximator)

        policy = policy_class(self._actor_approximator, **policy_params)

        policy_parameters = \
            self._actor_approximator.model.network.parameters()

        super().__init__(mdp_info, policy, actor_optimizer,
                         policy_parameters)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ = \
                self._replay_memory.get(self._batch_size)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self._critic_approximator.fit(state, action, q,
                                          **self._critic_fit_params)

            if self._fit_count % self._policy_delay == 0:
                loss = self._loss(state)
                self._optimize_actor_parameters(loss)

            self._update_target(self._critic_approximator,
                                self._target_critic_approximator)
            self._update_target(self._actor_approximator,
                                self._target_actor_approximator)

            self._fit_count += 1

    def _loss(self, state):
        action = self._actor_approximator(state, output_tensor=True)
        q = self._critic_approximator(state, action, output_tensor=True)

        return -q.mean()

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Action-values returned by the critic for ``next_state`` and the
            action returned by the actor.

        """
        a = self._target_actor_approximator(next_state)

        q = self._target_critic_approximator.predict(next_state, a)
        q *= 1 - absorbing

        return q
class A2C(DeepAC):
    """
    Advantage Actor Critic algorithm (A2C).
    Synchronous version of the A3C algorithm.
    "Asynchronous Methods for Deep Reinforcement Learning".
    Mnih V. et. al.. 2016.

    """
    def __init__(self, mdp_info, policy, actor_optimizer, critic_params,
                 ent_coeff, max_grad_norm=None, critic_fit_params=None):
        """
        Constructor.

        Args:
            policy (TorchPolicy): torch policy to be learned by the
                algorithm;
            actor_optimizer (dict): parameters to specify the actor
                optimizer algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            ent_coeff ([float, Parameter], 0): coefficient for the entropy
                penalty;
            max_grad_norm (float, None): maximum norm for gradient
                clipping. If None, no clipping will be performed, unless
                specified otherwise in actor_optimizer;
            critic_fit_params (dict, None): parameters of the fitting
                algorithm of the critic approximator.

        """
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._entropy_coeff = to_parameter(ent_coeff)

        self._V = Regressor(TorchApproximator, **critic_params)

        if 'clipping' not in actor_optimizer and max_grad_norm is not None:
            actor_optimizer = deepcopy(actor_optimizer)
            clipping_params = dict(max_norm=max_grad_norm, norm_type=2)
            actor_optimizer['clipping'] = dict(
                method=torch.nn.utils.clip_grad_norm_,
                params=clipping_params)

        self._add_save_attr(_critic_fit_params='pickle',
                            _entropy_coeff='mushroom',
                            _V='mushroom')

        super().__init__(mdp_info, policy, actor_optimizer,
                         policy.parameters())

    def fit(self, dataset):
        state, action, reward, next_state, absorbing, _ = parse_dataset(
            dataset)

        v, adv = compute_advantage_montecarlo(self._V, state, next_state,
                                              reward, absorbing,
                                              self.mdp_info.gamma)
        self._V.fit(state, v, **self._critic_fit_params)

        loss = self._loss(state, action, adv)
        self._optimize_actor_parameters(loss)

    def _loss(self, state, action, adv):
        use_cuda = self.policy.use_cuda

        s = to_float_tensor(state, use_cuda)
        a = to_float_tensor(action, use_cuda)
        adv_t = to_float_tensor(adv, use_cuda)

        gradient_loss = -torch.mean(self.policy.log_prob_t(s, a) * adv_t)
        entropy_loss = -self.policy.entropy_t(s)

        return gradient_loss + self._entropy_coeff() * entropy_loss

    def _post_load(self):
        self._update_optimizer_parameters(self.policy.parameters())
def __init__(self, mdp_info, actor_mu_params, actor_sigma_params,
             actor_optimizer, critic_params, batch_size,
             initial_replay_size, max_replay_size, warmup_transitions, tau,
             lr_alpha, target_entropy=None, critic_fit_params=None):
    """
    Constructor.

    Args:
        actor_mu_params (dict): parameters of the actor mean approximator
            to build;
        actor_sigma_params (dict): parameters of the actor sigma
            approximator to build;
        actor_optimizer (dict): parameters to specify the actor optimizer
            algorithm;
        critic_params (dict): parameters of the critic approximator to
            build;
        batch_size (int): the number of samples in a batch;
        initial_replay_size (int): the number of samples to collect before
            starting the learning;
        max_replay_size (int): the maximum number of samples in the replay
            memory;
        warmup_transitions (int): number of samples to accumulate in the
            replay memory to start the policy fitting;
        tau (float): value of coefficient for soft updates;
        lr_alpha (float): learning rate for the entropy coefficient;
        target_entropy (float, None): target entropy for the policy; if
            None, a default value is computed;
        critic_fit_params (dict, None): parameters of the fitting algorithm
            of the critic approximator.

    """
    self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

    self._batch_size = batch_size
    self._warmup_transitions = warmup_transitions
    self._tau = tau

    if target_entropy is None:
        self._target_entropy = -np.prod(
            mdp_info.action_space.shape).astype(np.float32)
    else:
        self._target_entropy = target_entropy

    self._replay_memory = ReplayMemory(initial_replay_size, max_replay_size)

    if 'n_models' in critic_params.keys():
        assert critic_params['n_models'] == 2
    else:
        critic_params['n_models'] = 2

    target_critic_params = deepcopy(critic_params)
    self._critic_approximator = Regressor(TorchApproximator,
                                          **critic_params)
    self._target_critic_approximator = Regressor(TorchApproximator,
                                                 **target_critic_params)

    actor_mu_approximator = Regressor(TorchApproximator,
                                      **actor_mu_params)
    actor_sigma_approximator = Regressor(TorchApproximator,
                                         **actor_sigma_params)

    policy = SACPolicy(actor_mu_approximator,
                       actor_sigma_approximator,
                       mdp_info.action_space.low,
                       mdp_info.action_space.high)

    self._init_target(self._critic_approximator,
                      self._target_critic_approximator)

    self._log_alpha = torch.tensor(0., dtype=torch.float32)

    if policy.use_cuda:
        self._log_alpha = self._log_alpha.cuda().requires_grad_()
    else:
        self._log_alpha.requires_grad_()

    self._alpha_optim = optim.Adam([self._log_alpha], lr=lr_alpha)

    policy_parameters = chain(
        actor_mu_approximator.model.network.parameters(),
        actor_sigma_approximator.model.network.parameters())

    self._add_save_attr(
        _critic_fit_params='pickle',
        _batch_size='numpy',
        _warmup_transitions='numpy',
        _tau='numpy',
        _target_entropy='numpy',
        _replay_memory='pickle',
        _critic_approximator='pickle',
        _target_critic_approximator='pickle',
        _log_alpha='pickle',
        _alpha_optim='pickle'
    )

    super().__init__(mdp_info, policy, actor_optimizer, policy_parameters)
class SAC(DeepAC):
    """
    Soft Actor-Critic algorithm.
    "Soft Actor-Critic Algorithms and Applications".
    Haarnoja T. et al.. 2019.

    """
    def __init__(self, mdp_info, actor_mu_params, actor_sigma_params,
                 actor_optimizer, critic_params, batch_size,
                 initial_replay_size, max_replay_size, warmup_transitions,
                 tau, lr_alpha, target_entropy=None, critic_fit_params=None):
        """
        Constructor.

        Args:
            actor_mu_params (dict): parameters of the actor mean
                approximator to build;
            actor_sigma_params (dict): parameters of the actor sigma
                approximator to build;
            actor_optimizer (dict): parameters to specify the actor
                optimizer algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect
                before starting the learning;
            max_replay_size (int): the maximum number of samples in the
                replay memory;
            warmup_transitions (int): number of samples to accumulate in
                the replay memory to start the policy fitting;
            tau (float): value of coefficient for soft updates;
            lr_alpha (float): learning rate for the entropy coefficient;
            target_entropy (float, None): target entropy for the policy;
                if None, a default value is computed;
            critic_fit_params (dict, None): parameters of the fitting
                algorithm of the critic approximator.

        """
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._warmup_transitions = warmup_transitions
        self._tau = tau

        if target_entropy is None:
            self._target_entropy = -np.prod(
                mdp_info.action_space.shape).astype(np.float32)
        else:
            self._target_entropy = target_entropy

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        if 'n_models' in critic_params.keys():
            assert critic_params['n_models'] == 2
        else:
            critic_params['n_models'] = 2

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        actor_mu_approximator = Regressor(TorchApproximator,
                                          **actor_mu_params)
        actor_sigma_approximator = Regressor(TorchApproximator,
                                             **actor_sigma_params)

        policy = SACPolicy(actor_mu_approximator,
                           actor_sigma_approximator,
                           mdp_info.action_space.low,
                           mdp_info.action_space.high)

        self._init_target(self._critic_approximator,
                          self._target_critic_approximator)

        self._log_alpha = torch.tensor(0., dtype=torch.float32)

        if policy.use_cuda:
            self._log_alpha = self._log_alpha.cuda().requires_grad_()
        else:
            self._log_alpha.requires_grad_()

        self._alpha_optim = optim.Adam([self._log_alpha], lr=lr_alpha)

        policy_parameters = chain(
            actor_mu_approximator.model.network.parameters(),
            actor_sigma_approximator.model.network.parameters())

        self._add_save_attr(
            _critic_fit_params='pickle',
            _batch_size='numpy',
            _warmup_transitions='numpy',
            _tau='numpy',
            _target_entropy='numpy',
            _replay_memory='pickle',
            _critic_approximator='pickle',
            _target_critic_approximator='pickle',
            _log_alpha='pickle',
            _alpha_optim='pickle'
        )

        super().__init__(mdp_info, policy, actor_optimizer,
                         policy_parameters)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ = \
                self._replay_memory.get(self._batch_size)

            if self._replay_memory.size > self._warmup_transitions:
                action_new, log_prob = \
                    self.policy.compute_action_and_log_prob_t(state)
                loss = self._loss(state, action_new, log_prob)
                self._optimize_actor_parameters(loss)
                self._update_alpha(log_prob.detach())

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self._critic_approximator.fit(state, action, q,
                                          **self._critic_fit_params)

            self._update_target(self._critic_approximator,
                                self._target_critic_approximator)

    def _loss(self, state, action_new, log_prob):
        q_0 = self._critic_approximator(state, action_new,
                                        output_tensor=True, idx=0)
        q_1 = self._critic_approximator(state, action_new,
                                        output_tensor=True, idx=1)

        q = torch.min(q_0, q_1)

        return (self._alpha * log_prob - q).mean()

    def _update_alpha(self, log_prob):
        alpha_loss = - (self._log_alpha *
                        (log_prob + self._target_entropy)).mean()
        self._alpha_optim.zero_grad()
        alpha_loss.backward()
        self._alpha_optim.step()

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Action-values returned by the critic for ``next_state`` and the
            action returned by the actor.

        """
        a, log_prob_next = self.policy.compute_action_and_log_prob(
            next_state)

        q = self._target_critic_approximator.predict(
            next_state, a, prediction='min') - self._alpha_np * log_prob_next
        q *= 1 - absorbing

        return q

    def _post_load(self):
        if self._optimizer is not None:
            self._parameters = list(chain(
                self.policy._mu_approximator.model.network.parameters(),
                self.policy._sigma_approximator.model.network.parameters()
            ))

    @property
    def _alpha(self):
        return self._log_alpha.exp()

    @property
    def _alpha_np(self):
        return self._alpha.detach().cpu().numpy()
class COPDAC_Q(Agent):
    """
    Compatible off-policy deterministic actor-critic algorithm.
    "Deterministic Policy Gradient Algorithms".
    Silver D. et al.. 2014.

    """
    def __init__(self, mdp_info, policy, mu, alpha_theta, alpha_omega,
                 alpha_v, value_function_features=None,
                 policy_features=None):
        """
        Constructor.

        Args:
            mu (Regressor): regressor that describes the deterministic
                policy to be learned, i.e., the deterministic mapping
                between state and action;
            alpha_theta (Parameter): learning rate for policy update;
            alpha_omega (Parameter): learning rate for the advantage
                function;
            alpha_v (Parameter): learning rate for the value function;
            value_function_features (Features, None): features used by the
                value function approximator;
            policy_features (Features, None): features used by the policy.

        """
        self._mu = mu
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_omega = alpha_omega
        self._alpha_v = alpha_v

        if self._psi is not None:
            input_shape = (self._psi.size,)
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator, input_shape=input_shape,
                            output_shape=(1,))

        self._A = Regressor(LinearApproximator,
                            input_shape=(self._mu.weights_size,),
                            output_shape=(1,))

        self._add_save_attr(
            _mu='mushroom',
            _psi='pickle',
            _alpha_theta='pickle',
            _alpha_omega='pickle',
            _alpha_v='pickle',
            _V='mushroom',
            _A='mushroom'
        )

        super().__init__(mdp_info, policy, policy_features)

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            s_phi = self.phi(s) if self.phi is not None else s
            s_psi = self._psi(s) if self._psi is not None else s
            ss_psi = self._psi(ss) if self._psi is not None else ss

            q_next = self._V(ss_psi).item() if not absorbing else 0

            grad_mu_s = np.atleast_2d(self._mu.diff(s_phi))
            omega = self._A.get_weights()

            delta = r + self.mdp_info.gamma * q_next - self._Q(s, a)

            delta_theta = self._alpha_theta(s, a) * \
                omega.dot(grad_mu_s.T).dot(grad_mu_s)
            delta_omega = self._alpha_omega(s, a) * delta * self._nu(s, a)
            delta_v = self._alpha_v(s, a) * delta * s_psi

            theta_new = self._mu.get_weights() + delta_theta
            self._mu.set_weights(theta_new)

            omega_new = omega + delta_omega
            self._A.set_weights(omega_new)

            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

    def _Q(self, state, action):
        state_psi = self._psi(state) if self._psi is not None else state

        return self._V(state_psi).item() + self._A(
            self._nu(state, action)).item()

    def _nu(self, state, action):
        state_phi = self.phi(state) if self.phi is not None else state
        grad_mu = np.atleast_2d(self._mu.diff(state_phi))
        delta = action - self._mu(state_phi)

        return delta.dot(grad_mu)
class StochasticAC(Agent):
    """
    Stochastic Actor critic in the episodic setting as presented in:
    "Model-Free Reinforcement Learning with Continuous Action in Practice".
    Degris T. et al.. 2012.

    """
    def __init__(self, mdp_info, policy, alpha_theta, alpha_v, lambda_par=.9,
                 value_function_features=None, policy_features=None):
        """
        Constructor.

        Args:
            alpha_theta (Parameter): learning rate for policy update;
            alpha_v (Parameter): learning rate for the value function;
            lambda_par (float, .9): trace decay parameter;
            value_function_features (Features, None): features used by the
                value function approximator;
            policy_features (Features, None): features used by the policy.

        """
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_v = alpha_v

        self._lambda = lambda_par

        super().__init__(mdp_info, policy, policy_features)

        if self._psi is not None:
            input_shape = (self._psi.size,)
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator, input_shape=input_shape,
                            output_shape=(1,))

        self._e_v = np.zeros(self._V.weights_size)
        self._e_theta = np.zeros(self.policy.weights_size)

        self._add_save_attr(_psi='pickle',
                            _alpha_theta='pickle',
                            _alpha_v='pickle',
                            _lambda='primitive',
                            _V='mushroom',
                            _e_v='numpy',
                            _e_theta='numpy')

    def episode_start(self):
        self._e_v = np.zeros(self._V.weights_size)
        self._e_theta = np.zeros(self.policy.weights_size)

        super().episode_start()

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            s_phi = self.phi(s) if self.phi is not None else s
            s_psi = self._psi(s) if self._psi is not None else s
            ss_psi = self._psi(ss) if self._psi is not None else ss

            v_next = self._V(ss_psi) if not absorbing else 0

            delta = self._compute_td_n_traces(a, r, v_next, s_psi, s_phi)

            # Update value function
            delta_v = self._alpha_v(s, a) * delta * self._e_v
            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

            # Update policy
            delta_theta = self._alpha_theta(s, a) * delta * self._e_theta
            theta_new = self.policy.get_weights() + delta_theta
            self.policy.set_weights(theta_new)

    def _compute_td_n_traces(self, a, r, v_next, s_psi, s_phi):
        # Compute TD error
        delta = r + self.mdp_info.gamma * v_next - self._V(s_psi)

        # Update traces
        self._e_v = self.mdp_info.gamma * self._lambda * self._e_v + s_psi
        self._e_theta = self.mdp_info.gamma * self._lambda * \
            self._e_theta + self.policy.diff_log(s_phi, a)

        return delta
import numpy as np
from matplotlib import pyplot as plt

from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator

# Noisy samples from the line y = 2x + 10
x = np.arange(10).reshape(-1, 1)

intercept = 10
noise = np.random.randn(10, 1) * 1
y = 2 * x + intercept + noise

# Features: a constant term for the intercept and the raw input
phi = np.concatenate((np.ones(10).reshape(-1, 1), x), axis=1)

regressor = Regressor(LinearApproximator,
                      input_shape=(2,),
                      output_shape=(1,))

regressor.fit(phi, y)

print('Weights: ' + str(regressor.get_weights()))
print('Gradient: ' + str(regressor.diff(np.array([[5.]]))))

plt.scatter(x, y)
plt.plot(x, regressor.predict(phi))
plt.show()
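# As a quick sanity check (an addition, not part of the original snippet),
# the weights learned by the LinearApproximator above can be compared with
# the ordinary least-squares solution computed directly on the same features.
w_lstsq, *_ = np.linalg.lstsq(phi, y, rcond=None)
print('Least-squares weights: ' + str(w_lstsq.ravel()))
# Both should be close to [intercept, slope] = [10, 2] up to noise.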