class DDPG(Agent):
    """
    Deep Deterministic Policy Gradient algorithm.
    "Continuous Control with Deep Reinforcement Learning".
    Lillicrap T. P. et al.. 2016.

    """
    def __init__(self, actor_approximator, critic_approximator, policy_class,
                 mdp_info, batch_size, initial_replay_size, max_replay_size,
                 tau, actor_params, critic_params, policy_params,
                 actor_fit_params=None, critic_fit_params=None):
        """
        Constructor.

        Args:
            actor_approximator (object): the approximator to use for the
                actor;
            critic_approximator (object): the approximator to use for the
                critic;
            policy_class (Policy): class of the policy;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            tau (float): value of coefficient for soft updates;
            actor_params (dict): parameters of the actor approximator to
                build;
            critic_params (dict): parameters of the critic approximator to
                build;
            policy_params (dict): parameters of the policy to build;
            actor_fit_params (dict, None): parameters of the fitting algorithm
                of the actor approximator;
            critic_fit_params (dict, None): parameters of the fitting
                algorithm of the critic approximator;

        """
        self._actor_fit_params = dict() if actor_fit_params is None else actor_fit_params
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._tau = tau

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(critic_approximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(critic_approximator,
                                                     **target_critic_params)

        if 'loss' not in actor_params:
            actor_params['loss'] = ActorLoss(self._critic_approximator)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(actor_approximator,
                                             **actor_params)
        self._target_actor_approximator = Regressor(actor_approximator,
                                                    **target_actor_params)

        self._target_actor_approximator.model.set_weights(
            self._actor_approximator.model.get_weights())
        self._target_critic_approximator.model.set_weights(
            self._critic_approximator.model.get_weights())

        policy = policy_class(self._actor_approximator, **policy_params)
        super().__init__(policy, mdp_info)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ =\
                self._replay_memory.get(self._batch_size)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self._critic_approximator.fit(state, action, q,
                                          **self._critic_fit_params)
            self._actor_approximator.fit(state, state,
                                         **self._actor_fit_params)

            self._update_target()

    def _update_target(self):
        """
        Update the target networks.

        """
        critic_weights = self._tau * self._critic_approximator.model.get_weights()
        critic_weights += (1 - self._tau) * self._target_critic_approximator.get_weights()
        self._target_critic_approximator.set_weights(critic_weights)

        actor_weights = self._tau * self._actor_approximator.model.get_weights()
        actor_weights += (1 - self._tau) * self._target_actor_approximator.get_weights()
        self._target_actor_approximator.set_weights(actor_weights)

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Action-values returned by the critic for ``next_state`` and the
            action returned by the actor.

        """
        a = self._target_actor_approximator(next_state)
        q = self._target_critic_approximator.predict(next_state, a)
        q *= 1 - absorbing

        return q
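# A minimal numpy sketch (not library code) of the soft target update performed
# by _update_target above: the target weights track the learned weights by
# Polyak averaging, theta_target <- tau * theta + (1 - tau) * theta_target.
# The weight values below are arbitrary placeholders.
import numpy as np

tau = 0.005
weights = np.array([1.0, -2.0, 0.5])          # current network weights
target_weights = np.array([0.9, -1.8, 0.4])   # target network weights

target_weights = tau * weights + (1 - tau) * target_weights
print(target_weights)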
import numpy as np
from matplotlib import pyplot as plt

from mushroom.approximators import Regressor
from mushroom.approximators.parametric import LinearApproximator

x = np.arange(10).reshape(-1, 1)

intercept = 10
noise = np.random.randn(10, 1) * 1
y = 2 * x + intercept + noise

phi = np.concatenate((np.ones(10).reshape(-1, 1), x), axis=1)

regressor = Regressor(LinearApproximator,
                      input_shape=(2,),
                      output_shape=(1,))

regressor.fit(phi, y)

print('Weights: ' + str(regressor.get_weights()))
print('Gradient: ' + str(regressor.diff(np.array([[5.]]))))

plt.scatter(x, y)
plt.plot(x, regressor.predict(phi))
plt.show()
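# A quick sanity check (not part of the original script, run right after it) of
# what the fit above should recover: the ordinary least-squares solution on the
# same features, whose entries should be close to the true intercept (10) and
# slope (2) up to the injected noise.
w, _, _, _ = np.linalg.lstsq(phi, y, rcond=None)
print('Closed-form weights: ' + str(w.ravel()))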
class TRPO(Agent):
    """
    Trust Region Policy Optimization algorithm.
    "Trust Region Policy Optimization".
    Schulman J. et al.. 2015.

    """
    def __init__(self, mdp_info, policy, critic_params, ent_coeff=0.,
                 max_kl=.001, lam=1., n_epochs_line_search=10, n_epochs_cg=10,
                 cg_damping=1e-2, cg_residual_tol=1e-10, quiet=True,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy (TorchPolicy): torch policy to be learned by the algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            ent_coeff (float, 0): coefficient for the entropy penalty;
            max_kl (float, .001): maximum kl allowed for every policy update;
            lam (float, 1.): lambda coefficient used by generalized advantage
                estimation;
            n_epochs_line_search (int, 10): maximum number of iterations of the
                line search algorithm;
            n_epochs_cg (int, 10): maximum number of iterations of the
                conjugate gradient algorithm;
            cg_damping (float, 1e-2): damping factor for the conjugate gradient
                algorithm;
            cg_residual_tol (float, 1e-10): conjugate gradient residual
                tolerance;
            quiet (bool, True): if True, the algorithm will not print debug
                information;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict(
            n_epochs=3) if critic_fit_params is None else critic_fit_params

        self._n_epochs_line_search = n_epochs_line_search
        self._n_epochs_cg = n_epochs_cg
        self._cg_damping = cg_damping
        self._cg_residual_tol = cg_residual_tol

        self._max_kl = max_kl
        self._ent_coeff = ent_coeff

        self._lambda = lam

        self._V = Regressor(TorchApproximator, **critic_params)

        self._iter = 1
        self._quiet = quiet

        self._old_policy = None

        super().__init__(policy, mdp_info, None)

    def fit(self, dataset):
        if not self._quiet:
            tqdm.write('Iteration ' + str(self._iter))

        state, action, reward, next_state, absorbing, last = parse_dataset(
            dataset)
        x = state.astype(np.float32)
        u = action.astype(np.float32)
        r = reward.astype(np.float32)
        xn = next_state.astype(np.float32)

        obs = to_float_tensor(x, self.policy.use_cuda)
        act = to_float_tensor(u, self.policy.use_cuda)
        v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                       self.mdp_info.gamma, self._lambda)
        np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
        adv = to_float_tensor(np_adv, self.policy.use_cuda)

        # Policy update
        self._old_policy = deepcopy(self.policy)
        old_pol_dist = self._old_policy.distribution_t(obs)
        old_log_prob = self._old_policy.log_prob_t(obs, act).detach()

        zero_grad(self.policy.parameters())
        loss = self._compute_loss(obs, act, adv, old_log_prob)

        prev_loss = loss.item()

        # Compute Gradient
        loss.backward()
        g = get_gradient(self.policy.parameters())

        # Compute direction through conjugate gradient
        stepdir = self._conjugate_gradient(g, obs, old_pol_dist)

        # Line search
        self._line_search(obs, act, adv, old_log_prob, old_pol_dist,
                          prev_loss, stepdir)

        # VF update
        self._V.fit(x, v_target, **self._critic_fit_params)

        # Print fit information
        self._print_fit_info(dataset, x, v_target, old_pol_dist)
        self._iter += 1

    def _fisher_vector_product(self, p, obs, old_pol_dist):
        p_tensor = torch.from_numpy(p)
        if self.policy.use_cuda:
            p_tensor = p_tensor.cuda()

        return self._fisher_vector_product_t(p_tensor, obs, old_pol_dist)

    def _fisher_vector_product_t(self, p, obs, old_pol_dist):
        kl = self._compute_kl(obs, old_pol_dist)
        grads = torch.autograd.grad(kl, self.policy.parameters(),
                                    create_graph=True)
        flat_grad_kl = torch.cat([grad.view(-1) for grad in grads])

        kl_v = torch.sum(flat_grad_kl * p)
        grads_v = torch.autograd.grad(kl_v, self.policy.parameters(),
                                      create_graph=False)
        flat_grad_grad_kl = torch.cat(
            [grad.contiguous().view(-1) for grad in grads_v]).data

        return flat_grad_grad_kl + p * self._cg_damping

    def _conjugate_gradient(self, b, obs, old_pol_dist):
        p = b.detach().cpu().numpy()
        r = b.detach().cpu().numpy()
        x = np.zeros_like(p)
        r2 = r.dot(r)

        for i in range(self._n_epochs_cg):
            z = self._fisher_vector_product(
                p, obs, old_pol_dist).detach().cpu().numpy()
            v = r2 / p.dot(z)
            x += v * p
            r -= v * z
            r2_new = r.dot(r)

            mu = r2_new / r2
            p = r + mu * p
            r2 = r2_new
            if r2 < self._cg_residual_tol:
                break

        return x

    def _line_search(self, obs, act, adv, old_log_prob, old_pol_dist,
                     prev_loss, stepdir):
        # Compute optimal step size
        direction = self._fisher_vector_product(
            stepdir, obs, old_pol_dist).detach().cpu().numpy()
        shs = .5 * stepdir.dot(direction)
        lm = np.sqrt(shs / self._max_kl)
        full_step = stepdir / lm
        stepsize = 1.

        # Save old policy parameters
        theta_old = self.policy.get_weights()

        # Perform Line search
        violation = True
        for _ in range(self._n_epochs_line_search):
            theta_new = theta_old + full_step * stepsize
            self.policy.set_weights(theta_new)

            new_loss = self._compute_loss(obs, act, adv, old_log_prob)
            kl = self._compute_kl(obs, old_pol_dist)
            improve = new_loss - prev_loss
            if kl <= self._max_kl * 1.5 or improve >= 0:
                violation = False
                break
            stepsize *= .5

        if violation:
            self.policy.set_weights(theta_old)

    def _compute_kl(self, obs, old_pol_dist):
        new_pol_dist = self.policy.distribution_t(obs)

        return torch.mean(
            torch.distributions.kl.kl_divergence(old_pol_dist, new_pol_dist))

    def _compute_loss(self, obs, act, adv, old_log_prob):
        ratio = torch.exp(self.policy.log_prob_t(obs, act) - old_log_prob)
        J = torch.mean(ratio * adv)

        return J + self._ent_coeff * self.policy.entropy_t(obs)

    def _print_fit_info(self, dataset, x, v_target, old_pol_dist):
        if not self._quiet:
            logging_verr = []
            torch_v_targets = torch.tensor(v_target, dtype=torch.float)
            for idx in range(len(self._V)):
                v_pred = torch.tensor(self._V(x, idx=idx), dtype=torch.float)
                v_err = F.mse_loss(v_pred, torch_v_targets)
                logging_verr.append(v_err.item())

            logging_ent = self.policy.entropy(x)
            new_pol_dist = self.policy.distribution(x)
            logging_kl = torch.mean(
                torch.distributions.kl.kl_divergence(old_pol_dist,
                                                     new_pol_dist))
            avg_rwd = np.mean(compute_J(dataset))
            tqdm.write("Iterations Results:\n\trewards {} vf_loss {}\n\tentropy {} kl {}"
                       .format(avg_rwd, logging_verr, logging_ent, logging_kl))
            tqdm.write('--------------------------------------------------------------------------------------------------')
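# A minimal numpy sketch (not library code) of the conjugate-gradient solver
# mirrored by _conjugate_gradient above: it solves A x = b using only
# matrix-vector products, which is why TRPO only ever needs a Fisher-vector
# product and never the full Fisher matrix. The matrix A below is an arbitrary
# symmetric positive-definite example.
import numpy as np

def conjugate_gradient(matvec, b, n_iters=10, residual_tol=1e-10):
    x = np.zeros_like(b)
    r = b.copy()
    p = b.copy()
    r2 = r.dot(r)
    for _ in range(n_iters):
        z = matvec(p)
        v = r2 / p.dot(z)
        x += v * p                    # move along the conjugate direction
        r -= v * z                    # update the residual
        r2_new = r.dot(r)
        p = r + (r2_new / r2) * p     # next conjugate direction
        r2 = r2_new
        if r2 < residual_tol:
            break
    return x

A = np.array([[4., 1.], [1., 3.]])
b = np.array([1., 2.])
print(conjugate_gradient(lambda v: A.dot(v), b), np.linalg.solve(A, b))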
class TRPO(Agent):
    def __init__(self, mdp_info, policy, critic_params, ent_coeff=0.,
                 max_kl=.001, lam=1., n_epochs_line_search=10, n_epochs_cg=10,
                 cg_damping=1e-2, cg_residual_tol=1e-10, quiet=True,
                 critic_fit_params=None):
        """
        Constructor.

        Args:

        """
        self._critic_fit_params = dict(n_epochs=3) if critic_fit_params is None else critic_fit_params

        self._n_epochs_line_search = n_epochs_line_search
        self._n_epochs_cg = n_epochs_cg
        self._cg_damping = cg_damping
        self._cg_residual_tol = cg_residual_tol

        self._max_kl = max_kl
        self._ent_coeff = ent_coeff

        self._lambda = lam

        self._V = Regressor(TorchApproximator, **critic_params)

        self._iter = 1
        self._quiet = quiet

        super().__init__(policy, mdp_info, None)

    def fit(self, dataset):
        if not self._quiet:
            tqdm.write('Iteration ' + str(self._iter))

        state, action, reward, next_state, absorbing, last = parse_dataset(
            dataset)
        x = state.astype(np.float32)
        u = action.astype(np.float32)
        r = reward.astype(np.float32)
        xn = next_state.astype(np.float32)

        obs = torch.tensor(x, dtype=torch.float)
        act = torch.tensor(u, dtype=torch.float)
        v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                       self.mdp_info.gamma, self._lambda)
        np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
        adv = torch.tensor(np_adv, dtype=torch.float)

        # Policy update
        old_pol_dist = self.policy.distribution_t(obs)
        old_log_prob = self.policy.log_prob_t(obs, act).detach()

        self._zero_grad()
        loss = self._compute_loss(obs, act, adv, old_log_prob)

        prev_loss = loss.item()

        # Compute Gradient
        loss.backward(retain_graph=True)
        g = get_gradient(self.policy.parameters())

        # Compute direction through conjugate gradient
        stepdir = self._conjugate_gradient(g, obs, old_pol_dist)

        # Line search
        shs = .5 * stepdir.dot(self._fisher_vector_product(
            torch.from_numpy(stepdir), obs, old_pol_dist))
        lm = np.sqrt(shs / self._max_kl)
        fullstep = stepdir / lm
        stepsize = 1.

        theta_old = self.policy.get_weights()

        violation = True
        for _ in range(self._n_epochs_line_search):
            theta_new = theta_old + fullstep * stepsize
            self.policy.set_weights(theta_new)

            new_loss = self._compute_loss(obs, act, adv, old_log_prob)
            kl = self._compute_kl(obs, old_pol_dist)
            improve = new_loss - prev_loss
            if kl <= self._max_kl * 1.5 or improve >= 0:
                violation = False
                break
            stepsize *= .5

        if violation:
            self.policy.set_weights(theta_old)

        # VF update
        self._V.fit(x, v_target, **self._critic_fit_params)

        # Print fit information
        self._print_fit_info(dataset, x, v_target, old_pol_dist)
        self._iter += 1

    def _zero_grad(self):
        zero_grad(self.policy.parameters())

    def _conjugate_gradient(self, b, obs, old_pol_dist):
        p = b.detach().numpy()
        r = b.detach().numpy()
        x = np.zeros_like(b)
        rdotr = r.dot(r)

        for i in range(self._n_epochs_cg):
            z = self._fisher_vector_product(
                torch.from_numpy(p), obs, old_pol_dist).detach().numpy()
            v = rdotr / p.dot(z)
            x += v * p
            r -= v * z
            newrdotr = r.dot(r)
            mu = newrdotr / rdotr

            p = r + mu * p
            rdotr = newrdotr
            if rdotr < self._cg_residual_tol:
                break

        return x

    def _fisher_vector_product(self, p, obs, old_pol_dist):
        self._zero_grad()
        kl = self._compute_kl(obs, old_pol_dist)
        grads = torch.autograd.grad(kl, self.policy.parameters(),
                                    create_graph=True, retain_graph=True)
        flat_grad_kl = torch.cat([grad.view(-1) for grad in grads])

        kl_v = (flat_grad_kl * torch.autograd.Variable(p)).sum()
        grads = torch.autograd.grad(kl_v, self.policy.parameters(),
                                    retain_graph=True)
        flat_grad_grad_kl = torch.cat(
            [grad.contiguous().view(-1) for grad in grads]).data

        return flat_grad_grad_kl + p * self._cg_damping

    def _compute_kl(self, obs, old_pol_dist):
        new_pol_dist = self.policy.distribution_t(obs)

        return torch.mean(torch.distributions.kl.kl_divergence(
            new_pol_dist, old_pol_dist))

    def _compute_loss(self, obs, act, adv, old_log_prob):
        ratio = torch.exp(self.policy.log_prob_t(obs, act) - old_log_prob)
        J = torch.mean(ratio * adv)

        return J + self._ent_coeff * self.policy.entropy_t(obs)

    def _print_fit_info(self, dataset, x, v_target, old_pol_dist):
        if not self._quiet:
            logging_verr = []
            torch_v_targets = torch.tensor(v_target, dtype=torch.float)
            for idx in range(len(self._V)):
                v_pred = torch.tensor(self._V(x, idx=idx), dtype=torch.float)
                v_err = F.mse_loss(v_pred, torch_v_targets)
                logging_verr.append(v_err.item())

            logging_ent = self.policy.entropy(x)
            new_pol_dist = self.policy.distribution(x)
            logging_kl = torch.mean(
                torch.distributions.kl.kl_divergence(new_pol_dist,
                                                     old_pol_dist))
            avg_rwd = np.mean(compute_J(dataset))
            tqdm.write("Iterations Results:\n\trewards {} vf_loss {}\n\tentropy {} kl {}"
                       .format(avg_rwd, logging_verr, logging_ent, logging_kl))
            tqdm.write('--------------------------------------------------------------------------------------------------')
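# A minimal, self-contained sketch (not library code) of the Hessian-vector
# product trick behind _fisher_vector_product above: the product H v is
# obtained by differentiating (grad f)^T v a second time, so the full Hessian
# (or Fisher matrix) is never materialized. The toy function below is
# arbitrary and only serves to show the two autograd passes.
import torch

w = torch.randn(3, requires_grad=True)
f = (w ** 2).sum() + w.prod()              # toy scalar function of the parameters
v = torch.randn(3)                          # arbitrary vector to multiply by

grad = torch.autograd.grad(f, w, create_graph=True)[0]
grad_v = (grad * v).sum()                   # scalar: (grad f)^T v
hvp = torch.autograd.grad(grad_v, w)[0]     # H v, via a second backward pass
print(hvp)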
class A2C(DeepAC):
    """
    Advantage Actor Critic algorithm (A2C).
    Synchronous version of the A3C algorithm.
    "Asynchronous Methods for Deep Reinforcement Learning".
    Mnih V. et al.. 2016.

    """
    def __init__(self, mdp_info, policy, critic_params, actor_optimizer,
                 ent_coeff, max_grad_norm=None, critic_fit_params=None):
        """
        Constructor.

        Args:
            policy (TorchPolicy): torch policy to be learned by the algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            ent_coeff (float, 0): coefficient for the entropy penalty;
            max_grad_norm (float, None): maximum norm for gradient clipping.
                If None, no clipping will be performed, unless specified
                otherwise in actor_optimizer;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._entropy_coeff = ent_coeff

        self._V = Regressor(TorchApproximator, **critic_params)

        if 'clipping' not in actor_optimizer and max_grad_norm is not None:
            actor_optimizer = deepcopy(actor_optimizer)
            clipping_params = dict(max_norm=max_grad_norm, norm_type=2)
            actor_optimizer['clipping'] = dict(
                method=torch.nn.utils.clip_grad_norm_,
                params=clipping_params)

        super().__init__(policy, mdp_info, actor_optimizer,
                         policy.parameters())

    def fit(self, dataset):
        state, action, reward, next_state, absorbing, _ = parse_dataset(
            dataset)
        v, adv = compute_advantage_montecarlo(self._V, state, next_state,
                                              reward, absorbing,
                                              self.mdp_info.gamma)
        self._V.fit(state, v, **self._critic_fit_params)

        loss = self._loss(state, action, adv)
        self._optimize_actor_parameters(loss)

    def _loss(self, state, action, adv):
        use_cuda = self.policy.use_cuda

        s = to_float_tensor(state, use_cuda)
        a = to_float_tensor(action, use_cuda)
        adv_t = to_float_tensor(adv, use_cuda)

        gradient_loss = -torch.mean(self.policy.log_prob_t(s, a) * adv_t)
        entropy_loss = -self.policy.entropy_t(s)

        return gradient_loss + self._entropy_coeff * entropy_loss
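# A minimal numpy sketch (not library code) of the Monte-Carlo advantage that
# compute_advantage_montecarlo supplies to the A2C loss above: discounted
# returns are accumulated backwards over one episode and the critic's value
# prediction is subtracted as a baseline. The rewards and v_pred values below
# are arbitrary placeholders for a single short episode.
import numpy as np

gamma = 0.99
reward = np.array([1., 0., 0., 1.])
v_pred = np.array([0.5, 0.4, 0.6, 0.3])    # hypothetical critic predictions

returns = np.zeros_like(reward)
g = 0.
for t in reversed(range(len(reward))):     # backward pass over the episode
    g = reward[t] + gamma * g
    returns[t] = g

adv = returns - v_pred                     # advantage = return - baseline
print(returns, adv)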
class PPO(Agent):
    def __init__(self, mdp_info, policy, critic_params, actor_optimizer,
                 n_epochs_policy, batch_size, eps_ppo, lam, quiet=True,
                 critic_fit_params=None):
        self._critic_fit_params = dict(n_epochs=10) if critic_fit_params is None else critic_fit_params

        self._n_epochs_policy = n_epochs_policy
        self._batch_size = batch_size
        self._eps_ppo = eps_ppo

        self._optimizer = actor_optimizer['class'](policy.parameters(),
                                                   **actor_optimizer['params'])

        self._lambda = lam

        self._V = Regressor(TorchApproximator, **critic_params)

        self._quiet = quiet
        self._iter = 1

        super().__init__(policy, mdp_info, None)

    def fit(self, dataset):
        if not self._quiet:
            tqdm.write('Iteration ' + str(self._iter))

        x, u, r, xn, absorbing, last = parse_dataset(dataset)
        x = x.astype(np.float32)
        u = u.astype(np.float32)
        r = r.astype(np.float32)
        xn = xn.astype(np.float32)

        obs = torch.tensor(x, dtype=torch.float)
        act = torch.tensor(u, dtype=torch.float)
        v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                       self.mdp_info.gamma, self._lambda)
        np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
        adv = torch.tensor(np_adv, dtype=torch.float)

        old_pol_dist = self.policy.distribution_t(obs)
        old_log_p = old_pol_dist.log_prob(act)[:, None].detach()

        self._V.fit(x, v_target, **self._critic_fit_params)

        self._update_policy(obs, act, adv, old_log_p)

        # Print fit information
        self._print_fit_info(dataset, x, v_target, old_pol_dist)
        self._iter += 1

    def _update_policy(self, obs, act, adv, old_log_p):
        for epoch in range(self._n_epochs_policy):
            for obs_i, act_i, adv_i, old_log_p_i in minibatch_generator(
                    self._batch_size, obs, act, adv, old_log_p):
                self._optimizer.zero_grad()
                prob_ratio = torch.exp(
                    self.policy.log_prob_t(obs_i, act_i) - old_log_p_i)
                clipped_ratio = torch.clamp(prob_ratio, 1 - self._eps_ppo,
                                            1 + self._eps_ppo)
                loss = -torch.mean(torch.min(prob_ratio * adv_i,
                                             clipped_ratio * adv_i))
                loss.backward()
                self._optimizer.step()

    def _print_fit_info(self, dataset, x, v_target, old_pol_dist):
        if not self._quiet:
            logging_verr = []
            torch_v_targets = torch.tensor(v_target, dtype=torch.float)
            for idx in range(len(self._V)):
                v_pred = torch.tensor(self._V(x, idx=idx), dtype=torch.float)
                v_err = F.mse_loss(v_pred, torch_v_targets)
                logging_verr.append(v_err.item())

            logging_ent = self.policy.entropy(x)
            new_pol_dist = self.policy.distribution(x)
            logging_kl = torch.mean(torch.distributions.kl.kl_divergence(
                new_pol_dist, old_pol_dist))
            avg_rwd = np.mean(compute_J(dataset))
            tqdm.write("Iterations Results:\n\trewards {} vf_loss {}\n\tentropy {} kl {}"
                       .format(avg_rwd, logging_verr, logging_ent, logging_kl))
            tqdm.write('--------------------------------------------------------------------------------------------------')
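# A minimal numpy sketch (not library code) of one common formulation of
# generalized advantage estimation, the quantity compute_gae feeds to the
# policy update above: a backward recursion over the TD errors delta_t with
# discount gamma and decay lambda, reset at episode boundaries (last).
# The function name and argument layout are illustrative, not the library API.
import numpy as np

def gae_sketch(v, v_next, r, absorbing, last, gamma, lam):
    delta = r + gamma * v_next * (1 - absorbing) - v   # TD errors
    adv = np.zeros_like(delta)
    gae = 0.
    for t in reversed(range(len(delta))):              # backward recursion
        gae = delta[t] + gamma * lam * gae * (1 - last[t])
        adv[t] = gae
    return adv + v, adv                                # value targets, advantages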
class DDPG(ReparametrizationAC):
    """
    Deep Deterministic Policy Gradient algorithm.
    "Continuous Control with Deep Reinforcement Learning".
    Lillicrap T. P. et al.. 2016.

    """
    def __init__(self, mdp_info, policy_class, policy_params, batch_size,
                 initial_replay_size, max_replay_size, tau, critic_params,
                 actor_params, actor_optimizer, policy_delay=1,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy_class (Policy): class of the policy;
            policy_params (dict): parameters of the policy to build;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            tau (float): value of coefficient for soft updates;
            actor_params (dict): parameters of the actor approximator to
                build;
            critic_params (dict): parameters of the critic approximator to
                build;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            policy_delay (int, 1): the number of updates of the critic after
                which an actor update is implemented;
            critic_fit_params (dict, None): parameters of the fitting
                algorithm of the critic approximator;

        """
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._tau = tau
        self._policy_delay = policy_delay
        self._fit_count = 0

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(TorchApproximator,
                                             **actor_params)
        self._target_actor_approximator = Regressor(TorchApproximator,
                                                    **target_actor_params)

        self._init_target()

        policy = policy_class(self._actor_approximator, **policy_params)

        policy_parameters = self._actor_approximator.model.network.parameters()
        super().__init__(policy, mdp_info, actor_optimizer, policy_parameters)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ =\
                self._replay_memory.get(self._batch_size)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self._critic_approximator.fit(state, action, q,
                                          **self._critic_fit_params)

            if self._fit_count % self._policy_delay == 0:
                loss = self._loss(state)
                self._optimize_actor_parameters(loss)

            self._update_target()

            self._fit_count += 1

    def _loss(self, state):
        action = self._actor_approximator(state, output_tensor=True)
        q = self._critic_approximator(state, action, output_tensor=True)

        return -q.mean()

    def _init_target(self):
        """
        Init weights for target approximators.

        """
        self._target_actor_approximator.set_weights(
            self._actor_approximator.get_weights())
        self._target_critic_approximator.set_weights(
            self._critic_approximator.get_weights())

    def _update_target(self):
        """
        Update the target networks.

        """
        critic_weights = self._tau * self._critic_approximator.get_weights()
        critic_weights += (1 - self._tau) * self._target_critic_approximator.get_weights()
        self._target_critic_approximator.set_weights(critic_weights)

        actor_weights = self._tau * self._actor_approximator.get_weights()
        actor_weights += (1 - self._tau) * self._target_actor_approximator.get_weights()
        self._target_actor_approximator.set_weights(actor_weights)

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Action-values returned by the critic for ``next_state`` and the
            action returned by the actor.

        """
        a = self._target_actor_approximator(next_state)
        q = self._target_critic_approximator.predict(next_state, a)
        q *= 1 - absorbing

        return q
class SAC(ReparametrizationAC):
    """
    Soft Actor-Critic algorithm.
    "Soft Actor-Critic Algorithms and Applications".
    Haarnoja T. et al.. 2019.

    """
    def __init__(self, mdp_info, batch_size, initial_replay_size,
                 max_replay_size, warmup_transitions, tau, lr_alpha,
                 actor_mu_params, actor_sigma_params, actor_optimizer,
                 critic_params, target_entropy=None, critic_fit_params=None):
        """
        Constructor.

        Args:
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            warmup_transitions (int): number of samples to accumulate in the
                replay memory to start the policy fitting;
            tau (float): value of coefficient for soft updates;
            lr_alpha (float): learning rate for the entropy coefficient;
            actor_mu_params (dict): parameters of the actor mean approximator
                to build;
            actor_sigma_params (dict): parameters of the actor sigma
                approximator to build;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            target_entropy (float, None): target entropy for the policy, if
                None a default value is computed;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict() if critic_fit_params is None else critic_fit_params

        self._batch_size = batch_size
        self._warmup_transitions = warmup_transitions
        self._tau = tau

        if target_entropy is None:
            self._target_entropy = -np.prod(
                mdp_info.action_space.shape).astype(np.float32)
        else:
            self._target_entropy = target_entropy

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        if 'n_models' in critic_params.keys():
            assert critic_params['n_models'] == 2
        else:
            critic_params['n_models'] = 2
        if 'prediction' in critic_params.keys():
            assert critic_params['prediction'] == 'min'
        else:
            critic_params['prediction'] = 'min'

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        self._log_alpha = torch.tensor(0., requires_grad=True,
                                       dtype=torch.float32)
        self._alpha_optim = optim.Adam([self._log_alpha], lr=lr_alpha)

        actor_mu_approximator = Regressor(TorchApproximator,
                                          **actor_mu_params)
        actor_sigma_approximator = Regressor(TorchApproximator,
                                             **actor_sigma_params)

        policy = SACPolicy(actor_mu_approximator, actor_sigma_approximator,
                           mdp_info.action_space.low,
                           mdp_info.action_space.high)

        self._init_target()

        policy_parameters = chain(
            actor_mu_approximator.model.network.parameters(),
            actor_sigma_approximator.model.network.parameters())
        super().__init__(policy, mdp_info, actor_optimizer, policy_parameters)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ = \
                self._replay_memory.get(self._batch_size)

            if self._replay_memory.size > self._warmup_transitions:
                action_new, log_prob = self.policy.compute_action_and_log_prob_t(
                    state)
                loss = self._loss(state, action_new, log_prob)
                self._optimize_actor_parameters(loss)
                self._update_alpha(log_prob.detach())

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self._critic_approximator.fit(state, action, q,
                                          **self._critic_fit_params)

            self._update_target()

    def _init_target(self):
        """
        Init weights for target approximators.

        """
        for i in range(len(self._critic_approximator)):
            self._target_critic_approximator.model[i].set_weights(
                self._critic_approximator.model[i].get_weights())

    def _loss(self, state, action_new, log_prob):
        q_0 = self._critic_approximator(state, action_new,
                                        output_tensor=True, idx=0)
        q_1 = self._critic_approximator(state, action_new,
                                        output_tensor=True, idx=1)

        q = torch.min(q_0, q_1)

        return (self._alpha * log_prob - q).mean()

    def _update_alpha(self, log_prob):
        alpha_loss = - (self._log_alpha * (log_prob + self._target_entropy)).mean()
        self._alpha_optim.zero_grad()
        alpha_loss.backward()
        self._alpha_optim.step()

    def _update_target(self):
        """
        Update the target networks.

        """
        for i in range(len(self._target_critic_approximator)):
            critic_weights_i = self._tau * self._critic_approximator.model[i].get_weights()
            critic_weights_i += (1 - self._tau) * self._target_critic_approximator.model[i].get_weights()
            self._target_critic_approximator.model[i].set_weights(
                critic_weights_i)

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Action-values returned by the critic for ``next_state`` and the
            action returned by the actor.

        """
        a, log_prob_next = self.policy.compute_action_and_log_prob(next_state)

        q = self._target_critic_approximator.predict(
            next_state, a) - self._alpha_np * log_prob_next
        q *= 1 - absorbing

        return q

    @property
    def _alpha(self):
        return self._log_alpha.exp()

    @property
    def _alpha_np(self):
        return self._alpha.detach().cpu().numpy()
class PPO(Agent):
    """
    Proximal Policy Optimization algorithm.
    "Proximal Policy Optimization Algorithms".
    Schulman J. et al.. 2017.

    """
    def __init__(self, mdp_info, policy, critic_params, actor_optimizer,
                 n_epochs_policy, batch_size, eps_ppo, lam, quiet=True,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy (TorchPolicy): torch policy to be learned by the algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            n_epochs_policy (int): number of policy updates for every dataset;
            batch_size (int): size of minibatches for every optimization step;
            eps_ppo (float): value for probability ratio clipping;
            lam (float, 1.): lambda coefficient used by generalized advantage
                estimation;
            quiet (bool, True): if True, the algorithm will not print debug
                information;
            critic_fit_params (dict, None): parameters of the fitting algorithm
                of the critic approximator.

        """
        self._critic_fit_params = dict(n_epochs=10) if critic_fit_params is None else critic_fit_params

        self._n_epochs_policy = n_epochs_policy
        self._batch_size = batch_size
        self._eps_ppo = eps_ppo

        self._optimizer = actor_optimizer['class'](policy.parameters(),
                                                   **actor_optimizer['params'])

        self._lambda = lam

        self._V = Regressor(TorchApproximator, **critic_params)

        self._quiet = quiet
        self._iter = 1

        super().__init__(policy, mdp_info, None)

    def fit(self, dataset):
        if not self._quiet:
            tqdm.write('Iteration ' + str(self._iter))

        x, u, r, xn, absorbing, last = parse_dataset(dataset)
        x = x.astype(np.float32)
        u = u.astype(np.float32)
        r = r.astype(np.float32)
        xn = xn.astype(np.float32)

        obs = to_float_tensor(x, self.policy.use_cuda)
        act = to_float_tensor(u, self.policy.use_cuda)
        v_target, np_adv = compute_gae(self._V, x, xn, r, absorbing, last,
                                       self.mdp_info.gamma, self._lambda)
        np_adv = (np_adv - np.mean(np_adv)) / (np.std(np_adv) + 1e-8)
        adv = to_float_tensor(np_adv, self.policy.use_cuda)

        old_pol_dist = self.policy.distribution_t(obs)
        old_log_p = old_pol_dist.log_prob(act)[:, None].detach()

        self._V.fit(x, v_target, **self._critic_fit_params)

        self._update_policy(obs, act, adv, old_log_p)

        # Print fit information
        self._print_fit_info(dataset, x, v_target, old_pol_dist)
        self._iter += 1

    def _update_policy(self, obs, act, adv, old_log_p):
        for epoch in range(self._n_epochs_policy):
            for obs_i, act_i, adv_i, old_log_p_i in minibatch_generator(
                    self._batch_size, obs, act, adv, old_log_p):
                self._optimizer.zero_grad()
                prob_ratio = torch.exp(
                    self.policy.log_prob_t(obs_i, act_i) - old_log_p_i)
                clipped_ratio = torch.clamp(prob_ratio, 1 - self._eps_ppo,
                                            1 + self._eps_ppo)
                loss = -torch.mean(torch.min(prob_ratio * adv_i,
                                             clipped_ratio * adv_i))
                loss.backward()
                self._optimizer.step()

    def _print_fit_info(self, dataset, x, v_target, old_pol_dist):
        if not self._quiet:
            logging_verr = []
            torch_v_targets = torch.tensor(v_target, dtype=torch.float)
            for idx in range(len(self._V)):
                v_pred = torch.tensor(self._V(x, idx=idx), dtype=torch.float)
                v_err = F.mse_loss(v_pred, torch_v_targets)
                logging_verr.append(v_err.item())

            logging_ent = self.policy.entropy(x)
            new_pol_dist = self.policy.distribution(x)
            logging_kl = torch.mean(torch.distributions.kl.kl_divergence(
                new_pol_dist, old_pol_dist))
            avg_rwd = np.mean(compute_J(dataset))
            tqdm.write("Iterations Results:\n\trewards {} vf_loss {}\n\tentropy {} kl {}"
                       .format(avg_rwd, logging_verr, logging_ent, logging_kl))
            tqdm.write('--------------------------------------------------------------------------------------------------')
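# A minimal sketch (not library code) of the clipped surrogate objective
# minimised in _update_policy above, evaluated on fake tensors: the probability
# ratio is clipped to [1 - eps, 1 + eps] and the pessimistic minimum of the
# clipped and unclipped terms is taken before averaging.
import torch

eps = 0.2
new_log_p = torch.randn(32, 1)                     # log pi_new(a|s), fake values
old_log_p = torch.randn(32, 1)                     # log pi_old(a|s), fake values
adv = torch.randn(32, 1)                           # advantages, fake values

ratio = torch.exp(new_log_p - old_log_p)
clipped = torch.clamp(ratio, 1 - eps, 1 + eps)
loss = -torch.mean(torch.min(ratio * adv, clipped * adv))
print(loss.item())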