class SARSALambdaContinuous(TD):
    """
    Continuous version of SARSA(lambda) algorithm.

    """
    def __init__(self, mdp_info, policy, approximator, learning_rate,
                 lambda_coeff, features, approximator_params=None):
        """
        Constructor.

        Args:
            lambda_coeff (float): eligibility trace coefficient.

        """
        self._approximator_params = dict() if approximator_params is None else \
            approximator_params

        self.Q = Regressor(approximator, **self._approximator_params)
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = lambda_coeff

        self._add_save_attr(
            _approximator_params='pickle',
            Q='pickle',
            _lambda='numpy',
            e='numpy'
        )

        super().__init__(mdp_info, policy, self.Q, learning_rate, features)

    def _update(self, state, action, reward, next_state, absorbing):
        phi_state = self.phi(state)
        q_current = self.Q.predict(phi_state, action)

        alpha = self.alpha(state, action)

        self.e = self.mdp_info.gamma * self._lambda * self.e + self.Q.diff(
            phi_state, action)

        self.next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self.next_action) if not absorbing else 0.

        delta = reward + self.mdp_info.gamma * q_next - q_current

        theta = self.Q.get_weights()
        theta += alpha * delta * self.e
        self.Q.set_weights(theta)

    def episode_start(self):
        self.e = np.zeros(self.Q.weights_size)

        super().episode_start()
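A minimal usage sketch of SARSALambdaContinuous, assuming MushroomRL's Gym wrapper, tile-coding features over a two-dimensional observation space, and the Core learning loop; the environment name, exploration rate and learning rate are illustrative only.

import numpy as np

from mushroom_rl.core import Core
from mushroom_rl.environments import Gym
from mushroom_rl.features import Features
from mushroom_rl.features.tiles import Tiles
from mushroom_rl.policy import EpsGreedy
from mushroom_rl.approximators.parametric import LinearApproximator
from mushroom_rl.utils.parameters import Parameter

mdp = Gym('MountainCar-v0', horizon=10000, gamma=1.)

# Epsilon-greedy exploration over the approximated Q-function
pi = EpsGreedy(epsilon=Parameter(value=.1))

# Tile-coding features of the 2D observation space
tilings = Tiles.generate(10, [10, 10],
                         mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
features = Features(tilings=tilings)

approximator_params = dict(input_shape=(features.size,),
                           output_shape=(mdp.info.action_space.n,),
                           n_actions=mdp.info.action_space.n)
agent = SARSALambdaContinuous(mdp.info, pi, LinearApproximator,
                              learning_rate=Parameter(.1 / 10),
                              lambda_coeff=.9, features=features,
                              approximator_params=approximator_params)

Core(agent, mdp).learn(n_episodes=100, n_steps_per_fit=1)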
class COPDAC_Q(Agent):
    """
    Compatible off-policy deterministic actor-critic algorithm.
    "Deterministic Policy Gradient Algorithms". Silver D. et al.. 2014.

    """
    def __init__(self, mdp_info, policy, mu, alpha_theta, alpha_omega, alpha_v,
                 value_function_features=None, policy_features=None):
        """
        Constructor.

        Args:
            mu (Regressor): regressor that describes the deterministic policy
                to be learned, i.e., the deterministic mapping between state
                and action;
            alpha_theta (Parameter): learning rate for policy update;
            alpha_omega (Parameter): learning rate for the advantage function;
            alpha_v (Parameter): learning rate for the value function;
            value_function_features (Features, None): features used by the
                value function approximator;
            policy_features (Features, None): features used by the policy.

        """
        self._mu = mu
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_omega = alpha_omega
        self._alpha_v = alpha_v

        if self._psi is not None:
            input_shape = (self._psi.size,)
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator, input_shape=input_shape,
                            output_shape=(1,))

        self._A = Regressor(LinearApproximator,
                            input_shape=(self._mu.weights_size,),
                            output_shape=(1,))

        self._add_save_attr(
            _mu='mushroom',
            _psi='pickle',
            _alpha_theta='pickle',
            _alpha_omega='pickle',
            _alpha_v='pickle',
            _V='mushroom',
            _A='mushroom'
        )

        super().__init__(mdp_info, policy, policy_features)

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            s_phi = self.phi(s) if self.phi is not None else s
            s_psi = self._psi(s) if self._psi is not None else s
            ss_psi = self._psi(ss) if self._psi is not None else ss

            q_next = self._V(ss_psi).item() if not absorbing else 0

            grad_mu_s = np.atleast_2d(self._mu.diff(s_phi))
            omega = self._A.get_weights()

            delta = r + self.mdp_info.gamma * q_next - self._Q(s, a)

            delta_theta = self._alpha_theta(s, a) * \
                omega.dot(grad_mu_s.T).dot(grad_mu_s)
            delta_omega = self._alpha_omega(s, a) * delta * self._nu(s, a)
            delta_v = self._alpha_v(s, a) * delta * s_psi

            theta_new = self._mu.get_weights() + delta_theta
            self._mu.set_weights(theta_new)

            omega_new = omega + delta_omega
            self._A.set_weights(omega_new)

            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

    def _Q(self, state, action):
        state_psi = self._psi(state) if self._psi is not None else state

        return self._V(state_psi).item() + self._A(
            self._nu(state, action)).item()

    def _nu(self, state, action):
        state_phi = self.phi(state) if self.phi is not None else state
        grad_mu = np.atleast_2d(self._mu.diff(state_phi))
        delta = action - self._mu(state_phi)

        return delta.dot(grad_mu)
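A minimal construction sketch for COPDAC_Q, assuming a continuous-action environment `mdp` built elsewhere; the Gaussian exploration noise and the learning rates are illustrative assumptions, not prescribed values.

import numpy as np

from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator
from mushroom_rl.policy import GaussianPolicy
from mushroom_rl.utils.parameters import Parameter

n_states = mdp.info.observation_space.shape[0]
n_actions = mdp.info.action_space.shape[0]

# Deterministic policy mu: linear mapping from state to action
mu = Regressor(LinearApproximator, input_shape=(n_states,),
               output_shape=(n_actions,))

# Behaviour policy: Gaussian exploration noise around mu
sigma = 1e-1 * np.eye(n_actions)
pi = GaussianPolicy(mu, sigma)

agent = COPDAC_Q(mdp.info, pi, mu,
                 alpha_theta=Parameter(5e-3),
                 alpha_omega=Parameter(5e-1),
                 alpha_v=Parameter(5e-1))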
class StochasticAC(Agent):
    """
    Stochastic Actor critic in the episodic setting as presented in:
    "Model-Free Reinforcement Learning with Continuous Action in Practice".
    Degris T. et al.. 2012.

    """
    def __init__(self, mdp_info, policy, alpha_theta, alpha_v, lambda_par=.9,
                 value_function_features=None, policy_features=None):
        """
        Constructor.

        Args:
            alpha_theta (Parameter): learning rate for policy update;
            alpha_v (Parameter): learning rate for the value function;
            lambda_par (float, .9): trace decay parameter;
            value_function_features (Features, None): features used by the
                value function approximator;
            policy_features (Features, None): features used by the policy.

        """
        self._psi = value_function_features

        self._alpha_theta = alpha_theta
        self._alpha_v = alpha_v

        self._lambda = lambda_par

        super().__init__(mdp_info, policy, policy_features)

        if self._psi is not None:
            input_shape = (self._psi.size,)
        else:
            input_shape = mdp_info.observation_space.shape

        self._V = Regressor(LinearApproximator, input_shape=input_shape,
                            output_shape=(1,))

        self._e_v = np.zeros(self._V.weights_size)
        self._e_theta = np.zeros(self.policy.weights_size)

        self._add_save_attr(
            _psi='pickle',
            _alpha_theta='pickle',
            _alpha_v='pickle',
            _lambda='primitive',
            _V='mushroom',
            _e_v='numpy',
            _e_theta='numpy'
        )

    def episode_start(self):
        self._e_v = np.zeros(self._V.weights_size)
        self._e_theta = np.zeros(self.policy.weights_size)

        super().episode_start()

    def fit(self, dataset):
        for step in dataset:
            s, a, r, ss, absorbing, _ = step

            s_phi = self.phi(s) if self.phi is not None else s
            s_psi = self._psi(s) if self._psi is not None else s
            ss_psi = self._psi(ss) if self._psi is not None else ss

            v_next = self._V(ss_psi) if not absorbing else 0

            delta = self._compute_td_n_traces(a, r, v_next, s_psi, s_phi)

            # Update value function
            delta_v = self._alpha_v(s, a) * delta * self._e_v
            v_new = self._V.get_weights() + delta_v
            self._V.set_weights(v_new)

            # Update policy
            delta_theta = self._alpha_theta(s, a) * delta * self._e_theta
            theta_new = self.policy.get_weights() + delta_theta
            self.policy.set_weights(theta_new)

    def _compute_td_n_traces(self, a, r, v_next, s_psi, s_phi):
        # Compute TD error
        delta = r + self.mdp_info.gamma * v_next - self._V(s_psi)

        # Update traces
        self._e_v = self.mdp_info.gamma * self._lambda * self._e_v + s_psi
        self._e_theta = self.mdp_info.gamma * self._lambda * \
            self._e_theta + self.policy.diff_log(s_phi, a)

        return delta
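A minimal construction sketch for StochasticAC, assuming a continuous-action environment `mdp` with a two-dimensional observation space built elsewhere; the tile-coding features shared by actor and critic and all numeric values are illustrative.

from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator
from mushroom_rl.features import Features
from mushroom_rl.features.tiles import Tiles
from mushroom_rl.policy import StateStdGaussianPolicy
from mushroom_rl.utils.parameters import Parameter

tilings = Tiles.generate(10, [10, 10],
                         mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
phi = Features(tilings=tilings)

# Differentiable Gaussian policy with state-dependent mean and std,
# both linear in the features
mu = Regressor(LinearApproximator, input_shape=(phi.size,),
               output_shape=mdp.info.action_space.shape)
std = Regressor(LinearApproximator, input_shape=(phi.size,),
                output_shape=mdp.info.action_space.shape)
pi = StateStdGaussianPolicy(mu, std)

agent = StochasticAC(mdp.info, pi,
                     alpha_theta=Parameter(5e-3),
                     alpha_v=Parameter(5e-1),
                     lambda_par=.9,
                     value_function_features=phi,
                     policy_features=phi)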
import numpy as np
from matplotlib import pyplot as plt

from mushroom_rl.approximators import Regressor
from mushroom_rl.approximators.parametric import LinearApproximator

# Generate noisy samples from the line y = 2x + 10
x = np.arange(10).reshape(-1, 1)

intercept = 10
noise = np.random.randn(10, 1) * 1
y = 2 * x + intercept + noise

# The column of ones provides the intercept (bias) term of the linear model
phi = np.concatenate((np.ones(10).reshape(-1, 1), x), axis=1)

regressor = Regressor(LinearApproximator,
                      input_shape=(2,),
                      output_shape=(1,))

regressor.fit(phi, y)

print('Weights: ' + str(regressor.get_weights()))
print('Gradient: ' + str(regressor.diff(np.array([[5.]]))))

plt.scatter(x, y)
plt.plot(x, regressor.predict(phi))
plt.show()
class GaussianTorchPolicy(TorchPolicy):
    """
    Torch policy implementing a Gaussian policy with trainable standard
    deviation. The standard deviation is not state-dependent.

    """
    def __init__(self, network, input_shape, output_shape, std_0=1.,
                 use_cuda=False, **params):
        """
        Constructor.

        Args:
            network (object): the network class used to implement the mean
                regressor;
            input_shape (tuple): the shape of the state space;
            output_shape (tuple): the shape of the action space;
            std_0 (float, 1.): initial standard deviation;
            params (dict): parameters used by the network constructor.

        """
        super().__init__(use_cuda)

        self._action_dim = output_shape[0]

        self._mu = Regressor(TorchApproximator, input_shape, output_shape,
                             network=network, use_cuda=use_cuda, **params)

        log_sigma_init = (torch.ones(self._action_dim) * np.log(std_0)).float()

        if self._use_cuda:
            log_sigma_init = log_sigma_init.cuda()

        self._log_sigma = nn.Parameter(log_sigma_init)

    def draw_action_t(self, state):
        return self.distribution_t(state).sample().detach()

    def log_prob_t(self, state, action):
        return self.distribution_t(state).log_prob(action)[:, None]

    def entropy_t(self, state=None):
        return self._action_dim / 2 * np.log(2 * np.pi * np.e) + torch.sum(
            self._log_sigma)

    def distribution_t(self, state):
        mu, sigma = self.get_mean_and_covariance(state)

        return torch.distributions.MultivariateNormal(
            loc=mu, covariance_matrix=sigma)

    def get_mean_and_covariance(self, state):
        return self._mu(state, output_tensor=True), torch.diag(
            torch.exp(2 * self._log_sigma))

    def set_weights(self, weights):
        log_sigma_data = torch.from_numpy(weights[-self._action_dim:])
        if self.use_cuda:
            log_sigma_data = log_sigma_data.cuda()
        self._log_sigma.data = log_sigma_data

        self._mu.set_weights(weights[:-self._action_dim])

    def get_weights(self):
        mu_weights = self._mu.get_weights()
        sigma_weights = self._log_sigma.data.detach().cpu().numpy()

        return np.concatenate([mu_weights, sigma_weights])

    def parameters(self):
        return chain(self._mu.model.network.parameters(), [self._log_sigma])
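A minimal usage sketch of GaussianTorchPolicy with a small, hypothetical feed-forward network for the mean; `mdp` is assumed to be a continuous-action environment built elsewhere. This kind of policy is typically paired with on-policy deep actor-critic algorithms such as PPO or TRPO.

import torch
import torch.nn as nn
import torch.nn.functional as F

class Network(nn.Module):
    def __init__(self, input_shape, output_shape, n_features=32, **kwargs):
        super().__init__()
        self._h1 = nn.Linear(input_shape[0], n_features)
        self._h2 = nn.Linear(n_features, output_shape[0])

    def forward(self, state, **kwargs):
        features = F.relu(self._h1(state.float()))

        return self._h2(features)

policy = GaussianTorchPolicy(Network,
                             mdp.info.observation_space.shape,
                             mdp.info.action_space.shape,
                             std_0=1.,
                             n_features=32)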
class DDPG(Agent):
    def __init__(self, actor_approximator, critic_approximator, policy_class,
                 mdp_info, batch_size, initial_replay_size, max_replay_size,
                 tau, actor_params, critic_params, policy_params,
                 n_actions_per_head, history_length=1, n_input_per_mdp=None,
                 n_games=1, dtype=np.uint8):
        self._batch_size = batch_size
        self._n_games = n_games
        if n_input_per_mdp is None:
            self._n_input_per_mdp = [mdp_info.observation_space.shape
                                     for _ in range(self._n_games)]
        else:
            self._n_input_per_mdp = n_input_per_mdp
        self._n_actions_per_head = n_actions_per_head
        self._max_actions = max(n_actions_per_head)[0]
        self._history_length = history_length
        self._tau = tau

        # One replay memory per game
        self._replay_memory = [
            ReplayMemory(initial_replay_size, max_replay_size)
            for _ in range(self._n_games)
        ]

        self._n_updates = 0

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(critic_approximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(critic_approximator,
                                                     **target_critic_params)

        if 'loss' not in actor_params:
            actor_params['loss'] = ActorLoss(self._critic_approximator)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(actor_approximator,
                                             n_fit_targets=2, **actor_params)
        self._target_actor_approximator = Regressor(actor_approximator,
                                                    n_fit_targets=2,
                                                    **target_actor_params)

        self._target_actor_approximator.model.set_weights(
            self._actor_approximator.model.get_weights())
        self._target_critic_approximator.model.set_weights(
            self._critic_approximator.model.get_weights())

        policy = policy_class(self._actor_approximator, **policy_params)

        super().__init__(mdp_info, policy)

        n_samples = self._batch_size * self._n_games
        self._state_idxs = np.zeros(n_samples, dtype=int)
        self._state = np.zeros(
            ((n_samples, self._history_length) +
             self.mdp_info.observation_space.shape),
            dtype=dtype
        ).squeeze()
        self._action = np.zeros((n_samples, self._max_actions))
        self._reward = np.zeros(n_samples)
        self._next_state_idxs = np.zeros(n_samples, dtype=int)
        self._next_state = np.zeros(
            ((n_samples, self._history_length) +
             self.mdp_info.observation_space.shape),
            dtype=dtype
        ).squeeze()
        self._absorbing = np.zeros(n_samples)

    def fit(self, dataset):
        # Split the collected samples by game and store them in the
        # corresponding replay memory
        s = np.array([d[0][0] for d in dataset]).ravel()
        games = np.unique(s)
        for g in games:
            idxs = np.argwhere(s == g).ravel()
            d = list()
            for idx in idxs:
                d.append(dataset[idx])

            self._replay_memory[g].add(d)

        fit_condition = np.all([rm.initialized for rm in self._replay_memory])

        if fit_condition:
            # Build a joint batch with one slice per game
            for i in range(len(self._replay_memory)):
                game_state, game_action, game_reward, game_next_state,\
                    game_absorbing, _ = self._replay_memory[i].get(
                        self._batch_size)

                start = self._batch_size * i
                stop = start + self._batch_size

                self._state_idxs[start:stop] = np.ones(self._batch_size) * i
                self._state[start:stop,
                            :self._n_input_per_mdp[i][0]] = game_state
                self._action[start:stop,
                             :self._n_actions_per_head[i][0]] = game_action
                self._reward[start:stop] = game_reward
                self._next_state_idxs[start:stop] = np.ones(
                    self._batch_size) * i
                self._next_state[start:stop,
                                 :self._n_input_per_mdp[i][0]] = game_next_state
                self._absorbing[start:stop] = game_absorbing

            q_next = self._next_q()
            q = self._reward + q_next

            self._critic_approximator.fit(self._state, self._action, q,
                                          idx=self._state_idxs)
            self._actor_approximator.fit(self._state, self._state,
                                         self._state_idxs,
                                         idx=self._state_idxs)

            self._n_updates += 1

            self._update_target()

    def get_shared_weights(self):
        cw = self._critic_approximator.model.network.get_shared_weights()
        aw = self._actor_approximator.model.network.get_shared_weights()

        return [cw, aw]

    def set_shared_weights(self, weights):
        self._critic_approximator.model.network.set_shared_weights(weights[0])
        self._actor_approximator.model.network.set_shared_weights(weights[1])

    def freeze_shared_weights(self):
        self._critic_approximator.model.network.freeze_shared_weights()
        self._actor_approximator.model.network.freeze_shared_weights()

    def unfreeze_shared_weights(self):
        self._critic_approximator.model.network.unfreeze_shared_weights()
        self._actor_approximator.model.network.unfreeze_shared_weights()

    def _update_target(self):
        """
        Update the target networks.

        """
        critic_weights = self._tau * self._critic_approximator.model.get_weights()
        critic_weights += (
            1 - self._tau) * self._target_critic_approximator.get_weights()
        self._target_critic_approximator.set_weights(critic_weights)

        actor_weights = self._tau * self._actor_approximator.model.get_weights()
        actor_weights += (
            1 - self._tau) * self._target_actor_approximator.get_weights()
        self._target_actor_approximator.set_weights(actor_weights)

    def _next_q(self):
        a = self._target_actor_approximator(self._next_state,
                                            idx=self._next_state_idxs)
        q = self._target_critic_approximator(self._next_state, a,
                                             idx=self._next_state_idxs).ravel()

        out_q = np.zeros(self._batch_size * self._n_games)
        for i in range(self._n_games):
            start = self._batch_size * i
            stop = start + self._batch_size
            # Each game can have its own discount factor
            out_q[start:stop] = q[start:stop] * self.mdp_info.gamma[i]
            if np.any(self._absorbing[start:stop]):
                out_q[start:stop] = out_q[start:stop] * (
                    1 - self._absorbing[start:stop])

        return out_q
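The _update_target method above (and the one in the single-task DDPG below) performs the standard soft, or Polyak, target update: the target weights are moved a small fraction tau towards the online weights, theta_target = tau * theta + (1 - tau) * theta_target. A tiny standalone numeric illustration with arbitrary values:

import numpy as np

tau = 1e-3
theta = np.array([1.0, 2.0])          # online network weights
theta_target = np.array([0.0, 0.0])   # target network weights

# One soft update step: the target drifts slowly towards the online weights
theta_target = tau * theta + (1 - tau) * theta_target
print(theta_target)  # [0.001 0.002]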
class DDPG(DeepAC):
    """
    Deep Deterministic Policy Gradient algorithm.
    "Continuous Control with Deep Reinforcement Learning".
    Lillicrap T. P. et al.. 2016.

    """
    def __init__(self, mdp_info, policy_class, policy_params, actor_params,
                 actor_optimizer, critic_params, batch_size,
                 initial_replay_size, max_replay_size, tau, policy_delay=1,
                 critic_fit_params=None):
        """
        Constructor.

        Args:
            policy_class (Policy): class of the policy;
            policy_params (dict): parameters of the policy to build;
            actor_params (dict): parameters of the actor approximator to
                build;
            actor_optimizer (dict): parameters to specify the actor optimizer
                algorithm;
            critic_params (dict): parameters of the critic approximator to
                build;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            tau (float): value of coefficient for soft updates;
            policy_delay (int, 1): the number of updates of the critic after
                which an actor update is implemented;
            critic_fit_params (dict, None): parameters of the fitting
                algorithm of the critic approximator.

        """
        self._critic_fit_params = dict() if critic_fit_params is None else \
            critic_fit_params

        self._batch_size = batch_size
        self._tau = tau
        self._policy_delay = policy_delay
        self._fit_count = 0

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        target_critic_params = deepcopy(critic_params)
        self._critic_approximator = Regressor(TorchApproximator,
                                              **critic_params)
        self._target_critic_approximator = Regressor(TorchApproximator,
                                                     **target_critic_params)

        target_actor_params = deepcopy(actor_params)
        self._actor_approximator = Regressor(TorchApproximator,
                                             **actor_params)
        self._target_actor_approximator = Regressor(TorchApproximator,
                                                    **target_actor_params)

        self._init_target()

        policy = policy_class(self._actor_approximator, **policy_params)

        policy_parameters = self._actor_approximator.model.network.parameters()

        super().__init__(mdp_info, policy, actor_optimizer, policy_parameters)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ = \
                self._replay_memory.get(self._batch_size)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self._critic_approximator.fit(state, action, q,
                                          **self._critic_fit_params)

            if self._fit_count % self._policy_delay == 0:
                loss = self._loss(state)
                self._optimize_actor_parameters(loss)

            self._update_target()

            self._fit_count += 1

    def _loss(self, state):
        action = self._actor_approximator(state, output_tensor=True)
        q = self._critic_approximator(state, action, output_tensor=True)

        return -q.mean()

    def _init_target(self):
        """
        Init weights for target approximators.

        """
        self._target_actor_approximator.set_weights(
            self._actor_approximator.get_weights())
        self._target_critic_approximator.set_weights(
            self._critic_approximator.get_weights())

    def _update_target(self):
        """
        Update the target networks.

        """
        critic_weights = self._tau * self._critic_approximator.get_weights()
        critic_weights += (
            1 - self._tau) * self._target_critic_approximator.get_weights()
        self._target_critic_approximator.set_weights(critic_weights)

        actor_weights = self._tau * self._actor_approximator.get_weights()
        actor_weights += (
            1 - self._tau) * self._target_actor_approximator.get_weights()
        self._target_actor_approximator.set_weights(actor_weights)

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Action-values returned by the critic for ``next_state`` and the
            action returned by the actor.

        """
        a = self._target_actor_approximator(next_state)

        q = self._target_critic_approximator.predict(next_state, a)
        q *= 1 - absorbing

        return q
class TrueOnlineSARSALambda(TD):
    """
    True Online SARSA(lambda) with linear function approximation.
    "True Online TD(lambda)". Seijen H. V. et al.. 2014.

    """
    def __init__(self, mdp_info, policy, learning_rate, lambda_coeff,
                 features, approximator_params=None):
        """
        Constructor.

        Args:
            lambda_coeff (float): eligibility trace coefficient.

        """
        self._approximator_params = dict() if approximator_params is None else \
            approximator_params

        self.Q = Regressor(LinearApproximator, **self._approximator_params)
        self.e = np.zeros(self.Q.weights_size)
        self._lambda = lambda_coeff
        self._q_old = None

        self._add_save_attr(
            _approximator_params='pickle',
            Q='pickle',
            _q_old='pickle',
            _lambda='numpy',
            e='numpy'
        )

        super().__init__(mdp_info, policy, self.Q, learning_rate, features)

    def _update(self, state, action, reward, next_state, absorbing):
        phi_state = self.phi(state)
        phi_state_action = get_action_features(phi_state, action,
                                               self.mdp_info.action_space.n)
        q_current = self.Q.predict(phi_state, action)

        if self._q_old is None:
            self._q_old = q_current

        alpha = self.alpha(state, action)

        e_phi = self.e.dot(phi_state_action)
        self.e = self.mdp_info.gamma * self._lambda * self.e + alpha * (
            1. - self.mdp_info.gamma * self._lambda * e_phi) * phi_state_action

        self.next_action = self.draw_action(next_state)
        phi_next_state = self.phi(next_state)
        q_next = self.Q.predict(phi_next_state,
                                self.next_action) if not absorbing else 0.

        delta = reward + self.mdp_info.gamma * q_next - self._q_old

        theta = self.Q.get_weights()
        theta += delta * self.e + alpha * (
            self._q_old - q_current) * phi_state_action
        self.Q.set_weights(theta)

        self._q_old = q_next

    def episode_start(self):
        self._q_old = None
        self.e = np.zeros(self.Q.weights_size)

        super().episode_start()
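Construction mirrors the SARSALambdaContinuous sketch given earlier, except the linear approximator is built internally, so no approximator class is passed. Reusing the (illustrative) `mdp`, `pi`, `features` and `approximator_params` defined in that sketch:

agent = TrueOnlineSARSALambda(mdp.info, pi,
                              learning_rate=Parameter(.1 / 10),
                              lambda_coeff=.9, features=features,
                              approximator_params=approximator_params)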
class BoltzmannTorchPolicy(TorchPolicy):
    """
    Torch policy implementing a Boltzmann policy.

    """
    def __init__(self, network, input_shape, output_shape, beta,
                 use_cuda=False, **params):
        """
        Constructor.

        Args:
            network (object): the network class used to implement the logits
                regressor;
            input_shape (tuple): the shape of the state space;
            output_shape (tuple): the shape of the action space;
            beta ((float, Parameter)): the inverse of the temperature of the
                distribution. As the temperature approaches infinity, the
                policy becomes more and more random. As the temperature
                approaches 0.0, the policy becomes more and more greedy;
            params (dict): parameters used by the network constructor.

        """
        super().__init__(use_cuda)

        self._action_dim = output_shape[0]

        self._logits = Regressor(TorchApproximator, input_shape, output_shape,
                                 network=network, use_cuda=use_cuda, **params)
        self._beta = to_parameter(beta)

        self._add_save_attr(
            _action_dim='primitive',
            _beta='mushroom',
            _logits='mushroom'
        )

    def draw_action_t(self, state):
        action = self.distribution_t(state).sample().detach()

        if len(action.shape) > 1:
            return action
        else:
            return action.unsqueeze(0)

    def log_prob_t(self, state, action):
        return self.distribution_t(state).log_prob(action.squeeze())[:, None]

    def entropy_t(self, state):
        return torch.mean(self.distribution_t(state).entropy())

    def distribution_t(self, state):
        logits = self._logits(state, output_tensor=True) * self._beta(
            state.numpy())

        return torch.distributions.Categorical(logits=logits)

    def set_weights(self, weights):
        self._logits.set_weights(weights)

    def get_weights(self):
        return self._logits.get_weights()

    def parameters(self):
        return self._logits.model.network.parameters()

    def set_beta(self, beta):
        self._beta = to_parameter(beta)
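A minimal usage sketch for a discrete-action task, reusing a small feed-forward network class like the `Network` sketched after GaussianTorchPolicy above; `mdp` is assumed to have a discrete action space, and all values are illustrative. A larger beta makes the policy greedier, a smaller beta makes it more uniform.

from mushroom_rl.utils.parameters import Parameter

# beta can be a constant or any Parameter schedule
beta = Parameter(1.)
policy = BoltzmannTorchPolicy(Network,
                              mdp.info.observation_space.shape,
                              (mdp.info.action_space.n,),
                              beta=beta,
                              n_features=32)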