def __init__(self, state_size, action_size, actor_lr, critic_lr, random_seed,
             mu, theta, sigma, buffer_size, batch_size,
             epsilon_start, epsilon_min, epsilon_decay,
             gamma, tau, n_time_steps, n_learn_updates, device):
    self.state_size = state_size
    self.action_size = action_size
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, name="Actor_local")
    self.actor_target = Actor(state_size, action_size, name="Actor_target")
    self.actor_optimizer = Adam(learning_rate=self.actor_lr)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, name="Critic_local")
    self.critic_target = Critic(state_size, action_size, name="Critic_target")
    self.critic_optimizer = Adam(learning_rate=self.critic_lr)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.noise = OUNoise(action_size, random_seed, mu, theta, sigma)
    self.epsilon = epsilon_start
    self.epsilon_min = epsilon_min
    self.epsilon_decay = epsilon_decay

    # Replay memory
    self.batch_size = int(batch_size)
    self.buffer_size = int(buffer_size)
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size, random_seed)

    # Algorithm parameters
    self.gamma = gamma  # discount factor
    self.tau = tau      # for soft update of target parameters
    self.n_time_steps = n_time_steps        # number of time steps between network updates
    self.n_learn_updates = n_learn_updates  # number of updates per learning step

    # Device
    self.device = device

    tf.keras.backend.clear_session()
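# NOTE: OUNoise is referenced above but not defined in these snippets. Below is a
# minimal sketch of an Ornstein-Uhlenbeck noise process whose constructor matches
# the call OUNoise(action_size, random_seed, mu, theta, sigma) used in this __init__;
# the actual class used in each snippet may differ (other snippets call it with
# different argument lists).
import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: dx = theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state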
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.actor_lr = 0.0001
        self.critic_lr = 0.001

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high, self.actor_lr)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high, self.actor_lr)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, self.critic_lr)
        self.critic_target = Critic(self.state_size, self.action_size, self.critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001   # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_score = -np.inf
        self.score = 0

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward
        self.count += 1

        if done:
            # Calculate the running average reward
            self.score = self.total_reward / float(self.count) if self.count else 0.0
            if self.score > self.best_score:
                self.best_score = self.score

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
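# NOTE: ReplayBuffer is not shown in these snippets. A minimal sketch consistent with
# how it is used above (ReplayBuffer(buffer_size, batch_size), add(), sample(),
# __len__(), and experiences exposing .state, .action, .reward, .next_state, .done)
# could look like this; the original class may differ in detail.
import random
from collections import deque, namedtuple


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest experiences are dropped first
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences from memory."""
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        """Return the current number of experiences in memory."""
        return len(self.memory)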
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, state_size, action_size, actor_lr, critic_lr, random_seed,
                 mu, theta, sigma, buffer_size, batch_size,
                 gamma, tau, n_time_steps, n_learn_updates, device):
        self.state_size = state_size
        self.action_size = action_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, name="Actor_local")
        self.actor_target = Actor(state_size, action_size, name="Actor_target")
        self.actor_optimizer = Adam(learning_rate=self.actor_lr)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, name="Critic_local")
        self.critic_target = Critic(state_size, action_size, name="Critic_target")
        self.critic_optimizer = Adam(learning_rate=self.critic_lr)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.noise = OUNoise(self.action_size, random_seed, mu, theta, sigma)

        # Replay memory
        self.batch_size = int(batch_size)
        self.buffer_size = int(buffer_size)
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size, random_seed)

        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau      # for soft update of target parameters
        self.n_time_steps = n_time_steps        # number of time steps between network updates
        self.n_learn_updates = n_learn_updates  # number of updates per learning step

        # Device
        self.device = device

    def reset(self):
        """Reset the agent."""
        self.noise.reset()

    def step(self, time_step, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state[:], action[:], reward, next_state[:], done)

        if time_step % self.n_time_steps != 0:
            return

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            # Train the network for a number of epochs specified by the parameter
            for i in range(self.n_learn_updates):
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = np.expand_dims(state, axis=0)
        action = self._act_tf(tf.constant(state))
        action = action.numpy()[0]
        if add_noise:
            action += self.noise.sample()
        action = action.clip(-1, 1)
        return action

    @tf.function
    def _act_tf(self, state):
        return self.actor_local.model(state)

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences: tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        self._learn_tf(experiences, tf.constant(self.gamma, dtype=tf.float64))

    @tf.function
    def _learn_tf(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        with tf.GradientTape() as tape:
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target.model(next_states)
            Q_targets_next = self.critic_target.model([next_states, actions_next])
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
            # Compute critic loss
            Q_expected = self.critic_local.model([states, actions])
            critic_loss = MSE(Q_expected, Q_targets)
        # Minimize the loss
        critic_grad = tape.gradient(critic_loss,
                                    self.critic_local.model.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic_local.model.trainable_variables))

        # ---------------------------- update actor ---------------------------- #
        with tf.GradientTape() as tape:
            # Compute actor loss
            actions_pred = self.actor_local.model(states)
            actor_loss = -tf.reduce_mean(self.critic_local.model([states, actions_pred]))
        # Minimize the loss
        actor_grad = tape.gradient(actor_loss,
                                   self.actor_local.model.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor_local.model.trainable_variables))

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local.model, self.critic_target.model, self.tau)
        self.soft_update(self.actor_local.model, self.actor_target.model, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: TF2 model
            target_model: TF2 model
            tau (float): interpolation parameter
        """
        for target_var, local_var in zip(target_model.weights, local_model.weights):
            target_var.assign(tau * local_var + (1.0 - tau) * target_var)
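# NOTE: A sketch of how the TF2 agent above might be driven by an episodic training
# loop. `env` stands for any Gym-style environment (reset() -> state,
# step(action) -> (next_state, reward, done, info)); the hyperparameter values below
# are illustrative placeholders, not the ones used by the original code.
def train(env, n_episodes=200, max_t=1000):
    agent = DDPG(state_size=env.observation_space.shape[0],
                 action_size=env.action_space.shape[0],
                 actor_lr=1e-4, critic_lr=1e-3, random_seed=0,
                 mu=0.0, theta=0.15, sigma=0.2,
                 buffer_size=int(1e6), batch_size=128,
                 gamma=0.99, tau=1e-3,
                 n_time_steps=1, n_learn_updates=1, device="/CPU:0")
    scores = []
    for episode in range(1, n_episodes + 1):
        state = env.reset()
        agent.reset()            # reset the OU noise process each episode
        score = 0.0
        for t in range(1, max_t + 1):
            action = agent.act(state, add_noise=True)       # noisy action for exploration
            next_state, reward, done, _ = env.step(action)  # environment transition
            agent.step(t, state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
    return scores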
class DDPGAgent():
    """Single DDPG Agent with basic functionality."""

    def __init__(self, agent_id, model, action_size=2, seed=42, tau=1e-3,
                 lr_actor=1e-4, lr_critic=1e-3, weight_decay=0.0):
        """Initialize parameters and build single DDPG Agent.

        Params
        ======
            agent_id (int): ID of the agent
            model (object): model object
            action_size (int): dimension of each action
            seed (int): random seed
            tau (float): param for soft update of target parameters
            lr_actor (float): learning rate for actor
            lr_critic (float): learning rate for critic
            weight_decay (float): L2 weight decay
        """
        random.seed(seed)
        self.id = agent_id
        self.action_size = action_size
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic

        # Actor Network
        self.actor_local = model.actor_local
        self.actor_target = model.actor_target
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network
        self.critic_local = model.critic_local
        self.critic_target = model.critic_target
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic, weight_decay=weight_decay)

        # Initialize the target networks with the local networks' weights
        # (actor and critic, respectively)
        self.hard_copy_init(self.actor_target, self.actor_local)
        self.hard_copy_init(self.critic_target, self.critic_local)

        self.noise = OUNoise(action_size, seed)

    def act(self, state, noise_weight=1.0, add_noise=True):
        """Return actions for given state as per current policy.

        Params
        ======
            state (array): current state per agent
            noise_weight (float): decay coefficient for action noise
            add_noise (bool): flag to add noise to actions
        """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            self.noise_val = self.noise.sample() * noise_weight
            action += self.noise_val
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, agent_id, experiences, gamma, all_next_actions, all_actions):
        """Update policy and value parameters using given batch of experience tuples.

        Params
        ======
            agent_id (int): ID of an agent
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            all_next_actions (list): next action per each agent, calculated by its actor
            all_actions (list): action per each agent, calculated by its actor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        self.critic_optimizer.zero_grad()
        agent_id = torch.tensor([agent_id]).to(device)
        actions_next = torch.cat(all_next_actions, dim=1).to(device)
        with torch.no_grad():
            q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i):
        # q_targets = reward of this timestep + discount * Q(s_t+1, a_t+1) from target network
        q_targets = rewards.index_select(1, agent_id) + \
            (gamma * q_targets_next * (1 - dones.index_select(1, agent_id)))
        q_expected = self.critic_local(states, actions)
        # Compute and minimize the critic loss
        critic_loss = F.mse_loss(q_expected, q_targets.detach())
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        self.actor_optimizer.zero_grad()
        # Detach actions from other agents
        actions_pred = [action if i == self.id else action.detach()
                        for i, action in enumerate(all_actions)]
        actions_pred = torch.cat(actions_pred, dim=1).to(device)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def hard_copy_init(self, target, source):
        """Initialize network parameters in target from source.

        Inputs:
            target (torch.nn.Module): net to copy parameters to
            source (torch.nn.Module): net whose parameters to copy
        """
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)
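# NOTE: DDPGAgent.learn expects per-agent action lists supplied by a multi-agent
# (MADDPG-style) wrapper. The sketch below shows one plausible way such a wrapper
# could assemble `all_next_actions` / `all_actions` and trigger a learning step for
# every agent. The function, the joint-observation layout, and `num_agents` /
# `obs_size` are hypothetical; only the call signature of DDPGAgent.learn is taken
# from the code above.
def maddpg_learn(agents, experiences, gamma, num_agents, obs_size):
    """experiences: (states, actions, rewards, next_states, dones) torch tensors,
    where states/next_states hold the flattened joint observations of all agents."""
    states, actions, rewards, next_states, dones = experiences

    all_actions = []
    all_next_actions = []
    for i, agent in enumerate(agents):
        # Slice out agent i's own observation from the joint state (assumed layout)
        obs_i = states.reshape(-1, num_agents, obs_size)[:, i, :]
        next_obs_i = next_states.reshape(-1, num_agents, obs_size)[:, i, :]
        all_actions.append(agent.actor_local(obs_i))          # current-policy actions
        all_next_actions.append(agent.actor_target(next_obs_i))  # target-policy actions

    # Each agent updates its own critic and actor from the shared batch
    for i, agent in enumerate(agents):
        agent.learn(i, experiences, gamma, all_next_actions, all_actions)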