class ReplayBufferTest(unittest.TestCase):
    def setUp(self):
        self.batch_size = 2
        self.replay_buffer = ReplayBuffer(10, self.batch_size, "cpu")
        self.populate_replay_buffer()

    def populate_replay_buffer(self, n=5):
        for _ in range(n):
            self.replay_buffer.add(0.0, 0.0, 0.0, 0.0, 0.0)

    def test_add(self):
        l1 = len(self.replay_buffer)
        self.replay_buffer.add(0.0, 0.0, 0.0, 0.0, 0.0)
        l2 = len(self.replay_buffer)
        self.assertNotEqual(l1, l2)

    def test_sample(self):
        s, a, r, ns, d = self.replay_buffer.sample()
        self.assertEqual(s.shape[0], self.batch_size)
        self.assertEqual(a.shape[0], self.batch_size)
        self.assertEqual(r.shape[0], self.batch_size)
        self.assertEqual(ns.shape[0], self.batch_size)
        self.assertEqual(d.shape[0], self.batch_size)
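# Hedged sketch (not part of the tested module): a minimal ReplayBuffer with the
# interface these tests assume -- add(), sample(), and __len__() -- returning
# batched torch tensors. The real ReplayBuffer in this repo may differ in detail.
import random
from collections import deque, namedtuple

import torch


class MinimalReplayBuffer:
    """Fixed-size buffer storing (state, action, reward, next_state, done) tuples."""

    def __init__(self, buffer_size, batch_size, device, seed=0):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Appends one experience tuple to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Returns a random batch as tensors of shape (batch_size, 1) per field."""
        batch = random.sample(self.memory, k=self.batch_size)

        def stack(field):
            values = [getattr(e, field) for e in batch]
            return torch.tensor(values, dtype=torch.float32,
                                device=self.device).unsqueeze(1)

        return (stack("state"), stack("action"), stack("reward"),
                stack("next_state"), stack("done"))

    def __len__(self):
        return len(self.memory)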
class DQNAgent(Agent):
    """DQN Agent implementation."""

    # TODO: Consider how to extend this to accept multiple agents?
    # TODO: Add noise to DQN?
    # TODO: Ensure that this cannot be changed in other ways
    # TODO: Look up original values for these params
    REQUIRED_HYPERPARAMETERS = {
        "buffer_size": int(1e7),
        "batch_size": 32,
        "gamma": 0.99,
        "learning_rate": 2.5e-4,
        "tau": 1e-3,
        "learn_every": 4,
        "hard_update_every": 10000
    }

    ALGORITHM = "DQN"

    def __init__(self, state_size: int, action_size: int, qnetwork_local=None,
                 qnetwork_target=None, optimizer=None, new_hyperparameters=None,
                 seed: int = 0, device: str = "cpu", model_output_dir: str = None,
                 opt_soft_update: bool = False, opt_ddqn: bool = False):
        """Initialize a DQNAgent object.

        Args:
            state_size (int): Dimension of each state.
            action_size (int): Dimension of each action.
            qnetwork_local (torch.nn.Module): Local Q-Network model.
            qnetwork_target (torch.nn.Module): Target Q-Network model.
            optimizer (torch.optim): Local Q-Network optimizer.
            new_hyperparameters (dict): New hyperparameter values.
            seed (int): Random seed.
            device (str): Identifier for the device to be used by PyTorch.
            model_output_dir (str): Directory where state dicts will be saved.
            opt_soft_update (bool): Use soft updates instead of hard updates.
            opt_ddqn (bool): Use Double DQN for `expected_Q`.

        Returns:
            An instance of DQNAgent.
        """
        super(DQNAgent, self).__init__(new_hyperparameters=new_hyperparameters)

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.device = device
        self.time_step = 0

        if qnetwork_local:
            self.qnetwork_local = qnetwork_local
        else:
            self.qnetwork_local = QNetwork(state_size, action_size).to(self.device)

        if qnetwork_target:
            self.qnetwork_target = qnetwork_target
        else:
            self.qnetwork_target = QNetwork(state_size, action_size).to(self.device)

        if optimizer:
            self.optimizer = optimizer
        else:
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                        lr=self.LEARNING_RATE)

        # Replay memory
        self.memory = ReplayBuffer(self.BUFFER_SIZE, self.BATCH_SIZE,
                                   self.device, seed)

        # User options
        self.opt_soft_update = opt_soft_update
        self.opt_ddqn = opt_ddqn
        self.model_output_dir = model_output_dir

        self.state_dicts = [
            (self.qnetwork_local, "qnetwork_local_params"),
            (self.optimizer, "optimizer_params"),
        ]

        # Ensure local and target networks start with the same weights
        hard_update(self.qnetwork_local, self.qnetwork_target)

    def __str__(self) -> str:
        """Helper to output the network architecture for the agent.

        Returns:
            A string representation of this algorithm.
        """
        return "{}\n{}\n{}\n{}".format(
            "Q-Network (Local):", self.qnetwork_local,
            "Q-Network (Target):", self.qnetwork_target)

    def origin(self) -> str:
        """Helper to get the original paper for this algorithm.

        Returns:
            The original paper for this algorithm.
        """
        return 'https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf'

    def description(self) -> str:
        """Helper to get a brief description of this algorithm.

        Returns:
            A brief description of this algorithm.
        """
        description = (
            'DQN is an algorithm created by DeepMind that brings together the power '
            'of the Q-Learning algorithm with the advantages of generalization through '
            'function approximation. It uses a deep neural network to estimate a '
            'Q-value function. As such, the input to the network is the current state '
            'of the environment, and the output is the Q-value for each possible action.'
        )
        return description

    def step(self, state, action, reward, next_state, done, logger=None) -> None:
        """Saves experience to replay memory and updates model weights.

        Args:
            state: Environment state.
            action: Environment action.
            reward: Reward for the action above.
            next_state: Next environment state.
            done (bool): Whether the environment has terminated.
            logger (Logger): An instance of Logger.
        """
        self.memory.add(state, action, reward, next_state, done)

        # Learn every `learn_every` time steps
        self.time_step += 1
        if self.time_step % self.LEARN_EVERY == 0:
            if len(self.memory) > self.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, logger=logger)

    def act(self, state, eps=0.0, add_noise=False, logger=None):
        """Returns an action for the given state as per the current policy.

        Args:
            state: The current state of the environment.
            eps (float): Epsilon, for epsilon-greedy action selection.
            add_noise (bool): Controls addition of noise.
            logger (Logger): An instance of Logger.

        Returns:
            Action for the given state as per the current policy.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, logger=None) -> None:
        """Updates value parameters using the given batch of experience tuples.

        Args:
            experiences (Tuple[torch.Tensor]): Tuple of (s, a, r, s', done) tuples.
            logger (Logger): An instance of Logger.
        """
        states, actions, rewards, next_states, dones = experiences

        if self.opt_ddqn:
            # Double DQN
            non_final_next_states = next_states * (1 - dones)
            # Get the actions themselves, not their output values
            _, next_state_actions = self.qnetwork_local(
                non_final_next_states).max(1, keepdim=True)
            next_Q_targets = self.qnetwork_target(
                non_final_next_states).gather(1, next_state_actions)
            target_Q = rewards + (self.GAMMA * next_Q_targets * (1 - dones))
        else:
            # Vanilla DQN
            next_max_a = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)
            target_Q = rewards + (self.GAMMA * next_max_a * (1 - dones))

        expected_Q = self.qnetwork_local(states)
        if len(actions.shape) == 1:
            actions = actions.unsqueeze(1)
        expected_Q = torch.gather(expected_Q, 1, actions.long())

        # Compute and minimize the loss
        loss = F.mse_loss(expected_Q, target_Q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update the target network
        if self.opt_soft_update:
            soft_update(self.qnetwork_local, self.qnetwork_target, self.TAU)
        elif self.time_step % self.HARD_UPDATE_EVERY == 0:
            hard_update(self.qnetwork_local, self.qnetwork_target)

        if logger:
            loss = loss.cpu().detach().item()
            logger.add_scalar('loss', loss, self.time_step)
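# Hedged sketch (assumption): `soft_update` and `hard_update` are called above but
# not defined in this section; the repo presumably imports them from a utils module.
# The illustrative versions below only document the expected behavior and match the
# call signatures used here: (source_network, target_network[, tau]).
def hard_update(source, target):
    """Copies the source network's weights into the target network."""
    target.load_state_dict(source.state_dict())


def soft_update(source, target, tau):
    """Polyak averaging: target = tau * source + (1 - tau) * target."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(
            tau * source_param.data + (1.0 - tau) * target_param.data)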
class DDPGAgent(Agent):
    """DDPG Agent implementation."""

    REQUIRED_HYPERPARAMETERS = {
        "buffer_size": int(1e6),
        "batch_size": 64,
        "gamma": 0.99,
        "tau": 1e-3,
        "learning_rate_actor": 1e-4,
        "learning_rate_critic": 1e-3,
        "weight_decay": 1e-2,
        "learn_every": 4,
        "hard_update_every": 4
    }

    def __init__(self, state_size: int, action_size: int, num_agents: int,
                 actor_local=None, actor_target=None, actor_optimizer=None,
                 critic_local=None, critic_target=None, critic_optimizer=None,
                 new_hyperparameters=None, seed: int = 0, device: str = "cpu",
                 model_output_dir: str = None, enable_logger: bool = False,
                 logger_path: str = None, logger_comment: str = None,
                 opt_soft_update: bool = False):
        """Initialize a DDPGAgent object.

        Args:
            state_size (int): Dimension of each state.
            action_size (int): Dimension of each action.
            num_agents (int): Number of agents in the environment.
            actor_local (torch.nn.Module): Local Actor model.
            actor_target (torch.nn.Module): Target Actor model.
            actor_optimizer (torch.optim): Actor optimizer.
            critic_local (torch.nn.Module): Local Critic model.
            critic_target (torch.nn.Module): Target Critic model.
            critic_optimizer (torch.optim): Critic optimizer.
            new_hyperparameters (dict): New hyperparameter values.
            seed (int): Random seed.
            device (str): Identifier for the device to be used by PyTorch.
            model_output_dir (str): Directory where state dicts will be saved.
            opt_soft_update (bool): Use soft updates instead of hard updates.

        Returns:
            An instance of DDPGAgent.
        """
        super(DDPGAgent, self).__init__(new_hyperparameters=new_hyperparameters,
                                        enable_logger=enable_logger,
                                        logger_path=logger_path,
                                        logger_comment=logger_comment)

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(seed)
        self.device = device
        self.time_step = 0

        # Actor Network (w/ Target Network)
        self.actor_local = actor_local if actor_local else Actor(
            state_size, action_size, seed).to(device)
        self.actor_target = actor_target if actor_target else Actor(
            state_size, action_size, seed).to(device)
        self.actor_optimizer = actor_optimizer if actor_optimizer else optim.Adam(
            self.actor_local.parameters(), lr=self.LEARNING_RATE_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = critic_local if critic_local else Critic(
            state_size, action_size, seed).to(device)
        self.critic_target = critic_target if critic_target else Critic(
            state_size, action_size, seed).to(device)
        self.critic_optimizer = critic_optimizer if critic_optimizer else optim.Adam(
            self.critic_local.parameters(), lr=self.LEARNING_RATE_CRITIC,
            weight_decay=self.WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        # Replay memory
        self.memory = ReplayBuffer(self.BUFFER_SIZE, self.BATCH_SIZE,
                                   self.device, seed)

        # User options
        self.opt_soft_update = opt_soft_update
        self.model_output_dir = model_output_dir

        self.state_dicts = [
            (self.actor_local, "actor_local_params"),
            (self.actor_optimizer, "actor_optimizer_params"),
            (self.critic_local, "critic_local_params"),
            (self.critic_optimizer, "critic_optimizer_params"),
        ]

        # Ensure local and target networks start with the same weights
        hard_update(self.actor_local, self.actor_target)
        hard_update(self.critic_local, self.critic_target)

    def __str__(self) -> str:
        """Helper to output the network architecture for the agent.

        Returns:
            A string representation of this algorithm.
        """
        return "{}\n{}\n{}\n{}\n{}\n{}\n{}\n{}".format(
            "Actor (Local):", self.actor_local,
            "Actor (Target):", self.actor_target,
            "Critic (Local):", self.critic_local,
            "Critic (Target):", self.critic_target)

    def origin(self) -> str:
        """Helper to get the original paper for this algorithm.

        Returns:
            The original paper for this algorithm.
        """
        return 'https://arxiv.org/pdf/1509.02971.pdf'

    def description(self) -> str:
        """Helper to get a brief description of this algorithm.

        Returns:
            A brief description of this algorithm.
        """
        description = (
            'DDPG was introduced as an actor-critic method that performs well '
            'in environments with a continuous action space, which is a known '
            'limitation of the popular DQN algorithm. It improves on the '
            'deterministic policy gradient (DPG) algorithm by using a neural '
            'network to take advantage of generalization and function approximation.'
        )
        return description

    def step(self, states, actions, rewards, next_states, dones, logger=None) -> None:
        """Saves experience in replay memory and uses random samples from the buffer to learn.

        Args:
            states: Environment states.
            actions: Environment actions.
            rewards: Rewards for the actions above.
            next_states: Next environment states.
            dones (bool): Booleans indicating if the environment has terminated.
            logger (Logger): An instance of Logger.
        """
        if self.num_agents == 1:
            self.memory.add(states, actions, rewards, next_states, dones)
        else:
            # TODO: Refactor this to not assume that the objects come in the correct shape
            for i in range(self.num_agents):
                self.memory.add(states[i], actions[i], rewards[i],
                                next_states[i], dones[i])

        # Learn every `learn_every` time steps
        self.time_step += 1
        if self.time_step % self.LEARN_EVERY == 0:
            if len(self.memory) > self.BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, logger=logger)

    def act(self, state, add_noise: bool = True, logger=None):
        """Chooses an action for the current state based on the current policy.

        Args:
            state: The current state of the environment.
            add_noise (bool): Controls addition of noise.
            logger (Logger): An instance of Logger.

        Returns:
            Actions for the given state as per the current policy.
        """
        state = torch.from_numpy(state).float().to(self.device)

        if self.num_agents == 1:
            self.actor_local.eval()
            with torch.no_grad():
                action = self.actor_local(state).cpu().data.numpy()
            self.actor_local.train()

            if add_noise:
                action += self.noise.sample()

            # TODO: Have a parameter that controls this?
            # return np.clip(action, -1, 1)
            return action
        else:
            actions = np.zeros((self.num_agents, self.action_size))
            self.actor_local.eval()
            with torch.no_grad():
                for i, s in enumerate(state):
                    # Populate the list of actions one state at a time
                    actions[i, :] = self.actor_local(s).cpu().data.numpy()
            self.actor_local.train()

            if add_noise:
                actions += self.noise.sample()

            # TODO: Have a parameter that controls this?
            # return np.clip(actions, -1, 1)
            return actions

    def learn(self, experiences, logger=None) -> None:
        """Updates policy and value parameters using the given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))

        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Args:
            experiences (Tuple[torch.Tensor]): Tuple of (s, a, r, s', done) tuples.
            logger (Logger): An instance of Logger.
        """
        states, actions, rewards, next_states, dones = experiences

        ### Update critic
        # Get predicted next-state actions and Q-values from the target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for the current states (y_i)
        Q_targets = rewards + (self.GAMMA * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Gradient clipping to stabilize learning
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        ### Update actor
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        ### Update target networks
        if self.opt_soft_update:
            soft_update(self.actor_local, self.actor_target, self.TAU)
            soft_update(self.critic_local, self.critic_target, self.TAU)
        elif self.time_step % self.HARD_UPDATE_EVERY == 0:
            hard_update(self.actor_local, self.actor_target)
            hard_update(self.critic_local, self.critic_target)

        if logger:
            actor_loss = actor_loss.cpu().detach().item()
            critic_loss = critic_loss.cpu().detach().item()
            logger.add_scalars('loss', {
                "actor loss": actor_loss,
                "critic loss": critic_loss,
            }, self.time_step)
class MADDPGAgent(Agent):
    """MADDPG implementation."""

    REQUIRED_HYPERPARAMETERS = {
        "buffer_size": int(1e6),
        "batch_size": 64,
        "gamma": 0.99,
        "tau": 1e-3,
        "learning_rate_actor": 1e-4,
        "learning_rate_critic": 1e-3,
        "weight_decay": 1e-2,
        "learn_every": 4,
        "hard_update_every": 5
    }

    def __init__(self, state_size, action_size, num_agents, agents=None,
                 new_hyperparameters=None, seed=0, device="cpu",
                 model_output_dir=None, enable_logger=False, logger_path=None,
                 logger_comment=None, opt_soft_update=False):
        """Initialize a MADDPGAgent wrapper.

        Args:
            state_size (int): Dimension of each state.
            action_size (int): Dimension of each action.
            num_agents (int): Number of agents in the environment.
        """
        super(MADDPGAgent, self).__init__(
            new_hyperparameters=new_hyperparameters,
            enable_logger=enable_logger,
            logger_path=logger_path,
            logger_comment=logger_comment
        )

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(seed)
        self.device = device
        self.time_step = 0

        if agents:
            self.agents = agents
        else:
            self.agents = [DDPGAgent(state_size, action_size, agent_id=i + 1,
                                     handler=self)
                           for i in range(num_agents)]

        # Replay memory
        self.memory = ReplayBuffer(self.BUFFER_SIZE, self.BATCH_SIZE,
                                   self.device, seed)

        # User options
        self.opt_soft_update = opt_soft_update
        self.model_output_dir = model_output_dir

    def reset(self):
        """Resets OU Noise for each agent."""
        for agent in self.agents:
            agent.reset()

    def act(self, observations, add_noise=False, logger=None):
        """Picks an action for each agent given their individual observations
        and the current policy."""
        actions = []
        for agent, observation in zip(self.agents, observations):
            action = agent.act(observation, add_noise=add_noise)
            actions.append(action)
        return np.array(actions)

    def step(self, observations, actions, rewards, next_observations, dones, logger=None):
        """Saves experience in replay memory and uses random samples from the buffer to learn."""
        observations = observations.reshape(1, -1)
        actions = actions.reshape(1, -1)
        next_observations = next_observations.reshape(1, -1)
        self.memory.add(observations, actions, rewards, next_observations, dones)

        # Learn every `learn_every` time steps
        self.time_step += 1
        if self.time_step % self.LEARN_EVERY == 0:
            if len(self.memory) > self.BATCH_SIZE:
                for a_i, agent in enumerate(self.agents):
                    experiences = self.memory.sample()
                    self.learn(experiences, a_i, logger=logger)

    def learn(self, experiences, agent_number, logger=None):
        """Helper that gathers actions from each agent for the `experiences` tuple
        used to update the weights of the agent with ID = `agent_number`.

        Each entry in the `experiences` tuple contains observations from every
        agent, so before using the tuple to update the weights of one agent, all
        agents need to contribute to generating `next_actions` and `actions_pred`.
        This is because the critic takes the combined observations and actions
        from all agents as its input.
        """
        next_actions = []
        actions_pred = []
        states, _, _, next_states, _ = experiences

        next_states = next_states.reshape(-1, self.num_agents, self.state_size)
        states = states.reshape(-1, self.num_agents, self.state_size)

        for a_i, agent in enumerate(self.agents):
            agent_id_tensor = self._get_agent_number(a_i)

            state = states.index_select(1, agent_id_tensor).squeeze(1)
            next_state = next_states.index_select(1, agent_id_tensor).squeeze(1)

            next_actions.append(agent.actor_target(next_state))
            actions_pred.append(agent.actor_local(state))

        next_actions = torch.cat(next_actions, dim=1).to(self.device)
        actions_pred = torch.cat(actions_pred, dim=1).to(self.device)

        agent = self.agents[agent_number]
        agent.learn(experiences, next_actions, actions_pred, logger=logger)

    def _get_agent_number(self, i):
        """Helper to get an agent's number as a Torch tensor."""
        return torch.tensor([i]).to(self.device)
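# Hedged sketch (assumption): illustrates how the centralized critic input described
# in MADDPGAgent.learn can be assembled. The helper name `build_critic_input` and
# the example shapes are illustrative only; the actual Critic in this repo may
# consume its inputs differently.
import torch


def build_critic_input(states, actions, num_agents, state_size, action_size):
    """Flattens per-agent observations and actions into one joint vector per sample.

    states:  tensor of shape (batch, num_agents, state_size)
    actions: tensor of shape (batch, num_agents, action_size)
    returns: (obs_all, act_all) with shapes (batch, num_agents * state_size)
             and (batch, num_agents * action_size)
    """
    obs_all = states.reshape(-1, num_agents * state_size)
    act_all = actions.reshape(-1, num_agents * action_size)
    return obs_all, act_all


# Example: 2 agents, 24-dimensional observations, 2-dimensional actions
states = torch.zeros(64, 2, 24)
actions = torch.zeros(64, 2, 2)
obs_all, act_all = build_critic_input(states, actions, 2, 24, 2)
assert obs_all.shape == (64, 48) and act_all.shape == (64, 4)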