class DDPGAgent():
    """DDPG agent bundling actor/critic networks, their optimizers and a noise process.

    Each network exists as a "local" copy (the one the optimizer is attached
    to) and a "target" copy built with the same constructor arguments.
    """

    def __init__(self, state_size, action_size, par):
        """Build the networks, optimizers and exploration noise.

        :param state_size: forwarded to the Actor and Critic constructors
        :param action_size: forwarded to the Actor, Critic and OUNoise constructors
        :param par: parameter object; this class reads ``lr_actor``,
            ``lr_critic``, ``weight_decay``, ``random_seed``, ``ou_mu``,
            ``ou_theta``, ``ou_sigma`` and ``save_path`` from it and also
            hands it to the network constructors
        """
        self.par = par

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, par).to(device)
        self.actor_target = Actor(state_size, action_size, par).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=par.lr_actor)
        print('actor')
        print(self.actor_local)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, par).to(device)
        self.critic_target = Critic(state_size, action_size, par).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=par.lr_critic,
                                           weight_decay=par.weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, par.random_seed, par.ou_mu,
                             par.ou_theta, par.ou_sigma)

    def save_model(self, experiment_name, i_episode):
        """Write the local actor and critic state dicts to ``par.save_path``.

        Two files are produced:
        ``<experiment_name>_checkpoint_actor_<i_episode>.pth`` and
        ``<experiment_name>_checkpoint_critic_<i_episode>.pth``.

        :param experiment_name (str): prefix of the checkpoint file names
        :param i_episode: episode counter embedded in the file names
        """
        # Local import so this method does not depend on the module header.
        import os

        # The configured save path used to be read and then ignored, which
        # dropped every checkpoint into the current working directory.  An
        # empty/None save path keeps that old behaviour.
        save_dir = self.par.save_path or ''
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

        checkpoints = (('actor', self.actor_local),
                       ('critic', self.critic_local))
        for role, network in checkpoints:
            file_name = (experiment_name + '_checkpoint_' + role + '_'
                         + str(i_episode) + '.pth')
            torch.save(network.state_dict(), os.path.join(save_dir, file_name))
class Agent(AgentABC):
    """DDPG agent that shares one actor/critic pair across ``num_agents`` agents."""

    def __init__(self, state_size, action_size, num_agents, random_seed):
        """
        Initialize a DDPG Agent object.
        :param state_size (int): dimension of each state
        :param action_size (int): dimension of each action
        :param num_agents (int): number of agents in the environment that use ddpg
        :param random_seed (int): random seed
        """
        super().__init__(state_size, action_size, num_agents, random_seed)
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        # random.seed() returns None, so keep the seed value itself instead of
        # the call's result.
        random.seed(random_seed)
        self.seed = random_seed

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process for each agent
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # debug of the MSE critic loss
        self.mse_error_list = []

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn.

        All arguments are indexed by agent along their first axis; one
        transition per agent is stored.  ``self.debug_loss`` is refreshed with
        the mean of the critic losses recorded since the last ``reset``.
        """
        # Save experience / reward
        for agent in range(self.num_agents):
            self.memory.add(states[agent, :], actions[agent, :],
                            rewards[agent], next_states[agent, :],
                            dones[agent])

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences)

        # np.mean([]) yields NaN *and* a RuntimeWarning; report NaN quietly
        # while no loss has been recorded yet.
        if self.mse_error_list:
            self.debug_loss = np.mean(self.mse_error_list)
        else:
            self.debug_loss = np.nan

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy.

        :param state: numpy array indexed by agent along its first axis
        :param add_noise (bool): add a sample of the noise process to the actions
        :return: numpy array of shape (num_agents, action_size) clipped to [-1, 1]
        """
        state = torch.from_numpy(state).float().to(device)
        acts = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent in range(self.num_agents):
                acts[agent, :] = self.actor_local(
                    state[agent, :]).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            noise = self.noise.sample()
            acts += noise
        return np.clip(acts, -1, 1)

    def reset(self):
        """ see abstract class """
        super().reset()
        self.noise.reset()
        self.mse_error_list = []

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # The TD target is a constant for the critic update, so build it
        # without autograd; this avoids accumulating unused gradients on the
        # target networks.
        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            # Use the actual batch dimension rather than the BATCH_SIZE
            # constant so any batch length is reshaped correctly.
            batch_size = Q_targets_next.shape[0]
            # Compute Q targets for current states (y_i)
            Q_targets = rewards.view(batch_size, -1) + (
                GAMMA * Q_targets_next * (1 - dones).view(batch_size, -1))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.mse_error_list.append(critic_loss.detach().cpu().numpy())
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(
                tau * local_param.data + (1.0 - tau) * target_param.data)

    def load_weights(self, directory_path):
        """ see abstract class """
        super().load_weights(directory_path)
        # Read each checkpoint once and reuse it for the local and the target
        # network; load_state_dict copies the values into the module.
        actor_state = torch.load(os.path.join(directory_path, an_filename),
                                 map_location=device)
        critic_state = torch.load(os.path.join(directory_path, cn_filename),
                                  map_location=device)
        self.actor_target.load_state_dict(actor_state)
        self.critic_target.load_state_dict(critic_state)
        self.actor_local.load_state_dict(actor_state)
        self.critic_local.load_state_dict(critic_state)

    def save_weights(self, directory_path):
        """ see abstract class """
        super().save_weights(directory_path)
        torch.save(self.actor_local.state_dict(),
                   os.path.join(directory_path, an_filename))
        torch.save(self.critic_local.state_dict(),
                   os.path.join(directory_path, cn_filename))

    def save_mem(self, directory_path):
        """ see abstract class """
        super().save_mem(directory_path)
        self.memory.save(os.path.join(directory_path, "ddpg_memory"))

    def load_mem(self, directory_path):
        """ see abstract class """
        super().load_mem(directory_path)
        self.memory.load(os.path.join(directory_path, "ddpg_memory"))
class MADDPGAgent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        # random.seed() returns None, so keep the seed value itself instead of
        # the call's result.
        random.seed(random_seed)
        self.seed = random_seed

        # Actor Networks (Local and Target)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Networks (Local and Target)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise Process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, REPLAY_BUFFER_SIZE,
                                   MINIBATCH_SIZE, random_seed)

        # Count t_steps
        self.time_step = 0

    def step(self, state, action, reward, next_state, done, time_step):
        """Save experience in replay memory, and use random sample from buffer to learn.

        Learning runs LEARN_NUM times, but only on calls where ``time_step``
        is a multiple of UPDATE_EVERY and the buffer holds more than
        MINIBATCH_SIZE entries.
        """
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > MINIBATCH_SIZE and time_step % UPDATE_EVERY == 0:
            for _ in range(LEARN_NUM):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0., add_noise=True):
        """Returns actions for given state as per current policy.

        Params
        ======
            state: numpy array fed to the local actor
            eps (float): scale applied to the noise sample; with the default
                of 0 the noise term contributes nothing
            add_noise (bool): whether to add the scaled noise sample
        """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += eps * self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        """Reset the noise process."""
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # The TD target is a constant for the critic update, so build it
        # without autograd; this avoids accumulating unused gradients on the
        # target networks.
        with torch.no_grad():
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            # Compute Q targets for current states (y_i)
            Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # clipping gradient to 1 for stable learning
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.actor_local, self.actor_target, TAU)
        self.soft_update(self.critic_local, self.critic_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(
                tau * local_param.data + (1.0 - tau) * target_param.data)
def create_agent(state_size, action_size, seed=0, actor_fc1_units=400,
                 actor_fc2_units=300, actor_lr=1e-4, critic_fc1_units=400,
                 critic_fc2_units=300, critic_lr=1e-4, weight_decay=0,
                 buffer_size=int(1e5), batch_size=128, gamma=0.99, tau=0.1,
                 noise_dev=0.3):
    """
    Assemble an Agent from freshly built components.

    The replay buffer, the local/target actor and critic networks, their Adam
    optimizers and the Gaussian noise source are created here and handed to
    the Agent constructor as keyword arguments.

    Arguments:
        state_size: Integer number of dimensions of each state.
        action_size: Integer number of dimensions of each action.
        seed: Random seed passed to every component.
        actor_fc1_units: Integer size of the Actor's first FC layer.
        actor_fc2_units: Integer size of the Actor's second FC layer.
        actor_lr: Float learning rate of the Actor's Adam optimizer.
        critic_fc1_units: Integer size of the Critic's first FC layer.
        critic_fc2_units: Integer size of the Critic's second FC layer.
        critic_lr: Float learning rate of the Critic's Adam optimizer.
        weight_decay: Float weight decay of the Critic's Adam optimizer.
        buffer_size: Integer capacity given to the replay buffer.
        batch_size: Integer minibatch size given to the replay buffer and
            the Agent.
        gamma: Float discount factor forwarded to the Agent.
        tau: Float soft-update factor forwarded to the Agent.
        noise_dev: Float sigma of the Gaussian noise (its mu is fixed at 0.0).

    Returns:
        agent: The constructed Agent object.
    """
    # Experience storage sampled from during training.
    memory = ReplayBuffer(seed=seed, buffer_size=buffer_size,
                          batch_size=batch_size)

    # Actor pair: both copies share the same constructor arguments; only the
    # local copy is attached to an optimizer.
    actor_args = (state_size, action_size, seed,
                  actor_fc1_units, actor_fc2_units)
    local_actor = Actor(*actor_args).to(device)
    target_actor = Actor(*actor_args).to(device)
    actor_opt = optim.Adam(local_actor.parameters(), lr=actor_lr)

    # Critic pair, built the same way, with weight decay on its optimizer.
    critic_args = (state_size, action_size, seed,
                   critic_fc1_units, critic_fc2_units)
    local_critic = Critic(*critic_args).to(device)
    target_critic = Critic(*critic_args).to(device)
    critic_opt = optim.Adam(local_critic.parameters(), lr=critic_lr,
                            weight_decay=weight_decay)

    # Zero-mean Gaussian noise source handed to the agent.
    action_noise = GaussianNoise(action_size, seed, mu=0.0, sigma=noise_dev)

    # Wire everything into the agent used for training.
    components = dict(
        seed=seed,
        memory=memory,
        batch_size=batch_size,
        actor_local=local_actor,
        actor_target=target_actor,
        actor_optimizer=actor_opt,
        critic_local=local_critic,
        critic_target=target_critic,
        critic_optimizer=critic_opt,
        noise=action_noise,
        gamma=gamma,
        tau=tau,
    )
    return Agent(**components)