class MAgent():
    def __init__(self, state_size, action_size, num_agents, random_seed,
                 shared_replay_buffer):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.shared_replay_buffer = shared_replay_buffer
        self.t_step = 0

        if shared_replay_buffer:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       random_seed)
            shared_memory = self.memory
        else:
            shared_memory = None
            self.memory = None
        print("ma shared_memory -> ", shared_memory)

        self.ddpg_agents = [
            Agent(state_size, action_size, random_seed, shared_memory)
            for _ in range(num_agents)
        ]
        # print("MAgent: number of agents: ->", num_agents)
        # print("Enter into ddpg Agent")

    def reset(self):
        for agent in self.ddpg_agents:
            agent.reset()

    def act(self, all_states):
        """Get actions from all agents in the MADDPG object."""
        actions = [
            agent.act(np.expand_dims(states, axis=0))
            for agent, states in zip(self.ddpg_agents, all_states)
        ]
        return actions

    def step(self, states, actions, rewards, next_states, dones):
        # Save each agent's experience in replay memory: the shared buffer if
        # one exists, otherwise the agent's own buffer
        for agent, (state, action, reward, next_state, done) in zip(
                self.ddpg_agents,
                zip(states, actions, rewards, next_states, dones)):
            memory = self.memory if self.shared_replay_buffer else agent.memory
            memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            for agent in self.ddpg_agents:
                memory = self.memory if self.shared_replay_buffer else agent.memory
                if len(memory) > BATCH_SIZE:
                    experiences = memory.sample()
                    agent.learn(experiences, GAMMA)
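# Note: every wrapper in this section relies on a ReplayBuffer with the
# (action_size, buffer_size, batch_size, seed) constructor and an
# add/sample/__len__ interface. The class below is a minimal, hedged sketch of
# that interface (uniform random sampling into torch tensors), not the
# original projects' implementation.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(
            self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences from memory as torch tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(
                np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)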
class MADDPGAgent:
    def __init__(self, state_size, action_size, num_agents, random_seed):
        self.state_size = state_size
        self.action_size = action_size
        self.random_seed = random.seed(random_seed)
        # Build independent agents; a list comprehension (rather than list
        # multiplication) ensures each agent gets its own networks and noise process
        self.agents = [
            Agent(state_size, action_size, random_seed)
            for _ in range(num_agents)
        ]
        self.shared_memory = ReplayBuffer(action_size, BUFFER_SIZE,
                                          BATCH_SIZE, random_seed)

    def step(self, states, actions, rewards, next_states, dones, step):
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.shared_memory.add(state, action, reward, next_state, done)

        if len(self.shared_memory) > BATCH_SIZE and step % LEARN_EVERY == 0:
            for _ in range(LEARN_N_TIMES):
                for agent in self.agents:
                    experiences = self.shared_memory.sample()
                    agent.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        actions = []
        for state, agent in zip(states, self.agents):
            state = np.expand_dims(state, axis=0)
            action = agent.act(state)
            action = np.reshape(action, newshape=(-1))
            actions.append(action)
        actions = np.stack(actions)
        return actions

    def save_weights(self):
        for i, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(),
                       'checkpoint_actor_' + str(i) + '.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic_' + str(i) + '.pth')

    def load_weights(self):
        for i, agent in enumerate(self.agents):
            agent.actor_local.load_state_dict(
                torch.load('checkpoint_actor_' + str(i) + '.pth'))
            agent.critic_local.load_state_dict(
                torch.load('checkpoint_critic_' + str(i) + '.pth'))

    def reset(self):
        for agent in self.agents:
            agent.reset()
class MADDPG:
    def __init__(self, config):
        self.config = config
        # Replay memory
        self.memory = ReplayBuffer(self.config.action_size,
                                   self.config.buffer_size,
                                   self.config.batch_size, self.config.seed)
        self.agents = [
            Agent(self.config) for _ in range(self.config.num_agents)
        ]
        # 'action_size', 'num_agents', and 'random_seed'
        # self.agents = [Agent(self.config, self.config.action_size, self.config.num_agents, self.config.random_seed) for _ in range(self.config.num_agents)]
        self.t_step = 0
        self.loss = (0.0, 0.0)

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def act(self, states, add_noise=True):
        actions = [
            agent.act(state, self.t_step, add_noise)
            for agent, state in zip(self.agents, states)
        ]
        return actions

    def step(self, states, actions, rewards, next_states, dones):
        # Save experience in replay memory
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps.
        self.t_step += 1
        if self.t_step % self.config.update_every == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.config.batch_size:
                closs = []
                aloss = []
                for agent in self.agents:
                    experiences = self.memory.sample()
                    critic_loss, actor_loss = agent.learn(
                        experiences, self.config.discount)
                    closs.append(critic_loss)
                    aloss.append(actor_loss)
                self.loss = (np.mean(closs), np.mean(aloss))
class maddpg_agent:
    """Wrapper class managing different agents in the environment."""

    def __init__(self, num_agents=2, state_size=24, action_size=2):
        """Initialize a maddpg_agent wrapper.

        Params
        ======
            num_agents (int): the number of agents in the environment
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size

        self.agents = [ddpg_agent(state_size, action_size, i + 1, random_seed=0)
                       for i in range(num_agents)]

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed=0)

    def reset(self):
        """Resets OU Noise for each agent."""
        for agent in self.agents:
            agent.reset()

    def act(self, observations, add_noise=False):
        """Picks an action for each agent given its observation."""
        actions = []
        for agent, observation in zip(self.agents, observations):
            action = agent.act(observation, add_noise=add_noise)
            actions.append(action)
        return np.array(actions)

    def step(self, states, actions, rewards, next_states, dones, timestep):
        """Save experience in replay memory and learn periodically."""
        states = states.reshape(1, -1)
        actions = actions.reshape(1, -1)
        next_states = next_states.reshape(1, -1)
        self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep % LEARNING_PERIOD == 0:
            for a_i, agent in enumerate(self.agents):
                experiences = self.memory.sample()
                self.learn(experiences, a_i)

    def learn(self, experiences, agent_number):
        """The critic takes as its input the combined observations and actions
        from all agents. Collect actions from each agent for the 'experiences'.
        """
        next_actions = []
        actions_pred = []
        states, _, _, next_states, _ = experiences
        next_states = next_states.reshape(-1, self.num_agents, self.state_size)
        states = states.reshape(-1, self.num_agents, self.state_size)

        for a_i, agent in enumerate(self.agents):
            agent_id_tensor = self._get_agent_number(a_i)
            state = states.index_select(1, agent_id_tensor).squeeze(1)
            next_state = next_states.index_select(1, agent_id_tensor).squeeze(1)
            next_actions.append(agent.actor_target(next_state))
            actions_pred.append(agent.actor_local(state))

        next_actions = torch.cat(next_actions, dim=1).to(device)
        actions_pred = torch.cat(actions_pred, dim=1).to(device)

        agent = self.agents[agent_number]
        agent.learn(experiences, next_actions, actions_pred)

    def _get_agent_number(self, i):
        """Helper to get an agent's number as a Torch tensor."""
        return torch.tensor([i]).to(device)
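# Illustrative aside (not part of the original class): a tiny standalone check
# of the reshape / index_select pattern used in maddpg_agent.learn to pull one
# agent's slice out of the flattened joint state. Shapes assume 2 agents with
# 24-dimensional observations, as in the Tennis-style setup above.
import torch

batch_size, num_agents, state_size = 5, 2, 24
joint_states = torch.randn(batch_size, num_agents * state_size)  # as stored in the buffer

agent_id = torch.tensor([1])  # select the second agent
per_agent = joint_states.reshape(-1, num_agents, state_size)
agent_state = per_agent.index_select(1, agent_id).squeeze(1)

assert agent_state.shape == (batch_size, state_size)
# the selected slice is exactly columns [state_size : 2 * state_size) of the joint vector
assert torch.equal(agent_state, joint_states[:, state_size:2 * state_size])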
class MADDPG(): """Agent that contains the two DDPG agents and shared replay buffer.""" def __init__(self, state_size=24, action_size=2, n_agents=2, buffer_size=100000, batch_size=256, gamma=0.999, update_every=4, noise_start=1.0, noise_decay=1.0, t_stop_noise=30000, seed=0): """ Params ====== action_size (int): dimension of each action n_agents (int): number of distinct agents buffer_size (int): replay buffer size batch_size (int): minibatch size gamma (float): discount factor noise_start (float): initial noise weighting factor noise_decay (float): noise decay rate update_every (int): how often to update the network t_stop_noise (int): max number of timesteps with noise applied in training seed (int): Random seed """ self.buffer_size = buffer_size self.batch_size = batch_size self.update_every = update_every self.gamma = gamma self.n_agents = n_agents self.noise_weight = noise_start self.noise_decay = noise_decay self.t_step = 0 self.noise_on = True self.t_stop_noise = t_stop_noise # models = [model.Actor_Critic_Models(n_agents=n_agents) for _ in range(n_agents)] self.agents = [ DDPG(i, state_size, action_size, n_agents) for i in range(n_agents) ] self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed) def step(self, all_states, all_actions, all_rewards, all_next_states, all_dones): all_states = all_states.reshape(1, -1) all_next_states = all_next_states.reshape(1, -1) self.memory.add(all_states, all_actions, all_rewards, all_next_states, all_dones) if self.t_step > self.t_stop_noise: self.noise_on = False self.t_step += 1 if self.t_step % self.update_every == 0 and len( self.memory) > self.batch_size: experiences = [self.memory.sample() for _ in range(self.n_agents)] self.learn(experiences, self.gamma) def act(self, all_states, add_noise=True): all_actions = [] for agent, state in zip(self.agents, all_states): action = agent.act(state, noise_weight=self.noise_weight, add_noise=self.noise_on) self.noise_weight *= self.noise_decay all_actions.append(action) return np.array(all_actions).reshape(1, -1) def learn(self, experiences, gamma): all_next_actions = [] all_actions = [] for i, agent in enumerate(self.agents): states, _, _, next_states, _ = experiences[i] agent_id = torch.tensor([i]).to(device) state = states.reshape(-1, 2, 24).index_select(1, agent_id).squeeze(1) action = agent.actor_local(state) all_actions.append(action) next_state = next_states.reshape(-1, 2, 24).index_select( 1, agent_id).squeeze(1) next_action = agent.actor_target(next_state) all_next_actions.append(next_action) for i, agent in enumerate(self.agents): agent.learn(i, experiences[i], gamma, all_next_actions, all_actions) def save_agents(self): for i, agent in enumerate(self.agents): torch.save(agent.actor_local.state_dict(), f"actor_agent{i}.pth") torch.save(agent.critic_local.state_dict(), f"critic_agent{i}.pth")
class MADDPG():
    def __init__(self, action_size=2, seed=42, n_agents=2):
        """
        Params
        ======
            action_size (int): dimension of each action
            seed (int): Random seed
            n_agents (int): number of distinct agents
        """
        self.n_agents = n_agents
        self.timestep = 0
        self.agents = [DDPG(i) for i in range(n_agents)]
        # common buffer for both the agents
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    def step(self, all_states, all_actions, all_rewards, all_next_states,
             all_dones):
        # reshape 2x24 into a 1x48 dim vector
        all_states = all_states.reshape(1, -1)
        all_next_states = all_next_states.reshape(1, -1)
        self.memory.add(all_states, all_actions, all_rewards, all_next_states,
                        all_dones)

        self.timestep += 1
        if self.timestep % 2 == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                # sample from the replay buffer for each agent
                experiences = [
                    self.memory.sample() for _ in range(self.n_agents)
                ]
                self.learn(experiences, GAMMA)

    def act(self, all_states, add_noise=True):
        # calculate each agent's action
        all_actions = []
        for agent, state in zip(self.agents, all_states):
            action = agent.act(state, noise_weight=0.5, add_noise=True)
            all_actions.append(action)
        return np.array(all_actions).reshape(1, -1)

    def learn(self, experiences, gamma):
        # each agent uses its own actor to calculate next_actions
        all_next_actions = []
        all_actions = []
        for i, agent in enumerate(self.agents):
            states, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(device)
            # extract agent i's state and get its action via the local actor network
            state = states.reshape(-1, 2, 24).index_select(1, agent_id).squeeze(1)
            action = agent.actor_local(state)
            all_actions.append(action)
            # extract agent i's next state and get its action via the target actor network
            next_state = next_states.reshape(-1, 2, 24).index_select(
                1, agent_id).squeeze(1)
            next_action = agent.actor_target(next_state)
            all_next_actions.append(next_action)

        # each agent learns from its own experience sample
        for i, agent in enumerate(self.agents):
            agent.learn(i, experiences[i], gamma, all_next_actions, all_actions)

    def save_agents(self):
        # save models for each agent
        for i, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(),
                       f"checkpoint_actor_{i}.pth")
            torch.save(agent.critic_local.state_dict(),
                       f"checkpoint_critic_{i}.pth")
class MultiAgent:
    """Meta agent that contains the two DDPG agents and shared replay buffer."""

    def __init__(self, config):
        self.config = config
        self.n_agents = config.env.n_agents
        self.ddpg_agents = [
            Agent(i, config) for i in range(self.config.env.n_agents)
        ]
        # the shared replay buffer
        self.memory = ReplayBuffer(config)
        self.t_step = 0

    def reset(self):
        for agent in self.ddpg_agents:
            agent.reset()

    def step(self, states, actions, rewards, next_states, dones):
        states = states.reshape(1, -1)
        next_states = next_states.reshape(1, -1)
        self.memory.add(states, actions, rewards, next_states, dones)

        self.t_step = (self.t_step + 1) % self.config.hp.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.config.hp.batch_size:
                for _ in range(self.config.hp.num_updates):
                    # each agent does its own sampling from the replay buffer
                    experiences = [
                        self.memory.sample()
                        for _ in range(self.config.env.n_agents)
                    ]
                    self.learn(experiences, self.config.hp.gamma)

    def act(self, states, add_noise=True):
        # pass each agent's state from the environment and calculate its action
        all_actions = []
        for agent, state in zip(self.ddpg_agents, states):
            action = agent.act(state, add_noise=add_noise)
            all_actions.append(action)
        return np.array(all_actions).reshape(
            1, -1)  # reshape 2x2 into a 1x4 dim vector

    def learn(self, experiences, gamma):
        # each agent uses its own target actor to calculate next_actions
        all_next_actions = []
        for i, agent in enumerate(self.ddpg_agents):
            _, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(self.config.general.device)
            # split the joint next state into per-agent slices and select agent i's slice
            next_state = next_states.reshape(-1, self.config.env.n_agents,
                                             self.config.env.state_size) \
                .index_select(1, agent_id).squeeze(1)
            next_action = agent.actor_target(next_state)
            all_next_actions.append(next_action)

        # each agent uses its own local actor to calculate actions
        all_actions = []
        for i, agent in enumerate(self.ddpg_agents):
            states, _, _, _, _ = experiences[i]
            agent_id = torch.tensor([i]).to(self.config.general.device)
            state = states.reshape(-1, self.config.env.n_agents,
                                   self.config.env.state_size) \
                .index_select(1, agent_id).squeeze(1)
            action = agent.actor_local(state)
            all_actions.append(action)

        # each agent learns from its experience sample
        for i, agent in enumerate(self.ddpg_agents):
            agent.learn(i, experiences[i], gamma, all_next_actions, all_actions)
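# A hedged sketch of how wrappers like the ones above are typically driven in
# a training loop. The `env` object and its reset()/step() return signature
# are hypothetical placeholders (the original projects target a Unity
# Tennis-style environment with 2 agents); the episode count, step limit and
# the +0.5 average-score stopping rule are illustrative, and variants whose
# step() also expects a timestep argument would need it passed through.
import numpy as np


def train(env, multi_agent, n_episodes=2000, max_t=1000):
    scores_history = []
    for i_episode in range(1, n_episodes + 1):
        states = env.reset()            # hypothetical API: (num_agents, state_size) array
        multi_agent.reset()             # reset each agent's OU noise process
        scores = np.zeros(len(states))
        for _ in range(max_t):
            actions = multi_agent.act(states)                # one action per agent
            next_states, rewards, dones = env.step(actions)  # hypothetical env API
            multi_agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            scores += rewards
            if np.any(dones):
                break
        scores_history.append(np.max(scores))  # Tennis scores the better-performing agent
        if np.mean(scores_history[-100:]) >= 0.5:  # commonly used solve criterion
            break
    return scores_history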