class MAgent():
    def __init__(self, state_size, action_size, num_agents, random_seed,
                 shared_replay_buffer):

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random_seed
        random.seed(random_seed)
        self.shared_replay_buffer = shared_replay_buffer

        self.t_step = 0

        if shared_replay_buffer:
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       random_seed)
            shared_memory = self.memory
        else:
            shared_memory = None
            self.memory = None

        print("ma shared_memory -> ", shared_memory)

        self.ddpg_agents = [
            Agent(state_size, action_size, random_seed, shared_memory)
            for _ in range(num_agents)
        ]
#         print("MAgent: number of agents: ->", num_agents)
#         print("Enter into ddpg Agent")

    def reset(self):
        for agent in self.ddpg_agents:
            agent.reset()

    def act(self, all_states):
        """Get an action from each agent in the MADDPG wrapper."""
        actions = [
            agent.act(np.expand_dims(state, axis=0))
            for agent, state in zip(self.ddpg_agents, all_states)
        ]
        return actions

    def step(self, states, actions, rewards, next_states, dones):
        # Save each agent's experience in the shared (or per-agent) replay memory
        for agent, state, action, reward, next_state, done in zip(
                self.ddpg_agents, states, actions, rewards, next_states, dones):
            memory = self.memory if self.shared_replay_buffer else agent.memory
            memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY

        if self.t_step == 0:
            for agent in self.ddpg_agents:
                memory = self.memory if self.shared_replay_buffer else agent.memory
                # If enough samples are available in memory, get a random subset and learn
                if len(memory) > BATCH_SIZE:
                    experiences = memory.sample()
                    agent.learn(experiences, GAMMA)
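
These snippets rely on module-level imports, hyperparameters and helper classes (ReplayBuffer, Agent, DDPG, ddpg_agent) that live elsewhere in their original projects. A minimal sketch of the assumed imports and constants; the concrete values below are placeholders, not the originals:

import random
import numpy as np
import torch

BUFFER_SIZE = int(1e6)   # replay buffer size (assumed)
BATCH_SIZE = 128         # minibatch size (assumed)
GAMMA = 0.99             # discount factor (assumed)
UPDATE_EVERY = 1         # env steps between learning phases for MAgent above (assumed)
LEARN_EVERY = 1          # env steps between learning phases in Example #2 (assumed)
LEARN_N_TIMES = 1        # learning passes per phase in Example #2 (assumed)
LEARNING_PERIOD = 2      # env steps between learning phases in Example #4 (assumed)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")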
Example #2
class MADDPGAgent:
    def __init__(self, state_size, action_size, num_agents, random_seed):

        self.state_size = state_size
        self.action_size = action_size
        self.random_seed = random_seed
        random.seed(random_seed)

        # build independent agents; a list comprehension avoids aliasing one shared Agent
        self.agents = [Agent(state_size, action_size, random_seed)
                       for _ in range(num_agents)]
        self.shared_memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                          random_seed)

    def step(self, states, actions, rewards, next_states, dones, step):
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.shared_memory.add(state, action, reward, next_state, done)

        if len(self.shared_memory) > BATCH_SIZE and step % LEARN_EVERY == 0:
            for _ in range(LEARN_N_TIMES):
                for agent in self.agents:
                    experiences = self.shared_memory.sample()
                    agent.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        actions = []
        for state, agent in zip(states, self.agents):
            state = np.expand_dims(state, axis=0)
            action = agent.act(state)
            action = np.reshape(action, newshape=(-1))
            actions.append(action)
        actions = np.stack(actions)
        return actions

    def save_weights(self):
        for i, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(),
                       'checkpoint_actor_' + str(i) + '.pth')
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic_' + str(i) + '.pth')

    def load_weights(self):
        for i, agent in enumerate(self.agents):
            agent.actor_local.load_state_dict(
                torch.load('checkpoint_actor_' + str(i) + '.pth'))
            agent.critic_local.load_state_dict(
                torch.load('checkpoint_critic_' + str(i) + '.pth'))

    def reset(self):
        for agent in self.agents:
            agent.reset()
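
A minimal sketch of how a wrapper such as MADDPGAgent above might be driven. The two-agent, 24-dimensional-observation, 2-dimensional-action sizes match the Unity Tennis task these snippets appear to target; the random transitions stand in for a real environment and are purely illustrative:

num_agents, state_size, action_size = 2, 24, 2
maddpg = MADDPGAgent(state_size, action_size, num_agents, random_seed=0)

for episode in range(10):
    states = np.random.randn(num_agents, state_size)       # env.reset() in a real run
    maddpg.reset()                                          # reset each agent's exploration noise
    for t in range(100):
        actions = maddpg.act(states)                        # one row of actions per agent
        next_states = np.random.randn(num_agents, state_size)  # env.step(actions) in a real run
        rewards = np.zeros(num_agents)
        dones = np.zeros(num_agents, dtype=bool)
        maddpg.step(states, actions, rewards, next_states, dones, step=t)
        states = next_states
        if dones.any():
            break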
class MADDPG:
    def __init__(self, config):
        self.config = config
        # Replay memory
        self.memory = ReplayBuffer(self.config.action_size,
                                   self.config.buffer_size,
                                   self.config.batch_size, self.config.seed)
        self.agents = [
            Agent(self.config) for _ in range(self.config.num_agents)
        ]
        self.t_step = 0
        self.loss = (0.0, 0.0)

    def reset(self):
        for agent in self.agents:
            agent.reset()

    def act(self, states, add_noise=True):
        actions = [
            agent.act(state, self.t_step, add_noise)
            for agent, state in zip(self.agents, states)
        ]
        return actions

    def step(self, states, actions, rewards, next_states, dones):
        # Save experience in replay memory
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step += 1
        if self.t_step % self.config.update_every == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.config.batch_size:
                closs = []
                aloss = []
                for agent in self.agents:
                    experiences = self.memory.sample()
                    critic_loss, actor_loss = agent.learn(
                        experiences, self.config.discount)
                    closs.append(critic_loss)
                    aloss.append(actor_loss)
                self.loss = (np.mean(closs), np.mean(aloss))
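
The MADDPG wrapper above is configured entirely through a config object. A hedged sketch of the attributes the snippet itself reads (the Agent(self.config) constructor will likely read further fields, e.g. state_size, that are not visible here); the values are placeholders:

from types import SimpleNamespace

config = SimpleNamespace(
    action_size=2,        # passed to ReplayBuffer
    buffer_size=int(1e6),
    batch_size=256,
    seed=0,
    num_agents=2,         # number of DDPG agents to build
    update_every=4,       # learn every this many calls to step()
    discount=0.99,        # gamma passed to agent.learn()
)
maddpg = MADDPG(config)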
Example #4
class maddpg_agent:
    """Wrapper class managing different agents in the environment."""

    def __init__(self, num_agents=2, state_size=24, action_size=2):
        """Initialize a maddpg_agent wrapper.
        Params
        ======
            num_agents (int): the number of agents in the environment
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        
        self.agents = [ddpg_agent(state_size, action_size, i+1, random_seed=0) for i in range(num_agents)]
        
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed=0)
        
    def reset(self):
        """Resets OU Noise for each agent."""
        for agent in self.agents:
            agent.reset()
            
    def act(self, observations, add_noise=False):
        """Picks an action for each agent given."""
        actions = []
        for agent, observation in zip(self.agents, observations):
            action = agent.act(observation, add_noise=add_noise)
            actions.append(action)
        return np.array(actions)
    
    def step(self, states, actions, rewards, next_states, dones, timestep):
        """Save experience in replay memory."""
        states = states.reshape(1, -1)
        actions = actions.reshape(1, -1)
        next_states = next_states.reshape(1, -1)
        
        self.memory.add(states, actions, rewards, next_states, dones)
        
        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and timestep % LEARNING_PERIOD == 0:
            for a_i, agent in enumerate(self.agents):
                experiences = self.memory.sample()
                self.learn(experiences, a_i)
            
    def learn(self, experiences, agent_number):
        """Each agent's critic takes the combined observations and actions of all
        agents as input, so first collect every agent's current and target-actor
        actions for these 'experiences', then delegate the update to the chosen agent."""
        next_actions = []
        actions_pred = []
        states, _, _, next_states, _ = experiences
        
        next_states = next_states.reshape(-1, self.num_agents, self.state_size)
        states = states.reshape(-1, self.num_agents, self.state_size)
        
        for a_i, agent in enumerate(self.agents):
            agent_id_tensor = self._get_agent_number(a_i)
            
            state = states.index_select(1, agent_id_tensor).squeeze(1)
            next_state = next_states.index_select(1, agent_id_tensor).squeeze(1)
            
            next_actions.append(agent.actor_target(next_state))
            actions_pred.append(agent.actor_local(state))
            
        next_actions = torch.cat(next_actions, dim=1).to(device)
        actions_pred = torch.cat(actions_pred, dim=1).to(device)
        
        agent = self.agents[agent_number]
        agent.learn(experiences, next_actions, actions_pred)
                
    def _get_agent_number(self, i):
        """Helper to get an agent's number as a Torch tensor."""
        return torch.tensor([i]).to(device)
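
The learn method above gathers every agent's predicted and target-actor actions and then hands them to the selected ddpg_agent. A hedged sketch of what that per-agent update could look like with a centralized critic (one that sees the concatenated states and actions of all agents); the attribute names (critic_local, critic_target, actor_local, actor_target, the optimizers) and the GAMMA/TAU values are assumptions, not taken from the snippet:

import torch
import torch.nn.functional as F

GAMMA = 0.99   # discount factor (assumed)
TAU = 1e-3     # soft-update rate (assumed)

def centralized_ddpg_learn(agent, experiences, next_actions, actions_pred):
    # states/actions hold the concatenated observations/actions of all agents;
    # rewards and dones are assumed already reduced to this agent's (batch, 1) column.
    states, actions, rewards, next_states, dones = experiences

    # ----- critic update: Q(s_all, a_all) regressed toward r + gamma * Q'(s'_all, a'_all)
    with torch.no_grad():
        q_targets_next = agent.critic_target(next_states, next_actions)
        q_targets = rewards + GAMMA * q_targets_next * (1 - dones)
    q_expected = agent.critic_local(states, actions)
    critic_loss = F.mse_loss(q_expected, q_targets)
    agent.critic_optimizer.zero_grad()
    critic_loss.backward()
    agent.critic_optimizer.step()

    # ----- actor update: ascend the critic's value of the locally predicted joint action
    # (a stricter implementation would detach the other agents' entries in actions_pred)
    actor_loss = -agent.critic_local(states, actions_pred).mean()
    agent.actor_optimizer.zero_grad()
    actor_loss.backward()
    agent.actor_optimizer.step()

    # ----- soft-update target networks toward the local networks
    for target, local in ((agent.critic_target, agent.critic_local),
                          (agent.actor_target, agent.actor_local)):
        for t_param, l_param in zip(target.parameters(), local.parameters()):
            t_param.data.copy_(TAU * l_param.data + (1.0 - TAU) * t_param.data)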
class MADDPG():
    """Agent that contains the two DDPG agents and shared replay buffer."""
    def __init__(self,
                 state_size=24,
                 action_size=2,
                 n_agents=2,
                 buffer_size=100000,
                 batch_size=256,
                 gamma=0.999,
                 update_every=4,
                 noise_start=1.0,
                 noise_decay=1.0,
                 t_stop_noise=30000,
                 seed=0):
        """
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            n_agents (int): number of distinct agents
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            noise_start (float): initial noise weighting factor
            noise_decay (float): noise decay rate
            update_every (int): how often to update the network
            t_stop_noise (int): max number of timesteps with noise applied in training
            seed (int): Random seed
        """

        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_every = update_every
        self.gamma = gamma
        self.n_agents = n_agents
        self.noise_weight = noise_start
        self.noise_decay = noise_decay
        self.t_step = 0
        self.noise_on = True
        self.t_stop_noise = t_stop_noise

        self.agents = [
            DDPG(i, state_size, action_size, n_agents) for i in range(n_agents)
        ]
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed)

    def step(self, all_states, all_actions, all_rewards, all_next_states,
             all_dones):
        all_states = all_states.reshape(1, -1)
        all_next_states = all_next_states.reshape(1, -1)
        self.memory.add(all_states, all_actions, all_rewards, all_next_states,
                        all_dones)

        if self.t_step > self.t_stop_noise:
            self.noise_on = False

        self.t_step += 1
        if self.t_step % self.update_every == 0 and len(
                self.memory) > self.batch_size:
            experiences = [self.memory.sample() for _ in range(self.n_agents)]
            self.learn(experiences, self.gamma)

    def act(self, all_states, add_noise=True):
        # Each agent acts on its own observation; noise is applied only while both
        # add_noise and self.noise_on are True, and noise_weight decays once per
        # agent per call.
        all_actions = []
        for agent, state in zip(self.agents, all_states):
            action = agent.act(state,
                               noise_weight=self.noise_weight,
                               add_noise=add_noise and self.noise_on)
            self.noise_weight *= self.noise_decay
            all_actions.append(action)
        return np.array(all_actions).reshape(1, -1)

    def learn(self, experiences, gamma):
        all_next_actions = []
        all_actions = []
        for i, agent in enumerate(self.agents):
            states, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(device)
            state = states.reshape(-1, 2, 24).index_select(1,
                                                           agent_id).squeeze(1)
            action = agent.actor_local(state)
            all_actions.append(action)
            next_state = next_states.reshape(-1, 2, 24).index_select(
                1, agent_id).squeeze(1)
            next_action = agent.actor_target(next_state)
            all_next_actions.append(next_action)

        for i, agent in enumerate(self.agents):
            agent.learn(i, experiences[i], gamma, all_next_actions,
                        all_actions)

    def save_agents(self):
        for i, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(), f"actor_agent{i}.pth")
            torch.save(agent.critic_local.state_dict(), f"critic_agent{i}.pth")
Example #6
class MADDPG():
    def __init__(self, action_size=2, seed=42, n_agents=2):
        """
        Params
        ======
            action_size (int): dimension of each action
            seed (int): Random seed
            n_agents (int): number of distinct agents
        """

        self.n_agents = n_agents
        self.timestep = 0

        self.agents = [DDPG(i) for i in range(n_agents)]

        # common buffer for both the agents
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

    def step(self, all_states, all_actions, all_rewards, all_next_states,
             all_dones):
        all_states = all_states.reshape(
            1, -1)  # reshape 2x24 into 1x48 dim vector
        all_next_states = all_next_states.reshape(
            1, -1)  # reshape 2x24 into 1x48 dim vector
        self.memory.add(all_states, all_actions, all_rewards, all_next_states,
                        all_dones)

        self.timestep += 1
        if self.timestep % 2 == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                # sample from the replay buffer for each agent
                experiences = [
                    self.memory.sample() for _ in range(self.n_agents)
                ]
                self.learn(experiences, GAMMA)

    def act(self, all_states, add_noise=True):
        # calculate each agent's action from its own observation
        all_actions = []
        for agent, state in zip(self.agents, all_states):
            action = agent.act(state, noise_weight=0.5, add_noise=add_noise)
            all_actions.append(action)
        return np.array(all_actions).reshape(1, -1)

    def learn(self, experiences, gamma):
        # each agent uses its own actor to calculate next_actions
        all_next_actions = []
        all_actions = []
        for i, agent in enumerate(self.agents):
            states, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(device)
            # extract agent i's state and get action via actor network
            state = states.reshape(-1, 2, 24).index_select(1,
                                                           agent_id).squeeze(1)
            action = agent.actor_local(state)
            all_actions.append(action)
            # extract agent i's next state and get action via target actor network
            next_state = next_states.reshape(-1, 2, 24).index_select(
                1, agent_id).squeeze(1)
            next_action = agent.actor_target(next_state)
            all_next_actions.append(next_action)

        # each agent learns from its experience sample
        for i, agent in enumerate(self.agents):
            agent.learn(i, experiences[i], gamma, all_next_actions,
                        all_actions)

    def save_agents(self):
        # save models
        for i, agent in enumerate(self.agents):
            torch.save(agent.actor_local.state_dict(),
                       f"checkpoint_actor_{i}.pth")
            torch.save(agent.critic_local.state_dict(),
                       f"checkpoint_critic_{i}.pth")
Example #7
class MultiAgent:
    """Meta agent that contains the two DDPG agents and shared replay buffer."""
    def __init__(self, config):
        self.config = config
        self.n_agents = config.env.n_agents
        self.ddpg_agents = [
            Agent(i, config) for i in range(self.config.env.n_agents)
        ]
        # the shared replay buffer
        self.memory = ReplayBuffer(config)
        self.t_step = 0

    def reset(self):
        for agent in self.ddpg_agents:
            agent.reset()

    def step(self, states, actions, rewards, next_states, dones):
        states = states.reshape(1, -1)
        next_states = next_states.reshape(1, -1)
        self.memory.add(states, actions, rewards, next_states, dones)

        self.t_step = (self.t_step + 1) % self.config.hp.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.config.hp.batch_size:
                for _ in range(self.config.hp.num_updates):
                    # each agent does its own sampling from the replay buffer
                    experiences = [
                        self.memory.sample()
                        for _ in range(self.config.env.n_agents)
                    ]
                    self.learn(experiences, self.config.hp.gamma)

    def act(self, states, add_noise=True):
        # pass each agent its state from the environment and calculate its action
        all_actions = []
        for agent, state in zip(self.ddpg_agents, states):
            action = agent.act(state, add_noise=add_noise)
            all_actions.append(action)
        return np.array(all_actions).reshape(
            1, -1)  # reshape 2x2 into 1x4 dim vector

    def learn(self, experiences, gamma):
        # each agent uses its own target actor to calculate next_actions
        all_next_actions = []
        for i, agent in enumerate(self.ddpg_agents):
            _, _, _, next_states, _ = experiences[i]
            agent_id = torch.tensor([i]).to(self.config.general.device)
            # view the flat joint observation as (batch, n_agents, state_size) and select agent i's slice
            next_state = next_states.reshape(-1, self.config.env.n_agents, self.config.env.state_size) \
                        .index_select(1, agent_id).squeeze(1)
            next_action = agent.actor_target(next_state)
            all_next_actions.append(next_action)

        # each agent uses its own local actor to calculate actions
        all_actions = []
        for i, agent in enumerate(self.ddpg_agents):
            states, _, _, _, _ = experiences[i]
            agent_id = torch.tensor([i]).to(self.config.general.device)
            state = states.reshape(-1, self.config.env.n_agents, self.config.env.state_size)\
                    .index_select(1, agent_id).squeeze(1)
            action = agent.actor_local(state)
            all_actions.append(action)

        # each agent learns from its experience sample
        for i, agent in enumerate(self.ddpg_agents):
            agent.learn(i, experiences[i], gamma, all_next_actions,
                        all_actions)
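
Example #7 reads a nested config. A hedged sketch of the shape it assumes, listing only the attributes this snippet touches (the Agent and ReplayBuffer constructors will likely read more, e.g. buffer size and seed); the concrete values are placeholders for a Tennis-style two-agent task:

from types import SimpleNamespace
import torch

config = SimpleNamespace(
    env=SimpleNamespace(n_agents=2, state_size=24, action_size=2),
    hp=SimpleNamespace(update_every=4, batch_size=256, num_updates=2, gamma=0.99),
    general=SimpleNamespace(
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")),
)
multi_agent = MultiAgent(config)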