def __init__(self,
             number_agents,
             obs_dim,
             action_dim,
             buffer_maxlen=BUFFER_SIZE,
             num_sub_policy=3,
             batch_size=BATCH_SIZE):
    self.num_agents = number_agents
    self.num_sub_policy = int(num_sub_policy)
    # One replay buffer per sub-policy, so each ensemble member learns from its own data.
    self.replay_buffer = [
        MultiAgentReplayBuffer(self.num_agents, buffer_maxlen)
        for _ in range(self.num_sub_policy)
    ]
    self.agents = [
        DDPG_ENSEMBLE(number_agents, obs_dim, action_dim, i,
                      num_sub_policy=self.num_sub_policy)
        for i in range(self.num_agents)
    ]
    self.subpolicy_array = np.arange(self.num_sub_policy)
    self.batch_size = batch_size
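# Hedged sketch, not from the source: a companion method one might add alongside the
# ensemble __init__ above, sampling one sub-policy index per agent at the start of
# each episode so transitions can be routed to the matching replay buffer. The name
# sample_subpolicies is hypothetical.
def sample_subpolicies(self):
    # Uniformly pick a sub-policy for each agent from self.subpolicy_array.
    return np.random.choice(self.subpolicy_array, size=self.num_agents)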
class MADDPG:

    def __init__(self, env, buffer_maxlen):
        self.env = env
        self.num_agents = env.n
        self.replay_buffer = MultiAgentReplayBuffer(self.num_agents, buffer_maxlen)
        self.agents = [DDPGAgent(self.env, i) for i in range(self.num_agents)]

    def get_actions(self, states):
        actions = []
        for i in range(self.num_agents):
            action = self.agents[i].get_action(states[i])
            actions.append(action)
        return actions

    def update(self, batch_size):
        obs_batch, indiv_action_batch, indiv_reward_batch, next_obs_batch, \
            global_state_batch, global_actions_batch, global_next_state_batch, \
            done_batch = self.replay_buffer.sample(batch_size)

        for i in range(self.num_agents):
            obs_batch_i = obs_batch[i]
            indiv_action_batch_i = indiv_action_batch[i]
            indiv_reward_batch_i = indiv_reward_batch[i]

            # Build the joint next action from the target policies. Each agent's
            # action must come from that agent's own next observations (the original
            # fed agent i's observations to every actor).
            next_global_actions = []
            for j, agent in enumerate(self.agents):
                next_obs_batch_j = torch.FloatTensor(next_obs_batch[j])
                indiv_next_action = agent.actor.forward(next_obs_batch_j)
                indiv_next_action = [
                    agent.onehot_from_logits(indiv_next_action_j)
                    for indiv_next_action_j in indiv_next_action
                ]
                indiv_next_action = torch.stack(indiv_next_action)
                next_global_actions.append(indiv_next_action)
            next_global_actions = torch.cat(next_global_actions, 1)

            self.agents[i].update(indiv_reward_batch_i, obs_batch_i,
                                  global_state_batch, global_actions_batch,
                                  global_next_state_batch, next_global_actions)
            self.agents[i].target_update()

    def run(self, max_episode, max_steps, batch_size):
        episode_rewards = []
        for episode in range(max_episode):
            states = self.env.reset()
            episode_reward = 0
            for step in range(max_steps):
                actions = self.get_actions(states)
                next_states, rewards, dones, _ = self.env.step(actions)
                episode_reward += np.mean(rewards)

                if all(dones) or step == max_steps - 1:
                    dones = [1 for _ in range(self.num_agents)]
                    self.replay_buffer.push(states, actions, rewards, next_states, dones)
                    episode_rewards.append(episode_reward)
                    print("episode: {} | reward: {}".format(
                        episode, np.round(episode_reward, decimals=4)))
                    break
                else:
                    dones = [0 for _ in range(self.num_agents)]
                    self.replay_buffer.push(states, actions, rewards, next_states, dones)
                    states = next_states

                    if len(self.replay_buffer) > batch_size:
                        self.update(batch_size)
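# A minimal usage sketch for the class above, assuming the multiagent-particle-envs
# make_env helper is importable alongside MultiAgentReplayBuffer and DDPGAgent; the
# buffer size, episode counts, and batch size are illustrative, not tuned values
# from the source.
if __name__ == "__main__":
    env = make_env(scenario_name='simple_spread')
    maddpg = MADDPG(env, buffer_maxlen=100000)
    maddpg.run(max_episode=10000, max_steps=25, batch_size=1024)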
class MADDPG:

    def __init__(self, env, buffer_maxlen):
        self.env = env
        self.num_agents = env.red_agent_num
        self.replay_buffer = MultiAgentReplayBuffer(self.num_agents, buffer_maxlen)
        self.agents = [DDPGAgent(self.env, i) for i in range(self.num_agents)]

    def get_actions(self, states):
        actions = []
        for i in range(self.num_agents):
            action = self.agents[i].get_action(states[i])
            actions.append(action)
        return actions

    def update(self, batch_size, writer, episode):
        obs_batch, indiv_action_batch, indiv_reward_batch, next_obs_batch, \
            global_state_batch, global_actions_batch, global_next_state_batch, \
            done_batch = self.replay_buffer.sample(batch_size)

        for i in range(self.num_agents):
            obs_batch_i = obs_batch[i]
            indiv_action_batch_i = indiv_action_batch[i]
            indiv_reward_batch_i = indiv_reward_batch[i]

            # Joint next action from the target policies; each agent's action is
            # computed from that agent's own next observations (same fix as above).
            next_global_actions = []
            for j, agent in enumerate(self.agents):
                next_obs_batch_j = torch.FloatTensor(next_obs_batch[j])
                indiv_next_action = agent.actor.forward(next_obs_batch_j)
                indiv_next_action = [
                    agent.onehot_from_logits(indiv_next_action_j)
                    for indiv_next_action_j in indiv_next_action
                ]
                indiv_next_action = torch.stack(indiv_next_action)
                next_global_actions.append(indiv_next_action)
            next_global_actions = torch.cat(next_global_actions, 1)

            self.agents[i].update(indiv_reward_batch_i, obs_batch_i,
                                  global_state_batch, global_actions_batch,
                                  global_next_state_batch, next_global_actions,
                                  writer, episode)
            self.agents[i].target_update()
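# Hedged driver sketch for the variant above, which threads a TensorBoard writer
# through update(). BattleEnv is a hypothetical stand-in for an environment that
# exposes red_agent_num with gym-style reset/step; the loop mirrors the run()
# method of the earlier class, and the batch size of 256 is illustrative.
from torch.utils.tensorboard import SummaryWriter

env = BattleEnv()
maddpg = MADDPG(env, buffer_maxlen=100000)
writer = SummaryWriter(log_dir='runs/maddpg_battle')
for episode in range(1000):
    states = env.reset()
    for step in range(25):
        actions = maddpg.get_actions(states)
        next_states, rewards, dones, _ = env.step(actions)
        maddpg.replay_buffer.push(states, actions, rewards, next_states, dones)
        states = next_states
        if all(dones):
            break
    if len(maddpg.replay_buffer) > 256:
        maddpg.update(256, writer, episode)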
class MADDPG(pl.LightningModule):

    def __init__(self):
        super(MADDPG, self).__init__()
        self.env = make_env(scenario_name='simple_spread')
        self.num_agents = self.env.n
        self.replay_buffer = MultiAgentReplayBuffer(self.num_agents, cfg.buffer_maxlen)
        self.agents = [
            DDPGAgent(self.env,
                      agent_id,
                      actor_lr=cfg.actor_lr,
                      critic_lr=cfg.critic_lr,
                      gamma=cfg.gamma) for agent_id in range(self.num_agents)
        ]
        self.episode_rewards = list()
        self.episode = 0
        self.episode_reward = 0
        self.populate(cfg.warm_start_steps)
        self.reset()
        if not os.path.exists(os.path.join(os.getcwd(), 'saved_weights')):
            os.mkdir(os.path.join(os.getcwd(), 'saved_weights'))

    def populate(self, steps=1000):
        # Fill the replay buffer with `steps` transitions before training starts.
        states = self.env.reset()
        for i in range(steps):
            actions = self.get_actions(states)
            next_states, rewards, dones, _ = self.env.step(actions)
            self.replay_buffer.push(states, actions, rewards, next_states, dones)
            states = next_states

    def reset(self):
        self.states = self.env.reset()
        self.step = 0
        self.episode_reward = 0

    def forward(self):
        pass

    def training_step(self, batch, batch_idx, optimizer_idx):
        # Execution phase: interact with the environment once per optimizer cycle.
        if optimizer_idx == 0:
            # Checkpoint the first agent's networks at a few fixed epochs.
            if self.current_epoch in (3000, 10000, 30000):
                torch.save(self.agents[0].actor.state_dict(),
                           f'./saved_weights/actor_{self.current_epoch}.weights')
                torch.save(self.agents[0].critic.state_dict(),
                           f'./saved_weights/critic_{self.current_epoch}.weights')

            actions = self.get_actions(self.states)
            next_states, rewards, dones, _ = self.env.step(actions)
            self.episode_reward += np.mean(rewards)

            if all(dones) or self.step == cfg.max_episode_len - 1:
                dones = [1 for _ in range(self.num_agents)]
                self.replay_buffer.push(self.states, actions, rewards, next_states, dones)
                self.episode_rewards.append(self.episode_reward)
                print(
                    f"global_step: {self.global_step} | episode: {self.episode} | "
                    f"step: {self.step} | reward: {np.round(self.episode_reward, decimals=4)}"
                )
                self.logger.experiment.add_scalar('episode_reward',
                                                  self.episode_reward, self.episode)
                self.episode += 1
                self.reset()
            else:
                dones = [0 for _ in range(self.num_agents)]
                self.replay_buffer.push(self.states, actions, rewards, next_states, dones)
                self.states = next_states
                self.step += 1

        # Training phase: optimizer_idx selects an (agent, critic/actor) pair.
        obs_batch, indiv_action_batch, indiv_reward_batch, next_obs_batch, \
            global_state_batch, global_actions_batch, global_next_state_batch, \
            done_batch = batch

        agent_idx = optimizer_idx // 2
        obs_batch_i = obs_batch[agent_idx]
        indiv_action_batch_i = indiv_action_batch[agent_idx]
        indiv_reward_batch_i = indiv_reward_batch[agent_idx]

        # Joint next action from the target policies; each agent's action is computed
        # from that agent's own next observations (the original reused agent_idx's).
        next_global_actions = list()
        for j, agent in enumerate(self.agents):
            if self.on_gpu:
                next_obs_batch_j = torch.cuda.FloatTensor(next_obs_batch[j].float())
            else:
                next_obs_batch_j = torch.FloatTensor(next_obs_batch[j].float())
            indiv_next_action = agent.actor(next_obs_batch_j)
            indiv_next_action = [
                agent.onehot_from_logits(indiv_next_action_j)
                for indiv_next_action_j in indiv_next_action
            ]
            indiv_next_action = torch.stack(indiv_next_action)
            next_global_actions.append(indiv_next_action)

            # Periodic soft update of the target networks.
            if self.global_step % cfg.sync_rate == 0:
                agent.target_update()
        next_global_actions = torch.cat(next_global_actions, 1)

        if self.on_gpu:
            indiv_reward_batch_i = torch.cuda.FloatTensor(indiv_reward_batch_i.float())
            obs_batch_i = torch.cuda.FloatTensor(obs_batch_i.float())
            global_state_batch = torch.cuda.FloatTensor(global_state_batch.float())
            global_next_state_batch = torch.cuda.FloatTensor(global_next_state_batch.float())
        else:
            indiv_reward_batch_i = torch.FloatTensor(indiv_reward_batch_i.float())
            obs_batch_i = torch.FloatTensor(obs_batch_i.float())
            global_state_batch = torch.FloatTensor(global_state_batch.float())
            global_next_state_batch = torch.FloatTensor(global_next_state_batch.float())
        indiv_reward_batch_i = indiv_reward_batch_i.view(indiv_reward_batch_i.size(0), 1)

        if optimizer_idx % 2 == 0:
            # Critic update: TD target from the target critic and target policies.
            curr_Q = self.agents[agent_idx].critic(global_state_batch, global_actions_batch)
            next_Q = self.agents[agent_idx].critic_target(global_next_state_batch,
                                                          next_global_actions)
            estimated_Q = indiv_reward_batch_i + cfg.gamma * next_Q
            critic_loss = self.loss_function(curr_Q, estimated_Q.detach())
            return {'loss': critic_loss, 'log': {'train_critic_loss': critic_loss}}
        else:
            # Actor update. Note: gradients do not flow through global_actions_batch;
            # a full MADDPG actor update would substitute this agent's current policy
            # output into the joint action. Gradient clipping is left to the Trainer
            # (gradient_clip_val), since backward() runs after training_step returns.
            policy_loss = -self.agents[agent_idx].critic(global_state_batch,
                                                         global_actions_batch).mean()
            curr_pol_out = self.agents[agent_idx].actor(obs_batch_i)
            policy_loss += (curr_pol_out ** 2).mean() * 1e-3  # action regularizer
            return {'loss': policy_loss, 'log': {'train_policy_loss': policy_loss}}

    def loss_function(self, curr_Q, estimated_Q):
        criterion = nn.MSELoss()
        return criterion(curr_Q, estimated_Q)

    def configure_optimizers(self):
        # Two optimizers per agent: critic first, then actor.
        optim_list = list()
        for agent in self.agents:
            optim_list.extend([agent.critic_optimizer, agent.actor_optimizer])
        return optim_list

    def train_dataloader(self):
        dataset = RLDataset(self.replay_buffer, cfg.batch_size)
        return DataLoader(dataset=dataset, batch_size=cfg.batch_size, num_workers=0)

    def get_actions(self, states):
        actions = []
        for i in range(self.num_agents):
            action = self.agents[i].get_action(states[i])
            actions.append(action)
        return actions
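# A minimal training-entry sketch for the LightningModule above; assumes a
# pytorch_lightning version with multi-optimizer training_step support and that cfg
# is importable. max_epochs=30001 is illustrative (it covers the last checkpoint
# epoch), and gradient_clip_val is the idiomatic Lightning way to apply the 0.5
# clipping the original attempted inside training_step.
if __name__ == "__main__":
    model = MADDPG()
    trainer = pl.Trainer(max_epochs=30001, gradient_clip_val=0.5)
    trainer.fit(model)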