Example #1
        team = 'HELIOS19'
    elif team == 'robocin':
        team = 'RoboCIn'
    agent = DDPGAgent(DDPG,
                      False,
                      team=team,
                      port=6000,
                      num_agents=int(num_agents),
                      num_ops=int(num_agents))
    processes = []
    agent.ddpg.actor.share_memory()
    agent.ddpg.target_actor.share_memory()
    agent.ddpg.critic.share_memory()
    agent.ddpg.target_critic.share_memory()
    if agent.gen_mem:
        memories = [MemoryDeque(MEM_SIZE) for _ in range(agent.num_agents)]
    else:
        memories = agent.ddpg.memory
    agent.ddpg.memory = None
    for rank in range(agent.num_agents):
        p = mp.Process(target=run,
                       args=(agent.port, agent.team, agent.actions,
                             agent.rewards, rank, agent.ddpg, memories[rank],
                             agent.test, episodes))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    agent.save_model(bye=True)
    exit(1)
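Example #1 is an excerpt that begins inside the if/elif chain normalizing the team name; it then calls share_memory() on the actor/critic networks so every worker process sees the same parameters, and spawns one process per agent. The run function each process executes is not shown on this page. The sketch below only mirrors the signature implied by the args tuple passed to mp.Process; its body is an assumption, not the project's actual worker.

def run(port, team, actions, rewards, rank, ddpg, memory, test, episodes):
    # hypothetical worker: connect to an environment instance on `port`,
    # act with the shared `ddpg` networks, and push transitions into the
    # per-worker `memory` created by the parent process
    for _ in range(episodes):
        pass  # interact with the environment; skip training when `test` is set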
Example #2
class DDPGTrain(BaseTrain):
    def __init__(self, static_policy=False, env=None,
                 config=None):
        super(DDPGTrain, self).__init__(config=config, env=env)

        self.priority_replay = config.USE_PRIORITY_REPLAY

        self.gamma = config.GAMMA
        self.lr = config.LR
        self.experience_replay_size = config.EXP_REPLAY_SIZE
        self.batch_size = config.BATCH_SIZE
        self.learn_start = config.LEARN_START
        self.priority_beta_start = config.PRIORITY_BETA_START
        self.priority_beta_frames = config.PRIORITY_BETA_FRAMES
        self.priority_alpha = config.PRIORITY_ALPHA
        self.tau = config.tau
        self.static_policy = static_policy
        self.num_feats = env.observation_space.shape
        self.env = env
        # self.writer = SummaryWriter(
        #     f'./saved_agents/DDPG/agent_{self.env.getUnum()}')
        self.declare_networks()
        actor_learning_rate = 1e-4
        critic_learning_rate = 1e-3
        self.num_actor_update_iteration = 0
        self.num_critic_update_iteration = 0
        self.critic_criterion = nn.MSELoss()
        self.actor_optimizer = optim.Adam(
            self.actor.parameters(), lr=actor_learning_rate)
        self.critic_optimizer = optim.Adam(
            self.critic.parameters(), lr=critic_learning_rate)
        self.actor_loss = list()
        self.critic_loss = list()

        # move to correct device
        self.actor = self.actor.to(self.device)
        self.target_actor = self.target_actor.to(self.device)
        self.critic = self.critic.to(self.device)
        self.target_critic = self.target_critic.to(self.device)

        if self.static_policy:
            self.actor.eval()
            self.target_actor.eval()
            self.critic.eval()
            self.target_critic.eval()

        self.update_count = 0

        self.declare_memory()

        self.nsteps = config.N_STEPS
        self.nstep_buffer = []

    def save_w(self, path_models=('./saved_agents/actor.dump',
                                  './saved_agents/critic.dump'),
               path_optims=('./saved_agents/actor_optim.dump',
                            './saved_agents/critic_optim.dump')):
        torch.save(self.actor.state_dict(), path_models[0])
        torch.save(self.critic.state_dict(), path_models[1])
        torch.save(self.actor_optimizer.state_dict(), path_optims[0])
        torch.save(self.critic_optimizer.state_dict(), path_optims[1])

    def load_w(self, path_models=('./saved_agents/actor.dump',
                                  './saved_agents/critic.dump'),
               path_optims=('./saved_agents/actor_optim.dump',
                            './saved_agents/critic_optim.dump')):
        fname_actor = path_models[0]
        fname_critic = path_models[1]
        fname_actor_optim = path_optims[0]
        fname_critic_optim = path_optims[1]

        if os.path.isfile(fname_actor):
            self.actor.load_state_dict(torch.load(fname_actor,
                                                  map_location=self.device))
            for target_param, param in zip(self.target_actor.parameters(),
                                           self.actor.parameters()):
                target_param.data.copy_(param.data)

        if os.path.isfile(fname_critic):
            self.critic.load_state_dict(torch.load(fname_critic,
                                                   map_location=self.device))
            for target_param, param in zip(self.target_critic.parameters(),
                                           self.critic.parameters()):
                target_param.data.copy_(param.data)

        if os.path.isfile(fname_actor_optim):
            self.actor_optimizer.load_state_dict(
                torch.load(fname_actor_optim,
                           map_location=self.device)
            )

        if os.path.isfile(fname_critic_optim):
            self.critic_optimizer.load_state_dict(torch.load(
                fname_critic_optim,
                map_location=self.device)
            )

    def declare_networks(self):
        pass

    def declare_memory(self):
        self.memory = MemoryDeque(self.experience_replay_size)

    def append_to_replay(self, s, a, r, s_, d):
        self.memory.store((s, a, r, s_, d))

    def update(self):
        # one DDPG optimization step on a sampled minibatch
        state, next_state, action, reward, done = self.memory.sample(
            self.batch_size)
        reward = reward.reshape(-1, 1)
        done = done.reshape(-1, 1)
        num_feat = state.shape[1] * state.shape[2]
        # build the batch on the same device as the networks
        state = torch.FloatTensor(np.float32(state)).view(
            self.batch_size, num_feat).to(self.device)
        next_state = torch.FloatTensor(np.float32(next_state)).view(
            self.batch_size, num_feat).to(self.device)
        action = torch.FloatTensor(action).to(self.device)
        reward = torch.FloatTensor(reward).to(self.device)
        done = torch.FloatTensor(done).to(self.device)

        # Compute the target Q value
        acts = self.target_actor(next_state)
        target_Q = self.target_critic(next_state, acts)
        target_Q = reward + (self.gamma * target_Q * (1 - done)).detach()

        # Get current Q estimate
        current_Q = self.critic(state, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q, target_Q)
        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        # self.writer.add_scalar('Loss/ddpg/critic_loss', critic_loss,
        #                     global_step=self.num_critic_update_iteration)
        # self.critic_loss.append(critic_loss)

        # Compute actor loss
        acts = self.actor(state)
        actor_loss = -self.critic(state, acts).mean()

        # Optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # self.writer.add_scalar(
        #     'Loss/ddpg/actor_loss', actor_loss, global_step=self.num_actor_update_iteration)
        # self.actor_loss.append(actor_loss)

        self.num_actor_update_iteration += 1
        self.num_critic_update_iteration += 1
        # Update the frozen target models
        if self.num_critic_update_iteration % 1000 == 0:
            for param, target_param in zip(self.critic.parameters(), self.target_critic.parameters()):
                target_param.data.copy_(
                    self.tau * param.data + (1 - self.tau) * target_param.data)

        if self.num_actor_update_iteration % 1000 == 0:
            for param, target_param in zip(self.actor.parameters(), self.target_actor.parameters()):
                target_param.data.copy_(
                    self.tau * param.data + (1 - self.tau) * target_param.data)

    def get_action(self, s):
        with torch.no_grad():
            # flatten the 2D observation and evaluate the actor on self.device
            num_feat = s.shape[0] * s.shape[1]
            state = torch.from_numpy(s).float().view(1, num_feat)
            state = state.to(self.device)
            action = self.actor(state)
            action = action.cpu().numpy()[0, 0]
            return action
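Example #2 builds the DDPG update on top of BaseTrain; declare_networks is a stub, so a concrete subclass is expected to create actor, target_actor, critic and target_critic before training. A minimal training-loop sketch, assuming such a subclass (MyDDPG is a hypothetical name) and a Gym-style environment:

trainer = MyDDPG(static_policy=False, env=env, config=config)
state = env.reset()
for step in range(total_steps):  # total_steps is illustrative
    action = trainer.get_action(state)
    next_state, reward, done, _ = env.step(action)
    trainer.append_to_replay(state, action, reward, next_state, done)
    if step > trainer.learn_start:  # start learning once the buffer has warmed up
        trainer.update()
    state = env.reset() if done else next_state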
Example #3
    def declare_memory(self):
        if not self.priority_replay:
            self.memory = MemoryDeque(self.experience_replay_size)
        else:
            self.memory = Memory(int(self.experience_replay_size))
Example #4
    def declare_memory(self):
        self.memory = MemoryDeque(self.experience_replay_size)
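Examples #2 through #5 all assume a MemoryDeque replay buffer that exposes store((s, a, r, s_, d)) and a sample(batch_size) returning (state, next_state, action, reward, done) batches. The project's actual class is not shown; the sketch below is a minimal uniform buffer that matches only the calls used in these examples.

import random
from collections import deque

import numpy as np


class MemoryDeque:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def store(self, transition):
        # transition is stored as (s, a, r, s_, d), as in append_to_replay above
        self.buffer.append(transition)

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.array, zip(*batch))
        # returned in the order the examples unpack: s, s_, a, r, d
        return state, next_state, action, reward, done

    def __len__(self):
        return len(self.buffer)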
Example #5
class DuelingTrain(BaseTrain):
    def __init__(self, static_policy=False, env=None,
                 config=None):
        super(DuelingTrain, self).__init__(config=config, env=env)
        self.noisy = config.USE_NOISY_NETS
        self.priority_replay = config.USE_PRIORITY_REPLAY

        self.gamma = config.GAMMA
        self.lr = config.LR
        self.target_net_update_freq = config.TARGET_NET_UPDATE_FREQ
        self.experience_replay_size = config.EXP_REPLAY_SIZE
        self.batch_size = config.BATCH_SIZE
        self.update_freq = config.UPDATE_FREQ
        self.sigma_init = config.SIGMA_INIT
        self.priority_beta_start = config.PRIORITY_BETA_START
        self.priority_beta_frames = config.PRIORITY_BETA_FRAMES
        self.priority_alpha = config.PRIORITY_ALPHA
        self.tau = config.tau

        self.static_policy = static_policy
        self.num_actions = env.action_space.n
        self.env = env

        self.declare_networks()
        self.writer = SummaryWriter(
            f'./saved_agents/DDQN/agent_{self.env.getUnum()}')
        self.losses = list()

        self.target_model.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        # move to correct device
        self.model = self.model.to(self.device)
        self.target_model.to(self.device)

        if self.static_policy:
            self.model.eval()
            self.target_model.eval()

        self.update_count = 0
        self.update_iteration = 0

        self.declare_memory()

        self.nsteps = config.N_STEPS
        self.nstep_buffer = []

    def load_w(self, model_path=None, optim_path=None):
        if model_path is None:
            fname_model = "./saved_agents/model.dump"
        else:
            fname_model = model_path
        if optim_path is None:
            fname_optim = "./saved_agents/optim.dump"
        else:
            fname_optim = optim_path

        if os.path.isfile(fname_model):
            self.model.load_state_dict(torch.load(
                fname_model, map_location=self.device))
            self.target_model.load_state_dict(self.model.state_dict())

        if os.path.isfile(fname_optim):
            self.optimizer.load_state_dict(
                torch.load(fname_optim, map_location=self.device))

    def declare_networks(self):
        pass

    def declare_memory(self):
        self.memory = MemoryDeque(self.experience_replay_size)

    def append_to_replay(self, s, a, r, s_, d):
        self.memory.store((s, a, r, s_, d))

    def compute_td_loss(self):
        state, next_state, action, reward, done = self.memory.sample(
            self.batch_size)
        num_feat = state.shape[1] * state.shape[2]
        # use the configured batch size and build the batch on self.device
        state = torch.FloatTensor(np.float32(state)).view(
            self.batch_size, num_feat).to(self.device)
        next_state = torch.FloatTensor(np.float32(next_state)).view(
            self.batch_size, num_feat).to(self.device)
        action = torch.LongTensor(action).to(self.device)
        reward = torch.FloatTensor(reward).to(self.device)
        done = torch.FloatTensor(done).to(self.device)
        q_values = self.model(state)
        next_q_values = self.target_model(next_state)

        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_values.max(1)[0]
        expected_q_value = reward + self.gamma * next_q_value * (1 - done)

        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss

    def update(self, frame=0):
        loss = self.compute_td_loss()
        unum = self.env.getUnum()
        self.writer.add_scalar(
            f'Loss/loss_{unum}', loss, global_step=self.update_iteration)
        self.losses.append(loss.item())
        self.update_iteration += 1

        self.update_target_model()

    def get_action(self, s, eps=0.1):  # epsilon-greedy action selection
        with torch.no_grad():
            if np.random.uniform() >= eps or self.static_policy or self.noisy:
                X = torch.tensor([s], device=self.device, dtype=torch.float)
                X = X.view(1, -1)
                out = self.model(X)
                maxout = out.argmax()
                return maxout.item()
            else:
                return np.random.randint(0, self.num_actions)

    def update_target_model(self):
        self.update_count += 1
        self.update_count = self.update_count % self.target_net_update_freq
        if self.update_count == 0:
            for param, target_param in zip(self.model.parameters(), self.target_model.parameters()):
                target_param.data.copy_(
                    self.tau * param.data + (1 - self.tau) * target_param.data)
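Example #5 is a dueling-DQN style trainer for discrete actions: get_action is epsilon-greedy and update_target_model applies a Polyak soft update (tau-weighted copy) once every target_net_update_freq calls. A minimal usage sketch, assuming a concrete subclass (MyDuelingDQN is a hypothetical name), a Gym-style environment, and an illustrative epsilon schedule:

trainer = MyDuelingDQN(static_policy=False, env=env, config=config)
state = env.reset()
for frame in range(total_frames):  # total_frames is illustrative
    eps = max(0.05, 1.0 - frame / 10000)  # assumed linear epsilon decay
    action = trainer.get_action(state, eps=eps)
    next_state, reward, done, _ = env.step(action)
    trainer.append_to_replay(state, action, reward, next_state, done)
    if frame > trainer.batch_size:  # crude warm-up: at least one batch stored
        trainer.update(frame)
    state = env.reset() if done else next_state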