  def test_zero_step(self):
    self.memory = ReplayMemory(capacity=10, multi_step_n=0)
    for i in range(5):
      a = Transition([0, 1, 2, i], 0, [4, 5, 6, i*i], 1, False)
      self.memory.push(a)
    final = Transition([0, 1, 2, 10], 0, [4, 5, 6, 100], 10, True)
    self.memory.push(final)
    self.assertEqual(self.memory.memory[0].r, 1)
    self.assertEqual(self.memory.memory[3].r, 1)
    self.assertEqual(self.memory.memory[4].r, 1)
    self.assertEqual(self.memory.memory[5].r, 10)
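
These tests assume a five-field Transition record and a fixed-capacity ReplayMemory. A minimal sketch of that assumed interface, with field names guessed from the `.r` accesses above and the `(s, a, s1, r, done)` unpacking in test_sample further down (the real classes add multi-step returns, priorities, and sampling on top of this):

from collections import namedtuple

# Hypothetical definitions; field and attribute names are inferred from the tests.
Transition = namedtuple('Transition', ('s', 'a', 's1', 'r', 'done'))

class ReplayMemory:
    def __init__(self, capacity, multi_step_n=0):
        self.capacity = capacity
        self.multi_step_n = multi_step_n
        self.memory = []
        self.position = 0

    def push(self, transition):
        # Circular buffer: overwrite the oldest entry once capacity is reached,
        # so len(self.memory) never exceeds self.capacity (see test_append).
        if len(self.memory) < self.capacity:
            self.memory.append(transition)
        else:
            self.memory[self.position] = transition
        self.position = (self.position + 1) % self.capacity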
Example #2
 def run_episode(self):
     """
     Train an NEC agent for a single episode:
         Interact with environment
         Append (state, action, reward) transitions to transition queue
         Call update at the end of the episode
     """
     if self.epsilon > self.final_epsilon:
         self.epsilon = self.epsilon * self.epsilon_decay
     state = self.env.reset()
     if self.environment_type == 'fourrooms':
         fewest_steps = self.env.shortest_path_length(self.env.state)
     total_steps = 0
     total_reward = 0
     total_frames = 0
     done = False
     while not done:
         state_embedding = torch.tensor(state).permute(2, 0, 1)  # (C,H,W)
         state_embedding = state_embedding.unsqueeze(0).to(self.device)
         state_embedding = self.cnn(state_embedding)
         action = self.choose_action(state_embedding)
         next_state, reward, done, _ = self.env.step(action)
         self.transition_queue.append(Transition(state, action, reward))
         total_reward += reward
         total_frames += self.env.skip
         total_steps += 1
         state = next_state
     self.update()
     if self.environment_type == 'fourrooms':
         n_extra_steps = total_steps - fewest_steps
         return n_extra_steps, total_frames, total_reward
     else:
         return total_frames, total_reward
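
choose_action is not shown in this example; given the epsilon bookkeeping above, it is presumably an epsilon-greedy policy over per-action Q estimates. A generic sketch of that selection rule, not the repository's actual implementation:

import random

import torch

def epsilon_greedy_action(q_values, epsilon, n_actions):
    # Explore with probability epsilon, otherwise act greedily on the
    # 1-D tensor of per-action Q estimates for the current state.
    if random.random() < epsilon:
        return random.randint(0, n_actions - 1)
    return int(torch.argmax(q_values).item())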
Example #3
    def optimize_model(self):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        # This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        next_states_batch = torch.cat(batch.next_state).view(
            self.batch_size, -1).to(self.device)
        state_batch = torch.cat(batch.state).view(self.batch_size,
                                                  -1).to(self.device)
        action_batch = torch.cat(batch.action).view(self.batch_size,
                                                    -1).to(self.device)
        reward_batch = torch.cat(batch.reward).view(self.batch_size,
                                                    -1).to(self.device)

        # Compute loss
        loss = self._compute_loss(state_batch, action_batch, next_states_batch,
                                  reward_batch)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()

        # clip grad
        if self.grad_clip is not None:
            for param in self.policy_net.parameters():
                param.grad.data.clamp_(-self.grad_clip, self.grad_clip)

        # update Policy net weights
        self.optimizer.step()

        # update Target net weights
        self._update_target()
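
The `Transition(*zip(*transitions))` idiom above (and in several later examples) transposes a list of Transition records into a single Transition whose fields are tuples of batch elements, which is what makes the per-field `torch.cat` calls possible. A small standalone illustration with a simplified four-field namedtuple:

from collections import namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

transitions = [
    Transition(state=1, action=0, next_state=2, reward=0.0),
    Transition(state=2, action=1, next_state=3, reward=1.0),
]

# zip(*transitions) regroups the records field by field; the outer * unpacks
# those groups back into the namedtuple constructor.
batch = Transition(*zip(*transitions))
print(batch.state)   # (1, 2)
print(batch.reward)  # (0.0, 1.0)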
Example #4
  def test_update(self):
    for i in range(10):
      a = Transition([0, 1, 2, i], 0, [4, 5, 6, i*i], 0, True)
      self.memory.push(a)

    self.memory.update([1, 3], [2, 5])
    self.assertEqual(self.memory.errors[1], 2.1)
    self.assertEqual(self.memory.errors[3], 5.1)
Example #5
    def optimize_policy_model(self):
        """
        Perform a single step of optimization for the policy model.
        :return: the TD loss for the sampled batch, or None if the memory holds fewer than batch_size transitions
        """
        if self.memory.length() < self.batch_size:
            return
        # sample a batch
        transitions = self.memory.sample_batch(self.batch_size)

        one_batch = Transition(*zip(*transitions))

        # create a mask of non-final states
        non_final_mask = torch.tensor(tuple(
            map(lambda s: s is not None, one_batch.next_state)),
                                      device=self.device,
                                      dtype=torch.uint8)  # [128]
        non_final_next_states = torch.cat([
            s for s in one_batch.next_state if s is not None
        ])  # [< 128, 3, 40, 80]

        # concatenate all batch elements into one
        state_batch = torch.cat(one_batch.state)  # [128, 3, 40, 80]
        action_batch = torch.cat(one_batch.action)  # [128, 1]
        reward_batch = torch.cat(one_batch.reward)  # [128]

        state_batch = state_batch.to(self.device)
        non_final_next_states = non_final_next_states.to(self.device)

        curr_state_values = self.policy_model(state_batch)  # [128, 2]
        curr_state_action_values = curr_state_values.gather(
            1, action_batch)  # [128, 1]

        # Get V(s_{t+1}) for all next states. By definition we set V(s)=0 if s is a terminal state.
        next_state_values = torch.zeros(self.batch_size,
                                        device=self.device)  # [128]
        next_state_values[non_final_mask] = self.target_model(
            non_final_next_states).max(1)[0].detach()  # [< 128]

        # Get the expected Q values
        expected_state_action_values = (
            next_state_values * self.config.gamma) + reward_batch  # [128]
        # compute loss: temporal difference error
        loss = self.loss(curr_state_action_values,
                         expected_state_action_values.unsqueeze(1))

        # optimizer step
        self.optim.zero_grad()
        loss.backward()
        for param in self.policy_model.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optim.step()

        return loss
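
The terminal-state handling above works by boolean indexing: next-state values default to zero and only the non-final positions are overwritten, so the TD target r + gamma * max_a Q_target(s', a) reduces to r for transitions that ended the episode. A minimal standalone illustration with made-up numbers:

import torch

gamma = 0.99
batch_size = 4

rewards = torch.tensor([1.0, 0.0, 1.0, 0.5])
# True where the transition did NOT end the episode.
non_final_mask = torch.tensor([True, False, True, True])
# One bootstrapped max_a Q_target(s', a) value per non-final transition.
non_final_next_values = torch.tensor([2.0, 3.0, 1.0])

next_state_values = torch.zeros(batch_size)
next_state_values[non_final_mask] = non_final_next_values

targets = rewards + gamma * next_state_values
print(targets)  # tensor([2.9800, 0.0000, 3.9700, 1.4900])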
Example #6
  def test_sample(self):
    for i in range(10):
      a = Transition([0, 1, 2, i], 0, [4, 5, 6, i*i], 0, True)
      self.memory.push(a)

    s, a, s1, r, done = self.memory.sample(2)
    self.assertEqual(s.shape, (2, 4))
    self.assertEqual(a.shape, (2, 1))
    self.assertEqual(s1.shape, (2, 4))
    self.assertEqual(r.shape, (2, 1))
    self.assertEqual(done.shape, (2, 1))
Example #7
    def warmup(self):
        """
        Warmup the DND with values from an episode with a random policy
        """
        state = self.env.reset()
        total_reward = 0
        total_frames = 0
        done = False
        while not done:
            action = random.randint(0, self.env.action_space.n - 1)
            next_state, reward, done, _ = self.env.step(action)
            total_reward += reward
            total_frames += self.env.skip
            self.transition_queue.append(Transition(state, action, reward))
            state = next_state

        for t in range(len(self.transition_queue)):
            tr = self.transition_queue[t]
            state_embedding = torch.tensor(tr.state).permute(2, 0,
                                                             1)  # (C,H,W)
            state_embedding = state_embedding.unsqueeze(0).to(self.device)
            state_embedding = self.cnn(state_embedding)
            action = tr.action
            dnd = self.dnd_list[action]

            Q_N = self.Q_lookahead(t, True).to(self.device)
            if dnd.keys_to_be_inserted is None and dnd.keys is None:
                dnd.insert(state_embedding, Q_N.detach().unsqueeze(0))
            else:
                embedding_index = dnd.get_index(state_embedding)
                if embedding_index is None:
                    state_embedding = state_embedding.detach()
                    dnd.insert(state_embedding, Q_N.detach().unsqueeze(0))
                else:
                    Q = self.Q_update(dnd.values[embedding_index], Q_N)
                    dnd.update(Q.detach(), embedding_index)
            self.replay_memory.push(tr.state, action, Q_N.detach())
        for dnd in self.dnd_list:
            dnd.commit_insert()
        # Clear out transition queue
        self.transition_queue = []
        return total_frames, total_reward
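
Q_lookahead is not shown here; in the NEC setup it is an N-step return estimate: the discounted sum of the next N on-policy rewards plus a bootstrapped value N steps ahead (or just the discounted reward sum when the episode ends first). A simplified sketch of that computation, separate from the agent's actual bookkeeping:

def n_step_return(rewards, bootstrap_value, gamma):
    """Discounted sum of the next len(rewards) rewards plus a bootstrapped tail.

    rewards: the rewards r_t, ..., r_{t+N-1} taken from the transition queue.
    bootstrap_value: an estimate of max_a Q(s_{t+N}, a), or 0.0 past episode end.
    """
    g = bootstrap_value
    for r in reversed(rewards):
        g = r + gamma * g
    return g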
Example #8
def train_policy_with_a_batch(replay_memory, policy, target, batch_size,
                              optimizer, gamma):

    if len(replay_memory) < batch_size * 10:
        return

    transitions = replay_memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    non_final_mask = torch.tensor(list(
        map(lambda x: x is not None, batch.next_state)),
                                  device=device,
                                  dtype=torch.uint8)
    non_final_next_states = torch.cat(
        [s for s in batch.next_state if s is not None])

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action).unsqueeze(1)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken
    state_action_values = policy(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    next_state_values = torch.zeros(batch_size, device=device)
    next_state_values[non_final_mask] = target(non_final_next_states).max(
        1)[0].detach()

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * gamma) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
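
The element-wise `param.grad.data.clamp_(-1, 1)` loop above (also used in the earlier optimize functions) has a built-in equivalent in PyTorch, which clips every gradient value into [-clip_value, clip_value] in place:

import torch

# Drop-in replacement for the manual clamp loop over policy.parameters().
torch.nn.utils.clip_grad_value_(policy.parameters(), clip_value=1.0)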
Example #9
 def episode(self):
     """
     Train an NEC agent for a single episode
     Interact with environment on-policy and append all (state, action, reward) transitions to transition queue
     Call update at the end of every episode
     """
     if self.epsilon > self.final_epsilon:
         self.epsilon = self.epsilon * self.epsilon_decay
     state = self.env.reset()
     total_reward = 0
     done = False
     while not done:
         state_embedding = self.embedding_network(
             Variable(Tensor(state)).unsqueeze(0))
         #action = self.choose_action(state_embedding)
         next_state, reward, done, action = self.env.step()
         self.transition_queue.append(Transition(state, action, reward))
         total_reward += reward
         state = next_state
     self.update()
     return total_reward
Example #10
 def warmup(self):
     """
     Warmup the DND with values from an episode with a random policy
     """
     state = self.env.reset()
     total_reward = 0
     done = False
     while not done:
         #action = random.randint(0, self.env.action_space_n - 1)
         next_state, reward, done, action = self.env.step()
         total_reward += reward
         self.transition_queue.append(Transition(state, action, reward))
         state = next_state
 
     for t in range(len(self.transition_queue)):
         transition = self.transition_queue[t]
         state = Variable(Tensor(transition.state)).unsqueeze(0)
         action = transition.action
         state_embedding = self.embedding_network(state)
         dnd = self.dnd_list[action]
 
         Q_N = self.Q_lookahead(t, True)
         if dnd.keys_to_be_inserted is None and dnd.keys is None:
             dnd.insert(state_embedding, Q_N.detach().unsqueeze(0))
         else:
             embedding_index = dnd.get_index(state_embedding)
             if embedding_index is None:
                 dnd.insert(state_embedding.detach(), Q_N.detach().unsqueeze(0))
             else:
                 Q = self.Q_update(dnd.values[embedding_index], Q_N)
                 dnd.update(Q.detach(), embedding_index)
         self.replay_memory.push(transition.state, action, Q_N)
     for dnd in self.dnd_list:
         dnd.commit_insert()
     # Clear out transition queue
     self.transition_queue = []
     return total_reward
Example #11
            mask = torch.Tensor([done]).to(device)
            reward = torch.Tensor([reward]).to(device)
            next_state = torch.Tensor([next_state]).to(device)

            memory.push(state, action, mask, next_state, reward)

            state = next_state

            epoch_value_loss = 0
            epoch_policy_loss = 0

            if len(memory) > args.batch_size:
                transitions = memory.sample(args.batch_size)
                # Transpose the batch
                # (see http://stackoverflow.com/a/19343/3343043 for detailed explanation).
                batch = Transition(*zip(*transitions))

                # Update actor and critic according to the batch
                value_loss, policy_loss = agent.update_params(batch)

                epoch_value_loss += value_loss
                epoch_policy_loss += policy_loss

            if done:
                break

        rewards.append(epoch_return)
        value_losses.append(epoch_value_loss)
        policy_losses.append(epoch_policy_loss)
        writer.add_scalar('epoch/return', epoch_return, epoch)
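
agent.update_params is not shown in this fragment; in DDPG-style agents it fits the critic against a bootstrapped target and then soft-updates the target networks toward the online ones. A sketch of the soft-update step alone (tau and the network handles are placeholders, not this repository's code):

def soft_update(target, source, tau):
    # target_param <- tau * source_param + (1 - tau) * target_param
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)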
Example #12
    def run_loop(self, env, max_frames=0):
        """A run loop to have agents and an environment interact."""
        total_frames = 0
        start_time = time.time()

        action_spec = env.action_spec()
        observation_spec = env.observation_spec()

        self.setup(observation_spec, action_spec)

        try:
            while True:
                obs = env.reset()[0]
                # remove unit selection from the equation by selecting the friendly on every new game.
                select_friendly = self.select_friendly_action(obs)
                obs = env.step([select_friendly])[0]
                # distance = self.get_reward(obs.observation["screen"])

                self.reset()

                while True:
                    total_frames += 1

                    self._screen = obs.observation["screen"][5]
                    s = np.expand_dims(obs.observation["screen"][5], 0)
                    # plt.imshow(s[5])
                    # plt.pause(0.00001)
                    if max_frames and total_frames >= max_frames:
                        print("max frames reached")
                        return
                    if obs.last():
                        print("total frames:", total_frames, "Epsilon:",
                              self._epsilon.value())
                        self._epsilon.increment()
                        break

                    action = self.get_action(s)
                    env_actions = self.get_env_action(action, obs)
                    obs = env.step([env_actions])[0]

                    r = obs.reward
                    s1 = np.expand_dims(obs.observation["screen"][5], 0)
                    done = r > 0
                    if self._epsilon.isTraining:
                        transition = Transition(s, action, s1, r, done)
                        self._memory.push(transition)

                    if total_frames % self.train_q_per_step == 0 and total_frames > self.steps_before_training and self._epsilon.isTraining:
                        self.train_q()
                        # pass

                    if total_frames % self.target_q_update_frequency == 0 and total_frames > self.steps_before_training and self._epsilon.isTraining:
                        self._Qt = copy.deepcopy(self._Q)
                        self.show_chart()

                    if total_frames % 1000 == 0 and total_frames > self.steps_before_training and self._epsilon.isTraining:
                        self.show_chart()

                    if not self._epsilon.isTraining and total_frames % 3 == 0:
                        self.show_chart()

        except KeyboardInterrupt:
            pass
        finally:
            print("finished")
            elapsed_time = time.time() - start_time
            print("Took %.3f seconds for %s steps: %.3f fps" %
                  (elapsed_time, total_frames, total_frames / elapsed_time))
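
The `self._epsilon` object above only needs to expose `value()`, `increment()`, and an `isTraining` flag; its implementation is not part of this example. A minimal stand-in consistent with that usage (the linear decay schedule and its constants are guesses):

class Epsilon:
    """Hypothetical schedule matching the calls in run_loop: value()/increment()/isTraining."""

    def __init__(self, start=1.0, end=0.1, decay_steps=1000):
        self._end = end
        self._step = (start - end) / decay_steps
        self._value = start
        self.isTraining = True

    def value(self):
        return self._value

    def increment(self):
        # Called once per finished game; decay toward the floor value.
        self._value = max(self._end, self._value - self._step)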
Example #13
 def load_game_from_replay_memory():
     transitions = replay_memory.sample(batch_size)
     batch = Transition(*zip(*transitions))
     return batch
Example #14
  def test_append(self):
    for i in range(20):
      a = Transition([0, 1, 2, 3], 0, [4, 5, 6, 7], 0, True)
      self.memory.push(a)
    self.assertEqual(len(self.memory.memory), 10)