Example #1
    def train(self):
        # accumulate the four loss components over every recorded episode
        loss = np.array([0., 0., 0., 0.])
        for ep in self.recorder:
            # train on the episode up to its current step, using the
            # discounted rewards-to-go as the return signal
            loss += self._train(
                ep.screen_input[:ep.current_step],
                ep.action_input[:ep.current_step],
                ep.unit_input[:ep.current_step],
                get_discounted_rewards(ep.rewards[:ep.current_step],
                                       discount_rate=DISCOUNT_RATE),
                ep.nonspatial_action[:ep.current_step],
                ep.spatial_action[:ep.current_step],
                ep.screen_used[:ep.current_step])
        # return the average loss over the recorded episodes
        return loss / len(self.recorder)
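
Example #1 calls a get_discounted_rewards helper that is not shown. Below is a minimal sketch of what such a rewards-to-go computation might look like; the body is an assumption, only the name and the discount_rate keyword appear in the call above (Examples #2 and #3 call a similar helper from utils that takes gamma instead of discount_rate).

import numpy as np


def get_discounted_rewards(rewards, discount_rate):
    # hypothetical sketch: each step receives its own reward plus the
    # discounted sum of all rewards that follow it (rewards-to-go)
    discounted = np.zeros(len(rewards), dtype=np.float64)
    running_sum = 0.0
    for t in reversed(range(len(rewards))):
        running_sum = rewards[t] + discount_rate * running_sum
        discounted[t] = running_sum
    return discounted
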
Example #2
    # assumes module-level imports of numpy as np, torch, collections.deque,
    # torch.distributions.Categorical, torch.nn.functional.one_hot and
    # log_softmax, plus a constants module C and a utils helper module
    def get_next_batch(self, env):

        for _ in range(C.NUM_EPOCHS):

            epoch_logits = torch.empty(size=(0, self.action_space_size),
                                       device=self.DEVICE)
            epoch_weighted_log_probs = torch.empty(size=(0, ),
                                                   dtype=torch.float,
                                                   device=self.DEVICE)
            total_rewards = deque([], maxlen=C.BATCH_SIZE_PER_THREAD)

            episode_counter = 0

            while episode_counter < C.BATCH_SIZE_PER_THREAD:

                episode_counter += 1

                # reset the environment to a random initial state at the start of every episode
                state = env.reset()

                # initialize the episode arrays
                episode_actions = torch.empty(size=(0, ),
                                              dtype=torch.long,
                                              device=self.DEVICE)
                episode_logits = torch.empty(size=(0, self.action_space_size),
                                             device=self.DEVICE)
                average_rewards = np.empty(shape=(0, ), dtype=np.float64)
                episode_rewards = np.empty(shape=(0, ), dtype=np.float64)

                # episode loop
                for step_index in range(0, C.max_simulation_length):

                    # get the action logits from the agent - (preferences)
                    action_logits = self.m(
                        torch.tensor(state).float().unsqueeze(dim=0).to(
                            self.DEVICE))

                    # append the logits to the episode logits list
                    episode_logits = torch.cat((episode_logits, action_logits),
                                               dim=0)

                    # sample an action according to the action distribution
                    action = Categorical(logits=action_logits).sample()

                    # append the action to the episode action list to obtain the trajectory
                    # we need to store the actions and logits so that we can later calculate the gradient of the performance
                    episode_actions = torch.cat((episode_actions, action),
                                                dim=0)

                    # take the chosen action, observe the reward and the next state
                    state, reward, done, _ = env.step(
                        action=action.cpu().item())

                    # append the reward to the rewards pool that we collect during the episode
                    # we need the rewards so we can calculate the weights for the policy gradient
                    # and the running-average baseline
                    episode_rewards = np.concatenate(
                        (episode_rewards, np.array([reward])), axis=0)

                    # here the average reward is state specific
                    average_rewards = np.concatenate(
                        (average_rewards,
                         np.expand_dims(np.mean(episode_rewards), axis=0)),
                        axis=0)

                # turn the rewards we accumulated during the episode into the rewards-to-go:
                # earlier actions are credited with more of the reward than actions taken later
                discounted_rewards_to_go = utils.get_discounted_rewards(
                    rewards=episode_rewards, gamma=C.GAMMA)
                discounted_rewards_to_go -= average_rewards  # baseline - state specific average

                # calculate the sum of the rewards for the running average metric
                sum_of_rewards = np.sum(episode_rewards)

                # after each episode append the sum of total rewards to the deque
                total_rewards.append(sum_of_rewards)

                # set the mask for the actions taken in the episode
                mask = one_hot(episode_actions,
                               num_classes=self.action_space_size)

                # calculate the log-probabilities of the taken actions
                # the mask keeps only the log-probabilities of the actions that were actually taken
                episode_log_probs = torch.sum(
                    mask.float() * log_softmax(episode_logits, dim=1), dim=1)

                # weight the episode log-probabilities by the rewards-to-go
                episode_weighted_log_probs = episode_log_probs * \
                    torch.tensor(discounted_rewards_to_go).float().to(self.DEVICE)

                # calculate the sum over trajectory of the weighted log-probabilities
                sum_weighted_log_probs = torch.sum(
                    episode_weighted_log_probs).unsqueeze(dim=0)

                # append the weighted log-probabilities of actions
                epoch_weighted_log_probs = torch.cat(
                    (epoch_weighted_log_probs, sum_weighted_log_probs), dim=0)

                # append the logits - needed for the entropy bonus calculation
                epoch_logits = torch.cat((epoch_logits, episode_logits), dim=0)

            # after the batch is complete, calculate the loss over the whole
            # epoch, including the entropy bonus scaled by BETA
            loss, entropy = utils.calculate_loss(
                C.BETA,
                epoch_logits=epoch_logits,
                weighted_log_probs=epoch_weighted_log_probs)

            yield loss, total_rewards
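
Example #2 delegates the loss to utils.calculate_loss, which is not shown. A minimal sketch, assuming the usual policy-gradient objective plus an entropy bonus scaled by beta; only the call signature appears in the example, so the body below is an assumption.

import torch
from torch.nn.functional import log_softmax, softmax


def calculate_loss(beta, epoch_logits, weighted_log_probs):
    # policy-gradient term: minimize the negative mean of the
    # reward-weighted log-probabilities
    policy_loss = -1 * torch.mean(weighted_log_probs)

    # mean entropy of the action distributions over the epoch; subtracting
    # beta * entropy from the loss encourages exploration
    p = softmax(epoch_logits, dim=1)
    log_p = log_softmax(epoch_logits, dim=1)
    entropy = -1 * torch.mean(torch.sum(p * log_p, dim=1), dim=0)
    entropy_bonus = -1 * beta * entropy

    return policy_loss + entropy_bonus, entropy
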
Example #3
import numpy as np
import torch
from torch.distributions import Categorical
from torch.nn.functional import log_softmax, one_hot

import utils


def play_episode(environment, device, action_space_size, agent, gamma,
                 episode: int):
    """
            Plays an episode of the environment.
            episode: the episode counter
            Returns:
                sum_weighted_log_probs: the sum of the log-prob of an action multiplied by the reward-to-go from that state
                episode_logits: the logits of every step of the episode - needed to compute entropy for entropy bonus
                finished_rendering_this_epoch: pass-through rendering flag
                sum_of_rewards: sum of the rewards for the episode - needed for the average over 200 episode statistic
        """

    # roll the episode out on the CPU regardless of the device that was passed in
    agent.to('cpu')
    device = 'cpu'

    # reset the environment to a random initial state
    state = environment.reset()

    # initialize the episode arrays
    episode_actions = torch.empty(size=(0, ), dtype=torch.long, device=device)
    episode_logits = torch.empty(size=(0, action_space_size), device=device)
    average_rewards = np.empty(shape=(0, ), dtype=np.float64)
    episode_rewards = np.empty(shape=(0, ), dtype=np.float64)

    # episode loop
    while True:

        # get the action logits from the agent - (preferences)
        action_logits = agent(
            torch.tensor(state).float().unsqueeze(dim=0).to(device))

        # append the logits to the episode logits list
        episode_logits = torch.cat((episode_logits, action_logits), dim=0)

        # sample an action according to the action distribution
        action = Categorical(logits=action_logits).sample()

        # append the action to the episode action list to obtain the trajectory
        # we need to store the actions and logits so that we can later calculate the gradient of the performance
        episode_actions = torch.cat((episode_actions, action), dim=0)

        # take the chosen action, observe the reward and the next state
        state, reward, done, _ = environment.step(action=action.cpu().item())

        # append the reward to the rewards pool that we collect during the episode
        # we need the rewards so we can calculate the weights for the policy gradient
        # and the running-average baseline
        episode_rewards = np.concatenate((episode_rewards, np.array([reward])),
                                         axis=0)

        # here the average reward is state specific
        average_rewards = np.concatenate(
            (average_rewards, np.expand_dims(np.mean(episode_rewards),
                                             axis=0)),
            axis=0)

        # the episode is over
        if done:

            # increment the episode
            episode += 1

            # turn the rewards we accumulated during the episode into the rewards-to-go:
            # earlier actions are credited with more of the reward than actions taken later
            discounted_rewards_to_go = utils.get_discounted_rewards(
                rewards=episode_rewards, gamma=gamma)
            discounted_rewards_to_go -= average_rewards  # baseline - state specific average

            # calculate the sum of the rewards for the running average metric
            sum_of_rewards = np.sum(episode_rewards)

            # set the mask for the actions taken in the episode
            mask = one_hot(episode_actions,
                           num_classes=environment.action_space.n)

            # calculate the log-probabilities of the taken actions
            # the mask keeps only the log-probabilities of the actions that were actually taken
            episode_log_probs = torch.sum(mask.float() *
                                          log_softmax(episode_logits, dim=1),
                                          dim=1)

            # weight the episode log-probabilities by the rewards-to-go
            episode_weighted_log_probs = episode_log_probs * \
                torch.tensor(discounted_rewards_to_go).float().to(device)

            # calculate the sum over trajectory of the weighted log-probabilities
            sum_weighted_log_probs = torch.sum(
                episode_weighted_log_probs).unsqueeze(dim=0)

            # the rollout ran on the CPU, so the returned tensors are already on
            # the right device and no transfer is needed before returning

            return sum_weighted_log_probs, episode_logits, sum_of_rewards, episode
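
A usage sketch for Example #3, assuming the classic Gym step/reset API that play_episode itself uses; the environment name, the policy network, the optimizer, and the hyperparameters below are placeholders rather than values from the original code.

import gym
import torch

# placeholder environment and policy network with a discrete action space
env = gym.make('CartPole-v1')
agent = torch.nn.Sequential(
    torch.nn.Linear(env.observation_space.shape[0], 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, env.action_space.n))
optimizer = torch.optim.Adam(agent.parameters(), lr=1e-3)

episode = 0
while episode < 500:
    optimizer.zero_grad()
    batch_weighted_log_probs = []
    batch_sums_of_rewards = []

    # collect a small batch of episodes before taking one gradient step
    for _ in range(10):
        weighted_log_probs, episode_logits, sum_of_rewards, episode = play_episode(
            environment=env, device='cpu',
            action_space_size=env.action_space.n,
            agent=agent, gamma=0.99, episode=episode)
        batch_weighted_log_probs.append(weighted_log_probs)
        batch_sums_of_rewards.append(sum_of_rewards)

    # REINFORCE loss: negative mean of the reward-weighted log-probabilities
    loss = -torch.mean(torch.cat(batch_weighted_log_probs))
    loss.backward()
    optimizer.step()
    print(f'episode {episode}: mean batch reward '
          f'{sum(batch_sums_of_rewards) / len(batch_sums_of_rewards):.2f}')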