from collections import deque

import numpy as np
import torch
from torch.distributions import Categorical
from torch.nn.functional import log_softmax, one_hot

# Project-level names (get_discounted_rewards, DISCOUNT_RATE, utils, C)
# are assumed to be imported from the surrounding package.


def train(self):
    # accumulate the four loss components over every recorded episode
    loss = np.array([0., 0., 0., 0.])
    for ep in self.recorder:
        loss += self._train(
            ep.screen_input[:ep.current_step],
            ep.action_input[:ep.current_step],
            ep.unit_input[:ep.current_step],
            get_discounted_rewards(ep.rewards[:ep.current_step],
                                   discount_rate=DISCOUNT_RATE),
            ep.nonspatial_action[:ep.current_step],
            ep.spatial_action[:ep.current_step],
            ep.screen_used[:ep.current_step])
    # report the average loss per recorded episode
    return loss / len(self.recorder)
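
# `get_discounted_rewards` is not defined in this section. Below is a minimal
# sketch of what it might look like, assuming it maps a reward sequence to
# discounted rewards-to-go; `utils.get_discounted_rewards` used further down
# appears to play the same role with the keyword `gamma` instead of
# `discount_rate`. This is an illustrative reconstruction, not the project's
# actual implementation.
def get_discounted_rewards_sketch(rewards, discount_rate):
    """Return the discounted reward-to-go for each step.

    The reward-to-go at step t is sum_{k >= t} discount_rate**(k - t) * r_k,
    computed in a single backward pass over the rewards.
    """
    discounted = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount_rate * running
        discounted[t] = running
    return discounted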
def get_next_batch(self, env):
    for _ in range(C.NUM_EPOCHS):
        epoch_logits = torch.empty(size=(0, self.action_space_size),
                                   device=self.DEVICE)
        epoch_weighted_log_probs = torch.empty(size=(0,), dtype=torch.float,
                                               device=self.DEVICE)
        total_rewards = deque([], maxlen=C.BATCH_SIZE_PER_THREAD)

        episode_counter = 0
        while episode_counter < C.BATCH_SIZE_PER_THREAD:
            episode_counter += 1

            # reset the environment to a random initial state at the start
            # of every episode
            state = env.reset()

            # initialize the episode arrays
            episode_actions = torch.empty(size=(0,), dtype=torch.long,
                                          device=self.DEVICE)
            episode_logits = torch.empty(size=(0, C.action_space_size),
                                         device=self.DEVICE)
            average_rewards = np.empty(shape=(0,), dtype=np.float64)
            episode_rewards = np.empty(shape=(0,), dtype=np.float64)

            # episode loop
            for step_index in range(0, C.max_simulation_length):
                # get the action logits from the agent (preferences)
                action_logits = self.m(
                    torch.tensor(state).float().unsqueeze(dim=0).to(
                        self.DEVICE))

                # append the logits to the episode logits list
                episode_logits = torch.cat((episode_logits, action_logits),
                                           dim=0)

                # sample an action according to the action distribution
                action = Categorical(logits=action_logits).sample()

                # append the action to the episode action list to obtain the
                # trajectory; we need to store the actions and logits so we
                # can calculate the gradient of the performance
                episode_actions = torch.cat((episode_actions, action), dim=0)

                # take the chosen action, observe the reward and the next state
                state, reward, done, _ = env.step(action=action.cpu().item())

                # append the reward to the rewards pool that we collect during
                # the episode; we need the rewards so we can calculate the
                # weights for the policy gradient and the average baseline
                episode_rewards = np.concatenate(
                    (episode_rewards, np.array([reward])), axis=0)

                # here the average reward is state specific
                average_rewards = np.concatenate(
                    (average_rewards,
                     np.expand_dims(np.mean(episode_rewards), axis=0)),
                    axis=0)

                # stop the rollout once the environment signals termination
                if done:
                    break

            # turn the rewards we accumulated during the episode into
            # rewards-to-go: earlier actions are responsible for more rewards
            # than the later actions
            discounted_rewards_to_go = utils.get_discounted_rewards(
                rewards=episode_rewards, gamma=C.GAMMA)
            discounted_rewards_to_go -= average_rewards  # baseline - state-specific average

            # calculate the sum of the rewards for the running-average metric
            sum_of_rewards = np.sum(episode_rewards)

            # after each episode append the sum of total rewards to the deque
            total_rewards.append(sum_of_rewards)

            # set the mask for the actions taken in the episode
            mask = one_hot(episode_actions, num_classes=C.action_space_size)

            # calculate the log-probabilities of the taken actions;
            # the mask filters out the log-probabilities of unrelated logits
            episode_log_probs = torch.sum(
                mask.float() * log_softmax(episode_logits, dim=1), dim=1)

            # weight the episode log-probabilities by the rewards-to-go
            episode_weighted_log_probs = episode_log_probs * \
                torch.tensor(discounted_rewards_to_go).float().to(self.DEVICE)

            # calculate the sum over the trajectory of the weighted
            # log-probabilities
            sum_weighted_log_probs = torch.sum(
                episode_weighted_log_probs).unsqueeze(dim=0)

            # append the weighted log-probabilities of the taken actions
            epoch_weighted_log_probs = torch.cat(
                (epoch_weighted_log_probs, sum_weighted_log_probs), dim=0)

            # append the logits - needed for the entropy bonus calculation
            epoch_logits = torch.cat((epoch_logits, episode_logits), dim=0)

        # calculate the loss for the whole epoch
        loss, entropy = utils.calculate_loss(
            C.BETA,
            epoch_logits=epoch_logits,
            weighted_log_probs=epoch_weighted_log_probs)

        yield loss, total_rewards
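
# `utils.calculate_loss` is called above but never defined in this section.
# A plausible sketch follows, assuming the standard REINFORCE objective: the
# negative mean of the reward-weighted log-probabilities, minus an entropy
# bonus scaled by beta (C.BETA above) to encourage exploration. The exact
# reduction and sign conventions are assumptions, not the project's code.
def calculate_loss_sketch(beta, epoch_logits, weighted_log_probs):
    """Policy-gradient loss with an entropy bonus."""
    # policy term: maximize the expected weighted log-probability,
    # i.e. minimize its negative mean
    policy_loss = -torch.mean(weighted_log_probs)

    # entropy of the action distribution, averaged over all stored steps
    probs = torch.softmax(epoch_logits, dim=1)
    log_probs = log_softmax(epoch_logits, dim=1)
    entropy = -torch.mean(torch.sum(probs * log_probs, dim=1))

    # subtracting the scaled entropy rewards more uniform (exploratory) policies
    return policy_loss - beta * entropy, entropy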
def play_episode(environment, device, action_space_size, agent, gamma,
                 episode: int):
    """Plays one episode of the environment.

    Args:
        episode: the episode counter

    Returns:
        sum_weighted_log_probs: the sum of the log-probability of each action
            multiplied by the reward-to-go from that state
        episode_logits: the logits of every step of the episode - needed to
            compute the entropy for the entropy bonus
        sum_of_rewards: sum of the rewards for the episode - needed for the
            average-over-200-episodes statistic
        episode: the incremented episode counter
    """
    # the rollout is forced onto the CPU regardless of the device argument
    agent.to('cpu')
    device = 'cpu'

    # reset the environment to a random initial state at the start of the episode
    state = environment.reset()

    # initialize the episode arrays
    episode_actions = torch.empty(size=(0,), dtype=torch.long, device=device)
    episode_logits = torch.empty(size=(0, action_space_size), device=device)
    average_rewards = np.empty(shape=(0,), dtype=np.float64)
    episode_rewards = np.empty(shape=(0,), dtype=np.float64)

    # episode loop
    while True:
        # get the action logits from the agent (preferences)
        action_logits = agent(
            torch.tensor(state).float().unsqueeze(dim=0).to(device))

        # append the logits to the episode logits list
        episode_logits = torch.cat((episode_logits, action_logits), dim=0)

        # sample an action according to the action distribution
        action = Categorical(logits=action_logits).sample()

        # append the action to the episode action list to obtain the
        # trajectory; we need to store the actions and logits so we can
        # calculate the gradient of the performance
        episode_actions = torch.cat((episode_actions, action), dim=0)

        # take the chosen action, observe the reward and the next state
        state, reward, done, _ = environment.step(action=action.cpu().item())

        # append the reward to the rewards pool that we collect during the
        # episode; we need the rewards so we can calculate the weights for
        # the policy gradient and the average baseline
        episode_rewards = np.concatenate((episode_rewards, np.array([reward])),
                                         axis=0)

        # here the average reward is state specific
        average_rewards = np.concatenate(
            (average_rewards, np.expand_dims(np.mean(episode_rewards), axis=0)),
            axis=0)

        # the episode is over
        if done:
            # increment the episode counter
            episode += 1

            # turn the rewards we accumulated during the episode into
            # rewards-to-go: earlier actions are responsible for more rewards
            # than the later actions
            discounted_rewards_to_go = utils.get_discounted_rewards(
                rewards=episode_rewards, gamma=gamma)
            discounted_rewards_to_go -= average_rewards  # baseline - state-specific average

            # calculate the sum of the rewards for the running-average metric
            sum_of_rewards = np.sum(episode_rewards)

            # set the mask for the actions taken in the episode
            mask = one_hot(episode_actions,
                           num_classes=environment.action_space.n)

            # calculate the log-probabilities of the taken actions;
            # the mask filters out the log-probabilities of unrelated logits
            episode_log_probs = torch.sum(
                mask.float() * log_softmax(episode_logits, dim=1), dim=1)

            # weight the episode log-probabilities by the rewards-to-go
            episode_weighted_log_probs = episode_log_probs * \
                torch.tensor(discounted_rewards_to_go).float().to(device)

            # calculate the sum over the trajectory of the weighted
            # log-probabilities
            sum_weighted_log_probs = torch.sum(
                episode_weighted_log_probs).unsqueeze(dim=0)

            # move the results to the target device (a no-op here, since
            # device was forced to 'cpu' above)
            sum_weighted_log_probs = sum_weighted_log_probs.to(device)
            episode_logits = episode_logits.to(device)

            return sum_weighted_log_probs, episode_logits, sum_of_rewards, episode
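
# A hypothetical driver for play_episode, showing how the returned weighted
# log-probabilities feed a policy-gradient update. The environment name,
# network shape, learning rate, and episode budget are illustrative
# assumptions, not values taken from this project; it also assumes the
# classic gym API where step() returns a 4-tuple.
if __name__ == '__main__':
    import gym

    env = gym.make('CartPole-v1')  # any discrete-action environment
    agent = torch.nn.Sequential(
        torch.nn.Linear(env.observation_space.shape[0], 64),
        torch.nn.ReLU(),
        torch.nn.Linear(64, env.action_space.n))
    optimizer = torch.optim.Adam(agent.parameters(), lr=1e-3)

    episode = 0
    while episode < 200:
        optimizer.zero_grad()
        sum_weighted_log_probs, episode_logits, sum_of_rewards, episode = \
            play_episode(env, device='cpu',
                         action_space_size=env.action_space.n,
                         agent=agent, gamma=0.99, episode=episode)
        # ascend the weighted log-probabilities by descending their negative
        loss = -sum_weighted_log_probs.sum()
        loss.backward()
        optimizer.step()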