Example #1
 def update_policy(self, log_probs, cur_r, memory):
     '''Imitate the policy update of the learner.'''
     if self.alg == "vpg":
         # Vanilla policy gradient: single update on the collected log-probs and rewards.
         vpg_update(self.im_optimizer, log_probs, cur_r, memory.is_terminals, self.gamma)
     elif self.alg == "ppo":
         # PPO: clipped-surrogate update over K_epochs passes on the same rollout.
         ppo_update(self.im_policy, self.im_optimizer, log_probs, cur_r, memory,
                    self.gamma, self.K_epochs, self.eps_clip, self.loss_fn, self.device)
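Both branches delegate to helpers that are not shown in this listing. As a point of reference, below is a minimal sketch of what `vpg_update` could look like, inferred only from its argument names (optimizer, log-probabilities, rewards, terminal flags, discount factor); the actual helper in the codebase may differ.

import torch

def vpg_update(optimizer, log_probs, rewards, is_terminals, gamma):
    # Hypothetical sketch inferred from the call sites above, not the real helper.
    # 1. Monte-Carlo returns, computed backwards and reset at episode boundaries.
    returns = []
    discounted = 0.0
    for reward, done in zip(reversed(rewards), reversed(is_terminals)):
        if done:
            discounted = 0.0
        discounted = reward + gamma * discounted
        returns.insert(0, discounted)

    # Accept either a per-step list of log-prob tensors or a single stacked tensor.
    if not torch.is_tensor(log_probs):
        log_probs = torch.stack(list(log_probs))
    returns = torch.tensor(returns, dtype=torch.float32, device=log_probs.device)
    # Return normalisation is a common stabiliser; assumed here, not confirmed.
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    # 2. REINFORCE loss: maximise sum of log pi(a|s) * return.
    loss = -(log_probs * returns).mean()

    # 3. Single gradient step on the policy parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()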
Example #2
 def learning(self, memory):
     '''Update the policy from the experience stored in `memory`.'''
     if self.alg == "vpg":
         # Re-evaluate the stored state-action pairs under the current policy.
         old_states = torch.stack(memory.states).to(self.device).detach()
         old_actions = torch.stack(memory.actions).to(self.device).detach()
         logprobs = self.policy.act_prob(old_states, old_actions, self.device)
         # Vanilla policy-gradient step on the stored rewards.
         vpg_update(self.optimizer, logprobs, memory.rewards, memory.is_terminals, self.gamma)
     elif self.alg == "ppo":
         old_states = torch.stack(memory.states).to(self.device).detach()
         old_actions = torch.stack(memory.actions).to(self.device).detach()
         logprobs = self.policy.act_prob(old_states, old_actions, self.device)
         # PPO update over the same rollout.
         ppo_update(self.policy, self.optimizer, logprobs, memory.rewards,
                    memory, self.gamma, self.K_epochs, self.eps_clip, self.loss_fn, self.device)
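As with `vpg_update`, the `ppo_update` helper is not shown here. The sketch below is a hedged, actor-only variant assuming the clipped-surrogate objective that the `eps_clip` and `K_epochs` arguments suggest, and reusing the `policy.act_prob` call seen above; the real helper very likely also fits a value function (which is presumably what `loss_fn` is for) and may use proper advantage estimates instead of raw returns.

import torch

def ppo_update(policy, optimizer, old_logprobs, rewards, memory,
               gamma, K_epochs, eps_clip, loss_fn, device):
    # Hypothetical sketch of a clipped-surrogate PPO step; the repository's
    # helper may differ (e.g. critic training with loss_fn, entropy bonus).
    # Discounted returns, reset at episode boundaries.
    returns = []
    discounted = 0.0
    for reward, done in zip(reversed(rewards), reversed(memory.is_terminals)):
        if done:
            discounted = 0.0
        discounted = reward + gamma * discounted
        returns.insert(0, discounted)
    returns = torch.tensor(returns, dtype=torch.float32).to(device)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    # Freeze the behaviour policy's data: states, actions, and old log-probs.
    old_states = torch.stack(memory.states).to(device).detach()
    old_actions = torch.stack(memory.actions).to(device).detach()
    old_logprobs = old_logprobs.detach()

    for _ in range(K_epochs):
        # Re-evaluate the stored actions under the current policy.
        logprobs = policy.act_prob(old_states, old_actions, device)
        # Probability ratio between the new and old policies.
        ratios = torch.exp(logprobs - old_logprobs)
        # Clipped surrogate objective (returns used as a crude advantage proxy).
        surr1 = ratios * returns
        surr2 = torch.clamp(ratios, 1 - eps_clip, 1 + eps_clip) * returns
        loss = -torch.min(surr1, surr2).mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()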