def update_policy(self, log_probs, cur_r, memory):
    """Imitate the policy update of the learner.

    Dispatches on ``self.alg`` ("vpg" or "ppo") and applies the matching
    update to the imitation policy/optimizer; any other value is a no-op.

    Args:
        log_probs: log-probabilities of the taken actions under the
            imitation policy.
        cur_r: rewards for the current rollout.
        memory: rollout buffer; ``is_terminals`` (and, for PPO, the full
            buffer) is consumed by the update helpers.
    """
    algorithm = self.alg
    if algorithm == "vpg":
        vpg_update(self.im_optimizer, log_probs, cur_r,
                   memory.is_terminals, self.gamma)
    elif algorithm == "ppo":
        ppo_update(self.im_policy, self.im_optimizer, log_probs, cur_r,
                   memory, self.gamma, self.K_epochs, self.eps_clip,
                   self.loss_fn, self.device)
def learning(self, memory):
    """Update ``self.policy`` from the rollout stored in ``memory``.

    Re-evaluates the log-probabilities of the stored actions under the
    current policy, then applies the VPG or PPO update depending on
    ``self.alg``. Any other algorithm name is a no-op (matching the
    original branch structure).

    Args:
        memory: rollout buffer with ``states``, ``actions``, ``rewards``
            and ``is_terminals`` lists of tensors/values.
    """
    if self.alg not in ("vpg", "ppo"):
        # Preserve original behavior: unknown algorithms do nothing.
        return
    # Shared preprocessing (was duplicated in both branches): stack the
    # stored rollout tensors and score the actions under the current policy.
    old_states = torch.stack(memory.states).to(self.device).detach()
    old_actions = torch.stack(memory.actions).to(self.device).detach()
    logprobs = self.policy.act_prob(old_states, old_actions, self.device)
    if self.alg == "vpg":
        vpg_update(self.optimizer, logprobs, memory.rewards,
                   memory.is_terminals, self.gamma)
    else:  # "ppo"
        ppo_update(self.policy, self.optimizer, logprobs, memory.rewards,
                   memory, self.gamma, self.K_epochs, self.eps_clip,
                   self.loss_fn, self.device)