def run_episode(self):
    # Reset the HER episode cache and the environment.
    self.her.reset()
    obs, done = self.env.reset()
    done = False
    state = self.env.get_tensor(obs)
    sum_r = 0
    mean_loss = mean_val()
    min_dist = 100000
    max_t = 50
    for t in range(max_t):
        self.steps += 1
        # Exponentially decay epsilon from epsi_high towards epsi_low.
        self.eps = self.epsi_low + (self.epsi_high - self.epsi_low) * (
            np.exp(-1.0 * self.steps / self.decay))
        Q = self.model(self.norm(state.cuda()))
        # Epsilon-greedy action selection.
        if np.random.rand() < self.eps:
            action = torch.randint(0, Q.shape[1], (1,)).type(torch.LongTensor)
        else:
            action = torch.argmax(Q, dim=1)
        new_obs, reward, done, dist = self.env.step(obs, action.item())
        new_state = self.env.get_tensor(new_obs)
        sum_r = sum_r + reward
        if dist < min_dist:
            min_dist = dist
        # Force termination at the step limit.
        if (t + 1) == max_t:
            done = True
        # Store the transition in the replay buffer and in the HER cache
        # (dc is assumed to be an alias for copy.deepcopy).
        self.replay_buffer.append([
            dc(state.squeeze(0).numpy()),
            dc(action),
            dc(reward),
            dc(new_state.squeeze(0).numpy()),
            dc(done)
        ])
        self.her.keep([
            state.squeeze(0).numpy(), action, reward,
            new_state.squeeze(0).numpy(), done
        ])
        loss = self.update_model()
        mean_loss.append(loss)
        state = dc(new_state)
        obs = dc(new_obs)
        # Periodically sync the target network with the online network.
        self.step_counter = self.step_counter + 1
        if self.step_counter > self.update_target_step:
            self.target_model.load_state_dict(self.model.state_dict())
            self.step_counter = 0
            print('updated target model')
    # Relabel the episode with hindsight goals and add the new transitions.
    her_list = self.her.backward()
    for item in her_list:
        self.replay_buffer.append(item)
    self.log.add_item('tot_return', sum_r)
    self.log.add_item('avg_loss', mean_loss.get())
    self.log.add_item('final_dist', min_dist)
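
# Illustrative sketch only, not the project's implementation: a minimal
# hindsight-relabelling helper with the reset()/keep()/backward() interface
# used by run_episode above. backward() rewrites the stored transitions as if
# the state actually reached at the end of the episode had been the goal, so
# unsuccessful episodes still yield transitions with reward signal ("final"
# HER strategy). It assumes the observation is the achieved state followed by
# an equally sized goal vector, and a sparse 0 / -1 reward; both are
# assumptions, not facts about this codebase.
import numpy as np

class HERBuffer:
    def __init__(self, goal_dim):
        self.goal_dim = goal_dim
        self.episode = []

    def reset(self):
        self.episode = []

    def keep(self, transition):
        # transition = [state, action, reward, new_state, done]
        self.episode.append(transition)

    def backward(self):
        if not self.episode:
            return []
        # Use the achieved part of the final state as the substitute goal.
        new_goal = self.episode[-1][3][:self.goal_dim].copy()
        relabelled = []
        for state, action, reward, new_state, done in self.episode:
            s, ns = state.copy(), new_state.copy()
            s[self.goal_dim:] = new_goal
            ns[self.goal_dim:] = new_goal
            achieved = np.array_equal(ns[:self.goal_dim], new_goal)
            relabelled.append([s, action, 0.0 if achieved else -1.0, ns, achieved])
        return relabelled
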
def runEps(self):
    obs = self.env.reset()
    sum_r = 0
    sum_tot_r = 0
    mean_loss = mean_val()
    for t in range(self.timer):
        self.steps += 1
        # Exponentially decay epsilon from epsi_high towards epsi_low.
        self.eps = self.epsi_low + (self.epsi_high - self.epsi_low) * (
            np.exp(-1.0 * self.steps / self.decay))
        state = torch.Tensor(obs).unsqueeze(0)
        Q = self.model(state)
        # Epsilon-greedy action selection.
        if np.random.rand() < self.eps:
            action = torch.randint(0, Q.shape[1], (1,)).type(torch.LongTensor)
        else:
            action = torch.argmax(Q, dim=1)
        new_state, reward, done, info = self.env.step(action.item())
        sum_r = sum_r + reward
        # Add the clipped RND intrinsic reward to the extrinsic reward.
        reward_i = self.rnd.getReward(state).detach().clamp(-1.0, 1.0).item()
        combined_reward = reward + self.scale_intrinsic * reward_i
        sum_tot_r += combined_reward
        self.replay_buffer.append(
            [obs, action, combined_reward, new_state, done])
        loss = self.update()
        mean_loss.append(loss)
        obs = new_state
        # Periodically sync the target network with the online network.
        self.step_counter += 1
        if self.step_counter > self.update_target_step:
            self.target_model.load_state_dict(self.model.state_dict())
            self.step_counter = 0
        if done:
            break
    self.log.add_item('real_return', sum_r)
    self.log.add_item('combined_return', sum_tot_r)
    self.log.add_item('avg_loss', mean_loss.get())
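
# Illustrative sketch only, not the project's implementation: a minimal RND
# module with the getReward(state) interface used by runEps above. The
# intrinsic reward is the prediction error between a frozen, randomly
# initialised target network and a trained predictor network, so rarely
# visited states yield a larger novelty bonus. Layer sizes, the learning rate,
# and updating the predictor on every call are assumptions, not the project's
# actual settings.
import torch
import torch.nn as nn

class RND(nn.Module):
    def __init__(self, state_dim, embed_dim=64, lr=1e-4):
        super().__init__()
        self.target = nn.Sequential(nn.Linear(state_dim, 128), nn.ReLU(),
                                    nn.Linear(128, embed_dim))
        self.predictor = nn.Sequential(nn.Linear(state_dim, 128), nn.ReLU(),
                                       nn.Linear(128, embed_dim))
        # The target network is never trained.
        for p in self.target.parameters():
            p.requires_grad = False
        self.opt = torch.optim.Adam(self.predictor.parameters(), lr=lr)

    def getReward(self, state):
        # Prediction error acts as the intrinsic reward; a gradient step on the
        # predictor makes the bonus shrink for frequently visited states.
        target_feat = self.target(state).detach()
        pred_feat = self.predictor(state)
        loss = ((pred_feat - target_feat) ** 2).mean()
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        return loss.detach()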