Example #1
    def run_episode(self):
        # Roll out one episode, storing transitions for DQN + HER training.
        self.her.reset()
        obs, _ = self.env.reset()
        done = False
        state = self.env.get_tensor(obs)
        sum_r = 0
        mean_loss = mean_val()
        min_dist = float('inf')  # closest distance to the goal seen this episode
        max_t = 50               # hard cap on the episode length

        for t in range(max_t):
            self.steps += 1
            # Anneal epsilon exponentially from epsi_high down to epsi_low.
            self.eps = self.epsi_low + (self.epsi_high - self.epsi_low) * (
                np.exp(-1.0 * self.steps / self.decay))
            Q = self.model(self.norm(state.cuda()))
            # Epsilon-greedy action selection.
            num = np.random.rand()
            if num < self.eps:
                action = torch.randint(0, Q.shape[1],
                                       (1, )).type(torch.LongTensor)
            else:
                # Move the greedy action back to the CPU so it matches the
                # random branch and can be stored in the replay buffer.
                action = torch.argmax(Q, dim=1).cpu()
            new_obs, reward, done, dist = self.env.step(obs, action.item())
            new_state = self.env.get_tensor(new_obs)
            sum_r = sum_r + reward
            # Track the closest distance to the goal reached in this episode.
            if dist < min_dist:
                min_dist = dist
            # Force termination once the step budget is exhausted.
            if (t + 1) == max_t:
                done = True

            # Store a deep copy of the transition in the replay buffer...
            self.replay_buffer.append([
                dc(state.squeeze(0).numpy()),
                dc(action),
                dc(reward),
                dc(new_state.squeeze(0).numpy()),
                dc(done)
            ])
            # ...and hand it to HER so goals can be relabelled after the episode.
            self.her.keep([
                state.squeeze(0).numpy(), action, reward,
                new_state.squeeze(0).numpy(), done
            ])
            loss = self.update_model()
            mean_loss.append(loss)
            state = dc(new_state)
            obs = dc(new_obs)

            self.step_counter = self.step_counter + 1
            # Periodically sync the target network with the online network.
            if self.step_counter > self.update_target_step:
                self.target_model.load_state_dict(self.model.state_dict())
                self.step_counter = 0
                print('updated target model')

        # After the episode, add the hindsight-relabelled transitions as well.
        her_list = self.her.backward()

        for item in her_list:
            self.replay_buffer.append(item)

        self.log.add_item('tot_return', sum_r)
        self.log.add_item('avg_loss', mean_loss.get())
        self.log.add_item('final_dist', min_dist)
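
Example #1 leans on several helpers that are not shown here: a mean_val running-average object, a her buffer with reset()/keep()/backward(), dc (presumably copy.deepcopy), and the update_model() DQN step. The sketch below shows one plausible shape for mean_val and a minimal "final-goal" HER buffer; the goal_dim layout of the state vector, the sparse reward rule, and the eps threshold are illustrative assumptions, not the author's implementation.

import numpy as np
from copy import deepcopy as dc


class mean_val:
    """Running average of the per-step training loss (hypothetical helper)."""

    def __init__(self):
        self.sum = 0.0
        self.n = 0

    def append(self, x):
        self.sum += float(x)
        self.n += 1

    def get(self):
        return self.sum / self.n if self.n else 0.0


class HER:
    """Minimal 'final-goal' hindsight relabelling buffer (hypothetical).

    Assumes states are 1-D numpy arrays laid out as
    [achieved position (goal_dim), ..., desired goal (goal_dim)].
    """

    def __init__(self, goal_dim, eps=0.05):
        self.goal_dim = goal_dim
        self.eps = eps        # distance threshold for "goal reached"
        self.episode = []

    def reset(self):
        self.episode = []

    def keep(self, transition):
        # transition = [state, action, reward, new_state, done]
        self.episode.append(dc(transition))

    def backward(self):
        if not self.episode:
            return []
        g = self.goal_dim
        # Pretend the position reached at the end of the episode was the goal.
        new_goal = dc(self.episode[-1][3][:g])
        relabelled = []
        for state, action, reward, new_state, done in self.episode:
            s, ns = dc(state), dc(new_state)
            s[-g:] = new_goal
            ns[-g:] = new_goal
            # Recompute the sparse reward against the substituted goal.
            dist = np.linalg.norm(ns[:g] - new_goal)
            r = 0.0 if dist < self.eps else -1.0
            relabelled.append([s, dc(action), r, ns, bool(dist < self.eps)])
        return relabelled

With a buffer of this shape, run_episode() above calls her.keep() once per step and her.backward() once per episode to obtain the relabelled transitions that are appended to the replay buffer.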
Example #2
    def runEps(self):
        # Roll out one episode with an RND intrinsic-reward bonus.
        obs = self.env.reset()

        sum_r = 0      # extrinsic (environment) return
        sum_tot_r = 0  # combined extrinsic + intrinsic return
        mean_loss = mean_val()

        for t in range(self.timer):
            self.steps += 1
            # Anneal epsilon exponentially from epsi_high down to epsi_low.
            self.eps = self.epsi_low + (self.epsi_high - self.epsi_low) * (
                np.exp(-1.0 * self.steps / self.decay))
            state = torch.Tensor(obs).unsqueeze(0)
            Q = self.model(state)
            # Epsilon-greedy action selection.
            num = np.random.rand()

            if num < self.eps:
                action = torch.randint(0, Q.shape[1],
                                       (1, )).type(torch.LongTensor)
            else:
                action = torch.argmax(Q, dim=1)

            new_state, reward, done, info = self.env.step(action.item())
            sum_r = sum_r + reward
            # Intrinsic reward from RND: clamped prediction error on this state.
            reward_i = self.rnd.getReward(state).detach().clamp(-1.0,
                                                                1.0).item()
            combined_reward = reward + self.scale_intrinsic * reward_i
            sum_tot_r += combined_reward

            self.replay_buffer.append(
                [obs, action, combined_reward, new_state, done])

            loss = self.update()
            mean_loss.append(loss)
            obs = new_state

            self.step_counter += 1
            # Periodically sync the target network with the online network.
            if self.step_counter > self.update_target_step:
                self.target_model.load_state_dict(self.model.state_dict())
                self.step_counter = 0

            if done:
                break
        self.log.add_item('real_return', sum_r)
        self.log.add_item('combined_return', sum_tot_r)
        self.log.add_item('avg_loss', mean_loss.get())
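
Example #2 assumes a self.rnd object whose getReward(state) returns an intrinsic reward per state. A minimal Random Network Distillation sketch of that shape is given below; the network sizes, learning rate, and the update() method are illustrative assumptions rather than the source's implementation.

import torch
import torch.nn as nn


class RND(nn.Module):
    """Minimal Random Network Distillation module (hypothetical sketch).

    A fixed, randomly initialised target network embeds each state; a
    trainable predictor tries to match that embedding.  Rarely visited
    states give a large prediction error, which is returned by
    getReward() and used as the intrinsic reward.
    """

    def __init__(self, state_dim, embed_dim=64):
        super().__init__()
        self.target = nn.Sequential(
            nn.Linear(state_dim, 128), nn.ReLU(), nn.Linear(128, embed_dim))
        self.predictor = nn.Sequential(
            nn.Linear(state_dim, 128), nn.ReLU(), nn.Linear(128, embed_dim))
        # The target network is never trained.
        for p in self.target.parameters():
            p.requires_grad = False
        self.optim = torch.optim.Adam(self.predictor.parameters(), lr=1e-4)

    def getReward(self, state):
        # Intrinsic reward = mean squared prediction error, one value per state.
        with torch.no_grad():
            target_feat = self.target(state)
        pred_feat = self.predictor(state)
        return ((pred_feat - target_feat) ** 2).mean(dim=1)

    def update(self, states):
        # Train the predictor on visited states so familiar states
        # gradually stop producing intrinsic reward.
        loss = self.getReward(states).mean()
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()
        return loss.item()

In a training loop of this kind, rnd.update() would typically be called on batches of replayed states so that already-explored states lose their bonus over time, while getReward() feeds the per-step combined_reward shown above.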