Example #1
class DQNAgentPER(DQNAgentBase):
    def __init__(self,
                 net,
                 target_net,
                 alpha=0.6,
                 beta=0.4,
                 beta_delta=1.001,
                 e=1e-8,
                 **kwargs):
        super(DQNAgentPER, self).__init__(net, target_net, **kwargs)
        self.memory = PrioritizedReplayBuffer(**kwargs)
        self.__alpha = alpha
        self.__beta = beta
        self.__beta_delta = beta_delta
        self.__e = e

    def _learn(self, samples):
        states, actions, rewards, next_states, dones, idxs, probs = samples
        expected_q_values = self.net(states, training=True).gather(1, actions)
        # DQN target
        target_q_values_next = self.target_net(
            next_states, training=True).detach().max(1)[0].unsqueeze(1)
        target_q_values = rewards + self.gamma * target_q_values_next * (1 -
                                                                         dones)
        td_err = expected_q_values - target_q_values  # calc td error
        # importance-sampling weights: w_i = (N * P(i))^(-beta), normalized by the max
        weights = (probs * self.memory.size()).pow(-self.__beta).to(
            self.device)
        weights = weights / weights.max()
        # weighted MSE loss over the TD errors
        loss = torch.mean(td_err.pow(2).squeeze() * weights)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # refresh the priorities of the sampled transitions to |td_err|^alpha + e
        self.memory.update(
            idxs.cpu().numpy(),
            td_err.abs().detach().cpu().numpy().squeeze()**self.__alpha +
            self.__e)
        return loss.detach().cpu().numpy()

    def step(self, state, action, reward, next_state, done):
        loss = super(DQNAgentPER, self).step(state, action, reward, next_state,
                                             done)
        if done:
            # anneal beta toward 1 over the course of training
            self.__beta = min(1., self.__beta * self.__beta_delta)
        return loss
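
The agent above relies on a PrioritizedReplayBuffer that exposes add/sample/update/size and returns the sampled indices and probabilities alongside the transitions. Below is a minimal sketch of such a buffer, assuming proportional sampling over a plain deque; the class name, constructor arguments, and the O(N) sampling are assumptions for illustration, and a real implementation would typically use a sum-tree.

from collections import deque

import numpy as np
import torch


class PrioritizedReplayBufferSketch:
    """Hypothetical stand-in for the PrioritizedReplayBuffer used by DQNAgentPER."""

    def __init__(self, buffer_size=100000, batch_size=64, device="cpu", **kwargs):
        self.batch_size = batch_size
        self.device = device
        self.buffer = deque(maxlen=buffer_size)
        self.priorities = deque(maxlen=buffer_size)

    def size(self):
        return len(self.buffer)

    def add(self, state, action, reward, next_state, done):
        # new transitions get the current maximum priority so they are sampled at least once
        max_p = max(self.priorities) if self.priorities else 1.0
        self.buffer.append((state, action, reward, next_state, done))
        self.priorities.append(max_p)

    def sample(self):
        # proportional sampling: P(i) = p_i / sum_k p_k
        p = np.asarray(self.priorities, dtype=np.float64)
        probs = p / p.sum()
        idxs = np.random.choice(len(self.buffer), self.batch_size, p=probs)
        states, actions, rewards, next_states, dones = zip(
            *(self.buffer[i] for i in idxs))

        def as_t(x, dt):
            return torch.tensor(np.asarray(x), dtype=dt, device=self.device)

        return (as_t(states, torch.float32),
                as_t(actions, torch.int64).unsqueeze(1),
                as_t(rewards, torch.float32).unsqueeze(1),
                as_t(next_states, torch.float32),
                as_t(dones, torch.float32).unsqueeze(1),
                as_t(idxs, torch.int64),
                as_t(probs[idxs], torch.float32))

    def update(self, idxs, new_priorities):
        # overwrite the priorities of the sampled transitions (|td_err|^alpha + e)
        for i, p in zip(idxs, new_priorities):
            self.priorities[i] = float(p)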
Example #2
                if args.render: env.render()
                # choose action
                if np.random.rand() <= epsilon:
                    a = model.randomAction()
                else:
                    a = model.predictAction(s)
                # anneal epsilon
                epsilon = max(0.2, epsilon - epsilon_step)
                # apply action, get rewards and new state s2
                s2_text, r, terminal, info = env.step(a)
                s2 = sent2seq(s2_text, seq_len)
                # add current exp to buffer
                replay_buffer.add(s, a, r, terminal, s2)
                # learn only once the buffer holds more than a minibatch of samples,
                # and only every args.rounds_per_learn environment steps
                if ((replay_buffer.size() > args.batch_size)
                        and (step_ctr % args.rounds_per_learn == 0)):
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                        replay_buffer.sample_batch(args.batch_size)
                    # update the network using targets computed from the sampled batch
                    batch_loss = model.trainOnBatch(s_batch, a_batch, r_batch,
                                                    t_batch, s2_batch)
                    loss += batch_loss
                    step_ctr = 0

                s = s2
                ep_reward += r
                # the environment signals an invalid action with a -0.1 reward
                cnt_invalid_actions += 1 if r == -0.1 else 0
                if terminal: break

            ep_lens.append(j + 1)
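
The observation returned by env.step here is text, which sent2seq converts into a fixed-length integer sequence before it reaches the model. A rough sketch of such a helper is shown below, assuming a word-level vocabulary dict and zero-padding; the actual tokenization and vocabulary handling in the source may differ.

import numpy as np


def sent2seq_sketch(sentence, seq_len, vocab=None):
    # map each whitespace-separated token to a vocabulary id; 0 stands for padding/unknown
    vocab = vocab or {}
    ids = [vocab.get(tok, 0) for tok in sentence.lower().split()]
    # truncate or right-pad with zeros so every state has shape (seq_len,)
    ids = ids[:seq_len] + [0] * max(0, seq_len - len(ids))
    return np.asarray(ids, dtype=np.int64)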