import torch


class DQNAgentPER(DQNAgentBase):
    """DQN agent with prioritized experience replay (PER)."""

    def __init__(self, net, target_net, alpha=0.6, beta=0.4,
                 beta_delta=1.001, e=1e-8, **kwargs):
        super(DQNAgentPER, self).__init__(net, target_net, **kwargs)
        self.memory = PrioritizedReplayBuffer(**kwargs)
        self.__alpha = alpha            # how strongly priorities skew sampling
        self.__beta = beta              # importance-sampling correction, annealed to 1
        self.__beta_delta = beta_delta  # multiplicative beta annealing factor
        self.__e = e                    # keeps every priority strictly positive

    def _learn(self, samples):
        states, actions, rewards, next_states, dones, idxs, probs = samples
        # Q(s, a) for the actions actually taken
        expected_q_values = self.net(states, training=True).gather(1, actions)
        # DQN target: r + gamma * max_a' Q_target(s', a'), zeroed on terminal states;
        # the target-net output is detached so no gradients flow through it
        target_q_values_next = self.target_net(
            next_states, training=True).detach().max(1)[0].unsqueeze(1)
        target_q_values = rewards + self.gamma * target_q_values_next * (1 - dones)
        td_err = expected_q_values - target_q_values
        # importance-sampling weights w_i = (N * P(i))^(-beta),
        # normalized by the max weight for stability
        weights = (probs * self.memory.size()).pow(-self.__beta).to(self.device)
        weights = weights / weights.max()
        loss = torch.mean(td_err.pow(2).squeeze() * weights)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # refresh the priorities of the sampled transitions:
        # p_i = (|td_err_i| + e)^alpha, as in the PER paper
        self.memory.update(
            idxs.cpu().numpy(),
            (td_err.abs().detach().cpu().numpy().squeeze() + self.__e)**self.__alpha)
        return loss.detach().cpu().numpy()

    def step(self, state, action, reward, next_state, done):
        loss = super(DQNAgentPER, self).step(state, action, reward,
                                             next_state, done)
        if done:
            # anneal beta toward 1 so the IS correction becomes exact over time
            self.__beta = min(1., self.__beta * self.__beta_delta)
        return loss
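The agent above relies on a buffer that exposes size(), an add() method, a sampler returning (states, actions, rewards, next_states, dones, idxs, probs), and update(idxs, priorities). That class is not shown here, so the following is only a minimal sketch of a proportional-prioritization buffer: the constructor arguments (buffer_size, batch_size, device) and the method name sample() are assumptions, and a production version would replace the O(N) flat priority array with the sum-tree from the PER paper.

import numpy as np
import torch


class PrioritizedReplayBuffer:
    """Sketch of proportional PER storage; not the original implementation."""

    def __init__(self, buffer_size=100000, batch_size=64, device='cpu', **kwargs):
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.device = torch.device(device)
        self.data = []                                   # (s, a, r, s2, done) tuples
        self.priorities = np.zeros(buffer_size, dtype=np.float64)
        self.pos = 0                                     # next write position (ring buffer)

    def size(self):
        return len(self.data)

    def add(self, state, action, reward, next_state, done):
        # new experiences get the current max priority so they are replayed at least once
        max_p = self.priorities[:self.size()].max() if self.size() > 0 else 1.0
        if self.size() < self.buffer_size:
            self.data.append((state, action, reward, next_state, done))
        else:
            self.data[self.pos] = (state, action, reward, next_state, done)
        self.priorities[self.pos] = max_p
        self.pos = (self.pos + 1) % self.buffer_size

    def sample(self):
        p = self.priorities[:self.size()]
        probs = p / p.sum()                              # P(i) = p_i / sum_k p_k
        idxs = np.random.choice(self.size(), self.batch_size, p=probs)
        s, a, r, s2, d = zip(*[self.data[i] for i in idxs])
        to = lambda x, dt: torch.as_tensor(np.asarray(x), dtype=dt, device=self.device)
        return (to(s, torch.float32),
                to(a, torch.int64).unsqueeze(1),
                to(r, torch.float32).unsqueeze(1),
                to(s2, torch.float32),
                to(d, torch.float32).unsqueeze(1),
                torch.as_tensor(idxs, device=self.device),
                # probs kept 1-D so the IS-weight math in _learn stays elementwise
                to(probs[idxs], torch.float32))

    def update(self, idxs, priorities):
        # overwrite the sampled transitions' priorities with the new
        # (|TD error| + e)^alpha values computed by the agent
        self.priorities[idxs] = priorities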
if args.render:
    env.render()
# epsilon-greedy action selection
if np.random.rand() <= epsilon:
    a = model.randomAction()
else:
    a = model.predictAction(s)
# anneal epsilon linearly down to a floor of 0.2
epsilon = max(0.2, epsilon - epsilon_step)
# apply the action, observe the reward and the new state s2
s2_text, r, terminal, info = env.step(a)
s2 = sent2seq(s2_text, seq_len)
# add the current experience to the replay buffer
replay_buffer.add(s, a, r, terminal, s2)
# only start learning once the buffer holds more than a minibatch,
# and then only every `rounds_per_learn` environment steps
if ((replay_buffer.size() > args.batch_size)
        and (step_ctr % args.rounds_per_learn == 0)):
    s_batch, a_batch, r_batch, t_batch, s2_batch = \
        replay_buffer.sample_batch(args.batch_size)
    # update the network on the sampled batch
    l = model.trainOnBatch(s_batch, a_batch, r_batch, t_batch, s2_batch)
    loss += l
    step_ctr = 0
s = s2
ep_reward += r
cnt_invalid_actions += 1 if r == -0.1 else 0  # -0.1 is the invalid-action penalty
if terminal:
    break

ep_lens.append(j + 1)  # after the step loop: record the episode length
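The snippet above is the body of a per-step loop, and its original indentation was lost, so for orientation here is a hedged reconstruction of the scaffolding it sits inside. The arg names (num_episodes, max_ep_steps, annealing_steps), the env.reset() handling, and the placement of the step_ctr increment are all assumptions inferred from the variables the body references; ep_lens.append(j + 1) is the only statement that belongs after the inner loop.

# Hypothetical scaffolding around the loop body above; names mirror the
# snippet, the args.* names below are placeholders.
epsilon = 1.0
epsilon_step = (1.0 - 0.2) / args.annealing_steps  # assumed arg name
ep_lens, step_ctr = [], 0

for ep in range(args.num_episodes):                # assumed arg name
    s = sent2seq(env.reset(), seq_len)             # assumes reset() returns text
    ep_reward, loss, cnt_invalid_actions = 0.0, 0.0, 0
    for j in range(args.max_ep_steps):             # assumed arg name
        step_ctr += 1                              # assumed increment point
        # ... the per-step body shown above goes here ...
    ep_lens.append(j + 1)                          # runs once the episode ends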