def _compute_y_and_t(self, exp_batch):
    """Compute predicted Q-values and gradient-scaled targets for a batch.

    Args:
        exp_batch: Mapping read here for the keys ``'state'`` and
            ``'action'``; it is also forwarded unchanged to
            ``self._compute_target_values``.

    Returns:
        A ``(y, t)`` pair. ``y`` is the result of
        ``qout.evaluate_actions`` for the batch actions and ``t`` is the
        result of ``self._compute_target_values``; both are reshaped to
        ``(batch_size, 1)``, where ``batch_size`` is ``len`` of the
        state batch. ``t`` is additionally passed through
        ``scale_grad.scale_grad`` with ``self.grad_scale``.
    """
    states = exp_batch['state']
    # Both outputs are reshaped to a single column with one row per sample.
    column_shape = (len(states), 1)

    # Q-values of the batch actions in the current states.
    action_values = self.q_function(states)
    taken_q = action_values.evaluate_actions(exp_batch['action'])
    y = F.reshape(taken_q, column_shape)

    # Target values must also backprop gradients, so they go through
    # scale_grad (presumably scaling the gradient that reaches them by
    # self.grad_scale -- confirm against scale_grad's implementation).
    targets = self._compute_target_values(exp_batch)
    t = F.reshape(targets, column_shape)
    return y, scale_grad.scale_grad(t, self.grad_scale)
def _compute_y_and_t(self, exp_batch):
    """Compute predicted Q-values and gradient-scaled targets for a batch.

    Args:
        exp_batch: Mapping read here for the keys ``'state'`` and
            ``'action'``, plus ``'recurrent_state'`` when
            ``self.recurrent`` is truthy; it is also forwarded unchanged
            to ``self._compute_target_values``.

    Returns:
        A ``(y, t)`` pair. ``y`` is the result of
        ``qout.evaluate_actions`` for the batch actions and ``t`` is the
        result of ``self._compute_target_values``; each gets a new
        trailing axis via ``[..., None]``. ``t`` is additionally passed
        through ``scale_grad.scale_grad`` with ``self.grad_scale``.
    """
    states = exp_batch['state']

    # Q-values for the current states; the recurrent path also returns a
    # second value, which is not needed here and is discarded.
    if not self.recurrent:
        action_values = self.model(states)
    else:
        action_values, _ = self.model.n_step_forward(
            states,
            exp_batch['recurrent_state'],
            output_mode='concat',
        )

    taken_q = action_values.evaluate_actions(exp_batch['action'])
    y = taken_q[..., None]

    # Target values must also backprop gradients, so they go through
    # scale_grad (presumably scaling the gradient that reaches them by
    # self.grad_scale -- confirm against scale_grad's implementation).
    targets = self._compute_target_values(exp_batch)
    t = targets[..., None]
    return y, scale_grad.scale_grad(t, self.grad_scale)