def _fit_epoch(self, args, use_weights, kwargs): if self._batch_size > 0: batches = minibatch_generator(self._batch_size, *args) else: batches = [args] loss_current = [] for batch in batches: loss_current.append(self._fit_batch(batch, use_weights, kwargs)) return np.mean(loss_current)
def _update_policy(self, obs, act, adv, old_log_p):
    """Update the policy with the PPO clipped-surrogate objective.

    Runs ``self._n_epochs_policy`` passes over the data; each pass
    iterates minibatches of size ``self._batch_size`` and takes one
    optimizer step per minibatch on the negated clipped surrogate loss.

    Args:
        obs: observations.
        act: actions taken.
        adv: advantage estimates.
        old_log_p: log-probabilities of ``act`` under the policy that
            collected the data (detached from the current graph).
    """
    for _ in range(self._n_epochs_policy):
        minibatches = minibatch_generator(
            self._batch_size, obs, act, adv, old_log_p)
        for obs_b, act_b, adv_b, old_lp_b in minibatches:
            self._optimizer.zero_grad()

            # Probability ratio between current and data-collection policy.
            new_log_p = self.policy.log_prob_t(obs_b, act_b)
            ratio = torch.exp(new_log_p - old_lp_b)

            # Clip the ratio to [1 - eps, 1 + eps] and take the
            # pessimistic (minimum) surrogate, per the PPO objective.
            clipped = torch.clamp(
                ratio, 1 - self._eps_ppo, 1 + self._eps_ppo)
            surrogate = torch.min(ratio * adv_b, clipped * adv_b)

            loss = -surrogate.mean()
            loss.backward()
            self._optimizer.step()