Example #1
def _fit_epoch(self, args, use_weights, kwargs):
    # Split the dataset into minibatches when a batch size is set;
    # otherwise fit the whole dataset as a single batch.
    if self._batch_size > 0:
        batches = minibatch_generator(self._batch_size, *args)
    else:
        batches = [args]

    # Average the per-batch losses over the epoch.
    loss_current = []
    for batch in batches:
        loss_current.append(self._fit_batch(batch, use_weights, kwargs))

    return np.mean(loss_current)
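For reference, both examples rely on the same minibatch_generator contract: given a batch size and any number of equally-long arrays, it yields aligned slices of each. The sketch below is a minimal stand-in written for illustration; the shuffling behaviour is an assumption about the real helper, and only the call signature is taken from the code above.

import numpy as np

def minibatch_generator(batch_size, *dataset):
    # Shuffle once, then yield aligned slices of every array.
    indexes = np.random.permutation(len(dataset[0]))
    for start in range(0, len(indexes), batch_size):
        idx = indexes[start:start + batch_size]
        yield tuple(d[idx] for d in dataset)

# Usage: iterate over aligned slices of states and targets.
states = np.random.randn(10, 4)
targets = np.random.randn(10)
for s_i, t_i in minibatch_generator(4, states, targets):
    print(s_i.shape, t_i.shape)  # (4, 4) (4,); the last batch may be smaller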
Example #2
def _update_policy(self, obs, act, adv, old_log_p):
    for epoch in range(self._n_epochs_policy):
        for obs_i, act_i, adv_i, old_log_p_i in minibatch_generator(
                self._batch_size, obs, act, adv, old_log_p):
            self._optimizer.zero_grad()
            # Ratio between current and old policy probabilities,
            # computed in log space for numerical stability.
            prob_ratio = torch.exp(
                self.policy.log_prob_t(obs_i, act_i) - old_log_p_i)
            clipped_ratio = torch.clamp(prob_ratio, 1 - self._eps_ppo,
                                        1 + self._eps_ppo)
            # PPO clipped surrogate: maximize the pessimistic bound of
            # the unclipped and clipped objectives (hence the minus sign).
            loss = -torch.mean(
                torch.min(prob_ratio * adv_i, clipped_ratio * adv_i))
            loss.backward()
            self._optimizer.step()
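To see the clipped surrogate loss from Example #2 in isolation, here is a self-contained sketch on dummy tensors; eps_ppo, the fake log-probabilities, and the advantages are placeholders chosen for illustration, not values from the snippet above.

import torch

eps_ppo = 0.2  # placeholder clipping range, not from the snippet above

# Dummy per-sample quantities standing in for a real policy's output.
log_p_new = torch.randn(8, requires_grad=True)
log_p_old = torch.randn(8)
adv = torch.randn(8)

prob_ratio = torch.exp(log_p_new - log_p_old)
clipped_ratio = torch.clamp(prob_ratio, 1 - eps_ppo, 1 + eps_ppo)
# Pessimistic bound between the unclipped and clipped objectives.
loss = -torch.mean(torch.min(prob_ratio * adv, clipped_ratio * adv))
loss.backward()
print(loss.item(), log_p_new.grad.shape)  # scalar loss, grads for log_p_new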