def test_policy_gradient(self):
    """Smoke test: the ACER policy-gradient loss must contain no NaNs.

    Draws a single action from the behavior distribution ``self.mu`` and
    checks that ``compute_policy_gradient_loss`` produces finite values.
    """
    sampled_action = self.mu.sample().array
    loss = acer.compute_policy_gradient_loss(
        sampled_action,
        1,
        self.pi,
        self.mu,
        self.action_value,
        0,
        self.truncation_threshold,
    )
    print('pg', loss.array)
    self.assertFalse(np.isnan(np.sum(loss.array)))
def bias_correction_policy_gradients(truncation_threshold):
    """Collect one flat gradient vector per action sampled from ``mu``.

    For every sample in ``mu_samples``: reset the base policy's
    gradients, backprop the (squeezed) ACER policy-gradient loss at the
    given truncation threshold, and record the resulting gradients of
    ``base_policy`` as a single vector.
    """
    grad_vectors = []
    for sampled_action in mu_samples:
        base_policy.cleargrads()
        loss = acer.compute_policy_gradient_loss(
            action=sampled_action,
            advantage=evaluate_action(sampled_action),
            action_distrib=pi,
            action_distrib_mu=mu,
            action_value=action_value,
            v=0,
            truncation_threshold=truncation_threshold)
        F.squeeze(loss).backward()
        grad_vectors.append(extract_gradients_as_single_vector(base_policy))
    return grad_vectors