def test_get_approximate_entropy(self):
    """Checks `ppo.approximate_entropy` against a hand-filtered expectation.

    The expected value is the negated sum of the log-probabilities that
    remain once the last time-step and the masked time-steps are removed,
    divided by 5 (the number of ones in the mask).
    """

    def log_row(*probs):
        # Log-probabilities for a single time-step, computed per element.
        return [np.log(p) for p in probs]

    # Shape: (2, 4+1, 4)
    log_probs = np.array([
        [
            log_row(0.1, 0.2, 0.6, 0.1),
            log_row(0.4, 0.1, 0.4, 0.1),
            log_row(0.3, 0.1, 0.5, 0.1),
            log_row(0.1, 0.2, 0.6, 0.1),
            log_row(0.3, 0.1, 0.5, 0.1),
        ],
        [
            log_row(0.3, 0.1, 0.5, 0.1),
            log_row(0.1, 0.1, 0.4, 0.4),
            log_row(0.3, 0.1, 0.5, 0.1),
            log_row(0.1, 0.2, 0.6, 0.1),
            log_row(0.3, 0.1, 0.5, 0.1),
        ],
    ])

    # Shape: (2, 4)
    mask = np.array([
        [1, 1, 0, 0],
        [1, 1, 1, 0],
    ])

    # Rows that survive: the first two time-steps of the first sequence and
    # the first three time-steps of the second one.
    kept_log_probs = np.array([[
        log_row(0.1, 0.2, 0.6, 0.1),
        log_row(0.4, 0.1, 0.4, 0.1),
        log_row(0.3, 0.1, 0.5, 0.1),
        log_row(0.1, 0.1, 0.4, 0.4),
        log_row(0.3, 0.1, 0.5, 0.1),
    ]])

    actual = ppo.approximate_entropy(log_probs, mask)
    expected = -np.sum(kept_log_probs) / 5.0
    self.assertNear(actual, expected, 1e-6)
def test_approximate_kl(self):
    """Checks `ppo.approximate_kl` against a difference of entropies.

    With a shared mask, `approximate_kl(new, old, mask)` is asserted to be
    within 1e-6 of
    `-approximate_entropy(old, mask) + approximate_entropy(new, mask)`.
    """

    def log_row(*probs):
        # Log-probabilities for a single time-step, computed per element.
        return list(map(np.log, probs))

    # Shape: (2, 4+1, 4)
    log_p_old = np.array([
        [
            log_row(0.1, 0.2, 0.6, 0.1),
            log_row(0.4, 0.1, 0.4, 0.1),
            log_row(0.3, 0.1, 0.5, 0.1),
            log_row(0.1, 0.2, 0.6, 0.1),
            log_row(0.3, 0.1, 0.5, 0.1),
        ],
        [
            log_row(0.3, 0.1, 0.5, 0.1),
            log_row(0.1, 0.1, 0.4, 0.4),
            log_row(0.3, 0.1, 0.5, 0.1),
            log_row(0.1, 0.2, 0.6, 0.1),
            log_row(0.3, 0.1, 0.5, 0.1),
        ],
    ])

    # Shape: (2, 4+1, 4)
    log_p_new = np.array([
        [
            log_row(0.3, 0.1, 0.5, 0.1),
            log_row(0.4, 0.1, 0.1, 0.3),
            log_row(0.1, 0.2, 0.1, 0.6),
            log_row(0.3, 0.1, 0.5, 0.1),
            log_row(0.1, 0.2, 0.1, 0.6),
        ],
        [
            log_row(0.1, 0.2, 0.1, 0.6),
            log_row(0.1, 0.1, 0.2, 0.6),
            log_row(0.3, 0.1, 0.3, 0.3),
            log_row(0.1, 0.2, 0.1, 0.6),
            log_row(0.1, 0.2, 0.1, 0.6),
        ],
    ])

    # Shape: (2, 4)
    mask = np.array([
        [1, 1, 0, 0],
        [1, 1, 1, 0],
    ])

    kl = ppo.approximate_kl(log_p_new, log_p_old, mask)
    entropy_old = ppo.approximate_entropy(log_p_old, mask)
    entropy_new = ppo.approximate_entropy(log_p_new, mask)
    self.assertNear(kl, -entropy_old + entropy_new, 1e-6)