class EpsGreedyPolicyTest(unittest.TestCase):
    """Tests for ``EpsGreedyPolicy``: seeding reproducibility and exploration rate."""

    # Fixed seed so that any failure can be reproduced exactly.
    _SEED = 123

    def setUp(self):
        """Create the policy instance shared by the tests (epsilon = 0.1)."""
        self._eps = 0.1
        self._acting = EpsGreedyPolicy(self._eps)

    def test_seed(self):
        """Two policies seeded with the same value must pick the same actions."""
        seed = self._SEED
        eps = 0.7
        lowest_q = 1
        highest_q = 10
        n_actions = 10
        n_qs = 1000

        acting1 = EpsGreedyPolicy(eps)
        acting1.seed(seed)
        acting2 = EpsGreedyPolicy(eps)
        acting2.seed(seed)

        # Local generator: inputs are reproducible and the process-wide
        # numpy random state is left untouched.
        rng = np.random.RandomState(seed)
        qs = [
            rng.uniform(lowest_q, highest_q, size=(1, n_actions))
            for _ in range(n_qs)
        ]

        for step, q in enumerate(qs):
            a1 = acting1.act(q)
            a2 = acting2.act(q)
            self.assertEqual(
                a1, a2, "identically seeded policies diverged at step %d" % step
            )

    def test_act(self):
        """The fraction of non-greedy actions must be within 10% of epsilon."""
        n_total = 10000
        lowest_q = 1
        highest_q = 10
        n_actions = 100
        max_deviation = 0.1

        # Seed both sources of randomness; without this the test outcome
        # depends on global state and a failure cannot be reproduced.
        self._acting.seed(self._SEED)
        rng = np.random.RandomState(self._SEED)

        n_random = 0
        for _ in range(n_total):
            q = rng.uniform(lowest_q, highest_q, size=(1, n_actions))
            arg_max = np.argmax(q)
            action = self._acting.act(q)
            # Any action other than the arg-max is counted as exploratory.
            # NOTE(review): if exploration can also return the greedy action,
            # the expected rate is slightly below eps - confirm against the
            # policy implementation.
            if arg_max != action:
                n_random += 1

        actual = n_random / n_total
        actual_deviation = abs((self._eps - actual) / self._eps)
        self.assertLess(
            actual_deviation,
            max_deviation,
            "non-greedy rate %.4f deviates too far from eps=%.4f"
            % (actual, self._eps),
        )
def test_seed(self):
    """Check that two policies given the same seed choose identical actions."""
    shared_seed = 123
    epsilon = 0.7
    q_low, q_high = 1, 10
    num_actions = 10
    num_samples = 1000

    first = EpsGreedyPolicy(epsilon)
    first.seed(shared_seed)
    second = EpsGreedyPolicy(epsilon)
    second.seed(shared_seed)

    # Build every Q-value row up front, before any policy is queried.
    q_batches = []
    for _ in range(num_samples):
        q_batches.append(
            np.random.uniform(q_low, q_high, size=(1, num_actions))
        )

    # Both policies see the same inputs in the same order.
    for q_values in q_batches:
        self.assertEqual(first.act(q_values), second.act(q_values))