def test_seed_not_set(self):
    """With identically seeded envs but unseeded agents, runs should differ."""
    seed = 123

    env1 = TestEnv()
    env1.seed(seed)
    brain1 = TestBrain(self._n_states, self._n_actions)
    acting1 = EpsGreedyPolicy(self._eps)
    replay_memory1 = Memory(self._capacity, self._batch_size)
    agent1 = TdAgent(self._n_episodes, env1, brain1, acting1,
                     replay_memory1, self._gamma)
    rewards1, _ = agent1.run()

    env2 = TestEnv()
    env2.seed(seed)
    brain2 = TestBrain(self._n_states, self._n_actions)
    acting2 = EpsGreedyPolicy(self._eps)
    replay_memory2 = Memory(self._capacity, self._batch_size)
    agent2 = TdAgent(self._n_episodes, env2, brain2, acting2,
                     replay_memory2, self._gamma)
    rewards2, _ = agent2.run()

    # assert rewards1 != rewards2
    diff = np.sum(np.abs(rewards1 - rewards2))
    self.assertGreater(diff, 0)
def test_seed_set(self):
    """Two identically seeded agents should produce identical rewards."""
    seed = 123

    env1 = TestEnv()
    env1.seed(seed)
    brain1 = TestBrain(self._n_states, self._n_actions)
    acting1 = EpsGreedyPolicy(self._eps)
    replay_memory1 = Memory(self._capacity, self._batch_size)
    agent1 = TdAgent(self._n_episodes, env1, brain1, acting1,
                     replay_memory1, self._gamma)
    agent1.seed(seed)
    rewards1, _ = agent1.run()
    rewards1 = rewards1.tolist()

    env2 = TestEnv()
    env2.seed(seed)
    brain2 = TestBrain(self._n_states, self._n_actions)
    acting2 = EpsGreedyPolicy(self._eps)
    replay_memory2 = Memory(self._capacity, self._batch_size)
    agent2 = TdAgent(self._n_episodes, env2, brain2, acting2,
                     replay_memory2, self._gamma)
    agent2.seed(seed)
    rewards2, _ = agent2.run()
    rewards2 = rewards2.tolist()

    self.assertListEqual(rewards1, rewards2)
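# The two seeding tests above only pin down observable behavior: unseeded
# agents diverge, identically seeded agents reproduce the exact same reward
# sequence. Below is a minimal sketch of a seed() method that would satisfy
# that contract, assuming the agent draws all of its randomness from a
# private numpy generator and forwards the seed to its policy. The class
# name SeedableAgent and the _rng attribute are illustrative, not the
# project's actual TdAgent internals.
import numpy as np


class SeedableAgent:
    """Toy stand-in sketching one plausible TdAgent.seed() contract."""

    def __init__(self, acting):
        self._acting = acting                 # e.g. an EpsGreedyPolicy
        self._rng = np.random.RandomState()   # unseeded by default

    def seed(self, seed):
        # Swap in a freshly seeded private generator and propagate the
        # seed to the policy, so two agents seeded identically act
        # identically (module-level np.random is left untouched).
        self._rng = np.random.RandomState(seed)
        self._acting.seed(seed)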
class EpsGreedyPolicyTest(unittest.TestCase):

    def setUp(self):
        self._eps = 0.1
        self._acting = EpsGreedyPolicy(self._eps)

    def test_seed(self):
        """Identically seeded policies should choose identical actions."""
        seed = 123
        eps = 0.7
        lowest_q = 1
        highest_q = 10
        n_actions = 10
        n_qs = 1000

        acting1 = EpsGreedyPolicy(eps)
        acting1.seed(seed)
        acting2 = EpsGreedyPolicy(eps)
        acting2.seed(seed)

        qs = [
            np.random.uniform(lowest_q, highest_q, size=(1, n_actions))
            for _ in range(n_qs)
        ]
        for q in qs:
            a1 = acting1.act(q)
            a2 = acting2.act(q)
            self.assertEqual(a1, a2)

    def test_act(self):
        """The fraction of non-greedy actions should be close to eps."""
        n_total = 10000
        lowest_q = 1
        highest_q = 10
        n_actions = 100
        n_max_q = 0
        n_random = 0

        for _ in range(n_total):
            q = np.random.uniform(lowest_q, highest_q, size=(1, n_actions))
            arg_max = np.argmax(q)
            action = self._acting.act(q)
            if arg_max == action:
                n_max_q += 1
            else:
                n_random += 1

        actual = n_random / n_total
        max_deviation = 0.1
        actual_deviation = abs((self._eps - actual) / self._eps)
        self.assertLess(actual_deviation, max_deviation)
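# For reference, here is a minimal epsilon-greedy policy that satisfies
# both tests above: it keeps a private RandomState, explores with
# probability eps, and is greedy otherwise. This sketches the contract
# the tests encode; the project's real EpsGreedyPolicy may differ
# internally.
import numpy as np


class MinimalEpsGreedyPolicy:
    """Sketch of the behavior EpsGreedyPolicyTest checks for."""

    def __init__(self, eps):
        self._eps = eps
        self._rng = np.random.RandomState()

    def seed(self, seed):
        # A private generator makes act() reproducible across instances
        # seeded with the same value (see test_seed).
        self._rng = np.random.RandomState(seed)

    def act(self, q):
        # q has shape (1, n_actions): explore with probability eps,
        # otherwise pick the greedy action.
        n_actions = q.shape[1]
        if self._rng.uniform() < self._eps:
            return self._rng.randint(n_actions)
        return int(np.argmax(q))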
def setUp(self):
    self._n_episodes = 10
    self._env = TestEnv()
    self._n_states = 3
    self._n_actions = 7
    self._brain = TestBrain(self._n_states, self._n_actions)
    self._eps = 0.1
    self._acting = EpsGreedyPolicy(self._eps)
    self._capacity = 1000
    self._batch_size = 8
    self._replay_memory = Memory(self._capacity, self._batch_size)
    self._gamma = 0.99
    self._train_freq = 4
    self._agent = TdAgent(self._n_episodes, self._env, self._brain,
                          self._acting, self._replay_memory, self._gamma,
                          train_freq=self._train_freq)
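# The fixture above constructs the replay buffer as
# Memory(capacity, batch_size). Below is a minimal sketch of such a
# bounded buffer, using a deque for FIFO eviction. The method names
# add() and sample() are assumptions for illustration; the tests only
# constrain the constructor signature.
import random
from collections import deque


class MinimalReplayMemory:
    """Sketch of a bounded replay buffer with uniform batch sampling."""

    def __init__(self, capacity, batch_size):
        self._batch_size = batch_size
        # A deque with maxlen silently evicts the oldest transition
        # once capacity is reached.
        self._buffer = deque(maxlen=capacity)

    def add(self, transition):
        self._buffer.append(transition)

    def sample(self):
        # Uniform sampling without replacement; never ask for more
        # transitions than the buffer currently holds.
        k = min(self._batch_size, len(self._buffer))
        return random.sample(list(self._buffer), k)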