def test_uncalibrated_agents(self):
    """Trajectories of calibrated vs. uncalibrated agents on a pit-lined grid.

    The same MDP is solved by an optimal agent and by two UncalibratedAgents
    (calibration_factor 5 and 0.5); each should produce a distinct action
    sequence over a 13-step episode.
    """
    grid = [['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
            ['X', -9, ' ', 'X', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', 'A', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 3, 'X'],
            ['X', ' ', ' ', 'X', -9, -9, -9, -9, -9, ' ', 'X'],
            ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X']]
    n, s, e, w, stay = self.all_actions
    mdp = GridworldMdp(grid, living_reward=-0.1, noise=0.2)
    env = Mdp(mdp)

    def rollout(agent):
        # Attach the MDP to the agent and record its 13-step trajectory.
        agent.set_mdp(mdp)
        trajectory, _ = self.run_on_env(agent, env, gamma=0.9,
                                        episode_length=13)
        return trajectory

    baseline = agents.OptimalAgent(gamma=0.9, num_iters=50)
    self.assertEqual(rollout(baseline),
                     [e, e, e, n, e, e, e, e, e, s, stay, stay, stay])

    # calibration_factor > 1: trajectory differs from the baseline.
    miscalibrated_high = agents.UncalibratedAgent(
        gamma=0.9, num_iters=20, calibration_factor=5)
    self.assertEqual(rollout(miscalibrated_high),
                     [e, e, e, e, e, e, e, e, stay, stay, stay, stay, stay])

    # calibration_factor < 1: yet another distinct trajectory.
    miscalibrated_low = agents.UncalibratedAgent(
        gamma=0.9, num_iters=20, calibration_factor=0.5)
    self.assertEqual(rollout(miscalibrated_low),
                     [s, e, n, e, e, n, e, e, e, e, e, s, stay])
def test_myopic_agent(self):
    """A MyopicAgent (horizon 6) should diverge from the optimal route.

    Both agents are rolled out for 10 steps on the same gridworld and their
    exact action sequences are compared against known-good trajectories.
    """
    # NOTE(review): rows below have unequal widths — interior spaces may
    # have been lost in reformatting; verify against the original file.
    grid = ['XXXXXXXX', 'XA X', 'X XXXX9X', 'X X', 'X X2 X', 'XXXXXXXX']
    n, s, e, w, stay = self.all_actions
    mdp = GridworldMdp(grid, living_reward=-0.1)
    env = Mdp(mdp)

    full_horizon = agents.OptimalAgent(gamma=0.9, num_iters=20)
    full_horizon.set_mdp(mdp)
    optimal_path, _ = self.run_on_env(full_horizon, env, gamma=0.9,
                                      episode_length=10)
    self.assertEqual(optimal_path, [e, e, e, e, e, s, stay, stay, stay, stay])

    short_horizon = agents.MyopicAgent(6, gamma=0.9, num_iters=20)
    short_horizon.set_mdp(mdp)
    myopic_path, _ = self.run_on_env(short_horizon, env, gamma=0.9,
                                     episode_length=10)
    self.assertEqual(myopic_path, [s, s, e, e, e, e, e, n, stay, stay])
def test_optimal_agent(self):
    """Run the shared optimal-agent checks against the plain OptimalAgent."""
    self.optimal_agent_test(agents.OptimalAgent(gamma=0.95, num_iters=20))
def test_value_iteration(self):
    """ValueIterationAgent should approximately agree with OptimalAgent.

    Comparison is done to 2 decimal places via the shared compare_agents
    helper.
    """
    reference = agents.OptimalAgent(gamma=0.95, num_iters=20)
    candidate = ValueIterationAgent(gamma=0.95, num_iters=20)
    self.compare_agents('soft value iteration', reference, candidate, places=2)
def optimal_agent_test(self, agent):
    """Shared battery of checks for any optimal-behaving agent.

    Exercises the given agent on a fixed gridworld under three regimes:
    (1) the supplied agent with gamma=0.95, (2) a fresh OptimalAgent with a
    smaller discount (gamma=0.5) and near-zero living reward, and (3) a
    Boltzmann-rational OptimalAgent (beta=1), checking values, action
    distributions, and full trajectories.
    """
    # NOTE(review): rows below have unequal widths — interior spaces may
    # have been lost in reformatting; verify against the original file.
    grid = ['XXXXXXXXX', 'X9X6XA X', 'X X X XXX', 'X 2X', 'XXXXXXXXX']
    n, s, e, w, stay = self.all_actions
    mdp = GridworldMdp(grid, living_reward=-0.1)
    env = Mdp(mdp)
    agent.set_mdp(mdp)
    start_state = mdp.get_start_state()

    # The supplied agent must deterministically head south from the start...
    self.assertEqual(agent.get_action_distribution(start_state),
                     Distribution({s: 1}))
    # ...and follow the expected 10-step trajectory.
    actions, _ = self.run_on_env(agent, env, gamma=0.95, episode_length=10)
    self.assertEqual(actions, [s, s, w, w, w, w, n, n, stay, stay])

    # Same grid, but with a smaller discount (0.5) and tiny living reward.
    mdp = GridworldMdp(grid, living_reward=-0.001)
    env = Mdp(mdp)
    agent = agents.OptimalAgent(gamma=0.5, num_iters=20)
    agent.set_mdp(mdp)
    start_state = mdp.get_start_state()
    # Value check is loose: living reward is ignored and only 20 iterations
    # of value iteration are run, so compare to 2 places only.
    self.assertAlmostEqual(agent.value(start_state), 0.25, places=2)
    self.assertEqual(agent.get_action_distribution(start_state),
                     Distribution({s: 1}))
    actions, reward = self.run_on_env(agent, env, gamma=0.5,
                                      episode_length=10)
    # Approximate comparison again, since living rewards are not modeled.
    self.assertAlmostEqual(reward, (4 - 0.0625) / 16, places=2)
    self.assertEqual(actions,
                     [s, s, e, e, stay, stay, stay, stay, stay, stay])

    # Boltzmann-rational variant: every direction gets nonzero probability.
    agent = agents.OptimalAgent(beta=1, gamma=0.5, num_iters=20)
    agent.set_mdp(mdp)

    def direction_probs(state):
        # Unpack the N/S/E/W probabilities and sanity-check each is in (0, 1).
        probs = agent.get_action_distribution(state).get_dict()
        quadruple = (probs[n], probs[s], probs[e], probs[w])
        for p in quadruple:
            self.assertTrue(0 < p < 1)
        return quadruple

    p_n, p_s, p_e, p_w = direction_probs(start_state)
    self.assertEqual(p_n, p_w)
    self.assertTrue(p_s > p_n)
    self.assertTrue(p_n > p_e)

    p_n, p_s, p_e, p_w = direction_probs((2, 3))
    self.assertEqual(p_n, p_s)
    self.assertTrue(p_w > p_e)
    self.assertTrue(p_e > p_n)
def test_compare_optimal_agents(self):
    """FastOptimalAgent should match OptimalAgent via compare_agents."""
    slow = agents.OptimalAgent(gamma=0.95, num_iters=20)
    fast = fast_agents.FastOptimalAgent(gamma=0.95, num_iters=20)
    self.compare_agents('optimal', slow, fast, print_mdp=True)