def test_uncalibrated_agents(self):
    grid = [['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
            ['X',  -9, ' ', 'X', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', 'A', ' ', ' ', ' ', ' ', ' ', ' ', ' ',   3, 'X'],
            ['X', ' ', ' ', 'X',  -9,  -9,  -9,  -9,  -9, ' ', 'X'],
            ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X']]
    n, s, e, w, stay = self.all_actions
    mdp = GridworldMdp(grid, living_reward=-0.1, noise=0.2)
    env = Mdp(mdp)

    agent1 = agents.OptimalAgent(gamma=0.9, num_iters=50)
    agent1.set_mdp(mdp)
    actions, _ = self.run_on_env(agent1, env, gamma=0.9, episode_length=13)
    self.assertEqual(actions, [e, e, e, n, e, e, e, e, e, s, stay, stay, stay])

    agent2 = agents.UncalibratedAgent(gamma=0.9, num_iters=20,
                                      calibration_factor=5)
    agent2.set_mdp(mdp)
    actions, _ = self.run_on_env(agent2, env, gamma=0.9, episode_length=13)
    self.assertEqual(actions,
                     [e, e, e, e, e, e, e, e, stay, stay, stay, stay, stay])

    agent3 = agents.UncalibratedAgent(gamma=0.9, num_iters=20,
                                      calibration_factor=0.5)
    agent3.set_mdp(mdp)
    actions, _ = self.run_on_env(agent3, env, gamma=0.9, episode_length=13)
    self.assertEqual(actions, [s, e, n, e, e, n, e, e, e, e, e, s, stay])
def test_myopic_agent(self):
    grid = ['XXXXXXXX',
            'XA     X',
            'X XXXX9X',
            'X      X',
            'X X2   X',
            'XXXXXXXX']
    n, s, e, w, stay = self.all_actions
    mdp = GridworldMdp(grid, living_reward=-0.1)
    env = Mdp(mdp)

    optimal_agent = agents.OptimalAgent(gamma=0.9, num_iters=20)
    optimal_agent.set_mdp(mdp)
    actions, _ = self.run_on_env(optimal_agent, env, gamma=0.9, episode_length=10)
    self.assertEqual(actions, [e, e, e, e, e, s, stay, stay, stay, stay])

    myopic_agent = agents.MyopicAgent(6, gamma=0.9, num_iters=20)
    myopic_agent.set_mdp(mdp)
    actions, _ = self.run_on_env(myopic_agent, env, gamma=0.9, episode_length=10)
    self.assertEqual(actions, [s, s, e, e, e, e, e, n, stay, stay])
def get_policy(agent, grid):
    """Returns the policy of the given agent on the given grid."""
    from gridworld.gridworld import GridworldMdp, Direction
    from utils import Distribution

    num_actions = len(Direction.ALL_DIRECTIONS)
    mdp = GridworldMdp(grid=grid)
    agent.set_mdp(mdp)

    def dist_to_numpy(dist):
        return dist.as_numpy_array(Direction.get_number_from_direction, num_actions)

    def action(state):
        # Walls are invalid states and the MDP will refuse to give an action for
        # them. However, the VIN's architecture requires it to provide an action
        # distribution for walls too, so hardcode it to always be STAY.
        x, y = state
        if mdp.walls[y][x]:
            return dist_to_numpy(Distribution({Direction.STAY: 1}))
        return dist_to_numpy(agent.get_action_distribution(state))

    imsize = len(grid)
    # Possible bug: this indexing may be transposed; building the array with
    # (y, x) states gives the expected SVF vector downstream.
    action_dists = [[action((x, y)) for y in range(imsize)] for x in range(imsize)]
    action_dists = np.array(action_dists)
    return action_dists
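# Illustrative usage sketch for get_policy (not from the original file). The
# grid and the OptimalAgent hyperparameters below are made-up assumptions; any
# agent exposing set_mdp / get_action_distribution should work the same way.
# Note that get_policy assumes a square grid, since it uses len(grid) for both axes.
def _example_get_policy_usage():
    import agents  # assumed importable here, as in the tests above
    simple_grid = ['XXXXX',
                   'XA  X',
                   'X   X',
                   'X  3X',
                   'XXXXX']
    policy = get_policy(agents.OptimalAgent(gamma=0.9, num_iters=20), simple_grid)
    # policy[x][y] is a length-num_actions distribution over directions at (x, y)
    return policy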
def show_agents(grids, agent_list, agent_names, grid_names,
                filename='AgentComparison', figtitle=''):
    """Shows how agents perform on a set of gridworlds.

    grids - list of gridworlds (see examples in the earlier part of this file)
    agent_list - list of agent objects
    agent_names - list of agent names (strings)
    grid_names - list of gridworld names (strings)
    """
    num_ex = len(agent_list)
    num_grids = len(grids)
    fig, axes_grid = plt.subplots(num_grids, num_ex, figsize=(14.0, 4.5))
    if num_grids == 1:
        axes_grid = [axes_grid]

    for i, axes in enumerate(axes_grid):
        # Give each gridworld a name (uncomment to do so)
        # axes[0].set_ylabel(grid_names[i])

        # Generate the MDP
        grid = grids[i]
        mdp = GridworldMdp(grid, noise=0.2)
        walls, reward, start = mdp.convert_to_numpy_input()

        for idx, agent in enumerate(agent_list):
            ax = axes[idx]
            ax.set_aspect('equal')
            plot_reward(reward, walls, '', fig=fig, ax=ax)
            plot_trajectory(walls, reward, start, agent, arrow_width=0.35,
                            fig=fig, ax=ax)
            # Only write agent names above the first row
            if i == 0:
                ax.set_title(agent_names[idx], fontname='Times New Roman',
                             fontsize=16)
                print('Agent {} is {}'.format(agent_names[idx], agent))

    # Increase vertical space between subplots
    # fig.subplots_adjust(hspace=0.2)
    # fig.suptitle(figtitle)
    fig.savefig(filename, bbox_inches='tight', dpi=500)
    print("Saved figure to {}.png".format(filename))
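# Illustrative sketch of calling show_agents (not from the original file). The
# grid, agent choices, and hyperparameters are assumptions made for the example.
def _example_show_agents():
    import agents  # assumed importable here
    demo_grid = ['XXXXXX',
                 'XA   X',
                 'X  X3X',
                 'X    X',
                 'XXXXXX']
    agent_list = [agents.OptimalAgent(gamma=0.9, num_iters=20),
                  agents.MyopicAgent(6, gamma=0.9, num_iters=20)]
    show_agents([demo_grid], agent_list,
                agent_names=['Optimal', 'Myopic'],
                grid_names=['Demo'],
                filename='DemoComparison')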
def test_visitations(grid, agent):
    """Tests the expected_counts calculation; the suspected culprit is an einsum error."""
    # print("Testing expected_counts")
    from gridworld.gridworld import GridworldMdp, Direction
    from utils import Distribution

    num_actions = len(Direction.ALL_DIRECTIONS)
    mdp = GridworldMdp(grid=grid)
    agent.set_mdp(mdp)

    def dist_to_numpy(dist):
        return dist.as_numpy_array(Direction.get_number_from_direction, num_actions)

    def action(state):
        # Walls are invalid states and the MDP will refuse to give an action for
        # them. However, the VIN's architecture requires it to provide an action
        # distribution for walls too, so hardcode it to always be STAY.
        x, y = state
        if mdp.walls[y][x]:
            return dist_to_numpy(Distribution({Direction.STAY: 1}))
        return dist_to_numpy(agent.get_action_distribution(state))

    imsize = len(grid)
    action_dists = [[action((x, y)) for y in range(imsize)] for x in range(imsize)]
    action_dists = np.array(action_dists)

    walls, rewards, start_state = mdp.convert_to_numpy_input()
    # print("Start state for given mdp:", start_state)
    start = start_state

    trans = mdp.get_transition_matrix()
    initial_states = np.zeros((len(grid), len(grid)))
    initial_states[start[1]][start[0]] = 1
    initial_states = initial_states.reshape(-1)
    policy = flatten_policy(action_dists)

    demo_counts = expected_counts(policy, trans, initial_states, 20, 0.9)

    import matplotlib.pyplot as plt
    # imsave needs a file extension (or explicit format) to know how to encode the image
    plt.imsave("democounts.png", demo_counts.reshape((len(grid), len(grid))))
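# For reference, a minimal sketch of the discounted state-visitation recurrence
# that expected_counts is assumed to implement (the real function is defined
# elsewhere; its signature, flattening order, and discounting convention are
# assumptions here, which is exactly what test_visitations is probing).
def _expected_counts_reference(policy, trans, initial_states, horizon, gamma):
    """policy: (num_states, num_actions), trans: (num_states, num_actions,
    num_states), initial_states: (num_states,). Returns discounted visitation
    counts of shape (num_states,)."""
    state_probs = np.array(initial_states, dtype=float)
    counts = state_probs.copy()
    for t in range(1, horizon):
        # D_t[k] = sum over states i and actions j of D_{t-1}[i] * pi(j|i) * T(k|i,j)
        state_probs = np.einsum("i,ij,ijk->k", state_probs, policy, trans)
        counts += (gamma ** t) * state_probs
    return counts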
def main():
    grid = [['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
            ['X', ' ', -90, -90, -90, -90, '8', ' ', 'X'],
            ['X', 'A', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', ' ', ' ', -99, '2', ' ', ' ', ' ', 'X'],
            ['X', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', ' ', '1', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X']]
    mdp = GridworldMdp(grid, living_reward=-0.01, noise=0.2)
    env = Mdp(mdp)

    opt = fast_agents.FastOptimalAgent(gamma=0.95, num_iters=20)
    naive = NaiveTimeDiscountingAgent(10, 1, gamma=0.95, num_iters=20)
    soph = fast_agents.FastSophisticatedTimeDiscountingAgent(10, 1, gamma=0.95, num_iters=20)
    myopic = fast_agents.FastMyopicAgent(6, gamma=0.95, num_iters=20)
    over = fast_agents.FastUncalibratedAgent(gamma=0.95, num_iters=20,
                                             calibration_factor=5)
    under = fast_agents.FastUncalibratedAgent(gamma=0.95, num_iters=20,
                                              calibration_factor=0.5)
    agents = [opt, naive, soph, myopic, over, under]
    names = ['Optimal', 'Naive', 'Sophisticated', 'Myopic',
             'Overconfident', 'Underconfident']

    for name, agent in zip(names, agents):
        print('{} agent'.format(name))
        agent.set_mdp(mdp)
        trajectory = run_agent(agent, env, episode_length=50, determinism=True)
        if agent == naive:
            print([a for _, a, _, _ in trajectory])
        print_training_example(mdp, trajectory)

    print(opt.values.T)
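# Assuming this script is meant to be run directly:
if __name__ == '__main__':
    main()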
def test_irl(grid, agent):
    """Runs IRL on the given agent's policy; returns the walls, start state,
    inferred rewards, and true rewards."""
    from gridworld.gridworld import GridworldMdp, Direction
    from utils import Distribution

    num_actions = len(Direction.ALL_DIRECTIONS)
    mdp = GridworldMdp(grid=grid)
    agent.set_mdp(mdp)

    def dist_to_numpy(dist):
        return dist.as_numpy_array(Direction.get_number_from_direction, num_actions)

    def action(state):
        # Walls are invalid states and the MDP will refuse to give an action for
        # them. However, the VIN's architecture requires it to provide an action
        # distribution for walls too, so hardcode it to always be STAY.
        x, y = state
        if mdp.walls[y][x]:
            return dist_to_numpy(Distribution({Direction.STAY: 1}))
        return dist_to_numpy(agent.get_action_distribution(state))

    imsize = len(grid)
    # Possible bug: this indexing may be transposed; building the array with
    # (y, x) states gives the expected SVF vector downstream.
    action_dists = [[action((x, y)) for y in range(imsize)] for x in range(imsize)]
    action_dists = np.array(action_dists)

    walls, rewards, start_state = mdp.convert_to_numpy_input()
    # print("Start state for given mdp:", start_state)

    inferred = irl_wrapper(walls, action_dists, start_state, 20, 0.9)
    # print("---true below---")
    # print(rewards)
    return walls, start_state, inferred, rewards
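# Illustrative sketch of consuming test_irl's outputs (not from the original
# file). It assumes the inferred rewards come back with one value per grid
# cell; IRL recovers reward only up to scale, so compare normalized values.
def _example_compare_irl(grid, agent):
    walls, start_state, inferred, true_rewards = test_irl(grid, agent)
    inferred = np.asarray(inferred).reshape(walls.shape)  # shape is an assumption
    mask = (walls == 0)
    inferred_norm = inferred / (np.abs(inferred[mask]).max() + 1e-8)
    true_norm = true_rewards / (np.abs(true_rewards[mask]).max() + 1e-8)
    # Mean absolute error on non-wall cells
    return np.abs(inferred_norm - true_norm)[mask].mean()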
def test_coherence(grid, agent):
    """Test that these arrays behave as expected under np.einsum."""
    from gridworld.gridworld import GridworldMdp, Direction
    from utils import Distribution

    num_actions = len(Direction.ALL_DIRECTIONS)
    mdp = GridworldMdp(grid=grid)
    agent.set_mdp(mdp)

    def dist_to_numpy(dist):
        return dist.as_numpy_array(Direction.get_number_from_direction, num_actions)

    def action(state):
        # Walls are invalid states and the MDP will refuse to give an action for
        # them. However, the VIN's architecture requires it to provide an action
        # distribution for walls too, so hardcode it to always be STAY.
        x, y = state
        if mdp.walls[y][x]:
            return dist_to_numpy(Distribution({Direction.STAY: 1}))
        return dist_to_numpy(agent.get_action_distribution(state))

    imsize = len(grid)
    action_dists = [[action((x, y)) for y in range(imsize)] for x in range(imsize)]
    action_dists = np.array(action_dists)

    walls, rewards, start_state = mdp.convert_to_numpy_input()
    print("Start state for given mdp:", start_state)
    # inferred = _irl_wrapper(walls, action_dists, start_state, 20, 1.0)
    start = start_state

    trans = mdp.get_transition_matrix()
    initial_states = np.zeros((len(grid), len(grid)))
    initial_states[start[1]][start[0]] = 1
    initial_states = initial_states.reshape(-1)
    policy = flatten_policy(action_dists)

    gshape = (len(grid), len(grid))
    print("initial states")
    print('-' * 20)
    print(initial_states.reshape(gshape))

    # Each einsum step propagates state probabilities one timestep:
    # next[k] = sum over states i and actions j of prob[i] * pi(j|i) * T(k|i,j)
    next_states = np.einsum("i,ij,ijk -> k", initial_states, policy, trans)
    # next_states = (next_states.reshape(gshape).T).reshape(-1)
    print("first expected counts")
    print('-' * 20)
    print(next_states.reshape(gshape))

    next_states = np.einsum("i,ij,ijk -> k", next_states, policy, trans)
    print("second expected counts")
    print('-' * 20)
    print(next_states.reshape(gshape))

    next_states = np.einsum("i,ij,ijk -> k", next_states, policy, trans)
    # next_states = (next_states.reshape(gshape).T).reshape(-1)
    print("third expected counts")
    print('-' * 20)
    print(next_states.reshape(gshape))

    # for i in range(5):
    #     next_states = np.einsum("i,ij,ijk -> k", next_states, policy, trans)
    #     # next_states = (next_states.reshape(gshape).T).reshape(-1)
    #     print("{}th expected counts".format(4 + i))
    #     print('-' * 20)
    #     print(next_states.reshape(gshape))

    return next_states.reshape((len(grid), len(grid)))
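# For context, a minimal sketch of what flatten_policy is assumed to do:
# collapse the (imsize, imsize, num_actions) array of per-cell action
# distributions into the (num_states, num_actions) matrix the einsums above
# expect. The row-major flattening order is an assumption, and it is exactly
# the (x, y) vs (y, x) question raised in the comments in this file.
def _flatten_policy_sketch(action_dists):
    imsize, _, num_actions = action_dists.shape
    return action_dists.reshape(imsize * imsize, num_actions)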
def optimal_agent_test(self, agent):
    grid = ['XXXXXXXXX',
            'X9X6XA  X',
            'X X X   X',
            'X      2X',
            'XXXXXXXXX']
    n, s, e, w, stay = self.all_actions
    mdp = GridworldMdp(grid, living_reward=-0.1)
    env = Mdp(mdp)
    agent.set_mdp(mdp)
    start_state = mdp.get_start_state()

    # Action distribution
    action_dist = agent.get_action_distribution(start_state)
    self.assertEqual(action_dist, Distribution({s: 1}))

    # Trajectory
    actions, _ = self.run_on_env(agent, env, gamma=0.95, episode_length=10)
    self.assertEqual(actions, [s, s, w, w, w, w, n, n, stay, stay])

    # Same thing, but with heavier discounting (gamma = 0.5)
    mdp = GridworldMdp(grid, living_reward=-0.001)
    env = Mdp(mdp)
    agent = agents.OptimalAgent(gamma=0.5, num_iters=20)
    agent.set_mdp(mdp)
    start_state = mdp.get_start_state()

    # Values
    # Inaccurate because we ignore the living reward and only run 20
    # iterations of value iteration, so only check to 2 places
    self.assertAlmostEqual(agent.value(start_state), 0.25, places=2)

    # Action distribution
    action_dist = agent.get_action_distribution(start_state)
    self.assertEqual(action_dist, Distribution({s: 1}))

    # Trajectory
    actions, reward = self.run_on_env(agent, env, gamma=0.5, episode_length=10)
    # Again an approximate comparison, since we don't consider living rewards
    self.assertAlmostEqual(reward, (4 - 0.0625) / 16, places=2)
    self.assertEqual(actions, [s, s, e, e, stay, stay, stay, stay, stay, stay])

    # Same thing, but with Boltzmann rationality
    agent = agents.OptimalAgent(beta=1, gamma=0.5, num_iters=20)
    agent.set_mdp(mdp)

    # Action distribution
    dist = agent.get_action_distribution(start_state).get_dict()
    nprob, sprob, eprob, wprob = dist[n], dist[s], dist[e], dist[w]
    for p in [nprob, sprob, eprob, wprob]:
        self.assertTrue(0 < p < 1)
    self.assertEqual(nprob, wprob)
    self.assertTrue(sprob > nprob)
    self.assertTrue(nprob > eprob)

    middle_state = (2, 3)
    dist = agent.get_action_distribution(middle_state).get_dict()
    nprob, sprob, eprob, wprob = dist[n], dist[s], dist[e], dist[w]
    for p in [nprob, sprob, eprob, wprob]:
        self.assertTrue(0 < p < 1)
    self.assertEqual(nprob, sprob)
    self.assertTrue(wprob > eprob)
    self.assertTrue(eprob > nprob)