def show_agents(grids, agent_list, agent_names, grid_names, filename='AgentComparison', figtitle=''):
    """Shows how agents perform on a gridworld

    grids - list of gridworlds (see examples in earlier part of file)
    agent_list - list of agents (objects)
    agent_names - names of agents (strings)
    grid_names - names of gridworlds (strings)
    """
    num_ex = len(agent_list)
    num_grids = len(grids)

    fig, axes_grid = plt.subplots(num_grids, num_ex, figsize=(14.0, 4.5))
    if num_grids == 1:
        axes_grid = [axes_grid]

    for i, axes in enumerate(axes_grid):
        # Give each gridworld a name (uncomment to do so)
        # ax.set_ylabel(grid_names[i])

        # Generate MDP
        grid = grids[i]
        mdp = GridworldMdp(grid, noise=0.2)
        walls, reward, start = mdp.convert_to_numpy_input()

        for idx, agent in enumerate(agent_list):
            ax = axes[idx]
            ax.set_aspect('equal')
            plot_reward(reward, walls, '', fig=fig, ax=ax)
            plot_trajectory(walls, reward, start, agent, arrow_width=0.35, fig=fig, ax=ax)
            # Only write agent names on the first row
            if i == 0:
                ax.set_title(agent_names[idx], fontname='Times New Roman', fontsize=16)
            print('Agent {} is {}'.format(agent_names[idx], agent))

    # Increase vertical space btwn subplots
    # fig.subplots_adjust(hspace=0.2)
    # fig.suptitle(figtitle)
    fig.savefig(filename, bbox_inches='tight', dpi=500)
    print("Saved figure to {}.png".format(filename))
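# Illustrative usage sketch for show_agents (not called anywhere). The grid mirrors
# the one used in test_myopic_agent below; the agent choices, names, and filename
# are assumptions made purely for demonstration.
def _example_show_agents():
    """Minimal sketch of how show_agents might be invoked."""
    from agents import OptimalAgent, MyopicAgent
    grid = ['XXXXXXXX',
            'XA     X',
            'X XXXX9X',
            'X      X',
            'X X2   X',
            'XXXXXXXX']
    agent_list = [OptimalAgent(gamma=0.9, num_iters=20),
                  MyopicAgent(6, gamma=0.9, num_iters=20)]
    show_agents([grid], agent_list, ['Optimal', 'Myopic'], ['Small grid'],
                filename='AgentComparisonExample')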
def test_uncalibrated_agents(self):
    grid = [['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
            ['X',  -9, ' ', 'X', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', 'A', ' ', ' ', ' ', ' ', ' ', ' ', ' ',   3, 'X'],
            ['X', ' ', ' ', 'X',  -9,  -9,  -9,  -9,  -9, ' ', 'X'],
            ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X']]
    n, s, e, w, stay = self.all_actions

    mdp = GridworldMdp(grid, living_reward=-0.1, noise=0.2)
    env = Mdp(mdp)

    agent1 = agents.OptimalAgent(gamma=0.9, num_iters=50)
    agent1.set_mdp(mdp)
    actions, _ = self.run_on_env(agent1, env, gamma=0.9, episode_length=13)
    self.assertEqual(actions, [e, e, e, n, e, e, e, e, e, s, stay, stay, stay])

    agent2 = agents.UncalibratedAgent(gamma=0.9, num_iters=20, calibration_factor=5)
    agent2.set_mdp(mdp)
    actions, _ = self.run_on_env(agent2, env, gamma=0.9, episode_length=13)
    self.assertEqual(
        actions, [e, e, e, e, e, e, e, e, stay, stay, stay, stay, stay])

    agent3 = agents.UncalibratedAgent(gamma=0.9, num_iters=20, calibration_factor=0.5)
    agent3.set_mdp(mdp)
    actions, _ = self.run_on_env(agent3, env, gamma=0.9, episode_length=13)
    self.assertEqual(actions, [s, e, n, e, e, n, e, e, e, e, e, s, stay])
def test_myopic_agent(self):
    grid = ['XXXXXXXX',
            'XA     X',
            'X XXXX9X',
            'X      X',
            'X X2   X',
            'XXXXXXXX']
    n, s, e, w, stay = self.all_actions

    mdp = GridworldMdp(grid, living_reward=-0.1)
    env = Mdp(mdp)

    optimal_agent = agents.OptimalAgent(gamma=0.9, num_iters=20)
    optimal_agent.set_mdp(mdp)
    actions, _ = self.run_on_env(optimal_agent, env, gamma=0.9, episode_length=10)
    self.assertEqual(actions, [e, e, e, e, e, s, stay, stay, stay, stay])

    myopic_agent = agents.MyopicAgent(6, gamma=0.9, num_iters=20)
    myopic_agent.set_mdp(mdp)
    actions, _ = self.run_on_env(myopic_agent, env, gamma=0.9, episode_length=10)
    self.assertEqual(actions, [s, s, e, e, e, e, e, n, stay, stay])
def get_policy(agent, grid):
    """Returns the policy of the agent given"""
    from gridworld.gridworld import GridworldMdp, Direction
    from utils import Distribution

    num_actions = len(Direction.ALL_DIRECTIONS)
    mdp = GridworldMdp(grid=grid)
    agent.set_mdp(mdp)

    def dist_to_numpy(dist):
        return dist.as_numpy_array(Direction.get_number_from_direction, num_actions)

    def action(state):
        # Walls are invalid states and the MDP will refuse to give an action for
        # them. However, the VIN's architecture requires it to provide an action
        # distribution for walls too, so hardcode it to always be STAY.
        x, y = state
        if mdp.walls[y][x]:
            return dist_to_numpy(Distribution({Direction.STAY: 1}))
        return dist_to_numpy(agent.get_action_distribution(state))

    imsize = len(grid)
    # I think it's this line that's wrong. Writing as (y, x) gives expected SVF vec
    action_dists = [[action((x, y)) for y in range(imsize)] for x in range(imsize)]
    action_dists = np.array(action_dists)
    return action_dists
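# Illustrative sketch (not part of the original tests): get_policy returns an
# (imsize, imsize, num_actions) array of per-state action distributions, indexed
# as [x][y] per the comment above. The small square grid below is an assumption
# chosen only so that imsize = len(grid) matches the grid width.
def _example_get_policy():
    """Minimal sketch of calling get_policy on a small square grid."""
    from agents import OptimalAgent
    grid = ['XXXXX',
            'XA  X',
            'X  3X',
            'X   X',
            'XXXXX']
    policy = get_policy(OptimalAgent(gamma=0.9, num_iters=20), grid)
    print(policy.shape)  # (len(grid), len(grid), num_actions)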
def random_gridworld_plot(agent, other_agent, size, filename='RandomGrid'):
    """Plots random gridworld"""
    from gridworld.gridworld import Direction
    from utils import Distribution

    if agent is None:
        raise ValueError("agent cannot be None")

    num_R = 5
    mdp = GridworldMdp.generate_random_connected(size, size, num_R, noise=0)
    walls, reward, start = mdp.convert_to_numpy_input()

    def get_policy(agent):
        num_actions = 5
        imsize = len(walls)

        def dist_to_numpy(dist):
            return dist.as_numpy_array(Direction.get_number_from_direction, num_actions)

        def action(state):
            # Walls are invalid states and the MDP will refuse to give an action for
            # them. However, the VIN's architecture requires it to provide an action
            # distribution for walls too, so hardcode it to always be STAY.
            x, y = state
            if mdp.walls[y][x]:
                return dist_to_numpy(Distribution({Direction.STAY: 1}))
            return dist_to_numpy(agent.get_action_distribution(state))

        agent.set_mdp(mdp)
        action_dists = [[action((x, y)) for x in range(imsize)] for y in range(imsize)]
        action_dists = np.array(action_dists)
        return action_dists

    fig, axes = plt.subplots(1, 1)
    fig.set_size_inches(5, 5)

    # Reward only
    plot_reward(reward, np.zeros_like(walls), fig=fig, ax=axes, ax_title='')
    fig.savefig(filename + 'R', bbox_inches='tight', dpi=100)

    # Walls only
    plot_reward(np.zeros_like(reward), walls, fig=fig, ax=axes, ax_title='')
    fig.savefig(filename + 'W', bbox_inches='tight', dpi=100)

    # Trajectory + Walls + Rewards
    plot_reward(reward, walls, fig=fig, ax=axes, ax_title='')
    # plot_trajectory(walls, reward, start, agent, fig=fig, ax=axes)
    policy = get_policy(agent)
    plot_policy(walls, policy, fig=fig, ax=axes)
    fig.savefig(filename + 'Ptrue', bbox_inches='tight', dpi=100)

    axes.clear()
    plot_reward(reward, walls, fig=fig, ax=axes, ax_title='')
    predicted = get_policy(other_agent)
    plot_policy_diff(predicted, policy, walls, fig=fig, ax=axes)
    fig.savefig(filename + 'Ppredicted', bbox_inches='tight', dpi=100)
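# Illustrative sketch (assumption): compare an optimal agent's policy against a
# biased agent's policy on a random 8x8 grid. The agent pairing, grid size, and
# filename are made up for demonstration.
def _example_random_gridworld_plot():
    """Minimal sketch of calling random_gridworld_plot."""
    from agents import OptimalAgent, MyopicAgent
    random_gridworld_plot(OptimalAgent(gamma=0.9, num_iters=20),
                          MyopicAgent(6, gamma=0.9, num_iters=20),
                          8, filename='RandomGridExample')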
def test_visitations(grid, agent):
    """Tests the expected_counts calculation--might be einsum error"""
    # print("Testing expected_counts")
    from gridworld.gridworld import GridworldMdp, Direction
    from utils import Distribution

    num_actions = len(Direction.ALL_DIRECTIONS)
    mdp = GridworldMdp(grid=grid)
    agent.set_mdp(mdp)

    def dist_to_numpy(dist):
        return dist.as_numpy_array(Direction.get_number_from_direction, num_actions)

    def action(state):
        # Walls are invalid states and the MDP will refuse to give an action for
        # them. However, the VIN's architecture requires it to provide an action
        # distribution for walls too, so hardcode it to always be STAY.
        x, y = state
        if mdp.walls[y][x]:
            return dist_to_numpy(Distribution({Direction.STAY: 1}))
        return dist_to_numpy(agent.get_action_distribution(state))

    imsize = len(grid)
    action_dists = [[action((x, y)) for y in range(imsize)] for x in range(imsize)]
    action_dists = np.array(action_dists)

    walls, rewards, start_state = mdp.convert_to_numpy_input()
    # print("Start state for given mdp:", start_state)
    start = start_state

    trans = mdp.get_transition_matrix()
    initial_states = np.zeros((len(grid), len(grid)))
    initial_states[start[1]][start[0]] = 1
    initial_states = initial_states.reshape(-1)
    policy = flatten_policy(action_dists)

    demo_counts = expected_counts(policy, trans, initial_states, 20, 0.9)

    import matplotlib.pyplot as plt
    # Give the output an extension so imsave can infer the format
    plt.imsave("democounts.png", demo_counts.reshape((len(grid), len(grid))))
def test_trajectory_plotting():
    """Tests trajectory plotting"""
    from agents import MyopicAgent, OptimalAgent
    from gridworld.gridworld import GridworldMdp

    agent = OptimalAgent()
    mdp = GridworldMdp.generate_random(12, 12, pr_wall=0.1, pr_reward=0.1)
    agent.set_mdp(mdp)
    walls, reward, start = mdp.convert_to_numpy_input()

    myopic = MyopicAgent(horizon=10)
    _plot_reward_and_trajectories_helper(
        reward, reward, walls, start, myopic, OptimalAgent(), filename="trajectory.png"
    )
def compare_agents(self, name, agent1, agent2, places=7, print_mdp=False):
    print('Comparing {0} agents'.format(name))
    set_seeds(314159)
    mdp = GridworldMdp.generate_random_connected(16, 16, 5, 0.2)
    if print_mdp:
        print(mdp)
    env = Mdp(mdp)

    self.time(lambda: agent1.set_mdp(mdp), "Python planner")
    self.time(lambda: agent2.set_mdp(mdp), "Numpy/Tensorflow planner")

    for s in mdp.get_states():
        for a in mdp.get_actions(s):
            mu = agent1.extend_state_to_mu(s)
            qval1, qval2 = agent1.qvalue(mu, a), agent2.qvalue(mu, a)
            self.assertAlmostEqual(qval1, qval2, places=places)
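# Illustrative sketch (assumption, mirroring the helper above): compare_agents is
# meant to pair a slow Python planner with its Numpy/Tensorflow counterpart. The
# specific agent pairing and hyperparameters below are made up for demonstration.
def example_compare_optimal_agents(self):
    """Minimal sketch of driving compare_agents from a test case."""
    import agents
    import fast_agents
    self.compare_agents('Optimal',
                        agents.OptimalAgent(gamma=0.95, num_iters=20),
                        fast_agents.FastOptimalAgent(gamma=0.95, num_iters=20),
                        print_mdp=False)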
def test_irl(grid, agent):
    """Runs IRL on the given agent's policy and returns (walls, start_state,
    inferred reward, true reward)."""
    from gridworld.gridworld import GridworldMdp, Direction
    from utils import Distribution

    num_actions = len(Direction.ALL_DIRECTIONS)
    mdp = GridworldMdp(grid=grid)
    agent.set_mdp(mdp)

    def dist_to_numpy(dist):
        return dist.as_numpy_array(Direction.get_number_from_direction, num_actions)

    def action(state):
        # Walls are invalid states and the MDP will refuse to give an action for
        # them. However, the VIN's architecture requires it to provide an action
        # distribution for walls too, so hardcode it to always be STAY.
        x, y = state
        if mdp.walls[y][x]:
            return dist_to_numpy(Distribution({Direction.STAY: 1}))
        return dist_to_numpy(agent.get_action_distribution(state))

    imsize = len(grid)
    # I think it's this line that's wrong. Writing as (y, x) gives expected SVF vec
    action_dists = [[action((x, y)) for y in range(imsize)] for x in range(imsize)]
    action_dists = np.array(action_dists)

    walls, rewards, start_state = mdp.convert_to_numpy_input()
    # print("Start state for given mdp:", start_state)

    inferred = irl_wrapper(walls, action_dists, start_state, 20, 0.9)
    # print("---true below---")
    # print(rewards)
    return walls, start_state, inferred, rewards
def evaluate_proxy(walls, start_state, proxy_reward, true_reward, gamma=0.9, episode_length=float("inf")):
    """Runs an agent that optimizes the proxy reward in the true environment for
    one episode, and compares the true reward it collects against that of an
    agent optimizing the true reward.

    walls: Numpy array of walls, where each entry is 1 or 0
    start_state: Starting state for the agent
    proxy_reward: Numpy array of reward values
    true_reward: Numpy array of reward values

    Creates a proxy mdp by overlaying walls onto the proxy reward grid. True
    reward is summed if the reward grid's entry at the given state can be cast
    to a float.

    Returns (true reward earned by the proxy agent) / (true reward earned by the
    true agent), which is related to regret (a ratio of 1 means zero regret).
    """
    proxy_mdp = GridworldMdp.from_numpy_input(walls, proxy_reward, start_state)
    true_mdp = GridworldMdp.from_numpy_input(walls, true_reward, start_state)
    env = Mdp(true_mdp)

    proxy_agent = FastOptimalAgent()
    proxy_agent.set_mdp(true_mdp, proxy_mdp)
    proxy_trajectory = run_agent(proxy_agent, env, episode_length)
    reward_from_proxy_agent = get_reward_from_trajectory(proxy_trajectory, gamma)

    true_agent = FastOptimalAgent()
    true_agent.set_mdp(true_mdp)
    true_trajectory = run_agent(true_agent, env, episode_length)
    reward_from_true_agent = get_reward_from_trajectory(true_trajectory, gamma)

    if reward_from_true_agent == 0:
        # TODO(rohinmshah): Figure out why this can happen, and come up with a
        # better solution than this hack
        return (1.0 + reward_from_proxy_agent) / (1.0 + reward_from_true_agent)
    return float(reward_from_proxy_agent) / reward_from_true_agent
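# Illustrative sketch (assumption): evaluate a proxy reward that keeps only the
# largest true-reward entries, on a small random grid. All functions are from this
# module; the grid size, number of rewards, and the particular proxy edit are made
# up for demonstration.
def _example_evaluate_proxy():
    """Minimal sketch of calling evaluate_proxy."""
    mdp = GridworldMdp.generate_random_connected(8, 8, 3, noise=0)
    walls, true_reward, start = mdp.convert_to_numpy_input()
    # Proxy that zeroes out everything except the maximal reward entries
    proxy_reward = np.where(true_reward == true_reward.max(), true_reward, 0)
    ratio = evaluate_proxy(walls, start, proxy_reward, true_reward,
                           gamma=0.9, episode_length=20)
    print('proxy/true reward ratio:', ratio)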
def main():
    grid = [['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X'],
            ['X', ' ', -90, -90, -90, -90, '8', ' ', 'X'],
            ['X', 'A', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', ' ', ' ', -99, '2', ' ', ' ', ' ', 'X'],
            ['X', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', ' ', '1', ' ', ' ', ' ', ' ', ' ', 'X'],
            ['X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X']]
    mdp = GridworldMdp(grid, living_reward=-0.01, noise=0.2)
    env = Mdp(mdp)

    opt = fast_agents.FastOptimalAgent(gamma=0.95, num_iters=20)
    naive = NaiveTimeDiscountingAgent(10, 1, gamma=0.95, num_iters=20)
    soph = fast_agents.FastSophisticatedTimeDiscountingAgent(10, 1, gamma=0.95, num_iters=20)
    myopic = fast_agents.FastMyopicAgent(6, gamma=0.95, num_iters=20)
    over = fast_agents.FastUncalibratedAgent(gamma=0.95, num_iters=20, calibration_factor=5)
    under = fast_agents.FastUncalibratedAgent(gamma=0.95, num_iters=20, calibration_factor=0.5)

    agents = [opt, naive, soph, myopic, over, under]
    names = [
        'Optimal', 'Naive', 'Sophisticated', 'Myopic', 'Overconfident', 'Underconfident'
    ]
    for name, agent in zip(names, agents):
        print('{} agent'.format(name))
        agent.set_mdp(mdp)
        trajectory = run_agent(agent, env, episode_length=50, determinism=True)
        if agent == naive:
            print([a for _, a, _, _ in trajectory])
        print_training_example(mdp, trajectory)
    print(opt.values.T)
def plot_trajectory(
    wall,
    reward,
    start,
    agent,
    fig,
    ax,
    arrow_width=0.5,
    EPISODE_LENGTH=35,
    animate=False,
    fname=None,
):
    """Simulates a rollout of an agent in the MDP specified by the wall, reward,
    and start state, and plots the resulting trajectory.

    If animate is True, an animation object will be returned.
    """
    from agent_runner import run_agent
    from gridworld.gridworld import GridworldMdp
    from mdp_interface import Mdp

    mdp = GridworldMdp.from_numpy_input(wall, reward, start)
    agent.set_mdp(mdp)
    env = Mdp(mdp)
    trajectory = run_agent(agent, env, episode_length=EPISODE_LENGTH, determinism=True)

    if len(trajectory) <= 1:
        raise ValueError("Trajectory rolled out unsuccessfully")

    # Tuples of (state, next) - to be used for plotting
    state_trans = [(info[0], info[2]) for info in trajectory]

    count = 0
    for trans in state_trans:
        if trans[0] == trans[1]:
            count += 1
    if count == len(state_trans):
        print(
            "Yes, the agent given stayed in the same spot for {} iterations...".format(
                len(state_trans)
            )
        )

    if fig is None or ax is None:
        fig, ax = plt.subplots(1, 1)
    if ax is not None and type(ax) is list:
        raise ValueError("Given {} axes, but can only use 1 axis".format(len(ax)))

    # Plot starting point
    plot_pos(start, ax=ax, color="k", marker="o", grid_size=len(wall))
    # Plot ending trajectory point
    finish = state_trans[-1][0]
    plot_pos(finish, ax=ax, color="k", marker="*", grid_size=len(wall))
    plot_lines(
        ax,
        fig,
        trans_list=state_trans,
        color="black",
        arrow_width=arrow_width,
        grid_size=len(wall),
        animate=animate,
        fname=fname,
    )
    ax.set_xticks([])
    ax.set_yticks([])
    return fig, ax
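# Illustrative sketch (assumption): roll out and plot a single optimal-agent
# trajectory on a small random gridworld. Everything here uses functions already
# in this repo; the grid parameters and output filename are arbitrary.
def _example_plot_trajectory():
    """Minimal sketch of calling plot_trajectory."""
    from agents import OptimalAgent
    from gridworld.gridworld import GridworldMdp
    mdp = GridworldMdp.generate_random_connected(8, 8, 3, noise=0)
    walls, reward, start = mdp.convert_to_numpy_input()
    fig, ax = plt.subplots(1, 1)
    plot_trajectory(walls, reward, start, OptimalAgent(), fig=fig, ax=ax)
    fig.savefig('TrajectoryExample', bbox_inches='tight')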
def test_coherence(grid, agent):
    """Test that these arrays perform as expected under np.einsum"""
    from gridworld.gridworld import GridworldMdp, Direction
    from utils import Distribution

    num_actions = len(Direction.ALL_DIRECTIONS)
    mdp = GridworldMdp(grid=grid)
    agent.set_mdp(mdp)

    def dist_to_numpy(dist):
        return dist.as_numpy_array(Direction.get_number_from_direction, num_actions)

    def action(state):
        # Walls are invalid states and the MDP will refuse to give an action for
        # them. However, the VIN's architecture requires it to provide an action
        # distribution for walls too, so hardcode it to always be STAY.
        x, y = state
        if mdp.walls[y][x]:
            return dist_to_numpy(Distribution({Direction.STAY: 1}))
        return dist_to_numpy(agent.get_action_distribution(state))

    imsize = len(grid)
    action_dists = [[action((x, y)) for y in range(imsize)] for x in range(imsize)]
    action_dists = np.array(action_dists)

    walls, rewards, start_state = mdp.convert_to_numpy_input()
    print("Start state for given mdp:", start_state)
    # inferred = _irl_wrapper(walls, action_dists, start_state, 20, 1.0)
    start = start_state

    trans = mdp.get_transition_matrix()
    initial_states = np.zeros((len(grid), len(grid)))
    initial_states[start[1]][start[0]] = 1
    initial_states = initial_states.reshape(-1)
    policy = flatten_policy(action_dists)

    gshape = (len(grid), len(grid))
    print("initial states")
    print('-' * 20)
    print(initial_states.reshape(gshape))

    next_states = np.einsum("i,ij,ijk -> k", initial_states, policy, trans)
    # next_states = (next_states.reshape(gshape).T).reshape(-1)
    print("first expected counts")
    print('-' * 20)
    print(next_states.reshape(gshape))

    next_states = np.einsum("i,ij,ijk -> k", next_states, policy, trans)
    print("second expected counts")
    print('-' * 20)
    print(next_states.reshape(gshape))

    next_states = np.einsum("i,ij,ijk -> k", next_states, policy, trans)
    # next_states = (next_states.reshape(gshape).T).reshape(-1)
    print("third expected counts")
    print('-' * 20)
    print(next_states.reshape(gshape))

    # for i in range(5):
    #     next_states = np.einsum("i,ij,ijk -> k", next_states, policy, trans)
    #     # next_states = (next_states.reshape(gshape).T).reshape(-1)
    #     print("{}th expected counts".format(4 + i))
    #     print('-' * 20)
    #     print(next_states.reshape(gshape))

    return next_states.reshape((len(grid), len(grid)))
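# Small self-contained sketch (for reference, not part of the original tests): the
# contraction used above, np.einsum("i,ij,ijk -> k", d, policy, trans), is the
# one-step occupancy update d'_k = sum_{i,j} d_i * policy[i, j] * trans[i, j, k].
# The toy numbers below are made up purely to illustrate the contraction.
def _example_einsum_step():
    """Check the einsum contraction against explicit loops on a tiny example."""
    num_states, num_actions = 3, 2
    d = np.array([1.0, 0.0, 0.0])                              # current state distribution
    policy = np.array([[0.5, 0.5], [1.0, 0.0], [0.0, 1.0]])    # pi(a | s)
    trans = np.zeros((num_states, num_actions, num_states))    # T(s' | s, a)
    trans[0, 0, 1] = trans[0, 1, 2] = 1.0
    trans[1, :, 1] = trans[2, :, 2] = 1.0

    fast = np.einsum("i,ij,ijk -> k", d, policy, trans)
    slow = np.zeros(num_states)
    for i in range(num_states):
        for j in range(num_actions):
            for k in range(num_states):
                slow[k] += d[i] * policy[i, j] * trans[i, j, k]

    assert np.allclose(fast, slow)
    print(fast)  # [0.  0.5 0.5]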
def optimal_agent_test(self, agent):
    grid = ['XXXXXXXXX',
            'X9X6XA  X',
            'X X X XXX',
            'X      2X',
            'XXXXXXXXX']
    n, s, e, w, stay = self.all_actions
    mdp = GridworldMdp(grid, living_reward=-0.1)
    env = Mdp(mdp)
    agent.set_mdp(mdp)
    start_state = mdp.get_start_state()

    # Action distribution
    action_dist = agent.get_action_distribution(start_state)
    self.assertEqual(action_dist, Distribution({s: 1}))

    # Trajectory
    actions, _ = self.run_on_env(agent, env, gamma=0.95, episode_length=10)
    self.assertEqual(actions, [s, s, w, w, w, w, n, n, stay, stay])

    # Same thing, but with a bigger discount
    mdp = GridworldMdp(grid, living_reward=-0.001)
    env = Mdp(mdp)
    agent = agents.OptimalAgent(gamma=0.5, num_iters=20)
    agent.set_mdp(mdp)
    start_state = mdp.get_start_state()

    # Values
    # Inaccurate because I ignore living reward and we only use 20
    # iterations of value iteration, so only check to 2 places
    self.assertAlmostEqual(agent.value(start_state), 0.25, places=2)

    # Action distribution
    action_dist = agent.get_action_distribution(start_state)
    self.assertEqual(action_dist, Distribution({s: 1}))

    # Trajectory
    actions, reward = self.run_on_env(agent, env, gamma=0.5, episode_length=10)
    # Again approximate comparison since we don't consider living rewards
    self.assertAlmostEqual(reward, (4 - 0.0625) / 16, places=2)
    self.assertEqual(actions, [s, s, e, e, stay, stay, stay, stay, stay, stay])

    # Same thing, but with Boltzmann rationality
    agent = agents.OptimalAgent(beta=1, gamma=0.5, num_iters=20)
    agent.set_mdp(mdp)

    # Action distribution
    dist = agent.get_action_distribution(start_state).get_dict()
    nprob, sprob, eprob, wprob = dist[n], dist[s], dist[e], dist[w]
    for p in [nprob, sprob, eprob, wprob]:
        self.assertTrue(0 < p < 1)
    self.assertEqual(nprob, wprob)
    self.assertTrue(sprob > nprob)
    self.assertTrue(nprob > eprob)

    middle_state = (2, 3)
    dist = agent.get_action_distribution(middle_state).get_dict()
    nprob, sprob, eprob, wprob = dist[n], dist[s], dist[e], dist[w]
    for p in [nprob, sprob, eprob, wprob]:
        self.assertTrue(0 < p < 1)
    self.assertEqual(nprob, sprob)
    self.assertTrue(wprob > eprob)
    self.assertTrue(eprob > nprob)
import numpy as np
import tensorflow as tf

from model import tf_value_iter_no_config
from agents import OptimalAgent
from gridworld.gridworld import GridworldMdp

sess = tf.InteractiveSession()

walls = [[1, 1, 1, 1, 1],
         [1, 0, 0, 0, 1],
         [1, 0, 0, 0, 1],
         [1, 0, 0, 0, 1],
         [1, 1, 1, 1, 1]]
reward = [[0, 0, 0, 0, 0],
          [0, 0, 0, 1, 0],
          [0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0],
          [0, 0, 0, 0, 0]]
walls = np.array(walls)
reward = np.array(reward)
agent_start = (1, 3)

mdp = GridworldMdp.from_numpy_input(walls.astype(np.float32),
                                    reward.astype(np.float32),
                                    start_state=agent_start)

imsize = walls.shape[0]
discount = 0.9
num_iters = 50


def test_model(wall_tf, reward_tf, alg):
    return alg(wall_tf, reward_tf)


def tf_value_iter_model(wall_tf, reward_tf):
    a = tf.reshape(wall_tf, [1, imsize, imsize])
    b = tf.reshape(reward_tf, [1, imsize, imsize])
    X = tf.stack([a, b], axis=-1)
    qvals = tf_value_iter_no_config(X,
    return tf.app.flags.FLAGS


if __name__ == '__main__':
    # get flags || Data
    config = init_birl_flags()
    if config.datafile is None:
        print('--datafile option is required')
        exit()

    # seed random generators
    set_seeds(config.seed)

    imagetest, rewardtest, ytest = load_dataset(config.datafile)[-3:]
    for image, reward, policy in zip(imagetest, rewardtest, ytest):
        mdp = GridworldMdp.from_numpy_input(image, reward)
        mdp = GridworldMdpLearnableR.from_full_mdp(mdp)
        inferred_reward = birl(mdp, policy, config.beta,
                               num_burn_in=config.num_burn_in,
                               num_samples=config.num_samples,
                               display_step=config.display_step)

        print('The first set of walls is:')
        print(image)
        print('The first reward should be:')
        print(reward)

        inferred_reward = inferred_reward / inferred_reward.max()
        inferred_reward = np.reshape(inferred_reward, image.shape)
        print('The inferred reward is:')