def make_mdp(mdp_class="grid", grid_dim=7):
    '''
    Returns:
        (MDP)
    '''
    # Grid/Hallway stuff.
    width, height = grid_dim, grid_dim
    upworld_goal_locs = [(i, width) for i in range(1, height + 1)]

    four_room_goal_locs = [(width, height)]  # , (width, 1), (1, height)] # (1, height - 2), (width - 2, height - 2), (width - 1, height - 1), (width - 2, 1)]
    four_room_goal_loc = four_room_goal_locs[0]

    # Taxi stuff.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    passengers = [{"x": grid_dim // 2, "y": grid_dim // 2, "dest_x": grid_dim - 2, "dest_y": 2, "in_taxi": 0}]
    walls = []

    # Trench stuff.
    tr_agent = {"x": 1, "y": 1, "dx": 1, "dy": 0, "dest_x": grid_dim, "dest_y": grid_dim, "has_block": 0}
    blocks = [{"x": grid_dim, "y": 1}]
    lavas = [{"x": x, "y": y} for x, y in map(lambda z: (z + 1, (grid_dim + 1) // 2), range(grid_dim))]

    # Do grids separately to avoid making error-prone domains.
    if mdp_class == "four_room":
        mdp = FourRoomMDP(width=width, height=height, goal_locs=[four_room_goal_loc])
    else:
        mdp = {"upworld": GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=upworld_goal_locs),
               "chain": ChainMDP(num_states=grid_dim),
               "random": RandomMDP(num_states=50, num_rand_trans=2),
               "hanoi": HanoiMDP(num_pegs=grid_dim, num_discs=3),
               "taxi": TaxiOOMDP(width=grid_dim, height=grid_dim, agent=agent, walls=walls, passengers=passengers),
               "trench": TrenchOOMDP(width=grid_dim, height=3, agent=tr_agent, blocks=blocks, lavas=lavas)}[mdp_class]

    return mdp
def choose_mdp(mdp_name, env_name="Asteroids-v0"):
    '''
    Args:
        mdp_name (str): one of {gym, grid, chain, taxi, ...}
        env_name (str): gym environment name, like 'CartPole-v0'

    Returns:
        (MDP)
    '''
    # Other imports.
    from simple_rl.tasks import ChainMDP, GridWorldMDP, FourRoomMDP, TaxiOOMDP, RandomMDP, PrisonersDilemmaMDP, RockPaperScissorsMDP, GridGameMDP

    # Taxi MDP.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    passengers = [{"x": 4, "y": 3, "dest_x": 2, "dest_y": 2, "in_taxi": 0}]
    walls = []

    if mdp_name == "gym":
        # OpenAI Gym MDP.
        try:
            from simple_rl.tasks.gym.GymMDPClass import GymMDP
        except ImportError:
            raise ValueError("(simple_rl) Error: OpenAI gym not installed.")
        return GymMDP(env_name, render=True)
    else:
        return {"grid": GridWorldMDP(5, 5, (1, 1), goal_locs=[(5, 3), (4, 1)]),
                "four_room": FourRoomMDP(),
                "chain": ChainMDP(5),
                "taxi": TaxiOOMDP(10, 10, slip_prob=0.0, agent=agent, walls=walls, passengers=passengers),
                "random": RandomMDP(num_states=40, num_rand_trans=20),
                "prison": PrisonersDilemmaMDP(),
                "rps": RockPaperScissorsMDP(),
                "grid_game": GridGameMDP(),
                # NOTE: the duplicate 0.5 keys collapse to one entry; only the second RandomMDP survives.
                "multi": {0.5: RandomMDP(num_states=40, num_rand_trans=20), 0.5: RandomMDP(num_states=40, num_rand_trans=5)}}[mdp_name]
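# Usage sketch (not part of the original source): pick an MDP by name and inspect its
# action set. The "gym" branch additionally requires OpenAI Gym to be installed.
example_mdp = choose_mdp("grid")
print(example_mdp.get_actions())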
def make_mdp(mdp_class="grid", grid_dim=7):
    '''
    Returns:
        (MDP)
    '''
    # Grid/Hallway stuff.
    width, height = grid_dim, grid_dim
    hall_goal_locs = [(i, width) for i in range(1, height + 1)]

    four_room_goal_locs = [(width, height), (width, 1), (1, height), (1, height - 2), (width - 2, height - 2), (width - 2, 1)]
    four_room_goal_loc = four_room_goal_locs[5]

    # Taxi stuff.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    passengers = [{"x": grid_dim // 2, "y": grid_dim // 2, "dest_x": grid_dim - 2, "dest_y": 2, "in_taxi": 0}]
    walls = []

    mdp = {"hall": GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=hall_goal_locs),
           "pblocks_grid": make_grid_world_from_file("pblocks_grid.txt", randomize=True),
           "grid": GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=[(grid_dim, grid_dim)]),
           "four_room": FourRoomMDP(width=width, height=height, goal_locs=[four_room_goal_loc]),
           "chain": ChainMDP(num_states=grid_dim),
           "random": RandomMDP(num_states=50, num_rand_trans=2),
           "hanoi": HanoiMDP(num_pegs=grid_dim, num_discs=3),
           "taxi": TaxiOOMDP(width=grid_dim, height=grid_dim, slip_prob=0.0, agent=agent, walls=walls, passengers=passengers)}[mdp_class]

    return mdp
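# Usage sketch (not part of the original source): build an MDP with the factory above and
# run a Q-learner on it with simple_rl's standard experiment loop.
from simple_rl.agents import QLearningAgent
from simple_rl.run_experiments import run_agents_on_mdp

demo_mdp = make_mdp(mdp_class="four_room", grid_dim=9)
demo_agent = QLearningAgent(actions=demo_mdp.get_actions())
run_agents_on_mdp([demo_agent], demo_mdp, instances=3, episodes=100, steps=50)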
def main():
    # Make MDP.
    grid_dim = 11
    mdp = FourRoomMDP(width=grid_dim, height=grid_dim, init_loc=(1, 1), slip_prob=0.05, goal_locs=[(grid_dim, grid_dim)], gamma=0.99)

    # Experiment Type.
    exp_type = "learn_w_abstr"

    # For comparing policies and visualizing.
    beta = 1
    is_deterministic_ib = True
    is_agent_in_control = True

    # For main plotting experiment.
    beta_range = list(chart_utils.drange(0.0, 4.0, 1.0))
    instances = 1

    # Get demo policy.
    vi = ValueIteration(mdp)
    _, val = vi.run_vi()
    # Epsilon greedy policy.
    demo_policy = get_lambda_policy(make_det_policy_eps_greedy(vi.policy, vi.get_states(), mdp.get_actions(), epsilon=0.1))

    if exp_type == "plot_info_sa_val_and_num_states":
        # Makes the main two plots.
        make_info_sa_val_and_size_plots(mdp, demo_policy, beta_range, instances=instances, is_agent_in_control=is_agent_in_control)
    elif exp_type == "compare_policies":
        # Makes a plot comparing value of pi-phi combo from info_sa with \pi_d.
        info_sa_compare_policies(mdp, demo_policy, beta=beta, is_deterministic_ib=is_deterministic_ib, is_agent_in_control=is_agent_in_control)
    elif exp_type == "visualize_info_sa_abstr":
        # Visualize the state abstraction found by info_sa.
        info_sa_visualize_abstr(mdp, demo_policy, beta=beta, is_deterministic_ib=is_deterministic_ib, is_agent_in_control=is_agent_in_control)
    elif exp_type == "learn_w_abstr":
        # Run learning experiments for different settings of \beta.
        learn_w_abstr(mdp, demo_policy, is_deterministic_ib=is_deterministic_ib)
    elif exp_type == "planning":
        info_sa_planning_experiment()
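# The helpers make_det_policy_eps_greedy and get_lambda_policy are defined elsewhere in
# this project and are not shown above. A hypothetical sketch of their behavior follows;
# the names and return conventions here are assumptions, not the original implementation.
import random

def make_det_policy_eps_greedy_sketch(det_policy, states, actions, epsilon=0.1):
    # Map each state to an action distribution that puts 1 - epsilon on the
    # deterministic policy's action and spreads epsilon over the remaining actions.
    policy_pmf = {}
    for s in states:
        greedy_a = det_policy(s)
        policy_pmf[s] = {a: (1 - epsilon if a == greedy_a else epsilon / (len(actions) - 1)) for a in actions}
    return policy_pmf

def get_lambda_policy_sketch(policy_pmf):
    # Wrap the state -> {action: prob} table as a callable that samples an action.
    def policy(state):
        actions, probs = zip(*policy_pmf[state].items())
        return random.choices(actions, weights=probs)[0]
    return policy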
def main(open_plot=True):
    # Setup MDP, Agents.
    # mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)], gamma=0.95, walls=[(2, 2)])
    mdp = FourRoomMDP(width=11, height=11, init_loc=(1, 1), goal_locs=[(9, 3)], is_goal_terminal=True, slip_prob=0.2)
    # mdp = ComboLockMDP(combo=[3, 1, 2], num_actions=3, num_states=3)

    dq_agent = DoubleQAgent(actions=mdp.get_actions())
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=5, episodes=1, steps=10000, open_plot=open_plot)
def make_mdp(mdp_class="grid", state_size=7):
    '''
    Returns:
        (MDP)
    '''
    # Grid/Hallway stuff.
    width, height = state_size, state_size
    hall_goal_locs = [(i, width) for i in range(1, height + 1)]

    # Taxi stuff.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    passengers = [{"x": state_size // 2, "y": state_size // 2, "dest_x": state_size - 2, "dest_y": 2, "in_taxi": 0}]
    walls = []

    mdp = {"hall": GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=hall_goal_locs),
           "pblocks_grid": make_grid_world_from_file("pblocks_grid.txt", randomize=True),
           "grid": GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=[(state_size, state_size)]),
           "four_room": FourRoomMDP(width=width, height=height, goal_locs=[(width, height)]),
           "chain": ChainMDP(num_states=state_size),
           "random": RandomMDP(num_states=50, num_rand_trans=2),
           "taxi": TaxiOOMDP(width=state_size, height=state_size, slip_prob=0.0, agent=agent, walls=walls, passengers=passengers)}[mdp_class]

    return mdp
def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(9, 9, goal_locs=[(9, 9)], gamma=0.95)
    ql_agent = QLearnerAgent(mdp.get_actions())
    viz = parse_args()

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy.
        vi = ValueIteration(mdp)
        vi.run_vi()
        policy = vi.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
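# parse_args() is a small command-line helper used by the visualization scripts above
# but not shown here. A hypothetical sketch using argparse; the flag name and default
# are assumptions, not the original implementation.
import argparse

def parse_args_sketch():
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--viz", type=str, default="value",
                        help="One of {value, policy, agent, learning, interactive}.")
    args = parser.parse_args()
    return args.viz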
def make_mdp_distr(mdp_class="grid", num_mdps=15, gamma=0.99):
    '''
    Args:
        mdp_class (str): one of {"grid", "random"}
        num_mdps (int)

    Returns:
        (MDPDistribution)
    '''
    mdp_dist_dict = {}
    mdp_prob = 1.0 / num_mdps
    height, width = 10, 10

    # Make @num_mdps MDPs.
    for i in range(num_mdps):
        next_goals = rnd.sample([(1, 7), (7, 1), (7, 7), (6, 6), (6, 1), (1, 6)], 2)
        new_mdp = {"grid": GridWorldMDP(width=width, height=height, init_loc=(1, 1),
                                        goal_locs=rnd.sample(list(zip(range(1, width + 1), [height] * width)), 1),
                                        is_goal_terminal=True, gamma=gamma),
                   "four_room": FourRoomMDP(width=8, height=8, goal_locs=next_goals, gamma=gamma),
                   "chain": ChainMDP(num_states=10, reset_val=rnd.choice([0, 0.01, 0.05, 0.1]), gamma=gamma),
                   "random": RandomMDP(num_states=40, num_rand_trans=rnd.randint(1, 10), gamma=gamma)}[mdp_class]

        mdp_dist_dict[new_mdp] = mdp_prob

    return MDPDistribution(mdp_dist_dict)
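# Usage sketch (not part of the original source): build a distribution over four-room
# tasks and sample a single MDP from it. The MDPDistribution import path and its
# sample() method are assumed from simple_rl's API.
import random as rnd
from simple_rl.mdp import MDPDistribution  # assumed import path

four_room_distr = make_mdp_distr(mdp_class="four_room", num_mdps=5)
sampled_mdp = four_room_distr.sample()
print(sampled_mdp.get_actions())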
def make_mdp_distr(mdp_class="grid", grid_dim=9, horizon=0, step_cost=0, gamma=0.99):
    '''
    Args:
        mdp_class (str): one of {"grid", "random"}
        horizon (int)
        step_cost (float)
        gamma (float)

    Returns:
        (MDPDistribution)
    '''
    mdp_dist_dict = {}
    height, width = grid_dim, grid_dim

    # Define goal locations.

    # Corridor.
    corr_width = 20
    corr_goal_magnitude = 1  # random.randint(1, 5)
    corr_goal_cols = [i for i in range(1, corr_goal_magnitude + 1)] + [j for j in range(corr_width - corr_goal_magnitude + 1, corr_width + 1)]
    corr_goal_locs = list(itertools.product(corr_goal_cols, [1]))

    # Grid World.
    tl_grid_world_rows, tl_grid_world_cols = [i for i in range(width - 4, width)], [j for j in range(height - 4, height)]
    tl_grid_goal_locs = list(itertools.product(tl_grid_world_rows, tl_grid_world_cols))
    tr_grid_world_rows, tr_grid_world_cols = [i for i in range(1, 4)], [j for j in range(height - 4, height)]
    tr_grid_goal_locs = list(itertools.product(tr_grid_world_rows, tr_grid_world_cols))
    grid_goal_locs = tl_grid_goal_locs + tr_grid_goal_locs

    # Hallway.
    upworld_goal_locs = [(i, height) for i in range(1, 30)]

    # Four room.
    four_room_goal_locs = [(width, height), (width, 1), (1, height), (1, height - 2), (width - 2, height - 2), (width - 2, 1)]
    print(four_room_goal_locs)

    tight_four_room_goal_locs = [(width, height), (width, height - 1), (width - 1, height), (width, height - 2), (width - 2, height), (width - 1, height - 1)]

    # Taxi.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    walls = []

    goal_loc_dict = {"four_room": four_room_goal_locs,
                     "color": four_room_goal_locs,
                     "upworld": upworld_goal_locs,
                     "grid": grid_goal_locs,
                     "corridor": corr_goal_locs,
                     "tight_four_room": tight_four_room_goal_locs,
                     }

    # MDP Probability.
    num_mdps = 10 if mdp_class not in goal_loc_dict.keys() else len(goal_loc_dict[mdp_class])
    if mdp_class == "octo":
        num_mdps = 12
    mdp_prob = 1.0 / num_mdps

    for i in range(num_mdps):
        new_mdp = {"hrooms": make_grid_world_from_file("hierarch_rooms.txt", num_goals=7, randomize=False),
                   "octo": make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False, goal_num=i),
                   "upworld": GridWorldMDP(width=30, height=height, rand_init=False, goal_locs=goal_loc_dict["upworld"], name="upworld", is_goal_terminal=True),
                   "corridor": GridWorldMDP(width=20, height=1, init_loc=(10, 1), goal_locs=[goal_loc_dict["corridor"][i % len(goal_loc_dict["corridor"])]], is_goal_terminal=True, name="corridor"),
                   "grid": GridWorldMDP(width=width, height=height, rand_init=True, goal_locs=[goal_loc_dict["grid"][i % len(goal_loc_dict["grid"])]], is_goal_terminal=True),
                   "four_room": FourRoomMDP(width=width, height=height, goal_locs=[goal_loc_dict["four_room"][i % len(goal_loc_dict["four_room"])]], is_goal_terminal=True),
                   "color": ColorMDP(width=width, height=height, num_colors=4, goal_locs=[goal_loc_dict["four_room"][i % len(goal_loc_dict["four_room"])]], is_goal_terminal=True),
                   "tight_four_room": FourRoomMDP(width=width, height=height, goal_locs=[goal_loc_dict["tight_four_room"][i % len(goal_loc_dict["tight_four_room"])]], is_goal_terminal=True, name="tight_four_room")}[mdp_class]

        new_mdp.set_step_cost(step_cost)
        new_mdp.set_gamma(gamma)

        mdp_dist_dict[new_mdp] = mdp_prob

    return MDPDistribution(mdp_dist_dict, horizon=horizon)
def make_mdp_distr(mdp_class, is_goal_terminal, mdp_size=11, horizon=0, gamma=0.99):
    '''
    Args:
        mdp_class (str): one of {"grid", "random"}
        is_goal_terminal (bool)
        mdp_size (int)
        horizon (int)
        gamma (float)

    Returns:
        (MDPDistribution)
    '''
    mdp_dist_dict = {}
    height, width = mdp_size, mdp_size

    # Corridor.
    corr_width = 20
    corr_goal_magnitude = 1  # random.randint(1, 5)
    corr_goal_cols = [i for i in range(1, corr_goal_magnitude + 1)] + [j for j in range(corr_width - corr_goal_magnitude + 1, corr_width + 1)]
    corr_goal_locs = list(itertools.product(corr_goal_cols, [1]))

    # Grid World.
    tl_grid_world_rows, tl_grid_world_cols = [i for i in range(width - 4, width)], [j for j in range(height - 4, height)]
    tl_grid_goal_locs = list(itertools.product(tl_grid_world_rows, tl_grid_world_cols))
    tr_grid_world_rows, tr_grid_world_cols = [i for i in range(1, 4)], [j for j in range(height - 4, height)]
    tr_grid_goal_locs = list(itertools.product(tr_grid_world_rows, tr_grid_world_cols))
    grid_goal_locs = tl_grid_goal_locs + tr_grid_goal_locs

    # Four room.
    four_room_goal_locs = [(width, height), (width, 1), (1, height), (1, height - 2), (width - 2, height - 2), (width - 2, 1)]

    # SPREAD vs. TIGHT.
    spread_goal_locs = [(width, height), (width, 1), (1, height), (1, height - 2), (width - 2, height - 2), (width - 2, 1), (2, 2)]
    tight_goal_locs = [(width, height), (width - 1, height), (width, height - 1), (width, height - 2), (width - 2, height), (width - 1, height - 1), (width - 2, height - 2)]

    changing_entities = {"four_room": four_room_goal_locs,
                         "grid": grid_goal_locs,
                         "corridor": corr_goal_locs,
                         "spread": spread_goal_locs,
                         "tight": tight_goal_locs,
                         "chain": [0.0, 0.01, 0.1, 0.5, 1.0],
                         "combo_lock": [[3, 1, 2], [3, 2, 1], [2, 3, 1], [3, 3, 1]],
                         "walls": make_wall_permutations(mdp_size),
                         "lava": make_lava_permutations(mdp_size)
                         }

    # MDP Probability.
    num_mdps = 10 if mdp_class not in changing_entities.keys() else len(changing_entities[mdp_class])
    if mdp_class == "octo":
        num_mdps = 12
    mdp_prob = 1.0 / num_mdps

    for i in range(num_mdps):
        new_mdp = {"chain": ChainMDP(reset_val=changing_entities["chain"][i % len(changing_entities["chain"])]),
                   # "lava": GridWorldMDP(width=width, height=height, rand_init=False, step_cost=-0.001, lava_cost=0.0, lava_locs=changing_entities["lava"][i % len(changing_entities["lava"])], goal_locs=[(mdp_size - 3, mdp_size - 3)], is_goal_terminal=is_goal_terminal, name="lava_world", slip_prob=0.1),
                   "four_room": FourRoomMDP(width=width, height=height, goal_locs=[changing_entities["four_room"][i % len(changing_entities["four_room"])]], is_goal_terminal=is_goal_terminal),
                   # "octo": make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False, goal_num=i),
                   "corridor": GridWorldMDP(width=20, height=1, init_loc=(10, 1), goal_locs=[changing_entities["corridor"][i % len(changing_entities["corridor"])]], is_goal_terminal=is_goal_terminal, name="corridor"),
                   "combo_lock": ComboLockMDP(combo=changing_entities["combo_lock"][i % len(changing_entities["combo_lock"])]),
                   "spread": GridWorldMDP(width=width, height=height, rand_init=False, goal_locs=[changing_entities["spread"][i % len(changing_entities["spread"])]], is_goal_terminal=is_goal_terminal, name="spread_grid"),
                   "tight": GridWorldMDP(width=width, height=height, rand_init=False, goal_locs=[changing_entities["tight"][i % len(changing_entities["tight"])]], is_goal_terminal=is_goal_terminal, name="tight_grid"),
                   }[mdp_class]

        new_mdp.set_gamma(gamma)
        mdp_dist_dict[new_mdp] = mdp_prob

    return MDPDistribution(mdp_dist_dict, horizon=horizon)
def branching_factor_experiment(min_options=0, max_options=20, increment=2, instances=5, epsilon=0.05):
    '''
    Args:
        min_options (int)
        max_options (int)
        increment (int)

    Summary:
        Runs an experiment contrasting learning performance for different # options.
    '''
    # Define MDP.
    grid_size = 7
    mdp = FourRoomMDP(width=grid_size, height=grid_size, goal_locs=[(grid_size, grid_size)])

    # Make State Abstraction.
    states, _ = ah.compute_reachable_state_space(mdp, sample_rate=50)
    state_abstr = core.compute_phi_given_m(mdp, four_rooms_predicate_9x9, level=1, states=states)

    x_axis = range(min_options, max_options + 1, increment)
    y_axis = defaultdict(list)  # [] # [0] * len(x_axis)
    conf_intervals = defaultdict(list)
    num_options_performance = defaultdict(lambda: defaultdict(list))

    # Choose dependent variable (either #steps per episode or #episodes).
    d_var_range = [(20, 5), (40, 250), (400, 2500)]

    for steps, episodes in d_var_range:
        print("steps, episodes", steps, episodes)

        # Evaluate.
        for i, instance in enumerate(range(instances)):
            print("\tInstance", instance + 1, "of", str(instances) + ".")

            # Make initial Options.
            for num_options in x_axis:
                options, _ = make_near_optimal_phi_relative_options(mdp, state_abstr, 'eps-greedy', num_rand_opts=num_options - 1, eps=epsilon)
                action_abstr = ActionAbstraction(options=options, prim_actions=mdp.get_actions())

                # Make agent.
                AgentClass = RMaxAgent  # DoubleQAgent, QLearningAgent, SarsaAgent
                sa_aa_agent = AbstractionWrapper(AgentClass, agent_params={"actions": mdp.get_actions()}, state_abstr=state_abstr, action_abstr=action_abstr, name_ext="-$\\phi,O$")

                _, _, value_per_episode = run_single_agent_on_mdp(sa_aa_agent, mdp, episodes=episodes, steps=steps)
                mdp.reset()

                num_options_performance[(steps, episodes)][num_options].append(value_per_episode[-1])

    ############
    # Other types.

    # Just state abstraction.
    steps, episodes = d_var_range[-1][0], d_var_range[-1][1]
    sa_agent = AbstractionWrapper(AgentClass, agent_params={"actions": mdp.get_actions()}, state_abstr=state_abstr, action_abstr=None, name_ext="-$\\phi$")
    _, _, value_per_episode = run_single_agent_on_mdp(sa_agent, mdp, episodes=episodes, steps=steps)
    num_options_performance[(d_var_range[-1][0], d_var_range[-1][1])]["phi"].append(value_per_episode[-1])
    y_axis["phi"] = [value_per_episode[-1]]

    # Run random options.
    options = make_fixed_random_options(mdp, state_abstr)
    action_abstr = ActionAbstraction(options=options, prim_actions=mdp.get_actions())
    AgentClass = QLearningAgent
    rand_opt_agent = AbstractionWrapper(AgentClass, agent_params={"actions": mdp.get_actions()}, state_abstr=state_abstr, action_abstr=action_abstr, name_ext="-$\\phi,O_{\\text{random}}$")
    _, _, value_per_episode = run_single_agent_on_mdp(rand_opt_agent, mdp, episodes=episodes, steps=steps)
    num_options_performance[(d_var_range[-1][0], d_var_range[-1][1])]["random"].append(value_per_episode[-1])
    y_axis["random"] = [value_per_episode[-1]]

    # Make optimal agent.
    value_iter = ValueIteration(mdp)
    value_iter.run_vi()
    optimal_agent = FixedPolicyAgent(value_iter.policy)
    _, _, value_per_episode = run_single_agent_on_mdp(optimal_agent, mdp, episodes=episodes, steps=steps)
    y_axis["optimal"] = [value_per_episode[-1]]

    total_steps = d_var_range[0][0] * d_var_range[0][1]

    # Confidence intervals.
    for dependent_var in d_var_range:
        for num_options in x_axis:
            # Compute mean and standard error.
            avg_for_n = float(sum(num_options_performance[dependent_var][num_options])) / instances
            std_deviation = np.std(num_options_performance[dependent_var][num_options])
            std_error = 1.96 * (std_deviation / math.sqrt(len(num_options_performance[dependent_var][num_options])))
            y_axis[dependent_var].append(avg_for_n)
            conf_intervals[dependent_var].append(std_error)

    plt.xlabel("$|O_\\phi|$")
    plt.xlim([1, len(x_axis)])
    plt.ylabel("$V^{\\hat{\\pi}_{O_\\phi}}(s_0)$")
    plt.tight_layout()  # Keeps the spacing nice.

    # Add just state abstraction.
    ep_val_del_q_phi = y_axis["phi"]
    label = "$O_{\\phi}$"  # " N=1e" + str(str(total_steps).count("0")) + "$"
    plt.plot(x_axis, [ep_val_del_q_phi] * len(x_axis), marker="+", linestyle="--", linewidth=1.0, color=PLOT_COLORS[-1], label=label)

    # Add random options.
    ep_val_del_q = y_axis["random"]
    label = "$O_{random}$"  # " N=1e" + str(str(total_steps).count("0")) + "$"
    plt.plot(x_axis, [ep_val_del_q] * len(x_axis), marker="x", linestyle="--", linewidth=1.0, color=PLOT_COLORS[0])  # , label=label)

    # Add optimal.
    ep_val_optimal = y_axis["optimal"]
    plt.plot(x_axis, [ep_val_optimal] * len(x_axis), linestyle="-", linewidth=1.0, color=PLOT_COLORS[1])  # , label="$\\pi^*$")

    for i, dependent_var in enumerate(d_var_range):
        total_steps = dependent_var[0] * dependent_var[1]
        label = "$O_{\\phi,Q_\\varepsilon^*}, N=1e" + str(str(total_steps).count("0")) + "$"
        plt.plot(x_axis, y_axis[dependent_var], marker="x", color=PLOT_COLORS[i + 2], linewidth=1.5, label=label)

        # Confidence intervals.
        top = np.add(y_axis[dependent_var], conf_intervals[dependent_var])
        bot = np.subtract(y_axis[dependent_var], conf_intervals[dependent_var])
        plt.fill_between(x_axis, top, bot, alpha=0.25, color=PLOT_COLORS[i + 2])

    plt.legend()
    plt.savefig("branching_factor_results.pdf", format="pdf")
    plt.cla()
    plt.close()
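# The confidence-interval arithmetic above (mean plus/minus 1.96 standard errors) as a
# standalone helper, for clarity. This is a sketch, not part of the original code.
import math
import numpy as np

def mean_and_95_conf_interval(samples):
    # Returns (mean, half-width of the 95% confidence interval).
    mean = float(sum(samples)) / len(samples)
    std_error = np.std(samples) / math.sqrt(len(samples))
    return mean, 1.96 * std_error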
def info_sa_planning_experiment(min_grid_size=5, max_grid_size=11, beta=10.0):
    '''
    Args:
        min_grid_size (int)
        max_grid_size (int)
        beta (float): Hyperparameter for InfoSA.

    Summary:
        Writes num iterations and time (seconds) for planning with and without abstractions.
    '''
    vanilla_file = "vi.csv"
    sa_file = "vi-$\\phi$.csv"
    file_prefix = os.path.join("results", "planning-four_room")

    clear_files(dir_name=file_prefix)

    for grid_dim in range(min_grid_size, max_grid_size + 1):

        # ======================
        # == Make Environment ==
        # ======================
        mdp = FourRoomMDP(width=grid_dim, height=grid_dim, init_loc=(1, 1), goal_locs=[(grid_dim, grid_dim)], gamma=0.9)

        # Get demo policy.
        vi = ValueIteration(mdp)
        vi.run_vi()
        demo_policy = get_lambda_policy(make_det_policy_eps_greedy(vi.policy, vi.get_states(), mdp.get_actions(), epsilon=0.2))

        # =======================
        # == Make Abstractions ==
        # =======================
        pmf_s_phi, phi_pmf, abstr_policy = run_info_sa(mdp, demo_policy, iters=500, beta=beta, convergence_threshold=0.00001)
        lambda_abstr_policy = get_lambda_policy(abstr_policy)
        prob_s_phi = ProbStateAbstraction(phi_pmf)
        crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)

        # ============
        # == Run VI ==
        # ============
        vanilla_vi = ValueIteration(mdp, delta=0.0001, sample_rate=25)
        sa_vi = AbstractValueIteration(ground_mdp=mdp, state_abstr=crisp_s_phi, delta=0.0001, vi_sample_rate=25, amdp_sample_rate=25)

        # ==========
        # == Plan ==
        # ==========
        print("Running VIs.")
        start_time = time.time()  # time.clock() was removed in Python 3.8.
        vanilla_iters, vanilla_val = vanilla_vi.run_vi()
        vanilla_time = round(time.time() - start_time, 2)

        mdp.reset()

        start_time = time.time()
        sa_iters, sa_abs_val = sa_vi.run_vi()
        sa_time = round(time.time() - start_time, 2)
        sa_val = evaluate_agent(FixedPolicyAgent(sa_vi.policy), mdp, instances=25)

        print("\n" + "*" * 20)
        print("Vanilla", "\n\t Iters:", vanilla_iters, "\n\t Value:", round(vanilla_val, 4), "\n\t Time:", vanilla_time)
        print()
        print("Phi:", "\n\t Iters:", sa_iters, "\n\t Value:", round(sa_val, 4), "\n\t Time:", sa_time)
        print("*" * 20 + "\n\n")

        write_datum(os.path.join(file_prefix, "iters", vanilla_file), vanilla_iters)
        write_datum(os.path.join(file_prefix, "iters", sa_file), sa_iters)
        write_datum(os.path.join(file_prefix, "times", vanilla_file), vanilla_time)
        write_datum(os.path.join(file_prefix, "times", sa_file), sa_time)
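# clear_files and write_datum are small file-handling helpers used above but not shown
# here. A hypothetical sketch of their behavior; the bodies are assumptions, not the
# original implementation.
import os

def clear_files_sketch(dir_name):
    # Remove any previous result files under dir_name.
    for sub in ("iters", "times"):
        sub_dir = os.path.join(dir_name, sub)
        os.makedirs(sub_dir, exist_ok=True)
        for f in os.listdir(sub_dir):
            os.remove(os.path.join(sub_dir, f))

def write_datum_sketch(file_name, datum):
    # Append a single datum to a CSV file.
    with open(file_name, "a") as f:
        f.write(str(datum) + ",")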
def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(11, 11, goal_locs=[(11, 11)], gamma=0.9, step_cost=0.0)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.4)
    viz = parse_args()

    # Choose viz type.
    viz = "learning"

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy.
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
    elif viz == "interactive":
        mdp.visualize_interaction()
def run_learning_experiment():
    """
    Summary:
        Builds different sets of options and contrasts how RL algorithms perform when learning with them.
    """
    # Define MDP.
    width, height = 11, 11
    mdp = FourRoomMDP(width=width, height=height, goal_locs=[(width, height)], slip_prob=0.05)
    actions = mdp.get_actions()

    # Make State Abstraction.
    states, _ = ah.compute_reachable_state_space(mdp, sample_rate=50)
    if isinstance(mdp, FourRoomMDP):
        predicate = four_rooms_predicate_11x11
    else:
        predicate = reachable_in_n_steps_predicate
    state_abstr = core.compute_phi_given_m(mdp, predicate, level=1, states=states)

    # Make initial Options.
    num_rand_opts_to_add = 2
    options, _ = make_near_optimal_phi_relative_options(mdp, state_abstr, 'eps-greedy', num_rand_opts=num_rand_opts_to_add, eps=0.05)
    action_abstr = ActionAbstraction(options=options, prim_actions=actions)
    action_abstr_w_prims = ActionAbstraction(options=options, prim_actions=actions, incl_primitives=True)

    # Find eigen options.
    # num_eigen_options = max(1, num_rand_opts_to_add - 1)
    # eigen_options_init_all = find_eigenoptions(mdp, num_options=num_eigen_options, init_everywhere=True)
    # eigen_options_w_prims = find_eigenoptions(mdp, num_options=num_eigen_options, init_everywhere=False)
    # eigen_aa_init_all = ActionAbstraction(options=eigen_options_init_all, prim_actions=actions, incl_primitives=False)
    # eigen_aa_w_prims = ActionAbstraction(options=eigen_options_w_prims, prim_actions=actions, incl_primitives=True)

    # Make agent.
    AgentClass = QLearningAgent  # QLearningAgent, DoubleQAgent, DelayedQAgent
    ql_agent = AgentClass(mdp.get_actions())
    sa_aa_agent = AbstractionWrapper(AgentClass, agent_params={"actions": actions}, state_abstr=state_abstr, action_abstr=action_abstr_w_prims, name_ext="-$\\phi,O$")
    aa_agent = AbstractionWrapper(AgentClass, agent_params={"actions": actions}, state_abstr=None, action_abstr=action_abstr_w_prims, name_ext="-$O$")
    # aa_agent = AbstractionWrapper(AgentClass, agent_params={"actions": actions}, state_abstr=None, action_abstr=action_abstr_w_prims, name_ext="-$\\phi$")

    # Eigen agents.
    # eigen_agent_init_all = AbstractionWrapper(AgentClass, agent_params={"actions": actions}, state_abstr=None, action_abstr=eigen_aa_init_all, name_ext="-eigen_all")
    # eigen_agent_w_prims = AbstractionWrapper(AgentClass, agent_params={"actions": actions}, state_abstr=None, action_abstr=eigen_aa_w_prims, name_ext="-eigen_w_prims")

    agents = [ql_agent, aa_agent, sa_aa_agent]  # , eigen_agent_init_all, eigen_agent_w_prims]

    # Run.
    if isinstance(mdp, FourRoomMDP):
        run_agents_on_mdp(agents, mdp, instances=10, episodes=500, steps=50)
    else:
        run_agents_on_mdp(agents, mdp, instances=10, episodes=100, steps=10)
def make_mdp_distr(mdp_class="grid", grid_dim=7, horizon=0):
    '''
    Args:
        mdp_class (str): one of {"grid", "random"}
        horizon (int)

    Returns:
        (MDPDistribution)
    '''
    mdp_dist_dict = {}
    height, width = grid_dim, grid_dim

    # Define goal locations.

    # Corridor.
    corr_width = 20
    corr_goal_magnitude = random.randint(1, 5)
    corr_goal_cols = [i for i in range(1, corr_goal_magnitude)] + [j for j in range(corr_width - corr_goal_magnitude, corr_width + 1)]
    corr_goal_locs = list(itertools.product(corr_goal_cols, [1]))

    # Grid World.
    grid_world_rows, grid_world_cols = [i for i in range(width - 4, width)], [j for j in range(height - 4, height)]
    grid_goal_locs = list(itertools.product(grid_world_rows, grid_world_cols))

    # Hallway.
    hall_goal_locs = [(i, width) for i in range(1, height + 1)]

    # Four room.
    four_room_goal_locs = [(2, 2), (width, height), (width, 1), (1, height)]

    # Taxi.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    walls = []

    goal_loc_dict = {"four_room": four_room_goal_locs,
                     "hall": hall_goal_locs,
                     "grid": grid_goal_locs,
                     "corridor": corr_goal_locs}

    # MDP Probability.
    num_mdps = 10 if mdp_class not in goal_loc_dict.keys() else len(goal_loc_dict[mdp_class])
    mdp_prob = 1.0 / num_mdps

    for i in range(num_mdps):
        new_mdp = {"hall": GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=[goal_loc_dict["hall"][i % len(goal_loc_dict["hall"])]]),
                   "corridor": GridWorldMDP(width=20, height=1, init_loc=(10, 1), goal_locs=[goal_loc_dict["corridor"][i % len(goal_loc_dict["corridor"])]], is_goal_terminal=True),
                   "grid": GridWorldMDP(width=width, height=height, init_loc=(1, 1), goal_locs=[goal_loc_dict["grid"][i % len(goal_loc_dict["grid"])]], is_goal_terminal=True),
                   "four_room": FourRoomMDP(width=width, height=height, goal_locs=[goal_loc_dict["four_room"][i % len(goal_loc_dict["four_room"])]]),
                   # THESE GOALS ARE SPECIFIED IMPLICITLY:
                   "pblocks_grid": make_grid_world_from_file("pblocks_grid.txt", randomize=True),
                   "chain": ChainMDP(num_states=10, reset_val=random.choice([0, 0.01, 0.05, 0.1, 0.2, 0.5])),
                   "random": RandomMDP(num_states=40, num_rand_trans=random.randint(1, 10)),
                   "taxi": TaxiOOMDP(4, 4, slip_prob=0.0, agent=agent, walls=walls,
                                     passengers=[{"x": 2, "y": 2, "dest_x": random.randint(1, 4), "dest_y": random.randint(1, 4), "in_taxi": 0}])}[mdp_class]

        mdp_dist_dict[new_mdp] = mdp_prob

    return MDPDistribution(mdp_dist_dict, horizon=horizon)
            # Fragment: the tail of what appears to be a get_policy(alg, mdp, skill) helper; the enclosing loops over x, y are not shown here.
            s = torch.tensor((x, y)).float()
            action = torch.argmax(alg.action_distr(s, torch.tensor(skill)).logits).item()
            action_word = dict(enumerate(alg.env.action_map)).get(action, 'term')
            policy[x, y] = action_word
    return policy

plt.figure()

#%%
reset_seeds(0)

from simple_rl.tasks import FourRoomMDP
from notebooks.simple_rl_env import SimpleGymEnv

env = SimpleGymEnv(FourRoomMDP(12, 12, goal_locs=[(12, 12)]))
env.render()
s0 = torch.as_tensor(env.reset(), dtype=torch.float32)

ndim_s = len(env.observation_space)
n_actions = env.action_space.n
n_skills = 40
gamma = 0.99
max_steps_per_skill = 10
n_units = 32
lr = 1e-3

alg = VIC
# alg = DIAYN
alg = alg(env, ndim_s, n_actions, n_skills, gamma, max_steps_per_skill, n_units, lr)

start_policies = [get_policy(alg, env.mdp, s) for s in range(n_skills)]

#%%
def make_mdp_distr(mdp_class="grid", grid_dim=9, horizon=0, step_cost=0, gamma=0.99):
    '''
    Args:
        mdp_class (str): one of {"grid", "random"}
        horizon (int)
        step_cost (float)
        gamma (float)

    Returns:
        (MDPDistribution)
    '''
    mdp_dist_dict = {}
    height, width = grid_dim, grid_dim

    # Define goal locations.

    # Corridor.
    corr_width = 20
    corr_goal_magnitude = 1  # random.randint(1, 5)
    corr_goal_cols = [i for i in range(1, corr_goal_magnitude + 1)] + [j for j in range(corr_width - corr_goal_magnitude + 1, corr_width + 1)]
    corr_goal_locs = list(itertools.product(corr_goal_cols, [1]))

    # Grid World.
    tl_grid_world_rows, tl_grid_world_cols = [i for i in range(width - 4, width)], [j for j in range(height - 4, height)]
    tl_grid_goal_locs = list(itertools.product(tl_grid_world_rows, tl_grid_world_cols))
    tr_grid_world_rows, tr_grid_world_cols = [i for i in range(1, 4)], [j for j in range(height - 4, height)]
    tr_grid_goal_locs = list(itertools.product(tr_grid_world_rows, tr_grid_world_cols))
    grid_goal_locs = tl_grid_goal_locs + tr_grid_goal_locs

    # Hallway.
    hall_goal_locs = [(i, height) for i in range(1, 30)]

    # Four room.
    four_room_goal_locs = [(width, height), (width, 1), (1, height), (1, height - 2), (width - 2, height - 2)]  # , (width - 2, 1)]

    # Taxi.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    walls = []

    goal_loc_dict = {"four_room": four_room_goal_locs,
                     "hall": hall_goal_locs,
                     "grid": grid_goal_locs,
                     "corridor": corr_goal_locs,
                     }

    # MDP Probability.
    num_mdps = 10 if mdp_class not in goal_loc_dict.keys() else len(goal_loc_dict[mdp_class])
    if mdp_class == "octo":
        num_mdps = 12
    mdp_prob = 1.0 / num_mdps

    for i in range(num_mdps):
        new_mdp = {"hrooms": make_grid_world_from_file("hierarch_rooms.txt", num_goals=7, randomize=False),
                   "octo": make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False, goal_num=i),
                   "hall": GridWorldMDP(width=30, height=height, rand_init=False, goal_locs=goal_loc_dict["hall"], name="hallway", is_goal_terminal=True),
                   "corridor": GridWorldMDP(width=20, height=1, init_loc=(10, 1), goal_locs=[goal_loc_dict["corridor"][i % len(goal_loc_dict["corridor"])]], is_goal_terminal=True, name="corridor"),
                   "grid": GridWorldMDP(width=width, height=height, rand_init=True, goal_locs=[goal_loc_dict["grid"][i % len(goal_loc_dict["grid"])]], is_goal_terminal=True),
                   "four_room": FourRoomMDP(width=width, height=height, goal_locs=[goal_loc_dict["four_room"][i % len(goal_loc_dict["four_room"])]], is_goal_terminal=True),
                   # THESE GOALS ARE SPECIFIED IMPLICITLY:
                   "pblocks_grid": make_grid_world_from_file("pblocks_grid.txt", randomize=True, slip_prob=0.1),
                   "chain": ChainMDP(num_states=10, reset_val=random.choice([0, 0.01, 0.05, 0.1, 0.2, 0.5])),
                   "random": RandomMDP(num_states=40, num_rand_trans=random.randint(1, 10)),
                   "taxi": TaxiOOMDP(3, 4, slip_prob=0.0, agent=agent, walls=walls,
                                     passengers=[{"x": 2, "y": 1, "dest_x": random.choice([2, 3]), "dest_y": random.choice([2, 3]), "in_taxi": 0},
                                                 {"x": 1, "y": 2, "dest_x": random.choice([1, 2]), "dest_y": random.choice([1, 4]), "in_taxi": 0}])}[mdp_class]

        new_mdp.set_step_cost(step_cost)
        new_mdp.set_gamma(gamma)

        mdp_dist_dict[new_mdp] = mdp_prob

    return MDPDistribution(mdp_dist_dict, horizon=horizon)
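# Usage sketch (not part of the original source): run agents on tasks sampled from the
# distribution above. run_agents_lifelong, its keyword names (samples, episodes, steps),
# and MDPDistribution.get_actions() are assumed from simple_rl's API.
from simple_rl.agents import QLearningAgent, RandomAgent
from simple_rl.run_experiments import run_agents_lifelong

mdp_distr = make_mdp_distr(mdp_class="four_room", grid_dim=9)
actions = mdp_distr.get_actions()
agents = [QLearningAgent(actions=actions), RandomAgent(actions=actions)]
run_agents_lifelong(agents, mdp_distr, samples=5, episodes=100, steps=50)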
def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(5, 5, goal_locs=[(5, 5)], gamma=0.99, step_cost=0.01)
    # mdp = make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.5)
    rm_agent = RMaxAgent(mdp.get_actions())
    viz = parse_args()
    viz = "learning"

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy.
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
pass  # Left over from a truncated block in the original file.

import os
import sys

parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, parent_dir)

from simple_rl.agents import QLearningAgent, RandomAgent, RMaxAgent
from simple_rl.planning import ValueIteration
from simple_rl.tasks import GridWorldMDP
from simple_rl.run_experiments import run_agents_on_mdp

# # Setup MDP.
# mdp = GridWorldMDP(width=6, height=6, init_loc=(1, 1), goal_locs=[(6, 6)])
#
# # Setup Agents.
# ql_agent = QLearningAgent(actions=mdp.get_actions())
# rand_agent = RandomAgent(actions=mdp.get_actions())
# rmax_agent = RMaxAgent(actions=mdp.get_actions(), horizon=3, s_a_threshold=1)
#
# # Run experiment and make plot.
# run_agents_on_mdp([ql_agent, rand_agent, rmax_agent], mdp, instances=5, episodes=100, steps=40, reset_at_terminal=True,
#                   verbose=False)

from simple_rl.tasks import FourRoomMDP
from simple_rl.tasks.grid_world import grid_visualizer

four_room_mdp = FourRoomMDP(9, 9, goal_locs=[(9, 9), (5, 4)], gamma=0.95)

# Run experiment and make plot.
# four_room_mdp.visualize_value()
four_room_mdp.visualize_interaction()
# four_room_mdp.visualize_policy()