def main():
    # Make MDP Distribution.
    mdp_class = "four_room"
    environment = make_mdp.make_mdp_distr(mdp_class=mdp_class, grid_dim=10)

    sa_stack = make_random_sa_stack(environment, max_num_levels=5)
    sa_stack.print_state_space_sizes()

def main():
    # ========================
    # === Make Environment ===
    # ========================
    mdp_class = "four_room"
    environment = make_mdp.make_mdp_distr(mdp_class=mdp_class, grid_dim=10)
    actions = environment.get_actions()

    # ==========================
    # === Make SA, AA Stacks ===
    # ==========================
    # sa_stack, aa_stack = aa_stack_h.make_random_sa_diropt_aa_stack(environment, max_num_levels=3)
    sa_stack, aa_stack = hierarchy_helpers.make_hierarchy(environment, num_levels=3)

    mdp = environment.sample()

    # Compare hierarchical value iteration against flat value iteration on the sampled MDP.
    HVI = HierarchicalValueIteration(mdp, sa_stack, aa_stack)
    VI = ValueIteration(mdp)

    h_iters, h_val = HVI.run_vi()
    iters, val = VI.run_vi()

    print("H:", h_iters, h_val)
    print("V:", iters, val)

def main(open_plot=True):
    # Make MDP distribution, agents.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room")
    ql_agent = QLearnerAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Run experiment and make plot.
    run_agents_multi_task([ql_agent, rand_agent], mdp_distr, task_samples=50, episodes=1, steps=1500, reset_at_terminal=True, open_plot=open_plot)

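# One way to run an example like this as a standalone script; the trailing
# "no_plot" argument convention is an assumption about this repo's CLI style,
# not a confirmed interface.
if __name__ == "__main__":
    import sys
    main(open_plot=not sys.argv[-1] == "no_plot")
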
def main():
    # ======================
    # == Make Environment ==
    # ======================
    mdp_class = "four_room"
    environment = make_mdp.make_mdp_distr(mdp_class=mdp_class, grid_dim=7)
    actions = environment.get_actions()

    # ====================
    # == Make Hierarchy ==
    # ====================
    sa_stack, aa_stack = make_hierarchy(environment, num_levels=3)

def main(open_plot=True):
    # Setup MDP, Agents.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room")
    ql_agent = QLearningAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Make goal-based option agent.
    goal_based_options = aa_helpers.make_goal_based_options(mdp_distr)
    goal_based_aa = ActionAbstraction(prim_actions=mdp_distr.get_actions(), options=goal_based_options)
    option_agent = AbstractionWrapper(QLearningAgent, agent_params={"actions": mdp_distr.get_actions()}, action_abstr=goal_based_aa)

    # Run experiment and make plot.
    run_agents_lifelong([ql_agent, rand_agent, option_agent], mdp_distr, samples=10, episodes=100, steps=150, open_plot=open_plot)

def main():
    # ========================
    # === Make Environment ===
    # ========================
    mdp_class = "four_room"
    gamma = 1.0
    environment = make_mdp.make_mdp_distr(mdp_class=mdp_class, step_cost=0.01, grid_dim=15, gamma=gamma)
    actions = environment.get_actions()

    # ==========================
    # === Make SA, AA Stacks ===
    # ==========================
    sa_stack, aa_stack = hierarchy_helpers.make_hierarchy(environment, num_levels=2)

    # Debug.
    print("\n" + ("=" * 30) + "\n== Done making abstraction. ==\n" + ("=" * 30) + "\n")
    sa_stack.print_state_space_sizes()
    aa_stack.print_action_spaces_sizes()

    # ===================
    # === Make Agents ===
    # ===================
    # baseline_agent = QLearnerAgent(actions)
    agent_class = QLearnerAgent
    baseline_agent = agent_class(actions, gamma=gamma)
    rand_agent = RandomAgent(actions)
    # hierarch_r_max = HRMaxAgent(actions, sa_stack=sa_stack, aa_stack=aa_stack)
    l0_hierarch_agent = HierarchyAgent(agent_class, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=0, name_ext="-$l_0$")
    l1_hierarch_agent = HierarchyAgent(agent_class, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$l_1$")
    # l2_hierarch_agent = HierarchyAgent(agent_class, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=2, name_ext="-$l_2$")
    dynamic_hierarch_agent = DynamicHierarchyAgent(agent_class, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$d$")
    # dynamic_rmax_hierarch_agent = DynamicHierarchyAgent(RMaxAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$d$")

    print("\n" + ("=" * 26))
    print("== Running experiments. ==")
    print("=" * 26 + "\n")

    # ======================
    # === Run Experiment ===
    # ======================
    agents = [dynamic_hierarch_agent, baseline_agent]
    run_agents_multi_task(agents, environment, task_samples=10, steps=20000, episodes=1, reset_at_terminal=True)

def main():
    # ========================
    # === Make Environment ===
    # ========================
    mdp_class = "hrooms"
    environment = make_mdp.make_mdp_distr(mdp_class=mdp_class)
    actions = environment.get_actions()

    # ==========================
    # === Make SA, AA Stacks ===
    # ==========================
    # sa_stack, aa_stack = aa_stack_h.make_random_sa_diropt_aa_stack(environment, max_num_levels=3)
    sa_stack, aa_stack = hierarchy_helpers.make_hierarchy(environment, num_levels=3)

    # Debug.
    print("\n" + ("=" * 30))
    print("== Done making abstraction. ==")
    print("=" * 30 + "\n")
    sa_stack.print_state_space_sizes()
    print("Num Action Abstractions:", len(aa_stack.get_aa_list()))

    # ===================
    # === Make Agents ===
    # ===================
    baseline_agent = QLearningAgent(actions)
    rmax_agent = RMaxAgent(actions)
    rand_agent = RandomAgent(actions)
    l0_hierarch_agent = HierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=0, name_ext="-$l_0$")
    l1_hierarch_agent = HierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$l_1$")
    # l2_hierarch_agent = HierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=2, name_ext="-$l_2$")
    dynamic_hierarch_agent = DynamicHierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$d$")
    # dynamic_rmax_hierarch_agent = DynamicHierarchyAgent(RMaxAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$d$")

    print("\n" + ("=" * 26))
    print("== Running experiments. ==")
    print("=" * 26 + "\n")

    # ======================
    # === Run Experiment ===
    # ======================
    agents = [l1_hierarch_agent, dynamic_hierarch_agent, baseline_agent]
    run_agents_multi_task(agents, environment, task_samples=10, steps=1500, episodes=1, reset_at_terminal=True)

def main():
    # MDP Setting.
    lifelong = True
    mdp_class = "four_room"
    grid_dim = 11

    # Make MDP.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class=mdp_class, grid_dim=grid_dim)
    actions = mdp_distr.get_actions()
    experiment_type = "aa"

    goal_based_options = aa_helpers.make_goal_based_options(mdp_distr)
    goal_based_aa = ActionAbstraction(prim_actions=actions, options=goal_based_options)

    # Visualize Action Abstractions.
    visualize_options_grid(mdp_distr, goal_based_aa)

    input("Press any key to quit ")
    quit()

def main(open_plot=True):
    # Setup MDP, Agents.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room")
    ql_agent = QLearningAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Make goal-based option agent.
    goal_based_options = aa_helpers.make_goal_based_options(mdp_distr)
    goal_based_aa = ActionAbstraction(prim_actions=mdp_distr.get_actions(), options=goal_based_options)
    option_agent = AbstractionWrapper(QLearningAgent, actions=mdp_distr.get_actions(), action_abstr=goal_based_aa)

    # Run experiment and make plot.
    run_agents_lifelong([ql_agent, rand_agent, option_agent], mdp_distr, samples=10, episodes=100, steps=150, open_plot=open_plot)

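# Note: this copy of the experiment passes the sub-agent's actions directly
# (actions=...), while the earlier copy wraps them in agent_params={"actions": ...};
# the two appear to target different versions of AbstractionWrapper's signature.
# With the agent_params-style API, the wrapper call would read:
#
#   option_agent = AbstractionWrapper(QLearningAgent,
#                                     agent_params={"actions": mdp_distr.get_actions()},
#                                     action_abstr=goal_based_aa)
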
def main(eps=0.1, open_plot=True):
    mdp_class, is_goal_terminal, samples, alg = parse_args()

    # Setup multitask setting.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class=mdp_class)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print("Making and solving avg MDP...", end=" ")
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    ### Yuu
    transfer_fixed_agent = FixedPolicyAgent(avg_mdp_vi.policy, name="transferFixed")
    rand_agent = RandomAgent(actions, name=r"$\pi^u$")

    opt_q_func = compute_optimistic_q_function(mdp_distr)
    avg_q_func = avg_mdp_vi.get_q_function()

    if alg == "q":
        pure_ql_agent = QLearnerAgent(actions, epsilon=eps, name="Q-0")
        qmax = 1.0 * (1 - 0.99)
        # qmax = 1.0
        pure_ql_agent_opt = QLearnerAgent(actions, epsilon=eps, default_q=qmax, name="Q-vmax")
        transfer_ql_agent_optq = QLearnerAgent(actions, epsilon=eps, name="Q-trans-max")
        transfer_ql_agent_optq.set_init_q_function(opt_q_func)
        transfer_ql_agent_avgq = QLearnerAgent(actions, epsilon=eps, name="Q-trans-avg")
        transfer_ql_agent_avgq.set_init_q_function(avg_q_func)

        agents = [pure_ql_agent, pure_ql_agent_opt, transfer_ql_agent_optq, transfer_ql_agent_avgq]
    elif alg == "rmax":
        pure_rmax_agent = RMaxAgent(actions, name="RMAX-vmax")
        updating_trans_rmax_agent = UpdatingRMaxAgent(actions, name="RMAX-updating_max")
        trans_rmax_agent = RMaxAgent(actions, name="RMAX-trans_max")
        trans_rmax_agent.set_init_q_function(opt_q_func)

        agents = [pure_rmax_agent, updating_trans_rmax_agent, trans_rmax_agent]
    elif alg == "delayed-q":
        pure_delayed_ql_agent = DelayedQLearnerAgent(actions, opt_q_func, name="DelayedQ-vmax")
        pure_delayed_ql_agent.set_vmax()
        updating_delayed_ql_agent = UpdatingDelayedQLearnerAgent(actions, name="DelayedQ-updating_max")
        trans_delayed_ql_agent = DelayedQLearnerAgent(actions, opt_q_func, name="DelayedQ-trans-max")

        agents = [pure_delayed_ql_agent, updating_delayed_ql_agent, trans_delayed_ql_agent]
    else:
        raise ValueError("Unknown type of agent: " + str(alg) + " (expected one of: q, rmax, delayed-q).")

    # Run task.
    # TODO: Function for Learning on each MDP
    run_agents_multi_task(agents, mdp_distr, task_samples=samples, episodes=1, steps=100, reset_at_terminal=is_goal_terminal, is_rec_disc_reward=False, cumulative_plot=True, open_plot=open_plot)

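# Hypothetical sketch of the parse_args() this script expects; every flag name
# and default below is an illustrative assumption, not the repo's actual CLI.
def parse_args():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--mdp_class", type=str, default="four_room")
    parser.add_argument("--goal_terminal", action="store_true")
    parser.add_argument("--samples", type=int, default=50)
    parser.add_argument("--alg", type=str, default="q", choices=["q", "rmax", "delayed-q"])
    args = parser.parse_args()
    return args.mdp_class, args.goal_terminal, args.samples, args.alg
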
def main():
    # Grab experiment params.
    mdp_class, task_samples, episodes, steps, grid_dim, x_axis_num_options, agent_class_str, max_options, exp_type = parse_args()
    gamma = 0.9

    # ========================
    # === Make Environment ===
    # ========================
    multi_task = True
    max_option_steps = 50 if x_axis_num_options else 0
    environment = make_mdp.make_mdp_distr(mdp_class=mdp_class, grid_dim=grid_dim) if multi_task else make_mdp.make_mdp(mdp_class=mdp_class)
    actions = environment.get_actions()
    environment.set_gamma(gamma)

    # Indicator functions.
    v_indic = ind_funcs._v_approx_indicator
    q_indic = ind_funcs._q_eps_approx_indicator
    v_disc_indic = ind_funcs._v_disc_approx_indicator
    rand_indic = ind_funcs._random

    # =========================
    # === Make Abstractions ===
    # =========================
    # Directed Variants.
    v_directed_sa, v_directed_aa = get_abstractions(environment, v_disc_indic, directed=True, max_options=max_options)
    # v_directed_sa, v_directed_aa = get_abstractions(environment, v_indic, directed=True, max_options=max_options)

    # Identity action abstraction.
    identity_sa, identity_aa = get_sa(environment, default=True), get_aa(environment, default=True)

    if exp_type == "core":
        # Core only abstraction types.
        q_directed_sa, q_directed_aa = get_abstractions(environment, q_indic, directed=True, max_options=max_options)
        rand_directed_sa, rand_directed_aa = get_abstractions(environment, rand_indic, directed=True, max_options=max_options)
        pblocks_sa, pblocks_aa = get_sa(environment, default=True), action_abs.aa_baselines.get_policy_blocks_aa(environment, incl_prim_actions=True, num_options=max_options)

    # ===================
    # === Make Agents ===
    # ===================
    # Base Agents.
    agent_class = QLearningAgent if agent_class_str == "ql" else RMaxAgent
    rand_agent = RandomAgent(actions)
    baseline_agent = agent_class(actions, gamma=gamma)

    if mdp_class == "pblocks":
        baseline_agent.epsilon = 0.01

    # Abstraction Extensions.
    agents = []
    vabs_agent_directed = AbstractionWrapper(agent_class, actions, str(environment), max_option_steps=max_option_steps, state_abstr=v_directed_sa, action_abstr=v_directed_aa, name_ext="v-sa+aa")

    if exp_type == "core":
        # Core only agents.
        qabs_agent_directed = AbstractionWrapper(agent_class, actions, str(environment), max_option_steps=max_option_steps, state_abstr=q_directed_sa, action_abstr=q_directed_aa, name_ext="q-sa+aa")
        rabs_agent_directed = AbstractionWrapper(agent_class, actions, str(environment), max_option_steps=max_option_steps, state_abstr=rand_directed_sa, action_abstr=rand_directed_aa, name_ext="rand-sa+aa")
        pblocks_agent = AbstractionWrapper(agent_class, actions, str(environment), max_option_steps=max_option_steps, state_abstr=pblocks_sa, action_abstr=pblocks_aa, name_ext="pblocks")
        agents = [vabs_agent_directed, qabs_agent_directed, rabs_agent_directed, pblocks_agent, baseline_agent]
    elif exp_type == "combo":
        # Combo only agents.
        aa_agent = AbstractionWrapper(agent_class, actions, str(environment), max_option_steps=max_option_steps, state_abstr=identity_sa, action_abstr=v_directed_aa, name_ext="aa")
        sa_agent = AbstractionWrapper(agent_class, actions, str(environment), max_option_steps=max_option_steps, state_abstr=v_directed_sa, action_abstr=identity_aa, name_ext="sa")
        agents = [vabs_agent_directed, sa_agent, aa_agent, baseline_agent]

    # Run experiments.
    if multi_task:
        steps = 999999 if x_axis_num_options else steps
        run_agents_multi_task(agents, environment, task_samples=task_samples, steps=steps, episodes=episodes, reset_at_terminal=True)
    else:
        run_agents_on_mdp(agents, environment, instances=20, episodes=30, reset_at_terminal=True)

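# Hypothetical sketch of the nine-value parse_args() the function above unpacks;
# every flag name and default below is an assumption for illustration only.
def parse_args():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--mdp_class", type=str, default="four_room")
    parser.add_argument("--task_samples", type=int, default=10)
    parser.add_argument("--episodes", type=int, default=1)
    parser.add_argument("--steps", type=int, default=1500)
    parser.add_argument("--grid_dim", type=int, default=11)
    parser.add_argument("--x_axis_num_options", action="store_true")
    parser.add_argument("--agent", type=str, default="ql", choices=["ql", "rmax"])
    parser.add_argument("--max_options", type=int, default=50)
    parser.add_argument("--exp_type", type=str, default="core", choices=["core", "combo"])
    args = parser.parse_args()
    return (args.mdp_class, args.task_samples, args.episodes, args.steps, args.grid_dim,
            args.x_axis_num_options, args.agent, args.max_options, args.exp_type)
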
def main():
    # Make MDP Distribution.
    mdp_class = "four_room"
    environment = make_mdp.make_mdp_distr(mdp_class=mdp_class, grid_dim=10)

    make_random_sa_diropt_aa_stack(environment, max_num_levels=3)

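# The examples above assume module-level imports roughly along these lines; the
# paths are a best guess at simple_rl's layout plus this repo's local helpers
# (make_mdp, hierarchy_helpers, aa_helpers, ind_funcs, ...), not a confirmed list.
import sys

from simple_rl.agents import FixedPolicyAgent, QLearningAgent, RMaxAgent, RandomAgent
from simple_rl.planning import ValueIteration
from simple_rl.run_experiments import run_agents_lifelong, run_agents_multi_task, run_agents_on_mdp

import make_mdp  # local helper that builds single MDPs and MDP distributions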