def main():
    # Command line args.
    task, rom = parse_args()

    # Setup the MDP.
    mdp = choose_mdp(task, rom)
    actions = mdp.get_actions()
    gamma = mdp.get_gamma()

    # Setup agents.
    from simple_rl.agents import RandomAgent, RMaxAgent, QLearnerAgent, LinearQLearnerAgent

    random_agent = RandomAgent(actions)
    rmax_agent = RMaxAgent(actions, gamma=gamma, horizon=4, s_a_threshold=2)
    qlearner_agent = QLearnerAgent(actions, gamma=gamma, explore="uniform")
    lqlearner_agent = LinearQLearnerAgent(actions, gamma=gamma, explore="uniform")
    agents = [qlearner_agent, random_agent]

    # Run Agents.
    if isinstance(mdp, MarkovGameMDP):
        # Markov Game.
        agents = {qlearner_agent.name: qlearner_agent, random_agent.name: random_agent}
        play_markov_game(agents, mdp, instances=100, episodes=1, steps=500)
    else:
        # Regular experiment.
        run_agents_on_mdp(agents, mdp, instances=50, episodes=1, steps=2000)
def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(5, 5, goal_locs=[(5, 5)], gamma=0.99, step_cost=0.01)
    # mdp = make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.5)
    rm_agent = RMaxAgent(mdp.get_actions())
    viz = parse_args()

    if viz == "value":
        # Visualize the value function.
        mdp.visualize_value()
    elif viz == "policy":
        # Visualize the policy from value iteration.
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
def main():
    # Grab experiment params.
    mdp = BadChainMDP(gamma=0.95, kappa=0.001)
    actions = mdp.get_actions()

    # =======================
    # == Make Abstractions ==
    # =======================
    sa_q_eps = get_sa(mdp, indic_func=indicator_funcs._q_eps_approx_indicator, epsilon=0.1)

    # RMax Agents.
    rmax_agent = RMaxAgent(actions)
    abstr_rmax_agent = AbstractionWrapper(RMaxAgent,
                                          state_abstr=sa_q_eps,
                                          agent_params={"actions": actions},
                                          name_ext="-$\\phi_{Q_\\epsilon^*}$")

    # Delayed Q Agents.
    del_q_agent = DelayedQAgent(actions)
    abstr_del_q_agent = AbstractionWrapper(DelayedQAgent,
                                           state_abstr=sa_q_eps,
                                           agent_params={"actions": actions},
                                           name_ext="-$\\phi_{Q_\\epsilon^*}$")

    run_agents_on_mdp([rmax_agent, abstr_rmax_agent, del_q_agent, abstr_del_q_agent],
                      mdp,
                      instances=50,
                      steps=250,
                      episodes=1)
def main():
    # Create the MDP from our own definition.
    mdp = tfeMDP()

    # Three different agents to compare against one another.
    rand_agent = RandomAgent(actions=mdp.get_actions())
    rmax_agent = RMaxAgent(actions=mdp.get_actions())
    agent = QLearningAgent(actions=mdp.get_actions())

    # Runs everything and generates the plots and statistics
    # describing how each agent did.
    run_agents_on_mdp([agent, rmax_agent, rand_agent], mdp,
                      instances=200, episodes=100, steps=1000)
def get_combo_experiment_agents(environment):
    '''
    Args:
        environment (simple_rl.MDPDistribution)

    Returns:
        (list)
    '''
    actions = environment.get_actions()
    gamma = environment.get_gamma()

    sa, aa = get_directed_option_sa_pair(environment,
                                         indic_func=ind_funcs._q_disc_approx_indicator,
                                         max_options=100)
    sa_qds_test = get_sa(environment, indic_func=ind_funcs._q_disc_approx_indicator, epsilon=0.05)
    sa_qs_test = get_sa(environment, indic_func=ind_funcs._q_eps_approx_indicator, epsilon=0.1)

    # QLearner.
    ql_agent = QLearningAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    rmax_agent = RMaxAgent(actions, gamma=gamma)

    # Combos.
    ql_sa_qds_agent = AbstractionWrapper(QLearningAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=sa_qds_test,
                                         name_ext="$\\phi_{Q_d^*}$")
    ql_sa_qs_agent = AbstractionWrapper(QLearningAgent,
                                        agent_params={"actions": actions},
                                        state_abstr=sa_qs_test,
                                        name_ext="$\\phi_{Q_\\epsilon^*}$")
    # sa_agent = AbstractionWrapper(QLearningAgent, actions, str(environment), state_abstr=sa, name_ext="sa")
    aa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"actions": actions},
                                  action_abstr=aa,
                                  name_ext="aa")
    sa_aa_agent = AbstractionWrapper(QLearningAgent,
                                     agent_params={"actions": actions},
                                     state_abstr=sa,
                                     action_abstr=aa,
                                     name_ext="$\\phi_{Q_d^*}+aa$")

    agents = [ql_agent, ql_sa_qds_agent, ql_sa_qs_agent, aa_agent, sa_aa_agent]

    return agents
def main():
    # ========================
    # === Make Environment ===
    # ========================
    mdp_class = "hrooms"
    environment = make_mdp.make_mdp_distr(mdp_class=mdp_class)
    actions = environment.get_actions()

    # ==========================
    # === Make SA, AA Stacks ===
    # ==========================
    # sa_stack, aa_stack = aa_stack_h.make_random_sa_diropt_aa_stack(environment, max_num_levels=3)
    sa_stack, aa_stack = hierarchy_helpers.make_hierarchy(environment, num_levels=3)

    # Debug.
    print("\n" + ("=" * 30))
    print("== Done making abstraction. ==")
    print("=" * 30 + "\n")
    sa_stack.print_state_space_sizes()
    print("Num Action Abstractions:", len(aa_stack.get_aa_list()))

    # ===================
    # === Make Agents ===
    # ===================
    baseline_agent = QLearningAgent(actions)
    rmax_agent = RMaxAgent(actions)
    rand_agent = RandomAgent(actions)
    l0_hierarch_agent = HierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=0, name_ext="-$l_0$")
    l1_hierarch_agent = HierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$l_1$")
    # l2_hierarch_agent = HierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=2, name_ext="-$l_2$")
    dynamic_hierarch_agent = DynamicHierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$d$")
    # dynamic_rmax_hierarch_agent = DynamicHierarchyAgent(RMaxAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$d$")

    print("\n" + ("=" * 26))
    print("== Running experiments. ==")
    print("=" * 26 + "\n")

    # ======================
    # === Run Experiment ===
    # ======================
    agents = [l1_hierarch_agent, dynamic_hierarch_agent, baseline_agent]
    run_agents_multi_task(agents,
                          environment,
                          task_samples=10,
                          steps=1500,
                          episodes=1,
                          reset_at_terminal=True)
def main():
    # Setup MDP.
    w = 6
    h = 6
    mdp = GridWorld(width=w, height=h, init_loc=(1, 1), goal_locs=[(6, 6)], slip_prob=.1)

    # Setup Agents.
    rand_agent = RandomAgent(actions=mdp.get_actions())
    ql_agent = QLearningAgent(actions=mdp.get_actions())

    # Compute the number of samples R-MAX needs to achieve epsilon-optimal
    # behavior with high probability (1 - delta).
    compute_n_samples = False
    if compute_n_samples:
        epsilon = .1
        delta = .05
        m_r = np.log(2. / delta) / (2. * epsilon**2)
        m_t = 2. * (np.log(2**(float(w * h)) - 2.) - np.log(delta)) / (epsilon**2)
        n_samples = int(max(m_r, m_t))
    else:
        n_samples = 30

    simple_rl_rmax_agent = RMaxAgent(actions=mdp.get_actions(), gamma=.9, horizon=3,
                                     s_a_threshold=n_samples, name='SimpleRL-R-MAX')
    rmax_agent = RMax(actions=mdp.get_actions(), gamma=.9, count_threshold=n_samples)

    # Run experiment and make plot.
    run_agents_on_mdp([rand_agent, ql_agent, rmax_agent, simple_rl_rmax_agent],
                      mdp,
                      instances=5,
                      episodes=100,
                      steps=20,
                      reset_at_terminal=True,
                      verbose=False)
def main(open_plot=True):
    # Setup MDP.
    args = parse_args()
    mdp = generate_MDP(args.width, args.height, args.i_loc, args.g_loc, args.l_loc,
                       args.gamma, args.Walls, args.slip)

    if args.visualize:
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        mdp.visualize_policy_values((lambda state: value_iter.policy(state)),
                                    (lambda state: value_iter.value_func[state]))
    else:
        custom_q = parse_custom_q_table(args.custom_q, args.default_q)

        agents = []
        for agent in args.agents:
            if agent == 'q_learning':
                agents.append(QLearningAgent(actions=mdp.get_actions()))
            elif agent == 'potential_q':
                agents.append(QLearningAgent(actions=mdp.get_actions(),
                                             custom_q_init=custom_q,
                                             name="Potential_Q"))
            elif agent == 'random':
                agents.append(RandomAgent(actions=mdp.get_actions()))
            elif agent == 'rmax':
                agents.append(RMaxAgent(mdp.get_actions()))

        # Run experiment and make plot.
        run_agents_on_mdp(agents, mdp, instances=1, episodes=100, steps=100,
                          open_plot=open_plot, verbose=True)
def main():
    # Command line args.
    task, rom = parse_args()

    # Setup the MDP.
    mdp = choose_mdp(task, rom)
    actions = mdp.get_actions()
    gamma = mdp.get_gamma()

    # Setup agents.
    random_agent = RandomAgent(actions)
    rmax_agent = RMaxAgent(actions, gamma=gamma)
    qlearner_agent = QLearnerAgent(actions, gamma=gamma)
    lin_approx_agent = LinearApproxQLearnerAgent(actions, gamma=gamma)
    grad_boost_agent = GradientBoostingAgent(actions, gamma=gamma, explore="softmax")

    # Choose agents.
    agents = [lin_approx_agent, random_agent]

    # Run experiments.
    run_agents_on_mdp(agents, mdp)
def main(eps=0.1, open_plot=True):
    mdp_class, is_goal_terminal, samples, alg = parse_args()

    # Setup multitask setting.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class=mdp_class)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print("Making and solving avg MDP...", end=" ")
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    ### Yuu
    transfer_fixed_agent = FixedPolicyAgent(avg_mdp_vi.policy, name="transferFixed")
    rand_agent = RandomAgent(actions, name="$\\pi^u$")

    opt_q_func = compute_optimistic_q_function(mdp_distr)
    avg_q_func = avg_mdp_vi.get_q_function()

    if alg == "q":
        pure_ql_agent = QLearnerAgent(actions, epsilon=eps, name="Q-0")
        qmax = 1.0 * (1 - 0.99)
        # qmax = 1.0
        pure_ql_agent_opt = QLearnerAgent(actions, epsilon=eps, default_q=qmax, name="Q-vmax")
        transfer_ql_agent_optq = QLearnerAgent(actions, epsilon=eps, name="Q-trans-max")
        transfer_ql_agent_optq.set_init_q_function(opt_q_func)
        transfer_ql_agent_avgq = QLearnerAgent(actions, epsilon=eps, name="Q-trans-avg")
        transfer_ql_agent_avgq.set_init_q_function(avg_q_func)

        agents = [pure_ql_agent, pure_ql_agent_opt, transfer_ql_agent_optq, transfer_ql_agent_avgq]
    elif alg == "rmax":
        pure_rmax_agent = RMaxAgent(actions, name="RMAX-vmax")
        updating_trans_rmax_agent = UpdatingRMaxAgent(actions, name="RMAX-updating_max")
        trans_rmax_agent = RMaxAgent(actions, name="RMAX-trans_max")
        trans_rmax_agent.set_init_q_function(opt_q_func)

        agents = [pure_rmax_agent, updating_trans_rmax_agent, trans_rmax_agent]
    elif alg == "delayed-q":
        pure_delayed_ql_agent = DelayedQLearnerAgent(actions, opt_q_func, name="DelayedQ-vmax")
        pure_delayed_ql_agent.set_vmax()
        updating_delayed_ql_agent = UpdatingDelayedQLearnerAgent(actions, name="DelayedQ-updating_max")
        trans_delayed_ql_agent = DelayedQLearnerAgent(actions, opt_q_func, name="DelayedQ-trans-max")

        agents = [pure_delayed_ql_agent, updating_delayed_ql_agent, trans_delayed_ql_agent]
    else:
        print("Unknown type of agents:", alg)
        print("(q, rmax, delayed-q)")
        assert False

    # Run task.
    # TODO: Function for Learning on each MDP
    run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=samples,
                          episodes=1,
                          steps=100,
                          reset_at_terminal=is_goal_terminal,
                          is_rec_disc_reward=False,
                          cumulative_plot=True,
                          open_plot=open_plot)
def get_exact_vs_approx_agents(environment, incl_opt=True):
    '''
    Args:
        environment (simple_rl.MDPDistribution)
        incl_opt (bool)

    Returns:
        (list)
    '''
    actions = environment.get_actions()
    gamma = environment.get_gamma()

    exact_qds_test = get_sa(environment, indic_func=ind_funcs._q_eps_approx_indicator, epsilon=0.0)
    approx_qds_test = get_sa(environment, indic_func=ind_funcs._q_eps_approx_indicator, epsilon=0.05)

    ql_agent = QLearningAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    ql_exact_agent = AbstractionWrapper(QLearningAgent,
                                        agent_params={"actions": actions},
                                        state_abstr=exact_qds_test,
                                        name_ext="-exact")
    ql_approx_agent = AbstractionWrapper(QLearningAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=approx_qds_test,
                                         name_ext="-approx")
    ql_agents = [ql_agent, ql_exact_agent, ql_approx_agent]

    dql_agent = DoubleQAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    dql_exact_agent = AbstractionWrapper(DoubleQAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=exact_qds_test,
                                         name_ext="-exact")
    dql_approx_agent = AbstractionWrapper(DoubleQAgent,
                                          agent_params={"actions": actions},
                                          state_abstr=approx_qds_test,
                                          name_ext="-approx")
    dql_agents = [dql_agent, dql_exact_agent, dql_approx_agent]

    rm_agent = RMaxAgent(actions, gamma=gamma)
    rm_exact_agent = AbstractionWrapper(RMaxAgent,
                                        agent_params={"actions": actions},
                                        state_abstr=exact_qds_test,
                                        name_ext="-exact")
    rm_approx_agent = AbstractionWrapper(RMaxAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=approx_qds_test,
                                         name_ext="-approx")
    rm_agents = [rm_agent, rm_exact_agent, rm_approx_agent]

    if incl_opt:
        vi = ValueIteration(environment)
        vi.run_vi()
        opt_agent = FixedPolicyAgent(vi.policy, name="$\\pi^*$")

        sa_vi = AbstractValueIteration(environment,
                                       sample_rate=50,
                                       max_iterations=3000,
                                       delta=0.0001,
                                       state_abstr=approx_qds_test,
                                       action_abstr=ActionAbstraction(options=[], prim_actions=environment.get_actions()))
        sa_vi.run_vi()
        approx_opt_agent = FixedPolicyAgent(sa_vi.policy, name="$\\pi_\\phi^*$")

        dql_agents += [opt_agent, approx_opt_agent]

    return ql_agents
from simple_rl.agents import QLearningAgent, RandomAgent, RMaxAgent
from simple_rl.tasks import GridWorldMDP
from simple_rl.run_experiments import run_agents_on_mdp

# Setup MDP.
mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)],
                   lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)], slip_prob=0.05)

# Setup Agents.
ql_agent = QLearningAgent(actions=mdp.get_actions())
rmax_agent = RMaxAgent(actions=mdp.get_actions())
rand_agent = RandomAgent(actions=mdp.get_actions())

# Run experiment and make plot.
run_agents_on_mdp([ql_agent, rmax_agent, rand_agent], mdp, instances=5, episodes=50, steps=10)
def main():
    # Setup environment.
    mdp_class, agent_type, samples = parse_args()
    is_goal_terminal = False
    mdp_distr = make_mdp_distr(mdp_class=mdp_class, is_goal_terminal=is_goal_terminal)
    mdp_distr.set_gamma(0.99)
    actions = mdp_distr.get_actions()

    # Compute priors.

    # Stochastic mixture.
    mdp_distr_copy = copy.deepcopy(mdp_distr)
    opt_stoch_policy = ape.compute_optimal_stoch_policy(mdp_distr_copy)

    # Avg MDP.
    avg_mdp = ape.compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    # Make agents.

    # Q-learning.
    ql_agent = QLearnerAgent(actions)
    shaped_ql_agent_prior = ShapedQAgent(shaping_policy=opt_stoch_policy, actions=actions, name="Prior-QLearning")
    shaped_ql_agent_avgmdp = ShapedQAgent(shaping_policy=avg_mdp_vi.policy, actions=actions, name="AvgMDP-QLearning")

    # RMax.
    rmax_agent = RMaxAgent(actions)
    shaped_rmax_agent_prior = ShapedRMaxAgent(shaping_policy=opt_stoch_policy,
                                              state_space=avg_mdp_vi.get_states(),
                                              actions=actions,
                                              name="Prior-RMax")
    shaped_rmax_agent_avgmdp = ShapedRMaxAgent(shaping_policy=avg_mdp_vi.policy,
                                               state_space=avg_mdp_vi.get_states(),
                                               actions=actions,
                                               name="AvgMDP-RMax")
    prune_rmax_agent = PruneRMaxAgent(mdp_distr=mdp_distr)

    if agent_type == "rmax":
        agents = [rmax_agent, shaped_rmax_agent_prior, shaped_rmax_agent_avgmdp, prune_rmax_agent]
    else:
        agents = [ql_agent, shaped_ql_agent_prior, shaped_ql_agent_avgmdp]

    # Run task.
    run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=samples,
                          episodes=1,
                          steps=200,
                          is_rec_disc_reward=False,
                          verbose=True)