Example #1
# Imports for this example; parse_args and choose_mdp are helpers assumed to
# be defined elsewhere in the same script.
from simple_rl.agents import RandomAgent, RMaxAgent, QLearnerAgent, LinearQLearnerAgent
from simple_rl.mdp.markov_game.MarkovGameMDPClass import MarkovGameMDP
from simple_rl.run_experiments import run_agents_on_mdp, play_markov_game

def main():
    # Command line args.
    task, rom = parse_args()

    # Setup the MDP.
    mdp = choose_mdp(task, rom)
    actions = mdp.get_actions()
    gamma = mdp.get_gamma()

    # Setup agents.

    random_agent = RandomAgent(actions)
    rmax_agent = RMaxAgent(actions, gamma=gamma, horizon=4, s_a_threshold=2)
    qlearner_agent = QLearnerAgent(actions, gamma=gamma, explore="uniform")
    lqlearner_agent = LinearQLearnerAgent(actions,
                                          gamma=gamma,
                                          explore="uniform")
    agents = [qlearner_agent, random_agent]

    # Run Agents.
    if isinstance(mdp, MarkovGameMDP):
        # Markov Game.
        agents = {
            qlearner_agent.name: qlearner_agent,
            random_agent.name: random_agent
        }
        play_markov_game(agents, mdp, instances=100, episodes=1, steps=500)
    else:
        # Regular experiment.
        run_agents_on_mdp(agents, mdp, instances=50, episodes=1, steps=2000)
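Example #1 relies on two helpers, parse_args and choose_mdp, that are not shown. A minimal sketch of what they might look like (the argparse flags and task names below are assumptions for illustration, not simple_rl API):

import argparse

from simple_rl.tasks import GridWorldMDP, TaxiOOMDP

def parse_args():
    # Hypothetical flags: which task to run and, for Atari-style tasks, a ROM.
    parser = argparse.ArgumentParser()
    parser.add_argument("-task", type=str, default="grid")
    parser.add_argument("-rom", type=str, default="")
    args = parser.parse_args()
    return args.task, args.rom

def choose_mdp(task, rom):
    # Hypothetical dispatch from task name to an MDP instance.
    if task == "grid":
        return GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)])
    elif task == "taxi":
        agent = {"x": 1, "y": 1, "has_passenger": 0}
        passengers = [{"x": 3, "y": 2, "dest_x": 2, "dest_y": 3, "in_taxi": 0}]
        return TaxiOOMDP(width=4, height=4, agent=agent, passengers=passengers)
    raise ValueError("Unknown task: " + task)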
Example #2
# Imports; parse_args is assumed to be defined elsewhere in the script.
from simple_rl.agents import QLearningAgent, RMaxAgent
from simple_rl.tasks import FourRoomMDP
from simple_rl.planning import ValueIteration
from simple_rl.run_experiments import run_single_agent_on_mdp

def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(5, 5, goal_locs=[(5, 5)], gamma=0.99, step_cost=0.01)
    # mdp = make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.5) 
    rm_agent = RMaxAgent(mdp.get_actions())
    viz = parse_args()
    # viz = "learning"  # Uncomment to hardcode the visualization mode.

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
Example #3
def main():

    # Grab experiment params.
    mdp = BadChainMDP(gamma=0.95, kappa=0.001)
    actions = mdp.get_actions()

    # =======================
    # == Make Abstractions ==
    # =======================
    sa_q_eps = get_sa(mdp,
                      indic_func=indicator_funcs._q_eps_approx_indicator,
                      epsilon=0.1)
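    # Q_eps^* abstraction: ground states are merged when their optimal
    # Q-values agree to within epsilon for every action.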

    # RMax Agents.
    rmax_agent = RMaxAgent(actions)
    abstr_rmax_agent = AbstractionWrapper(RMaxAgent,
                                          state_abstr=sa_q_eps,
                                          agent_params={"actions": actions},
                                          name_ext="-$\\phi_{Q_\\epsilon^*}$")

    # Delayed Q Agents.
    del_q_agent = DelayedQAgent(actions)
    abstr_del_q_agent = AbstractionWrapper(DelayedQAgent,
                                           state_abstr=sa_q_eps,
                                           agent_params={"actions": actions},
                                           name_ext="-$\\phi_{Q_\\epsilon^*}$")

    run_agents_on_mdp(
        [rmax_agent, abstr_rmax_agent, del_q_agent, abstr_del_q_agent],
        mdp,
        instances=50,
        steps=250,
        episodes=1)
Example #4
def main():
    # Create the MDP from our own definition.
    mdp = tfeMDP()

    # Three different agents, to compare how each does against the others.
    rand_agent = RandomAgent(actions=mdp.get_actions())
    rmax_agent = RMaxAgent(actions=mdp.get_actions())
    agent = QLearningAgent(actions=mdp.get_actions())

    # run_agents_on_mdp runs everything and generates the plots and
    # statistics describing how each agent did.
    run_agents_on_mdp([agent, rmax_agent, rand_agent], mdp,
                      instances=200, episodes=100, steps=1000)
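tfeMDP is user-defined and not shown. For reference, a minimal sketch of what defining a custom simple_rl MDP involves (the two-state chain is purely illustrative, and the reward-function signature is assumed from recent simple_rl versions):

from simple_rl.mdp import MDP, State

class TwoStateMDP(MDP):
    ''' Illustrative custom MDP: two states, with reward for stepping right. '''
    ACTIONS = ["left", "right"]

    def __init__(self):
        MDP.__init__(self, TwoStateMDP.ACTIONS, self._transition_func,
                     self._reward_func, init_state=State(data=0), gamma=0.95)

    def _transition_func(self, state, action):
        # "right" moves to the rewarding state, anything else moves back.
        return State(data=1) if action == "right" else State(data=0)

    def _reward_func(self, state, action, next_state):
        return 1.0 if next_state.data == 1 else 0.0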
Example #5
def get_combo_experiment_agents(environment):
    '''
    Args:
        environment (simple_rl.MDPDistribution)

    Returns:
        (list)
    '''
    actions = environment.get_actions()
    gamma = environment.get_gamma()

    sa, aa = get_directed_option_sa_pair(
        environment,
        indic_func=ind_funcs._q_disc_approx_indicator,
        max_options=100)
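    # Two indicator functions are compared below: _q_disc_approx_indicator
    # groups states whose Q-values fall in the same discretization bucket
    # ($\phi_{Q_d^*}$), while _q_eps_approx_indicator merges states whose
    # Q-values agree to within epsilon ($\phi_{Q_\epsilon^*}$).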
    sa_qds_test = get_sa(environment,
                         indic_func=ind_funcs._q_disc_approx_indicator,
                         epsilon=0.05)
    sa_qs_test = get_sa(environment,
                        indic_func=ind_funcs._q_eps_approx_indicator,
                        epsilon=0.1)

    # Baseline learners.
    ql_agent = QLearningAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    rmax_agent = RMaxAgent(actions, gamma=gamma)

    # Combos.
    ql_sa_qds_agent = AbstractionWrapper(QLearningAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=sa_qds_test,
                                         name_ext=r"$\phi_{Q_d^*}$")
    ql_sa_qs_agent = AbstractionWrapper(QLearningAgent,
                                        agent_params={"actions": actions},
                                        state_abstr=sa_qs_test,
                                        name_ext=r"$\phi_{Q_\epsilon^*}$")

    # sa_agent = AbstractionWrapper(QLearningAgent, actions, str(environment), state_abstr=sa, name_ext="sa")
    aa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"actions": actions},
                                  action_abstr=aa,
                                  name_ext="aa")
    sa_aa_agent = AbstractionWrapper(QLearningAgent,
                                     agent_params={"actions": actions},
                                     state_abstr=sa,
                                     action_abstr=aa,
                                     name_ext=r"$\phi_{Q_d^*}+aa$")

    agents = [ql_agent, ql_sa_qds_agent, ql_sa_qs_agent, aa_agent, sa_aa_agent]

    return agents
Example #6
def main():

    # ========================
    # === Make Environment ===
    # ========================
    mdp_class = "hrooms"
    environment = make_mdp.make_mdp_distr(mdp_class=mdp_class)
    actions = environment.get_actions()

    # ==========================
    # === Make SA, AA Stacks ===
    # ==========================
    # sa_stack, aa_stack = aa_stack_h.make_random_sa_diropt_aa_stack(environment, max_num_levels=3)
    sa_stack, aa_stack = hierarchy_helpers.make_hierarchy(environment, num_levels=3)

    # Debug.
    print("\n" + ("=" * 30))
    print("== Done making abstraction. ==")
    print("=" * 30 + "\n")
    sa_stack.print_state_space_sizes()
    print("Num Action Abstractions:", len(aa_stack.get_aa_list()))

    # ===================
    # === Make Agents ===
    # ===================
    baseline_agent = QLearningAgent(actions)
    rmax_agent = RMaxAgent(actions)
    rand_agent = RandomAgent(actions)
    l0_hierarch_agent = HierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=0, name_ext="-$l_0$")
    l1_hierarch_agent = HierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$l_1$")
    # l2_hierarch_agent = HierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=2, name_ext="-$l_2$")
    dynamic_hierarch_agent = DynamicHierarchyAgent(QLearningAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$d$")
    # dynamic_rmax_hierarch_agent = DynamicHierarchyAgent(RMaxAgent, sa_stack=sa_stack, aa_stack=aa_stack, cur_level=1, name_ext="-$d$")

    print("\n" + ("=" * 26))
    print("== Running experiments. ==")
    print("=" * 26 + "\n")

    # ======================
    # === Run Experiment ===
    # ======================
    agents = [l1_hierarch_agent, dynamic_hierarch_agent, baseline_agent]
    run_agents_multi_task(agents, environment, task_samples=10, steps=1500, episodes=1, reset_at_terminal=True)
Example #7
def main():
    # Setup MDP.
    w = 6
    h = 6
    mdp = GridWorld(width=w,
                    height=h,
                    init_loc=(1, 1),
                    goal_locs=[(6, 6)],
                    slip_prob=.1)

    # Setup Agents.
    rand_agent = RandomAgent(actions=mdp.get_actions())
    ql_agent = QLearningAgent(actions=mdp.get_actions())

    # Compute number of samples for R-MAX to achieve epsilon optimal behavior with high probability (1 - delta)
    compute_n_samples = False
    if compute_n_samples:
        epsilon = .1
        delta = .05
        m_r = np.log(2. / delta) / (2. * epsilon**2)
        m_t = 2. * (np.log(2**(float(w * h)) - 2.) - np.log(delta)) / (epsilon**2)
        n_samples = int(max(m_r, m_t))
    else:
        n_samples = 30
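    # Sketch of the reasoning behind the bounds above: m_r is a Hoeffding
    # bound on the reward samples needed per state-action pair, and m_t is
    # the L1 concentration bound for empirical transition distributions,
    # where the 2**(w*h) - 2 term counts the events union-bounded over the
    # w*h possible next states. Constants vary across R-MAX analyses.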

    simple_rl_rmax_agent = RMaxAgent(actions=mdp.get_actions(),
                                     gamma=.9,
                                     horizon=3,
                                     s_a_threshold=n_samples,
                                     name='SimpleRL-R-MAX')
    rmax_agent = RMax(actions=mdp.get_actions(),
                      gamma=.9,
                      count_threshold=n_samples)

    # Run experiment and make plot.
    run_agents_on_mdp([rand_agent, ql_agent, rmax_agent, simple_rl_rmax_agent],
                      mdp,
                      instances=5,
                      episodes=100,
                      steps=20,
                      reset_at_terminal=True,
                      verbose=False)
Example #8
def main(open_plot=True):

    # Setup MDP.

    args = parse_args()
    mdp = generate_MDP(args.width, args.height, args.i_loc, args.g_loc,
                       args.l_loc, args.gamma, args.Walls, args.slip)

    if args.visualize:
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        mdp.visualize_policy_values(
            (lambda state: value_iter.policy(state)),
            (lambda state: value_iter.value_func[state]))

    else:
        custom_q = parse_custom_q_table(args.custom_q, args.default_q)

        agents = []
        for agent in args.agents:
            if agent == 'q_learning':
                agents.append(QLearningAgent(actions=mdp.get_actions()))
            elif agent == 'potential_q':
                agents.append(
                    QLearningAgent(actions=mdp.get_actions(),
                                   custom_q_init=custom_q,
                                   name="Potential_Q"))
            elif agent == 'random':
                agents.append(RandomAgent(actions=mdp.get_actions()))
            elif agent == 'rmax':
                agents.append(RMaxAgent(mdp.get_actions()))

        # Run experiment and make plot.
        run_agents_on_mdp(agents,
                          mdp,
                          instances=1,
                          episodes=100,
                          steps=100,
                          open_plot=open_plot,
                          verbose=True)
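parse_custom_q_table is not shown; whatever it returns is passed to QLearningAgent's custom_q_init. A hedged sketch of the expected shape, assuming it mirrors the agent's internal nested state -> action -> value Q-table:

from collections import defaultdict

def make_uniform_q_table(default_q):
    # Hypothetical helper: every unseen (state, action) pair starts at
    # default_q, matching the nested-defaultdict layout QLearningAgent uses.
    return defaultdict(lambda: defaultdict(lambda: default_q))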
Example #9
def main():
    # Command line args.
    task, rom = parse_args()

    # Setup the MDP.
    mdp = choose_mdp(task, rom)
    actions = mdp.get_actions()
    gamma = mdp.get_gamma()

    # Setup agents.
    random_agent = RandomAgent(actions)
    rmax_agent = RMaxAgent(actions, gamma=gamma)
    qlearner_agent = QLearnerAgent(actions, gamma=gamma)
    lin_approx_agent = LinearApproxQLearnerAgent(actions, gamma=gamma)
    grad_boost_agent = GradientBoostingAgent(actions,
                                             gamma=gamma,
                                             explore="softmax")

    # Choose agents.
    agents = [lin_approx_agent, random_agent]

    # Run experiments.
    run_agents_on_mdp(agents, mdp)
Example #10
def main(eps=0.1, open_plot=True):

    mdp_class, is_goal_terminal, samples, alg = parse_args()

    # Setup multitask setting.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class=mdp_class)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print "Making and solving avg MDP...",
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp,
                                delta=0.001,
                                max_iterations=1000,
                                sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    transfer_fixed_agent = FixedPolicyAgent(avg_mdp_vi.policy,
                                            name="transferFixed")
    rand_agent = RandomAgent(actions, name=r"$\pi^u$")

    opt_q_func = compute_optimistic_q_function(mdp_distr)
    avg_q_func = avg_mdp_vi.get_q_function()

    if alg == "q":
        pure_ql_agent = QLearnerAgent(actions, epsilon=eps, name="Q-0")
        # Optimistic initialization at Vmax = Rmax / (1 - gamma) = 100 here.
        qmax = 1.0 / (1 - 0.99)
        pure_ql_agent_opt = QLearnerAgent(actions,
                                          epsilon=eps,
                                          default_q=qmax,
                                          name="Q-vmax")
        transfer_ql_agent_optq = QLearnerAgent(actions,
                                               epsilon=eps,
                                               name="Q-trans-max")
        transfer_ql_agent_optq.set_init_q_function(opt_q_func)
        transfer_ql_agent_avgq = QLearnerAgent(actions,
                                               epsilon=eps,
                                               name="Q-trans-avg")
        transfer_ql_agent_avgq.set_init_q_function(avg_q_func)

        agents = [
            pure_ql_agent, pure_ql_agent_opt, transfer_ql_agent_optq,
            transfer_ql_agent_avgq
        ]
    elif alg == "rmax":
        pure_rmax_agent = RMaxAgent(actions, name="RMAX-vmax")
        updating_trans_rmax_agent = UpdatingRMaxAgent(actions,
                                                      name="RMAX-updating_max")
        trans_rmax_agent = RMaxAgent(actions, name="RMAX-trans_max")
        trans_rmax_agent.set_init_q_function(opt_q_func)
        agents = [pure_rmax_agent, updating_trans_rmax_agent, trans_rmax_agent]
    elif alg == "delayed-q":
        pure_delayed_ql_agent = DelayedQLearnerAgent(actions,
                                                     opt_q_func,
                                                     name="DelayedQ-vmax")
        pure_delayed_ql_agent.set_vmax()
        updating_delayed_ql_agent = UpdatingDelayedQLearnerAgent(
            actions, name="DelayedQ-updating_max")
        trans_delayed_ql_agent = DelayedQLearnerAgent(
            actions, opt_q_func, name="DelayedQ-trans-max")
        agents = [
            pure_delayed_ql_agent, updating_delayed_ql_agent,
            trans_delayed_ql_agent
        ]
    else:
        print("Unknown type of agents:", alg)
        print("(q, rmax, delayed-q)")
        raise ValueError("Unknown agent type: {}".format(alg))

    # Run task.
    # TODO: Function for Learning on each MDP
    run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=samples,
                          episodes=1,
                          steps=100,
                          reset_at_terminal=is_goal_terminal,
                          is_rec_disc_reward=False,
                          cumulative_plot=True,
                          open_plot=open_plot)
Example #11
def get_exact_vs_approx_agents(environment, incl_opt=True):
    '''
    Args:
        environment (simple_rl.MDPDistribution)
        incl_opt (bool)

    Returns:
        (list)
    '''

    actions = environment.get_actions()
    gamma = environment.get_gamma()

    exact_qds_test = get_sa(environment,
                            indic_func=ind_funcs._q_eps_approx_indicator,
                            epsilon=0.0)
    approx_qds_test = get_sa(environment,
                             indic_func=ind_funcs._q_eps_approx_indicator,
                             epsilon=0.05)

    ql_agent = QLearningAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    ql_exact_agent = AbstractionWrapper(QLearningAgent,
                                        agent_params={"actions": actions},
                                        state_abstr=exact_qds_test,
                                        name_ext="-exact")
    ql_approx_agent = AbstractionWrapper(QLearningAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=approx_qds_test,
                                         name_ext="-approx")
    ql_agents = [ql_agent, ql_exact_agent, ql_approx_agent]

    dql_agent = DoubleQAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    dql_exact_agent = AbstractionWrapper(DoubleQAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=exact_qds_test,
                                         name_ext="-exact")
    dql_approx_agent = AbstractionWrapper(DoubleQAgent,
                                          agent_params={"actions": actions},
                                          state_abstr=approx_qds_test,
                                          name_ext="-approx")
    dql_agents = [dql_agent, dql_exact_agent, dql_approx_agent]

    rm_agent = RMaxAgent(actions, gamma=gamma)
    rm_exact_agent = AbstractionWrapper(RMaxAgent,
                                        agent_params={"actions": actions},
                                        state_abstr=exact_qds_test,
                                        name_ext="-exact")
    rm_approx_agent = AbstractionWrapper(RMaxAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=approx_qds_test,
                                         name_ext="-approx")
    rm_agents = [rm_agent, rm_exact_agent, rm_approx_agent]

    if incl_opt:
        vi = ValueIteration(environment)
        vi.run_vi()
        opt_agent = FixedPolicyAgent(vi.policy, name=r"$\pi^*$")

        sa_vi = AbstractValueIteration(
            environment,
            sample_rate=50,
            max_iterations=3000,
            delta=0.0001,
            state_abstr=approx_qds_test,
            action_abstr=ActionAbstraction(
                options=[], prim_actions=environment.get_actions()))
        sa_vi.run_vi()
        approx_opt_agent = FixedPolicyAgent(sa_vi.policy, name=r"$\pi_\phi^*$")

        dql_agents += [opt_agent, approx_opt_agent]

    # Return all three families so the Double-Q and R-Max variants (and, when
    # incl_opt is set, the optimal-policy baselines) are included.
    return ql_agents + dql_agents + rm_agents
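A hedged usage sketch (make_mdp and run_agents_multi_task are assumed available, as in the other examples):

# environment = make_mdp.make_mdp_distr(mdp_class="four_room")
# agents = get_exact_vs_approx_agents(environment, incl_opt=False)
# run_agents_multi_task(agents, environment, task_samples=10, steps=1000, episodes=1)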
Example #12
from simple_rl.agents import QLearningAgent, RandomAgent, RMaxAgent
from simple_rl.tasks import GridWorldMDP
from simple_rl.run_experiments import run_agents_on_mdp

# Setup MDP.
mdp = GridWorldMDP(width=4,
                   height=3,
                   init_loc=(1, 1),
                   goal_locs=[(4, 3)],
                   lava_locs=[(4, 2)],
                   gamma=0.95,
                   walls=[(2, 2)],
                   slip_prob=0.05)

# Setup Agents.
ql_agent = QLearningAgent(actions=mdp.get_actions())
rmax_agent = RMaxAgent(actions=mdp.get_actions())
rand_agent = RandomAgent(actions=mdp.get_actions())

# Run experiment and make plot.
run_agents_on_mdp([ql_agent, rmax_agent, rand_agent],
                  mdp,
                  instances=5,
                  episodes=50,
                  steps=10)
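The same call can also pass two optional flags used by other examples in this collection (open_plot and verbose; assumed to be supported by your installed simple_rl version):

run_agents_on_mdp([ql_agent, rmax_agent, rand_agent],
                  mdp,
                  instances=5,
                  episodes=50,
                  steps=10,
                  open_plot=False,  # skip opening the plot window
                  verbose=True)     # print progress while running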
Example #13
def main():

    # Setup environment.
    mdp_class, agent_type, samples = parse_args()
    is_goal_terminal = False
    mdp_distr = make_mdp_distr(mdp_class=mdp_class,
                               is_goal_terminal=is_goal_terminal)
    mdp_distr.set_gamma(0.99)
    actions = mdp_distr.get_actions()

    # Compute priors.

    # Stochastic mixture.
    mdp_distr_copy = copy.deepcopy(mdp_distr)
    opt_stoch_policy = ape.compute_optimal_stoch_policy(mdp_distr_copy)

    # Avg MDP
    avg_mdp = ape.compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp,
                                delta=0.001,
                                max_iterations=1000,
                                sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    # Make agents.

    # Q Learning
    ql_agent = QLearnerAgent(actions)
    shaped_ql_agent_prior = ShapedQAgent(shaping_policy=opt_stoch_policy,
                                         actions=actions,
                                         name="Prior-QLearning")
    shaped_ql_agent_avgmdp = ShapedQAgent(shaping_policy=avg_mdp_vi.policy,
                                          actions=actions,
                                          name="AvgMDP-QLearning")

    # RMax
    rmax_agent = RMaxAgent(actions)
    shaped_rmax_agent_prior = ShapedRMaxAgent(
        shaping_policy=opt_stoch_policy,
        state_space=avg_mdp_vi.get_states(),
        actions=actions,
        name="Prior-RMax")
    shaped_rmax_agent_avgmdp = ShapedRMaxAgent(
        shaping_policy=avg_mdp_vi.policy,
        state_space=avg_mdp_vi.get_states(),
        actions=actions,
        name="AvgMDP-RMax")
    prune_rmax_agent = PruneRMaxAgent(mdp_distr=mdp_distr)

    if agent_type == "rmax":
        agents = [
            rmax_agent, shaped_rmax_agent_prior, shaped_rmax_agent_avgmdp,
            prune_rmax_agent
        ]
    else:
        agents = [ql_agent, shaped_ql_agent_prior, shaped_ql_agent_avgmdp]

    # Run task.
    run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=samples,
                          episodes=1,
                          steps=200,
                          is_rec_disc_reward=False,
                          verbose=True)