Example #1
def main():

    # Set Params.
    mdp_class, task_samples, episodes, steps, grid_dim, AgentClass = get_params(
        set_manually=False)
    experiment_type = "sa"
    lifelong = True
    resample_at_terminal = False
    reset_at_terminal = False
    gamma = 0.95

    # ======================
    # == Make Environment ==
    # ======================
    environment = make_mdp.make_mdp_distr(
        mdp_class=mdp_class,
        grid_dim=grid_dim) if lifelong else make_mdp.make_mdp(
            mdp_class=mdp_class, grid_dim=grid_dim)
    environment.set_gamma(gamma)

    # =================
    # == Make Agents ==
    # =================
    agents = []
    if experiment_type == "sa":
        # SA experiment.
        agents = get_sa_experiment_agents(environment, AgentClass)
    elif experiment_type == "combo":
        # AA experiment.
        agents = get_combo_experiment_agents(environment)
    elif experiment_type == "exact_v_approx":
        # The original snippet referenced an undefined `multi_task` flag;
        # `lifelong` plays that role here.
        agents = get_exact_vs_approx_agents(environment,
                                            incl_opt=(not lifelong))
    elif experiment_type == "opt":
        agents = get_optimal_policies(environment)
    else:
        print("Experiment Error: experiment type unknown (" + experiment_type +
              "). Must be one of {sa, combo, exact_v_approx, opt}.")
        quit()

    # Run!
    if lifelong:
        run_agents_lifelong(agents,
                            environment,
                            samples=task_samples,
                            steps=steps,
                            episodes=episodes,
                            reset_at_terminal=reset_at_terminal,
                            resample_at_terminal=resample_at_terminal,
                            cumulative_plot=True,
                            clear_old_results=True)
    else:
        run_agents_on_mdp(agents,
                          environment,
                          instances=task_samples,
                          steps=steps,
                          episodes=episodes,
                          reset_at_terminal=reset_at_terminal,
                          track_disc_reward=False)
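
`get_params` is a local helper that is not included in this listing. A purely illustrative stand-in returning the six values unpacked above; every name and default below is an assumption, not the original helper:

def get_params(set_manually=False):
    # Hypothetical stand-in: returns the six values unpacked in main() above.
    # Defaults are illustrative; the real helper presumably reads them from
    # the command line when set_manually is False.
    from simple_rl.agents import QLearningAgent
    mdp_class = "four_room"
    task_samples = 10
    episodes = 100
    steps = 150
    grid_dim = 11
    AgentClass = QLearningAgent
    return mdp_class, task_samples, episodes, steps, grid_dim, AgentClass
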
Example #2
def main(open_plot=True):
    # Make MDP distribution, agents.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room")

    ql_agent = QLearningAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Run experiment and make plot.
    run_agents_lifelong([ql_agent, rand_agent], mdp_distr, samples=10, episodes=50, steps=100, reset_at_terminal=True, open_plot=open_plot)
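
For completeness, the imports this example leans on and the usual entry point. The simple_rl paths below are the standard ones, but treat them (and the local make_mdp module) as assumptions to check against your installation:

# Assumed imports / entry point for the example above.
from simple_rl.agents import QLearningAgent, RandomAgent
from simple_rl.run_experiments import run_agents_lifelong
import make_mdp  # local helper module shipped alongside these example scripts

if __name__ == "__main__":
    main()
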
Example #3
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room")
    ql_agent = QLearningAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Make goal-based option agent.
    goal_based_options = aa_helpers.make_goal_based_options(mdp_distr)
    goal_based_aa = ActionAbstraction(prim_actions=mdp_distr.get_actions(), options=goal_based_options)
    option_agent = AbstractionWrapper(QLearningAgent, agent_params={"actions":mdp_distr.get_actions()}, action_abstr=goal_based_aa)

    # Run experiment and make plot.
    run_agents_lifelong([ql_agent, rand_agent, option_agent], mdp_distr, samples=10, episodes=100, steps=150, open_plot=open_plot)
Example #4
def main():

    # Make MDP Distribution.
    mdp_distr = make_mdp_distr(mdp_class="four_room",
                               grid_dim=11,
                               slip_prob=0.05,
                               gamma=0.99)

    # Make SA.
    multitask_sa_beta_1 = make_multitask_sa_info_sa(mdp_distr,
                                                    beta=1.0,
                                                    is_deterministic_ib=True)
    multitask_sa_beta_10 = make_multitask_sa_info_sa(mdp_distr,
                                                     beta=10.0,
                                                     is_deterministic_ib=True)
    multitask_sa_beta_100 = make_multitask_sa_info_sa(mdp_distr,
                                                      beta=100.0,
                                                      is_deterministic_ib=True)
    multitask_sa_beta_1000 = make_multitask_sa_info_sa(
        mdp_distr, beta=1000.0, is_deterministic_ib=True)

    # Make agent.
    ql_agent = QLearningAgent(mdp_distr.get_actions())
    abstr_ql_b1 = AbstractionWrapper(
        QLearningAgent,
        state_abstr=multitask_sa_beta_1,
        agent_params={"actions": mdp_distr.get_actions()},
        name_ext="-$\\phi_{\\beta = 1}$")
    abstr_ql_b10 = AbstractionWrapper(
        QLearningAgent,
        state_abstr=multitask_sa_beta_10,
        agent_params={"actions": mdp_distr.get_actions()},
        name_ext="-$\\phi_{\\beta = 10}$")
    abstr_ql_b100 = AbstractionWrapper(
        QLearningAgent,
        state_abstr=multitask_sa_beta_100,
        agent_params={"actions": mdp_distr.get_actions()},
        name_ext="-$\\phi_{\\beta = 100}$")
    abstr_ql_b1000 = AbstractionWrapper(
        QLearningAgent,
        state_abstr=multitask_sa_beta_1000,
        agent_params={"actions": mdp_distr.get_actions()},
        name_ext="-$\\phi_{\\beta = 1000}$")
    run_agents_lifelong(
        [abstr_ql_b1, abstr_ql_b10, abstr_ql_b100, abstr_ql_b1000, ql_agent],
        mdp_distr,
        steps=200,
        samples=50,
        episodes=200)
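
The four abstraction/agent pairs above differ only in β, so the sweep can equally be built in a loop. This sketch reuses only the calls already shown in this example:

# Equivalent construction of the beta sweep above, driven by a list.
betas = [1.0, 10.0, 100.0, 1000.0]
abstr_agents = []
for beta in betas:
    sa = make_multitask_sa_info_sa(mdp_distr, beta=beta, is_deterministic_ib=True)
    abstr_agents.append(
        AbstractionWrapper(QLearningAgent,
                           state_abstr=sa,
                           agent_params={"actions": mdp_distr.get_actions()},
                           name_ext="-$\\phi_{\\beta = %d}$" % int(beta)))

run_agents_lifelong(abstr_agents + [ql_agent],
                    mdp_distr,
                    steps=200,
                    samples=50,
                    episodes=200)
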
Example #5
def main():
    from agents import OptimalBeliefAgentClass

    # Setup multitask setting.
    # R ~ D : Puddle, Rock Sample
    # G ~ D : octo, four_room
    # T ~ D : grid

    mdp_class, is_goal_terminal, samples = parse_args()

    mdp_distr = make_mdp_distr(mdp_class=mdp_class, is_goal_terminal=is_goal_terminal)
    mdp_distr.set_gamma(0.99)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print("Making and solving avg MDP...", end='')
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()
    print("done.")  # iters, value
    sys.stdout.flush()

    # Agents.
    print("Making agents...", end='')
    sys.stdout.flush()
    mdp_distr_copy = copy.deepcopy(mdp_distr)

    # Add additional agent:

    opt_stoch_policy = compute_optimal_stoch_policy(mdp_distr_copy)
    opt_stoch_policy_agent = FixedPolicyAgent(opt_stoch_policy, name="$\\pi_{prior}$")
    opt_belief_agent = OptimalBeliefAgentClass.OptimalBeliefAgent(mdp_distr, actions)
    vi_agent = FixedPolicyAgent(avg_mdp_vi.policy, name="$\\pi_{avg}$")
    rand_agent = RandomAgent(actions, name="$\\pi^u$")
    ql_agent = QLearningAgent(actions)  # Constructed but not added to `agents` below.
    print("done.")
    
    agents = [vi_agent, opt_stoch_policy_agent, rand_agent, opt_belief_agent]

    # Run task.
    run_agents_lifelong(agents, mdp_distr, samples=samples, episodes=1, steps=100, reset_at_terminal=False, track_disc_reward=False, cumulative_plot=True)
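
A quick sanity check of the solved average MDP can be dropped into main() right after run_vi(). This sketch uses only objects created above plus get_init_state(), which I am assuming is the usual simple_rl MDP accessor:

    # Optional sanity check (sketch): greedy action of the average-MDP policy
    # at the start state, once avg_mdp_vi.run_vi() has finished.
    init_state = avg_mdp.get_init_state()
    print("Average-MDP greedy action at the initial state:", avg_mdp_vi.policy(init_state))
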
Example #6
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room")
    ql_agent = QLearningAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Make goal-based option agent.
    goal_based_options = aa_helpers.make_goal_based_options(mdp_distr)
    goal_based_aa = ActionAbstraction(prim_actions=mdp_distr.get_actions(),
                                      options=goal_based_options)
    option_agent = AbstractionWrapper(QLearningAgent,
                                      actions=mdp_distr.get_actions(),
                                      action_abstr=goal_based_aa)

    # Run experiment and make plot.
    run_agents_lifelong([ql_agent, rand_agent, option_agent],
                        mdp_distr,
                        samples=10,
                        episodes=100,
                        steps=150,
                        open_plot=open_plot)
Example #7
def main(open_plot=True):
    episodes = 100
    steps = 100
    gamma = 0.95

    mdp_class, is_goal_terminal, samples, alg = parse_args()

    # Setup multitask setting.
    mdp_distr = make_mdp_distr(mdp_class=mdp_class,
                               is_goal_terminal=is_goal_terminal,
                               gamma=gamma)
    actions = mdp_distr.get_actions()

    # Compute average MDP.
    print("Making and solving avg MDP...", end='')
    sys.stdout.flush()
    avg_mdp = compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp,
                                delta=0.001,
                                max_iterations=1000,
                                sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    ### Yuu

    # transfer_fixed_agent = FixedPolicyAgent(avg_mdp_vi.policy, name="transferFixed")
    rand_agent = RandomAgent(actions, name="$\\pi^u$")

    opt_q_func = compute_optimistic_q_function(mdp_distr)
    avg_q_func = get_q_func(avg_mdp_vi)

    best_v = -100  # Running max over Q-values; initialized below any attainable value, becomes Vmax after the scan.
    for x in opt_q_func:
        for y in opt_q_func[x]:
            best_v = max(best_v, opt_q_func[x][y])
    print("Vmax =", best_v)
    vmax = best_v
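
    # Note (sketch): the best_v scan above is just the maximum entry of the
    # nested dict, i.e. equivalent to:
    #   vmax = max(q for a_to_q in opt_q_func.values() for q in a_to_q.values())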

    vmax_func = defaultdict(lambda: defaultdict(lambda: vmax))

    if alg == "q":
        eps = 0.1
        lrate = 0.1
        pure_ql_agent = QLearningAgent(actions,
                                       gamma=gamma,
                                       alpha=lrate,
                                       epsilon=eps,
                                       name="Q-0")
        pure_ql_agent_opt = QLearningAgent(actions,
                                           gamma=gamma,
                                           alpha=lrate,
                                           epsilon=eps,
                                           default_q=vmax,
                                           name="Q-Vmax")
        ql_agent_upd_maxq = UpdatingQLearnerAgent(actions,
                                                  alpha=lrate,
                                                  epsilon=eps,
                                                  gamma=gamma,
                                                  default_q=vmax,
                                                  name="Q-MaxQInit")

        transfer_ql_agent_optq = QLearningAgent(actions,
                                                gamma=gamma,
                                                alpha=lrate,
                                                epsilon=eps,
                                                name="Q-UO")
        transfer_ql_agent_optq.set_init_q_function(opt_q_func)

        transfer_ql_agent_avgq = QLearningAgent(actions,
                                                gamma=gamma,
                                                alpha=lrate,
                                                epsilon=eps,
                                                name="Q-AverageQInit")
        transfer_ql_agent_avgq.set_init_q_function(avg_q_func)

        agents = [
            transfer_ql_agent_optq, ql_agent_upd_maxq, transfer_ql_agent_avgq,
            pure_ql_agent_opt, pure_ql_agent
        ]
    elif alg == "rmax":
        """
        Note that Rmax is a model-based algorithm and is much slower than model-free algorithms such as Q-learning and delayed Q-learning.
        """
        known_threshold = 10
        min_experience = 5
        pure_rmax_agent = RMaxAgent(actions,
                                    gamma=gamma,
                                    horizon=known_threshold,
                                    s_a_threshold=min_experience,
                                    name="RMAX-Vmax")
        updating_trans_rmax_agent = UpdatingRMaxAgent(
            actions,
            gamma=gamma,
            horizon=known_threshold,
            s_a_threshold=min_experience,
            name="RMAX-MaxQInit")
        trans_rmax_agent = RMaxAgent(actions,
                                     gamma=gamma,
                                     horizon=known_threshold,
                                     s_a_threshold=min_experience,
                                     name="RMAX-UO")
        trans_rmax_agent.set_init_q_function(opt_q_func)
        agents = [
            trans_rmax_agent, updating_trans_rmax_agent, pure_rmax_agent,
            rand_agent
        ]
    elif alg == "delayed-q":
        tolerance = 0.1
        min_experience = 5
        pure_delayed_ql_agent = DelayedQAgent(actions,
                                              gamma=gamma,
                                              m=min_experience,
                                              epsilon1=tolerance,
                                              name="DelayedQ-Vmax")
        pure_delayed_ql_agent.set_q_function(vmax_func)
        updating_delayed_ql_agent = UpdatingDelayedQLearningAgent(
            actions,
            default_q=vmax,
            gamma=gamma,
            m=min_experience,
            epsilon1=tolerance,
            name="DelayedQ-MaxQInit")
        updating_delayed_ql_agent.set_q_function(vmax_func)
        trans_delayed_ql_agent = DelayedQAgent(actions,
                                               gamma=gamma,
                                               m=min_experience,
                                                epsilon1=tolerance,
                                               name="DelayedQ-UO")
        trans_delayed_ql_agent.set_q_function(opt_q_func)

        agents = [
            pure_delayed_ql_agent, updating_delayed_ql_agent,
            trans_delayed_ql_agent, rand_agent
        ]
        # agents = [updating_delayed_ql_agent, trans_delayed_ql_agent, rand_agent]
    elif alg == "sample-effect":
        """
        This runs a comparison of MaxQInit with different numbers of MDP samples used to compute the initial Q-function. Note that performance on the sampled MDPs is excluded from the results of this experiment. It reproduces the result in Figure 4 of "Policy and Value Transfer in Lifelong Reinforcement Learning".
        """
        tolerance = 0.1
        min_experience = 5
        pure_delayed_ql_agent = DelayedQAgent(actions,
                                              opt_q_func,
                                              m=min_experience,
                                              epsilon1=tolerance,
                                              name="DelayedQ-Vmax")
        pure_delayed_ql_agent.set_vmax()
        dql_60samples = UpdatingDelayedQLearningAgent(
            actions,
            default_q=vmax,
            gamma=gamma,
            m=min_experience,
            epsilon1=tolerance,
            num_sample_tasks=60,
            name="$DelayedQ-MaxQInit60$")
        dql_40samples = UpdatingDelayedQLearningAgent(
            actions,
            default_q=vmax,
            gamma=gamma,
            m=min_experience,
            epsilon1=tolerance,
            num_sample_tasks=40,
            name="$DelayedQ-MaxQInit40$")
        dql_20samples = UpdatingDelayedQLearningAgent(
            actions,
            default_q=vmax,
            gamma=gamma,
            m=min_experience,
            epsilon1=tolerance,
            num_sample_tasks=20,
            name="$DelayedQ-MaxQInit20$")

        # Sample MDPs. Note that performance on these sampled MDPs is ignored and not included in the averages in the final plot.
        run_agents_lifelong([dql_20samples],
                            mdp_distr,
                            samples=int(samples * 1 / 5.0),
                            episodes=episodes,
                            steps=steps,
                            reset_at_terminal=is_goal_terminal,
                            track_disc_reward=False,
                            cumulative_plot=True,
                            open_plot=open_plot)
        # mdp_distr.reset_tasks()
        run_agents_lifelong([dql_40samples],
                            mdp_distr,
                            samples=int(samples * 2 / 5.0),
                            episodes=episodes,
                            steps=steps,
                            reset_at_terminal=is_goal_terminal,
                            track_disc_reward=False,
                            cumulative_plot=True,
                            open_plot=open_plot)
        # mdp_distr.reset_tasks()
        run_agents_lifelong([dql_60samples],
                            mdp_distr,
                            samples=int(samples * 3 / 5.0),
                            episodes=episodes,
                            steps=steps,
                            reset_at_terminal=is_goal_terminal,
                            track_disc_reward=False,
                            cumulative_plot=True,
                            open_plot=open_plot)
        # mdp_distr.reset_tasks()
        # agents = [pure_delayed_ql_agent]
        agents = [
            dql_60samples, dql_40samples, dql_20samples, pure_delayed_ql_agent
        ]
    else:
        msg = "Unknown agent type: " + alg + ". Use -agent_type (q, rmax, delayed-q, sample-effect)."
        assert False, msg

    # Run task.
    run_agents_lifelong(agents,
                        mdp_distr,
                        samples=samples,
                        episodes=episodes,
                        steps=steps,
                        reset_at_terminal=is_goal_terminal,
                        track_disc_reward=False,
                        cumulative_plot=True,
                        open_plot=open_plot)
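
`parse_args` is a local helper that is not shown in this listing. A hypothetical argparse-based stand-in that returns the four values unpacked at the top of this example; the flag names and defaults are my assumptions, except `-agent_type`, which the error message above already references:

def parse_args():
    # Hypothetical stand-in for the missing helper (defaults are illustrative).
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("-mdp_class", type=str, default="four_room")
    parser.add_argument("-goal_terminal", action="store_true")
    parser.add_argument("-samples", type=int, default=50)
    parser.add_argument("-agent_type", type=str, default="q",
                        choices=["q", "rmax", "delayed-q", "sample-effect"])
    args = parser.parse_args()
    return args.mdp_class, args.goal_terminal, args.samples, args.agent_type
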
Example #8
def main():

    # ======================
    # == Make Environment ==
    # ======================
    params = get_params()
    num_test_mdps = 6  # Upper bound; only 4 gravity values are listed below.
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy

    if params['multitask']:
        # Make distribution.
        gravities = [5.0, 6.0, 8.0, 12.0][:num_test_mdps]
        mdp_dist_dict = {
            CartPoleMDP(gravity=gravity): 1.0 / len(gravities)
            for gravity in gravities
        }
        test_mdp = MDPDistribution(mdp_dist_dict)
    else:
        test_mdp = CartPoleMDP()

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "cartpole_nn_sa"
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions,
                                num_features=num_features,
                                alpha=params['rl_learning_rate'])
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={
                                      "alpha": params['rl_learning_rate'],
                                      "epsilon": 0.2,
                                      "actions": test_mdp.get_actions()
                                  },
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================

    if params['multitask']:
        run_agents_lifelong([sa_agent, linear_agent],
                            test_mdp,
                            samples=params['num_instances'],
                            episodes=params['episodes'],
                            steps=params['steps'],
                            verbose=False)
    else:
        # demo_agent = FixedPolicyAgent(cpd.expert_cartpole_policy)
        run_agents_on_mdp([sa_agent, linear_agent],
                          test_mdp,
                          instances=params['num_instances'],
                          episodes=params['episodes'],
                          steps=params['steps'],
                          verbose=False)
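
`get_params()` is not shown; the example reads five keys from the dict it returns. A hypothetical stand-in covering only those keys (the real helper almost certainly carries more settings, e.g. whatever make_nn_sa needs, and the values here are illustrative):

def get_params():
    # Hypothetical stand-in: only the keys referenced in main() above.
    return {
        "multitask": False,
        "rl_learning_rate": 0.005,
        "num_instances": 5,
        "episodes": 100,
        "steps": 200,
    }
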
Example #9
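This snippet begins inside a function whose opening lines were cut off: `mdp_distr` must already be a plain dict mapping MDPs to probabilities, since it is indexed through `.keys()` and only wrapped in `MDPDistribution` near the end. A hypothetical preamble consistent with those uses (assuming simple_rl's GridWorldMDP; not the original code):

def main():
    # Hypothetical setup, not from the original: a dict mapping MDPs to
    # sampling probabilities, as the code below expects.
    from simple_rl.tasks import GridWorldMDP

    mdp_distr = {
        GridWorldMDP(width=5, height=5, goal_locs=[(5, 5)]): 0.5,
        GridWorldMDP(width=5, height=5, goal_locs=[(1, 5)]): 0.5,
    }
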
    actions = list(mdp_distr.keys())[0].actions
    gamma = list(mdp_distr.keys())[0].gamma

    ql_agent = QLearningAgent(actions, gamma=gamma)

    pblocks_aa = get_policy_blocks_aa(mdp_distr,
                                      num_options=5,
                                      task_samples=20,
                                      incl_prim_actions=True)
    regular_sa = get_sa(mdp_distr, default=True)

    pblocks_ql_agent = AbstractionWrapper(QLearningAgent,
                                          actions,
                                          state_abs=regular_sa,
                                          action_abs=pblocks_aa,
                                          name_ext="aa")

    agents = [pblocks_ql_agent, ql_agent]

    mdp_distr = MDPDistribution(mdp_distr)
    run_agents_lifelong(agents,
                        mdp_distr,
                        task_samples=100,
                        episodes=1,
                        steps=10000)

    from visualize_abstractions import visualize_options_grid

    visualize_options_grid(mdp_distr.sample(1), regular_sa.get_ground_states(),
                           pblocks_aa)