Example #1
def main():

    # Grab experiment params.
    mdp = BadChainMDP(gamma=0.95, kappa=0.001)
    actions = mdp.get_actions()

    # =======================
    # == Make Abstractions ==
    # =======================
    sa_q_eps = get_sa(mdp,
                      indic_func=indicator_funcs._q_eps_approx_indicator,
                      epsilon=0.1)

    # RMax Agents.
    rmax_agent = RMaxAgent(actions)
    abstr_rmax_agent = AbstractionWrapper(RMaxAgent,
                                          state_abstr=sa_q_eps,
                                          agent_params={"actions": actions},
                                          name_ext="-$\\phi_{Q_\\epsilon^*}$")

    # Delayed Q Agents.
    del_q_agent = DelayedQAgent(actions)
    abstr_del_q_agent = AbstractionWrapper(DelayedQAgent,
                                           state_abstr=sa_q_eps,
                                           agent_params={"actions": actions},
                                           name_ext="-$\\phi_{Q_\\epsilon^*}$")

    run_agents_on_mdp(
        [rmax_agent, abstr_rmax_agent, del_q_agent, abstr_del_q_agent],
        mdp,
        instances=50,
        steps=250,
        episodes=1)
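As a reading aid for this and the following examples, here is a minimal, illustrative sketch of the wrapper pattern: the wrapped agent only ever sees abstract states. This is a toy, not simple_rl's actual AbstractionWrapper; the names phi and base_agent are assumptions made for the example.

# Illustrative sketch only (not the simple_rl implementation): an
# abstraction wrapper routes every ground state through phi before
# delegating to the wrapped agent.
class ToyAbstractionWrapper(object):
    def __init__(self, AgentClass, agent_params, phi, name_ext=""):
        self.base_agent = AgentClass(**agent_params)
        self.phi = phi  # Callable: ground state -> abstract state.
        self.name = AgentClass.__name__ + name_ext

    def act(self, ground_state, reward):
        # The wrapped agent only ever observes abstract states.
        return self.base_agent.act(self.phi(ground_state), reward)

    def reset(self):
        self.base_agent.reset()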
Example #2
def get_combo_experiment_agents(environment):
    '''
    Args:
        environment (simple_rl.MDPDistribution)

    Returns:
        (list)
    '''
    actions = environment.get_actions()
    gamma = environment.get_gamma()

    sa, aa = get_directed_option_sa_pair(
        environment,
        indic_func=ind_funcs._q_disc_approx_indicator,
        max_options=100)
    sa_qds_test = get_sa(environment,
                         indic_func=ind_funcs._q_disc_approx_indicator,
                         epsilon=0.05)
    sa_qs_test = get_sa(environment,
                        indic_func=ind_funcs._q_eps_approx_indicator,
                        epsilon=0.1)

    # Base learners.
    ql_agent = QLearningAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    rmax_agent = RMaxAgent(actions, gamma=gamma)  # Constructed but not added to the returned agent list.

    # Combos.
    ql_sa_qds_agent = AbstractionWrapper(QLearningAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=sa_qds_test,
                                         name_ext="$\phi_{Q_d^*}$")
    ql_sa_qs_agent = AbstractionWrapper(QLearningAgent,
                                        agent_params={"actions": actions},
                                        state_abstr=sa_qs_test,
                                        name_ext="$\phi_{Q_\epsilon^*}$")

    # sa_agent = AbstractionWrapper(QLearningAgent, actions, str(environment), state_abstr=sa, name_ext="sa")
    aa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"actions": actions},
                                  action_abstr=aa,
                                  name_ext="aa")
    sa_aa_agent = AbstractionWrapper(QLearningAgent,
                                     agent_params={"actions": actions},
                                     state_abstr=sa,
                                     action_abstr=aa,
                                     name_ext="$\phi_{Q_d^*}+aa$")

    agents = [ql_agent, ql_sa_qds_agent, ql_sa_qs_agent, aa_agent, sa_aa_agent]

    return agents
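The indicator functions passed to get_sa above decide which ground states may be merged. Below is a minimal sketch of the idea behind an epsilon-approximate Q indicator; it is not the library's _q_eps_approx_indicator, and q_func is an assumed dict of Q-value estimates.

# Two states can share an abstract state when their Q-values agree
# within epsilon for every action; epsilon=0.0 recovers an exact
# Q*-irrelevance abstraction.
def q_eps_indicator(s1, s2, q_func, actions, epsilon=0.1):
    return all(abs(q_func[s1][a] - q_func[s2][a]) <= epsilon for a in actions)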
Example #3
def learn_w_abstr(mdp, demo_policy, beta_list=[20], is_deterministic_ib=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy (lambda : simple_rl.State --> str)
        beta_list (list)
        is_deterministic_ib (bool)

    Summary:
        Computes a state abstraction for the given beta and compares Q-Learning with and without the abstraction.
    '''
    # Run info_sa.
    dict_of_phi_pmfs = {}
    for beta in beta_list:
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(mdp, demo_policy, iters=300, beta=beta, convergence_threshold=0.0001, is_deterministic_ib=is_deterministic_ib)

        # Translate abstractions.
        prob_s_phi = ProbStateAbstraction(phi_pmf)
        crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)
        # Map from ground states to abstract states.
        dict_of_phi_pmfs[beta] = crisp_s_phi

        # Debug output: each abstract state and its ground states.
        print("crisp_s_phi:")
        for single_state in crisp_s_phi.get_abs_states():
            print(type(single_state))
            print("ground_for_above: " + str(crisp_s_phi.get_ground_states_in_abs_state(single_state)))
        print("ground states:")
        for ground_state in crisp_s_phi.get_ground_states():
            print(type(ground_state))
        print(len(crisp_s_phi.get_ground_states()))
        print(len(crisp_s_phi.get_abs_states()))

    # Make agents.
    demo_agent = FixedPolicyAgent(demo_policy, name="$\\pi_d$")
    ql_agent = QLearningAgent(mdp.get_actions())
    agent_dict = {}
    for beta in beta_list:
        beta_phi = dict_of_phi_pmfs[beta]
        ql_abstr_agent = AbstractionWrapper(QLearningAgent,
                                            state_abstr=beta_phi,
                                            agent_params={"actions": mdp.get_actions(), "anneal": True})
        agent_dict[beta] = ql_abstr_agent

    # Learn.
    run_agents_on_mdp([demo_agent, ql_agent] + list(agent_dict.values()), mdp, episodes=100, steps=10, instances=5)

    # Print the number of abstract states for each beta.
    for beta in dict_of_phi_pmfs.keys():
        print("beta |S_phi|:", beta, dict_of_phi_pmfs[beta].get_num_abstr_states())
    print()
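convert_prob_sa_to_sa, used above, collapses a probabilistic abstraction into a crisp one. Here is a minimal sketch of that step, under the assumption that phi_pmf maps each ground state to a distribution over abstract states; the library's own converter plays an analogous role.

# Illustrative sketch of "crisping" a probabilistic state abstraction:
# assign each ground state to its most likely abstract state. Assumes
# phi_pmf[ground][abstract] = Pr(abstract | ground).
def crisp_from_prob(phi_pmf):
    return {ground: max(dist, key=dist.get) for ground, dist in phi_pmf.items()}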
def main():

    # ======================
    # == Make Environment ==
    # ======================
    params = {
        'multitask': False,
        'env_name': "LunarLander-v2",
        'obs_size': 8,
        'num_iterations_for_abstraction_learning': 500,
        'learning_rate_for_abstraction_learning': 0.005,
        'abstraction_network_hidden_layers': 2,
        'abstraction_network_hidden_nodes': 200,
        'num_samples_from_demonstrator': 10000,
        'episodes': 200,
        'steps': 1000,
        'num_instances': 5,
        'rl_learning_rate': 0.005,
    }
    mdp_demo_policy_dict = {}
    env_name = "LunarLander-v2"
    env_gym = gym.make(env_name)
    obs_size = len(env_gym.observation_space.high)
    env = GymMDP(env_name=env_name, render=True, render_every_n_episodes=20)
    test_mdp = env  # The test MDP is the same environment.
    mdp_demo_policy_dict[env] = lpd.expert_lunar_policy

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "lunar_nn_sa"
    num_iterations = 300
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions,
                                num_features=num_features,
                                alpha=params['rl_learning_rate'])
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"alpha": params['rl_learning_rate'],
                                                "actions": test_mdp.get_actions(),
                                                "anneal": True},
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    run_agents_on_mdp([sa_agent, linear_agent],
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=True,
                      track_success=True,
                      success_reward=100)
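NNStateAbstr treats the trained network as a state-abstraction function. As a rough sketch of the discretization step (assuming the abstraction network outputs class probabilities over abstract states; predict_probs is a hypothetical stand-in for its forward pass):

import numpy as np

def abstract_state_of(observation, predict_probs):
    # Index of the most probable abstract state for this observation.
    probs = predict_probs(np.asarray(observation, dtype=np.float32))
    return int(np.argmax(probs))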
def main():

    # ======================
    # == Make Environment ==
    # ======================
    params = get_params()

    # ============================
    # == Make test and train environments
    # == along with demonstrator(s)
    # ============================
    mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(
        multitask=params['multitask'])
    expert_puddle_policy = ppd.get_demo_policy_given_goal(
        test_mdp.get_goal_locs()[0])
    demo_agent = FixedPolicyAgent(expert_puddle_policy)

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions, num_features=num_features)
    sa_agent = AbstractionWrapper(
        QLearningAgent,
        agent_params={"actions": test_mdp.get_actions()},
        state_abstr=nn_sa,
        name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    run_agents_on_mdp([sa_agent, linear_agent],
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=False)
Example #6
def main():

    # ======================
    # == Make Environment ==
    # ======================
    params = rlec.get_cartpole_params()
    num_test_mdps = 6  # 6 is max.
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy
    test_mdp = CartPoleMDP()

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "cartpole_nn_sa"
    params['num_iterations_for_abstraction_learning'] = 500
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # ====================================
    # == Visualize Abstract State Space ==
    # ====================================

    # Collect dataset based on learner.
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={
                                      "alpha": params['rl_learning_rate'],
                                      "epsilon": 0.2,
                                      "actions": test_mdp.get_actions()
                                  },
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")
    #visited_states = vu.collect_dataset(test_mdp, samples=2000) #, learning_agent=sa_agent)
    visited_states = collect_samples_from_demo_policy_random_s0_cartpole(
        mdp_demo_policy_dict, num_samples=2000)

    # Get feature indices.
    features = get_feature_dicts()

    # Visualize.
    vu.visualize_state_abstrs3D(visited_states, features, nn_sa)
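collect_samples_from_demo_policy_random_s0_cartpole gathers the states to visualize. A hedged sketch of that kind of collection loop, assuming a classic Gym-style environment and a policy that maps observations to actions (the helper's actual internals are not shown here):

import gym

def collect_demo_states(env_name, policy, num_samples=2000, max_steps=200):
    # Roll out the policy from fresh initial states until enough
    # observations have been collected.
    env = gym.make(env_name)
    visited = []
    while len(visited) < num_samples:
        obs = env.reset()
        for _ in range(max_steps):
            visited.append(obs)
            obs, _, done, _ = env.step(policy(obs))
            if done:
                break
    return visited[:num_samples]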
Example #7
def info_sa_compare_policies(mdp, demo_policy_lambda, beta=3.0, is_deterministic_ib=False, is_agent_in_control=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy_lambda (lambda : simple_rl.State --> str)
        beta (float)
        is_deterministic_ib (bool): If True, run DIB, else IB.
        is_agent_in_control (bool): If True, runs the DIB in agent_in_control.py instead.

    Summary:
        Runs info_sa and compares the value of the found policy with the demonstrator policy.
    '''
    if is_agent_in_control:
        # Run info_sa with the agent controlling the MDP.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = agent_in_control.run_agent_in_control_info_sa(mdp, demo_policy_lambda, rounds=100, iters=500, beta=beta, is_deterministic_ib=is_deterministic_ib)
    else:
        # Run info_sa.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(mdp, demo_policy_lambda, iters=500, beta=beta, convergence_threshold=0.00001, is_deterministic_ib=is_deterministic_ib)

    # Make demonstrator agent and random agent.
    demo_agent = FixedPolicyAgent(demo_policy_lambda, name="$\\pi_d$")
    rand_agent = RandomAgent(mdp.get_actions(), name="$\\pi_u$")

    # Make abstract agent.
    lambda_abstr_policy = get_lambda_policy(abstr_policy_pmf)
    prob_s_phi = ProbStateAbstraction(phi_pmf)
    crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)
    abstr_agent = AbstractionWrapper(FixedPolicyAgent, state_abstr=crisp_s_phi, agent_params={"policy":lambda_abstr_policy, "name":"$\\pi_\\phi$"}, name_ext="")
    
    # Run.
    run_agents_on_mdp([demo_agent, abstr_agent, rand_agent], mdp, episodes=1, steps=1000)


    # Print state space sizes.
    non_zero_abstr_states = [x for x in pmf_s_phi.values() if x > 0]
    demo_vi = ValueIteration(mdp)
    print("\nState Space Sizes:")
    print("\t|S| =", demo_vi.get_num_states())
    print("\tH(S_\\phi) =", entropy(pmf_s_phi))
    print("\t|S_\\phi|_crisp =", crisp_s_phi.get_num_abstr_states())
    print("\tdelta_min =", min(non_zero_abstr_states))
    print("\tnum non zero states =", len(non_zero_abstr_states))
    print()
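The H(S_\phi) printed above is the Shannon entropy of the abstract-state marginal pmf_s_phi. A one-function sketch of that quantity (the entropy helper used in these examples presumably computes the same thing):

import math

def entropy_bits(pmf):
    # Shannon entropy, in bits, of a {state: probability} dict.
    return -sum(p * math.log(p, 2) for p in pmf.values() if p > 0)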
def diff_sampling_distr_experiment():
    '''
    Summary:
        Compares performance of different sample styles to compute phi.
    '''
    # Make MDP and Demo Policy.
    params = get_params()
    mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(multitask=False)
    expert_puddle_policy = ppd.get_demo_policy_given_goal(
        test_mdp.get_goal_locs()[0])
    demo_agent = FixedPolicyAgent(expert_puddle_policy)

    # Make a NN for each sampling param.
    agents = {}
    sess = tf.Session()
    sampling_params = [0.0, 0.5, 1.0]

    for epsilon in sampling_params:
        with tf.variable_scope('nn_sa' + str(epsilon), reuse=False) as scope:
            # tf.reset_default_graph()
            params["epsilon"] = epsilon
            abstraction_net = make_nn_sa(mdp_demo_policy_dict,
                                         sess,
                                         params,
                                         verbose=False,
                                         sample_type="demo")
            nn_sa = NNStateAbstr(abstraction_net)
            sa_agent = AbstractionWrapper(
                QLearningAgent,
                agent_params={"actions": test_mdp.get_actions(),
                              "name": "$D \\sim \\rho_E^\\epsilon, \\epsilon=" + str(epsilon) + "$"},
                state_abstr=nn_sa,
                name_ext="")
            agents[epsilon] = sa_agent

    with tf.variable_scope('demo') as scope:
        abstraction_net_rand = make_nn_sa(mdp_demo_policy_dict,
                                          sess,
                                          params,
                                          verbose=False,
                                          sample_type="rand")
        nn_sa_rand = NNStateAbstr(abstraction_net_rand)
        sa_agent_rand = AbstractionWrapper(QLearningAgent,
                                           agent_params={"actions": test_mdp.get_actions(),
                                                         "name": "$D \\sim U(S)$"},
                                           state_abstr=nn_sa_rand,
                                           name_ext="")
        agents["rand"] = sa_agent_rand

    run_agents_on_mdp(agents.values(),
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=False)

    sess.close()
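The epsilon swept above controls how the training data for phi is sampled. One plausible reading (an assumption, since make_nn_sa's internals are not shown here) is an epsilon-mixture of the demonstrator with a uniform-random policy, along these lines:

import random

def eps_mixed_action(state, demo_policy, actions, epsilon):
    # With probability epsilon act uniformly at random; otherwise
    # follow the demonstrator.
    if random.random() < epsilon:
        return random.choice(actions)
    return demo_policy(state)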
def num_training_data_experiment():
    '''
    Summary:
        Runs an experiment that compares the performance of different
        Agent-SA combinations, where each SA is trained with a different
        number of training samples.
    '''
    # Params.
    instances = 10
    init, increment, maximum = 1, 500, 5001
    training_samples = range(init, maximum, increment)

    # Run experiments.
    data_dir = os.path.join("results", "puddle_per_sample")
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    with open(os.path.join(data_dir, "results.csv"), "w+") as results_file:

        # Repeat the experiment @instances times.
        for i in range(instances):
            print("\nInstance", i + 1, "of", instances)
            for sample_num in training_samples:
                print("\tSamples:", sample_num)

                # Make State Abstraction.
                params = get_params(default_params={
                    "num_samples_from_demonstrator": sample_num
                })
                mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(
                    multitask=params['multitask'])
                expert_puddle_policy = ppd.get_demo_policy_given_goal(
                    test_mdp.get_goal_locs()[0])
                demo_agent = FixedPolicyAgent(expert_puddle_policy)
                tf.reset_default_graph()
                sess = tf.Session()
                abstraction_net = make_nn_sa(mdp_demo_policy_dict,
                                             sess,
                                             params,
                                             verbose=False)
                nn_sa = NNStateAbstr(abstraction_net)

                # Test Performance with given param.
                sa_agent = AbstractionWrapper(
                    QLearningAgent,
                    agent_params={"actions": test_mdp.get_actions()},
                    state_abstr=nn_sa,
                    name_ext="$-\\phi$")
                val = evaluate_agent(sa_agent,
                                     test_mdp,
                                     steps=params['steps'],
                                     episodes=params['episodes'])
                results_file.write(str(val) + ",")
                results_file.flush()
                sess.close()

            results_file.write("\n")

    cu.EVERY_OTHER_X = True
    cu.CUSTOM_TITLE = "Effect of $|D_{train, \\phi}|$ on RL Performance"
    cu.X_AXIS_LABEL = "$|D_{train, \\phi}|$"
    cu.Y_AXIS_LABEL = "Avg. Reward in Last Episode"
    cu.X_AXIS_START_VAL = init
    cu.X_AXIS_INCREMENT = increment
    cu.COLOR_SHIFT = 3
    cu.format_and_make_plot(data_dir=data_dir, avg_plot=True, add_legend=False)
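Each row of results.csv written above holds one instance's evaluations, one comma-separated value per training-set size. A small sketch of averaging the columns by hand (chart_utils handles the actual plotting; the file layout is the only assumption here):

import csv

def average_columns(path):
    # Column-wise mean over rows of comma-separated floats (trailing
    # commas produce empty fields, which are skipped).
    with open(path) as f:
        rows = [[float(x) for x in row if x.strip()] for row in csv.reader(f) if row]
    return [sum(col) / len(col) for col in zip(*rows)]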
def main():

    # Grab experiment params.
    mdp_class, task_samples, episodes, steps, grid_dim, x_axis_num_options, agent_class_str, max_options, exp_type = parse_args()

    gamma = 0.9

    # ========================
    # === Make Environment ===
    # ========================
    multi_task = True
    max_option_steps = 50 if x_axis_num_options else 0
    if multi_task:
        environment = make_mdp.make_mdp_distr(mdp_class=mdp_class, grid_dim=grid_dim)
    else:
        environment = make_mdp.make_mdp(mdp_class=mdp_class)
    actions = environment.get_actions()
    environment.set_gamma(gamma)

    # Indicator functions.
    v_indic = ind_funcs._v_approx_indicator
    q_indic = ind_funcs._q_eps_approx_indicator
    v_disc_indic = ind_funcs._v_disc_approx_indicator
    rand_indic = ind_funcs._random

    # =========================
    # === Make Abstractions ===
    # =========================

    # Directed Variants.
    v_directed_sa, v_directed_aa = get_abstractions(environment,
                                                    v_disc_indic,
                                                    directed=True,
                                                    max_options=max_options)
    # v_directed_sa, v_directed_aa = get_abstractions(environment, v_indic, directed=True, max_options=max_options)

    # Identity action abstraction.
    identity_sa = get_sa(environment, default=True)
    identity_aa = get_aa(environment, default=True)

    if exp_type == "core":
        # Core only abstraction types.
        q_directed_sa, q_directed_aa = get_abstractions(
            environment, q_indic, directed=True, max_options=max_options)
        rand_directed_sa, rand_directed_aa = get_abstractions(
            environment, rand_indic, directed=True, max_options=max_options)
        pblocks_sa = get_sa(environment, default=True)
        pblocks_aa = action_abs.aa_baselines.get_policy_blocks_aa(
            environment, incl_prim_actions=True, num_options=max_options)

    # ===================
    # === Make Agents ===
    # ===================

    # Base Agents.
    agent_class = QLearningAgent if agent_class_str == "ql" else RMaxAgent
    rand_agent = RandomAgent(actions)
    baseline_agent = agent_class(actions, gamma=gamma)

    if mdp_class == "pblocks":
        baseline_agent.epsilon = 0.01

    # Abstraction Extensions.
    agents = []
    vabs_agent_directed = AbstractionWrapper(agent_class,
                                             actions,
                                             str(environment),
                                             max_option_steps=max_option_steps,
                                             state_abstr=v_directed_sa,
                                             action_abstr=v_directed_aa,
                                             name_ext="v-sa+aa")

    if exp_type == "core":
        # Core only agents.
        qabs_agent_directed = AbstractionWrapper(
            agent_class,
            actions,
            str(environment),
            max_option_steps=max_option_steps,
            state_abstr=q_directed_sa,
            action_abstr=q_directed_aa,
            name_ext="q-sa+aa")
        rabs_agent_directed = AbstractionWrapper(
            agent_class,
            actions,
            str(environment),
            max_option_steps=max_option_steps,
            state_abstr=rand_directed_sa,
            action_abstr=rand_directed_aa,
            name_ext="rand-sa+aa")
        pblocks_agent = AbstractionWrapper(agent_class,
                                           actions,
                                           str(environment),
                                           max_option_steps=max_option_steps,
                                           state_abstr=pblocks_sa,
                                           action_abstr=pblocks_aa,
                                           name_ext="pblocks")
        agents = [
            vabs_agent_directed, qabs_agent_directed, rabs_agent_directed,
            pblocks_agent, baseline_agent
        ]
    elif exp_type == "combo":
        # Combo only agents.
        aa_agent = AbstractionWrapper(agent_class,
                                      actions,
                                      str(environment),
                                      max_option_steps=max_option_steps,
                                      state_abstr=identity_sa,
                                      action_abstr=v_directed_aa,
                                      name_ext="aa")
        sa_agent = AbstractionWrapper(agent_class,
                                      actions,
                                      str(environment),
                                      max_option_steps=max_option_steps,
                                      state_abstr=v_directed_sa,
                                      action_abstr=identity_aa,
                                      name_ext="sa")
        agents = [vabs_agent_directed, sa_agent, aa_agent, baseline_agent]

    # Run experiments.
    if multi_task:
        steps = 999999 if x_axis_num_options else steps
        run_agents_multi_task(agents,
                              environment,
                              task_samples=task_samples,
                              steps=steps,
                              episodes=episodes,
                              reset_at_terminal=True)
    else:
        run_agents_on_mdp(agents,
                          environment,
                          instances=20,
                          episodes=30,
                          reset_at_terminal=True)
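The identity action abstraction used in the combo agents above corresponds to treating each primitive action as a trivial one-step option. A conceptual sketch of that idea (not simple_rl's Option class):

class OneStepOption(object):
    # A primitive action dressed up as an option: it can start anywhere,
    # executes one action, then terminates immediately.
    def __init__(self, action):
        self.action = action

    def is_init_true(self, state):
        return True

    def is_term_true(self, state):
        return True

    def act(self, state):
        return self.action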
Example #11
def get_sa_experiment_agents(environment, AgentClass, pac=False):
    '''
    Args:
        environment (simple_rl.MDPDistribution)
        AgentClass (Class)

    Returns:
        (list)
    '''
    actions = environment.get_actions()
    gamma = environment.get_gamma()

    if pac:
        # PAC State Abstractions.
        sa_qds_test = compute_pac_sa(
            environment,
            indic_func=ind_funcs._q_disc_approx_indicator,
            epsilon=0.2)
        sa_qs_test = compute_pac_sa(
            environment,
            indic_func=ind_funcs._q_eps_approx_indicator,
            epsilon=0.2)
        sa_qs_exact_test = compute_pac_sa(
            environment,
            indic_func=ind_funcs._q_eps_approx_indicator,
            epsilon=0.0)

    else:
        # Compute state abstractions.
        sa_qds_test = get_sa(environment,
                             indic_func=ind_funcs._q_disc_approx_indicator,
                             epsilon=0.1)
        sa_qs_test = get_sa(environment,
                            indic_func=ind_funcs._q_eps_approx_indicator,
                            epsilon=0.1)
        sa_qs_exact_test = get_sa(environment,
                                  indic_func=ind_funcs._q_eps_approx_indicator,
                                  epsilon=0.0)

    # Make Agents.
    agent = AgentClass(actions, gamma=gamma)
    params = {
        "actions": actions
    } if AgentClass is not RMaxAgent else {
        "actions": actions,
        "s_a_threshold": 2,
        "horizon": 5
    }
    sa_qds_agent = AbstractionWrapper(AgentClass,
                                      agent_params=params,
                                      state_abstr=sa_qds_test,
                                      name_ext="$-\phi_{Q_d^*}$")
    sa_qs_agent = AbstractionWrapper(AgentClass,
                                     agent_params=params,
                                     state_abstr=sa_qs_test,
                                     name_ext="$-\phi_{Q_\epsilon^*}$")
    sa_qs_exact_agent = AbstractionWrapper(AgentClass,
                                           agent_params=params,
                                           state_abstr=sa_qs_exact_test,
                                           name_ext="-$\phi_{Q^*}$")

    agents = [agent, sa_qds_agent, sa_qs_agent, sa_qs_exact_agent]

    # if isinstance(environment.sample(), FourRoomMDP) or isinstance(environment.sample(), ColorMDP):
    #     # If it's a fourroom add the handcoded one.
    #     sa_hand_test = get_sa(environment, indic_func=ind_funcs._four_rooms)
    #     sa_hand_agent = AbstractionWrapper(AgentClass, agent_params=params, state_abstr=sa_hand_test, name_ext="$-\phi_h$")
    #     agents += [sa_hand_agent]

    return agents
Example #12
def get_exact_vs_approx_agents(environment, incl_opt=True):
    '''
    Args:
        environment (simple_rl.MDPDistribution)
        incl_opt (bool)

    Returns:
        (list)
    '''

    actions = environment.get_actions()
    gamma = environment.get_gamma()

    exact_qds_test = get_sa(environment,
                            indic_func=ind_funcs._q_eps_approx_indicator,
                            epsilon=0.0)
    approx_qds_test = get_sa(environment,
                             indic_func=ind_funcs._q_eps_approx_indicator,
                             epsilon=0.05)

    ql_agent = QLearningAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    ql_exact_agent = AbstractionWrapper(QLearningAgent,
                                        agent_params={"actions": actions},
                                        state_abstr=exact_qds_test,
                                        name_ext="-exact")
    ql_approx_agent = AbstractionWrapper(QLearningAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=approx_qds_test,
                                         name_ext="-approx")
    ql_agents = [ql_agent, ql_exact_agent, ql_approx_agent]

    dql_agent = DoubleQAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    dql_exact_agent = AbstractionWrapper(DoubleQAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=exact_qds_test,
                                         name_ext="-exact")
    dql_approx_agent = AbstractionWrapper(DoubleQAgent,
                                          agent_params={"actions": actions},
                                          state_abstr=approx_qds_test,
                                          name_ext="-approx")
    dql_agents = [dql_agent, dql_exact_agent, dql_approx_agent]

    rm_agent = RMaxAgent(actions, gamma=gamma)
    rm_exact_agent = AbstractionWrapper(RMaxAgent,
                                        agent_params={"actions": actions},
                                        state_abstr=exact_qds_test,
                                        name_ext="-exact")
    rm_approx_agent = AbstractionWrapper(RMaxAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=approx_qds_test,
                                         name_ext="-approx")
    rm_agents = [rm_agent, rm_exact_agent, rm_approx_agent]

    if incl_opt:
        vi = ValueIteration(environment)
        vi.run_vi()
        opt_agent = FixedPolicyAgent(vi.policy, name="$\\pi^*$")

        sa_vi = AbstractValueIteration(
            environment,
            sample_rate=50,
            max_iterations=3000,
            delta=0.0001,
            state_abstr=approx_qds_test,
            action_abstr=ActionAbstraction(
                options=[], prim_actions=environment.get_actions()))
        sa_vi.run_vi()
        approx_opt_agent = FixedPolicyAgent(sa_vi.policy, name="$\\pi_\\phi^*$")

        # Add the optimal and abstract-optimal agents to the returned list.
        ql_agents += [opt_agent, approx_opt_agent]

    return ql_agents
def diff_sampling_distr_experiment():
    '''
    Summary:
        Compares the performance of different sampling distributions used to compute phi.
    '''
    # Make MDP and Demo Policy.
    params = get_params()
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy
    demo_agent = FixedPolicyAgent(cpd.expert_cartpole_policy)

    # Make a NN for each sampling param.
    sampling_params = [0.0, 0.5, 1.0]

    test_mdp = CartPoleMDP()
    agents = {"demo": demo_agent}
    sess = tf.Session()
    for epsilon in sampling_params:
        with tf.variable_scope('nn_sa' + str(epsilon), reuse=False) as scope:
            print "epsilon", epsilon
            # tf.reset_default_graph()
            params["epsilon"] = epsilon
            abstraction_net = make_nn_sa(mdp_demo_policy_dict,
                                         sess,
                                         params,
                                         verbose=False)
            nn_sa = NNStateAbstr(abstraction_net)
            sa_agent = AbstractionWrapper(QLearningAgent,
                                          agent_params={"actions": env.get_actions(),
                                                        "name": "$QL_\\phi-\\epsilon=" + str(epsilon) + "$"},
                                          state_abstr=nn_sa)
            agents[epsilon] = sa_agent

    with tf.variable_scope('demo') as scope:
        abstraction_net_rand = make_nn_sa(mdp_demo_policy_dict,
                                          sess,
                                          params,
                                          verbose=False,
                                          sample_type="rand")
        nn_sa_rand = NNStateAbstr(abstraction_net_rand)
        sa_agent_rand = AbstractionWrapper(QLearningAgent,
                                           agent_params={
                                               "actions": env.get_actions(),
                                               "name": "$D \\sim U(S)$"
                                           },
                                           state_abstr=nn_sa_rand,
                                           name_ext="")
        agents["rand"] = sa_agent_rand

    run_agents_on_mdp(agents.values(),
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=False)

    sess.close()
def main():

    # ======================
    # == Make Environment ==
    # ======================
    params = get_params()
    num_test_mdps = 6  # 6 is max.
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy

    if params['multitask']:
        # Make distribution.
        mdp_dist_dict = {
            CartPoleMDP(gravity=gravity): 1.0 / num_test_mdps
            for gravity in [5.0, 6.0, 8.0, 12.0][:num_test_mdps]
        }
        test_mdp = MDPDistribution(mdp_dist_dict)
    else:
        test_mdp = CartPoleMDP()

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "cartpole_nn_sa"
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions,
                                num_features=num_features,
                                alpha=params['rl_learning_rate'])
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={
                                      "alpha": params['rl_learning_rate'],
                                      "epsilon": 0.2,
                                      "actions": test_mdp.get_actions()
                                  },
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================

    if params['multitask']:
        run_agents_lifelong([sa_agent, linear_agent],
                            test_mdp,
                            samples=params['num_instances'],
                            episodes=params['episodes'],
                            steps=params['steps'],
                            verbose=False)
    else:
        # demo_agent = FixedPolicyAgent(cpd.expert_cartpole_policy)
        run_agents_on_mdp([sa_agent, linear_agent],
                          test_mdp,
                          instances=params['num_instances'],
                          episodes=params['episodes'],
                          steps=params['steps'],
                          verbose=False)
Example #15
def _info_sa_val_and_size_plot_wrapper(beta, param_dict):
    '''
    Args:
        beta (float): stands for $\\beta$ in the info_sa algorithm.
        param_dict (dict): contains relevant parameters for plotting.

    Returns:
        (tuple):
            (float) The value achieved by $\\pi_\\phi^*$ in the MDP.
            (float) The number of abstract states (or their entropy, under the non-deterministic IB).

    Notes:
        This serves as a wrapper to cooperate with PlotFunc.
    '''

    # Grab params.
    mdp = param_dict["mdp"]
    demo_policy_lambda = param_dict["demo_policy_lambda"]
    iters = param_dict["iters"]
    convergence_threshold = param_dict["convergence_threshold"]
    is_deterministic_ib = param_dict["is_deterministic_ib"]
    use_crisp_policy = param_dict["use_crisp_policy"]
    is_agent_in_control = param_dict["is_agent_in_control"]

    # --- Run DIBS to convergence ---
    if is_agent_in_control:
        # Run info_sa with the agent controlling the MDP.
        import agent_in_control
        pmf_s_phi, phi_pmf, abstr_policy_pmf = agent_in_control.run_agent_in_control_info_sa(
            mdp,
            demo_policy_lambda,
            beta=beta,
            is_deterministic_ib=is_deterministic_ib)
    else:
        # Run info_sa.
        from info_sa import run_info_sa
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(
            mdp,
            demo_policy_lambda,
            iters=iters,
            beta=beta,
            convergence_threshold=convergence_threshold,
            is_deterministic_ib=is_deterministic_ib)

    print "\tEvaluating..."
    # Make abstract agent.
    from info_sa import get_lambda_policy

    # Make the policy deterministic if needed.
    if use_crisp_policy:
        from info_sa import make_policy_det_max_policy
        policy = get_lambda_policy(
            make_policy_det_max_policy(abstr_policy_pmf))
    else:
        policy = get_lambda_policy(abstr_policy_pmf)

    prob_s_phi = ProbStateAbstraction(phi_pmf)

    # -- Compute Values --

    phi = convert_prob_sa_to_sa(
        prob_s_phi) if is_deterministic_ib else prob_s_phi
    abstr_agent = AbstractionWrapper(FixedPolicyAgent,
                                     state_abstr=phi,
                                     agent_params={
                                         "policy": policy,
                                         "name": "$\\pi_\\phi$"
                                     },
                                     name_ext="")

    # Compute value of abstract policy w/ coding distribution.
    value = evaluate_agent(agent=abstr_agent, mdp=mdp, instances=100)

    # -- Compute size of S_\phi --
    if is_deterministic_ib:
        s_phi_size = phi.get_num_abstr_states()
    else:
        # TODO: could change this to {s in S : Pr(s) > 0}.
        from rlit_utils import entropy
        s_phi_size = entropy(pmf_s_phi)

    return value, s_phi_size
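The wrapper above returns one (value, |S_phi|) point per beta; the plotting utilities presumably sweep it over a range of betas. A by-hand sketch of such a sweep:

def sweep_beta(betas, param_dict):
    # Collect (beta, value, abstract-state-size) triples for plotting.
    curve = []
    for beta in betas:
        value, size = _info_sa_val_and_size_plot_wrapper(beta, param_dict)
        curve.append((beta, value, size))
    return curve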