def main():
    # Grab experiment params.
    mdp = BadChainMDP(gamma=0.95, kappa=0.001)
    actions = mdp.get_actions()

    # =======================
    # == Make Abstractions ==
    # =======================
    sa_q_eps = get_sa(mdp, indic_func=indicator_funcs._q_eps_approx_indicator, epsilon=0.1)

    # RMax Agents.
    rmax_agent = RMaxAgent(actions)
    abstr_rmax_agent = AbstractionWrapper(RMaxAgent,
                                          state_abstr=sa_q_eps,
                                          agent_params={"actions": actions},
                                          name_ext="-$\\phi_{Q_\\epsilon^*}$")

    # Delayed Q Agents.
    del_q_agent = DelayedQAgent(actions)
    abstr_del_q_agent = AbstractionWrapper(DelayedQAgent,
                                           state_abstr=sa_q_eps,
                                           agent_params={"actions": actions},
                                           name_ext="-$\\phi_{Q_\\epsilon^*}$")

    run_agents_on_mdp([rmax_agent, abstr_rmax_agent, del_q_agent, abstr_del_q_agent],
                      mdp,
                      instances=50,
                      steps=250,
                      episodes=1)
def get_combo_experiment_agents(environment):
    '''
    Args:
        environment (simple_rl.MDPDistribution)

    Returns:
        (list)
    '''
    actions = environment.get_actions()
    gamma = environment.get_gamma()

    sa, aa = get_directed_option_sa_pair(environment,
                                         indic_func=ind_funcs._q_disc_approx_indicator,
                                         max_options=100)
    sa_qds_test = get_sa(environment, indic_func=ind_funcs._q_disc_approx_indicator, epsilon=0.05)
    sa_qs_test = get_sa(environment, indic_func=ind_funcs._q_eps_approx_indicator, epsilon=0.1)

    # QLearner.
    ql_agent = QLearningAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    rmax_agent = RMaxAgent(actions, gamma=gamma)  # Unused below; RMaxAgent takes no epsilon/alpha kwargs.

    # Combos.
    ql_sa_qds_agent = AbstractionWrapper(QLearningAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=sa_qds_test,
                                         name_ext="$\\phi_{Q_d^*}$")
    ql_sa_qs_agent = AbstractionWrapper(QLearningAgent,
                                        agent_params={"actions": actions},
                                        state_abstr=sa_qs_test,
                                        name_ext="$\\phi_{Q_\\epsilon^*}$")
    # sa_agent = AbstractionWrapper(QLearningAgent, actions, str(environment), state_abstr=sa, name_ext="sa")
    aa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"actions": actions},
                                  action_abstr=aa,
                                  name_ext="aa")
    sa_aa_agent = AbstractionWrapper(QLearningAgent,
                                     agent_params={"actions": actions},
                                     state_abstr=sa,
                                     action_abstr=aa,
                                     name_ext="$\\phi_{Q_d^*}+aa$")

    agents = [ql_agent, ql_sa_qds_agent, ql_sa_qs_agent, aa_agent, sa_aa_agent]

    return agents
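# A hedged usage sketch for the helper above: it builds a one-task MDPDistribution and runs
# the returned agents lifelong. The FourRoomMDP constructor arguments and the
# run_agents_lifelong keywords mirror their use elsewhere in these experiments; the helper
# name and the specific parameter values here are placeholders, not part of the original scripts.
from simple_rl.tasks import FourRoomMDP
from simple_rl.mdp import MDPDistribution
from simple_rl.run_experiments import run_agents_lifelong

def _example_combo_experiment():
    # Single-MDP "distribution" over a 9x9 FourRoom task.
    environment = MDPDistribution({FourRoomMDP(9, 9, goal_locs=[(9, 9)]): 1.0})
    agents = get_combo_experiment_agents(environment)
    run_agents_lifelong(agents, environment, samples=10, episodes=100, steps=250)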
def learn_w_abstr(mdp, demo_policy, beta_list=[20], is_deterministic_ib=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy (lambda : simple_rl.State --> str)
        beta_list (list)
        is_deterministic_ib (bool)

    Summary:
        Computes a state abstraction for each beta in beta_list and compares
        Q-Learning with and without the abstraction.
    '''
    # Run info_sa for each beta.
    dict_of_phi_pmfs = {}
    for beta in beta_list:
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(mdp,
                                                           demo_policy,
                                                           iters=300,
                                                           beta=beta,
                                                           convergence_threshold=0.0001,
                                                           is_deterministic_ib=is_deterministic_ib)

        # Translate the probabilistic abstraction into a crisp one (ground state --> abstract state).
        prob_s_phi = ProbStateAbstraction(phi_pmf)
        crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)
        dict_of_phi_pmfs[beta] = crisp_s_phi

        # Print diagnostics on the resulting abstraction.
        print("crisp_s_phi:")
        for single_state in crisp_s_phi.get_abs_states():
            print(str(type(single_state)))
            print("ground_for_above:" + str(crisp_s_phi.get_ground_states_in_abs_state(single_state)))

        print("ground states:")
        for ground_state in crisp_s_phi.get_ground_states():
            print(str(type(ground_state)))

        print(len(crisp_s_phi.get_ground_states()))
        print(len(crisp_s_phi.get_abs_states()))

    # Make agents.
    demo_agent = FixedPolicyAgent(demo_policy, name="$\\pi_d$")
    ql_agent = QLearningAgent(mdp.get_actions())
    agent_dict = {}
    for beta in beta_list:
        ql_abstr_agent = AbstractionWrapper(QLearningAgent,
                                            state_abstr=dict_of_phi_pmfs[beta],
                                            agent_params={"actions": mdp.get_actions(), "anneal": True})
        agent_dict[beta] = ql_abstr_agent

    # Learn.
    run_agents_on_mdp(agent_dict.values(), mdp, episodes=100, steps=10, instances=5)

    # Print the size of each abstraction.
    for beta in dict_of_phi_pmfs.keys():
        # NOTE: the label reads |S_phi| but this reports the ground-state count;
        # get_num_abstr_states() may be the intended call.
        print("beta |S_phi|:", beta, dict_of_phi_pmfs[beta].get_num_ground_states())
    print()
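# A hedged example of invoking learn_w_abstr. The demo policy follows the docstring's
# signature (state -> action string). The GridWorldMDP task, its "up"/"right" action names,
# and the state's .x/.y attributes are assumptions about simple_rl's grid-world states, and
# the helper name is hypothetical.
from simple_rl.tasks import GridWorldMDP

def _example_learn_w_abstr():
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)])
    # Head right until reaching the goal column, then head up.
    demo_policy = lambda state: "right" if state.x < 4 else "up"
    learn_w_abstr(mdp, demo_policy, beta_list=[5, 20], is_deterministic_ib=True)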
def main():
    # ======================
    # == Make Environment ==
    # ======================
    params = {}
    params['multitask'] = False
    params['env_name'] = "LunarLander-v2"
    params['obs_size'] = 8
    params['num_iterations_for_abstraction_learning'] = 500
    params['learning_rate_for_abstraction_learning'] = 0.005
    params['abstraction_network_hidden_layers'] = 2
    params['abstraction_network_hidden_nodes'] = 200
    params['num_samples_from_demonstrator'] = 10000
    params['episodes'] = 200
    params['steps'] = 1000
    params['num_instances'] = 5
    params['rl_learning_rate'] = 0.005

    mdp_demo_policy_dict = {}
    env_name = "LunarLander-v2"
    env_gym = gym.make(env_name)
    obs_size = len(env_gym.observation_space.high)
    env = GymMDP(env_name='LunarLander-v2', render=True, render_every_n_episodes=20)
    test_mdp = env  # The test MDP is the same environment.
    mdp_demo_policy_dict[env] = lpd.expert_lunar_policy

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "lunar_nn_sa"
    num_iterations = 300  # Not used below.
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions,
                                num_features=num_features,
                                alpha=params['rl_learning_rate'])
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"alpha": params['rl_learning_rate'],
                                                "actions": test_mdp.get_actions(),
                                                "anneal": True},
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    run_agents_on_mdp([sa_agent, linear_agent],
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=True,
                      track_success=True,
                      success_reward=100)
def main():
    # ======================
    # == Make Environment ==
    # ======================
    params = get_params()

    # ====================================
    # == Make test and train environments
    # == along with demonstrator(s)
    # ====================================
    mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(multitask=params['multitask'])
    expert_puddle_policy = ppd.get_demo_policy_given_goal(test_mdp.get_goal_locs()[0])
    demo_agent = FixedPolicyAgent(expert_puddle_policy)

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions, num_features=num_features)
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"actions": test_mdp.get_actions()},
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    run_agents_on_mdp([sa_agent, linear_agent],
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=False)
def main():
    # ======================
    # == Make Environment ==
    # ======================
    params = rlec.get_cartpole_params()
    num_test_mdps = 6  # 6 is max.
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy
    test_mdp = CartPoleMDP()

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "cartpole_nn_sa"
    params['num_iterations_for_abstraction_learning'] = 500
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # ====================================
    # == Visualize Abstract State Space ==
    # ====================================

    # Collect dataset based on learner.
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"alpha": params['rl_learning_rate'],
                                                "epsilon": 0.2,
                                                "actions": test_mdp.get_actions()},
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")
    # visited_states = vu.collect_dataset(test_mdp, samples=2000)  # , learning_agent=sa_agent)
    visited_states = collect_samples_from_demo_policy_random_s0_cartpole(mdp_demo_policy_dict,
                                                                         num_samples=2000)

    # Get feature indices.
    features = get_feature_dicts()

    # Visualize.
    vu.visualize_state_abstrs3D(visited_states, features, nn_sa)
def info_sa_compare_policies(mdp, demo_policy_lambda, beta=3.0, is_deterministic_ib=False, is_agent_in_control=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy_lambda (lambda : simple_rl.State --> str)
        beta (float)
        is_deterministic_ib (bool): If True, run DIB, else IB.
        is_agent_in_control (bool): If True, runs the DIB in agent_in_control.py instead.

    Summary:
        Runs info_sa and compares the value of the found policy with the demonstrator policy.
    '''
    if is_agent_in_control:
        # Run info_sa with the agent controlling the MDP.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = agent_in_control.run_agent_in_control_info_sa(mdp,
                                                                                             demo_policy_lambda,
                                                                                             rounds=100,
                                                                                             iters=500,
                                                                                             beta=beta,
                                                                                             is_deterministic_ib=is_deterministic_ib)
    else:
        # Run info_sa.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(mdp,
                                                           demo_policy_lambda,
                                                           iters=500,
                                                           beta=beta,
                                                           convergence_threshold=0.00001,
                                                           is_deterministic_ib=is_deterministic_ib)

    # Make demonstrator agent and random agent.
    demo_agent = FixedPolicyAgent(demo_policy_lambda, name="$\\pi_d$")
    rand_agent = RandomAgent(mdp.get_actions(), name="$\\pi_u$")

    # Make abstract agent.
    lambda_abstr_policy = get_lambda_policy(abstr_policy_pmf)
    prob_s_phi = ProbStateAbstraction(phi_pmf)
    crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)
    abstr_agent = AbstractionWrapper(FixedPolicyAgent,
                                     state_abstr=crisp_s_phi,
                                     agent_params={"policy": lambda_abstr_policy, "name": "$\\pi_\\phi$"},
                                     name_ext="")

    # Run.
    run_agents_on_mdp([demo_agent, abstr_agent, rand_agent], mdp, episodes=1, steps=1000)

    non_zero_abstr_states = [x for x in pmf_s_phi.values() if x > 0]

    # Print state space sizes.
    demo_vi = ValueIteration(mdp)
    print("\nState Space Sizes:")
    print("\t|S| =", demo_vi.get_num_states())
    print("\tH(S_\\phi) =", entropy(pmf_s_phi))
    print("\t|S_\\phi|_crisp =", crisp_s_phi.get_num_abstr_states())
    print("\tdelta_min =", min(non_zero_abstr_states))
    print("\tnum non zero states =", len(non_zero_abstr_states))
    print()
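# For reference, a minimal entropy helper consistent with how entropy(pmf_s_phi) is used
# above: H(S_phi) = -sum_s p(s) log2 p(s) over the abstract-state pmf. This is only a
# sketch with a hypothetical name; the project's own entropy implementation (e.g., in
# rlit_utils) may differ in log base or in how it handles zero-mass states.
import math

def _entropy_of_pmf(pmf):
    # pmf: dict mapping abstract states to probabilities.
    return -sum(p * math.log(p, 2) for p in pmf.values() if p > 0)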
def diff_sampling_distr_experiment():
    '''
    Summary:
        Compares performance of different sample styles to compute phi.
    '''
    # Make MDP and Demo Policy.
    params = get_params()
    mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(multitask=False)
    expert_puddle_policy = ppd.get_demo_policy_given_goal(test_mdp.get_goal_locs()[0])
    demo_agent = FixedPolicyAgent(expert_puddle_policy)

    # Make a NN for each sampling param.
    agents = {}
    sess = tf.Session()
    sampling_params = [0.0, 0.5, 1.0]
    for epsilon in sampling_params:
        with tf.variable_scope('nn_sa' + str(epsilon), reuse=False) as scope:
            # tf.reset_default_graph()
            params["epsilon"] = epsilon
            abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params, verbose=False, sample_type="demo")
            nn_sa = NNStateAbstr(abstraction_net)
            sa_agent = AbstractionWrapper(QLearningAgent,
                                          agent_params={"actions": test_mdp.get_actions(),
                                                        "name": "$D \\sim \\rho_E^\\epsilon, \\epsilon=" + str(epsilon) + "$"},
                                          state_abstr=nn_sa,
                                          name_ext="")
            agents[epsilon] = sa_agent

    with tf.variable_scope('demo') as scope:
        abstraction_net_rand = make_nn_sa(mdp_demo_policy_dict, sess, params, verbose=False, sample_type="rand")
        nn_sa_rand = NNStateAbstr(abstraction_net_rand)
        sa_agent_rand = AbstractionWrapper(QLearningAgent,
                                           agent_params={"actions": test_mdp.get_actions(),
                                                         "name": "$D \\sim U(S)$"},
                                           state_abstr=nn_sa_rand,
                                           name_ext="")
        agents["rand"] = sa_agent_rand

    run_agents_on_mdp(agents.values(),
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=False)

    sess.close()
def num_training_data_experiment():
    '''
    Summary:
        Runs an experiment that compares the performance of different Agent-SA
        combinations, where each SA is trained with a different number of
        training samples.
    '''
    # Params.
    instances = 10
    init, increment, maximum = 1, 500, 5001
    training_samples = range(init, maximum, increment)

    # Run experiment.
    if not os.path.exists(os.path.join("results", "puddle_per_sample")):
        os.makedirs(os.path.join("results", "puddle_per_sample"))
    data_dir = os.path.join("results", "puddle_per_sample")

    with open(os.path.join(data_dir, "results.csv"), "w+") as results_file:
        # Repeat the experiment @instances times.
        for i in range(instances):
            print("\nInstance", i + 1, "of", str(instances))

            for sample_num in training_samples:
                print("\tSamples:", sample_num)

                # Make State Abstraction.
                params = get_params(default_params={"num_samples_from_demonstrator": sample_num})
                mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(multitask=params['multitask'])
                expert_puddle_policy = ppd.get_demo_policy_given_goal(test_mdp.get_goal_locs()[0])
                demo_agent = FixedPolicyAgent(expert_puddle_policy)
                tf.reset_default_graph()
                sess = tf.Session()
                abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params, verbose=False)
                nn_sa = NNStateAbstr(abstraction_net)

                # Test Performance with given param.
                sa_agent = AbstractionWrapper(QLearningAgent,
                                              agent_params={"actions": test_mdp.get_actions()},
                                              state_abstr=nn_sa,
                                              name_ext="$-\\phi$")
                val = evaluate_agent(sa_agent, test_mdp, steps=params['steps'], episodes=params['episodes'])
                results_file.write(str(val) + ",")
                results_file.flush()
                sess.close()

            results_file.write("\n")

    # Plot.
    cu.EVERY_OTHER_X = True
    cu.CUSTOM_TITLE = "Effect of $|D_{train, \\phi}|$ on RL Performance"
    cu.X_AXIS_LABEL = "$|D_{train, \\phi}|$"
    cu.Y_AXIS_LABEL = "Avg. Reward in Last Episode"
    cu.X_AXIS_START_VAL = init
    cu.X_AXIS_INCREMENT = increment
    cu.COLOR_SHIFT = 3
    cu.format_and_make_plot(data_dir=data_dir, avg_plot=True, add_legend=False)
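# A hedged helper for inspecting the results.csv written above: each row holds one
# instance, and each comma-separated column is the evaluation for one training-set size.
# The numpy dependency and the helper name are assumptions; the plotting itself is handled
# by chart_utils (cu) in the experiment.
import os
import numpy as np

def _load_per_sample_results(path=os.path.join("results", "puddle_per_sample", "results.csv")):
    rows = []
    with open(path) as f:
        for line in f:
            vals = [float(v) for v in line.strip().strip(",").split(",") if v]
            if vals:
                rows.append(vals)
    # Average over instances; one entry per training-set size.
    return np.mean(np.array(rows), axis=0)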
def main():
    # Grab experiment params. (A hedged argparse sketch matching this tuple follows this function.)
    mdp_class, task_samples, episodes, steps, grid_dim, x_axis_num_options, agent_class_str, max_options, exp_type = parse_args()
    gamma = 0.9

    # ========================
    # === Make Environment ===
    # ========================
    multi_task = True
    max_option_steps = 50 if x_axis_num_options else 0
    environment = make_mdp.make_mdp_distr(mdp_class=mdp_class, grid_dim=grid_dim) if multi_task else make_mdp.make_mdp(mdp_class=mdp_class)
    actions = environment.get_actions()
    environment.set_gamma(gamma)

    # Indicator functions.
    v_indic = ind_funcs._v_approx_indicator
    q_indic = ind_funcs._q_eps_approx_indicator
    v_disc_indic = ind_funcs._v_disc_approx_indicator
    rand_indic = ind_funcs._random

    # =========================
    # === Make Abstractions ===
    # =========================

    # Directed Variants.
    v_directed_sa, v_directed_aa = get_abstractions(environment, v_disc_indic, directed=True, max_options=max_options)
    # v_directed_sa, v_directed_aa = get_abstractions(environment, v_indic, directed=True, max_options=max_options)

    # Identity action abstraction.
    identity_sa, identity_aa = get_sa(environment, default=True), get_aa(environment, default=True)

    if exp_type == "core":
        # Core only abstraction types.
        q_directed_sa, q_directed_aa = get_abstractions(environment, q_indic, directed=True, max_options=max_options)
        rand_directed_sa, rand_directed_aa = get_abstractions(environment, rand_indic, directed=True, max_options=max_options)
        pblocks_sa, pblocks_aa = get_sa(environment, default=True), \
            action_abs.aa_baselines.get_policy_blocks_aa(environment, incl_prim_actions=True, num_options=max_options)

    # ===================
    # === Make Agents ===
    # ===================

    # Base Agents.
    agent_class = QLearningAgent if agent_class_str == "ql" else RMaxAgent
    rand_agent = RandomAgent(actions)
    baseline_agent = agent_class(actions, gamma=gamma)
    if mdp_class == "pblocks":
        baseline_agent.epsilon = 0.01

    # Abstraction Extensions.
    agents = []
    vabs_agent_directed = AbstractionWrapper(agent_class,
                                             actions,
                                             str(environment),
                                             max_option_steps=max_option_steps,
                                             state_abstr=v_directed_sa,
                                             action_abstr=v_directed_aa,
                                             name_ext="v-sa+aa")

    if exp_type == "core":
        # Core only agents.
        qabs_agent_directed = AbstractionWrapper(agent_class,
                                                 actions,
                                                 str(environment),
                                                 max_option_steps=max_option_steps,
                                                 state_abstr=q_directed_sa,
                                                 action_abstr=q_directed_aa,
                                                 name_ext="q-sa+aa")
        rabs_agent_directed = AbstractionWrapper(agent_class,
                                                 actions,
                                                 str(environment),
                                                 max_option_steps=max_option_steps,
                                                 state_abstr=rand_directed_sa,
                                                 action_abstr=rand_directed_aa,
                                                 name_ext="rand-sa+aa")
        pblocks_agent = AbstractionWrapper(agent_class,
                                           actions,
                                           str(environment),
                                           max_option_steps=max_option_steps,
                                           state_abstr=pblocks_sa,
                                           action_abstr=pblocks_aa,
                                           name_ext="pblocks")
        agents = [vabs_agent_directed, qabs_agent_directed, rabs_agent_directed, pblocks_agent, baseline_agent]
    elif exp_type == "combo":
        # Combo only agents.
        aa_agent = AbstractionWrapper(agent_class,
                                      actions,
                                      str(environment),
                                      max_option_steps=max_option_steps,
                                      state_abstr=identity_sa,
                                      action_abstr=v_directed_aa,
                                      name_ext="aa")
        sa_agent = AbstractionWrapper(agent_class,
                                      actions,
                                      str(environment),
                                      max_option_steps=max_option_steps,
                                      state_abstr=v_directed_sa,
                                      action_abstr=identity_aa,
                                      name_ext="sa")
        agents = [vabs_agent_directed, sa_agent, aa_agent, baseline_agent]

    # Run experiments.
    if multi_task:
        steps = 999999 if x_axis_num_options else steps
        run_agents_multi_task(agents,
                              environment,
                              task_samples=task_samples,
                              steps=steps,
                              episodes=episodes,
                              reset_at_terminal=True)
    else:
        run_agents_on_mdp(agents, environment, instances=20, episodes=30, reset_at_terminal=True)
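# A hedged sketch of a parse_args consistent with the tuple unpacked at the top of main()
# above. The actual flag names and defaults live in the repository's own parse_args;
# everything here (flag spellings, defaults, the helper name) is illustrative only.
import argparse

def _example_parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-mdp_class", type=str, default="four_room")
    parser.add_argument("-task_samples", type=int, default=10)
    parser.add_argument("-episodes", type=int, default=100)
    parser.add_argument("-steps", type=int, default=250)
    parser.add_argument("-grid_dim", type=int, default=11)
    parser.add_argument("-x_axis_num_options", action="store_true", default=False)
    parser.add_argument("-agent_class_str", type=str, default="ql", choices=["ql", "rmax"])
    parser.add_argument("-max_options", type=int, default=50)
    parser.add_argument("-exp_type", type=str, default="core", choices=["core", "combo"])
    args = parser.parse_args()
    return (args.mdp_class, args.task_samples, args.episodes, args.steps, args.grid_dim,
            args.x_axis_num_options, args.agent_class_str, args.max_options, args.exp_type)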
def get_sa_experiment_agents(environment, AgentClass, pac=False):
    '''
    Args:
        environment (simple_rl.MDPDistribution)
        AgentClass (Class)

    Returns:
        (list)
    '''
    actions = environment.get_actions()
    gamma = environment.get_gamma()

    if pac:
        # PAC State Abstractions.
        sa_qds_test = compute_pac_sa(environment, indic_func=ind_funcs._q_disc_approx_indicator, epsilon=0.2)
        sa_qs_test = compute_pac_sa(environment, indic_func=ind_funcs._q_eps_approx_indicator, epsilon=0.2)
        sa_qs_exact_test = compute_pac_sa(environment, indic_func=ind_funcs._q_eps_approx_indicator, epsilon=0.0)
    else:
        # Compute state abstractions.
        sa_qds_test = get_sa(environment, indic_func=ind_funcs._q_disc_approx_indicator, epsilon=0.1)
        sa_qs_test = get_sa(environment, indic_func=ind_funcs._q_eps_approx_indicator, epsilon=0.1)
        sa_qs_exact_test = get_sa(environment, indic_func=ind_funcs._q_eps_approx_indicator, epsilon=0.0)

    # Make Agents.
    agent = AgentClass(actions, gamma=gamma)
    params = {"actions": actions} if AgentClass is not RMaxAgent else {"actions": actions, "s_a_threshold": 2, "horizon": 5}
    sa_qds_agent = AbstractionWrapper(AgentClass,
                                      agent_params=params,
                                      state_abstr=sa_qds_test,
                                      name_ext="$-\\phi_{Q_d^*}$")
    sa_qs_agent = AbstractionWrapper(AgentClass,
                                     agent_params=params,
                                     state_abstr=sa_qs_test,
                                     name_ext="$-\\phi_{Q_\\epsilon^*}$")
    sa_qs_exact_agent = AbstractionWrapper(AgentClass,
                                           agent_params=params,
                                           state_abstr=sa_qs_exact_test,
                                           name_ext="-$\\phi_{Q^*}$")

    agents = [agent, sa_qds_agent, sa_qs_agent, sa_qs_exact_agent]

    # if isinstance(environment.sample(), FourRoomMDP) or isinstance(environment.sample(), ColorMDP):
    #     # If it's a fourroom, add the handcoded abstraction.
    #     sa_hand_test = get_sa(environment, indic_func=ind_funcs._four_rooms)
    #     sa_hand_agent = AbstractionWrapper(AgentClass, agent_params=params, state_abstr=sa_hand_test, name_ext="$-\\phi_h$")
    #     agents += [sa_hand_agent]

    return agents
def get_exact_vs_approx_agents(environment, incl_opt=True):
    '''
    Args:
        environment (simple_rl.MDPDistribution)
        incl_opt (bool)

    Returns:
        (list)
    '''
    actions = environment.get_actions()
    gamma = environment.get_gamma()

    exact_qds_test = get_sa(environment, indic_func=ind_funcs._q_eps_approx_indicator, epsilon=0.0)
    approx_qds_test = get_sa(environment, indic_func=ind_funcs._q_eps_approx_indicator, epsilon=0.05)

    # Q-Learning agents.
    ql_agent = QLearningAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    ql_exact_agent = AbstractionWrapper(QLearningAgent,
                                        agent_params={"actions": actions},
                                        state_abstr=exact_qds_test,
                                        name_ext="-exact")
    ql_approx_agent = AbstractionWrapper(QLearningAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=approx_qds_test,
                                         name_ext="-approx")
    ql_agents = [ql_agent, ql_exact_agent, ql_approx_agent]

    # Double-Q agents.
    dql_agent = DoubleQAgent(actions, gamma=gamma, epsilon=0.1, alpha=0.05)
    dql_exact_agent = AbstractionWrapper(DoubleQAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=exact_qds_test,
                                         name_ext="-exact")
    dql_approx_agent = AbstractionWrapper(DoubleQAgent,
                                          agent_params={"actions": actions},
                                          state_abstr=approx_qds_test,
                                          name_ext="-approx")
    dql_agents = [dql_agent, dql_exact_agent, dql_approx_agent]

    # R-Max agents.
    rm_agent = RMaxAgent(actions, gamma=gamma)
    rm_exact_agent = AbstractionWrapper(RMaxAgent,
                                        agent_params={"actions": actions},
                                        state_abstr=exact_qds_test,
                                        name_ext="-exact")
    rm_approx_agent = AbstractionWrapper(RMaxAgent,
                                         agent_params={"actions": actions},
                                         state_abstr=approx_qds_test,
                                         name_ext="-approx")
    rm_agents = [rm_agent, rm_exact_agent, rm_approx_agent]

    if incl_opt:
        vi = ValueIteration(environment)
        vi.run_vi()
        opt_agent = FixedPolicyAgent(vi.policy, name="$\\pi^*$")

        sa_vi = AbstractValueIteration(environment,
                                       sample_rate=50,
                                       max_iterations=3000,
                                       delta=0.0001,
                                       state_abstr=approx_qds_test,
                                       action_abstr=ActionAbstraction(options=[], prim_actions=environment.get_actions()))
        sa_vi.run_vi()
        approx_opt_agent = FixedPolicyAgent(sa_vi.policy, name="$\\pi_\\phi^*$")

        dql_agents += [opt_agent, approx_opt_agent]

    # NOTE: only the Q-Learning agents are returned; the Double-Q and R-Max lists
    # (and the optimal agents appended to dql_agents above) are built but unused here.
    return ql_agents
def diff_sampling_distr_experiment():
    '''
    Summary:
        Compares the performance of abstractions trained from different sampling
        distributions on CartPole.
    '''
    # Make MDP and Demo Policy.
    params = get_params()
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy
    demo_agent = FixedPolicyAgent(cpd.expert_cartpole_policy)

    # Make a NN for each sampling param.
    sampling_params = [0.0, 0.5, 1.0]
    test_mdp = CartPoleMDP()
    # agents = {"demo": demo_agent}
    agents = {}
    sess = tf.Session()
    for epsilon in sampling_params:
        with tf.variable_scope('nn_sa' + str(epsilon), reuse=False) as scope:
            print("epsilon", epsilon)
            # tf.reset_default_graph()
            params["epsilon"] = epsilon
            abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params, verbose=False)
            nn_sa = NNStateAbstr(abstraction_net)
            sa_agent = AbstractionWrapper(QLearningAgent,
                                          agent_params={"actions": env.get_actions(),
                                                        "name": "$QL_\\phi-\\epsilon=" + str(epsilon) + "$"},
                                          state_abstr=nn_sa)
            agents[epsilon] = sa_agent

    with tf.variable_scope('demo') as scope:
        abstraction_net_rand = make_nn_sa(mdp_demo_policy_dict, sess, params, verbose=False, sample_type="rand")
        nn_sa_rand = NNStateAbstr(abstraction_net_rand)
        sa_agent_rand = AbstractionWrapper(QLearningAgent,
                                           agent_params={"actions": env.get_actions(),
                                                         "name": "$D \\sim U(S)$"},
                                           state_abstr=nn_sa_rand,
                                           name_ext="")
        agents["rand"] = sa_agent_rand

    run_agents_on_mdp(agents.values(),
                      test_mdp,
                      instances=params['num_instances'],
                      episodes=params['episodes'],
                      steps=params['steps'],
                      verbose=False)

    sess.close()
def main():
    # ======================
    # == Make Environment ==
    # ======================
    params = get_params()
    num_test_mdps = 6  # 6 is max.
    mdp_demo_policy_dict = {}
    env = GymMDP(env_name='CartPole-v0')
    obs_size = env.get_num_state_feats()
    mdp_demo_policy_dict[env] = cpd.expert_cartpole_policy

    if params['multitask']:
        # Make distribution.
        mdp_dist_dict = {CartPoleMDP(gravity=gravity): 1.0 / num_test_mdps
                         for gravity in [5.0, 6.0, 8.0, 12.0][:num_test_mdps]}
        test_mdp = MDPDistribution(mdp_dist_dict)
    else:
        test_mdp = CartPoleMDP()

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    nn_sa_file_name = "cartpole_nn_sa"
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions,
                                num_features=num_features,
                                alpha=params['rl_learning_rate'])
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"alpha": params['rl_learning_rate'],
                                                "epsilon": 0.2,
                                                "actions": test_mdp.get_actions()},
                                  state_abstr=nn_sa,
                                  name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    if params['multitask']:
        run_agents_lifelong([sa_agent, linear_agent],
                            test_mdp,
                            samples=params['num_instances'],
                            episodes=params['episodes'],
                            steps=params['steps'],
                            verbose=False)
    else:
        # demo_agent = FixedPolicyAgent(cpd.expert_cartpole_policy)
        run_agents_on_mdp([sa_agent, linear_agent],
                          test_mdp,
                          instances=params['num_instances'],
                          episodes=params['episodes'],
                          steps=params['steps'],
                          verbose=False)
def _info_sa_val_and_size_plot_wrapper(beta, param_dict):
    '''
    Args:
        beta (float): stands for $beta$ in the info_sa algorithm.
        param_dict (dict): contains relevant parameters for plotting.

    Returns:
        (tuple):
            (float) The value achieved by $\pi_\phi^*$ in the MDP.
            (float) The size of the abstract state space (a count under DIB, an entropy under IB).

    Notes:
        This serves as a wrapper to cooperate with PlotFunc.
    '''
    # Grab params. (Note: iters and convergence_threshold are read here, but the calls
    # below use hardcoded values.)
    mdp = param_dict["mdp"]
    demo_policy_lambda = param_dict["demo_policy_lambda"]
    iters = param_dict["iters"]
    convergence_threshold = param_dict["convergence_threshold"]
    is_deterministic_ib = param_dict["is_deterministic_ib"]
    use_crisp_policy = param_dict["use_crisp_policy"]
    is_agent_in_control = param_dict["is_agent_in_control"]

    # --- Run DIBS to convergence ---
    if is_agent_in_control:
        # Run info_sa with the agent controlling the MDP.
        import agent_in_control
        pmf_s_phi, phi_pmf, abstr_policy_pmf = agent_in_control.run_agent_in_control_info_sa(mdp,
                                                                                             demo_policy_lambda,
                                                                                             beta=beta,
                                                                                             is_deterministic_ib=is_deterministic_ib)
    else:
        # Run info_sa.
        from info_sa import run_info_sa
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(mdp,
                                                           demo_policy_lambda,
                                                           iters=50,
                                                           beta=beta,
                                                           convergence_threshold=0.00001,
                                                           is_deterministic_ib=is_deterministic_ib)

    print("\tEvaluating...")

    # Make abstract agent.
    from info_sa import get_lambda_policy

    # Make the policy deterministic if needed.
    if use_crisp_policy:
        from info_sa import make_policy_det_max_policy
        policy = get_lambda_policy(make_policy_det_max_policy(abstr_policy_pmf))
    else:
        policy = get_lambda_policy(abstr_policy_pmf)

    prob_s_phi = ProbStateAbstraction(phi_pmf)

    # -- Compute Values --
    phi = convert_prob_sa_to_sa(prob_s_phi) if is_deterministic_ib else prob_s_phi
    abstr_agent = AbstractionWrapper(FixedPolicyAgent,
                                     state_abstr=phi,
                                     agent_params={"policy": policy, "name": "$\\pi_\\phi$"},
                                     name_ext="")

    # Compute value of abstract policy w/ coding distribution.
    value = evaluate_agent(agent=abstr_agent, mdp=mdp, instances=100)

    # -- Compute size of S_\phi --
    if is_deterministic_ib:
        s_phi_size = phi.get_num_abstr_states()
    else:
        # TODO: could change this to {s in S : Pr(s) > 0}.
        from rlit_utils import entropy
        s_phi_size = entropy(pmf_s_phi)

    return value, s_phi_size