def main(open_plot=True):
    # Setup MDP.
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)],
                       lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)], slip_prob=0.05)

    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())
    tabular_agent = CherryQAgent(mdp, model=lambda *x: ActionValueFunction(*x, init=1.0), name='Tabular', lr=0.7)
    linear_agent = CherryQAgent(mdp, model=lambda *x: nn.Linear(*x), name='Linear', lr=0.1)
    mlp_agent = CherryQAgent(mdp, model=lambda *x: MLP(*x), name='MLP', lr=0.07)

    # Run experiment and make plot.
    agents = [rand_agent, ql_agent, tabular_agent, linear_agent, mlp_agent]
    run_agents_on_mdp(agents, mdp, instances=10, episodes=50, steps=50, open_plot=open_plot)
def save(args):
    mdp, obs_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    agent = DiaynAgent(sess=None, obs_dim=obs_dim, num_actions=num_actions,
                       num_options=args.noptions, action_dim=action_dim,
                       action_bound=action_bound, batch_size=32, update_freq=32, alpha=1.0)
    agent.set_diversity(True)

    run_agents_on_mdp([agent], mdp, episodes=args.snepisodes, steps=args.snsteps,
                      instances=1, cumulative_plot=True)

    # Save the pretrained agent under <trajdir>/vis/<task>option<noptions>diayn.
    prefix = '.' if args.trajdir == '__default' else args.trajdir
    agent.save(directory=prefix + '/vis/' + str(args.task) + 'option' + str(args.noptions) + 'diayn',
               name='diayn-pretrain')
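# Note: DIAYN (Eysenbach et al., 2018) trains its skills with a diversity
# pseudo-reward in place of the task reward -- presumably what set_diversity(True)
# toggles above. A minimal sketch of that reward under a uniform skill prior,
# assuming `disc_logits` holds discriminator logits over skills for the current
# state (both names are illustrative, not part of this codebase):
import numpy as np

def diayn_pseudo_reward(disc_logits, z, num_skills):
    # r_z(s) = log q(z | s) - log p(z), with p(z) uniform over skills.
    log_q_z_given_s = disc_logits[z] - np.log(np.sum(np.exp(disc_logits)))
    return log_q_z_given_s - np.log(1.0 / num_skills)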
def main():
    # Setup MDP, Agents.
    size = 5
    agent = {"x": 1, "y": 1, "dx": 1, "dy": 0, "dest_x": size, "dest_y": size, "has_block": 0}
    blocks = [{"x": size, "y": 1}]
    # A row of lava across the middle of the grid (integer division keeps the
    # y-coordinate a valid grid cell).
    lavas = [{"x": x, "y": y} for x, y in map(lambda z: (z + 1, (size + 1) // 2), range(size))]

    mdp = TrenchOOMDP(size, size, agent, blocks, lavas)
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=30, episodes=250, steps=250)
def main():
    # Setup experiment parameters, agents, mdp.
    num_days = 3
    per_hour = False
    time_per_step = 20.0  # In minutes.
    loc, steps = "nola", int(24 * (60 / time_per_step) * num_days)
    panel_step = 1.0  # Angle movement per action.

    # If per_hour is true, plot every hour-long reward chunk; otherwise, every day.
    rew_step_count = (steps / num_days) / 24 if per_hour else (steps / num_days)

    sun_agents, sun_solar_mdp = setup_experiment("sun_percept", loc=loc,
                                                 panel_step=panel_step,
                                                 time_per_step=time_per_step)

    # Run experiments.
    run_agents_on_mdp(sun_agents, sun_solar_mdp, instances=5, episodes=1, steps=steps,
                      clear_old_results=True, rew_step_count=rew_step_count, verbose=False)
def main(open_plot=True):
    # Setup MDP.
    mdp = GridWorldMDP(width=8, height=3, init_loc=(1, 1), goal_locs=[(8, 3)],
                       lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)], slip_prob=0.05)

    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=20, episodes=300, steps=20,
                      open_plot=open_plot, track_success=True, success_reward=1)
def main():
    # Grab experiment params.
    mdp = BadChainMDP(gamma=0.95, kappa=0.001)
    actions = mdp.get_actions()

    # =======================
    # == Make Abstractions ==
    # =======================
    sa_q_eps = get_sa(mdp, indic_func=indicator_funcs._q_eps_approx_indicator, epsilon=0.1)

    # RMax Agents.
    rmax_agent = RMaxAgent(actions)
    abstr_rmax_agent = AbstractionWrapper(RMaxAgent, state_abstr=sa_q_eps,
                                          agent_params={"actions": actions},
                                          name_ext="-$\\phi_{Q_\\epsilon^*}$")

    # Delayed Q Agents.
    del_q_agent = DelayedQAgent(actions)
    abstr_del_q_agent = AbstractionWrapper(DelayedQAgent, state_abstr=sa_q_eps,
                                           agent_params={"actions": actions},
                                           name_ext="-$\\phi_{Q_\\epsilon^*}$")

    run_agents_on_mdp([rmax_agent, abstr_rmax_agent, del_q_agent, abstr_del_q_agent],
                      mdp, instances=50, steps=250, episodes=1)
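# Note: the Q_eps abstraction above aggregates states whose optimal Q-values agree
# to within epsilon on every action (Abel et al., 2016). A minimal sketch of such an
# indicator, assuming a planner that exposes a get_q_value(s, a) (hypothetical here):
def q_eps_indicator(s1, s2, planner, actions, epsilon=0.1):
    # s1 and s2 collapse into the same abstract state iff their Q-values are
    # epsilon-close for every action.
    return all(abs(planner.get_q_value(s1, a) - planner.get_q_value(s2, a)) <= epsilon
               for a in actions)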
def main(open_plot=True):
    # Taxi initial state attributes.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    passengers = [{"x": 3, "y": 2, "dest_x": 2, "dest_y": 3, "in_taxi": 0}]
    walls = []
    mdp = TaxiOOMDP(width=4, height=4, agent=agent, walls=walls, passengers=passengers)

    # Agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    viz = False
    if viz:
        # Visualize Taxi.
        run_single_agent_on_mdp(ql_agent, mdp, episodes=50, steps=1000)
        mdp.visualize_agent(ql_agent)
    else:
        # Run experiment and make plot.
        run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=10, episodes=1,
                          steps=500, reset_at_terminal=True, open_plot=open_plot)
def main(open_plot=True):
    # Earlier variants of this experiment, kept for reference:
    # gym_mdp = GridWorldMDP(width=10, height=10, init_loc=(1, 1), goal_locs=[(10, 10)])
    # num_feats = gym_mdp.get_num_state_feats()
    # lin_agent = QLearnerAgent(gym_mdp.actions, alpha=0.4, epsilon=0.4)
    # rand_agent = RandomAgent(gym_mdp.actions)
    # run_agents_on_mdp([lin_agent, rand_agent], gym_mdp, instances=50, episodes=200, steps=100, open_plot=open_plot)
    #
    # gym_mdp = GridWorldMDP(width=10, height=10, init_loc=(1, 1), goal_locs=[(10, 10)])
    # num_feats = gym_mdp.get_num_state_feats()
    # lin_agent = LinearQLearnerAgent(gym_mdp.actions, num_features=num_feats, alpha=0.4, epsilon=0.4, anneal=False, rbf=True)
    # rand_agent = RandomAgent(gym_mdp.actions)
    # run_agents_on_mdp([lin_agent, rand_agent], gym_mdp, instances=50, episodes=200, steps=100, open_plot=open_plot, verbose=True)

    # Gym MDP.
    gym_mdp = GymMDP(env_name='CartPole-v0', render=False)
    num_feats = gym_mdp.get_num_state_feats()

    # Setup agents and run.
    lin_agent = LinearQLearnerAgent(gym_mdp.actions, num_features=num_feats, alpha=0.4,
                                    epsilon=0.4, anneal=False, rbf=True)
    rand_agent = RandomAgent(gym_mdp.actions)
    run_agents_on_mdp([lin_agent, rand_agent], gym_mdp, instances=5, episodes=1000,
                      steps=100, open_plot=open_plot)
def main():
    # Set Params.
    mdp_class, task_samples, episodes, steps, grid_dim, AgentClass = get_params(set_manually=False)
    experiment_type = "sa"
    lifelong = True
    resample_at_terminal = False
    reset_at_terminal = False
    gamma = 0.95

    # ======================
    # == Make Environment ==
    # ======================
    environment = make_mdp.make_mdp_distr(mdp_class=mdp_class, grid_dim=grid_dim) if lifelong \
        else make_mdp.make_mdp(mdp_class=mdp_class, grid_dim=grid_dim)
    environment.set_gamma(gamma)

    # =================
    # == Make Agents ==
    # =================
    agents = []
    if experiment_type == "sa":
        # SA experiment.
        agents = get_sa_experiment_agents(environment, AgentClass)
    elif experiment_type == "combo":
        # AA experiment.
        agents = get_combo_experiment_agents(environment)
    elif experiment_type == "exact_v_approx":
        agents = get_exact_vs_approx_agents(environment, incl_opt=(not lifelong))
    elif experiment_type == "opt":
        agents = get_optimal_policies(environment)
    else:
        print("Experiment Error: experiment type unknown (" + experiment_type + "). "
              "Must be one of {sa, combo, exact_v_approx, opt}.")
        quit()

    # Run!
    if lifelong:
        run_agents_lifelong(agents, environment, samples=task_samples, steps=steps,
                            episodes=episodes, reset_at_terminal=reset_at_terminal,
                            resample_at_terminal=resample_at_terminal,
                            cumulative_plot=True, clear_old_results=True)
    else:
        run_agents_on_mdp(agents, environment, instances=task_samples, steps=steps,
                          episodes=episodes, reset_at_terminal=reset_at_terminal,
                          track_disc_reward=False)
def main(open_plot=True):
    # Gym MDP.
    gym_mdp = GymMDP(env_name='CartPole-v0', render=False)
    num_feats = gym_mdp.get_num_state_feats()

    # Setup agents and run.
    lin_agent = LinearQLearnerAgent(gym_mdp.actions, num_features=num_feats,
                                    alpha=0.4, epsilon=0.4, anneal=True)
    rand_agent = RandomAgent(gym_mdp.actions)
    run_agents_on_mdp([lin_agent, rand_agent], gym_mdp, instances=10, episodes=30,
                      steps=10000, open_plot=open_plot)
def main(open_plot=True):
    # Gym MDP.
    gym_mdp = GymMDP(env_name='Breakout-v0', render=False)
    num_feats = gym_mdp.get_num_state_feats()

    # Setup agents and run.
    rand_agent = RandomAgent(gym_mdp.get_actions())
    lin_q_agent = LinearQAgent(gym_mdp.get_actions(), num_feats)
    run_agents_on_mdp([lin_q_agent, rand_agent], gym_mdp, instances=5, episodes=50000,
                      steps=200, open_plot=open_plot, verbose=False)
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=10, height=10, init_loc=(1, 1), goal_locs=[(10, 10)])
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())
    abstr_identity_agent = AbstractionWrapper(QLearningAgent,
                                              agent_params={"epsilon": 0.9,
                                                            "actions": mdp.get_actions()})

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent, abstr_identity_agent], mdp,
                      instances=5, episodes=100, steps=150, open_plot=open_plot)
def main():
    # Setup MDP.
    actual_args = {
        "width": 10,
        "height": 10,
        "init_loc": (1, 1),
        "goal_locs": [(10, 10)],
        "lava_locs": [(1, 10), (3, 10), (5, 10), (7, 10), (9, 10)],
        "gamma": 0.9,
        "walls": [(2, 2), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (2, 8), (2, 9),
                  (4, 2), (4, 3), (4, 4), (4, 5), (4, 6), (4, 7), (4, 8), (4, 9),
                  (6, 2), (6, 3), (6, 4), (6, 5), (6, 6), (6, 7), (6, 8), (6, 9),
                  (8, 2), (8, 3), (8, 4), (8, 5), (8, 6), (8, 7), (8, 8), (8, 9)],
        "slip_prob": 0.01,
        "lava_cost": 1.0,
        "step_cost": 0.1
    }
    mdp = GridWorldMDP(**actual_args)

    # Initialize a custom Q function for a Q-learning agent. This should be
    # equivalent to potential shaping, and should let the agent learn more quickly.
    custom_q = defaultdict(lambda: defaultdict(lambda: 0))
    custom_q[GridWorldState(5, 1)]['right'] = 1.0
    custom_q[GridWorldState(2, 1)]['right'] = 1.0

    # Make a normal Q-learning agent, another initialized with the custom_q above,
    # and a random agent to compare against.
    ql_agent = QLearningAgent(actions=mdp.get_actions(), epsilon=0.2, alpha=0.4)
    ql_agent_pot = QLearningAgent(actions=mdp.get_actions(), epsilon=0.2, alpha=0.4,
                                  custom_q_init=custom_q, name="PotQ")
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, ql_agent_pot, rand_agent], mdp, instances=2,
                      episodes=60, steps=200, open_plot=True, verbose=True)
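# Note: the equivalence mentioned above is Wiewiora (2003): Q-learning initialized
# with Q_0(s, a) = Phi(s) behaves identically to zero-initialized Q-learning trained
# with the potential-based shaping reward of Ng, Harada & Russell (1999). A minimal
# sketch of that shaping term, with `phi` a hypothetical potential function:
def shaped_reward(reward, state, next_state, gamma=0.9, phi=lambda s: 0.0):
    # F(s, s') = gamma * Phi(s') - Phi(s); adding F to the reward leaves the
    # optimal policy unchanged.
    return reward + gamma * phi(next_state) - phi(state)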
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = BanditMDP()
    lin_agent = LinUCBAgent(actions=mdp.get_actions())
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, lin_agent, rand_agent], mdp, instances=10,
                      episodes=1, steps=500, open_plot=open_plot)
def main(open_plot=True):
    # Color state (3, 2) red; all other states default to white.
    state_colors = defaultdict(lambda: defaultdict(lambda: "white"))
    state_colors[3][2] = "red"

    # Setup MDP, Agents.
    mdp = ColoredGridWorldMDP(state_colors)
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=15, episodes=500,
                      steps=40, open_plot=open_plot)
def main(open_plot=True):
    # Setup MDP.
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)],
                       lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)], slip_prob=0.05)

    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=10, episodes=50,
                      steps=10, open_plot=open_plot)
def test_utility(args, mdp):
    # Measure performance as a function of the number of options.
    # TODO: Compare the utility of point options vs. subgoal options?
    now_ts = str(datetime.now().timestamp())
    origMatrix, intToS = GetAdjacencyMatrix(mdp)
    known_region = list(intToS.values())  # The known region is a set of MDPStates.
    n_ops_list = [2, 4, 8, 16, 32]
    agents = []

    ql_agent = QLearningAgent(actions=mdp.get_actions())
    agents.append(ql_agent)

    method = 'fiedler'
    for n_ops in n_ops_list:
        _, foptions, _, fvectors = GetOption(mdp, n_ops, matrix=origMatrix, intToS=intToS,
                                             option_type=args.optiontype, method=method)
        print('#options =', n_ops)
        print(foptions)
        if args.optiontype == 'subgoal':
            known_region = list(intToS.values())  # The known region is a set of MDPStates.
            eigenoption_agent = build_subgoal_option_agent(
                mdp, foptions, known_region, vectors=fvectors,
                name='-' + method + '-' + args.optiontype + '-' + str(n_ops))
        else:
            eigenoption_agent = build_point_option_agent(
                mdp, foptions, agent=QLearningAgent, policy='vi',
                name='-' + method + '-' + args.optiontype + '-' + str(n_ops))
        agents.append(eigenoption_agent)

    run_agents_on_mdp(agents, mdp, instances=args.ninstances, episodes=args.nepisodes,
                      steps=args.nsteps, open_plot=True, track_disc_reward=True,
                      cumulative_plot=True, dir_for_plot="results/")
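# Note: the 'fiedler' method above derives options from the Fiedler vector -- the
# eigenvector of the graph Laplacian with the second-smallest eigenvalue. A minimal
# sketch, assuming `A` is the symmetric state-adjacency matrix returned by
# GetAdjacencyMatrix:
import numpy as np

def fiedler_vector(A):
    # Unnormalized graph Laplacian: L = D - A.
    L = np.diag(A.sum(axis=1)) - A
    vals, vecs = np.linalg.eigh(L)
    # The extreme entries of this eigenvector mark weakly-connected regions of
    # the state space, which makes them natural subgoal candidates.
    return vecs[:, np.argsort(vals)[1]]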
def main():
    # Create the MDP from our own definition.
    mdp = tfeMDP()

    # Three different agents, to compare how each does against the others.
    rand_agent = RandomAgent(actions=mdp.get_actions())
    rmax_agent = RMaxAgent(actions=mdp.get_actions())
    agent = QLearningAgent(actions=mdp.get_actions())

    # Run everything and generate the plots and statistics describing how
    # each agent did.
    run_agents_on_mdp([agent, rmax_agent, rand_agent], mdp, instances=200,
                      episodes=100, steps=1000)
def main(open_plot=True):
    # Gym MDP.
    gym_mdp = GymMDP(env_name='CartPole-v0', render=True)
    num_feats = gym_mdp.get_num_state_feats()

    # Setup agents and run.
    q_learning_agent = LinearQAgent(gym_mdp.get_actions(), num_feats)
    run_agents_on_mdp([q_learning_agent], gym_mdp, instances=1, episodes=400,
                      steps=210, open_plot=open_plot, verbose=True)
def restore(args):
    mdp, obs_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    rst = DiaynAgent(sess=None, obs_dim=obs_dim, num_actions=num_actions,
                     action_dim=action_dim, action_bound=action_bound,
                     num_options=args.noptions, batch_size=1, update_freq=1, alpha=1.0)
    # Mirror the directory logic used in save().
    prefix = '.' if args.trajdir == '__default' else args.trajdir
    rst.restore(directory=prefix + '/vis/' + str(args.task) + 'option' + str(args.noptions) + 'diayn',
                name='diayn-pretrain')
    rst.set_diversity(False)

    oagent = OptionAgent(sess=None, obs_dim=obs_dim, obs_bound=state_bound,
                         num_actions=num_actions, action_dim=action_dim,
                         action_bound=action_bound, num_options=1 + args.noptions,
                         init_all=args.initall, high_method=args.highmethod,
                         low_method=args.lowmethod, f_func=args.ffunction,
                         batch_size=args.batchsize, buffer_size=args.buffersize,
                         low_update_freq=args.lowupdatefreq,
                         option_batch_size=args.obatchsize,
                         option_buffer_size=args.obuffersize,
                         high_update_freq=args.highupdatefreq,
                         name='diayn' + str(args.noptions))

    # Wrap each pretrained DIAYN skill as an option.
    for i in range(args.noptions):
        op = DiaynOption(rst, i, args.termprob)
        oagent.add_option(op)

    run_agents_on_mdp([oagent], mdp, episodes=args.nepisodes, steps=args.nsteps,
                      instances=args.ninstances, cumulative_plot=True)
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=10, height=10, init_loc=(1, 1), goal_locs=[(10, 10)])
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=5, episodes=100,
                      steps=150, open_plot=open_plot)
def learn_w_abstr(mdp, demo_policy, beta_list=[20], is_deterministic_ib=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy (lambda : simple_rl.State --> str)
        beta_list (list)
        is_deterministic_ib (bool)

    Summary:
        Computes a state abstraction for each given beta and compares
        Q-Learning with and without the abstraction.
    '''
    # Run info_sa for each beta.
    dict_of_phi_pmfs = {}
    for beta in beta_list:
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(
            mdp, demo_policy, iters=300, beta=beta, convergence_threshold=0.0001,
            is_deterministic_ib=is_deterministic_ib)

        # Translate abstractions: ground state --> abstract state.
        prob_s_phi = ProbStateAbstraction(phi_pmf)
        crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)
        dict_of_phi_pmfs[beta] = crisp_s_phi

        print("crisp_s_phi:")
        for single_state in crisp_s_phi.get_abs_states():
            print(str(type(single_state)))
            print("ground_for_above: " + str(crisp_s_phi.get_ground_states_in_abs_state(single_state)))
        print("ground states:")
        for ground_state in crisp_s_phi.get_ground_states():
            print(str(type(ground_state)))
        print(len(crisp_s_phi.get_ground_states()))
        print(len(crisp_s_phi.get_abs_states()))

    # Make agents.
    demo_agent = FixedPolicyAgent(demo_policy, name="$\\pi_d$")
    ql_agent = QLearningAgent(mdp.get_actions())
    agent_dict = {}
    for beta in beta_list:
        ql_abstr_agent = AbstractionWrapper(QLearningAgent, state_abstr=dict_of_phi_pmfs[beta],
                                            agent_params={"actions": mdp.get_actions(), "anneal": True})
        agent_dict[beta] = ql_abstr_agent

    # Learn.
    run_agents_on_mdp(agent_dict.values(), mdp, episodes=100, steps=10, instances=5)

    # Print the number of abstract states per beta.
    for beta in dict_of_phi_pmfs.keys():
        print("beta |S_phi|:", beta, dict_of_phi_pmfs[beta].get_num_abstr_states())
    print()
def main():
    # ======================
    # == Make Environment ==
    # ======================
    params = {}
    params['multitask'] = False
    params['env_name'] = "LunarLander-v2"
    params['obs_size'] = 8
    params['num_iterations_for_abstraction_learning'] = 500
    params['learning_rate_for_abstraction_learning'] = 0.005
    params['abstraction_network_hidden_layers'] = 2
    params['abstraction_network_hidden_nodes'] = 200
    params['num_samples_from_demonstrator'] = 10000
    params['episodes'] = 200
    params['steps'] = 1000
    params['num_instances'] = 5
    params['rl_learning_rate'] = 0.005

    mdp_demo_policy_dict = {}
    env_name = "LunarLander-v2"
    env_gym = gym.make(env_name)
    obs_size = len(env_gym.observation_space.high)
    env = GymMDP(env_name='LunarLander-v2', render=True, render_every_n_episodes=20)
    test_mdp = env  # The test MDP is the same as the demonstration MDP.
    mdp_demo_policy_dict[env] = lpd.expert_lunar_policy

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions, num_features=num_features,
                                alpha=params['rl_learning_rate'])
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"alpha": params['rl_learning_rate'],
                                                "actions": test_mdp.get_actions(),
                                                "anneal": True},
                                  state_abstr=nn_sa, name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    run_agents_on_mdp([sa_agent, linear_agent], test_mdp,
                      instances=params['num_instances'], episodes=params['episodes'],
                      steps=params['steps'], verbose=True,
                      track_success=True, success_reward=100)
def main():
    # ======================
    # == Make Environment ==
    # ======================
    params = get_params()

    # ============================
    # == Make test and train environments,
    # == along with demonstrator(s)
    # ============================
    mdp_demo_policy_dict, test_mdp = make_mdp_demo_policy_dict(multitask=params['multitask'])
    expert_puddle_policy = ppd.get_demo_policy_given_goal(test_mdp.get_goal_locs()[0])
    demo_agent = FixedPolicyAgent(expert_puddle_policy)

    # ============================
    # == Make State Abstraction ==
    # ============================
    sess = tf.Session()
    abstraction_net = make_nn_sa(mdp_demo_policy_dict, sess, params)
    nn_sa = NNStateAbstr(abstraction_net)

    # =================
    # == Make Agents ==
    # =================
    actions = test_mdp.get_actions()
    num_features = test_mdp.get_num_state_feats()
    linear_agent = LinearQAgent(actions=actions, num_features=num_features)
    sa_agent = AbstractionWrapper(QLearningAgent,
                                  agent_params={"actions": test_mdp.get_actions()},
                                  state_abstr=nn_sa, name_ext="$-\\phi$")

    # ====================
    # == Run Experiment ==
    # ====================
    run_agents_on_mdp([sa_agent, linear_agent], test_mdp,
                      instances=params['num_instances'], episodes=params['episodes'],
                      steps=params['steps'], verbose=False)
def main():
    # Experiments from IAAI paper:
    #   num_days = 1
    #   time_per_step = 10 for single axis, 20 for dual.
    #   panel_step = 5, dual: 20
    #   reflective = 0.55
    #   instances = 10
    #   episodes = 50, dual: 100

    # Setup experiment parameters, agents, mdp.
    num_days = 200
    per_hour = True
    loc, percept_type, dual_axis = parse_args()
    time_per_step = 10.0 if not dual_axis else 20.0  # In minutes.
    steps = int(24 * (60 / time_per_step) * num_days)
    panel_step = 10 if not dual_axis else 20
    reflective_index = 0.55

    # Set experiment # episodes and # instances.
    episodes = 1 if not dual_axis else 100
    episodes = 1 if num_days == 365 else episodes
    instances = 50

    # If per_hour is true, plot every hour-long reward chunk; otherwise, every day.
    rew_step_count = (steps / num_days) / 24 if per_hour else (steps / num_days)

    sun_agents, sun_solar_mdp = setup_experiment(percept_type=percept_type, loc=loc,
                                                 dual_axis=dual_axis, panel_step=panel_step,
                                                 time_per_step=time_per_step,
                                                 reflective_index=reflective_index,
                                                 instances=instances)

    # Run experiments.
    run_agents_on_mdp(sun_agents, sun_solar_mdp, instances=instances, episodes=episodes,
                      steps=steps, clear_old_results=True,
                      rew_step_count=rew_step_count, verbose=True)
def main(open_plot=True):
    # Setup MDP.
    mdp = PuddleMDP()

    # Make feature mappers.
    tile_coder = TileCoding(ranges=[[0, 1.0], [0, 1.0]], num_tiles=[4, 5], num_tilings=4)
    bucket_coder = BucketCoding(feature_max_vals=[1.0, 1.0], num_buckets=5)
    rbf_coder = RBFCoding()

    # Make agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())

    # Tabular agents w/ features.
    tile_coding_agent = FeatureWrapper(QLearningAgent, feature_mapper=tile_coder,
                                       agent_params={"actions": mdp.get_actions()})
    bucket_coding_agent = FeatureWrapper(QLearningAgent, feature_mapper=bucket_coder,
                                         agent_params={"actions": mdp.get_actions()})

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, bucket_coding_agent], mdp, instances=10,
                      episodes=100, steps=150, open_plot=open_plot)
def main():
    args = parse_args()
    mdp = generate_MDP(args.width, args.height, args.i_loc, args.g_loc, args.l_loc,
                       args.gamma, args.Walls, args.slip)

    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=args.epsilon, alpha=args.alpha,
                              explore=args.explore, anneal=args.anneal)
    viz = args.mode

    if viz == "value":
        # --> Color corresponds to higher value.
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy.
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        mdp.visualize_policy_values((lambda state: value_iter.policy(state)),
                                    (lambda state: value_iter.value_func[state]))
    elif viz == "agent":
        # --> Press <spacebar> to advance the agent.
        # First let the agent solve the problem, then visualize its resulting policy.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        rand_agent = RandomAgent(actions=mdp.get_actions())
        run_agents_on_mdp([rand_agent, ql_agent], mdp, open_plot=True, episodes=60,
                          steps=200, instances=5, success_reward=1)
        # mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # --> Press <r> to reset.
        # Show the agent's interaction with the environment.
        mdp.visualize_learning(ql_agent, delay=0.005, num_ep=500, num_steps=200)
def info_sa_compare_policies(mdp, demo_policy_lambda, beta=3.0, is_deterministic_ib=False, is_agent_in_control=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy_lambda (lambda : simple_rl.State --> str)
        beta (float)
        is_deterministic_ib (bool): If True, run DIB, else IB.
        is_agent_in_control (bool): If True, run the DIB from agent_in_control.py instead.

    Summary:
        Runs info_sa and compares the value of the found policy with the demonstrator policy.
    '''
    if is_agent_in_control:
        # Run info_sa with the agent controlling the MDP.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = agent_in_control.run_agent_in_control_info_sa(
            mdp, demo_policy_lambda, rounds=100, iters=500, beta=beta,
            is_deterministic_ib=is_deterministic_ib)
    else:
        # Run info_sa.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(
            mdp, demo_policy_lambda, iters=500, beta=beta,
            convergence_threshold=0.00001, is_deterministic_ib=is_deterministic_ib)

    # Make demonstrator agent and random agent.
    demo_agent = FixedPolicyAgent(demo_policy_lambda, name="$\\pi_d$")
    rand_agent = RandomAgent(mdp.get_actions(), name="$\\pi_u$")

    # Make abstract agent.
    lambda_abstr_policy = get_lambda_policy(abstr_policy_pmf)
    prob_s_phi = ProbStateAbstraction(phi_pmf)
    crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)
    abstr_agent = AbstractionWrapper(FixedPolicyAgent, state_abstr=crisp_s_phi,
                                     agent_params={"policy": lambda_abstr_policy,
                                                   "name": "$\\pi_\\phi$"},
                                     name_ext="")

    # Run.
    run_agents_on_mdp([demo_agent, abstr_agent, rand_agent], mdp, episodes=1, steps=1000)

    non_zero_abstr_states = [x for x in pmf_s_phi.values() if x > 0]

    # Print state space sizes.
    demo_vi = ValueIteration(mdp)
    print("\nState Space Sizes:")
    print("\t|S| =", demo_vi.get_num_states())
    print("\tH(S_\\phi) =", entropy(pmf_s_phi))
    print("\t|S_\\phi|_crisp =", crisp_s_phi.get_num_abstr_states())
    print("\tdelta_min =", min(non_zero_abstr_states))
    print("\tnum non-zero abstract states =", len(non_zero_abstr_states))
    print()
def main():
    # Setup MDP.
    w = 6
    h = 6
    mdp = GridWorld(width=w, height=h, init_loc=(1, 1), goal_locs=[(6, 6)], slip_prob=.1)

    # Setup Agents.
    rand_agent = RandomAgent(actions=mdp.get_actions())
    ql_agent = QLearningAgent(actions=mdp.get_actions())

    # Compute the number of samples for R-MAX to achieve epsilon-optimal behavior
    # with high probability (1 - delta).
    compute_n_samples = False
    if compute_n_samples:
        epsilon = .1
        delta = .05
        m_r = np.log(2. / delta) / (2. * epsilon ** 2)
        m_t = 2. * (np.log(2 ** (float(w * h)) - 2.) - np.log(delta)) / (epsilon ** 2)
        n_samples = int(max(m_r, m_t))
    else:
        n_samples = 30

    simple_rl_rmax_agent = RMaxAgent(actions=mdp.get_actions(), gamma=.9, horizon=3,
                                     s_a_threshold=n_samples, name='SimpleRL-R-MAX')
    rmax_agent = RMax(actions=mdp.get_actions(), gamma=.9, count_threshold=n_samples)

    # Run experiment and make plot.
    run_agents_on_mdp([rand_agent, ql_agent, rmax_agent, simple_rl_rmax_agent], mdp,
                      instances=5, episodes=100, steps=20,
                      reset_at_terminal=True, verbose=False)
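# Note on the bounds above: m_r is Hoeffding's inequality applied to the empirical
# mean reward, P(|r_hat - r| >= eps) <= 2 exp(-2 m eps^2) <= delta, which solves to
# m_r = ln(2 / delta) / (2 eps^2). m_t is the L1 concentration bound for empirical
# distributions over |S| = w * h states (Weissman et al., 2003),
# P(||T_hat - T||_1 >= eps) <= (2^|S| - 2) exp(-m eps^2 / 2) <= delta, which solves
# to m_t = 2 (ln(2^|S| - 2) - ln(delta)) / eps^2.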
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)],
                       gamma=0.95, walls=[(2, 2)])
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=10, episodes=1,
                      steps=20, open_plot=open_plot)
def main():
    # Paper experiments:
    #   num_days = 1
    #   time_per_step = 10
    #   panel_step = 5, dual: 20
    #   reflective = 0.55
    #   instances = 10
    #   episodes = 50, dual: 100

    # Setup experiment parameters, agents, mdp.
    num_days = 1
    per_hour = True
    time_per_step = 10.0  # In minutes.
    loc, percept_type, dual_axis = parse_args()
    steps = int(24 * (60 / time_per_step) * num_days)
    panel_step = 5
    reflective_index = 0.55
    energy_breakdown_experiment = False

    # If per_hour is true, plot every hour-long reward chunk; otherwise, every day.
    rew_step_count = (steps / num_days) / 24 if per_hour else (steps / num_days)

    sun_agents, sun_solar_mdp = setup_experiment(percept_type=percept_type, loc=loc,
                                                 dual_axis=dual_axis, panel_step=panel_step,
                                                 time_per_step=time_per_step,
                                                 reflective_index=reflective_index,
                                                 energy_breakdown_experiment=energy_breakdown_experiment)

    # Run experiments.
    run_agents_on_mdp(sun_agents, sun_solar_mdp, instances=10, episodes=50, steps=steps,
                      clear_old_results=True, rew_step_count=rew_step_count, verbose=False)
def main(open_plot=True):
    # Setup MDP.
    args = parse_args()
    mdp = generate_MDP(args.width, args.height, args.i_loc, args.g_loc, args.l_loc,
                       args.gamma, args.Walls, args.slip)

    if args.visualize:
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        mdp.visualize_policy_values((lambda state: value_iter.policy(state)),
                                    (lambda state: value_iter.value_func[state]))
    else:
        custom_q = parse_custom_q_table(args.custom_q, args.default_q)
        agents = []
        for agent in args.agents:
            if agent == 'q_learning':
                agents.append(QLearningAgent(actions=mdp.get_actions()))
            elif agent == 'potential_q':
                agents.append(QLearningAgent(actions=mdp.get_actions(),
                                             custom_q_init=custom_q, name="Potential_Q"))
            elif agent == 'random':
                agents.append(RandomAgent(actions=mdp.get_actions()))
            elif agent == 'rmax':
                agents.append(RMaxAgent(mdp.get_actions()))

        # Run experiment and make plot.
        run_agents_on_mdp(agents, mdp, instances=1, episodes=100, steps=100,
                          open_plot=open_plot, verbose=True)