def _make_dqn_option_policy(mdp, subgoal, n_trajs=100, n_steps=100):
    ''' LEARN-DQN-AGENT-ON-MDP '''
    # TODO: How much should we train each policy? Near optimal for now.
    env_name = mdp.env_name
    in_mdp = IntrinsicMDP(subgoal, env_name)

    # Build a subgoal reward function.
    # TODO: implement a reward function based on the eigenvector, e.g.:
    # def intrinsic_r(state):
    #     return 1.0 if state in subgoal else 0.0

    num_feats = in_mdp.get_num_state_feats()
    dqn_agent = LinearQAgent(in_mdp.get_actions(), num_feats)
    run_single_agent_on_mdp(dqn_agent, in_mdp, episodes=n_trajs, steps=n_steps)
    return dqn_agent.policy
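# A minimal sketch (not from the original source) of one way to realize the eigenvector TODO
# above: derive the intrinsic reward from an eigenvector over states (e.g. of the graph
# Laplacian), rewarding transitions that increase the eigenvector component, in the spirit of
# eigenoption-style methods. `eigvec` (a dict mapping states to floats) is an assumed input.
def _make_eigenvector_reward(eigvec):
    def intrinsic_r(state, next_state):
        # Reward the increase in the eigenvector component along the transition.
        return eigvec.get(next_state, 0.0) - eigvec.get(state, 0.0)
    return intrinsic_r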
def plot_parameters(pars, md):
    # Cell rewards follow nav_cell_types order; the reward for black cells is fixed at -500.
    cur_cell_rewards = [pars["white"][0], pars["yellow"][0], pars["red"][0],
                        pars["green"][0], pars["purple"][0], -500]
    print(cur_cell_rewards)
    md.mdp = NavigationWorldMDP(width=md.side, height=md.side,
                                nav_cell_types=md.nav_cell_types,
                                nav_cell_rewards=cur_cell_rewards,
                                nav_cell_p_or_locs=md.nav_cell_p_or_locs,
                                goal_cell_types=md.goal_cell_types,
                                goal_cell_rewards=md.goal_rew,
                                goal_cell_locs=md.goal_cell_loc,
                                init_loc=md.start_loc, rand_init=False,
                                gamma=0.95, slip_prob=0, step_cost=0)
    md.agent = QLearningAgent(md.mdp.get_actions(), epsilon=md.eps)
    run_single_agent_on_mdp(md.agent, md.mdp, episodes=md.episodes, steps=md.steps)

    # Evaluate greedily in a deterministic environment.
    md.agent.epsilon = 0
    md.mdp.slip_prob = 0
    _, steps_taken, reward, states = md.run_experiment(md.agent, md.mdp)
    turns = md.count_turns(states)
    print([turns, steps_taken, reward])
    md.mdp.visualize_grid(trajectories=[states], plot=False)
    return [turns, steps_taken, reward]
def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(11, 11, goal_locs=[(11, 11)], gamma=0.9, step_cost=0.0)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.4)

    # Choose viz type.
    viz = parse_args()
    viz = "learning"  # overrides the command-line choice

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy.
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
    elif viz == "interactive":
        mdp.visualize_interaction()
def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(5, 5, goal_locs=[(5, 5)], gamma=0.99, step_cost=0.01)
    # mdp = make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.5)
    rm_agent = RMaxAgent(mdp.get_actions())  # unused in this script

    # Choose viz type.
    viz = parse_args()
    viz = "learning"  # overrides the command-line choice

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy.
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
def main():
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)],
                       lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)], slip_prob=0.1)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.2)

    # Choose viz type.
    viz = parse_args()
    viz = "value"  # overrides the command-line choice

    if viz == "value":
        # --> Color corresponds to higher value.
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy.
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # --> Press <spacebar> to advance the agent.
        # First let the agent solve the problem, then visualize the agent's resulting policy.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # --> Press <r> to reset.
        # Show the agent's interaction with the environment.
        mdp.visualize_learning(ql_agent, delay=0.005, num_ep=500, num_steps=200)
    elif viz == "interactive":
        # Press <1>, <2>, <3>, and so on to execute action 1, action 2, etc.
        mdp.visualize_interaction()
def main(open_plot=True):
    # Taxi initial state attributes.
    agent = {"x": 1, "y": 1, "has_passenger": 0}
    passengers = [{"x": 3, "y": 2, "dest_x": 2, "dest_y": 3, "in_taxi": 0}]
    walls = []
    mdp = TaxiOOMDP(width=4, height=4, agent=agent, walls=walls, passengers=passengers)

    # Agents.
    ql_agent = QLearningAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    viz = False
    if viz:
        # Visualize Taxi.
        run_single_agent_on_mdp(ql_agent, mdp, episodes=50, steps=1000)
        mdp.visualize_agent(ql_agent)
    else:
        # Run experiment and make plot.
        run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=10, episodes=1,
                          steps=500, reset_at_terminal=True, open_plot=open_plot)
def func(self, *params, n_obs=100, batch_size=1, random_state=None):
    """Simulate the navigation task and return a summary observation per parameter batch.

    Parameters
    ----------
    params : arrays of navigation-cell rewards, one value per batch
    random_state : RandomState, optional
    """
    # Fix locations instead of probabilities: fixed map, multiple init_locs.
    rewards = []
    params = np.array(params).reshape(self.param_dim, -1)
    batches = params.shape[1]
    for i in range(batches):
        cur_cell_rewards = [x for x in params[:, i]]
        # The reward for black cells is fixed.
        cur_cell_rewards.append(-500)

        # Only retrain the agent when the cell rewards changed.
        if self.prev_cell_rewards != cur_cell_rewards:
            self.mdp = NavigationWorldMDP(width=self.side, height=self.side,
                                          nav_cell_types=self.nav_cell_types,
                                          nav_cell_rewards=cur_cell_rewards,
                                          nav_cell_p_or_locs=self.nav_cell_p_or_locs,
                                          goal_cell_types=self.goal_cell_types,
                                          goal_cell_rewards=self.goal_rew,
                                          goal_cell_locs=self.goal_cell_loc,
                                          init_loc=self.start_loc, rand_init=False,
                                          slip_prob=0)
            self.agent = QLearningAgent(self.mdp.get_actions(), epsilon=self.eps)
            run_single_agent_on_mdp(self.agent, self.mdp, episodes=self.episodes, steps=self.steps)
            self.agent.epsilon = 0
            self.mdp.slip_prob = self.slip

        finished, steps_taken, reward, states = self.run_experiment(self.agent, self.mdp)
        turns = self.count_turns(states)
        ep_reward = [turns, steps_taken, reward]
        rewards.append(ep_reward)
        self.prev_cell_rewards = cur_cell_rewards
    return rewards
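# A hedged usage sketch (assumption, not from the original source): the signature of `func`
# resembles an ELFI-style simulator, where each positional parameter carries one value per
# batch, so column i of the reshaped array holds the five navigation-cell rewards of batch i.
# The concrete reward values below are illustrative only.
import numpy as np

param_dim = 5
white, yellow, red, green, purple = [0.0, 0.5], [-1.0, -2.0], [-5.0, -4.0], [-10.0, -8.0], [-20.0, -16.0]
params = np.array((white, yellow, red, green, purple)).reshape(param_dim, -1)
for i in range(params.shape[1]):
    cur_cell_rewards = [x for x in params[:, i]] + [-500]  # black-cell reward fixed at -500
    print(cur_cell_rewards)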
def __init__(self):
    self.base_human_model = PuddleMDP()
    self.base_agent = LinearQAgent(actions=self.base_human_model.get_actions(), num_features=2)
    run_single_agent_on_mdp(self.base_agent, self.base_human_model, episodes=10000, steps=4)
    # TODO: Add other settings.
    self.current_agent = self.base_agent
    self.current_mdp = self.base_human_model
def __init__(self):
    self.base_human_model = PuddleMDP()
    self.base_agent = ModQLearningAgent(actions=self.base_human_model.get_actions(),
                                        epsilon=0.5, anneal=True)
    run_single_agent_on_mdp(self.base_agent, self.base_human_model,
                            episodes=10000, steps=60, verbose=True)
    print("Q func", self.base_agent.q_func)

    self.test_run = False
    if self.test_run:
        # Quick test: reuse the base model and agent everywhere.
        self.novice_model_1 = self.base_human_model
        self.novice_model_2 = self.base_human_model
        self.fully_actulized_model = self.base_human_model
        self.novice_agent_1 = self.base_agent
        self.novice_agent_2 = self.base_agent
        self.fully_actulized_agent = self.base_agent
    else:
        self.novice_model_1 = PuddleMDP2()
        self.novice_agent_1 = ModQLearningAgent(actions=self.novice_model_1.get_actions(),
                                                epsilon=0.5, anneal=True)
        run_single_agent_on_mdp(self.novice_agent_1, self.novice_model_1,
                                episodes=10000, steps=60, verbose=True)

        self.novice_model_2 = PuddleMDP3()
        self.novice_agent_2 = ModQLearningAgent(actions=self.novice_model_2.get_actions(),
                                                epsilon=0.5, anneal=True)
        run_single_agent_on_mdp(self.novice_agent_2, self.novice_model_2,
                                episodes=10000, steps=60, verbose=True)

        self.fully_actulized_model = PuddleMDP4()
        self.fully_actulized_agent = ModQLearningAgent(actions=self.fully_actulized_model.get_actions(),
                                                       epsilon=0.5, anneal=True)
        run_single_agent_on_mdp(self.fully_actulized_agent, self.fully_actulized_model,
                                episodes=10000, steps=60, verbose=True)

    # TODO: Add other settings.
    self.current_agent = self.base_agent
    self.current_mdp = self.base_human_model
def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(9, 9, goal_locs=[(9, 9)], gamma=0.95)
    ql_agent = QLearnerAgent(mdp.get_actions())

    viz = parse_args()
    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy.
        vi = ValueIteration(mdp)
        vi.run_vi()
        policy = vi.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
def branching_factor_experiment(min_options=0, max_options=20, increment=2, instances=5, epsilon=0.05):
    '''
    Args:
        min_options (int)
        max_options (int)
        increment (int)
        instances (int)
        epsilon (float)

    Summary:
        Runs an experiment contrasting learning performance for different numbers of options.
    '''
    # Define MDP.
    grid_size = 7
    mdp = FourRoomMDP(width=grid_size, height=grid_size, goal_locs=[(grid_size, grid_size)])

    # Make State Abstraction.
    states, _ = ah.compute_reachable_state_space(mdp, sample_rate=50)
    state_abstr = core.compute_phi_given_m(mdp, four_rooms_predicate_9x9, level=1, states=states)

    x_axis = range(min_options, max_options + 1, increment)
    y_axis = defaultdict(list)
    conf_intervals = defaultdict(list)
    num_options_performance = defaultdict(lambda: defaultdict(list))

    # Choose dependent variable (either #steps per episode or #episodes).
    d_var_range = [(20, 5), (40, 250), (400, 2500)]

    for steps, episodes in d_var_range:
        print("steps, episodes", steps, episodes)

        # Evaluate.
        for i, instance in enumerate(range(instances)):
            print("\tInstance", instance + 1, "of", str(instances) + ".")

            # Make initial Options.
            for num_options in x_axis:
                options, _ = make_near_optimal_phi_relative_options(mdp, state_abstr, 'eps-greedy',
                                                                    num_rand_opts=num_options - 1,
                                                                    eps=epsilon)
                action_abstr = ActionAbstraction(options=options, prim_actions=mdp.get_actions())

                # Make agent.
                AgentClass = RMaxAgent  # DoubleQAgent, QLearningAgent, SarsaAgent
                sa_aa_agent = AbstractionWrapper(AgentClass,
                                                 agent_params={"actions": mdp.get_actions()},
                                                 state_abstr=state_abstr,
                                                 action_abstr=action_abstr,
                                                 name_ext="-$\\phi,O$")

                _, _, value_per_episode = run_single_agent_on_mdp(sa_aa_agent, mdp,
                                                                  episodes=episodes, steps=steps)
                mdp.reset()

                num_options_performance[(steps, episodes)][num_options].append(value_per_episode[-1])

    # Other baselines: state abstraction only, random options, and the optimal policy.
    # Just state abstraction.
    steps, episodes = d_var_range[-1][0], d_var_range[-1][1]
    sa_agent = AbstractionWrapper(AgentClass,
                                  agent_params={"actions": mdp.get_actions()},
                                  state_abstr=state_abstr,
                                  action_abstr=None,
                                  name_ext="-$\\phi$")
    _, _, value_per_episode = run_single_agent_on_mdp(sa_agent, mdp, episodes=episodes, steps=steps)
    num_options_performance[(steps, episodes)]["phi"].append(value_per_episode[-1])
    y_axis["phi"] = [value_per_episode[-1]]

    # Run random options.
    options = make_fixed_random_options(mdp, state_abstr)
    action_abstr = ActionAbstraction(options=options, prim_actions=mdp.get_actions())
    AgentClass = QLearningAgent
    rand_opt_agent = AbstractionWrapper(AgentClass,
                                        agent_params={"actions": mdp.get_actions()},
                                        state_abstr=state_abstr,
                                        action_abstr=action_abstr,
                                        name_ext="-$\\phi,O_{\\text{random}}$")
    _, _, value_per_episode = run_single_agent_on_mdp(rand_opt_agent, mdp, episodes=episodes, steps=steps)
    num_options_performance[(steps, episodes)]["random"].append(value_per_episode[-1])
    y_axis["random"] = [value_per_episode[-1]]

    # Make optimal agent.
    value_iter = ValueIteration(mdp)
    value_iter.run_vi()
    optimal_agent = FixedPolicyAgent(value_iter.policy)
    _, _, value_per_episode = run_single_agent_on_mdp(optimal_agent, mdp, episodes=episodes, steps=steps)
    y_axis["optimal"] = [value_per_episode[-1]]

    total_steps = d_var_range[0][0] * d_var_range[0][1]

    # Confidence intervals.
    for dependent_var in d_var_range:
        for num_options in x_axis:
            # Compute mean and standard error.
            avg_for_n = float(sum(num_options_performance[dependent_var][num_options])) / instances
            std_deviation = np.std(num_options_performance[dependent_var][num_options])
            std_error = 1.96 * (std_deviation / math.sqrt(len(num_options_performance[dependent_var][num_options])))
            y_axis[dependent_var].append(avg_for_n)
            conf_intervals[dependent_var].append(std_error)

    plt.xlabel("$|O_\\phi|$")
    plt.xlim([1, len(x_axis)])
    plt.ylabel("$V^{\\hat{\\pi}_{O_\\phi}}(s_0)$")
    plt.tight_layout()  # Keeps the spacing nice.

    # Add just state abstraction.
    ep_val_del_q_phi = y_axis["phi"]
    label = "$O_{\\phi}$"
    plt.plot(x_axis, [ep_val_del_q_phi] * len(x_axis), marker="+", linestyle="--",
             linewidth=1.0, color=PLOT_COLORS[-1], label=label)

    # Add random options (no legend entry for this curve).
    ep_val_del_q = y_axis["random"]
    plt.plot(x_axis, [ep_val_del_q] * len(x_axis), marker="x", linestyle="--",
             linewidth=1.0, color=PLOT_COLORS[0])

    # Add optimal (no legend entry for this curve).
    ep_val_optimal = y_axis["optimal"]
    plt.plot(x_axis, [ep_val_optimal] * len(x_axis), linestyle="-",
             linewidth=1.0, color=PLOT_COLORS[1])

    for i, dependent_var in enumerate(d_var_range):
        total_steps = dependent_var[0] * dependent_var[1]
        label = "$O_{\\phi,Q_\\varepsilon^*}, N=1e" + str(str(total_steps).count("0")) + "$"
        plt.plot(x_axis, y_axis[dependent_var], marker="x", color=PLOT_COLORS[i + 2],
                 linewidth=1.5, label=label)

        # Confidence intervals.
        top = np.add(y_axis[dependent_var], conf_intervals[dependent_var])
        bot = np.subtract(y_axis[dependent_var], conf_intervals[dependent_var])
        plt.fill_between(x_axis, top, bot, alpha=0.25, color=PLOT_COLORS[i + 2])

    plt.legend()
    plt.savefig("branching_factor_results.pdf", format="pdf")
    plt.cla()
    plt.close()
from simple_rl.agents import QLearnerAgent, RandomAgent
from simple_rl.tasks import TaxiOOMDP, BlockDudeOOMDP
from simple_rl.run_experiments import run_agents_on_mdp, run_single_agent_on_mdp

# Taxi initial state attributes.
agent = {"x": 1, "y": 1, "has_passenger": 0}
passengers = [{"x": 3, "y": 2, "dest_x": 2, "dest_y": 3, "in_taxi": 0}]
walls = []
mdp = TaxiOOMDP(width=4, height=4, agent=agent, walls=walls, passengers=passengers)

# Agents.
ql_agent = QLearnerAgent(actions=mdp.get_actions())
rand_agent = RandomAgent(actions=mdp.get_actions())

viz = False
if viz:
    # Visualize Taxi.
    run_single_agent_on_mdp(ql_agent, mdp, episodes=50, steps=1000)
    mdp.visualize_agent(ql_agent)
else:
    # Run experiment and make plot.
    run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=10, episodes=100, steps=150,
                      reset_at_terminal=True)
def prior_use_experiment(run_experiment=True, open_plot=True, verbose=True):
    """
    Prior use experiment: record the ratio of prior use during the model's distance computation
    in the simple setting of interacting sequentially with two different environments.
    :param run_experiment: (bool) set to False for plot only
    :param open_plot: (bool) set to False to disable plot (only saving)
    :param verbose: (bool)
    :return: None
    """
    w = 4
    h = 4
    walls = [(2, 2), (3, 2), (4, 2), (2, 4)]
    env1 = HeatMap(width=w, height=h, init_loc=(1, 1), goal_locs=[(w, h)], is_goal_terminal=False,
                   walls=walls, slip_prob=0.1, goal_reward=1.0, reward_span=1.0)
    env2 = HeatMap(width=w, height=h, init_loc=(1, 1), goal_locs=[(w - 1, h)], is_goal_terminal=False,
                   walls=walls, slip_prob=0.05, goal_reward=0.6, reward_span=1.5)

    # Compute the number of samples needed for L-R-MAX to achieve epsilon-optimal behavior
    # with probability (1 - delta).
    epsilon = .1
    delta = .05
    m_r = np.log(2. / delta) / (2. * epsilon**2)
    m_t = 2. * (np.log(2**(float(w * h) - float(len(walls))) - 2.) - np.log(delta)) / (epsilon**2)
    m = int(max(m_r, m_t))

    names = []
    for p in PRIORS:
        results = []
        name = 'default'
        for i in range(N_INSTANCES):
            agent = LRMaxExp(actions=env1.get_actions(), gamma=GAMMA, count_threshold=m,
                             epsilon=epsilon, prior=p)
            name = agent.name
            if run_experiment:
                if verbose:
                    print('Running instance', i + 1, 'of', N_INSTANCES, 'for agent', name)
                run_single_agent_on_mdp(agent, env1, episodes=N_EPISODES, steps=N_STEPS,
                                        experiment=None, verbose=False, track_disc_reward=False,
                                        reset_at_terminal=False, resample_at_terminal=False)
                agent.reset()
                run_single_agent_on_mdp(agent, env2, episodes=N_EPISODES, steps=N_STEPS,
                                        experiment=None, verbose=False, track_disc_reward=False,
                                        reset_at_terminal=False, resample_at_terminal=False)
                results.append(agent.get_results())
        names.append(name)

        # Save results.
        if run_experiment:
            utils.save_result(results, ROOT_PATH, name)

    # Plot.
    utils.plot_computation_number_results(ROOT_PATH, names, open_plot)
    utils.plot_time_step_results(ROOT_PATH, names, open_plot)
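# A worked check (an added illustration, assuming the defaults used above: epsilon=0.1,
# delta=0.05, w=h=4, and four walls): m_r is roughly 184.4 and m_t roughly 2262.6, so the
# count threshold m = 2262 is driven by the transition-model term m_t.
import numpy as np

epsilon, delta, w, h, n_walls = 0.1, 0.05, 4, 4, 4
m_r = np.log(2. / delta) / (2. * epsilon ** 2)
m_t = 2. * (np.log(2 ** (float(w * h) - float(n_walls)) - 2.) - np.log(delta)) / (epsilon ** 2)
print(int(m_r), int(m_t), int(max(m_r, m_t)))  # -> 184 2262 2262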