def build_subgoal_option_agent(mdp, subgoals, init_region, agent=QLearningAgent, vectors=None, name='-abstr', n_trajs=50, n_steps=100, classifier='list', policy='vi'):
    '''
    Args:
        mdp (MDP)
        subgoals (list): Subgoal states used to define the options.
        init_region: States from which the options may be initiated.
        agent: Agent class to wrap (defaults to QLearningAgent).

    Returns:
        (AbstractionWrapper): Agent equipped with one option per subgoal.
    '''
    # print('subgoals=', subgoals)
    goal_based_options = aa_helpers.make_subgoal_options(mdp, subgoals, init_region, vectors=vectors, n_trajs=n_trajs, n_steps=n_steps, classifier=classifier, policy=policy)
    goal_based_aa = ActionAbstraction(prim_actions=mdp.get_actions(), options=goal_based_options, use_prims=True)

    # num_feats = mdp.get_num_state_feats()
    option_agent = AbstractionWrapper(
        agent,
        agent_params={"actions": mdp.get_actions()},
        action_abstr=goal_based_aa,
        name_ext=name)
    return option_agent
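# A minimal usage sketch for build_subgoal_option_agent (not part of the original module).
# It assumes `subgoal_states` is a list of subgoal states and `init_region` is the set of
# states from which the options may be initiated, as suggested by the arguments forwarded
# to aa_helpers.make_subgoal_options; the helper name and argument values are illustrative.
def _example_subgoal_option_agent(mdp, subgoal_states, init_region):
    # Wrap a Q-learning agent with one option per subgoal state.
    return build_subgoal_option_agent(
        mdp, subgoal_states, init_region,
        agent=QLearningAgent,
        n_trajs=50, n_steps=100,  # rollout budget forwarded to make_subgoal_options
        policy='vi',              # policy-construction method forwarded to make_subgoal_options
        name='-subgoal-op')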
def build_point_option_agent(mdp, pairs, agent=QLearningAgent, policy='vi', name='-abstr'):
    '''
    Args:
        mdp (MDP)
        pairs (list): List of pairs. Each pair is composed of two lists:
            the first holds the option's initiation states, the second its termination states.
        agent: Agent class to wrap (defaults to QLearningAgent).

    Returns:
        (AbstractionWrapper): Agent equipped with one point option per pair.
    '''
    goal_based_options = aa_helpers.make_point_options(mdp, pairs, policy=policy)
    goal_based_aa = ActionAbstraction(prim_actions=mdp.get_actions(), options=goal_based_options, use_prims=True)

    # num_feats = mdp.get_num_state_feats()
    option_agent = AbstractionWrapper(
        agent,
        agent_params={"actions": mdp.get_actions()},
        action_abstr=goal_based_aa,
        name_ext=name)
    # Alternatives:
    # option_agent = AbstractionWrapper(LinearQAgent, agent_params={"actions": mdp.get_actions(), "num_features": num_feats}, action_abstr=goal_based_aa, name_ext=name)
    # option_agent = AbstractionWrapper(QLearningAgent, agent_params={"actions": mdp.get_actions()}, action_abstr=goal_based_aa, name_ext=name)
    return option_agent
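# A minimal usage sketch for build_point_option_agent (not part of the original module).
# It assumes each pair holds the option's initiation states and termination states, as
# described in the docstring above; whether a pair is a tuple or a two-element list is an
# assumption here.
def _example_point_option_agent(mdp, init_states, term_states):
    # One point option: can be initiated from init_states and terminates in term_states.
    pairs = [(init_states, term_states)]
    return build_point_option_agent(mdp, pairs, agent=QLearningAgent, policy='vi', name='-point-op')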
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room")
    ql_agent = QLearningAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Make goal-based option agent.
    goal_based_options = aa_helpers.make_goal_based_options(mdp_distr)
    goal_based_aa = ActionAbstraction(prim_actions=mdp_distr.get_actions(), options=goal_based_options)
    option_agent = AbstractionWrapper(QLearningAgent, agent_params={"actions": mdp_distr.get_actions()}, action_abstr=goal_based_aa)

    # Run experiment and make plot.
    run_agents_lifelong([ql_agent, rand_agent, option_agent], mdp_distr, samples=10, episodes=100, steps=150, open_plot=open_plot)
def build_online_subgoal_option_agent(mdp, agent=QLearningAgent, n_ops=4, freqs=100, op_n_episodes=10, op_n_steps=10, method='eigen', name='-online-op'):
    '''
    Args:
        mdp (MDP)
        agent: Agent class to wrap (defaults to QLearningAgent).
        n_ops (int): Number of options to generate online.
        freqs (int): Frequency parameter passed to OnlineAbstractionWrapper.
        op_n_episodes (int)
        op_n_steps (int)
        method (str): Option generation method (e.g. 'eigen').

    Returns:
        (OnlineAbstractionWrapper): Agent that generates options online.
    '''
    goal_based_aa = ActionAbstraction(prim_actions=mdp.get_actions(), use_prims=True)

    option_agent = OnlineAbstractionWrapper(
        agent,
        agent_params={"actions": mdp.get_actions()},
        action_abstr=goal_based_aa,
        name_ext=name,
        n_ops=n_ops,
        freqs=freqs,
        op_n_episodes=op_n_episodes,
        op_n_steps=op_n_steps,
        method=method,
        mdp=mdp)
    return option_agent
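# A minimal usage sketch for build_online_subgoal_option_agent (not part of the original
# module). The interpretation of `freqs` as a regeneration interval is inferred from the
# parameter name and is an assumption; all values below are illustrative.
def _example_online_option_agent(mdp):
    return build_online_subgoal_option_agent(
        mdp,
        agent=QLearningAgent,
        n_ops=4,              # number of options to generate online
        freqs=100,            # presumed regeneration frequency (see note above)
        method='eigen',       # option generation method forwarded to OnlineAbstractionWrapper
        name='-online-eigen')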
def __init__(self, ground_mdp, state_abstr=None, action_abstr=None, sample_rate=10, delta=0.001, max_iterations=1000):
    '''
    Args:
        ground_mdp (MDP)
        state_abstr (StateAbstraction)
        action_abstr (ActionAbstraction)
        sample_rate (int)
        delta (float)
        max_iterations (int)
    '''
    self.ground_mdp = ground_mdp
    self.state_abstr = state_abstr if state_abstr not in [[], None] else StateAbstraction()
    self.action_abstr = action_abstr if action_abstr not in [[], None] else ActionAbstraction(prim_actions=ground_mdp.get_actions())

    # Plan in the abstract MDP induced by the state/action abstractions.
    mdp = make_abstr_mdp(ground_mdp, self.state_abstr, self.action_abstr)
    ValueIteration.__init__(self, mdp, sample_rate=sample_rate, delta=delta, max_iterations=max_iterations)

    # self.delta = delta
    # self.max_iterations = max_iterations
    # self.sample_rate = sample_rate
    # self.value_func = defaultdict(float)
    # self.reachability_done = False
    # self.has_run_vi = False
    # self._compute_reachable_state_space()

# def get_num_states(self):
#     return len(self.states)

# def get_states(self):
#     if self.reachability_done:
#         return self.states
#     else:
#         self._compute_reachable_state_space()
#         return self.states

# def _compute_reachable_state_space(self):
#     '''
#     Summary:
#         Starting with @self.start_state, determines all reachable states
#         and stores their abstracted counterparts in self.states.
#     '''
#     state_queue = Queue.Queue()
#     s_g_init = self.mdp.get_init_state()
#     s_a_init = self.state_abstr.phi(s_g_init)
#     state_queue.put(s_g_init)
#     self.states.add(s_a_init)
#     ground_t = self.mdp.get_transition_func()
#
#     while not state_queue.empty():
#         ground_state = state_queue.get()
#         for option in self.action_abstr.get_active_options(ground_state):
#             # For each active option.
#             # Take @sample_rate samples to estimate E[V]
#             for samples in xrange(self.sample_rate):
#                 next_g_state = option.act_until_terminal(ground_state, ground_t)
#                 if next_g_state not in self.states:
#                     next_a_state = self.state_abstr.phi(next_g_state)
#                     self.states.add(next_a_state)
#                     state_queue.put(next_g_state)
#
#     self.reachability_done = True

# def plan(self, ground_state=None, horizon=100):
#     '''
#     Args:
#         ground_state (State)
#         horizon (int)
#
#     Returns:
#         (tuple):
#             (list): List of primitive actions taken.
#             (list): List of ground states.
#             (list): List of abstract actions taken.
#     '''
#     ground_state = self.mdp.get_init_state() if ground_state is None else ground_state
#
#     if self.has_run_vi is False:
#         print "Warning: VI has not been run. Plan will be random."
#
#     primitive_action_seq = []
#     abstr_action_seq = []
#     state_seq = [ground_state]
#     steps = 0
#
#     ground_t = self.transition_func
#
#     # Until terminating condition is met.
#     while (not ground_state.is_terminal()) and steps < horizon:
#         # Compute best action, roll it out.
#         next_option = self._get_max_q_action(ground_state)
#         while not next_option.is_term_true(ground_state):
#             # Keep applying option until it terminates.
#             abstr_state = self.state_abstr.phi(ground_state)
#             ground_action = next_option.act(ground_state)
#             ground_state = ground_t(ground_state, ground_action)
#             steps += 1
#             primitive_action_seq.append(ground_action)
#             state_seq.append(ground_state)
#         abstr_action_seq.append(next_option)
#
#     return primitive_action_seq, state_seq, abstr_action_seq

# def run_vi(self):
#     '''
#     Summary:
#         Runs ValueIteration and fills in the self.value_func.
#     '''
#     # Algorithm bookkeeping params.
#     iterations = 0
#     max_diff = float("inf")
#
#     # Main loop.
#     while max_diff > self.delta and iterations < self.max_iterations:
#         max_diff = 0
#         for s_g in self.get_states():
#             if s_g.is_terminal():
#                 continue
#
#             max_q = float("-inf")
#             for a in self.action_abstr.get_active_options(s_g):
#                 # For each active option, compute its q value.
#                 q_s_a = self.get_q_value(s_g, a)
#                 max_q = q_s_a if q_s_a > max_q else max_q
#
#             # Check terminating condition.
#             max_diff = max(abs(self.value_func[s_g] - max_q), max_diff)
#
#             # Update value.
#             self.value_func[s_g] = max_q
#         iterations += 1
#
#     value_of_init_state = self._compute_max_qval_action_pair(self.init_state)[0]
#     self.has_run_vi = True
#
#     return iterations, value_of_init_state

# def get_q_value(self, s_g, option):
#     '''
#     Args:
#         s (State)
#         a (Option): Assumed active option.
#
#     Returns:
#         (float): The Q estimate given the current value function @self.value_func.
#     '''
#     # Take samples and track next state counts.
#     next_state_counts = defaultdict(int)
#     reward_total = 0
#     for samples in xrange(self.sample_rate):  # Take @sample_rate samples to estimate E[V]
#         next_state, reward, num_steps = self.do_rollout(option, s_g)
#         next_state_counts[next_state] += 1
#         reward_total += reward
#
#     # Compute T(s' | s, option) estimate based on MLE and R(s, option).
#     next_state_probs = defaultdict(float)
#     avg_reward = 0
#     for state in next_state_counts:
#         next_state_probs[state] = float(next_state_counts[state]) / self.sample_rate
#     avg_reward = float(reward_total) / self.sample_rate
#
#     # Compute expected value.
#     expected_future_val = 0
#     for state in next_state_probs:
#         expected_future_val += next_state_probs[state] * self.value_func[state]
#
#     return avg_reward + self.gamma*expected_future_val

# def do_rollout(self, option, ground_state):
#     '''
#     Args:
#         option (Option)
#         ground_state (State)
#
#     Returns:
#         (tuple):
#             (State): Next ground state.
#             (float): Reward.
#             (int): Number of steps taken.
#     '''
#     ground_t = self.mdp.get_transition_func()
#     ground_r = self.mdp.get_reward_func()
#
#     if type(option) is str:
#         ground_action = option
#     else:
#         ground_action = option.act(ground_state)
#     total_reward = ground_r(ground_state, ground_action)
#     ground_state = ground_t(ground_state, ground_action)
#     total_steps = 1
#     while type(option) is not str and not option.is_term_true(ground_state):
#         # Keep applying option until it terminates.
#         ground_action = option.act(ground_state)
#         total_reward += ground_r(ground_state, ground_action)
#         ground_state = ground_t(ground_state, ground_action)
#         total_steps += 1
#
#     return ground_state, total_reward, total_steps

# def _compute_max_qval_action_pair(self, state):
#     '''
#     Args:
#         state (State)
#
#     Returns:
#         (tuple) --> (float, str): where the float is the Qval, str is the action.
#     '''
#     # Grab random initial action in case all equal
#     max_q_val = float("-inf")
#     shuffled_option_list = self.action_abstr.get_active_options(state)[:]
#     if len(shuffled_option_list) == 0:
#         # Prims on failure.
#         shuffled_option_list = self.mdp.get_actions()
#     random.shuffle(shuffled_option_list)
#     best_action = shuffled_option_list[0]
#
#     # Find best action (action w/ current max predicted Q value)
#     for option in shuffled_option_list:
#         q_s_a = self.get_q_value(state, option)
#         if q_s_a > max_q_val:
#             max_q_val = q_s_a
#             best_action = option
#
#     return max_q_val, best_action

# def _get_max_q_action(self, state):
#     '''
#     Args:
#         state (State)
#
#     Returns:
#         (str): denoting the action with the max q value in the given @state.
#     '''
#     return self._compute_max_qval_action_pair(state)[1]

# def policy(self, state):
#     '''
#     Args:
#         state (State)
#
#     Returns:
#         (str): Action
#
#     Summary:
#         For use in a FixedPolicyAgent.
#     '''
#     return self._get_max_q_action(state)

# def main():
#     # MDP Setting.
#     multi_task = False
#     mdp_class = "grid"
#
#     # Make single/multi task environment.
#     environment = make_mdp.make_mdp_distr(mdp_class=mdp_class, num_mdps=3, horizon=30) if multi_task else make_mdp.make_mdp(mdp_class=mdp_class)
#     actions = environment.get_actions()
#     gamma = environment.get_gamma()
#
#     directed_sa, directed_aa = ae.get_abstractions(environment, directed=True)
#     default_sa, default_aa = ae.get_sa(environment, default=True), ae.get_aa(environment, default=True)
#
#     vi = ValueIteration(environment)
#     avi = AbstractValueIteration(environment, state_abstr=default_sa, action_abstr=default_aa)
#
#     a_num_iters, a_val = avi.run_vi()
#     g_num_iters, g_val = vi.run_vi()
#
#     print "a", a_num_iters, a_val
#     print "g", g_num_iters, g_val

# if __name__ == "__main__":
#     main()
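# A minimal usage sketch for the planner defined above (not part of the original module).
# It assumes the enclosing class is AbstractValueIteration(ValueIteration), as referenced
# in the commented-out main() above, so run_vi() here is the method inherited from
# ValueIteration and returns (num_iterations, value_of_init_state) as used in that main().
def _example_abstract_value_iteration(ground_mdp, state_abstr=None, action_abstr=None):
    # With no abstractions given, __init__ falls back to a default StateAbstraction()
    # and an ActionAbstraction over the ground MDP's primitive actions.
    avi = AbstractValueIteration(ground_mdp, state_abstr=state_abstr, action_abstr=action_abstr)
    num_iters, init_val = avi.run_vi()
    return num_iters, init_val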