Example #1
def build_subgoal_option_agent(mdp,
                               subgoals,
                               init_region,
                               agent=QLearningAgent,
                               vectors=None,
                               name='-abstr',
                               n_trajs=50,
                               n_steps=100,
                               classifier='list',
                               policy='vi'):
    # print('subgoals=', subgoals)
    goal_based_options = aa_helpers.make_subgoal_options(mdp,
                                                         subgoals,
                                                         init_region,
                                                         vectors=vectors,
                                                         n_trajs=n_trajs,
                                                         n_steps=n_steps,
                                                         classifier=classifier,
                                                         policy=policy)
    goal_based_aa = ActionAbstraction(prim_actions=mdp.get_actions(),
                                      options=goal_based_options,
                                      use_prims=True)

    # num_feats = mdp.get_num_state_feats()
    option_agent = AbstractionWrapper(
        agent,
        agent_params={"actions": mdp.get_actions()},
        action_abstr=goal_based_aa,
        name_ext=name)

    return option_agent
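For context, a minimal usage sketch, assuming this module is importable alongside simple_rl's GridWorldMDP and run_agents_on_mdp; the subgoals and init_region values are placeholders whose exact format is dictated by aa_helpers.make_subgoal_options.

from simple_rl.tasks import GridWorldMDP
from simple_rl.run_experiments import run_agents_on_mdp

# Hypothetical setup: a small grid world and two hand-picked subgoal locations.
mdp = GridWorldMDP(width=9, height=9, init_loc=(1, 1), goal_locs=[(9, 9)])
subgoals = [(5, 5), (1, 9)]            # placeholder subgoals
init_region = [mdp.get_init_state()]   # placeholder initiation region

subgoal_agent = build_subgoal_option_agent(mdp, subgoals, init_region, name='-subgoal')
run_agents_on_mdp([subgoal_agent], mdp, instances=5, episodes=100, steps=150)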
Example #2
def build_point_option_agent(mdp,
                             pairs,
                             agent=QLearningAgent,
                             policy='vi',
                             name='-abstr'):
    # pairs should be a list of pairs.
    # Each pair is composed of two lists:
    # one holding the initiation states, the other the termination states.
    # (See the illustrative sketch after this function.)
    goal_based_options = aa_helpers.make_point_options(mdp,
                                                       pairs,
                                                       policy=policy)
    goal_based_aa = ActionAbstraction(prim_actions=mdp.get_actions(),
                                      options=goal_based_options,
                                      use_prims=True)

    # num_feats = mdp.get_num_state_feats()
    option_agent = AbstractionWrapper(
        agent,
        agent_params={"actions": mdp.get_actions()},
        action_abstr=goal_based_aa,
        name_ext=name)
    # option_agent = AbstractionWrapper(LinearQAgent, agent_params={"actions":mdp.get_actions(), "num_features":num_feats}, action_abstr=goal_based_aa, name_ext=name)
    # option_agent = AbstractionWrapper(QLearningAgent, agent_params={"actions":mdp.get_actions()}, action_abstr=goal_based_aa, name_ext=name)

    return option_agent
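To make the expected pairs structure concrete, here is a hypothetical call; s_a through s_d stand in for State objects of the given MDP, and whether each pair is a tuple or a list is up to aa_helpers.make_point_options.

# Each pair holds (initiation states, termination states); the states are placeholders.
pairs = [([s_a], [s_b]),   # option 1: initiate at s_a, terminate at s_b
         ([s_c], [s_d])]   # option 2: initiate at s_c, terminate at s_d

point_agent = build_point_option_agent(mdp, pairs, name='-point')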
def main(open_plot=True):
    # Setup MDP, Agents.
    mdp_distr = make_mdp.make_mdp_distr(mdp_class="four_room")
    ql_agent = QLearningAgent(actions=mdp_distr.get_actions())
    rand_agent = RandomAgent(actions=mdp_distr.get_actions())

    # Make goal-based option agent.
    goal_based_options = aa_helpers.make_goal_based_options(mdp_distr)
    goal_based_aa = ActionAbstraction(prim_actions=mdp_distr.get_actions(),
                                      options=goal_based_options)
    option_agent = AbstractionWrapper(QLearningAgent,
                                      agent_params={"actions": mdp_distr.get_actions()},
                                      action_abstr=goal_based_aa)

    # Run experiment and make plot.
    run_agents_lifelong([ql_agent, rand_agent, option_agent],
                        mdp_distr,
                        samples=10,
                        episodes=100,
                        steps=150,
                        open_plot=open_plot)
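One way to wire main() up as a script, following the entry-point convention used by other simple_rl examples (a sketch; passing "no_plot" as the last command-line argument suppresses the plot window):

import sys

if __name__ == "__main__":
    main(open_plot=not sys.argv[-1] == "no_plot")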
Example #4
def build_online_subgoal_option_agent(mdp,
                                      agent=QLearningAgent,
                                      n_ops=4,
                                      freqs=100,
                                      op_n_episodes=10,
                                      op_n_steps=10,
                                      method='eigen',
                                      name='-online-op'):
    goal_based_aa = ActionAbstraction(prim_actions=mdp.get_actions(),
                                      use_prims=True)

    option_agent = OnlineAbstractionWrapper(
        agent,
        agent_params={"actions": mdp.get_actions()},
        action_abstr=goal_based_aa,
        name_ext=name,
        n_ops=n_ops,
        freqs=freqs,
        op_n_episodes=op_n_episodes,
        op_n_steps=op_n_steps,
        method=method,
        mdp=mdp)

    return option_agent
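A hedged usage sketch for the online variant, again assuming simple_rl's GridWorldMDP and run_agents_on_mdp; the option-discovery hyperparameters are illustrative and their exact semantics are defined by OnlineAbstractionWrapper.

from simple_rl.tasks import GridWorldMDP
from simple_rl.run_experiments import run_agents_on_mdp

mdp = GridWorldMDP(width=11, height=11, init_loc=(1, 1), goal_locs=[(11, 11)])

# Discover n_ops options online with the 'eigen' method; freqs, op_n_episodes and
# op_n_steps keep their defaults from the builder above.
online_agent = build_online_subgoal_option_agent(mdp, n_ops=4, method='eigen')
run_agents_on_mdp([online_agent], mdp, instances=3, episodes=200, steps=100)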
class AbstractValueIteration(ValueIteration):
    ''' Runs value iteration over an abstract MDP induced by a state abstraction and an action abstraction. '''

    def __init__(self,
                 ground_mdp,
                 state_abstr=None,
                 action_abstr=None,
                 sample_rate=10,
                 delta=0.001,
                 max_iterations=1000):
        '''
        Args:
            ground_mdp (MDP)
            state_abstr (StateAbstraction)
            action_abstr (ActionAbstraction)
            sample_rate (int): Number of samples per (state, option) pair used to estimate the abstract model.
            delta (float): Convergence threshold for value iteration.
            max_iterations (int)
        '''
        self.ground_mdp = ground_mdp
        self.state_abstr = state_abstr if state_abstr not in [[], None] else StateAbstraction()
        self.action_abstr = action_abstr if action_abstr not in [[], None] else ActionAbstraction(prim_actions=ground_mdp.get_actions())

        mdp = make_abstr_mdp(ground_mdp, self.state_abstr, self.action_abstr)

        ValueIteration.__init__(self, mdp, sample_rate, delta, max_iterations)


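A minimal usage sketch that mirrors the commented-out main() below; make_mdp and the ae (abstraction helpers) module are assumed to be importable as in that code, and run_vi is inherited from ValueIteration.

environment = make_mdp.make_mdp(mdp_class="grid")
default_sa = ae.get_sa(environment, default=True)   # default state abstraction
default_aa = ae.get_aa(environment, default=True)   # default action abstraction

avi = AbstractValueIteration(environment, state_abstr=default_sa, action_abstr=default_aa)
num_iters, init_val = avi.run_vi()                  # inherited from ValueIteration
print("iterations:", num_iters, "value of init state:", init_val)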
#         self.delta = delta
#         self.max_iterations = max_iterations
#         self.sample_rate = sample_rate

#         self.value_func = defaultdict(float)
#         self.reachability_done = False
#         self.has_run_vi = False
#         self._compute_reachable_state_space()

#     def get_num_states(self):
#         return len(self.states)

#     def get_states(self):
#         if self.reachability_done:
#             return self.states
#         else:
#             self._compute_reachable_state_space()
#             return self.states

#     def _compute_reachable_state_space(self):
#         '''
#         Summary:
#             Starting with @self.start_state, determines all reachable states
#             and stores their abstracted counterparts in self.states.
#         '''
#         state_queue = Queue.Queue()
#         s_g_init = self.mdp.get_init_state()
#         s_a_init = self.state_abstr.phi(s_g_init)
#         state_queue.put(s_g_init)
#         self.states.add(s_a_init)
#         ground_t = self.mdp.get_transition_func()

#         while not state_queue.empty():
#             ground_state = state_queue.get()
#             for option in self.action_abstr.get_active_options(ground_state):
#                 # For each active option.

#                 # Take @sample_rate samples to estimate E[V]
#                 for samples in xrange(self.sample_rate):

#                     next_g_state = option.act_until_terminal(ground_state, ground_t)

#                     if next_g_state not in self.states:
#                         next_a_state = self.state_abstr.phi(next_g_state)
#                         self.states.add(next_a_state)
#                         state_queue.put(next_g_state)

#         self.reachability_done = True

#     def plan(self, ground_state=None, horizon=100):
#         '''
#         Args:
#             ground_state (State)
#             horizon (int)

#         Returns:
#             (tuple):
#                 (list): List of primitive actions taken.
#                 (list): List of ground states.
#                 (list): List of abstract actions taken.
#         '''

#         ground_state = self.mdp.get_init_state() if ground_state is None else ground_state

#         if self.has_run_vi is False:
#             print "Warning: VI has not been run. Plan will be random."

#         primitive_action_seq = []
#         abstr_action_seq = []
#         state_seq = [ground_state]
#         steps = 0

#         ground_t = self.transition_func

#         # Until terminating condition is met.
#         while (not ground_state.is_terminal()) and steps < horizon:

#             # Compute best action, roll it out.
#             next_option = self._get_max_q_action(ground_state)

#             while not next_option.is_term_true(ground_state):
#                 # Keep applying option until it terminates.
#                 abstr_state = self.state_abstr.phi(ground_state)
#                 ground_action = next_option.act(ground_state)
#                 ground_state = ground_t(ground_state, ground_action)
#                 steps += 1
#                 primitive_action_seq.append(ground_action)

#                 state_seq.append(ground_state)

#             abstr_action_seq.append(next_option)

#         return primitive_action_seq, state_seq, abstr_action_seq

#     def run_vi(self):
#         '''
#         Summary:
#             Runs ValueIteration and fills in the self.value_func.
#         '''
#         # Algorithm bookkeeping params.
#         iterations = 0
#         max_diff = float("inf")

#         # Main loop.
#         while max_diff > self.delta and iterations < self.max_iterations:
#             max_diff = 0
#             for s_g in self.get_states():
#                 if s_g.is_terminal():
#                     continue

#                 max_q = float("-inf")
#                 for a in self.action_abstr.get_active_options(s_g):
#                     # For each active option, compute its q value.
#                     q_s_a = self.get_q_value(s_g, a)
#                     max_q = q_s_a if q_s_a > max_q else max_q

#                 # Check terminating condition.
#                 max_diff = max(abs(self.value_func[s_g] - max_q), max_diff)

#                 # Update value.
#                 self.value_func[s_g] = max_q

#             iterations += 1

#         value_of_init_state = self._compute_max_qval_action_pair(self.init_state)[0]

#         self.has_run_vi = True

#         return iterations, value_of_init_state

#     def get_q_value(self, s_g, option):
#         '''
#         Args:
#             s (State)
#             a (Option): Assumed active option.

#         Returns:
#             (float): The Q estimate given the current value function @self.value_func.
#         '''

#         # Take samples and track next state counts.
#         next_state_counts = defaultdict(int)
#         reward_total = 0
#         for samples in xrange(self.sample_rate): # Take @sample_rate samples to estimate E[V]
#             next_state, reward, num_steps = self.do_rollout(option, s_g)
#             next_state_counts[next_state] += 1
#             reward_total += reward

#         # Compute T(s' | s, option) estimate based on MLE and R(s, option).
#         next_state_probs = defaultdict(float)
#         avg_reward = 0
#         for state in next_state_counts:
#             next_state_probs[state] = float(next_state_counts[state]) / self.sample_rate

#         avg_reward = float(reward_total) / self.sample_rate

#         # Compute expected value.
#         expected_future_val = 0
#         for state in next_state_probs:
#             expected_future_val += next_state_probs[state] * self.value_func[state]

#         return avg_reward + self.gamma*expected_future_val

#     def do_rollout(self, option, ground_state):
#         '''
#         Args:
#             option (Option)
#             ground_state (State)

#         Returns:
#             (tuple):
#                 (State): Next ground state.
#                 (float): Reward.
#                 (int): Number of steps taken.
#         '''

#         ground_t = self.mdp.get_transition_func()
#         ground_r = self.mdp.get_reward_func()

#         if type(option) is str:
#             ground_action = option
#         else:
#             ground_action = option.act(ground_state)
#         total_reward = ground_r(ground_state, ground_action)
#         ground_state = ground_t(ground_state, ground_action)

#         total_steps = 1
#         while type(option) is not str and not option.is_term_true(ground_state):
#             # Keep applying option until it terminates.
#             ground_action = option.act(ground_state)
#             total_reward += ground_r(ground_state, ground_action)
#             ground_state = ground_t(ground_state, ground_action)
#             total_steps += 1

#         return ground_state, total_reward, total_steps

#     def _compute_max_qval_action_pair(self, state):
#         '''
#         Args:
#             state (State)

#         Returns:
#             (tuple) --> (float, str): where the float is the Qval, str is the action.
#         '''
#         # Grab random initial action in case all equal
#         max_q_val = float("-inf")
#         shuffled_option_list = self.action_abstr.get_active_options(state)[:]
#         if len(shuffled_option_list) == 0:
#             # Prims on failure.
#             shuffled_option_list = self.mdp.get_actions()

#         random.shuffle(shuffled_option_list)
#         best_action = shuffled_option_list[0]

#         # Find best action (action w/ current max predicted Q value)
#         for option in shuffled_option_list:
#             q_s_a = self.get_q_value(state, option)
#             if q_s_a > max_q_val:
#                 max_q_val = q_s_a
#                 best_action = option

#         return max_q_val, best_action

#     def _get_max_q_action(self, state):
#         '''
#         Args:
#             state (State)

#         Returns:
#             (str): denoting the action with the max q value in the given @state.
#         '''
#         return self._compute_max_qval_action_pair(state)[1]

#     def policy(self, state):
#         '''
#         Args:
#             state (State)

#         Returns:
#             (str): Action

#         Summary:
#             For use in a FixedPolicyAgent.
#         '''
#         return self._get_max_q_action(state)

# def main():
#     # MDP Setting.
#     multi_task = False
#     mdp_class = "grid"

#     # Make single/multi task environment.
#     environment = make_mdp.make_mdp_distr(mdp_class=mdp_class, num_mdps=3, horizon=30) if multi_task else make_mdp.make_mdp(mdp_class=mdp_class)
#     actions = environment.get_actions()
#     gamma = environment.get_gamma()

#     directed_sa, directed_aa = ae.get_abstractions(environment, directed=True)
#     default_sa, default_aa = ae.get_sa(environment, default=True), ae.get_aa(environment, default=True)

#     vi = ValueIteration(environment)
#     avi = AbstractValueIteration(environment, state_abstr=default_sa, action_abstr=default_aa)

#     a_num_iters, a_val = avi.run_vi()
#     g_num_iters, g_val = vi.run_vi()

#     print "a", a_num_iters, a_val
#     print "g", g_num_iters, g_val

# if __name__ == "__main__":
#     main()