def compute_sub_opt_func_for_mdp_distr(mdp_distr):
    '''
    Args:
        mdp_distr (MDPDistribution)

    Returns:
        (list): Contains the suboptimality function for each MDP in mdp_distr.
            subopt: V^*(s) - Q^*(s, a)
    '''
    actions = mdp_distr.get_actions()
    sub_opt_funcs = []

    i = 0
    for mdp in mdp_distr.get_mdps():
        print("\t mdp", i + 1, "of", mdp_distr.get_num_mdps())
        vi = ValueIteration(mdp, delta=0.001, max_iterations=1000)
        iters, value = vi.run_vi()

        new_sub_opt_func = defaultdict(float)
        for s in vi.get_states():
            # Compute V^*(s) = max_a Q^*(s, a).
            max_q = float("-inf")
            for a in actions:
                next_q = vi.get_q_value(s, a)
                if next_q > max_q:
                    max_q = next_q

            # Suboptimality of each action: V^*(s) - Q^*(s, a).
            for a in actions:
                new_sub_opt_func[(s, a)] = max_q - vi.get_q_value(s, a)

        sub_opt_funcs.append(new_sub_opt_func)
        i += 1

    return sub_opt_funcs
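# The example below is a minimal usage sketch, not part of the original source.
# It assumes simple_rl's GridWorldMDP and MDPDistribution classes (exact import
# paths may differ by version); the helper name _example_suboptimality is
# hypothetical. Each returned dictionary maps (state, action) pairs to the gap
# V^*(s) - Q^*(s, a), so a value of 0.0 marks an optimal action.
def _example_suboptimality():
    from simple_rl.tasks import GridWorldMDP
    from simple_rl.mdp import MDPDistribution

    # Two goal locations, equally likely.
    mdp_distr = MDPDistribution({GridWorldMDP(goal_locs=[(4, 4)]): 0.5,
                                 GridWorldMDP(goal_locs=[(1, 4)]): 0.5})
    sub_opt_funcs = compute_sub_opt_func_for_mdp_distr(mdp_distr)

    # Suboptimality of each action in the initial state of the first MDP.
    s0 = mdp_distr.get_init_state()
    for a in mdp_distr.get_actions():
        print(s0, a, sub_opt_funcs[0][(s0, a)])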
def make_goal_based_options(mdp_distr):
    '''
    Args:
        mdp_distr (MDPDistribution)

    Returns:
        (set): Contains Option instances.
    '''
    # Collect the terminal (goal) states across all MDPs in the distribution.
    goal_list = set([])
    for mdp in mdp_distr.get_all_mdps():
        vi = ValueIteration(mdp)
        state_space = vi.get_states()
        for s in state_space:
            if s.is_terminal():
                goal_list.add(s)

    # Make one option per MDP: initiate anywhere, terminate at any goal.
    options = set([])
    for mdp in mdp_distr.get_all_mdps():
        init_predicate = Predicate(func=lambda x: True)
        term_predicate = InListPredicate(ls=goal_list)
        o = Option(init_predicate=init_predicate,
                   term_predicate=term_predicate,
                   policy=_make_mini_mdp_option_policy(mdp),
                   term_prob=0.0)
        options.add(o)

    return options
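# The example below is a minimal, self-contained sketch (not part of the
# original source) of the predicate structure built above: the initiation
# predicate accepts every state, while the termination predicate accepts
# exactly the collected goal states. The state names are made up.
def _example_goal_option_predicates():
    goal_list = {"g1", "g2"}

    init_predicate = lambda s: True             # The option can start anywhere...
    term_predicate = lambda s: s in goal_list   # ...and terminates only at a goal.

    assert init_predicate("s3")
    assert term_predicate("g1")
    assert not term_predicate("s3")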
def make_multitask_sa(mdp_distr,
                      state_class=State,
                      indic_func=ind_funcs._q_eps_approx_indicator,
                      epsilon=0.0,
                      aa_single_act=True,
                      track_act_opt_pr=False):
    '''
    Args:
        mdp_distr (MDPDistribution)
        state_class (Class)
        indic_func (S x S --> {0,1})
        epsilon (float)
        aa_single_act (bool): If True, track the optimal actions of each ground state.

    Returns:
        (StateAbstraction)
    '''
    # Build one state abstraction per MDP in the distribution.
    sa_list = []
    for mdp in mdp_distr.get_mdps():
        sa = make_singletask_sa(mdp,
                                indic_func,
                                state_class,
                                epsilon,
                                aa_single_act=aa_single_act,
                                prob_of_mdp=mdp_distr.get_prob_of_mdp(mdp),
                                track_act_opt_pr=track_act_opt_pr)
        sa_list += [sa]

    # Grab the ground state space from the first MDP.
    mdp = mdp_distr.get_all_mdps()[0]
    vi = ValueIteration(mdp)
    ground_states = vi.get_states()

    # Merge the per-task abstractions into a single multitask abstraction.
    multitask_sa = merge_state_abstr(sa_list, ground_states)

    return multitask_sa
def compute_avg_mdp(mdp_distr, sample_rate=5):
    '''
    Args:
        mdp_distr (MDPDistribution)
        sample_rate (int)

    Returns:
        (MDP)
    '''
    # Get normal components.
    init_state = mdp_distr.get_init_state()
    actions = mdp_distr.get_actions()
    gamma = mdp_distr.get_gamma()
    T = mdp_distr.get_all_mdps()[0].get_transition_func()

    # Compute avg reward.
    avg_rew = defaultdict(lambda: defaultdict(float))
    avg_trans_counts = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))  # Stores T_i(s,a,s') * Pr(M_i).
    for mdp in mdp_distr.get_mdps():
        prob_of_mdp = mdp_distr.get_prob_of_mdp(mdp)

        # Get a VI instance to compute the state space.
        vi = ValueIteration(mdp, delta=0.0001, max_iterations=2000, sample_rate=sample_rate)
        iters, value = vi.run_vi()
        states = vi.get_states()

        for s in states:
            for a in actions:
                r = mdp.reward_func(s, a)

                avg_rew[s][a] += prob_of_mdp * r

                for repeat in range(sample_rate):
                    s_prime = mdp.transition_func(s, a)
                    avg_trans_counts[s][a][s_prime] += prob_of_mdp

    avg_trans_probs = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
    for s in avg_trans_counts.keys():
        for a in actions:
            for s_prime in avg_trans_counts[s][a].keys():
                avg_trans_probs[s][a][s_prime] = avg_trans_counts[s][a][s_prime] / sum(avg_trans_counts[s][a].values())

    def avg_rew_func(s, a):
        return avg_rew[s][a]

    # Note: the averaged transition counts are computed above, but the first
    # MDP's transition function is used as the transition function here.
    avg_trans_func = T
    avg_mdp = MDP(actions, avg_trans_func, avg_rew_func, init_state, gamma)

    return avg_mdp
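# The example below is a minimal, self-contained sketch (not part of the
# original source) of the reward averaging performed by compute_avg_mdp: the
# average MDP's reward is the expectation of each task's reward under the task
# distribution, R_avg(s, a) = sum_i Pr(M_i) * R_i(s, a). The states, actions,
# and probabilities are made up.
def _example_reward_averaging():
    from collections import defaultdict

    # Two hypothetical tasks over the same (s, a) pairs, with task probabilities.
    task_rewards = [({("s1", "a1"): 1.0, ("s1", "a2"): 0.0}, 0.75),
                    ({("s1", "a1"): 0.0, ("s1", "a2"): 1.0}, 0.25)]

    avg_rew = defaultdict(float)
    for rew, prob in task_rewards:
        for sa, r in rew.items():
            avg_rew[sa] += prob * r

    # avg_rew[("s1", "a1")] == 0.75 and avg_rew[("s1", "a2")] == 0.25.
    return avg_rew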
def _make_mini_mdp_option_policy(mini_mdp):
    '''
    Args:
        mini_mdp (MDP)

    Returns:
        Policy
    '''
    # Solve the MDP defined by the terminal abstract state.
    mini_mdp_vi = ValueIteration(mini_mdp, delta=0.001, max_iterations=1000, sample_rate=10)
    iters, val = mini_mdp_vi.run_vi()

    o_policy_dict = make_dict_from_lambda(mini_mdp_vi.policy, mini_mdp_vi.get_states())
    o_policy = PolicyFromDict(o_policy_dict)

    return o_policy.get_action
def make_random_sa_stack(mdp_distr, cluster_size_ratio=0.5, max_num_levels=2):
    '''
    Args:
        mdp_distr (MDPDistribution)
        cluster_size_ratio (float): A float in (0,1) that determines the size of the abstract state space.
        max_num_levels (int): Determines the _total_ number of levels in the hierarchy (includes ground).

    Returns:
        (StateAbstractionStack)
    '''
    # Get ground state space.
    vi = ValueIteration(mdp_distr.get_all_mdps()[0], delta=0.0001, max_iterations=5000)
    ground_state_space = vi.get_states()
    sa_stack = StateAbstractionStack(list_of_phi=[])

    # Each iteration adds one level to the stack.
    for i in range(max_num_levels - 1):
        # Grab current state space (at level i).
        cur_state_space = _get_level_i_state_space(ground_state_space, sa_stack, i)
        cur_state_space_size = len(cur_state_space)

        if int(cur_state_space_size / cluster_size_ratio) <= 1:
            # The abstraction is as small as it can get.
            break

        # Add the mapping.
        new_phi = {}
        for s in cur_state_space:
            new_phi[s] = HierarchyState(
                data=random.randint(1, max(int(cur_state_space_size * cluster_size_ratio), 1)),
                level=i + 1)

        if len(set(new_phi.values())) <= 1:
            # The abstraction is as small as it can get.
            break

        # Add the sa to the stack.
        sa_stack.add_phi(new_phi)

    return sa_stack
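# The example below is a minimal, self-contained sketch (not part of the
# original source) of the random clustering used above: each ground state is
# assigned a random cluster id in {1, ..., max(int(N * ratio), 1)}, so each new
# abstraction level has roughly ratio * N abstract states.
def _example_random_clustering():
    import random

    ground_states = ["s{}".format(i) for i in range(10)]
    ratio = 0.5
    num_clusters = max(int(len(ground_states) * ratio), 1)

    # phi maps each of the 10 ground states to one of (at most) 5 clusters.
    phi = {s: random.randint(1, num_clusters) for s in ground_states}
    return phi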
def _make_mini_mdp_option_policy(mini_mdp):
    '''
    Args:
        mini_mdp (MDP)

    Returns:
        (tuple): The option policy and the ValueIteration instance used to compute it.
    '''
    # Solve the MDP defined by the terminal abstract state.
    mini_mdp_vi = ValueIteration(mini_mdp, delta=0.005, max_iterations=1000, sample_rate=30)
    iters, val = mini_mdp_vi.run_vi()

    o_policy_dict = make_dict_from_lambda(mini_mdp_vi.policy, mini_mdp_vi.get_states())
    o_policy = PolicyFromDict(o_policy_dict)

    return o_policy.get_action, mini_mdp_vi
def __init__(self,
             ground_mdp,
             state_abstr=None,
             action_abstr=None,
             vi_sample_rate=5,
             max_iterations=1000,
             amdp_sample_rate=5,
             delta=0.001):
    '''
    Args:
        ground_mdp (simple_rl.MDP)
        state_abstr (simple_rl.StateAbstraction)
        action_abstr (simple_rl.ActionAbstraction)
        vi_sample_rate (int): Number of samples per transition for running VI.
        max_iterations (int): Usual VI iteration bound.
        amdp_sample_rate (int): Number of samples per abstract transition to use for computing R_abstract, T_abstract.
        delta (float): VI convergence threshold.
    '''
    self.ground_mdp = ground_mdp

    # Grab ground state space.
    vi = ValueIteration(self.ground_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    state_space = vi.get_states()

    # Make the abstract MDP.
    self.state_abstr = state_abstr if state_abstr is not None else StateAbstraction(ground_state_space=state_space)
    self.action_abstr = action_abstr if action_abstr is not None else ActionAbstraction(prim_actions=ground_mdp.get_actions())
    abstr_mdp = abstr_mdp_funcs.make_abstr_mdp(ground_mdp,
                                               self.state_abstr,
                                               self.action_abstr,
                                               step_cost=0.0,
                                               sample_rate=amdp_sample_rate)

    # Create VI with the abstract MDP.
    ValueIteration.__init__(self, abstr_mdp, vi_sample_rate, delta, max_iterations)
def get_distance(mdp, epsilon=0.05):
    '''
    Args:
        mdp (MDP)
        epsilon (float)

    Returns:
        (tuple): A state-to-index map, an index-to-state map, and a |S| x |S| distance matrix.
    '''
    vi = ValueIteration(mdp)
    vi.run_vi()
    vstar = vi.value_func  # Dictionary of state -> float.
    states = vi.get_states()  # List of states.

    distance = defaultdict(lambda: defaultdict(float))

    v_df = ValueIterationDist(mdp, vstar)
    v_df.run_vi()
    d_to_s = v_df.distance

    # Initialize each pairwise distance from the unconstrained run.
    for t in states:
        for s in states:
            distance[t][s] = max(d_to_s[t] - 1, 0)

    # Tighten the distance to each state s by fixing its value to V^*(s).
    for s in states:
        vis = ValueIterationDist(mdp, vstar)
        vis.add_fixed_val(s, vstar[s])
        vis.run_vi()
        d_to_s = vis.distance
        for t in states:
            distance[t][s] = min(d_to_s[t], distance[t][s])

    sToInd = OrderedDict()
    indToS = OrderedDict()
    for i, s in enumerate(states):
        sToInd[s] = i
        indToS[i] = s

    d = np.zeros((len(states), len(states)), dtype=int)
    for s in states:
        for t in states:
            d[sToInd[s]][sToInd[t]] = distance[s][t]

    return sToInd, indToS, d
def compute_optimal_stoch_policy(mdp_distr):
    '''
    Args:
        mdp_distr (MDPDistribution)

    Returns:
        (lambda)
    '''
    # Key: state
    # Val: dict
    #   Key: action
    #   Val: probability
    policy_dict = defaultdict(lambda: defaultdict(float))

    # Compute optimal policy for each MDP.
    for mdp in mdp_distr.get_all_mdps():
        # Solve the MDP and get the optimal policy.
        vi = ValueIteration(mdp, delta=0.001, max_iterations=1000)
        iters, value = vi.run_vi()
        vi_policy = vi.policy
        states = vi.get_states()

        # Compute the probability each action is optimal in each state.
        prob_of_mdp = mdp_distr.get_prob_of_mdp(mdp)
        for s in states:
            a_star = vi_policy(s)
            policy_dict[s][a_star] += prob_of_mdp

    # Create the lambda.
    def policy_from_dict(state):
        sampled = np.random.multinomial(1, list(policy_dict[state].values())).tolist()
        action_id = sampled.index(1)
        action = list(policy_dict[state].keys())[action_id]
        return action

    return policy_from_dict
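# The example below is a minimal, self-contained sketch (not part of the
# original source) of the sampling step in policy_from_dict: given per-state
# action probabilities, one action is drawn with np.random.multinomial. The
# action names and probabilities are made up.
def _example_stoch_action_sample():
    import numpy as np

    action_probs = {"up": 0.7, "left": 0.3}  # Hypothetical optimal-action mass.
    draw = np.random.multinomial(1, list(action_probs.values())).tolist()
    action = list(action_probs.keys())[draw.index(1)]

    # 'action' is "up" with probability 0.7 and "left" with probability 0.3.
    return action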
def main():
    # Setup environment.
    mdp_class, agent_type, samples = parse_args()
    is_goal_terminal = False
    mdp_distr = make_mdp_distr(mdp_class=mdp_class, is_goal_terminal=is_goal_terminal)
    mdp_distr.set_gamma(0.99)
    actions = mdp_distr.get_actions()

    # Compute priors.

    # Stochastic mixture.
    mdp_distr_copy = copy.deepcopy(mdp_distr)
    opt_stoch_policy = ape.compute_optimal_stoch_policy(mdp_distr_copy)

    # Avg MDP.
    avg_mdp = ape.compute_avg_mdp(mdp_distr)
    avg_mdp_vi = ValueIteration(avg_mdp, delta=0.001, max_iterations=1000, sample_rate=5)
    iters, value = avg_mdp_vi.run_vi()

    # Make agents.

    # Q Learning.
    ql_agent = QLearnerAgent(actions)
    shaped_ql_agent_prior = ShapedQAgent(shaping_policy=opt_stoch_policy, actions=actions, name="Prior-QLearning")
    shaped_ql_agent_avgmdp = ShapedQAgent(shaping_policy=avg_mdp_vi.policy, actions=actions, name="AvgMDP-QLearning")

    # RMax.
    rmax_agent = RMaxAgent(actions)
    shaped_rmax_agent_prior = ShapedRMaxAgent(shaping_policy=opt_stoch_policy,
                                              state_space=avg_mdp_vi.get_states(),
                                              actions=actions,
                                              name="Prior-RMax")
    shaped_rmax_agent_avgmdp = ShapedRMaxAgent(shaping_policy=avg_mdp_vi.policy,
                                               state_space=avg_mdp_vi.get_states(),
                                               actions=actions,
                                               name="AvgMDP-RMax")
    prune_rmax_agent = PruneRMaxAgent(mdp_distr=mdp_distr)

    if agent_type == "rmax":
        agents = [rmax_agent, shaped_rmax_agent_prior, shaped_rmax_agent_avgmdp, prune_rmax_agent]
    else:
        agents = [ql_agent, shaped_ql_agent_prior, shaped_ql_agent_avgmdp]

    # Run task.
    run_agents_multi_task(agents,
                          mdp_distr,
                          task_samples=samples,
                          episodes=1,
                          steps=200,
                          is_rec_disc_reward=False,
                          verbose=True)
def make_singletask_sa(mdp, indic_func, state_class, epsilon=0.0, aa_single_act=False, prob_of_mdp=1.0, track_act_opt_pr=False):
    '''
    Args:
        mdp (MDP)
        indic_func (S x S --> {0,1})
        state_class (Class)
        epsilon (float)
        aa_single_act (bool)
        prob_of_mdp (float)
        track_act_opt_pr (bool)

    Returns:
        (StateAbstraction)
    '''
    print("\tRunning VI...",)
    sys.stdout.flush()

    # Run VI.
    if isinstance(mdp, MDPDistribution):
        mdp = mdp.sample()

    vi = ValueIteration(mdp)
    iters, val = vi.run_vi()

    print(" done.")
    print("\tMaking state abstraction...",)
    sys.stdout.flush()
    sa = StateAbstraction(phi={}, state_class=state_class, track_act_opt_pr=track_act_opt_pr)
    clusters = defaultdict(list)
    num_states = len(vi.get_states())

    actions = mdp.get_actions()

    # Find state pairs that satisfy the indicator condition.
    for i, state_x in enumerate(vi.get_states()):
        sys.stdout.flush()
        clusters[state_x] = [state_x]

        for state_y in vi.get_states()[i:]:
            if not (state_x == state_y) and indic_func(state_x, state_y, vi, actions, epsilon=epsilon):
                clusters[state_x].append(state_y)
                clusters[state_y].append(state_x)

    print("making clusters...",)
    sys.stdout.flush()

    # Build SA. Iterate over a snapshot of the keys since entries are popped below.
    for i, state in enumerate(list(clusters.keys())):
        new_cluster = clusters[state]
        sa.make_cluster(new_cluster)

        # Destroy old so we don't double up.
        for s in clusters[state]:
            if s in clusters.keys():
                clusters.pop(s)

    if aa_single_act:
        # Put all optimal actions in a set associated with the ground state.
        for ground_s in sa.get_ground_states():
            a_star_set = set(vi.get_max_q_actions(ground_s))
            sa.set_actions_state_opt_dict(ground_s, a_star_set, prob_of_mdp)

    print(" done.")
    print("\tGround States:", num_states)
    print("\tAbstract:", sa.get_num_abstr_states())
    print()

    return sa
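# The example below is a minimal sketch (not part of the original source) of
# the kind of indicator function make_singletask_sa expects: it receives two
# states, the solved ValueIteration instance, the action set, and an epsilon,
# and returns True when the two states should share an abstract cluster. The
# name _toy_q_eps_indicator is hypothetical; the library's own indicators live
# in ind_funcs (e.g. ind_funcs._q_eps_approx_indicator).
def _toy_q_eps_indicator(state_x, state_y, vi, actions, epsilon=0.0):
    # Cluster two states when all of their Q-values agree to within epsilon.
    return all(abs(vi.get_q_value(state_x, a) - vi.get_q_value(state_y, a)) <= epsilon
               for a in actions)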
class BeliefUpdater(object):
    ''' Wrapper class for different methods for belief state updates in POMDPs. '''

    def __init__(self, mdp, transition_func, reward_func, observation_func, updater_type='discrete'):
        '''
        Args:
            mdp (POMDP)
            transition_func: T(s, a) --> s'
            reward_func: R(s, a) --> float
            observation_func: O(s, a) --> z
            updater_type (str)
        '''
        self.reward_func = reward_func
        self.updater_type = updater_type

        # We use the ValueIteration class to construct the transition and observation probabilities.
        self.vi = ValueIteration(mdp, sample_rate=500)

        self.transition_probs = self.construct_transition_matrix(transition_func)
        self.observation_probs = self.construct_observation_matrix(observation_func, transition_func)

        if updater_type == 'discrete':
            self.updater = self.discrete_filter_updater
        elif updater_type == 'kalman':
            self.updater = self.kalman_filter_updater
        elif updater_type == 'particle':
            self.updater = self.particle_filter_updater
        else:
            raise AttributeError('updater_type {} did not conform to expected type'.format(updater_type))

    def discrete_filter_updater(self, belief, action, observation):
        def _compute_normalization_factor(bel):
            return sum(bel.values())

        def _update_belief_for_state(b, sp, T, O, a, z):
            return O[sp][z] * sum([T[s][a][sp] * b[s] for s in b])

        new_belief = defaultdict()
        for sprime in belief:
            new_belief[sprime] = _update_belief_for_state(belief, sprime, self.transition_probs,
                                                          self.observation_probs, action, observation)

        normalization = _compute_normalization_factor(new_belief)

        for sprime in belief:
            if normalization > 0:
                new_belief[sprime] /= normalization

        return new_belief

    def kalman_filter_updater(self, belief, action, observation):
        pass

    def particle_filter_updater(self, belief, action, observation):
        pass

    def construct_transition_matrix(self, transition_func):
        '''
        Create an MLE of the transition probabilities by sampling from the transition_func
        multiple times.

        Args:
            transition_func: T(s, a) -> s'

        Returns:
            transition_probabilities (defaultdict): T(s, a, s') --> float
        '''
        self.vi._compute_matrix_from_trans_func()
        return self.vi.trans_dict

    def construct_observation_matrix(self, observation_func, transition_func):
        '''
        Create an MLE of the observation probabilities by sampling from the observation_func
        multiple times.

        Args:
            observation_func: O(s, a) -> z
            transition_func: T(s, a) -> s'

        Returns:
            observation_probabilities (defaultdict): O(s, z) --> float
        '''
        def normalize_probabilities(odict):
            norm_factor = sum(odict.values())
            for obs in odict:
                odict[obs] /= norm_factor
            return odict

        obs_dict = defaultdict(lambda: defaultdict(float))
        for state in self.vi.get_states():
            for action in self.vi.mdp.actions:
                for sample in range(self.vi.sample_rate):
                    observation = observation_func(state, action)
                    next_state = transition_func(state, action)
                    obs_dict[next_state][observation] += 1. / self.vi.sample_rate

        for state in self.vi.get_states():
            obs_dict[state] = normalize_probabilities(obs_dict[state])

        return obs_dict
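# The example below is a minimal, self-contained sketch (not part of the
# original source) of the discrete Bayes filter update implemented by
# discrete_filter_updater: b'(s') is proportional to
# O(s', z) * sum_s T(s, a, s') * b(s), and the result is then normalized.
# The two-state model below is made up.
def _example_discrete_belief_update():
    belief = {"s0": 0.5, "s1": 0.5}
    T = {"s0": {"a": {"s0": 0.9, "s1": 0.1}},
         "s1": {"a": {"s0": 0.2, "s1": 0.8}}}   # T[s][a][s']
    O = {"s0": {"z": 0.7}, "s1": {"z": 0.3}}    # O[s'][z]

    unnorm = {sp: O[sp]["z"] * sum(T[s]["a"][sp] * belief[s] for s in belief)
              for sp in belief}
    total = sum(unnorm.values())
    new_belief = {sp: v / total for sp, v in unnorm.items()}

    # new_belief is roughly {"s0": 0.74, "s1": 0.26}: both the dynamics and
    # the observation model make "s0" the more likely successor state.
    return new_belief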
def make_singletask_sa(mdp, indic_func, state_class, epsilon=0.0, aa_single_act=False, prob_of_mdp=1.0):
    '''
    Args:
        mdp (MDP)
        indic_func (S x S --> {0,1})
        state_class (Class)
        epsilon (float)

    Returns:
        (StateAbstraction)
    '''
    print("\tRunning VI...",)
    sys.stdout.flush()

    # Run VI.
    if isinstance(mdp, MDPDistribution):
        mdp = mdp.sample()

    vi = ValueIteration(mdp)
    iters, val = vi.run_vi()

    print(" done.")
    print("\tMaking state abstraction...",)
    sys.stdout.flush()
    sa = StateAbstraction(phi={}, state_class=state_class)
    clusters = defaultdict(set)
    num_states = len(vi.get_states())

    actions = mdp.get_actions()

    # Find state pairs that satisfy the condition.
    for i, state_x in enumerate(vi.get_states()):
        sys.stdout.flush()
        clusters[state_x].add(state_x)

        for state_y in vi.get_states()[i:]:
            if not (state_x == state_y) and indic_func(state_x, state_y, vi, actions, epsilon=epsilon):
                clusters[state_x].add(state_y)
                clusters[state_y].add(state_x)

    print("making clusters...",)
    sys.stdout.flush()

    # Build SA. Iterate over a snapshot of the keys since entries are popped below.
    for i, state in enumerate(list(clusters.keys())):
        new_cluster = clusters[state]
        sa.make_cluster(new_cluster)

        # Destroy old so we don't double up.
        for s in clusters[state]:
            if s in clusters.keys():
                clusters.pop(s)

    print(" done.")
    print("\tGround States:", num_states)
    print("\tAbstract:", sa.get_num_abstr_states())
    print()

    return sa