def __init__(self, gamma, kappa=0.001):
    MDP.__init__(self,
                 BadChainMDP.ACTIONS,
                 self._transition_func,
                 self._reward_func,
                 init_state=ChainState(1),
                 gamma=gamma)
    self.num_states = 4
    self.kappa = kappa
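# Usage sketch (assumptions: the __init__ above belongs to BadChainMDP, a
# simple_rl MDP subclass defining ACTIONS, _transition_func, and
# _reward_func; the gamma value below is illustrative).
def _demo_bad_chain():
    bad_chain = BadChainMDP(gamma=0.95, kappa=0.001)
    print(bad_chain.get_actions())     # BadChainMDP.ACTIONS
    print(bad_chain.get_init_state())  # ChainState(1)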
from collections import defaultdict

import numpy as np

# simple_rl components assumed importable in this module: MDP,
# ValueIteration, ActionAbstraction, RewardFunc, TransitionFunc,
# ChainState, and the helper _make_mini_mdp_option_policy.

def make_option_policy(mdp, init_states, goal_states):
    '''
    Args:
        mdp (MDP)
        init_states (list): Initiation states of the option (unused here).
        goal_states (list): States in which the option terminates.

    Returns:
        (lambda): The option's policy.
    '''
    def goal_new_trans_func(s, a):
        # Temporarily mark goal states as terminal so planning stops there,
        # then restore the state's original terminal flag.
        original = s.is_terminal()
        s.set_terminal(s in goal_states)
        s_prime = mdp.get_transition_func()(s, a)
        s_prime.set_terminal(s_prime in goal_states)
        s.set_terminal(original)
        return s_prime

    # Make a new MDP with a -1 step reward, so the option policy reaches a
    # goal state as quickly as possible.
    mini_mdp = MDP(actions=mdp.get_actions(),
                   init_state=mdp.get_init_state(),
                   transition_func=goal_new_trans_func,
                   reward_func=lambda x, y, z: -1)

    o_policy, _ = _make_mini_mdp_option_policy(mini_mdp)

    return o_policy
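# Usage sketch (assumptions: `mdp` is any simple_rl MDP and the goal-state
# list is illustrative; the returned policy is assumed to map a state to an
# action, per _make_mini_mdp_option_policy's contract).
def _demo_option_policy(mdp, goal_states):
    o_policy = make_option_policy(mdp, init_states=[mdp.get_init_state()], goal_states=goal_states)
    s = mdp.get_init_state()
    print(o_policy(s))  # Action the option takes from the initial state.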
def compute_avg_mdp(mdp_distr, sample_rate=5):
    '''
    Args:
        mdp_distr (MDPDistribution)
        sample_rate (int): Number of transition samples per (s, a) pair.

    Returns:
        (MDP): The average MDP of the distribution, with rewards weighted by
            each MDP's probability.
    '''
    # Get normal components.
    init_state = mdp_distr.get_init_state()
    actions = mdp_distr.get_actions()
    gamma = mdp_distr.get_gamma()
    T = mdp_distr.get_all_mdps()[0].get_transition_func()

    # Compute avg reward.
    avg_rew = defaultdict(lambda: defaultdict(float))
    avg_trans_counts = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))  # Stores T_i(s, a, s') * Pr(M_i).

    for mdp in mdp_distr.get_mdps():
        prob_of_mdp = mdp_distr.get_prob_of_mdp(mdp)

        # Get a VI instance to compute the state space.
        vi = ValueIteration(mdp, delta=0.0001, max_iterations=2000, sample_rate=sample_rate)
        iters, value = vi.run_vi()
        states = vi.get_states()

        for s in states:
            for a in actions:
                r = mdp.reward_func(s, a)
                avg_rew[s][a] += prob_of_mdp * r

                for repeat in range(sample_rate):
                    s_prime = mdp.transition_func(s, a)
                    avg_trans_counts[s][a][s_prime] += prob_of_mdp

    # Normalize the sampled transition counts into probabilities.
    avg_trans_probs = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
    for s in avg_trans_counts.keys():
        for a in actions:
            for s_prime in avg_trans_counts[s][a].keys():
                avg_trans_probs[s][a][s_prime] = avg_trans_counts[s][a][s_prime] / sum(avg_trans_counts[s][a].values())

    def avg_rew_func(s, a):
        return avg_rew[s][a]

    # NOTE: The averaged transition probabilities above are computed but not
    # used; the first MDP's transition function stands in for all of them,
    # which is exact only when every MDP in the distribution shares dynamics.
    avg_trans_func = T
    avg_mdp = MDP(actions, avg_trans_func, avg_rew_func, init_state, gamma)

    return avg_mdp
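# Usage sketch (assumptions: `mdp_distr` is a simple_rl MDPDistribution whose
# member MDPs share dynamics, matching the NOTE in compute_avg_mdp).
def _demo_avg_mdp(mdp_distr):
    avg_mdp = compute_avg_mdp(mdp_distr, sample_rate=10)
    vi = ValueIteration(avg_mdp)
    iters, value = vi.run_vi()
    print("VI converged in", iters, "iterations with value", value)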
def make_abstr_mdp(mdp, state_abstr, action_abstr=None, step_cost=0.0, sample_rate=5, max_rollout=10):
    '''
    Args:
        mdp (MDP)
        state_abstr (StateAbstraction)
        action_abstr (ActionAbstraction)
        step_cost (float): Cost for a step in the lower MDP.
        sample_rate (int): Sample rate for computing the abstract R and T.
        max_rollout (int): Maximum number of steps per abstract-action rollout.

    Returns:
        (MDP)
    '''
    if action_abstr is None:
        action_abstr = ActionAbstraction(prim_actions=mdp.get_actions())

    # Make abstract reward and transition functions.
    def abstr_reward_lambda(abstr_state, abstr_action):
        if abstr_state.is_terminal():
            return 0

        # Get relevant MDP components from the lower MDP.
        lower_states = state_abstr.get_lower_states_in_abs_state(abstr_state)
        lower_reward_func = mdp.get_reward_func()
        lower_trans_func = mdp.get_transition_func()

        # Compute reward: average the rollout return over every ground state
        # in the cluster and over sample_rate rollouts per ground state.
        total_reward = 0
        for ground_s in lower_states:
            for sample in range(sample_rate):
                s_prime, reward = abstr_action.rollout(ground_s,
                                                       lower_reward_func,
                                                       lower_trans_func,
                                                       max_rollout_depth=max_rollout,
                                                       step_cost=step_cost)
                total_reward += float(reward) / (len(lower_states) * sample_rate)  # Add weighted reward.

        return total_reward

    def abstr_transition_lambda(abstr_state, abstr_action):
        # Check whether the cluster contains a terminal ground state.
        # (Currently unused; kept from the original.)
        is_ground_terminal = False
        for s_g in state_abstr.get_lower_states_in_abs_state(abstr_state):
            if s_g.is_terminal():
                is_ground_terminal = True
                break

        # Terminal abstract states self-loop.
        if abstr_state.is_terminal():
            return abstr_state

        # Get relevant MDP components from the lower MDP.
        lower_states = state_abstr.get_lower_states_in_abs_state(abstr_state)
        lower_reward_func = mdp.get_reward_func()
        lower_trans_func = mdp.get_transition_func()

        # Compute next state distribution.
        s_prime_prob_dict = defaultdict(int)
        for ground_s in lower_states:
            for sample in range(sample_rate):
                s_prime, reward = abstr_action.rollout(ground_s,
                                                       lower_reward_func,
                                                       lower_trans_func,
                                                       max_rollout_depth=max_rollout)
                s_prime_prob_dict[s_prime] += 1.0 / (len(lower_states) * sample_rate)  # Weighted average.

        # Form distribution and sample s_prime.
        next_state_sample_list = list(np.random.multinomial(1, list(s_prime_prob_dict.values())).tolist())
        end_ground_state = list(s_prime_prob_dict.keys())[next_state_sample_list.index(1)]
        end_abstr_state = state_abstr.phi(end_ground_state)

        return end_abstr_state

    # Make the components of the abstract MDP.
    abstr_init_state = state_abstr.phi(mdp.get_init_state())
    abstr_action_space = action_abstr.get_actions()
    abstr_state_space = state_abstr.get_abs_states()
    abstr_reward_func = RewardFunc(abstr_reward_lambda, abstr_state_space, abstr_action_space)
    abstr_transition_func = TransitionFunc(abstr_transition_lambda, abstr_state_space, abstr_action_space, sample_rate=sample_rate)

    # Make the MDP.
    abstr_mdp = MDP(actions=abstr_action_space,
                    init_state=abstr_init_state,
                    reward_func=abstr_reward_func.reward_func,
                    transition_func=abstr_transition_func.transition_func,
                    gamma=mdp.get_gamma())

    return abstr_mdp
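# Usage sketch (assumptions: `state_abstr` is a StateAbstraction whose phi
# was built elsewhere; with action_abstr=None, the abstract MDP keeps the
# ground MDP's primitive actions).
def _demo_abstr_mdp(mdp, state_abstr):
    abstr_mdp = make_abstr_mdp(mdp, state_abstr, sample_rate=10)
    a_s = abstr_mdp.get_init_state()
    a_a = abstr_mdp.get_actions()[0]
    print(abstr_mdp.get_reward_func()(a_s, a_a))      # Sampled abstract reward.
    print(abstr_mdp.get_transition_func()(a_s, a_a))  # Sampled abstract next state.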
# Earlier variant of make_abstr_mdp: no step_cost/max_rollout arguments,
# a level-aware phi, and a hardcoded gamma.
def make_abstr_mdp(mdp, state_abstr, action_abstr, sample_rate=25):
    '''
    Args:
        mdp (MDP)
        state_abstr (StateAbstraction)
        action_abstr (ActionAbstraction)
        sample_rate (int): Sample rate for computing the abstract R and T.

    Returns:
        (MDP)
    '''
    # Grab ground state space.
    vi = ValueIteration(mdp)
    state_space = vi.get_states()

    # Make abstract reward and transition functions.
    def abstr_reward_lambda(abstr_state, abstr_action):
        # Get relevant MDP components from the lower MDP.
        lower_states = state_abstr.get_lower_states_in_abs_state(abstr_state)
        lower_reward_func = mdp.get_reward_func()
        lower_trans_func = mdp.get_transition_func()

        # Compute reward.
        total_reward = 0
        for ground_s in lower_states:
            for sample in range(sample_rate):
                s_prime, reward = abstr_action.rollout(ground_s, lower_reward_func, lower_trans_func)
                total_reward += float(reward) / (len(lower_states) * sample_rate)  # Add weighted reward.

        return total_reward

    def abstr_transition_lambda(abstr_state, abstr_action):
        # Get relevant MDP components from the lower MDP.
        lower_states = state_abstr.get_lower_states_in_abs_state(abstr_state)
        lower_reward_func = mdp.get_reward_func()
        lower_trans_func = mdp.get_transition_func()

        # Compute next state distribution.
        s_prime_prob_dict = defaultdict(int)
        for ground_s in lower_states:
            for sample in range(sample_rate):
                s_prime, reward = abstr_action.rollout(ground_s, lower_reward_func, lower_trans_func)
                s_prime_prob_dict[s_prime] += 1.0 / (len(lower_states) * sample_rate)  # Weighted average.

        # Form distribution and sample s_prime.
        one_hot_sample = list(np.random.multinomial(1, list(s_prime_prob_dict.values())).tolist())
        end_ground_state = list(s_prime_prob_dict.keys())[one_hot_sample.index(1)]
        end_abstr_state = state_abstr.phi(end_ground_state, level=abstr_state.get_level())

        return end_abstr_state

    # Make the components of the MDP.
    abstr_init_state = state_abstr.phi(mdp.get_init_state())
    abstr_action_space = action_abstr.get_actions()
    abstr_state_space = state_abstr.get_abs_states()
    abstr_reward_func = RewardFunc(abstr_reward_lambda, abstr_state_space, abstr_action_space)
    abstr_transition_func = TransitionFunc(abstr_transition_lambda, abstr_state_space, abstr_action_space, sample_rate=sample_rate)

    # Make the MDP. NOTE: gamma is hardcoded to 0.5 in this variant rather
    # than inherited from the ground MDP.
    abstr_mdp = MDP(actions=abstr_action_space,
                    init_state=abstr_init_state,
                    reward_func=abstr_reward_func.reward_func,
                    transition_func=abstr_transition_func.transition_func,
                    gamma=0.5)

    return abstr_mdp
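# Sketch of the categorical-sampling idiom used in both transition lambdas
# above: np.random.multinomial(1, probs) draws a one-hot count vector, and
# the index of the 1 picks the sampled outcome. (Illustrative probabilities;
# the weighted counts above sum to 1 by construction.)
def _demo_categorical_sample():
    probs = {"s1": 0.2, "s2": 0.5, "s3": 0.3}
    one_hot = list(np.random.multinomial(1, list(probs.values())))
    return list(probs.keys())[one_hot.index(1)]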