def test_evaluate_finite_mdp(self) -> None:
    # Action-value function initialized to 0 for every (state, action) pair,
    # with a constant learning rate of 0.1.
    q_0: Tabular[Tuple[bool, bool]] = Tabular(
        {(s, a): 0.0
         for s in self.finite_mdp.states()
         for a in self.finite_mdp.actions(s)},
        count_to_weight_func=lambda _: 0.1,
    )

    # Behaviour policy that chooses uniformly among the available actions.
    uniform_policy: mdp.Policy[bool, bool] = mdp.FinitePolicy({
        s: Choose(self.finite_mdp.actions(s))
        for s in self.finite_mdp.states()
    })

    # Stream of transitions generated by following the uniform policy from a
    # uniformly chosen start state.
    transitions: Iterable[mdp.TransitionStep[bool, bool]] = \
        self.finite_mdp.simulate_actions(
            Choose(self.finite_mdp.states()), uniform_policy)

    # Run TD control over the transition stream and take the approximation
    # reached after 20,000 updates.
    qs = td.td_control(transitions, self.finite_mdp.actions, q_0, γ=0.99)
    q: Optional[Tabular[Tuple[bool, bool]]] = iterate.last(
        cast(Iterator[Tabular[Tuple[bool, bool]]],
             itertools.islice(qs, 20000)))

    if q is not None:
        self.assertEqual(len(q.values_map), 4)

        for s in [True, False]:
            self.assertLess(abs(q((s, False)) - 170.0), 2)
            self.assertGreater(q((s, False)), q((s, True)))
    else:
        assert False
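# Illustrative sketch (not part of the original test): once td_control has
# produced an action-value approximation, a greedy policy can be read off by
# maximizing q over the actions available in each state. The helper below is
# a hypothetical addition; it only uses the callable q((s, a)) interface and
# the actions/states methods already exercised by the test, and reuses the
# typing imports of this module.
def greedy_actions_from_q(
        finite_mdp: mdp.FiniteMarkovDecisionProcess[bool, bool],
        q: Tabular[Tuple[bool, bool]]) -> Dict[bool, bool]:
    """Sketch: map each state to the action with the highest learned q-value."""
    return {
        s: max(finite_mdp.actions(s), key=lambda a: q((s, a)))
        for s in finite_mdp.states()
    }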
def get_trivial_policy(
        mdp_obj: mdp.FiniteMarkovDecisionProcess[S, A]
) -> mdp.FinitePolicy[S, A]:
    """Generate a policy which selects actions uniformly at random in each state.

    :param mdp_obj: Markov decision process for which to get the uniform policy.
    :returns: Policy which assigns a uniform distribution over the available
        actions to each state (None for terminal states with no actions).
    """
    state_action_dict: Dict[S, Optional[dist.FiniteDistribution[A]]] = {}
    for state in mdp_obj.states():
        actions = list(mdp_obj.actions(state))
        if len(actions) > 0:
            num_actions = len(actions)
            uniform_prob = 1 / num_actions
            uniform_actions = dist.Categorical(
                {action: uniform_prob for action in actions}
            )
            state_action_dict[state] = uniform_actions
        else:
            # Terminal states have no available actions, so map them to None.
            state_action_dict[state] = None
    return mdp.FinitePolicy(state_action_dict)
def get_random_policy(
        mdp_obj: mdp.FiniteMarkovDecisionProcess[S, A]
) -> mdp.FinitePolicy[S, A]:
    """Generate a random deterministic policy by uniform sampling of the action space.

    This function is used to initialize the policy during MC Control.

    :param mdp_obj: MDP object for which the random policy is being generated
    :returns: Random deterministic policy for the MDP
    """
    state_action_dict: Dict[S, Optional[dist.FiniteDistribution[A]]] = {}
    for state in mdp_obj.states():
        actions = list(mdp_obj.actions(state))
        if len(actions) > 0:
            num_actions = len(actions)
            uniform_prob = 1 / num_actions
            uniform_actions = dist.Categorical(
                {action: uniform_prob for action in actions})
            # Sample one action uniformly and fix it as the deterministic choice.
            random_action = uniform_actions.sample()
            state_action_dict[state] = dist.Constant(random_action)
        else:
            # Terminal states have no available actions, so map them to None.
            state_action_dict[state] = None
    return mdp.FinitePolicy(state_action_dict)
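# Illustrative sketch (a hypothetical addition, not part of the original
# module) contrasting the two initializers above. It assumes `some_mdp` is any
# finite MDP and that FinitePolicy.act returns the (possibly None) action
# distribution for a state, as in the rl library this code is written against.
def _example_policy_initialization(
        some_mdp: mdp.FiniteMarkovDecisionProcess[S, A]) -> None:
    """Sketch: compare the uniform policy with a sampled deterministic one."""
    stochastic_policy = get_trivial_policy(some_mdp)
    deterministic_policy = get_random_policy(some_mdp)
    for state in some_mdp.states():
        action_dist = deterministic_policy.act(state)
        if action_dist is not None:
            # A Constant distribution always returns the same action, so the
            # deterministic policy is a fixed (but arbitrary) choice per state.
            assert action_dist.sample() == action_dist.sample()
        # The uniform policy keeps the full Categorical distribution instead.
        print(state, stochastic_policy.act(state), action_dist)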
def get_determ_policies(self) -> List[mdp.FinitePolicy[FrogState, Croak]]:
    """Get all deterministic policies associated with the MDP.

    Recursively generate all deterministic policies for the `Frog Escape`
    MDP, and return these finite deterministic policies in a list. Because
    we are assuming deterministic policies, actions are selected with
    constant probabilities at each state for each policy.

    :returns: List of all possible mappings of states to deterministic
        policies
    """
    policy_combos: List[Optional[List[bool]]] = [None] * (2 ** self.river.n_lily)
    n_combos = [0]
    actions: List[Optional[bool]] = [None] * self.river.n_lily

    def add_to_policy(actions: List[Optional[bool]], position: int = 0) -> None:
        """Get combinations of croak_A settings in deterministic policies.

        :param actions: List of actions in the current deterministic policy
        :param position: Current lily pad position when recursively forming
            deterministic policies (default = 0)
        """
        if position == self.river.n_lily:
            # A full assignment of actions has been built; record a copy.
            policy_combos[n_combos[0]] = actions.copy()
            n_combos[0] += 1
            return
        for action in (True, False):
            actions[position] = action
            add_to_policy(actions, position + 1)

    add_to_policy(actions)
    # Every slot of policy_combos is filled by the recursion above.
    return [
        mdp.FinitePolicy(
            {FrogState(i): Constant(policy[i])
             for i in range(1, self.river.n_lily)}
        )
        for policy in policy_combos
    ]
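# Illustrative sketch (a hypothetical addition, not part of the original
# class): brute-force evaluation of the deterministic policies enumerated
# above. It assumes the surrounding class stores the Frog Escape
# FiniteMarkovDecisionProcess as `self.mdp` (name chosen here for
# illustration) and relies on apply_finite_policy / get_value_function_vec
# from the rl library; adapt the attribute and method names as needed.
def get_best_determ_policy(self) -> mdp.FinitePolicy[FrogState, Croak]:
    """Sketch: pick the deterministic policy with the highest value
    (γ = 1, since the problem is episodic)."""
    best_policy: Optional[mdp.FinitePolicy[FrogState, Croak]] = None
    best_value = float("-inf")
    for policy in self.get_determ_policies():
        # Each deterministic policy turns the MDP into a Markov reward process.
        implied_mrp = self.mdp.apply_finite_policy(policy)
        # Compare policies by their mean value over the non-terminal lily pads.
        value = implied_mrp.get_value_function_vec(gamma=1.0).mean()
        if value > best_value:
            best_value, best_policy = value, policy
    assert best_policy is not None
    return best_policy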