Example #1
    def test_evaluate_finite_mdp(self) -> None:
        # Initial Q-estimate: zero for every (state, action) pair; the constant
        # count_to_weight_func gives a fixed learning rate of 0.1.
        q_0: Tabular[Tuple[bool, bool]] = Tabular(
            {(s, a): 0.0
             for s in self.finite_mdp.states()
             for a in self.finite_mdp.actions(s)},
            count_to_weight_func=lambda _: 0.1,
        )

        uniform_policy: mdp.Policy[bool, bool] = mdp.FinitePolicy({
            s: Choose(self.finite_mdp.actions(s))
            for s in self.finite_mdp.states()
        })

        transitions: Iterable[mdp.TransitionStep[
            bool, bool]] = self.finite_mdp.simulate_actions(
                Choose(self.finite_mdp.states()), uniform_policy)

        # Run TD control over the transition stream with discount factor 0.99.
        qs = td.td_control(transitions, self.finite_mdp.actions, q_0, γ=0.99)

        # Take the Q-value estimate produced after 20,000 updates.
        q: Optional[Tabular[Tuple[bool, bool]]] = iterate.last(
            cast(Iterator[Tabular[Tuple[bool, bool]]],
                 itertools.islice(qs, 20000)))

        if q is not None:
            self.assertEqual(len(q.values_map), 4)

            for s in [True, False]:
                self.assertLess(abs(q((s, False)) - 170.0), 2)
                self.assertGreater(q((s, False)), q((s, True)))
        else:
            self.fail("TD control did not produce a Q-value estimate")
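
For reference, `td.td_control` consumes the transition stream above and repeatedly refines the Q-estimate. The block below is a minimal sketch of a tabular TD-control update of that kind on a plain dict, not the library's implementation; `TransitionStep` and `tabular_q_update` are illustrative names, and the constant step size mirrors `count_to_weight_func=lambda _: 0.1` in the test.

from typing import Callable, Dict, Iterable, NamedTuple, Tuple


class TransitionStep(NamedTuple):
    """Illustrative stand-in for one step of experience."""
    state: bool
    action: bool
    next_state: bool
    reward: float


def tabular_q_update(
    q: Dict[Tuple[bool, bool], float],
    step: TransitionStep,
    actions: Callable[[bool], Iterable[bool]],
    gamma: float = 0.99,
    alpha: float = 0.1,
) -> None:
    """Apply one Q-learning-style TD update to the table in place."""
    # Target: immediate reward plus discounted value of the best next action.
    target = step.reward + gamma * max(
        q[(step.next_state, a)] for a in actions(step.next_state)
    )
    key = (step.state, step.action)
    # Move the estimate a fraction alpha toward the target (constant step
    # size, mirroring count_to_weight_func=lambda _: 0.1 in the test).
    q[key] += alpha * (target - q[key])
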
Example #2
def get_trivial_policy(
    mdp_obj: mdp.FiniteMarkovDecisionProcess[S, A]
) -> mdp.FinitePolicy[S, A]:
    """Generate a policy which randomly selects actions for each state.

    :param mdp_obj: Markov decision process for which to get uniform policy.
    :returns: Policy which assigns a uniform distribution to each action for
        each state.
    """
    state_action_dict: Dict[S, Optional[dist.Categorical[A]]] = {}

    for state in mdp_obj.states():
        actions = list(mdp_obj.actions(state))

        if actions:
            # Assign equal probability to every available action.
            uniform_prob = 1 / len(actions)
            state_action_dict[state] = dist.Categorical(
                {action: uniform_prob for action in actions}
            )
        else:
            # States with no available actions are mapped to None.
            state_action_dict[state] = None

    return mdp.FinitePolicy(state_action_dict)
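
A brief usage sketch: the uniform policy returned above can drive an experience stream exactly like the test's `uniform_policy`. This assumes the same `Choose` and `itertools` imports and a `finite_mdp` instance; those names are placeholders, not part of the original code.

# Illustrative usage only: finite_mdp stands in for any FiniteMarkovDecisionProcess.
uniform_policy = get_trivial_policy(finite_mdp)
start_states = Choose(finite_mdp.states())

# Follow the uniform policy from a uniformly chosen start state.
experience = finite_mdp.simulate_actions(start_states, uniform_policy)
for step in itertools.islice(experience, 5):
    print(step.state, step.action, step.reward, step.next_state)
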
Example #3
def get_random_policy(
    mdp_obj: mdp.FiniteMarkovDecisionProcess[S, A]
) -> mdp.FinitePolicy[S, A]:
    """Generate a random policy for an MDP by uniform sampling of action space.

    This function is used to initialize the policy during MC Control.

    :param mdp_obj: MDP object for which random policy is being generated
    :returns: Random deterministic policy for MDP
    """
    state_action_dict: Dict[S, Optional[dist.Constant[A]]] = {}

    for state in mdp_obj.states():
        actions = list(mdp_obj.actions(state))

        if actions:
            # Sample one action uniformly at random and fix it for this state.
            uniform_prob = 1 / len(actions)
            uniform_actions = dist.Categorical(
                {action: uniform_prob for action in actions}
            )
            state_action_dict[state] = dist.Constant(uniform_actions.sample())
        else:
            # States with no available actions are mapped to None.
            state_action_dict[state] = None

    return mdp.FinitePolicy(state_action_dict)
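
Because `Choose` (used in the first example) already represents a uniform distribution over a collection, the same random deterministic policy can be sketched more compactly. This is an illustrative alternative, assuming `dist.Choose` accepts the action collection the same way `Choose` does above:

def get_random_policy_via_choose(
    mdp_obj: mdp.FiniteMarkovDecisionProcess[S, A]
) -> mdp.FinitePolicy[S, A]:
    """Illustrative sketch: fix one uniformly sampled action per state."""
    policy_map: Dict[S, Optional[dist.Constant[A]]] = {}
    for state in mdp_obj.states():
        actions = list(mdp_obj.actions(state))
        # Choose is uniform over its options, so a single sample picks the
        # deterministic action; states without actions map to None.
        policy_map[state] = (
            dist.Constant(dist.Choose(actions).sample()) if actions else None
        )
    return mdp.FinitePolicy(policy_map)
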
Example #4
    def get_determ_policies(
            self) -> List[mdp.FinitePolicy[FrogState, Croak]]:
        """Get all deterministic policies associated with the MDP.

        Recursively generate all deterministic policies for the `Frog Escape`
        MDP and return them in a list. Because the policies are deterministic,
        each state's action is chosen with probability 1 (a `Constant`
        distribution).

        :returns: List of all possible deterministic policies, each mapping
            every state to a single fixed action
        """
        policy_combos: List[Optional[List[bool]]] = (
            [None] * (2**self.river.n_lily))
        # Single-element list so that the nested helper can mutate the count.
        n_combos = [0]
        actions: List[Optional[bool]] = [None] * self.river.n_lily

        def add_to_policy(actions: List[Optional[bool]], position: int = 0):
            """Get combinations of croak_A settings in deterministic policies.

            :param actions: List of actions in current deterministic policy
            :param position: Current lily pad position when recursively forming
                deterministic policies (default = 0)
            """
            if position == self.river.n_lily:
                policy_combos[n_combos[0]] = actions.copy()
                n_combos[0] += 1
                return
            else:
                for action in (True, False):
                    actions[position] = action
                    add_to_policy(actions, position + 1)

        add_to_policy(actions)
        return [
            mdp.FinitePolicy(
                {FrogState(i): Constant(policy[i])
                 for i in range(1, self.river.n_lily)}
            ) for policy in policy_combos
        ]
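
The recursion in `add_to_policy` enumerates all 2**n_lily combinations of croak choices; the same enumeration can be written non-recursively with `itertools.product`. The standalone sketch below is illustrative only, reusing the `FrogState`, `Croak`, `Constant` and `mdp` names from the method above:

import itertools
from typing import List


def enumerate_determ_policies(
        n_lily: int) -> List[mdp.FinitePolicy[FrogState, Croak]]:
    """Sketch: enumerate every deterministic croak policy without recursion."""
    return [
        mdp.FinitePolicy({
            # States are indexed from 1, matching the method above.
            FrogState(i): Constant(policy[i])
            for i in range(1, n_lily)
        })
        for policy in itertools.product((True, False), repeat=n_lily)
    ]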