Пример #1
0
def mc_control_scratch(
        mdp_to_sample: FiniteMarkovDecisionProcess,
        states: List[S],
        actions: Mapping[S, List[A]],
        γ: float,
        tolerance: float = 1e-6,
        num_episodes: int = 10000) -> Mapping[Tuple[S, A], float]:
    """Estimate action values with tabular Monte Carlo control.

    Each iteration samples one trace from ``mdp_to_sample`` under the
    current policy, turns it into per-step returns with ``returns``,
    moves every visited ``q[(state, action)]`` towards the observed
    return using a running average, and then rebuilds the policy as
    epsilon-greedy with respect to ``q``, where ``eps = 1 / (i + 1)``
    for the 0-based episode index ``i``.

    Args:
        mdp_to_sample: process whose ``simulate_actions(start, policy)``
            produces the sampled trace.
        states: every state to track.  The start state is drawn from
            ``Categorical`` with weight 1 on each of them (presumably a
            uniform draw -- confirm against ``Categorical``).
        actions: actions available in each state.  ``None`` or an empty
            list marks a state with no actions; such a state gets no
            ``q`` entries and a ``None`` policy entry.
        γ: discount factor forwarded to ``returns``.
        tolerance: forwarded to ``returns`` (presumably the cut-off for
            truncating a trace -- confirm against ``returns``).
        num_episodes: number of sample/improve iterations; must be an
            ``int`` because it is passed to ``range``.

    Returns:
        Mapping from ``(state, action)`` to the estimated action value.

    Raises:
        KeyError: if ``actions`` has no entry for a state in ``states``,
            or a sampled step contains a ``(state, action)`` pair that
            was not listed in ``states`` / ``actions``.
    """
    q: Mapping[Tuple[S, A], float] = {}
    counts_per_state_act: Mapping[Tuple[S, A], int] = {}
    for state in states:
        # ``or ()`` skips states whose action entry is None or empty.
        for action in actions[state] or ():
            q[(state, action)] = 0.
            counts_per_state_act[(state, action)] = 0

    # Initial policy: equal weight on every available action.
    initial_policy: Mapping[S, Optional[Categorical[A]]] = {}
    for state in states:
        if not actions[state]:
            initial_policy[state] = None
        else:
            initial_policy[state] = Categorical(
                {action: 1
                 for action in actions[state]})
    Pi: FinitePolicy[S, A] = FinitePolicy(initial_policy)
    start_state_distrib = Categorical({state: 1 for state in states})

    for i in range(num_episodes):
        trace: Iterable[TransitionStep[S, A]] = mdp_to_sample.simulate_actions(
            start_state_distrib, Pi)

        # Policy evaluation: incremental mean of the returns seen so far.
        for step in returns(trace, γ, tolerance):
            key = (step.state, step.action)
            counts_per_state_act[key] += 1
            q[key] += 1 / counts_per_state_act[key] * (step.return_ - q[key])

        # Policy improvement: epsilon-greedy with a decaying epsilon.
        eps = 1 / (i + 1)
        new_pol: Mapping[S, Optional[Categorical[A]]] = {}
        for state in states:
            available = actions[state]
            if not available:
                # No action to choose from; do not fall through to the
                # epsilon-greedy construction below.
                new_pol[state] = None
                continue
            action_probs = {
                action: eps / len(available)
                for action in available
            }
            # ``<=`` means ties go to the action listed last.
            best_action = available[0]
            for action in available:
                if q[(state, best_action)] <= q[(state, action)]:
                    best_action = action
            action_probs[best_action] += 1 - eps
            new_pol[state] = Categorical(action_probs)
        Pi = FinitePolicy(new_pol)

    return q
Пример #2
0
class TestEvaluate(unittest.TestCase):
    """Checks TD prediction and TD control on small two-state processes."""

    def setUp(self):
        """Seed ``random`` and build the reward-process and MDP fixtures."""
        # Fixed seed so the sampled transitions are reproducible.
        random.seed(42)

        self.finite_flip_flop = FlipFlop(0.7)

        # Two states (True/False), two actions (True/False) per state.
        # Outcome keys look like (next_state, reward) pairs -- confirm
        # against FiniteMarkovDecisionProcess.
        self.finite_mdp = FiniteMarkovDecisionProcess({
            True: {
                True: Categorical({
                    (True, 1.0): 0.7,
                    (False, 2.0): 0.3
                }),
                False: Categorical({
                    (True, 1.0): 0.3,
                    (False, 2.0): 0.7
                }),
            },
            False: {
                True: Categorical({
                    (False, 1.0): 0.7,
                    (True, 2.0): 0.3
                }),
                False: Categorical({
                    (False, 1.0): 0.3,
                    (True, 2.0): 0.7
                }),
            },
        })

    def test_evaluate_finite_mrp(self) -> None:
        """TD prediction on the flip-flop process lands near 170 per state."""
        start = Tabular(
            {s: 0.0
             for s in self.finite_flip_flop.states()},
            count_to_weight_func=lambda _: 0.1,
        )

        # Take the first 20 steps of each trace and chain them into a
        # single stream of transitions.
        episode_length = 20
        episodes: Iterable[Iterable[
            mp.TransitionStep[bool]]] = self.finite_flip_flop.reward_traces(
                Choose({True, False}))
        transitions: Iterable[
            mp.TransitionStep[bool]] = itertools.chain.from_iterable(
                itertools.islice(episode, episode_length)
                for episode in episodes)

        vs = td.td_prediction(transitions, γ=0.99, approx_0=start)

        # Keep only the last of the first 10000 approximations.
        v: Optional[Tabular[bool]] = iterate.last(
            itertools.islice(cast(Iterator[Tabular[bool]], vs), 10000))

        if v is None:
            # self.fail still fires under ``python -O``; a bare
            # ``assert False`` would be stripped there.
            self.fail("td_prediction produced no value function")

        self.assertEqual(len(v.values_map), 2)

        for s in v.values_map:
            # Intentionally loose bound—otherwise test is too slow.
            # Takes >1s on my machine otherwise.
            self.assertLess(abs(v(s) - 170), 3.0)

    def test_evaluate_finite_mdp(self) -> None:
        """TD control yields q near 170 for action False, above action True."""
        q_0: Tabular[Tuple[bool, bool]] = Tabular(
            {(s, a): 0.0
             for s in self.finite_mdp.states()
             for a in self.finite_mdp.actions(s)},
            count_to_weight_func=lambda _: 0.1,
        )

        # Behaviour policy: Choose over the actions available in each state.
        uniform_policy: mdp.Policy[bool, bool] = mdp.FinitePolicy({
            s: Choose(self.finite_mdp.actions(s))
            for s in self.finite_mdp.states()
        })

        transitions: Iterable[mdp.TransitionStep[
            bool, bool]] = self.finite_mdp.simulate_actions(
                Choose(self.finite_mdp.states()), uniform_policy)

        qs = td.td_control(transitions, self.finite_mdp.actions, q_0, γ=0.99)

        # Keep only the last of the first 20000 approximations.
        q: Optional[Tabular[Tuple[bool, bool]]] = iterate.last(
            cast(Iterator[Tabular[Tuple[bool, bool]]],
                 itertools.islice(qs, 20000)))

        if q is None:
            # self.fail still fires under ``python -O``; a bare
            # ``assert False`` would be stripped there.
            self.fail("td_control produced no action-value function")

        self.assertEqual(len(q.values_map), 4)

        for s in [True, False]:
            self.assertLess(abs(q((s, False)) - 170.0), 2)
            self.assertGreater(q((s, False)), q((s, True)))