def get_opt_vf_and_policy(self) -> \
        Iterator[Tuple[V[int], FinitePolicy[int, bool]]]:
    dt: float = self.dt()
    up_factor: float = np.exp(self.vol * np.sqrt(dt))
    # Risk-neutral probability of an up-move on the binomial tree
    up_prob: float = (np.exp(self.rate * dt) * up_factor - 1) / \
        (up_factor * up_factor - 1)
    return optimal_vf_and_policy(
        steps=[
            # At time step i, state j (the number of up-moves so far) maps each
            # action to a distribution over (next_state, reward) pairs:
            #   True  = exercise: receive the payoff and move to the sentinel
            #           state -1 (option exercised)
            #   False = continue: next state is j + 1 (up-move) with probability
            #           up_prob, else j (down-move), with zero reward
            {j: None if j == -1 else {
                True: Constant(
                    (
                        -1,
                        self.payoff(i * dt, self.state_price(i, j))
                    )
                ),
                False: Categorical(
                    {
                        (j + 1, 0.): up_prob,
                        (j, 0.): 1 - up_prob
                    }
                )
            } for j in range(i + 1)}
            for i in range(self.num_steps + 1)
        ],
        # Each step is discounted at the risk-free rate
        gamma=np.exp(-self.rate * dt)
    )
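
For orientation, here is a self-contained sketch (not the library's implementation) of the backward induction that the MDP above encodes, specialized to an American put for concreteness. The function name and parameters (american_put_binomial, spot, strike, expiry) and the CRR state price spot * u ** (2 * j - i) are illustrative assumptions, not part of the snippet above.

import numpy as np

def american_put_binomial(spot: float, strike: float, expiry: float,
                          rate: float, vol: float, num_steps: int) -> float:
    # Backward induction on a binomial tree: at each node take the better of
    # exercising now and continuing (discounted expected value one step ahead).
    dt = expiry / num_steps
    u = np.exp(vol * np.sqrt(dt))                    # up-move factor
    q = (np.exp(rate * dt) * u - 1) / (u * u - 1)    # risk-neutral up probability
    disc = np.exp(-rate * dt)                        # one-step discount factor
    # Values at expiry: exercise payoff at each node j (j = number of up-moves)
    vf = [max(strike - spot * u ** (2 * j - num_steps), 0.)
          for j in range(num_steps + 1)]
    for i in reversed(range(num_steps)):
        vf = [max(max(strike - spot * u ** (2 * j - i), 0.),         # exercise now
                  disc * (q * vf[j + 1] + (1 - q) * vf[j]))          # continue
              for j in range(i + 1)]
    return vf[0]

print(american_put_binomial(spot=100., strike=100., expiry=1.,
                            rate=0.05, vol=0.25, num_steps=100))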
Example #2
    def test_value_iteration(self):
        # Exact finite-horizon optimal value functions and policies,
        # computed step by step via backward induction
        vpstar = optimal_vf_and_policy(self.mdp_seq, 1.)

        states = self.single_step_mdp.states()
        fa_dynamic = Dynamic({s: 0.0 for s in states})
        fa_tabular = Tabular()
        distribution = Choose(set(states))

        # Backward induction using an exact (Dynamic) function approximation
        approx_vpstar_finite = back_opt_vf_and_policy_finite(
            [(self.mdp_seq[i], fa_dynamic) for i in range(self.steps)],
            1.
        )
        # Backward induction using a Tabular approximation fit on sampled states
        approx_vpstar = back_opt_vf_and_policy(
            [(self.single_step_mdp, fa_tabular, distribution)
             for _ in range(self.steps)],
            1.,
            num_state_samples=120,
            error_tolerance=0.01
        )

        # Both approximations should track the exact values at every time step
        for t, ((v1, _), (v2, _), (v3, _)) in enumerate(zip(
                vpstar,
                approx_vpstar_finite,
                approx_vpstar
        )):
            states = self.mdp_seq[t].keys()
            v1_arr = np.array([v1[s] for s in states])
            v2_arr = v2.evaluate(states)
            v3_arr = v3.evaluate(states)
            self.assertLess(max(abs(v1_arr - v2_arr)), 0.001)
            self.assertLess(max(abs(v1_arr - v3_arr)), 1.0)
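
As a library-independent reference, here is a minimal sketch of the backward induction that such tests exercise: each step maps state -> action -> list of (probability, next_state, reward) triples, and the optimal value and greedy action are computed from the last step backwards. The data layout is an assumption chosen for illustration, not the rl library's step format.

from typing import Dict, List, Tuple

Step = Dict[str, Dict[str, List[Tuple[float, str, float]]]]

def backward_induction(steps: List[Step], gamma: float):
    # vf_next holds the optimal value function of the following time step
    vf_next: Dict[str, float] = {}
    results = []
    for step in reversed(steps):
        vf: Dict[str, float] = {}
        policy: Dict[str, str] = {}
        for s, actions in step.items():
            # Q(s, a) = sum over outcomes of p * (r + gamma * V*_next(s'))
            q = {a: sum(p * (r + gamma * vf_next.get(sp, 0.))
                        for p, sp, r in outcomes)
                 for a, outcomes in actions.items()}
            policy[s] = max(q, key=q.get)
            vf[s] = q[policy[s]]
        results.append((vf, policy))
        vf_next = vf
    return list(reversed(results))

step = {"s": {"stay": [(1.0, "s", 1.0)], "quit": [(1.0, "s", 0.0)]}}
print(backward_induction([step] * 3, gamma=1.0)[0])  # V*("s") == 3.0 at time 0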
Example #3
    def test_optimal_policy(self):
        finite = finite_horizon_MDP(self.finite_flip_flop, limit=10)
        steps = unwrap_finite_horizon_MDP(finite)
        # All but the last (vf, policy) pair go into v_ps; p is the last step's policy
        *v_ps, (_, p) = optimal_vf_and_policy(steps, gamma=1)

        # The optimal action is False in every state of the last step
        for _, a in p.action_for.items():
            self.assertEqual(a, False)

        self.assertAlmostEqual(v_ps[0][0][NonTerminal(True)], 17)
        self.assertAlmostEqual(v_ps[5][0][NonTerminal(False)], 17 / 2)
Example #4
    def test_optimal_policy(self):
        finite = finite_horizon_MDP(self.finite_flip_flop, limit=10)
        steps = unwrap_finite_horizon_MDP(finite)
        *v_ps, (v, p) = optimal_vf_and_policy(steps, gamma=1)

        # Same check as above, against a policy API where act(s) returns a
        # Constant distribution over the single optimal action
        for s in p.states():
            self.assertEqual(p.act(s), Constant(False))

        self.assertAlmostEqual(v_ps[0][0][True], 17)
        self.assertAlmostEqual(v_ps[5][0][False], 17 / 2)
Example #5
def get_optimal_vf_and_policy(self) -> \
        Iterator[Tuple[V[int], FinitePolicy[int, int]]]:
    # Backward induction over the unwrapped finite-horizon steps, undiscounted
    return optimal_vf_and_policy(unwrap_finite_horizon_MDP(self.mdp), 1.)
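
A minimal consumption sketch follows: `model` is a hypothetical instance of a class exposing get_optimal_vf_and_policy as above, and treating each yielded value function as a mapping from states to floats is an assumption.

# `model` is a hypothetical object; the mapping-like behaviour of vf0 is assumed
vf_and_policies = list(model.get_optimal_vf_and_policy())
vf0, policy0 = vf_and_policies[0]        # time-0 value function and policy
print({s: vf0[s] for s in vf0})          # optimal value from each starting state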