Example #1
def unwrap_finite_horizon_MDP(
    process: FiniteMarkovDecisionProcess[WithTime[S], A]
) -> Sequence[StateActionMapping[S, A]]:
    '''Unwrap a finite-horizon Markov decision process into a sequence of
    state/action transition mappings, one per time step (starting with 0).
    This representation makes it easier to implement backward induction.

    '''
    def time(x: WithTime[S]) -> int:
        return x.time

    def single_without_time(
            s_r: Tuple[State[WithTime[S]], float]) -> Tuple[State[S], float]:
        if isinstance(s_r[0], NonTerminal):
            ret: Tuple[State[S],
                       float] = (NonTerminal(s_r[0].state.state), s_r[1])
        else:
            ret = (Terminal(s_r[0].state.state), s_r[1])
        return ret

    def without_time(arg: ActionMapping[A, WithTime[S]]) -> \
            ActionMapping[A, S]:
        return {
            a: sr_distr.map(single_without_time)
            for a, sr_distr in arg.items()
        }

    return [{
        NonTerminal(s.state):
        without_time(process.action_mapping(NonTerminal(s)))
        for s in states
    } for _, states in groupby(sorted(
        (nt.state for nt in process.non_terminal_states), key=time),
                               key=time)]
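
A minimal usage sketch of how this output feeds backward induction, assuming the helpers live in rl.finite_horizon and that optimal_vf_and_policy is the MDP counterpart of the evaluate helper used in Example #6; base_mdp is a hypothetical FiniteMarkovDecisionProcess[S, A] built elsewhere:

from rl.finite_horizon import (finite_horizon_MDP, unwrap_finite_horizon_MDP,
                               optimal_vf_and_policy)

# attach time indices 0..9 to the base process, then split it per time step
finite_mdp = finite_horizon_MDP(base_mdp, 10)
steps = unwrap_finite_horizon_MDP(finite_mdp)
# backward induction: one (value function, policy) pair per time step
vf_and_policies = list(optimal_vf_and_policy(steps, gamma=1.0))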
Example #2
def unwrap_finite_horizon_MRP(
    process: FiniteMarkovRewardProcess[WithTime[S]]
) -> Sequence[RewardTransition[S]]:
    '''Given a finite-horizon process, break the transitions at each
    time step (starting with 0) into their own data structures. This
    representation makes it easier to implement backward induction.

    '''
    def time(x: WithTime[S]) -> int:
        return x.time

    def single_without_time(
            s_r: Tuple[State[WithTime[S]], float]) -> Tuple[State[S], float]:
        if isinstance(s_r[0], NonTerminal):
            ret: Tuple[State[S],
                       float] = (NonTerminal(s_r[0].state.state), s_r[1])
        else:
            ret = (Terminal(s_r[0].state.state), s_r[1])
        return ret

    def without_time(arg: StateReward[WithTime[S]]) -> StateReward[S]:
        return arg.map(single_without_time)

    return [{
        NonTerminal(s.state):
        without_time(process.transition_reward(NonTerminal(s)))
        for s in states
    } for _, states in groupby(sorted(
        (nt.state for nt in process.non_terminal_states), key=time),
                               key=time)]
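
A minimal usage sketch mirroring Example #6 below, which composes this function with the library's evaluate helper; finite_horizon is assumed to be a FiniteMarkovRewardProcess[WithTime[S]] built elsewhere (e.g. via finite_horizon_MRP):

from rl.finite_horizon import unwrap_finite_horizon_MRP, evaluate

steps = unwrap_finite_horizon_MRP(finite_horizon)
# backward induction: one value function per time step, time 0 first
value_functions = list(evaluate(steps, gamma=1.0))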
Example #3
    def test_evaluate_finite_mrp(self) -> None:
        start = Tabular(
            {s: 0.0 for s in self.finite_flip_flop.non_terminal_states},
            count_to_weight_func=lambda _: 0.1
        )

        episode_length = 20
        episodes: Iterable[Iterable[mp.TransitionStep[bool]]] =\
            self.finite_flip_flop.reward_traces(Choose({
                NonTerminal(True),
                NonTerminal(False)
            }))
        transitions: Iterable[mp.TransitionStep[bool]] =\
            itertools.chain.from_iterable(
                itertools.islice(episode, episode_length)
                for episode in episodes
            )

        vs = td.td_prediction(transitions, γ=0.99, approx_0=start)

        v: Optional[Tabular[NonTerminal[bool]]] = iterate.last(
            itertools.islice(
                cast(Iterator[Tabular[NonTerminal[bool]]], vs),
                10000)
        )

        if v is not None:
            self.assertEqual(len(v.values_map), 2)

            for s in v.values_map:
                # Intentionally loose bound; with a tighter bound the test
                # takes over 1s on my machine.
                self.assertLess(abs(v(s) - 170), 3.0)
        else:
            assert False
Example #4
def get_fixed_episodes_from_sr_pairs_seq(
        sr_pairs_seq: Sequence[Sequence[Tuple[S, float]]],
        terminal_state: S) -> Sequence[Sequence[TransitionStep[S]]]:
    return [[
        TransitionStep(state=NonTerminal(s),
                       reward=r,
                       next_state=NonTerminal(trace[i + 1][0])
                       if i < len(trace) - 1 else Terminal(terminal_state))
        for i, (s, r) in enumerate(trace)
    ] for trace in sr_pairs_seq]
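
A short usage sketch: applied to the given_data traces shown in Example #23 (with a hypothetical terminal state 'T'), this builds the same fixed-episode structure that Example #23 constructs inline:

fixed_episodes = get_fixed_episodes_from_sr_pairs_seq(
    sr_pairs_seq=given_data,   # (state, reward) traces as in Example #23
    terminal_state='T'
)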
Example #5
        def next_state(state=state):
            switch_states = Bernoulli(self.p).sample()

            st: bool = state.state
            if switch_states:
                next_s: bool = not st
                reward = 1 if st else 0.5
                return NonTerminal(next_s), reward
            else:
                return NonTerminal(st), 0.5
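
For context, a sampler like this is typically returned wrapped in a SampledDistribution; a minimal sketch of the enclosing transition_reward method, assuming a FlipFlop-style class with a flip probability self.p:

from typing import Tuple
from rl.distribution import Bernoulli, SampledDistribution
from rl.markov_process import NonTerminal, State

def transition_reward(
        self, state: NonTerminal[bool]
) -> SampledDistribution[Tuple[State[bool], float]]:
    def next_state(state=state):
        # same sampler as in Example #5 above
        switch_states = Bernoulli(self.p).sample()
        st: bool = state.state
        if switch_states:
            return NonTerminal(not st), (1 if st else 0.5)
        return NonTerminal(st), 0.5

    return SampledDistribution(next_state)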
Example #6
    def test_compare_to_backward_induction(self):
        finite_horizon = finite_horizon_MRP(self.finite_flip_flop, 10)

        v = evaluate_mrp_result(finite_horizon, gamma=1)
        self.assertEqual(len(v), 20)

        finite_v =\
            list(evaluate(unwrap_finite_horizon_MRP(finite_horizon), gamma=1))

        for time in range(10):
            self.assertAlmostEqual(
                v[NonTerminal(WithTime(state=True, time=time))],
                finite_v[time][NonTerminal(True)])
            self.assertAlmostEqual(
                v[NonTerminal(WithTime(state=False, time=time))],
                finite_v[time][NonTerminal(False)])
Example #7
    def test_flip_flop(self):
        trace = list(
            itertools.islice(
                self.flip_flop.simulate(Constant(NonTerminal(True))), 10))

        self.assertTrue(
            all(isinstance(outcome.state, bool) for outcome in trace))

        longer_trace = itertools.islice(
            self.flip_flop.simulate(Constant(NonTerminal(True))), 10000)
        count_trues = len(
            list(outcome for outcome in longer_trace if outcome.state))

        # If the code is correct, this assertion should fail only with
        # vanishingly small probability.
        self.assertTrue(1000 < count_trues < 9000)
Example #8
    def get_q_learning_vf_and_policy(
        self,
        states_actions_dict: Mapping[Cell, Set[Move]],
        sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
        episodes: int = 10000,
        step_size: float = 0.01,
        epsilon: float = 0.1
    ) -> Tuple[V[Cell], FiniteDeterministicPolicy[Cell, Move]]:
        '''
        states_actions_dict gives the set of possible moves from
        each non-blocked cell.
        sample_func takes a state and an action and returns a sampled
        (next_state, reward) pair.
        '''
        q: Dict[Cell, Dict[Move, float]] = \
            {s: {a: 0. for a in actions} for s, actions in
             states_actions_dict.items()}
        nt_states: CellSet = {s for s in q}
        uniform_states: Choose[Cell] = Choose(nt_states)
        for episode_num in range(episodes):
            state: Cell = uniform_states.sample()
            '''
            Exercise: write your code here to update the dictionary q
            (initialized above) according to the Q-learning algorithm's
            Q-value function update. One possible update is sketched
            after this example.
            '''

        vf_dict: V[Cell] = {NonTerminal(s): max(d.values()) for s, d
                            in q.items()}
        policy: FiniteDeterministicPolicy[Cell, Move] = \
            FiniteDeterministicPolicy(
                {s: max(d.items(), key=itemgetter(1))[0] for s, d in q.items()}
            )
        return (vf_dict, policy)
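
The episode body above is deliberately left as an exercise in the source. A minimal sketch of what the update inside the episode loop could look like, assuming the same WindyGrid.epsilon_greedy_action helper that the SARSA version in Example #17 calls:

            while state in nt_states:
                action: Move = WindyGrid.epsilon_greedy_action(
                    state, q, epsilon)
                next_state, reward = sample_func(state, action)
                # Q-learning bootstraps off the greedy (max) Q-value of the
                # next state rather than the action actually taken next
                next_q: float = max(q[next_state].values()) \
                    if next_state in nt_states else 0.
                q[state][action] += step_size * \
                    (reward + next_q - q[state][action])
                state = next_state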
Example #9
    def test_evaluate_finite_mrp(self):
        start = Tabular(
            {s: 0.0
             for s in self.finite_flip_flop.non_terminal_states})
        traces = self.finite_flip_flop.reward_traces(
            Choose({NonTerminal(True), NonTerminal(False)}))
        v = iterate.converged(
            mc.mc_prediction(traces, γ=0.99, approx_0=start),
            # Loose bound of 0.01 to speed up test.
            done=lambda a, b: a.within(b, 0.01))

        self.assertEqual(len(v.values_map), 2)

        for s in v.values_map:
            # Intentionally loose bound; with a tighter bound the test
            # takes over 1s on my machine.
            self.assertLess(abs(v(s) - 170), 1.0)
Example #10
 def single_without_time(
         s_r: Tuple[State[WithTime[S]], float]) -> Tuple[State[S], float]:
     if isinstance(s_r[0], NonTerminal):
         ret: Tuple[State[S],
                    float] = (NonTerminal(s_r[0].state.state), s_r[1])
     else:
         ret = (Terminal(s_r[0].state.state), s_r[1])
     return ret
Example #11
 def sample_next_state_reward(state=state) ->\
         Tuple[State[InventoryState], float]:
     demand_sample: int = np.random.poisson(self.poisson_lambda)
     ip: int = state.state.inventory_position()
     next_state: InventoryState = InventoryState(
         max(ip - demand_sample, 0), max(self.capacity - ip, 0))
     reward: float = - self.holding_cost * state.on_hand\
         - self.stockout_cost * max(demand_sample - ip, 0)
     return NonTerminal(next_state), reward
Example #12
 def __init__(self,
              mapping: Mapping[S,
                               Mapping[A,
                                       FiniteDistribution[Tuple[S,
                                                                float]]]]):
     non_terminals: Set[S] = set(mapping.keys())
     self.mapping = {
         NonTerminal(s): {
             a: Categorical({
                 (NonTerminal(s1) if s1 in non_terminals else Terminal(s1),
                  r): p
                 for (s1, r), p in v.table().items()
             })
             for a, v in d.items()
         }
         for s, d in mapping.items()
     }
     self.non_terminal_states = list(self.mapping.keys())
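
A small construction sketch for the mapping this constructor expects (presumably FiniteMarkovDecisionProcess from rl.markov_decision_process): outer keys are states, inner keys are actions, and each leaf is a FiniteDistribution over (next_state, reward) pairs. The state names, actions and rewards below are hypothetical; any next state that never appears as an outer key (here 'end') becomes Terminal:

from rl.distribution import Categorical
from rl.markov_decision_process import FiniteMarkovDecisionProcess

hypothetical_mapping = {
    'low': {
        'wait': Categorical({('low', 0.0): 0.6, ('high', 1.0): 0.4}),
        'work': Categorical({('high', 2.0): 1.0}),
    },
    'high': {
        'wait': Categorical({('high', 1.0): 1.0}),
        'stop': Categorical({('end', 5.0): 1.0}),  # 'end' becomes Terminal
    },
}
process = FiniteMarkovDecisionProcess(hypothetical_mapping)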
Example #13
    def test_flip_flop(self):
        trace = list(
            itertools.islice(
                self.flip_flop.simulate_reward(Constant(NonTerminal(True))),
                10))

        self.assertTrue(
            all(isinstance(step.next_state.state, bool) for step in trace))

        cumulative_reward = sum(step.reward for step in trace)
        self.assertTrue(0 <= cumulative_reward <= 10)
Example #14
    def test_evaluate_finite_mdp(self) -> None:
        q_0: Tabular[Tuple[NonTerminal[bool], bool]] = Tabular(
            {(s, a): 0.0
             for s in self.finite_mdp.non_terminal_states
             for a in self.finite_mdp.actions(s)},
            count_to_weight_func=lambda _: 0.1
        )

        uniform_policy: FinitePolicy[bool, bool] =\
            FinitePolicy({
                s.state: Choose(self.finite_mdp.actions(s))
                for s in self.finite_mdp.non_terminal_states
            })

        transitions: Iterable[mdp.TransitionStep[bool, bool]] =\
            self.finite_mdp.simulate_actions(
                Choose(self.finite_mdp.non_terminal_states),
                uniform_policy
            )

        qs = td.q_learning_external_transitions(
            transitions,
            self.finite_mdp.actions,
            q_0,
            γ=0.99
        )

        q: Optional[Tabular[Tuple[NonTerminal[bool], bool]]] =\
            iterate.last(
                cast(Iterator[Tabular[Tuple[NonTerminal[bool], bool]]],
                     itertools.islice(qs, 20000))
            )

        if q is not None:
            self.assertEqual(len(q.values_map), 4)

            for s in [NonTerminal(True), NonTerminal(False)]:
                self.assertLess(abs(q((s, False)) - 170.0), 2)
                self.assertGreater(q((s, False)), q((s, True)))
        else:
            assert False
Example #15
def get_mean_returns_from_return_steps(
        returns_seq: Sequence[ReturnStep[S]]
) -> Mapping[NonTerminal[S], float]:
    def by_state(ret: ReturnStep[S]) -> S:
        return ret.state.state

    sorted_returns_seq: Sequence[ReturnStep[S]] = sorted(returns_seq,
                                                         key=by_state)
    return {
        NonTerminal(s): np.mean([r.return_ for r in l])
        for s, l in itertools.groupby(sorted_returns_seq, key=by_state)
    }
Example #16
    def fraction_of_days_oos(self, policy: Policy[InventoryState, int],
                             time_steps: int, num_traces: int) -> float:
        impl_mrp: MarkovRewardProcess[InventoryState] =\
            self.apply_policy(policy)
        count: int = 0
        high_fractile: int = int(poisson(self.poisson_lambda).ppf(0.98))
        start: InventoryState = random.choice(
            [InventoryState(i, 0) for i in range(high_fractile + 1)])

        for _ in range(num_traces):
            steps = itertools.islice(
                impl_mrp.simulate_reward(Constant(NonTerminal(start))),
                time_steps)
            for step in steps:
                if step.reward < -self.holding_cost * step.state.state.on_hand:
                    count += 1

        return float(count) / (time_steps * num_traces)
Example #17
    def get_sarsa_vf_and_policy(
        self,
        states_actions_dict: Mapping[Cell, Set[Move]],
        sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
        episodes: int = 10000,
        step_size: float = 0.01
    ) -> Tuple[V[Cell], FiniteDeterministicPolicy[Cell, Move]]:
        '''
        states_actions_dict gives the set of possible moves from
        each non-terminal cell.
        sample_func takes a state and an action and returns a sampled
        (next_state, reward) pair.
        '''
        q: Dict[Cell, Dict[Move, float]] = \
            {s: {a: 0. for a in actions} for s, actions in
             states_actions_dict.items()}
        nt_states: CellSet = {s for s in q}
        uniform_states: Choose[Cell] = Choose(nt_states)
        for episode_num in range(episodes):
            epsilon: float = 1.0 / (episode_num + 1)
            state: Cell = uniform_states.sample()
            action: Move = WindyGrid.epsilon_greedy_action(state, q, epsilon)
            while state in nt_states:
                next_state, reward = sample_func(state, action)
                if next_state in nt_states:
                    next_action: Move = WindyGrid.epsilon_greedy_action(
                        next_state, q, epsilon)
                    q[state][action] += step_size * \
                        (reward + q[next_state][next_action] -
                         q[state][action])
                    action = next_action
                else:
                    q[state][action] += step_size * (reward - q[state][action])
                state = next_state

        vf_dict: V[Cell] = {
            NonTerminal(s): max(d.values())
            for s, d in q.items()
        }
        policy: FiniteDeterministicPolicy[Cell, Move] = \
            FiniteDeterministicPolicy(
                {s: max(d.items(), key=itemgetter(1))[0] for s, d in q.items()}
            )
        return vf_dict, policy
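
Examples #8 and #17 both call WindyGrid.epsilon_greedy_action without showing it. A plausible, purely hypothetical sketch of such a helper (Cell and Move are the type aliases used elsewhere in these examples; inside WindyGrid it would be a @staticmethod):

import random
from operator import itemgetter
from typing import Dict

def epsilon_greedy_action(
    state: Cell,
    q: Dict[Cell, Dict[Move, float]],
    epsilon: float
) -> Move:
    # explore uniformly with probability epsilon, otherwise act
    # greedily with respect to the current Q-value estimates
    if random.random() < epsilon:
        return random.choice(list(q[state]))
    return max(q[state].items(), key=itemgetter(1))[0]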
Example #18
    european_price: float = european_put_price(spot_price=spot_price_val,
                                               expiry=expiry_val,
                                               rate=rate_val,
                                               vol=vol_val,
                                               strike=strike_val)

    opt_ex_bin_tree: OptimalExerciseBinTree = OptimalExerciseBinTree(
        spot_price=spot_price_val,
        payoff=lambda _, x: max(strike_val - x, 0),
        expiry=expiry_val,
        rate=rate_val,
        vol=vol_val,
        num_steps=100)

    vf_seq, policy_seq = zip(*opt_ex_bin_tree.get_opt_vf_and_policy())
    bin_tree_price: float = vf_seq[0][NonTerminal(0)]
    bin_tree_ex_boundary: Sequence[Tuple[float, float]] = \
        opt_ex_bin_tree.option_exercise_boundary(policy_seq, False)
    bin_tree_x, bin_tree_y = zip(*bin_tree_ex_boundary)

    lspi_x, lspi_y = put_option_exercise_boundary(func=flspi,
                                                  expiry=expiry_val,
                                                  num_steps=num_steps_lspi,
                                                  strike=strike_val)
    dql_x, dql_y = put_option_exercise_boundary(func=fdql,
                                                expiry=expiry_val,
                                                num_steps=num_steps_dql,
                                                strike=strike_val)
    plot_list_of_curves(list_of_x_vals=[lspi_x, dql_x, bin_tree_x],
                        list_of_y_vals=[lspi_y, dql_y, bin_tree_y],
                        list_of_colors=["b", "r", "g"],
Example #19
 def sample_func(state: Cell, action: Move) -> Tuple[Cell, float]:
     s, r = mdp.step(NonTerminal(state), action).sample()
     return s.state, r
Example #20
File: vampire.py  Project: shenoy1/RL-book
if __name__ == '__main__':
    from pprint import pprint
    from rl.gen_utils.plot_funcs import plot_list_of_curves
    from rl.markov_process import NonTerminal

    villagers: int = 20
    vampire_mdp: VampireMDP = VampireMDP(villagers)
    true_vf, true_policy = vampire_mdp.vi_vf_and_policy()
    pprint(true_vf)
    print(true_policy)
    lspi_vf, lspi_policy = vampire_mdp.lspi_vf_and_policy()
    pprint(lspi_vf)
    print(lspi_policy)

    states = range(1, villagers + 1)
    true_vf_vals = [true_vf[NonTerminal(s)] for s in states]
    lspi_vf_vals = [lspi_vf[NonTerminal(s)] for s in states]
    true_policy_actions = [true_policy.action_for[s] for s in states]
    lspi_policy_actions = [lspi_policy.action_for[s] for s in states]

    plot_list_of_curves(
        [states, states], [true_vf_vals, lspi_vf_vals], ["r", "b"],
        ["True Optimal VF", "LSPI-Estimated Optimal VF"],
        x_label="States",
        y_label="Optimal Values",
        title="True Optimal VF versus LSPI-Estimated Optimal VF")
    plot_list_of_curves(
        [states, states], [true_policy_actions, lspi_policy_actions],
        ["r", "b"], ["True Optimal Policy", "LSPI-Estimated Optimal Policy"],
        x_label="States",
        y_label="Optimal Actions",
Example #21
 def deter_policy(state: S) -> A:
     return max(
         ((mdp.step(NonTerminal(state), a).expectation(return_), a)
          for a in mdp.actions(NonTerminal(state))),
         key=itemgetter(0))[1]
Example #22
 def deter_policy(state: S) -> A:
     return max(((res.expectation(return_), a)
                 for a, res in step[NonTerminal(state)].items()),
                key=itemgetter(0))[1]
Example #23
import itertools
import rl.iterate as iterate

given_data: Sequence[Sequence[Tuple[str, float]]] = [
    [('A', 2.), ('A', 6.), ('B', 1.), ('B', 2.)],
    [('A', 3.), ('B', 2.), ('A', 4.), ('B', 2.), ('B', 0.)],
    [('B', 3.), ('B', 6.), ('A', 1.), ('B', 1.)],
    [('A', 0.), ('B', 2.), ('A', 4.), ('B', 4.), ('B', 2.), ('B', 3.)],
    [('B', 8.), ('B', 2.)]
]

gamma: float = 0.9

fixed_traces: Sequence[Sequence[TransitionStep[str]]] = \
    [[TransitionStep(
        state=NonTerminal(s),
        reward=r,
        next_state=NonTerminal(trace[i+1][0])
        if i < len(trace) - 1 else Terminal('T')
    ) for i, (s, r) in enumerate(trace)] for trace in given_data]

a: NonTerminal[str] = NonTerminal('A')
b: NonTerminal[str] = NonTerminal('B')

# fa: LinearFunctionApprox[NonTerminal[str]] = LinearFunctionApprox.create(
#     feature_functions=[
#         lambda x: 1.0 if x == a else 0.,
#         lambda y: 1.0 if y == b else 0.
#     ],
#     adam_gradient=AdamGradient(
#         learning_rate=0.1,
Example #24
 def next_state(state=state):
     switch_states = Bernoulli(self.p).sample()
     next_st: bool = not state.state if switch_states else state.state
     return NonTerminal(next_st)