def unwrap_finite_horizon_MDP(
    process: FiniteMarkovDecisionProcess[WithTime[S], A]
) -> Sequence[StateActionMapping[S, A]]:
    '''Unwrap a finite Markov decision process into a sequence of
    transitions between each time step (starting with 0). This
    representation makes it easier to implement backwards induction.
    '''
    def time(x: WithTime[S]) -> int:
        return x.time

    def single_without_time(
        s_r: Tuple[State[WithTime[S]], float]
    ) -> Tuple[State[S], float]:
        if isinstance(s_r[0], NonTerminal):
            ret: Tuple[State[S], float] = (
                NonTerminal(s_r[0].state.state),
                s_r[1]
            )
        else:
            ret = (Terminal(s_r[0].state.state), s_r[1])
        return ret

    def without_time(arg: ActionMapping[A, WithTime[S]]) -> \
            ActionMapping[A, S]:
        return {a: sr_distr.map(single_without_time)
                for a, sr_distr in arg.items()}

    return [{NonTerminal(s.state): without_time(
        process.action_mapping(NonTerminal(s))
    ) for s in states} for _, states in groupby(
        sorted(
            (nt.state for nt in process.non_terminal_states),
            key=time
        ),
        key=time
    )]
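
# Usage sketch (added; a minimal example, not code from the original module).
# It assumes the companion helpers finite_horizon_MDP and optimal_vf_and_policy
# are available in rl.finite_horizon alongside the function above; the tiny
# 'play'/'stopped' MDP and the horizon of 5 are purely illustrative.
from rl.distribution import Categorical
from rl.finite_horizon import finite_horizon_MDP, optimal_vf_and_policy
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.markov_process import NonTerminal

tiny_mdp = FiniteMarkovDecisionProcess({
    'play': {
        'roll': Categorical({('play', 1.0): 0.5, ('stopped', 0.0): 0.5}),
        'stop': Categorical({('stopped', 0.5): 1.0})
    }
})
# Wrap with a 5-step horizon, unwrap into per-time-step mappings, then run
# backwards induction over those steps.
steps = unwrap_finite_horizon_MDP(finite_horizon_MDP(tiny_mdp, 5))
for t, (vf, policy) in enumerate(optimal_vf_and_policy(steps, 1.0)):
    print(t, vf[NonTerminal('play')], policy.action_for['play'])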
def unwrap_finite_horizon_MRP(
    process: FiniteMarkovRewardProcess[WithTime[S]]
) -> Sequence[RewardTransition[S]]:
    '''Given a finite-horizon process, break the transition between each
    time step (starting with 0) into its own data structure. This
    representation makes it easier to implement backwards induction.
    '''
    def time(x: WithTime[S]) -> int:
        return x.time

    def single_without_time(
        s_r: Tuple[State[WithTime[S]], float]
    ) -> Tuple[State[S], float]:
        if isinstance(s_r[0], NonTerminal):
            ret: Tuple[State[S], float] = (
                NonTerminal(s_r[0].state.state),
                s_r[1]
            )
        else:
            ret = (Terminal(s_r[0].state.state), s_r[1])
        return ret

    def without_time(arg: StateReward[WithTime[S]]) -> StateReward[S]:
        return arg.map(single_without_time)

    return [{NonTerminal(s.state): without_time(
        process.transition_reward(NonTerminal(s))
    ) for s in states} for _, states in groupby(
        sorted(
            (nt.state for nt in process.non_terminal_states),
            key=time
        ),
        key=time
    )]
def test_evaluate_finite_mrp(self) -> None:
    start = Tabular(
        {s: 0.0 for s in self.finite_flip_flop.non_terminal_states},
        count_to_weight_func=lambda _: 0.1
    )

    episode_length = 20
    episodes: Iterable[Iterable[mp.TransitionStep[bool]]] =\
        self.finite_flip_flop.reward_traces(Choose({
            NonTerminal(True),
            NonTerminal(False)
        }))
    transitions: Iterable[mp.TransitionStep[bool]] =\
        itertools.chain.from_iterable(
            itertools.islice(episode, episode_length)
            for episode in episodes
        )

    vs = td.td_prediction(transitions, γ=0.99, approx_0=start)

    v: Optional[Tabular[NonTerminal[bool]]] = iterate.last(
        itertools.islice(
            cast(Iterator[Tabular[NonTerminal[bool]]], vs),
            10000
        )
    )

    if v is not None:
        self.assertEqual(len(v.values_map), 2)

        for s in v.values_map:
            # Intentionally loose bound—otherwise test is too slow.
            # Takes >1s on my machine otherwise.
            self.assertLess(abs(v(s) - 170), 3.0)
    else:
        assert False
def get_fixed_episodes_from_sr_pairs_seq(
    sr_pairs_seq: Sequence[Sequence[Tuple[S, float]]],
    terminal_state: S
) -> Sequence[Sequence[TransitionStep[S]]]:
    return [[
        TransitionStep(
            state=NonTerminal(s),
            reward=r,
            next_state=NonTerminal(trace[i + 1][0])
            if i < len(trace) - 1 else Terminal(terminal_state)
        )
        for i, (s, r) in enumerate(trace)
    ] for trace in sr_pairs_seq]
def next_state(state=state):
    switch_states = Bernoulli(self.p).sample()

    st: bool = state.state
    if switch_states:
        next_s: bool = not st
        reward = 1 if st else 0.5
        return NonTerminal(next_s), reward
    else:
        return NonTerminal(st), 0.5
def test_compare_to_backward_induction(self):
    finite_horizon = finite_horizon_MRP(self.finite_flip_flop, 10)

    v = evaluate_mrp_result(finite_horizon, gamma=1)
    self.assertEqual(len(v), 20)

    finite_v =\
        list(evaluate(unwrap_finite_horizon_MRP(finite_horizon), gamma=1))

    for time in range(10):
        self.assertAlmostEqual(
            v[NonTerminal(WithTime(state=True, time=time))],
            finite_v[time][NonTerminal(True)])
        self.assertAlmostEqual(
            v[NonTerminal(WithTime(state=False, time=time))],
            finite_v[time][NonTerminal(False)])
def test_flip_flop(self):
    trace = list(
        itertools.islice(
            self.flip_flop.simulate(Constant(NonTerminal(True))),
            10))

    self.assertTrue(
        all(isinstance(outcome.state, bool) for outcome in trace))

    longer_trace = itertools.islice(
        self.flip_flop.simulate(Constant(NonTerminal(True))),
        10000)
    count_trues = len(
        list(outcome for outcome in longer_trace if outcome.state))

    # If the code is correct, this should fail with a vanishingly
    # small probability
    self.assertTrue(1000 < count_trues < 9000)
def get_q_learning_vf_and_policy(
    self,
    states_actions_dict: Mapping[Cell, Set[Move]],
    sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
    episodes: int = 10000,
    step_size: float = 0.01,
    epsilon: float = 0.1
) -> Tuple[V[Cell], FiniteDeterministicPolicy[Cell, Move]]:
    '''
    states_actions_dict gives us the set of possible moves from
    a non-block cell.

    sample_func is a function with two inputs: state and action,
    and with output as a sampled pair of (next_state, reward).
    '''
    q: Dict[Cell, Dict[Move, float]] = \
        {s: {a: 0. for a in actions}
         for s, actions in states_actions_dict.items()}
    nt_states: CellSet = {s for s in q}
    uniform_states: Choose[Cell] = Choose(nt_states)
    # The Q-Learning updates below are filled in as a sketch (the original
    # left this block as a "write your code here" exercise): behave
    # epsilon-greedily with the fixed epsilon argument, but bootstrap from
    # the greedy (max) Q-value of the sampled next state.
    from rl.distribution import Bernoulli  # local import for the sketch
    for episode_num in range(episodes):
        state: Cell = uniform_states.sample()
        while state in nt_states:
            explore: bool = Bernoulli(epsilon).sample()
            action: Move = Choose(states_actions_dict[state]).sample() \
                if explore else max(q[state].items(), key=itemgetter(1))[0]
            next_state, reward = sample_func(state, action)
            next_return: float = max(q[next_state].values()) \
                if next_state in nt_states else 0.
            q[state][action] += step_size * \
                (reward + next_return - q[state][action])
            state = next_state

    vf_dict: V[Cell] = {NonTerminal(s): max(d.values())
                        for s, d in q.items()}
    policy: FiniteDeterministicPolicy[Cell, Move] = \
        FiniteDeterministicPolicy(
            {s: max(d.items(), key=itemgetter(1))[0]
             for s, d in q.items()}
        )
    return (vf_dict, policy)
def test_evaluate_finite_mrp(self):
    start = Tabular(
        {s: 0.0 for s in self.finite_flip_flop.non_terminal_states})
    traces = self.finite_flip_flop.reward_traces(
        Choose({NonTerminal(True), NonTerminal(False)}))
    v = iterate.converged(
        mc.mc_prediction(traces, γ=0.99, approx_0=start),
        # Loose bound of 0.01 to speed up test.
        done=lambda a, b: a.within(b, 0.01))

    self.assertEqual(len(v.values_map), 2)

    for s in v.values_map:
        # Intentionally loose bound—otherwise test is too slow.
        # Takes >1s on my machine otherwise.
        self.assertLess(abs(v(s) - 170), 1.0)
def sample_next_state_reward(state=state) ->\
        Tuple[State[InventoryState], float]:
    demand_sample: int = np.random.poisson(self.poisson_lambda)
    ip: int = state.state.inventory_position()
    next_state: InventoryState = InventoryState(
        max(ip - demand_sample, 0),
        max(self.capacity - ip, 0)
    )
    reward: float = - self.holding_cost * state.state.on_hand\
        - self.stockout_cost * max(demand_sample - ip, 0)
    return NonTerminal(next_state), reward
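
# Note (added): in the enclosing transition_reward method this sampler is
# presumably returned wrapped as SampledDistribution(sample_next_state_reward)
# (rl.distribution.SampledDistribution), so that callers can draw
# (next_state, reward) pairs or estimate expectations by sampling.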
def __init__(
    self,
    mapping: Mapping[S, Mapping[A, FiniteDistribution[Tuple[S, float]]]]
):
    non_terminals: Set[S] = set(mapping.keys())
    self.mapping = {NonTerminal(s): {a: Categorical(
        {(NonTerminal(s1) if s1 in non_terminals else Terminal(s1), r): p
         for (s1, r), p in v.table().items()}
    ) for a, v in d.items()} for s, d in mapping.items()}
    self.non_terminal_states = list(self.mapping.keys())
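
# Construction sketch (added): a minimal example of the mapping format this
# __init__ consumes, assuming it is the __init__ of
# rl.markov_decision_process.FiniteMarkovDecisionProcess and that Categorical
# lives in rl.distribution. Any state that appears only as a successor
# ('done' below) never shows up among the mapping keys and is therefore
# wrapped as Terminal.
from rl.distribution import Categorical
from rl.markov_decision_process import FiniteMarkovDecisionProcess

tiny = FiniteMarkovDecisionProcess({
    'start': {
        'stay': Categorical({('start', 1.0): 0.7, ('done', 0.0): 0.3}),
        'quit': Categorical({('done', 0.0): 1.0})
    }
})
print(tiny.non_terminal_states)   # only NonTerminal('start'); 'done' is Terminal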
def test_flip_flop(self):
    trace = list(
        itertools.islice(
            self.flip_flop.simulate_reward(Constant(NonTerminal(True))),
            10))

    self.assertTrue(
        all(isinstance(step.next_state.state, bool) for step in trace))

    cumulative_reward = sum(step.reward for step in trace)
    self.assertTrue(0 <= cumulative_reward <= 10)
def test_evaluate_finite_mdp(self) -> None:
    q_0: Tabular[Tuple[NonTerminal[bool], bool]] = Tabular(
        {(s, a): 0.0
         for s in self.finite_mdp.non_terminal_states
         for a in self.finite_mdp.actions(s)},
        count_to_weight_func=lambda _: 0.1
    )

    uniform_policy: FinitePolicy[bool, bool] =\
        FinitePolicy({
            s.state: Choose(self.finite_mdp.actions(s))
            for s in self.finite_mdp.non_terminal_states
        })

    transitions: Iterable[mdp.TransitionStep[bool, bool]] =\
        self.finite_mdp.simulate_actions(
            Choose(self.finite_mdp.non_terminal_states),
            uniform_policy
        )

    qs = td.q_learning_external_transitions(
        transitions,
        self.finite_mdp.actions,
        q_0,
        γ=0.99
    )

    q: Optional[Tabular[Tuple[NonTerminal[bool], bool]]] =\
        iterate.last(
            cast(Iterator[Tabular[Tuple[NonTerminal[bool], bool]]],
                 itertools.islice(qs, 20000))
        )

    if q is not None:
        self.assertEqual(len(q.values_map), 4)

        for s in [NonTerminal(True), NonTerminal(False)]:
            self.assertLess(abs(q((s, False)) - 170.0), 2)
            self.assertGreater(q((s, False)), q((s, True)))
    else:
        assert False
def get_mean_returns_from_return_steps(
    returns_seq: Sequence[ReturnStep[S]]
) -> Mapping[NonTerminal[S], float]:
    def by_state(ret: ReturnStep[S]) -> S:
        return ret.state.state

    sorted_returns_seq: Sequence[ReturnStep[S]] = sorted(
        returns_seq,
        key=by_state
    )
    return {NonTerminal(s): np.mean([r.return_ for r in l])
            for s, l in itertools.groupby(
                sorted_returns_seq,
                key=by_state
            )}
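
# End-to-end sketch (added, not from the original module): wire together
# get_fixed_episodes_from_sr_pairs_seq and get_mean_returns_from_return_steps
# on a toy data set. It assumes rl.returns.returns(trace, gamma, tolerance)
# is available to turn each fixed episode into ReturnSteps; the sample pairs,
# the discount of 0.9 and the tolerance of 1e-6 are illustrative choices.
import itertools
from rl.returns import returns

sample_sr_pairs_seq = [[('A', 2.), ('B', 1.)], [('B', 3.), ('A', 1.)]]
fixed_episodes = get_fixed_episodes_from_sr_pairs_seq(
    sample_sr_pairs_seq,
    terminal_state='T'
)
return_steps = list(itertools.chain.from_iterable(
    returns(episode, 0.9, 1e-6) for episode in fixed_episodes
))
print(get_mean_returns_from_return_steps(return_steps))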
def fraction_of_days_oos(
    self,
    policy: Policy[InventoryState, int],
    time_steps: int,
    num_traces: int
) -> float:
    impl_mrp: MarkovRewardProcess[InventoryState] =\
        self.apply_policy(policy)
    count: int = 0
    high_fractile: int = int(poisson(self.poisson_lambda).ppf(0.98))
    start: InventoryState = random.choice(
        [InventoryState(i, 0) for i in range(high_fractile + 1)])

    for _ in range(num_traces):
        steps = itertools.islice(
            impl_mrp.simulate_reward(Constant(NonTerminal(start))),
            time_steps
        )
        for step in steps:
            if step.reward < -self.holding_cost * step.state.state.on_hand:
                count += 1

    return float(count) / (time_steps * num_traces)
def get_sarsa_vf_and_policy(
    self,
    states_actions_dict: Mapping[Cell, Set[Move]],
    sample_func: Callable[[Cell, Move], Tuple[Cell, float]],
    episodes: int = 10000,
    step_size: float = 0.01
) -> Tuple[V[Cell], FiniteDeterministicPolicy[Cell, Move]]:
    '''
    states_actions_dict gives us the set of possible moves from
    a non-terminal cell.

    sample_func is a function with two inputs: state and action,
    and with output as a sampled pair of (next_state, reward).
    '''
    q: Dict[Cell, Dict[Move, float]] = \
        {s: {a: 0. for a in actions}
         for s, actions in states_actions_dict.items()}
    nt_states: CellSet = {s for s in q}
    uniform_states: Choose[Cell] = Choose(nt_states)
    for episode_num in range(episodes):
        epsilon: float = 1.0 / (episode_num + 1)
        state: Cell = uniform_states.sample()
        action: Move = WindyGrid.epsilon_greedy_action(state, q, epsilon)
        while state in nt_states:
            next_state, reward = sample_func(state, action)
            if next_state in nt_states:
                next_action: Move = WindyGrid.epsilon_greedy_action(
                    next_state, q, epsilon)
                q[state][action] += step_size * \
                    (reward + q[next_state][next_action] -
                     q[state][action])
                action = next_action
            else:
                q[state][action] += step_size * (reward - q[state][action])
            state = next_state

    vf_dict: V[Cell] = {NonTerminal(s): max(d.values())
                        for s, d in q.items()}
    policy: FiniteDeterministicPolicy[Cell, Move] = \
        FiniteDeterministicPolicy(
            {s: max(d.items(), key=itemgetter(1))[0]
             for s, d in q.items()}
        )
    return vf_dict, policy
european_price: float = european_put_price(
    spot_price=spot_price_val,
    expiry=expiry_val,
    rate=rate_val,
    vol=vol_val,
    strike=strike_val
)

opt_ex_bin_tree: OptimalExerciseBinTree = OptimalExerciseBinTree(
    spot_price=spot_price_val,
    payoff=lambda _, x: max(strike_val - x, 0),
    expiry=expiry_val,
    rate=rate_val,
    vol=vol_val,
    num_steps=100
)

vf_seq, policy_seq = zip(*opt_ex_bin_tree.get_opt_vf_and_policy())
bin_tree_price: float = vf_seq[0][NonTerminal(0)]
bin_tree_ex_boundary: Sequence[Tuple[float, float]] = \
    opt_ex_bin_tree.option_exercise_boundary(policy_seq, False)
bin_tree_x, bin_tree_y = zip(*bin_tree_ex_boundary)

lspi_x, lspi_y = put_option_exercise_boundary(
    func=flspi,
    expiry=expiry_val,
    num_steps=num_steps_lspi,
    strike=strike_val
)
dql_x, dql_y = put_option_exercise_boundary(
    func=fdql,
    expiry=expiry_val,
    num_steps=num_steps_dql,
    strike=strike_val
)
plot_list_of_curves(
    list_of_x_vals=[lspi_x, dql_x, bin_tree_x],
    list_of_y_vals=[lspi_y, dql_y, bin_tree_y],
    list_of_colors=["b", "r", "g"],
def sample_func(state: Cell, action: Move) -> Tuple[Cell, float]:
    s, r = mdp.step(NonTerminal(state), action).sample()
    return s.state, r
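
# Note (added): sample_func above has exactly the Callable[[Cell, Move],
# Tuple[Cell, float]] shape that get_sarsa_vf_and_policy and
# get_q_learning_vf_and_policy expect, so it can be plugged in directly,
# e.g. (hypothetical grid object and states/actions mapping):
#     vf, policy = grid.get_sarsa_vf_and_policy(states_actions_dict, sample_func)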
if __name__ == '__main__':
    from pprint import pprint
    from rl.gen_utils.plot_funcs import plot_list_of_curves
    from rl.markov_process import NonTerminal

    villagers: int = 20
    vampire_mdp: VampireMDP = VampireMDP(villagers)
    true_vf, true_policy = vampire_mdp.vi_vf_and_policy()
    pprint(true_vf)
    print(true_policy)
    lspi_vf, lspi_policy = vampire_mdp.lspi_vf_and_policy()
    pprint(lspi_vf)
    print(lspi_policy)

    states = range(1, villagers + 1)
    true_vf_vals = [true_vf[NonTerminal(s)] for s in states]
    lspi_vf_vals = [lspi_vf[NonTerminal(s)] for s in states]
    true_policy_actions = [true_policy.action_for[s] for s in states]
    lspi_policy_actions = [lspi_policy.action_for[s] for s in states]

    plot_list_of_curves(
        [states, states],
        [true_vf_vals, lspi_vf_vals],
        ["r", "b"],
        ["True Optimal VF", "LSPI-Estimated Optimal VF"],
        x_label="States",
        y_label="Optimal Values",
        title="True Optimal VF versus LSPI-Estimated Optimal VF"
    )
    plot_list_of_curves(
        [states, states],
        [true_policy_actions, lspi_policy_actions],
        ["r", "b"],
        ["True Optimal Policy", "LSPI-Estimated Optimal Policy"],
        x_label="States",
        y_label="Optimal Actions",
def deter_policy(state: S) -> A:
    return max(
        ((mdp.step(NonTerminal(state), a).expectation(return_), a)
         for a in mdp.actions(NonTerminal(state))),
        key=itemgetter(0)
    )[1]
def deter_policy(state: S) -> A:
    return max(
        ((res.expectation(return_), a)
         for a, res in step[NonTerminal(state)].items()),
        key=itemgetter(0)
    )[1]
import itertools
import rl.iterate as iterate

given_data: Sequence[Sequence[Tuple[str, float]]] = [
    [('A', 2.), ('A', 6.), ('B', 1.), ('B', 2.)],
    [('A', 3.), ('B', 2.), ('A', 4.), ('B', 2.), ('B', 0.)],
    [('B', 3.), ('B', 6.), ('A', 1.), ('B', 1.)],
    [('A', 0.), ('B', 2.), ('A', 4.), ('B', 4.), ('B', 2.), ('B', 3.)],
    [('B', 8.), ('B', 2.)]
]

gamma: float = 0.9

fixed_traces: Sequence[Sequence[TransitionStep[str]]] = \
    [[TransitionStep(
        state=NonTerminal(s),
        reward=r,
        next_state=NonTerminal(trace[i+1][0])
        if i < len(trace) - 1 else Terminal('T')
    ) for i, (s, r) in enumerate(trace)] for trace in given_data]

a: NonTerminal[str] = NonTerminal('A')
b: NonTerminal[str] = NonTerminal('B')

# fa: LinearFunctionApprox[NonTerminal[str]] = LinearFunctionApprox.create(
#     feature_functions=[
#         lambda x: 1.0 if x == a else 0.,
#         lambda y: 1.0 if y == b else 0.
#     ],
#     adam_gradient=AdamGradient(
#         learning_rate=0.1,
def next_state(state=state):
    switch_states = Bernoulli(self.p).sample()
    next_st: bool = not state.state if switch_states else state.state
    return NonTerminal(next_st)