def mc_prediction(
    episodes_stream: Iterator[Sequence[TransitionStep[S]]],
    gamma: float,
    num_episodes: int
) -> Mapping[S, float]:
    return iterate.last(
        itertools.islice(
            mc.evaluate_mrp(
                traces=episodes_stream,
                approx_0=Tabular(),
                γ=gamma,
                tolerance=1e-10
            ),
            num_episodes
        )
    ).values_map
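A quick way to exercise mc_prediction is with the episode stream from fmrp_episodes_stream (used by the helpers below). This is a sketch, not part of the function above: si_mrp and user_gamma are stand-ins for a finite MRP and discount factor defined elsewhere.

vf: Mapping[InventoryState, float] = mc_prediction(
    episodes_stream=fmrp_episodes_stream(si_mrp),  # stand-in finite MRP
    gamma=user_gamma,                              # stand-in discount factor
    num_episodes=1000
)
pprint(vf)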
def mc_finite_prediction_equal_wts(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    tolerance: float,
    initial_vf_dict: Mapping[S, float]
) -> Iterator[FunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    return mc.evaluate_mrp(
        traces=episodes,
        approx_0=Tabular(values_map=initial_vf_dict),
        γ=gamma,
        tolerance=tolerance
    )
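Because Tabular's default count-to-weight function is 1/n, each state's value here is the plain (equal-weight) average of the returns observed from it, which is what the name refers to. A sketch of consuming the returned iterator, with a placeholder episode budget and stand-in arguments:

num_episodes = 1000  # placeholder budget
final_vf: FunctionApprox[InventoryState] = iterate.last(
    itertools.islice(
        mc_finite_prediction_equal_wts(
            fmrp=si_mrp,       # stand-in FiniteMarkovRewardProcess
            gamma=user_gamma,  # stand-in discount factor
            tolerance=1e-6,
            initial_vf_dict={s: 0.0 for s in si_mrp.non_terminal_states}
        ),
        num_episodes
    )
)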
def mc_prediction_learning_rate(
    mrp: MarkovRewardProcess[S],
    start_state_distribution: Distribution[S],
    gamma: float,
    tolerance: float,
    initial_func_approx: FunctionApprox[S]
) -> Iterator[FunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        mrp_episodes_stream(mrp, start_state_distribution)
    return mc.evaluate_mrp(
        traces=episodes,
        approx_0=initial_func_approx,
        γ=gamma,
        tolerance=tolerance
    )
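This variant drops the finite-MRP requirement: episodes are sampled from any MarkovRewardProcess given a start-state distribution, and any FunctionApprox can serve as the initial estimate. A sketch with a Tabular that swaps 1/n averaging for a constant step size (all names below are placeholders):

vfs: Iterator[FunctionApprox[S]] = mc_prediction_learning_rate(
    mrp=some_mrp,                                        # stand-in MRP
    start_state_distribution=Choose(set(start_states)),  # stand-in distribution
    gamma=0.9,
    tolerance=1e-6,
    initial_func_approx=Tabular(
        count_to_weight_func=lambda n: 0.1  # constant learning rate
    )
)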
def test_evaluate_finite_mrp(self):
    start = Tabular(
        {s: 0.0 for s in self.finite_flip_flop.states()}
    )
    traces = self.finite_flip_flop.reward_traces(Choose({True, False}))
    v = iterate.converged(
        mc.evaluate_mrp(traces, γ=0.99, approx_0=start),
        # Loose bound of 0.025 to speed up the test.
        done=lambda a, b: a.within(b, 0.025)
    )

    self.assertEqual(len(v.values_map), 2)

    for s in v.values_map:
        # Intentionally loose bound; a tighter one makes the test
        # too slow (over a second on my machine).
        self.assertLess(abs(v(s) - 170), 1.0)
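The same convergence-driven pattern works outside a test harness: iterate.converged keeps pulling successive estimates until the done predicate holds for two consecutive ones. A sketch with a tighter tolerance (slower but more accurate; finite_flip_flop is the same stand-in process as in the test):

v_tight = iterate.converged(
    mc.evaluate_mrp(
        finite_flip_flop.reward_traces(Choose({True, False})),
        γ=0.99,
        approx_0=Tabular({s: 0.0 for s in finite_flip_flop.states()})
    ),
    # Tighter bound than the test's 0.025.
    done=lambda a, b: a.within(b, 1e-4)
)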
def mc_finite_prediction_learning_rate(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    tolerance: float,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[S, float]
) -> Iterator[FunctionApprox[S]]:
    episodes: Iterable[Iterable[TransitionStep[S]]] = \
        fmrp_episodes_stream(fmrp)
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
    return mc.evaluate_mrp(
        traces=episodes,
        approx_0=Tabular(
            values_map=initial_vf_dict,
            count_to_weight_func=learning_rate_func
        ),
        γ=gamma,
        tolerance=tolerance
    )
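For reference, one plausible shape for learning_rate_schedule is polynomial decay in the per-state update count n, where (for exponent 1) the step size halves after half_life updates. This is a sketch of the assumed formula, not necessarily the library's exact definition:

def learning_rate_schedule_sketch(
    initial_learning_rate: float,
    half_life: float,
    exponent: float
) -> Callable[[int], float]:
    # Assumed decay: alpha_n = alpha_0 * (1 + (n - 1) / half_life) ** -exponent
    def lr_func(n: int) -> float:
        return initial_learning_rate * \
            (1 + (n - 1) / half_life) ** -exponent
    return lr_func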
si_mrp = SimpleInventoryMRPFinite(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost
)

print("Value Function (Exact)")
print("--------------")
si_mrp.display_value_function(gamma=user_gamma)
print()

print("Value Function (MC Function Approximation)")
print("--------------")
traces: Iterable[Iterable[TransitionStep[InventoryState]]] = \
    si_mrp.reward_traces(Choose(set(si_mrp.non_terminal_states)))
it: Iterator[FunctionApprox[InventoryState]] = evaluate_mrp(
    traces=traces,
    approx_0=Tabular(),
    γ=user_gamma
)
num_traces = 10000
last_vf_mc: FunctionApprox[InventoryState] = last(islice(it, num_traces))
pprint({
    s: round(last_vf_mc.evaluate([s])[0], 3)
    for s in si_mrp.non_terminal_states
})
print()

print("Value Function (Tabular MC from scratch)")
print("--------------")
traces = si_mrp.reward_traces(Choose(set(si_mrp.non_terminal_states)))
it_mc: Iterator[Dict[InventoryState, float]] = evaluate_mrp_mc(
    traces=traces,
    vf={s: 0 for s in si_mrp.non_terminal_states},
    γ=user_gamma
)
# Consume the iterator the same way as above and display the estimate.
last_vf_mc_tab: Dict[InventoryState, float] = last(islice(it_mc, num_traces))
pprint({
    s: round(last_vf_mc_tab[s], 3)
    for s in si_mrp.non_terminal_states
})
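As a sanity check, one might compare the MC estimate against the exact value function displayed above. This sketch assumes get_value_function_vec returns values in the order of non_terminal_states (worth verifying against the library before relying on it):

import numpy as np

exact_vf: np.ndarray = si_mrp.get_value_function_vec(gamma=user_gamma)
mc_vf: np.ndarray = np.array(
    [last_vf_mc.evaluate([s])[0] for s in si_mrp.non_terminal_states]
)
rmse: float = float(np.sqrt(np.mean((mc_vf - exact_vf) ** 2)))
print(f"RMSE of MC estimate vs exact: {rmse:.4f}")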
unit_experiences_accumulated: Iterable[TransitionStep[InventoryState]] = \
    itertools.chain.from_iterable(
        itertools.islice(trace, episode_length)
        for trace in traces
    )
num_episodes = 100000

print("Value Function (TD Function Approximation)")
print("--------------")
initial_learning_rate: float = 0.03
half_life: float = 1000.0
exponent: float = 0.5
learning_rate_func: Callable[[int], float] = learning_rate_schedule(
    initial_learning_rate=initial_learning_rate,
    half_life=half_life,
    exponent=exponent
)
td_vfs: Iterator[FunctionApprox[InventoryState]] = evaluate_mrp(
    transitions=unit_experiences_accumulated,
    approx_0=Tabular(count_to_weight_func=learning_rate_func),
    γ=user_gamma
)
final_td_vf: FunctionApprox[InventoryState] = \
    last(itertools.islice(td_vfs, episode_length * num_episodes))
pprint({s: round(final_td_vf(s), 3) for s in si_mrp.non_terminal_states})
print()

print("Value Function (Tabular TD from scratch)")
print("--------------")
# The chained stream above is a one-shot iterator, so rebuild it
# before consuming it a second time.
transitions_tab: Iterable[TransitionStep[InventoryState]] = \
    itertools.chain.from_iterable(
        itertools.islice(trace, episode_length)
        for trace in si_mrp.reward_traces(
            Choose(set(si_mrp.non_terminal_states))
        )
    )
td_vfs_tab: Iterator[Dict[InventoryState, float]] = evaluate_mrp_dt(
    transitions=transitions_tab,
    vf={s: 0 for s in si_mrp.non_terminal_states},
    γ=user_gamma
)
final_td_vf_tab: Dict[InventoryState, float] = \
    last(itertools.islice(td_vfs_tab, episode_length * num_episodes))
pprint({s: round(final_td_vf_tab[s], 3) for s in si_mrp.non_terminal_states})
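A quick, purely illustrative check on the step-size taper configured above: printing the schedule at a few update counts confirms it decays smoothly from 0.03 as a state's update count grows.

for n in [1, 10, 100, 1000, 10000]:
    print(f"n={n}: learning rate = {learning_rate_func(n):.5f}")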