Example #1
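# Runs GLIE Monte Carlo control with a decaying learning rate on a finite MDP,
# takes the Q-value function after num_episodes episodes, extracts the value
# function and policy implied by the learned Q-values, and prints them
# alongside the value-iteration solution for comparison.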
def glie_mc_finite_learning_rate_correctness(
        fmdp: FiniteMarkovDecisionProcess[S, A], initial_learning_rate: float,
        half_life: float, exponent: float, gamma: float,
        epsilon_as_func_of_episodes: Callable[[int], float],
        episode_length_tolerance: float, num_episodes: int) -> None:
    qvfs: Iterator[QValueFunctionApprox[S, A]] = \
        glie_mc_finite_control_learning_rate(
            fmdp=fmdp,
            initial_learning_rate=initial_learning_rate,
            half_life=half_life,
            exponent=exponent,
            gamma=gamma,
            epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
            episode_length_tolerance=episode_length_tolerance
        )
    final_qvf: QValueFunctionApprox[S, A] = \
        iterate.last(itertools.islice(qvfs, num_episodes))
    opt_vf, opt_policy = get_vf_and_policy_from_qvf(mdp=fmdp, qvf=final_qvf)

    print(f"GLIE MC Optimal Value Function with {num_episodes:d} episodes")
    pprint(opt_vf)
    print(f"GLIE MC Optimal Policy with {num_episodes:d} episodes")
    print(opt_policy)

    true_opt_vf, true_opt_policy = value_iteration_result(fmdp, gamma=gamma)

    print("True Optimal Value Function")
    pprint(true_opt_vf)
    print("True Optimal Policy")
    print(true_opt_policy)
Example #2
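    # Evaluates the flip-flop MRP with tabular TD prediction on reward traces
    # truncated to 20 steps each, then checks that both state values land
    # within a loose bound of the expected value (~170).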
    def test_evaluate_finite_mrp(self) -> None:
        start = Tabular(
            {s: 0.0
             for s in self.finite_flip_flop.states()},
            count_to_weight_func=lambda _: 0.1,
        )

        episode_length = 20
        episodes: Iterable[Iterable[
            mp.TransitionStep[bool]]] = self.finite_flip_flop.reward_traces(
                Choose({True, False}))
        transitions: Iterable[
            mp.TransitionStep[bool]] = itertools.chain.from_iterable(
                itertools.islice(episode, episode_length)
                for episode in episodes)

        vs = td.td_prediction(transitions, γ=0.99, approx_0=start)

        v: Optional[Tabular[bool]] = iterate.last(
            itertools.islice(cast(Iterator[Tabular[bool]], vs), 10000))

        if v is not None:
            self.assertEqual(len(v.values_map), 2)

            for s in v.values_map:
                # Intentionally loose bound; a tighter bound makes the test
                # too slow (>1s on my machine).
                self.assertLess(abs(v(s) - 170), 3.0)
        else:
            assert False
Example #3
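# Q-Learning analogue of the GLIE MC check above: runs Q-Learning with a
# decaying learning rate for num_updates updates and prints the resulting
# value function and policy alongside the value-iteration solution.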
def q_learning_finite_learning_rate_correctness(
    fmdp: FiniteMarkovDecisionProcess[S, A],
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    gamma: float,
    epsilon: float,
    max_episode_length: int,
    num_updates: int,
) -> None:
    qvfs: Iterator[QValueFunctionApprox[S, A]] = \
        q_learning_finite_learning_rate(
            fmdp=fmdp,
            initial_learning_rate=initial_learning_rate,
            half_life=half_life,
            exponent=exponent,
            gamma=gamma,
            epsilon=epsilon,
            max_episode_length=max_episode_length
        )
    final_qvf: QValueFunctionApprox[S, A] = \
        iterate.last(itertools.islice(qvfs, num_updates))
    opt_vf, opt_policy = get_vf_and_policy_from_qvf(mdp=fmdp, qvf=final_qvf)

    print(f"Q-Learning ptimal Value Function with {num_updates:d} updates")
    pprint(opt_vf)
    print(f"Q-Learning Optimal Policy with {num_updates:d} updates")
    print(opt_policy)

    true_opt_vf, true_opt_policy = value_iteration_result(fmdp, gamma=gamma)

    print("True Optimal Value Function")
    pprint(true_opt_vf)
    print("True Optimal Policy")
    print(true_opt_policy)
Example #4
def mc_prediction(
    traces: Iterable[Iterable[mp.TransitionStep[S]]],
    approx_0: ValueFunctionApprox[S],
    γ: float,
    episode_length_tolerance: float = 1e-6
) -> Iterator[ValueFunctionApprox[S]]:
    '''Evaluate an MRP using the Monte Carlo method, simulating episodes
    until the discount weight γᵏ drops below episode_length_tolerance.

    Each value this function yields represents the approximated value
    function for the MRP after one additional episode.

    Arguments:
      traces -- an iterator of simulation traces from an MRP
      approx_0 -- initial approximation of value function
      γ -- discount rate (0 < γ ≤ 1)
      episode_length_tolerance -- stop iterating once γᵏ ≤ tolerance

    Returns an iterator with updates to the approximated value
    function after each episode.

    '''
    episodes: Iterator[Iterator[mp.ReturnStep[S]]] = \
        (returns(trace, γ, episode_length_tolerance) for trace in traces)
    f = approx_0
    yield f

    for episode in episodes:
        f = last(
            f.iterate_updates([(step.state, step.return_)]
                              for step in episode))
        yield f
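# Hypothetical usage sketch: assuming some FiniteMarkovRewardProcess instance
# fmrp, and the Tabular, Choose, iterate, itertools and pprint helpers used in
# the surrounding examples, the iterator returned by mc_prediction is
# typically consumed like this:
episodes = fmrp.reward_traces(Choose(fmrp.non_terminal_states))
value_functions = mc_prediction(traces=episodes, approx_0=Tabular(), γ=0.9)
final_vf = iterate.last(itertools.islice(value_functions, 1000))
pprint(final_vf.values_map)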
Example #5
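# Runs Monte Carlo prediction with a decaying learning rate on a finite MRP
# for num_episodes episodes and prints the estimate next to the exact value
# function of the MRP.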
def mc_finite_learning_rate_correctness(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    tolerance: float,
    num_episodes: int,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[S, float],
) -> None:
    mc_vfs: Iterator[FunctionApprox[S]] = mc_finite_prediction_learning_rate(
        fmrp=fmrp,
        gamma=gamma,
        tolerance=tolerance,
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent,
        initial_vf_dict=initial_vf_dict,
    )
    final_mc_vf: FunctionApprox[S] = iterate.last(
        itertools.islice(mc_vfs, num_episodes)
    )
    print(
        "Decaying-Learning-Rate-MC Value Function with " + f"{num_episodes:d} episodes"
    )
    pprint({s: round(final_mc_vf(s), 3) for s in fmrp.non_terminal_states})
    print("True Value Function")
    fmrp.display_value_function(gamma=gamma)
Example #6
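# Runs TD(lambda) prediction with a decaying learning rate on a finite MRP,
# takes the value function after episode_length * num_episodes iterates, and
# prints it next to the exact value function.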
def td_lambda_finite_learning_rate_correctness(
    fmrp: FiniteMarkovRewardProcess[S],
    gamma: float,
    lambd: float,
    episode_length: int,
    num_episodes: int,
    initial_learning_rate: float,
    half_life: float,
    exponent: float,
    initial_vf_dict: Mapping[NonTerminal[S], float]
) -> None:
    td_lambda_vfs: Iterator[ValueFunctionApprox[S]] = \
        td_lambda_finite_prediction_learning_rate(
            fmrp=fmrp,
            gamma=gamma,
            lambd=lambd,
            episode_length=episode_length,
            initial_learning_rate=initial_learning_rate,
            half_life=half_life,
            exponent=exponent,
            initial_vf_dict=initial_vf_dict
        )
    final_td_lambda_vf: ValueFunctionApprox[S] = \
        iterate.last(itertools.islice(
            td_lambda_vfs,
            episode_length * num_episodes
        ))
    print("Decaying-Learning-Rate-TD-Lambda Value Function with " +
          f"{num_episodes:d} episodes")
    pprint({s: round(final_td_lambda_vf(s), 3)
            for s in fmrp.non_terminal_states})
    print("True Value Function")
    fmrp.display_value_function(gamma=gamma)
Example #7
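    # Runs tabular TD control on transitions generated by simulating a uniform
    # random policy on a two-state finite MDP, then checks the magnitudes and
    # ordering of the four learned Q-values.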
    def test_evaluate_finite_mdp(self) -> None:
        q_0: Tabular[Tuple[bool, bool]] = Tabular(
            {(s, a): 0.0
             for s in self.finite_mdp.states()
             for a in self.finite_mdp.actions(s)},
            count_to_weight_func=lambda _: 0.1,
        )

        uniform_policy: mdp.Policy[bool, bool] = mdp.FinitePolicy({
            s: Choose(self.finite_mdp.actions(s))
            for s in self.finite_mdp.states()
        })

        transitions: Iterable[mdp.TransitionStep[
            bool, bool]] = self.finite_mdp.simulate_actions(
                Choose(self.finite_mdp.states()), uniform_policy)

        qs = td.td_control(transitions, self.finite_mdp.actions, q_0, γ=0.99)

        q: Optional[Tabular[Tuple[bool, bool]]] = iterate.last(
            cast(Iterator[Tabular[Tuple[bool, bool]]],
                 itertools.islice(qs, 20000)))

        if q is not None:
            self.assertEqual(len(q.values_map), 4)

            for s in [True, False]:
                self.assertLess(abs(q((s, False)) - 170.0), 2)
                self.assertGreater(q((s, False)), q((s, True)))
        else:
            assert False
Example #8
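# Thin wrapper: runs tabular Monte Carlo prediction over num_episodes episodes
# and returns the resulting state-value dictionary (values_map).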
def mc_prediction(episodes_stream: Iterator[Sequence[TransitionStep[S]]],
                  gamma: float, num_episodes: int) -> Mapping[S, float]:
    return iterate.last(
        itertools.islice(
            mc.mc_prediction(traces=episodes_stream,
                             approx_0=Tabular(),
                             γ=gamma,
                             tolerance=1e-10), num_episodes)).values_map
Example #9
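# Thin wrapper: runs tabular TD prediction with a decaying learning-rate
# schedule over num_experiences transitions and returns the resulting
# state-value dictionary.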
def td_prediction(experiences_stream: Iterator[TransitionStep[S]],
                  gamma: float, num_experiences: int) -> Mapping[S, float]:
    return iterate.last(
        itertools.islice(
            td.td_prediction(
                transitions=experiences_stream,
                approx_0=Tabular(count_to_weight_func=learning_rate_schedule(
                    initial_learning_rate=0.01, half_life=10000,
                    exponent=0.5)),
                γ=gamma), num_experiences)).values_map
Example #10
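# Runs equal-weights (plain averaging) Monte Carlo prediction on a finite MRP
# for num_episodes episodes and prints the estimate next to the exact value
# function.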
def mc_finite_equal_wts_correctness(
        fmrp: FiniteMarkovRewardProcess[S], gamma: float, tolerance: float,
        num_episodes: int, initial_vf_dict: Mapping[S, float]) -> None:
    mc_vfs: Iterator[FunctionApprox[S]] = \
        mc_finite_prediction_equal_wts(
            fmrp=fmrp,
            gamma=gamma,
            tolerance=tolerance,
            initial_vf_dict=initial_vf_dict
        )
    final_mc_vf: FunctionApprox[S] = \
        iterate.last(itertools.islice(mc_vfs, num_episodes))
    print(f"Equal-Weights-MC Value Function with {num_episodes:d} episodes")
    pprint({s: round(final_mc_vf(s), 3) for s in fmrp.non_terminal_states})
    print("True Value Function")
    fmrp.display_value_function(gamma=gamma)
Example #11
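    # Runs Q-Learning with epsilon-greedy exploration and an effectively
    # constant learning rate (half_life=1e8) on the finite MDP returned by
    # get_finite_mdp(), and returns the value function and policy implied by
    # the Q-values after num_updates updates.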
    def get_q_learning_vf_and_policy(
            self, epsilon: float, learning_rate: float,
            num_updates: int) -> Tuple[V[Cell], FinitePolicy[Cell, Move]]:
        qvfs: Iterator[FunctionApprox[Tuple[Cell, Move]]] = \
            q_learning_finite_learning_rate(
                fmdp=self.get_finite_mdp(),
                initial_learning_rate=learning_rate,
                half_life=1e8,
                exponent=1.0,
                gamma=1.0,
                epsilon=epsilon,
                max_episode_length=int(1e8)
            )
        final_qvf: FunctionApprox[Tuple[Cell, Move]] = \
            iterate.last(itertools.islice(qvfs, num_updates))
        return get_vf_and_policy_from_qvf(mdp=self.get_finite_mdp(),
                                          qvf=final_qvf)
Example #12
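    # Runs Q-Learning on externally generated transitions (a uniform random
    # policy simulated on a two-state finite MDP with NonTerminal-wrapped
    # states) and checks the magnitudes and ordering of the four learned
    # Q-values.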
    def test_evaluate_finite_mdp(self) -> None:
        q_0: Tabular[Tuple[NonTerminal[bool], bool]] = Tabular(
            {(s, a): 0.0
             for s in self.finite_mdp.non_terminal_states
             for a in self.finite_mdp.actions(s)},
            count_to_weight_func=lambda _: 0.1
        )

        uniform_policy: FinitePolicy[bool, bool] =\
            FinitePolicy({
                s.state: Choose(self.finite_mdp.actions(s))
                for s in self.finite_mdp.non_terminal_states
            })

        transitions: Iterable[mdp.TransitionStep[bool, bool]] =\
            self.finite_mdp.simulate_actions(
                Choose(self.finite_mdp.non_terminal_states),
                uniform_policy
            )

        qs = td.q_learning_external_transitions(
            transitions,
            self.finite_mdp.actions,
            q_0,
            γ=0.99
        )

        q: Optional[Tabular[Tuple[NonTerminal[bool], bool]]] =\
            iterate.last(
                cast(Iterator[Tabular[Tuple[NonTerminal[bool], bool]]],
                     itertools.islice(qs, 20000))
            )

        if q is not None:
            self.assertEqual(len(q.values_map), 4)

            for s in [NonTerminal(True), NonTerminal(False)]:
                self.assertLess(abs(q((s, False)) - 170.0), 2)
                self.assertGreater(q((s, False)), q((s, True)))
        else:
            assert False
Example #13
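    # Runs least-squares policy iteration on 50000 sampled transitions with
    # the given feature functions, takes the Q-value approximation after 100
    # iterations, and returns the implied value function and policy.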
    def lspi_vf_and_policy(self) -> \
            Tuple[V[int], FiniteDeterministicPolicy[int, int]]:
        transitions: Iterable[TransitionStep[int, int]] = itertools.islice(
            self.lspi_transitions(), 50000)
        qvf_iter: Iterator[LinearFunctionApprox[Tuple[
            NonTerminal[int], int]]] = least_squares_policy_iteration(
                transitions=transitions,
                actions=self.actions,
                feature_functions=self.lspi_features(4, 4),
                initial_target_policy=DeterministicPolicy(
                    lambda s: int(s / 2)),
                γ=1.0,
                ε=1e-5)
        qvf: LinearFunctionApprox[Tuple[NonTerminal[int], int]] = \
            iterate.last(
                itertools.islice(
                    qvf_iter,
                    100
                )
            )
        return get_vf_and_policy_from_qvf(self, qvf)
Example #14
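    # Runs GLIE SARSA with an effectively constant learning rate
    # (half_life=1e8) on the finite MDP returned by get_finite_mdp(), and
    # returns the value function and policy implied by the Q-values after
    # num_updates updates.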
    def get_glie_sarsa_vf_and_policy(
        self,
        epsilon_as_func_of_episodes: Callable[[int], float],
        learning_rate: float,
        num_updates: int
    ) -> Tuple[V[Cell], FiniteDeterministicPolicy[Cell, Move]]:
        qvfs: Iterator[QValueFunctionApprox[Cell, Move]] = \
            glie_sarsa_finite_learning_rate(
                fmdp=self.get_finite_mdp(),
                initial_learning_rate=learning_rate,
                half_life=1e8,
                exponent=1.0,
                gamma=1.0,
                epsilon_as_func_of_episodes=epsilon_as_func_of_episodes,
                max_episode_length=int(1e8)
            )
        final_qvf: QValueFunctionApprox[Cell, Move] = \
            iterate.last(itertools.islice(qvfs, num_updates))
        return get_vf_and_policy_from_qvf(
            mdp=self.get_finite_mdp(),
            qvf=final_qvf
        )
Example #15
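    # Checks that last returns the final element of an iterable and raises on
    # an empty one.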
    def test_last(self):
        self.assertEqual(last(range(0, 5)), 4)
        self.assertEqual(last(range(0, 10)), 9)

        self.assertRaises(Exception, lambda: last([]))
Example #16
# The head of this snippet is truncated; approx_0 below is presumably a Tabular
# value function approximation built with this learning-rate schedule:
approx_0: Tabular[NonTerminal[InventoryState]] = Tabular(
    count_to_weight_func=learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent
    )
)

episodes: Iterable[Iterable[TransitionStep[InventoryState]]] = \
    si_mrp.reward_traces(Choose(si_mrp.non_terminal_states))
traces: Iterable[Iterable[TransitionStep[InventoryState]]] = \
        (itertools.islice(episode, episode_length) for episode in episodes)

vf_iter: Iterator[Tabular[NonTerminal[InventoryState]]] = \
    lambda_return_prediction(
        traces=traces,
        approx_0=approx_0,
        γ=gamma,
        lambd=lambda_param
    )

vf: Tabular[NonTerminal[InventoryState]] = \
    iterate.last(itertools.islice(vf_iter, num_episodes))

pprint(vf.values_map)
si_mrp.display_value_function(gamma=gamma)
Example #17
# The head of this snippet is truncated; judging from the use of q_iter below,
# the call presumably assigns an iterator of Q-value function approximations:
q_iter: Iterator[QValueFunctionApprox[InventoryState, int]] = \
    q_learning_experience_replay(
        mdp=si_mdp,
        policy_from_q=lambda f, m: epsilon_greedy_policy(
            q=f,
            mdp=m,
            ϵ=epsilon
        ),
        states=Choose(si_mdp.non_terminal_states),
        approx_0=Tabular(
            count_to_weight_func=learning_rate_schedule(
                initial_learning_rate=initial_learning_rate,
                half_life=learning_rate_half_life,
                exponent=learning_rate_exponent
            )
        ),
        γ=gamma,
        max_episode_length=episode_length,
        mini_batch_size=mini_batch_size,
        weights_decay_half_life=time_decay_half_life
    )

qvf: QValueFunctionApprox[InventoryState, int] = iterate.last(
    itertools.islice(q_iter, num_updates))
vf, pol = get_vf_and_policy_from_qvf(mdp=si_mdp, qvf=qvf)
pprint(vf)
print(pol)

true_vf, true_pol = value_iteration_result(mdp=si_mdp, gamma=gamma)
pprint(true_vf)
print(true_pol)
Example #18
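    # Variant of the test_last above in which last([]) is expected to return
    # None rather than raise.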
    def test_last(self):
        self.assertEqual(last(range(0, 5)), 4)
        self.assertEqual(last(range(0, 10)), 9)

        self.assertEqual(last([]), None)
Example #19
# The head of this snippet is truncated; td_transitions below is presumably
# the sliced transition stream fed to TD prediction (mirroring
# lstd_transitions further down):
td_transitions: Iterable[TransitionStep[int]] = \
    itertools.islice(transitions, num_transitions)

initial_learning_rate: float = 0.5
half_life: float = 1000
exponent: float = 0.5
approx0: Tabular[NonTerminal[int]] = Tabular(
    count_to_weight_func=learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent))

td_func: Tabular[NonTerminal[int]] = \
    iterate.last(itertools.islice(
        td_prediction(
            transitions=td_transitions,
            approx_0=approx0,
            γ=gamma
        ),
        num_transitions
    ))
td_vf: np.ndarray = td_func.evaluate(nt_states)

num_polynomials: int = 5
features: Sequence[Callable[[NonTerminal[int]], float]] = \
    laguerre_state_features(num_polynomials)
lstd_transitions: Iterable[TransitionStep[int]] = \
    itertools.islice(transitions, num_transitions)
epsilon: float = 1e-4

lstd_func: LinearFunctionApprox[NonTerminal[int]] = \
    least_squares_td(
        transitions=lstd_transitions,
Example #20
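    # Tail of a larger script: evaluates the inventory MRP from a stream of
    # accumulated unit experiences, first with TD prediction on a Tabular
    # function approximation (decaying learning rate), then with a
    # from-scratch tabular routine (evaluate_mrp_dt), printing both estimates.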
        )
    num_episodes = 100000

    print("Value Function (TD Function Approximation)")
    print("--------------")
    initial_learning_rate: float = 0.03
    half_life: float = 1000.0
    exponent: float = 0.5
    learning_rate_func: Callable[[int], float] = learning_rate_schedule(
        initial_learning_rate=initial_learning_rate,
        half_life=half_life,
        exponent=exponent)
    td_vfs: Iterator[FunctionApprox[InventoryState]] = evaluate_mrp(
        transitions=unit_experiences_accumulated,
        approx_0=Tabular(count_to_weight_func=learning_rate_func),
        γ=user_gamma)
    final_td_vf: FunctionApprox[InventoryState] = \
        last(itertools.islice(td_vfs, episode_length * num_episodes))
    pprint({s: round(final_td_vf(s), 3) for s in si_mrp.non_terminal_states})
    print()

    print("Value Function (Tabular MC from scratch)")
    print("--------------")
    td_vfs: Iterator[Dict[InventoryState, float]] = evaluate_mrp_dt(
        transitions=unit_experiences_accumulated,
        vf={s: 0
            for s in si_mrp.non_terminal_states},
        γ=user_gamma)
    final_td_vf: Dict[InventoryState, float] = \
        last(itertools.islice(td_vfs, episode_length * num_episodes))
    pprint({s: round(final_td_vf[s], 3) for s in si_mrp.non_terminal_states})
Example #21
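# Feeds transitions drawn one at a time from an experience-replay memory over
# a fixed set of transitions into TD prediction for 100000 iterations, prints
# the resulting values for states A and B, and then computes a batch TD
# prediction on the same fixed transitions for comparison.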
replay: Iterator[Sequence[TransitionStep[str]]] = \
    exp_replay_memory.replay(fixed_transitions, 1)


def replay_transitions(replay=replay) -> Iterator[TransitionStep[str]]:
    while True:
        yield next(replay)[0]


num_iterations: int = 100000

td1_vf: ValueFunctionApprox[str] = iterate.last(
    itertools.islice(
        td_prediction(
            replay_transitions(),
            td_fa,
            gamma
        ),
        num_iterations
    )
)

print("Result of Batch TD1 Prediction")
print("V[A] = %.3f" % td1_vf(a))
print("V[B] = %.3f" % td1_vf(b))

td2_vf: ValueFunctionApprox[str] = batch_td_prediction(
    fixed_transitions,
    td_fa,
    gamma
)
Example #22
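# Tail of a script comparing value functions for the simple inventory MRP: the
# exact value function, a Monte Carlo estimate with a Tabular function
# approximation, and a from-scratch tabular MC estimate, each printed in turn.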
                                      holding_cost=user_holding_cost,
                                      stockout_cost=user_stockout_cost)

    print("Value Function (Exact)")
    print("--------------")
    si_mrp.display_value_function(gamma=user_gamma)
    print()

    print("Value Function (MC Function Approximation)")
    print("--------------")
    traces: Iterable[Iterable[TransitionStep[InventoryState]]] = \
        si_mrp.reward_traces(Choose(set(si_mrp.non_terminal_states)))
    it: Iterator[FunctionApprox[InventoryState]] = evaluate_mrp(
        traces=traces, approx_0=Tabular(), γ=user_gamma)
    num_traces = 10000
    last_vf_mc: FunctionApprox[InventoryState] = last(islice(it, num_traces))
    pprint({
        s: round(last_vf_mc.evaluate([s])[0], 3)
        for s in si_mrp.non_terminal_states
    })
    print()

    print("Value Function (Tabular MC from scratch)")
    print("--------------")
    traces: Iterable[Iterable[TransitionStep[InventoryState]]] = \
        si_mrp.reward_traces(Choose(set(si_mrp.non_terminal_states)))
    it: Iterator[Dict[InventoryState, float]] = evaluate_mrp_mc(
        traces=traces,
        vf={s: 0
            for s in si_mrp.non_terminal_states},
        γ=user_gamma)