Example #1
def value_iteration_optimality(mdp, gamma):
    """Run value iteration to convergence and return the optimal value
    function together with its greedy policy."""
    it = value_iteration(mdp, gamma)
    vf1 = next(it)
    vf2 = next(it)
    # Iterate until two successive value functions are identical.
    while max(abs(np.array(list(vf2.values())) -
                  np.array(list(vf1.values())))) != 0.0:
        vf1, vf2 = vf2, next(it)
    opt_vf = vf2
    opt_policy = greedy_policy_from_vf(mdp, opt_vf, gamma)
    return (opt_vf, opt_policy)
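
All four examples rely on two helpers from the surrounding codebase: value_iteration, a generator that yields successive value-function dicts, and greedy_policy_from_vf, which extracts the greedy policy for a given value function. For readers without that codebase, the sketch below shows one way such helpers could look; the dict-based transition format ({state: {action: [(next_state, probability, reward), ...]}}) and the function bodies are assumptions for illustration, not the library's actual implementation.

def value_iteration(transitions, gamma):
    """Yield successive value-function dicts via the Bellman optimality backup.

    Assumed (hypothetical) format: transitions[s][a] is a list of
    (next_state, probability, reward) triples, and every next_state also
    appears as a key of transitions.
    """
    vf = {s: 0.0 for s in transitions}
    while True:
        yield dict(vf)
        vf = {
            s: max(
                sum(p * (r + gamma * vf[s1]) for s1, p, r in outcomes)
                for outcomes in actions.values()
            )
            for s, actions in transitions.items()
        }


def greedy_policy_from_vf(transitions, vf, gamma):
    """Return a {state: action} dict that acts greedily with respect to vf."""
    return {
        s: max(
            actions,
            key=lambda a: sum(p * (r + gamma * vf[s1]) for s1, p, r in actions[a]),
        )
        for s, actions in transitions.items()
    }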
Example #2
def policy_iteration(
    mdp: FiniteMarkovDecisionProcess,
    gamma: float,
    tolerance: float,
    max_iters: int
) -> Tuple[V[S], FinitePolicy[S, A]]:
    """Implement policy iteration on a finite MDP.

    :param mdp: Object representation of a finite Markov decision process
    :param gamma: Discount factor
    :param tolerance: Maximum change in the value function between successive
        iterations at which to declare convergence
    :param max_iters: Maximum number of iterations to allow
    :returns: Tuple of (optimal value function, optimal policy)
    """
    vf, pi = initialize(mdp)
    n_iter = 0

    while True:

        n_iter += 1
        delta = 0
        v = vf.copy()  # value function from the previous iteration
        # MRP induced by following the current policy
        mrp: FiniteMarkovRewardProcess[S] = mdp.apply_finite_policy(pi)

        # Policy evaluation: solve the induced MRP exactly for its value function
        vf: V[S] = {mrp.non_terminal_states[i]: v for i, v in enumerate(
            mrp.get_value_function_vec(gamma)
        )}
        # Largest change in value across states since the previous iteration
        diffs = np.absolute(np.subtract(list(vf.values()), list(v.values())))
        diffs = np.append(diffs, delta)
        delta = np.max(diffs)

        # Policy improvement
        pi: FinitePolicy[S, A] = dp.greedy_policy_from_vf(
            mdp, vf, gamma
        )

        if n_iter == max_iters:
            print("Maximum iterations reached.")
            return vf, pi
        if delta < tolerance:
            return vf, pi
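
For context, a call to policy_iteration might look like the sketch below. It assumes the snippet runs in the same module (so the initialize and dp helpers used above are in scope) and that FiniteMarkovDecisionProcess and Categorical come from an RL-book-style rl library; the import paths, constructor format, and the toy MDP itself are assumptions, not part of the original example.

from rl.distribution import Categorical
from rl.markov_decision_process import FiniteMarkovDecisionProcess

# Hypothetical two-state MDP: {state: {action: Categorical over (next_state, reward)}}.
toy_mdp = FiniteMarkovDecisionProcess({
    "s0": {
        "stay": Categorical({("s0", 0.0): 1.0}),
        "go": Categorical({("s1", 1.0): 1.0}),
    },
    "s1": {
        "stay": Categorical({("s1", 0.0): 1.0}),
    },
})

# Run policy iteration until the value function changes by less than 1e-6.
opt_vf, opt_pi = policy_iteration(toy_mdp, gamma=0.9, tolerance=1e-6, max_iters=100)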
Example #3
def frog_escape_value_iteration(
    frog_MDP: fe.FrogProblemMDP,
    gamma: float = 1,
    max_iters: int = 1000
) -> Tuple[Tuple[dp.V[dp.S], mdp.FinitePolicy[dp.S, dp.A]], np.ndarray]:
    """Solve the 'Frog Escape' problem by value iteration.

    :param frog_MDP: MDP representation of the 'Frog Escape' problem.
    :param gamma: Discount factor (default 1)
    :param max_iters: Maximum number of value iterations to execute (default
        1000)
    :returns: Tuple of (optimal value function, optimal greedy policy), plus
        the per-iteration convergence errors
    """
    value_iterator: Iterator[dp.V[dp.S]] = dp.value_iteration(mdp=frog_MDP,
                                                              gamma=gamma)
    vf_errors = iterate(iterator=value_iterator, max_iters=max_iters)
    opt_vf: dp.V[dp.S] = vf_errors[0][0]
    pi: mdp.FinitePolicy[dp.S, dp.A] = dp.greedy_policy_from_vf(
        mdp=frog_MDP, vf=opt_vf, gamma=gamma
    )
    return (opt_vf, pi), vf_errors[1]
Example #4
    def solution(
        model: FiniteMarkovDecisionProcess[Coordinate, Action], gamma: float
    ) -> Tuple[int, Mapping[Coordinate, float],
               FinitePolicy[Coordinate, Action]]:
        """Run value iteration until successive value functions differ by less
        than TOLERANCE; return the iteration count, the converged value
        function, and its greedy policy."""
        count = 0
        v = value_iteration(model, gamma)
        a = next(v)
        while True:
            b = next(v)
            # Stop once the largest change across states is below TOLERANCE.
            if max(abs(a[s] - b[s]) for s in a) < TOLERANCE:
                break
            a = b
            count += 1

        opt_policy: FinitePolicy[Coordinate, Action] = greedy_policy_from_vf(
            model, b, gamma)
        return count, b, opt_policy
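
As a small end-to-end check of the convergence loop these examples share, the snippet below runs the dict-based helpers sketched after Example #1 on a made-up two-state MDP; the states, actions, and rewards are purely illustrative.

# Made-up MDP: in "s0" the agent can "stay" (reward 0) or "go" to "s1" (reward 1);
# "s1" is absorbing with zero reward.
toy_transitions = {
    "s0": {"stay": [("s0", 1.0, 0.0)], "go": [("s1", 1.0, 1.0)]},
    "s1": {"stay": [("s1", 1.0, 0.0)]},
}

it = value_iteration(toy_transitions, gamma=0.9)
prev, curr = next(it), next(it)
# Tolerance-based stopping rule, as in Examples #2 and #4.
while max(abs(curr[s] - prev[s]) for s in curr) > 1e-10:
    prev, curr = curr, next(it)

print(curr)                                                      # ~{'s0': 1.0, 's1': 0.0}
print(greedy_policy_from_vf(toy_transitions, curr, gamma=0.9))   # {'s0': 'go', 's1': 'stay'}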