Example #1
    def test_converge(self):
        def close(a, b):
            return abs(a - b) < 0.1

        ns = (1.0 / n for n in iterate(lambda x: x + 1, start=1))
        self.assertAlmostEqual(converged(ns, close), 0.33, places=2)

        ns = (1.0 / n for n in iterate(lambda x: x + 1, start=1))
        all_ns = [1.0, 0.5, 0.33]
        for got, expected in zip(converge(ns, close), all_ns):
            self.assertAlmostEqual(got, expected, places=2)
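
The iterate, converge and converged helpers that this test (and the examples below) rely on are not shown on this page. What follows is a minimal sketch consistent with the assertions above; the real module may differ in details.

# Minimal sketch of the helpers assumed above; the real module may differ.
from typing import Callable, Iterator, Optional, TypeVar

X = TypeVar('X')


def iterate(step: Callable[[X], X], start: X) -> Iterator[X]:
    # Yield start, step(start), step(step(start)), ... forever.
    state = start
    while True:
        yield state
        state = step(state)


def converge(values: Iterator[X], done: Callable[[X, X], bool]) -> Iterator[X]:
    # Yield values until two successive values satisfy done(a, b);
    # the value that triggers convergence is not yielded.
    a = next(values, None)
    if a is None:
        return
    yield a
    for b in values:
        if done(a, b):
            return
        a = b
        yield b


def converged(values: Iterator[X], done: Callable[[X, X], bool]) -> X:
    # Return the last value yielded by converge.
    result: Optional[X] = None
    for x in converge(values, done):
        result = x
    if result is None:
        raise ValueError("converged called on an empty iterator")
    return result
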
Example #2
def policy_iteration(
    mdp: MarkovDecisionProcess[S, A],
    γ: float,
    approx_v_0: FunctionApprox[S],
    non_terminal_states_distribution: Distribution[S],
    num_state_samples: int
) -> Iterator[Tuple[FunctionApprox[S], ThisPolicy[S, A]]]:
    '''Calculate the value function (V*) of the given MDP by repeatedly
    improving the policy after approximately evaluating it, using the given
    FunctionApprox and a random sample of the process' non-terminal states
    at each step.
    '''

    def update(vf_policy: Tuple[FunctionApprox[S], ThisPolicy[S, A]]) \
            -> Tuple[FunctionApprox[S], ThisPolicy[S, A]]:

        nt_states: Sequence[S] = non_terminal_states_distribution\
            .sample_n(num_state_samples)

        vf, pi = vf_policy
        mrp: MarkovRewardProcess[S] = mdp.apply_policy(pi)
        new_vf: FunctionApprox[S] = converged(
            evaluate_mrp(mrp, γ, vf, non_terminal_states_distribution, num_state_samples),
            done=lambda a, b: a.within(b, 1e-4)
        )

        def return_(s_r: Tuple[S, float]) -> float:
            s1, r = s_r
            return r + γ * new_vf.evaluate([s1]).item()

        return (new_vf.update([(s, max(mdp.step(s, a).expectation(return_)
                                       for a in mdp.actions(s))) for s in nt_states]),
                ThisPolicy(mdp, return_))

    def return_(s_r: Tuple[S, float]) -> float:
        s1, r = s_r
        return r + γ * approx_v_0.evaluate([s1]).item()

    return iterate(update, (approx_v_0, ThisPolicy(mdp, return_)))
def value_iteration(
    mdp: MarkovDecisionProcess[S, A],
    γ: float,
    approx_0: FunctionApprox[S],
    non_terminal_states_distribution: Distribution[S],
    num_state_samples: int,
) -> Iterator[FunctionApprox[S]]:
    """Iteratively calculate the Optimal Value function for the given
    Markov Decision Process, using the given FunctionApprox to approximate the
    Optimal Value function at each step for a random sample of the process'
    non-terminal states.

    """

    def update(v: FunctionApprox[S]) -> FunctionApprox[S]:
        nt_states: Sequence[S] = non_terminal_states_distribution.sample_n(
            num_state_samples
        )

        def return_(s_r: Tuple[S, float]) -> float:
            s1, r = s_r
            return r + γ * v.evaluate([s1]).item()

        return v.update(
            [
                (s, max(mdp.step(s, a).expectation(return_) for a in mdp.actions(s)))
                for s in nt_states
            ]
        )

    return iterate(update, approx_0)
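
A possible way to use the iterator returned by value_iteration above, stopping once successive approximations agree to within a tolerance. my_mdp, my_approx_0 and my_state_distribution are placeholders for objects the caller must construct; the tolerance mirrors the one used in policy_iteration above.

# Hypothetical usage sketch; my_mdp, my_approx_0 and my_state_distribution
# are placeholders, not objects defined on this page.
opt_vf = converged(
    value_iteration(
        mdp=my_mdp,
        γ=0.9,
        approx_0=my_approx_0,
        non_terminal_states_distribution=my_state_distribution,
        num_state_samples=1000
    ),
    done=lambda a, b: a.within(b, 1e-4)
)
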
Example #4
def policy_iteration(
    mdp: FiniteMarkovDecisionProcess[S, A],
    gamma: float,
    matrix_method_for_mrp_eval: bool = False
) -> Iterator[Tuple[V[S], FinitePolicy[S, A]]]:
    '''Calculate the value function (V*) of the given MDP by improving
    the policy repeatedly after evaluating the value function for a policy
    '''

    def update(vf_policy: Tuple[V[S], FinitePolicy[S, A]])\
            -> Tuple[V[S], FinitePolicy[S, A]]:

        vf, pi = vf_policy
        mrp: FiniteMarkovRewardProcess[S] = mdp.apply_finite_policy(pi)
        policy_vf: V[S] = {mrp.non_terminal_states[i]: v for i, v in
                           enumerate(mrp.get_value_function_vec(gamma))}\
            if matrix_method_for_mrp_eval else evaluate_mrp_result(mrp, gamma)
        improved_pi: FinitePolicy[S, A] = greedy_policy_from_vf(
            mdp, policy_vf, gamma)

        return policy_vf, improved_pi

    v_0: V[S] = {s: 0.0 for s in mdp.non_terminal_states}
    pi_0: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(mdp.actions(s)))
         for s in mdp.non_terminal_states})
    return iterate(update, (v_0, pi_0))
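
A possible way to drive the finite policy iteration above to convergence: compare the value functions of successive (value function, policy) pairs and stop once they agree within a small tolerance. my_finite_mdp and the tolerance are placeholders, not objects defined on this page.

# Hypothetical usage sketch; my_finite_mdp is a placeholder
# FiniteMarkovDecisionProcess constructed by the caller.
def vf_almost_equal(x: Tuple[V[S], FinitePolicy[S, A]],
                    y: Tuple[V[S], FinitePolicy[S, A]]) -> bool:
    # Compare successive iterates by their value functions only.
    v1, v2 = x[0], y[0]
    return max(abs(v1[s] - v2[s]) for s in v1) < 1e-5


opt_vf, opt_policy = converged(
    policy_iteration(my_finite_mdp, gamma=0.9),
    done=vf_almost_equal
)
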
def evaluate_mrp(
    mrp: MarkovRewardProcess[S],
    γ: float,
    approx_0: FunctionApprox[S],
    non_terminal_states_distribution: Distribution[S],
    num_state_samples: int,
) -> Iterator[FunctionApprox[S]]:
    """Iteratively calculate the value function for the given Markov Reward
    Process, using the given FunctionApprox to approximate the value function
    at each step for a random sample of the process' non-terminal states.
    """

    def update(v: FunctionApprox[S]) -> FunctionApprox[S]:
        nt_states: Sequence[S] = non_terminal_states_distribution.sample_n(
            num_state_samples
        )

        def return_(s_r: Tuple[S, float]) -> float:
            s1, r = s_r
            return r + γ * v.evaluate([s1]).item()

        return v.update(
            [(s, mrp.transition_reward(s).expectation(return_)) for s in nt_states]
        )

    return iterate(update, approx_0)
def value_iteration_finite(
    mdp: FiniteMarkovDecisionProcess[S, A],
    γ: float,
    approx_0: FunctionApprox[S]
) -> Iterator[FunctionApprox[S]]:
    '''Iteratively calculate the Optimal Value function for the given finite
    Markov Decision Process, using the given FunctionApprox to approximate the
    Optimal Value function at each step

    '''
    def update(v: FunctionApprox[S]) -> FunctionApprox[S]:

        def return_(s_r: Tuple[S, float]) -> float:
            s1, r = s_r
            return r + γ * v.evaluate([s1]).item()

        return v.update(
            [(
                s,
                max(mdp.mapping[s][a].expectation(return_)
                    for a in mdp.actions(s))
            ) for s in mdp.non_terminal_states]
        )

    return iterate(update, approx_0)
def value_iteration(
        mdp: MarkovDecisionProcess[S, A], γ: float,
        approx_0: ValueFunctionApprox[S],
        non_terminal_states_distribution: NTStateDistribution[S],
        num_state_samples: int) -> Iterator[ValueFunctionApprox[S]]:
    '''Iteratively calculate the Optimal Value function for the given
    Markov Decision Process, using the given FunctionApprox to approximate the
    Optimal Value function at each step for a random sample of the process'
    non-terminal states.

    '''
    def update(v: ValueFunctionApprox[S]) -> ValueFunctionApprox[S]:
        nt_states: Sequence[NonTerminal[S]] = \
            non_terminal_states_distribution.sample_n(num_state_samples)

        def return_(s_r: Tuple[State[S], float]) -> float:
            s1, r = s_r
            return r + γ * extended_vf(v, s1)

        return v.update([
            (s,
             max(mdp.step(s, a).expectation(return_) for a in mdp.actions(s)))
            for s in nt_states
        ])

    return iterate(update, approx_0)
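
The extended_vf helper used in return_ above is not defined on this page. The assumption here is that it evaluates the approximation at non-terminal states and treats terminal states as having value zero; a minimal sketch under that assumption:

# Assumed behaviour of extended_vf: zero for terminal states, the
# approximation's prediction otherwise.
def extended_vf(v: ValueFunctionApprox[S], s: State[S]) -> float:
    if isinstance(s, NonTerminal):
        return v.evaluate([s]).item()
    return 0.0
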
Example #8
def evaluate_mrp(mrp: FiniteMarkovRewardProcess[S],
                 gamma: float) -> Iterator[np.ndarray]:
    """Iteratively calculate the value function for the give Markov reward
    process.
    """
    def update(v: np.ndarray) -> np.ndarray:
        return mrp.reward_function_vec + gamma * mrp.get_transition_matrix(
        ).dot(v)

    v_0: np.ndarray = np.zeros(len(mrp.non_terminal_states))

    return iterate(update, v_0)
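
A possible convergence check for the vector-valued iteration above, stopping when successive value vectors are numerically close. my_finite_mrp and the tolerance are placeholders, not objects defined on this page.

# Hypothetical usage sketch; my_finite_mrp is a placeholder
# FiniteMarkovRewardProcess constructed by the caller.
v_star: np.ndarray = converged(
    evaluate_mrp(my_finite_mrp, gamma=0.9),
    done=lambda a, b: np.allclose(a, b, atol=1e-6)
)
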
Example #9
def evaluate_finite_mrp(
        mrp: FiniteMarkovRewardProcess[S], γ: float,
        approx_0: FunctionApprox[S]) -> Iterator[FunctionApprox[S]]:
    '''Iteratively calculate the value function for the given finite Markov
    Reward Process, using the given FunctionApprox to approximate the value
    function at each step.
    '''
    def update(v: FunctionApprox[S]) -> FunctionApprox[S]:
        vs: np.ndarray = v.evaluate(mrp.non_terminal_states)
        updated: np.ndarray = mrp.reward_function_vec + γ * \
            mrp.get_transition_matrix().dot(vs)
        return v.update(zip(mrp.non_terminal_states, updated))

    return iterate(update, approx_0)
Example #10
def value_iteration(mdp: FiniteMarkovDecisionProcess[S, A],
                    gamma: float) -> Iterator[V[S]]:
    """Calculate the value function (V*) of the given MDP by applying the
    update function repeatedly until the values converge.
    """
    def update(v: V[S]) -> V[S]:
        return {
            s: max(mdp.mapping[s][a].expectation(
                lambda s_r: s_r[1] + gamma * v.get(s_r[0], 0.0))
                   for a in mdp.actions(s))
            for s in v
        }

    v_0: V[S] = {s: 0.0 for s in mdp.non_terminal_states}
    return iterate(update, v_0)
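
A possible way to use the tabular value iteration above: run it to a fixed point, then extract a greedy policy with greedy_policy_from_vf, the same helper the finite policy_iteration above relies on. my_finite_mdp and the tolerance are placeholders, not objects defined on this page.

# Hypothetical usage sketch; my_finite_mdp is a placeholder
# FiniteMarkovDecisionProcess constructed by the caller.
opt_vf = converged(
    value_iteration(my_finite_mdp, gamma=0.9),
    done=lambda a, b: max(abs(a[s] - b[s]) for s in a) < 1e-5
)
opt_policy = greedy_policy_from_vf(my_finite_mdp, opt_vf, 0.9)
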
Example #11
def approximate_policy_evaluation(mdp: FiniteMarkovDecisionProcess[S, A],
                                  policy: FinitePolicy[S, A],
                                  vf: FunctionApprox[S],
                                  gamma: float) -> Iterator[FunctionApprox[S]]:
    def update(v: FunctionApprox[S]) -> FunctionApprox[S]:
        def return_(s_r: Tuple[S, float]) -> float:
            s1, r = s_r
            return r + gamma * v.evaluate([s1]).item()

        return v.update([
            (s, mdp.mapping[s][policy.policy_map[s]].expectation(return_))
            for s in mdp.non_terminal_states
        ])

    return iterate(update, vf)
def evaluate_mrp(mrp: MarkovRewardProcess[S], γ: float,
                 approx_0: ValueFunctionApprox[S],
                 non_terminal_states_distribution: NTStateDistribution[S],
                 num_state_samples: int) -> Iterator[ValueFunctionApprox[S]]:
    '''Iteratively calculate the value function for the given Markov Reward
    Process, using the given FunctionApprox to approximate the value function
    at each step for a random sample of the process' non-terminal states.

    '''
    def update(v: ValueFunctionApprox[S]) -> ValueFunctionApprox[S]:
        nt_states: Sequence[NonTerminal[S]] = \
            non_terminal_states_distribution.sample_n(num_state_samples)

        def return_(s_r: Tuple[State[S], float]) -> float:
            s1, r = s_r
            return r + γ * extended_vf(v, s1)

        return v.update([(s, mrp.transition_reward(s).expectation(return_))
                         for s in nt_states])

    return iterate(update, approx_0)
Example #13
def policy_iteration(
    mdp: FiniteMarkovDecisionProcess[S, A], gamma: float,
    approx0: FunctionApprox[S]
) -> Iterator[Tuple[FunctionApprox[S], FinitePolicy[S, A]]]:
    '''Calculate the value function (V*) of the given MDP by improving
    the policy repeatedly after evaluating the value function for a policy
    '''

    def update(vf_policy: Tuple[FunctionApprox[S], FinitePolicy[S, A]])\
            -> Tuple[FunctionApprox[S], FinitePolicy[S, A]]:

        vf, pi = vf_policy
        mrp: FiniteMarkovRewardProcess[S] = mdp.apply_finite_policy(pi)
        #policy_vf: FunctionApprox[S] = approximate_policy_evaluation_result(mdp,pi,vf)
        policy_vf: FunctionApprox[S] = evaluate_mrp_result(mrp, gamma, vf)
        improved_pi: FinitePolicy[S, A] = greedy_policy_from_approx_vf(
            mdp, policy_vf, gamma)
        return policy_vf, improved_pi

    pi_0: FinitePolicy[S, A] = FinitePolicy(
        {s: Choose(set(mdp.actions(s)))
         for s in mdp.non_terminal_states})
    return iterate(update, (approx0, pi_0))
Example #14
    def test_iterate(self):
        ns = iterate(lambda x: x + 1, start=0)
        self.assertEqual(list(itertools.islice(ns, 5)), list(range(0, 5)))