Example #1
    # Let's combine everything together

    # Complete rl_value_iteration()

    # Test rl_value_iteration()
    num_iter = 100  # Maximum iterations, excluding initialization
    min_difference = 0.001  # Stop value iteration once new values are within this distance of the old values

    init_values = {state: 0 for state in mdp.get_all_states()}
    state_values, _ = rl_value_iteration(mdp, gamma, num_iter, min_difference,
                                         init_values)

    # Draw state_values after training.
    if has_graphviz and visualize:
        plot_graph_with_state_values(
            mdp, state_values).render(filename='MDP_with_states')

    print('Final state values:', state_values)
    check_state_values(state_values)

    # Complete get_optimal_action function.
    check_get_optimal_action(get_optimal_action, mdp, state_values, gamma)

    # Visualize optimal strategy.
    if has_graphviz and visualize:
        plot_graph_optimal_strategy_and_state_values(
            mdp, state_values, get_action_value,
            gamma).render(filename='MDP_with_opt_strategy')

    print([
        get_optimal_action(mdp, state_values, s, gamma=0.9)
        for s in mdp.get_all_states()
    ])
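
Example #1 calls rl_value_iteration and get_action_value without showing their definitions. Below is a minimal sketch of what they might look like, assuming the mdp object exposes get_possible_actions(state), get_next_states(state, action) (a dict mapping next states to transition probabilities), and get_reward(state, action, next_state); those method names are assumptions inferred from the calls above, not definitions taken from the examples.

def get_action_value(mdp, state_values, state, action, gamma):
    # Q(s, a) = sum over s' of P(s' | s, a) * (r(s, a, s') + gamma * V(s'))
    return sum(
        prob * (mdp.get_reward(state, action, next_state)
                + gamma * state_values[next_state])
        for next_state, prob in mdp.get_next_states(state, action).items())

def rl_value_iteration(mdp, gamma, num_iter, min_difference, state_values):
    # Repeatedly apply the Bellman optimality backup until values stop changing.
    for i in range(num_iter):
        new_state_values = {
            s: max((get_action_value(mdp, state_values, s, a, gamma)
                    for a in mdp.get_possible_actions(s)),
                   default=0)  # terminal states have no actions, so V(s) = 0
            for s in mdp.get_all_states()}
        diff = max(abs(new_state_values[s] - state_values[s])
                   for s in mdp.get_all_states())
        state_values = new_state_values
        if diff < min_difference:
            break
    return state_values, i

The plotting helpers are also not shown; a rough sketch of plot_graph_with_state_values using the graphviz package is below. The node and edge labels are a guess, not the rendering the examples actually produce.

from graphviz import Digraph

def plot_graph_with_state_values(mdp, state_values):
    # One node per state, annotated with its current value estimate;
    # one edge per (action, next state) pair, annotated with its probability.
    graph = Digraph(format='png')
    for s in mdp.get_all_states():
        graph.node(str(s), label='%s\nV=%.3f' % (s, state_values[s]))
        for a in mdp.get_possible_actions(s):
            for next_s, p in mdp.get_next_states(s, a).items():
                graph.edge(str(s), str(next_s), label='%s (p=%.2f)' % (a, p))
    return graph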
Example #2
    # (This snippet starts inside the value-iteration loop: new_state_values
    # holds the freshly computed V(s) estimates for iteration i.)
    assert isinstance(new_state_values, dict)

    # Compute difference
    diff = max(
        abs(new_state_values[s] - state_values[s])
        for s in mdp.get_all_states())
    print("iter %4i   |   diff: %6.5f   |   " % (i, diff), end="")
    print('   '.join("V(%s) = %.3f" % (s, v) for s, v in state_values.items()))
    state_values = new_state_values

    if diff < min_difference:
        print("Terminated")
        break

if has_graphviz:
    display(plot_graph_with_state_values(mdp, state_values))

print("Final state values:", state_values)

assert abs(state_values['s0'] - 3.781) < 0.01
assert abs(state_values['s1'] - 7.294) < 0.01
assert abs(state_values['s2'] - 4.202) < 0.01

assert get_optimal_action(mdp, state_values, 's0', gamma) == 'a1'
assert get_optimal_action(mdp, state_values, 's1', gamma) == 'a0'
assert get_optimal_action(mdp, state_values, 's2', gamma) == 'a1'

if has_graphviz:
    try:
        display(plot_graph_optimal_strategy_and_state_values(
            mdp, state_values))
    except Exception:
        # Rendering can fail if the graphviz binaries are not installed.
        pass
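
The assertions above exercise get_optimal_action against known-good answers. A minimal sketch of such a function, reusing the assumed get_action_value helper sketched after Example #1:

def get_optimal_action(mdp, state_values, state, gamma=0.9):
    # Greedy policy: pick the action with the highest Q(s, a) under the current V(s).
    actions = mdp.get_possible_actions(state)
    if not actions:
        return None  # terminal state: no action to take
    return max(actions,
               key=lambda a: get_action_value(mdp, state_values, state, a, gamma))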
Example #3
    # Maximum change between successive value estimates (see Example #2).
    diff = max(
        abs(new_state_values[s] - state_values[s])
        for s in mdp.get_all_states())

    print("iter %4i   |   diff: %6.5f   |   " % (i, diff), end="")

    print('   '.join("V(%s) = %.3f" % (s, v) for s, v in state_values.items()),
          end='\n\n')

    state_values = new_state_values

    if diff < min_difference:
        print("Terminated")
        break

if has_graphviz:
    plot_graph_with_state_values(mdp, state_values).render(view=True)

print("Final state values:", state_values)

assert abs(state_values['s0'] - 8.032) < 0.01
assert abs(state_values['s1'] - 11.169) < 0.01
assert abs(state_values['s2'] - 8.921) < 0.01

assert get_optimal_action(mdp, state_values, 's0', gamma) == 'a1'
assert get_optimal_action(mdp, state_values, 's1', gamma) == 'a0'
assert get_optimal_action(mdp, state_values, 's2', gamma) == 'a0'

if has_graphviz:
    try:
        plot_graph_optimal_strategy_and_state_values(
            mdp, state_values).render(view=True)
    except Exception:
        # Rendering can fail if the graphviz binaries are not installed.
        pass
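
Examples #2 and #3 both begin partway through the training loop. A minimal reconstruction of the missing preamble, assuming a hypothetical helper get_new_state_value(mdp, state_values, state, gamma) that performs the Bellman backup for a single state (that name is an assumption and does not appear in the examples):

gamma = 0.9             # discount factor
num_iter = 100          # maximum number of sweeps, as in Example #1
min_difference = 0.001  # convergence threshold on max_s |V_new(s) - V(s)|

# Start from V(s) = 0 for every state.
state_values = {s: 0 for s in mdp.get_all_states()}

for i in range(num_iter):
    # One sweep of the Bellman optimality backup over all states; the
    # truncated loop bodies in Examples #2 and #3 continue from this point.
    new_state_values = {
        s: get_new_state_value(mdp, state_values, s, gamma)
        for s in mdp.get_all_states()}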