Example #1
    print("mdp.get_next_states('s1', 'a0') = ",
          mdp.get_next_states('s1', 'a0'))
    print("mdp.get_reward('s1', 'a0', 's0') = ",
          mdp.get_reward('s1', 'a0', 's0'))
    print("mdp.get_transition_prob('s1', 'a0', 's0') = ",
          mdp.get_transition_prob('s1', 'a0', 's0'))

    visualize = True
    from mdp import has_graphviz

    print('Graphviz available: ', has_graphviz)

    if has_graphviz and visualize:
        from mdp import plot_graph, plot_graph_with_state_values, plot_graph_optimal_strategy_and_state_values

        plot_graph(mdp).render()

    # Complete get_action_value().
    check_get_action_value_func(mdp, get_action_value)

    # Complete get_new_state_value()
    check_get_new_state_value_func(mdp, get_new_state_value)

    # Let's combine everything together

    # Complete rl_value_iteration()

    # Test rl_value_iteration()
    num_iter = 100  # Maximum iterations, excluding initialization
    min_difference = 0.001  # stop Value Iteration if new values are this close to old values (or closer)
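
For reference, here is a minimal sketch of the get_action_value() referenced above. It assumes only the MDP methods demonstrated in this example (get_next_states, get_transition_prob, get_reward) and is not necessarily the assignment's reference solution:

def get_action_value(mdp, state_values, state, action, gamma):
    """Q(s, a): expected one-step reward plus discounted value of the next state."""
    q_value = 0.0
    for next_state in mdp.get_next_states(state, action):
        prob = mdp.get_transition_prob(state, action, next_state)
        reward = mdp.get_reward(state, action, next_state)
        q_value += prob * (reward + gamma * state_values[next_state])
    return q_value
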
Example #2
import numpy as np  # used by the np.isclose checks below

from mdp_get_action_value import get_action_value
from mdp import MDP, FrozenLakeEnv
import matplotlib.pyplot as plt
from IPython.display import clear_output
from time import sleep
from mdp import has_graphviz
from IPython.display import display
print("Graphviz available:", has_graphviz)

mdp = MDP(transition_probs, rewards, initial_state='s0')

if has_graphviz:
    from mdp import plot_graph, plot_graph_with_state_values, \
        plot_graph_optimal_strategy_and_state_values

    display(plot_graph(mdp))

test_Vs = {s: i for i, s in enumerate(sorted(mdp.get_all_states()))}
assert np.isclose(get_action_value(mdp, test_Vs, 's2', 'a1', 0.9), 0.69)
assert np.isclose(get_action_value(mdp, test_Vs, 's1', 'a0', 0.9), 3.95)
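
# The asserts below exercise get_new_state_value(), which the learner defines in
# the notebook. A minimal sketch, assuming mdp.get_possible_actions() exists
# (not shown in this excerpt); not necessarily the reference solution:
def get_new_state_value(mdp, state_values, state, gamma):
    """V_{i+1}(s) = max_a Q_i(s, a), computed without mutating state_values."""
    actions = mdp.get_possible_actions(state)  # assumed MDP method
    if not actions:  # terminal state: no actions available, its value stays 0
        return 0.0
    return max(get_action_value(mdp, state_values, state, a, gamma)
               for a in actions)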

test_Vs_copy = dict(test_Vs)
assert np.isclose(get_new_state_value(mdp, test_Vs, 's0', 0.9), 1.8)
assert np.isclose(get_new_state_value(mdp, test_Vs, 's2', 0.9), 1.08)
assert test_Vs == test_Vs_copy, "please do not change state_values in get_new_state_value"

# parameters
gamma = 0.9  # discount for MDP
num_iter = 100  # maximum iterations, excluding initialization
# stop VI if new values are this close to old values (or closer)
min_difference = 0.001
# __Note:__ Installing graphviz on some operating systems (especially Windows) may be tricky. However, you can ignore this part altogether and use the standard visualization.

# In[5]:


from mdp import has_graphviz
from IPython.display import display
print("Graphviz available:", has_graphviz)


# In[6]:


if has_graphviz:
    from mdp import plot_graph, plot_graph_with_state_values, plot_graph_optimal_strategy_and_state_values
    display(plot_graph(mdp, graph_size="50,50"))


# ### Value Iteration
# 
# Now let's build something to solve this MDP. The simplest algorithm so far is __V__alue __I__teration
# 
# Here's the pseudo-code for VI:
# 
# ---
# 
# `1.` Initialize $V_{(0)}(s)=0$, for all $s$
# 
# `2.` For $i=0, 1, 2, \dots$
#  
# `3.` $ \quad V_{(i+1)}(s) = \max_a \sum_{s'} P(s' | s,a) \cdot [ r(s,a,s') + \gamma V_{(i)}(s')]$, for all $s$
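# 
# A minimal sketch of this loop, assuming get_new_state_value() as tested above
# and the gamma, num_iter and min_difference parameters defined earlier (not
# necessarily the reference solution):

state_values = {s: 0 for s in mdp.get_all_states()}  # V_(0)(s) = 0 for all s

for i in range(num_iter):
    # Bellman optimality backup for every state
    new_state_values = {
        s: get_new_state_value(mdp, state_values, s, gamma)
        for s in mdp.get_all_states()
    }
    # Largest change over all states; stop once it drops below min_difference
    diff = max(abs(new_state_values[s] - state_values[s])
               for s in mdp.get_all_states())
    state_values = new_state_values
    if diff < min_difference:
        break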