# Let's combine everything together

# Complete rl_value_iteration()

# Test rl_value_iteration()
num_iter = 100            # Maximum iterations, excluding initialization
min_difference = 0.001    # Stop Value Iteration if new values are this close to old values (or closer)
init_values = {state: 0 for state in mdp.get_all_states()}

state_values, _ = rl_value_iteration(mdp, gamma, num_iter, min_difference, init_values)

# Draw state_values after training.
if has_graphviz and visualize:
    plot_graph_with_state_values(mdp, state_values).render(filename='MDP_with_states')

print('Final state values:', state_values)
check_state_values(state_values)

# Complete get_optimal_action function.
check_get_optimal_action(get_optimal_action, mdp, state_values, gamma)

# Visualize optimal strategy.
if has_graphviz and visualize:
    plot_graph_optimal_strategy_and_state_values(
        mdp, state_values, get_action_value, gamma).render(filename='MDP_with_opt_strategy')

print([get_optimal_action(mdp, state_values, s, gamma=0.9)
       for s in mdp.get_all_states()])
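# The two functions the cell above asks you to complete, rl_value_iteration() and
# get_optimal_action(), are left as exercises. Below is a minimal sketch of one possible
# solution. It assumes the MDP helper API commonly shipped with this assignment's mdp.py
# (get_possible_actions, get_next_states returning {next_state: probability}, get_reward);
# the second value returned by rl_value_iteration (a per-iteration history) is also an
# assumption, since the notebook only unpacks the first. Adapt names and signatures if
# yours differ.

def get_action_value_sketch(mdp, state_values, state, action, gamma):
    """Q(s, a) = sum over s' of P(s' | s, a) * (r(s, a, s') + gamma * V(s'))."""
    return sum(prob * (mdp.get_reward(state, action, next_state) + gamma * state_values[next_state])
               for next_state, prob in mdp.get_next_states(state, action).items())


def rl_value_iteration_sketch(mdp, gamma, num_iter, min_difference, init_values):
    """Run value iteration from init_values; return final values and per-iteration history."""
    state_values = dict(init_values)
    history = []
    for _ in range(num_iter):
        new_state_values = {}
        for s in mdp.get_all_states():
            actions = mdp.get_possible_actions(s)
            if not actions:
                # Terminal states have no actions and keep value 0.
                new_state_values[s] = 0.0
            else:
                # Bellman backup: V(s) <- max_a Q(s, a).
                new_state_values[s] = max(
                    get_action_value_sketch(mdp, state_values, s, a, gamma)
                    for a in actions)
        diff = max(abs(new_state_values[s] - state_values[s]) for s in mdp.get_all_states())
        state_values = new_state_values
        history.append(dict(state_values))
        if diff < min_difference:
            break
    return state_values, history


def get_optimal_action_sketch(mdp, state_values, state, gamma=0.9):
    """Greedy action: argmax_a Q(s, a) under the current value estimates; None if terminal."""
    actions = mdp.get_possible_actions(state)
    if not actions:
        return None
    return max(actions,
               key=lambda a: get_action_value_sketch(mdp, state_values, state, a, gamma))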
    assert isinstance(new_state_values, dict)

    # Compute the largest per-state change between iterations.
    diff = max(abs(new_state_values[s] - state_values[s])
               for s in mdp.get_all_states())

    print("iter %4i | diff: %6.5f | " % (i, diff), end="")
    print(' '.join("V(%s) = %.3f" % (s, v) for s, v in state_values.items()))
    state_values = new_state_values

    if diff < min_difference:
        print("Terminated")
        break

if has_graphviz:
    display(plot_graph_with_state_values(mdp, state_values))

print("Final state values:", state_values)

assert abs(state_values['s0'] - 3.781) < 0.01
assert abs(state_values['s1'] - 7.294) < 0.01
assert abs(state_values['s2'] - 4.202) < 0.01

assert get_optimal_action(mdp, state_values, 's0', gamma) == 'a1'
assert get_optimal_action(mdp, state_values, 's1', gamma) == 'a0'
assert get_optimal_action(mdp, state_values, 's2', gamma) == 'a1'

if has_graphviz:
    try:
        display(plot_graph_optimal_strategy_and_state_values(mdp, state_values))
    except Exception as err:
        # Rendering can fail (e.g. missing graphviz binaries); warn instead of crashing.
        print("Could not render the optimal strategy graph:", err)
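# A side note on the stopping rule used above: the Bellman backup is a gamma-contraction
# in the sup norm, so once the largest update `diff` falls below a threshold, the current
# values are within gamma / (1 - gamma) * diff of the optimal V*. The small helper below
# is illustrative only, not part of the assignment API.

def value_iteration_error_bound(diff, gamma):
    """Worst-case distance max_s |V(s) - V*(s)| implied by the last sup-norm update."""
    return gamma / (1.0 - gamma) * diff

# With gamma = 0.9 and min_difference = 0.001 this guarantees every state value is
# within 0.009 of the optimum.
print("worst-case error bound:", value_iteration_error_bound(min_difference, gamma))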
    # Compute the largest per-state change between iterations.
    diff = max(abs(new_state_values[s] - state_values[s])
               for s in mdp.get_all_states())

    print("iter %4i | diff: %6.5f | " % (i, diff), end="")
    print(' '.join("V(%s) = %.3f" % (s, v) for s, v in state_values.items()), end='\n\n')
    state_values = new_state_values

    if diff < min_difference:
        print("Terminated")
        break

if has_graphviz:
    plot_graph_with_state_values(mdp, state_values).render(view=True)

print("Final state values:", state_values)

assert abs(state_values['s0'] - 8.032) < 0.01
assert abs(state_values['s1'] - 11.169) < 0.01
assert abs(state_values['s2'] - 8.921) < 0.01

assert get_optimal_action(mdp, state_values, 's0', gamma) == 'a1'
assert get_optimal_action(mdp, state_values, 's1', gamma) == 'a0'
assert get_optimal_action(mdp, state_values, 's2', gamma) == 'a0'

if has_graphviz:
    try:
        plot_graph_optimal_strategy_and_state_values(
            mdp, state_values).render(view=True)
    except Exception as err:
        # Rendering can fail (e.g. missing graphviz binaries); warn instead of crashing.
        print("Could not render the optimal strategy graph:", err)
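# As a quick sanity check on the asserts above, you can also print the whole greedy policy
# implied by the learned values. This snippet only uses get_optimal_action and
# mdp.get_all_states, both already used in this notebook.
greedy_policy = {s: get_optimal_action(mdp, state_values, s, gamma)
                 for s in mdp.get_all_states()}
print("Greedy policy:", greedy_policy)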