                      (1, 0.1),
                  ],
                  no_action_probability=0.0)

# Solve the MDP with both dynamic-programming methods and compare the results.
mdp_solvers = {'Value Iteration': gw.run_value_iterations,
               'Policy Iteration': gw.run_policy_iterations}

for solver_name, solver_fn in mdp_solvers.items():
    print('Final result of {}:'.format(solver_name))
    policy_grids, utility_grids = solver_fn(iterations=25, discount=0.5)
    # The last slice along the third axis holds the converged policy and utilities.
    print(policy_grids[:, :, -1])
    print(utility_grids[:, :, -1])
    plt.figure()
    gw.plot_policy(utility_grids[:, :, -1])
    plot_convergence(utility_grids, policy_grids)
    plt.show()

# Learn the same grid world model-free with tabular Q-learning.
ql = QLearner(num_states=(shape[0] * shape[1]),
              num_actions=4,
              learning_rate=0.8,
              discount_rate=0.9,
              random_action_prob=0.5,
              random_action_decay_rate=0.99,
              dyna_iterations=0)

start_state = gw.grid_coordinates_to_indices(start)

iterations = 1000
flat_policies, flat_utilities = ql.learn(start_state,