# --- Planning (value/policy iteration) results ---------------------------
# Show the final slice of the policy and utility histories, then plot the
# resulting policy and the convergence curves.
print(policy_grids[:, :, -1])
print(utility_grids[:, :, -1])

plt.figure()
gw.plot_policy(utility_grids[:, :, -1])
plot_convergence(utility_grids, policy_grids)
plt.show()

# --- Q-learning on the same grid world -----------------------------------
# NOTE(review): hyperparameters are hard-coded; presumably tuned for this
# grid — confirm before reuse.
ql = QLearner(
    num_states=shape[0] * shape[1],
    num_actions=4,
    learning_rate=0.8,
    discount_rate=0.9,
    random_action_prob=0.5,
    random_action_decay_rate=0.99,
    dyna_iterations=0,
)

start_state = gw.grid_coordinates_to_indices(start)
iterations = 1000

flat_policies, flat_utilities = ql.learn(
    start_state,
    gw.generate_experience,
    iterations=iterations,
)

# Reshape the flat per-iteration results back onto the grid: one
# (rows, cols) slice per learning iteration.
new_shape = (gw.shape[0], gw.shape[1], iterations)
ql_utility_grids = flat_utilities.reshape(new_shape)
ql_policy_grids = flat_policies.reshape(new_shape)

print('Final result of QLearning:')
print(ql_policy_grids[:, :, -1])
print(ql_utility_grids[:, :, -1])

plt.figure()
gw.plot_policy(ql_utility_grids[:, :, -1], ql_policy_grids[:, :, -1])
# NOTE(review): no plt.show() appears after this figure in this chunk —
# verify a later line displays it, otherwise the Q-learning plot never
# renders when run as a script.