# Compare a loaded value-iteration policy against a trained Q-learning policy
# on the same grid MDP, plotting each policy's state-action map.
#
# NOTE(review): relies on module-level names defined elsewhere in this file
# (grid, plotter, Qapprox, Analysis, QPolicy, H, W, ITER, rewards, sinks) —
# presumably set up in the experiment scaffolding above; confirm against
# the full file.

mdp = ClassicMDP(ClassicPolicy(grid), grid)
# mdp.value_iteration()
# mdp.save_policy(filename='scen1.p')
mdp.load_policy(filename='scen1.p')  # load the pre-computed policy instead of re-solving

value_iter_pi = mdp.pi
plotter.plot_state_actions(value_iter_pi, rewards=grid.reward_states,
                           sinks=grid.sink_states)

# Q-learner with a function approximator sized to the grid.
q = QLearner(grid, mdp, moves=20)
q.Q = Qapprox(H, W)
q.animate = False

# Collect experience via guided episodes (q.guide() — presumably follows the
# loaded policy to seed the dataset; verify against QLearner).
for _ in range(20):
    q.guide()
# for key in q.Q.dataset.keys():
#     print key, ",", np.mean(q.Q.dataset[key])

an = Analysis(W, H, ITER, rewards=rewards, sinks=sinks, desc='Q policy')

# Retrain the approximator on the collected data, then swap the Q-derived
# policy into the MDP and plot it for side-by-side comparison.
q.clear_states()
q.retrain()
mdp.pi = QPolicy(q)
# print q.Q.get(State(2, 12), -1)
# print len(q.states)
# q.animate = True
plotter.plot_state_actions(mdp.pi, rewards=grid.reward_states,
                           sinks=grid.sink_states)