# Visualize the value-iteration policy, then train a Q-learner on the same
# grid and plot its learned policy for comparison.
plotter.plot_state_actions(value_iter_pi, rewards=grid.reward_states, sinks=grid.sink_states)

# Q-learner with an approximate Q-function over the H x W grid; animation off
# so the training loops below run headless.
q = QLearner(grid, mdp, moves=20)
q.Q = Qapprox(H, W)
q.animate = False

# Warm-start: 20 guided (demonstration) episodes before any retraining.
for _ in range(20):
    q.guide()

# for key in q.Q.dataset.keys():
#     print key, ",", np.mean(q.Q.dataset[key])

an = Analysis(W, H, ITER, rewards=rewards, sinks=sinks, desc='Q policy')

# Fit the Q approximator to the collected experience, then wrap it as the
# MDP's policy.
q.clear_states()
q.retrain()
mdp.pi = QPolicy(q)

# print q.Q.get(State(2, 12), -1)
# print len(q.states)
# q.animate = True

plotter.plot_state_actions(mdp.pi, rewards=grid.reward_states, sinks=grid.sink_states)

# Alternate self-play rollouts with guided episodes, retraining after each
# batch of 50.
# NOTE(review): nesting reconstructed from a whitespace-mangled source —
# assumes guide() runs inside the inner loop and retrain() once per outer
# pass; confirm against the original file / version control.
for _ in range(5):
    q.clear_states()
    for _ in range(50):
        q.rollout()
        q.guide()
    q.retrain()