an = Analysis(W, H, ITER, rewards=rewards, sinks=sinks, desc='Q policy')

# Train an initial Q-function and use it as the policy.
q.clear_states()
q.retrain()
mdp.pi = QPolicy(q)
#print q.Q.get(State(2, 12), -1)
#print len(q.states)
#q.animate = True
plotter.plot_state_actions(mdp.pi, rewards=grid.reward_states,
                           sinks=grid.sink_states)

# Alternate batches of rollouts with guidance and retraining,
# replotting the resulting policy after each round.
for j in range(5):
    q.clear_states()
    for i in range(50):
        q.rollout()
    q.guide()
    q.retrain()
    plotter.plot_state_actions(mdp.pi, rewards=grid.reward_states,
                               sinks=grid.sink_states)

# Sanity check at the start state: compare the mean of the collected
# Q targets against the regressor's prediction for the chosen action.
q.rollout()
a = mdp.pi.get_next(State(0, 0))
print "action: " + str(a)
tup = q.Q.preprocess(0, 0, a)
print q.Q.dataset[tup]
print "Actual: " + str(np.mean(q.Q.dataset[tup]))
print "predicted: " + str(q.Q.get(State(0, 0), a))
for ac in mdp.pi.available_actions:
    if ac != a:
        print "Seeing for action: " + str(ac)
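# For reference, a minimal sketch of what a greedy Q-policy such as QPolicy
# above plausibly looks like. QPolicy itself is defined elsewhere in this
# codebase; GreedyQPolicy and its available_actions argument are hypothetical
# stand-ins for illustration, not the actual implementation.
class GreedyQPolicy(object):
    def __init__(self, q, available_actions):
        self.q = q
        self.available_actions = available_actions

    def get_next(self, state):
        # Greedily pick the action with the highest predicted Q-value.
        return max(self.available_actions,
                   key=lambda ac: self.q.Q.get(state, ac))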