# Example #1
import numpy as np  # for np.mean below; Analysis, QPolicy, State, etc. come from the surrounding project

an = Analysis(W, H, ITER, rewards=rewards, sinks=sinks,
              desc='Q policy')
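# Reset the collected states, refit the Q-function, and install it as the MDP's policy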
q.clear_states()
q.retrain()
mdp.pi = QPolicy(q)
# print(q.Q.get(State(2, 12), -1))
# print(len(q.states))
# q.animate = True

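# Plot the action the Q policy takes in each grid state, marking reward and sink states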
plotter.plot_state_actions(mdp.pi, rewards=grid.reward_states, sinks=grid.sink_states)


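# Alternate data collection and retraining: each outer pass clears the state
# buffer, gathers 50 rollouts with supervisor guidance (q.guide() presumably
# corrects the learner's trajectory), then refits the Q-function on the
# aggregated data.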
for j in range(5):
    q.clear_states()
    for i in range(50):
        q.rollout()
        q.guide()
    q.retrain()

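# Plot the policy's actions again after the guided retraining passes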
plotter.plot_state_actions(mdp.pi, rewards=grid.reward_states, sinks=grid.sink_states)
q.rollout()
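# Sanity-check the fit at the start state (0, 0): compare the raw samples
# stored for this (state, action) pair against the Q-function's prediction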
a = mdp.pi.get_next(State(0, 0))
print("action: " + str(a))
tup = q.Q.preprocess(0, 0, a)
print(q.Q.dataset[tup])
print("Actual: " + str(np.mean(q.Q.dataset[tup])))
print("Predicted: " + str(q.Q.get(State(0, 0), a)))

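# Repeat the comparison for the remaining actions at (0, 0)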
for ac in mdp.pi.available_actions:
    if ac != a:
        print("Seeing for action: " + str(ac))