# Example #1
# 0  (stray notebook cell output, commented out so the file parses)
# Visualize the value-iteration policy on the grid, marking reward and sink states.
plotter.plot_state_actions(value_iter_pi,
                           rewards=grid.reward_states,
                           sinks=grid.sink_states)


# Build a Q-learner over the grid MDP (episodes capped at 20 moves) and give it
# a function-approximation Q representation sized to the grid; disable the
# per-step animation during training.
q = QLearner(grid, mdp, moves=20)
q.Q = Qapprox(H, W)
q.animate = False

# Warm the learner up with 20 guide() episodes — presumably demonstration-guided
# data collection; TODO confirm against QLearner.guide.
for _ in range(20):
    q.guide()


# Set up an analysis/recording object for the Q-derived policy.
an = Analysis(W, H, ITER, rewards=rewards, sinks=sinks, desc='Q policy')

# Drop the collected experience, refit the Q approximator on what remains,
# and install the learned Q function as the MDP's acting policy.
q.clear_states()
q.retrain()
mdp.pi = QPolicy(q)

# Visualize the resulting Q-policy's state-action map.
plotter.plot_state_actions(mdp.pi,
                           rewards=grid.reward_states,
                           sinks=grid.sink_states)


# Alternate data collection with retraining: 5 outer rounds, each discarding
# the previous round's states, gathering 50 rollout+guide episode pairs, and
# then refitting the Q approximator on the fresh batch.
for _round in range(5):
    q.clear_states()
    for _episode in range(50):
        q.rollout()
        q.guide()
    q.retrain()