Example #1
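
# Build the grid-world MDP, initialized with its ClassicPolicy.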
mdp = ClassicMDP(ClassicPolicy(grid), grid)

# The value-iteration policy was computed and saved earlier; uncomment the
# two lines below to recompute and re-save 'scen1.p'.
#mdp.value_iteration()
#mdp.save_policy(filename='scen1.p')
mdp.load_policy(filename='scen1.p')

value_iter_pi = mdp.pi

# Plot the state-action map of the value-iteration policy.
plotter.plot_state_actions(value_iter_pi, rewards=grid.reward_states, sinks=grid.sink_states)


# Train a Q-learner with an approximate Q-function (Qapprox) over the
# H x W grid, running 20 guided episodes to gather experience.
q = QLearner(grid, mdp, moves=20)
q.Q = Qapprox(H, W)
q.animate = False
for i in range(20):
    q.guide()
#for key in q.Q.dataset.keys():
#    print(key, ",", np.mean(q.Q.dataset[key]))


# Set up the analysis run for the Q policy, clear the learner's recorded
# states, retrain it, and install the Q-derived policy in the MDP.
an = Analysis(W, H, ITER, rewards=rewards, sinks=sinks,
              desc='Q policy')
q.clear_states()
q.retrain()
mdp.pi = QPolicy(q)
#print(q.Q.get(State(2, 12), -1))
#print(len(q.states))
#q.animate = True

# Plot the state-action map of the Q-learned policy.
plotter.plot_state_actions(mdp.pi, rewards=grid.reward_states, sinks=grid.sink_states)