# NOTE(review): the original indentation of this script was lost; the nesting
# below is reconstructed from data flow (uses of `t` and `q`) -- confirm.

# Take the policy currently held by the MDP (presumably the value-iteration
# policy, judging by the name) and plot its state/action map together with
# the grid's reward and sink states.
value_iter_pi = mdp.pi
plotter.plot_state_actions(
    value_iter_pi, rewards=grid.reward_states, sinks=grid.sink_states)

# Result tables: one row per trial, one column per iteration.
value_iter_data = np.zeros([TRIALS, ITER])
# NOTE(review): allocated here but not written anywhere in this section.
classic_q_data = np.zeros([TRIALS, ITER])

# Divisors for the running means. float() forces true division so that an
# integer reward is not floor-divided under Python 2 (the return type of
# get_reward() is not visible here).
guide_count = float(ITER)
rollout_count = float(ITER * SAMP)

for t in range(TRIALS):
    # Each trial starts from the policy stored in 'scen1.p' and a new learner.
    mdp.load_policy(filename='scen1.p')
    q = QLearner(grid, mdp, moves=40)

    # Phase 1: mean reward over ITER calls to q.guide().
    r = 0.0
    for i in range(ITER):
        q.guide()
        r = r + q.get_reward() / guide_count
    print("Value iter reward: " + str(r))
    # The same mean is stored in every column of this trial's row.
    value_iter_data[t, :] = r

    # Phase 2: reset the learner's states, switch the MDP to a policy built
    # from the learner, and average the reward over ITER * SAMP rollouts.
    r = 0.0
    q.clear_states()
    mdp.pi = QPolicy(q)
    a = Analysis(W, H, ITER, rewards=rewards, sinks=sinks, desc='Q policy')
    for i in range(ITER * SAMP):
        q.rollout()
        r = r + q.get_reward() / rollout_count
    print("Q learn reward: " + str(r))

    # The state counts are tallied and shown for the first trial only.
    if t == 0:
        a.count_states(q.get_states())
        a.show_states()