def run_maze(maze, title=""): T = maze.get_transitions() R = maze.get_rewards() discount = 0.90 value_iteration = ValueIteration(T, R, discount) value_iteration.run() print "VITER REWARD", maze.find_reward(value_iteration.policy) print "VITER TIME", value_iteration.time print "VITER ITERS", value_iteration.iter maze.draw_maze(value_iteration.policy, title=title+"v") policy_iteration = PolicyIteration(T,R, discount) policy_iteration.run() print "PITER REWARD", maze.find_reward(policy_iteration.policy) print "PITER TIME", policy_iteration.time print "PITER ITERS", policy_iteration.iter maze.draw_maze(policy_iteration.policy, title=title+'p') s = time.time() Q = maze.qlearn() n = time.time() q_policy = [] for state in Q: q_policy.append(np.argmax(state)) maze.draw_maze(q_policy, title=title+'q') print "Q LEARN", maze.find_reward(q_policy) print "Q LEARN TIME", (n-s) print "Q ITERS", maze.q_iters
def valueIteration(self):
    policy_filename = 'policy_npursuers_%d_seed_%d_nrows_%d_ncols_%d_empty_%s_irrationalpursuer.pkl' % (
        self.num_pursuers, self.seed, self.nrows, self.ncols, self.empty)
    if os.path.exists(policy_filename):
        with open(policy_filename, 'rb') as policy_file:
            policy = pickle.load(policy_file)
        return policy

    transitions, rewards = self.compute_alltransitions_reward()
    valueIterationMDP = ValueIteration(transitions, rewards, 0.99, skip_check=True)
    valueIterationMDP.run()
    with open(policy_filename, 'wb') as policy_file:
        pickle.dump(valueIterationMDP.policy, policy_file)
    return valueIterationMDP.policy
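# Added note: a minimal sketch of the load-or-compute caching pattern used by
# valueIteration above, factored into a reusable helper. The helper name and
# interface are illustrative assumptions, not part of the original code.
import os
import pickle

def load_or_compute(path, compute_fn):
    """Return the pickled object at `path` if it exists; otherwise compute, cache, and return it."""
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    result = compute_fn()
    with open(path, 'wb') as f:
        pickle.dump(result, f)
    return result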
def run_gamma_sweep(mdp, vi_pi, prob_str, P, R, gammas, dim):
    # Validate arguments ('is' comparisons replaced with proper string equality).
    if mdp not in ("forest", "grid"):
        print("ERROR: Need forest|grid for mdp. Passed: ", mdp)
        exit(1)
    if vi_pi not in ("vi", "pi"):
        print("ERROR: Need vi|pi for vi_pi. Passed: ", vi_pi)
        exit(1)

    base_path = './output/csv/' + mdp + '_' + prob_str + '_' + vi_pi + '_'
    base_sweep_path = './output/' + mdp + '_' + prob_str + '_'
    gamma_sweep_file = base_sweep_path + 'gamma_sweep.rpt'

    if mdp == "grid":
        gw = visualize_grid_world(R[:, 0], dim, dim)
        with open(gamma_sweep_file, 'a') as f:
            f.write("Grid World is:\n" + str(gw) + "\n\n")

    for gamma in gammas:
        gamma_stats_file = base_path + 'gamma_' + str(gamma) + '.csv'
        print("Running", vi_pi, "with gamma", gamma)
        if vi_pi == "vi":
            alg = ValueIteration(P, R, gamma)
        elif vi_pi == "pi":
            alg = PolicyIteration(P, R, gamma)
        stats = alg.run()
        df = pd.DataFrame(stats)
        df.to_csv(gamma_stats_file, index_label="Iteration")
        print(vi_pi, "run complete.")
        print("Optimal value function: ", alg.V)
        print("Optimal policy: ", alg.policy)

        with open(gamma_sweep_file, 'a') as f:
            f.write("***" + vi_pi + " with Gamma=" + str(gamma) + "***\n")
            if mdp == "forest":
                # Just dump the policy
                f.write("Policy is:\n" + str(alg.policy) + "\n")
            if mdp == "grid":
                # Dump the reshaped policy and simulated rewards
                reshaped_policy = visualize_policy(alg.policy, dim)
                simulated_rewards = get_reward(P, R, alg.policy, 10)
                f.write("Policy is:\n" + str(reshaped_policy) + "\n")
                f.write("Simulated rewards are:" + str(simulated_rewards) + "\n")
            f.write("***End of " + vi_pi + " with Gamma=" + str(gamma) + "***\n\n")
# Build world
Trans_Prob, Rewards = grid_world(X=dim, Y=dim, prob_desired_move=prob_desired,
                                 prob_bad_state=prob_bad_state, is_sparse=sparse)
gw = visualize_grid_world(Rewards[:, 0], dim, dim)
print("Grid world is: ")
print(gw)
with open(summary_file, out_type) as f:
    f.write("Grid world is:\n")
    f.write(str(gw) + "\n\n")

if run_vi:
    vi = ValueIteration(Trans_Prob, Rewards, 0.9)
    vi_stats = vi.run()
    vi_df = pd.DataFrame(vi_stats)
    vi_df.to_csv(vi_stats_file, index_label="Iteration")
    reshaped_value_function = np.reshape(vi.V, (dim, dim))
    reshaped_policy = visualize_policy(vi.policy, dim)
    simulated_rewards = get_reward(Trans_Prob, Rewards, vi.policy, 10, dim, sparse)
    print("VI: Performed ", vi.iter, " iterations in ", vi.time,
          " and got rewards of: ", simulated_rewards)
    with open(summary_file, 'a') as f:
        f.write("***Value Iteration Section***\n")
        f.write("Iterations: " + str(vi.iter) + "\n")
        f.write("Runtime: " + str(vi.time) + "\n")
        f.write("Value function:\n")
        f.write(str(reshaped_value_function))
        f.write("\nPolicy:\n")
from mdptoolbox.example import rand
from mdptoolbox.mdp import ValueIteration
from src.KronMDP import multiagent, multiagent_full, KronValueIteration
from timeit import default_timer as timer
from functools import reduce

RUNBIG = False
RUNKRON = True
RUNFULL = False

# large example with memory problems - python cannot create example
if RUNBIG:
    P, R = rand(10, 2)
    vi = ValueIteration(P, R, 0.95)
    vi.run()

# kron example (not as dense)
if RUNKRON:
    Ps, R = multiagent(S=10, N=5)
    start = timer()
    vi = KronValueIteration(Ps, R, 0.95, skip_check=True)
    vi.run()
    end = timer()
    print("kronecker method took", end - start, "seconds")

# compare with fully computed example
if RUNFULL:
    P, R = multiagent_full(S=10, N=2)
    start = timer()
    vi = ValueIteration(P, R, 0.95)
    vi.run()
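# Added note: an illustrative sketch of the idea behind KronValueIteration above.
# If agents transition independently, the joint transition matrix is the
# Kronecker product of the per-agent matrices, so its size grows as S**N while
# the factors stay small. The numbers here are made up; KronMDP's internal
# representation may differ.
import numpy as np
from functools import reduce

p_single = np.array([[0.9, 0.1],
                     [0.2, 0.8]])            # one agent, 2 states
p_joint = reduce(np.kron, [p_single] * 3)    # three independent agents -> 8 x 8
print(p_joint.shape)                         # (8, 8)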
def forest_experiment():
    P, R = mdptoolbox.example.forest(S=1250, r1=500, r2=250)

    value = []
    policy = []
    iters = []
    time_ = []
    gamma = []
    rewards_p = []
    rewards_v = []
    time_p = []
    time_v = []
    iters_p = []
    iters_v = []
    rewards_q = []
    time_q = []
    iters_q = []
    mean_discrep = []

    env2 = gym.make('FrozenLake-v0')
    q_table = []
    value_q = []
    policy_q = []
    iters_q = []
    time_q_arr = []
    gamma_q = []
    q_vals = []
    q_rewards = []
    mean_discrep = []

    # Q-learning sweep.
    # NOTE: the discount passed to QLearning is fixed at 0.8 even though
    # gamma_q records (i + 0.5) / 10 for each run.
    for i in range(0, 10):
        start = time.time()
        q_policy = mdptoolbox.mdp.QLearning(P, R, 0.8)
        q_policy.run()
        time_q = time.time() - start   # time the run itself, not just construction
        q_rewards.append(np.mean(q_policy.V))
        value_q.append(np.mean(q_policy.V))
        policy_q.append(q_policy.policy)
        gamma_q.append((i + 0.5) / 10)
        q_vals.append(q_policy.Q)
        mean_discrep.append(q_policy.mean_discrepancy)
        # iters_q.append(q_policy.n_iters)
        time_q_arr.append(time_q)

    plt.plot(gamma_q, mean_discrep, label='Q-Learning')
    plt.xlabel('Gammas')
    plt.title('Q-Learning Mean Discrepancy')
    plt.ylabel('Mean Discrepancy')
    plt.grid()
    plt.show()

    for size in [1250]:
        P, R = mdptoolbox.example.forest(S=size)
        forest_policy_p = PolicyIteration(P, R, 0.99)
        forest_policy_v = ValueIteration(P, R, 0.99)
        forest_policy_q = QLearning(P, R, 0.1)
        forest_policy_p.run()
        forest_policy_v.run()
        forest_policy_q.run()
        rewards_p.append(np.mean(forest_policy_p.V))
        rewards_v.append(np.mean(forest_policy_v.V))
        rewards_q.append(np.mean(forest_policy_q.V))
        time_p.append(forest_policy_p.time)
        time_v.append(forest_policy_v.time)
        # time_q.append(forest_policy_q.time)
        iters_p.append(forest_policy_p.iter)
        iters_v.append(forest_policy_v.iter)
        # iters_q.append(forest_policy_q.iter)

    # plt.plot([1250, 1500, 1750, 2000, 2250, 2500], rewards_p, label='Policy Iteration')
    # plt.plot([1250, 1500, 1750, 2000, 2250, 2500], rewards_v, label='Value Iteration')
    # plt.plot([1250, 1500, 1750, 2000, 2250, 2500], rewards_q, label='Q-Learning')
    # plt.grid()
    # plt.xlabel('State Size')
    # plt.title('Forest Management - Rewards vs State Size')
    # plt.ylabel('Average Rewards')
    # plt.legend()
    # plt.show()
    # plt.plot([1250, 1500, 1750, 2000, 2250, 2500], time_p, label='Policy Iteration')
    # plt.plot([1250, 1500, 1750, 2000, 2250, 2500], time_v, label='Value Iteration')
    # plt.plot([1250, 1500, 1750, 2000, 2250, 2500], time_q, label='Q-Learning')
    # plt.grid()
    # plt.xlabel('State Size')
    # plt.title('Forest Management - Computation Time vs State Size')
    # plt.ylabel('Computation Time')
    # plt.legend()
    # plt.show()
    # plt.plot([1250, 1500, 1750, 2000, 2250, 2500], iters_p, label='Policy Iteration')
    # plt.plot([1250, 1500, 1750, 2000, 2250, 2500], iters_v, label='Value Iteration')
    # plt.grid()
    # plt.xlabel('State Size')
    # plt.title('Forest Management - Convergence vs State Size')
    # plt.ylabel('Iterations')
    # plt.legend()
    # plt.show()

    value_vi = []
    policy_vi = []
    iters_vi = []
    time_vi = []
    gamma_vi = []
    mean_discrep_p = []

    # Policy iteration gamma sweep
    for i in range(0, 10):
        forest_policy = PolicyIteration(P, R, (i + 0.5) / 10)
        forest_policy.run()
        gamma.append((i + 0.5) / 10)
        plt.imshow(np.atleast_2d(forest_policy.policy))
        time_.append(forest_policy.time)
        policy.append(forest_policy.policy)
        iters.append(forest_policy.iter)
        value.append(np.mean(forest_policy.V))

    # Value iteration gamma sweep
    for i in range(0, 10):
        forest_policy = ValueIteration(P, R, (i + 0.5) / 10)
        forest_policy.run()
        gamma_vi.append((i + 0.5) / 10)
        time_vi.append(forest_policy.time)
        policy_vi.append(forest_policy.policy)
        iters_vi.append(forest_policy.iter)
        value_vi.append(np.mean(forest_policy.V))

    # P, R = mdptoolbox.example.forest(S=1250, p=0.1)
    value_q = []
    policy_q = []
    iters_q = []
    time_q_arr = []
    gamma_q = []
    q_vals = []
    q_rewards = []
mean_discrep = []
env2 = gym.make('FrozenLake-v0')
q_table = []

# Q-learning sweep.
# NOTE: the discount passed to QLearning is fixed at 0.1 even though
# gamma_q records (i + 0.5) / 10 for each run.
for i in range(0, 10):
    start = time.time()
    q_policy = mdptoolbox.mdp.QLearning(P, R, 0.1)
    q_policy.run()
    time_q = time.time() - start   # time the run itself, not just construction
    q_rewards.append(np.mean(q_policy.V))
    value_q.append(np.mean(q_policy.V))
    policy_q.append(q_policy.policy)
    gamma_q.append((i + 0.5) / 10)
    q_vals.append(q_policy.Q)
    mean_discrep.append(q_policy.mean_discrepancy)
    # iters_q.append(q_policy.n_iters)
    time_q_arr.append(time_q)

plt.plot(gamma, time_, label='Policy Iteration')
plt.plot(gamma_vi, time_vi, label='Value Iteration')
plt.plot(gamma_q, time_q_arr, label='Q-Learning')
plt.xlabel('Gammas')
plt.title('Forest Management - Computation Time - Policy Iteration vs Value Iteration vs Q-Learning')
plt.ylabel('Computation Time')
plt.grid()
plt.legend()
plt.show()

plt.plot(gamma, value, label='Policy Iteration')
plt.plot(gamma_vi, value_vi, label='Value Iteration')
plt.plot(gamma_q, q_rewards, label='Q-Learning')
plt.xlabel('Gammas')
plt.title('Average Rewards - Policy Iteration vs Value Iteration vs Q-Learning')
plt.ylabel('Average Rewards')
plt.grid()
plt.legend()
plt.show()

plt.plot(gamma, iters, label="Policy Iteration")
plt.plot(gamma_vi, iters_vi, label="Value Iteration")
# plt.plot(gamma_q, iters_q, label="Q-Learning")
plt.xlabel('Gammas')
plt.title('Iterations to Converge - Policy Iteration vs Value Iteration')
plt.ylabel('Iterations')
plt.grid()
plt.legend()
plt.show()
# Fragment: epsilons, gammas, maxiter, episodes, env, actions, and the `common`
# helper module are defined earlier in the surrounding script.
eghls = []
ets = []
bestgoal = 0
bestpolicy = None
bestpolicyV = None
bestpolicyparams = {}

print("Running ...")
for epsilon in epsilons:
    iters = []
    ghls = []
    ts = []
    for gamma in gammas:
        # print("gamma: %.1f, epsilon: %s" % (gamma, str(epsilon)))
        func = ValueIteration(P, R, gamma, max_iter=maxiter, epsilon=epsilon)
        func.run()
        # print("best policy:")
        # common.printPolicy(env, func.policy, actions)
        timesteps, gtimesteps, ghl = common.runPolicy(env, episodes, func.policy)
        if ghl[0] > bestgoal:
            bestgoal = ghl[0]
            bestpolicy = func.policy
            bestpolicyV = func.V
            bestpolicyparams['gamma'] = gamma
            bestpolicyparams['epsilon'] = epsilon
            bestpolicyparams['iterations'] = func.iter
            bestpolicyparams['elapsedtime'] = func.time
            bestpolicyparams['meangtimesteps'] = np.mean(gtimesteps)
        iters.append(func.iter)
        ghls.append(ghl)
def example():
    """Run the MDP Toolbox forest example."""
    transitions, rewards = mdptoolbox.example.forest()
    viter = ValueIteration(transitions, rewards, 0.9)
    viter.run()
    print(viter.policy)
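# Added note: a hedged companion to example() above, showing the same forest
# problem solved with policy iteration for comparison. It assumes the standard
# mdptoolbox API (mdptoolbox.example.forest, mdptoolbox.mdp.PolicyIteration).
import mdptoolbox.example
from mdptoolbox.mdp import PolicyIteration

def example_policy_iteration():
    """Solve the default forest example with policy iteration and print the policy."""
    transitions, rewards = mdptoolbox.example.forest()
    piter = PolicyIteration(transitions, rewards, 0.9)
    piter.run()
    print(piter.policy)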