def main():
    row, col = 50, 50
    s_terminal = [0]  # Goal to reach
    s_goal = [0]      # Goal value
    r_goal = [0]
    s_penalty = [0]   # Goal to avoid - penalty block
    r_penalty = [0]   # Penalty points
    prob = 0.1
    epsilon = 0.01
    #gamma = 0.9  # Does not perform as well and stops at 129 iterations
    gamma = 0.99  # Performs better at 250 iterations
    r = -0.1
    #r = -0.04
    S = 2500
    #S = 1000
    r1 = 400
    r2 = 1
    p = .1
    A = 2
    max_iterations = 5000000

    # Set up the environment
    #np.random.seed(1729)
    P, R = mdpTBEx.forest(S=S, r1=r1, r2=r2, p=p, is_sparse=False)
    print(P)
    print(R)

    q_utilitys, q_policys, iteration, q_runtimes = run_qlearning(
        P, R, gamma, [50000, 50000000], row, col)
    draw_gridworld(q_utilitys, q_policys, iteration, ['Q', 'Q', 'Q', 'Q'],
                   'q-5B-forest.png', row, col)
def main():
    row, col = 50, 50
    s_terminal = [0]  # Goal to reach
    s_goal = [0]      # Goal value
    r_goal = [0]
    s_penalty = [0]   # Goal to avoid - penalty block
    r_penalty = [0]   # Penalty points
    prob = 0.1
    epsilon = 0.01
    #gamma = 0.9  # Does not perform as well and stops at 129 iterations
    gamma = 0.99  # Performs better at 250 iterations
    r = -0.1
    #r = -0.04
    S = 2500
    #S = 1000
    r1 = 400
    r2 = 1
    p = .1
    A = 2
    max_iterations = 5000000

    # Set up the environment
    #np.random.seed(1729)
    P, R = mdpTBEx.forest(S=S, r1=r1, r2=r2, p=p, is_sparse=False)
    #print(P)
    #print(R)

    params_iterations(P, R, max_iterations, "Forest")

    vi_utilitys, vi_policys, vi_iterations, vi_runtimes = run_vi(
        P, R, gamma, [50, 100, 175, 250], epsilon, row, col)
    draw_gridworld(vi_utilitys, vi_policys, vi_iterations,
                   ['VI', 'VI', 'VI', 'VI'], '1.forest-vi-1.png', row, col)
    print("Value Iterations - Forest")
    for index, utility in enumerate(vi_utilitys):
        print(index, np.amax(utility), vi_iterations[index])

    pi_utilitys, pi_policys, pi_iterations, pi_runtimes = run_pi(
        P, R, gamma, [5, 20, 50, 75], epsilon, row, col)
    draw_gridworld(pi_utilitys, pi_policys, pi_iterations,
                   ['PI', 'PI', 'PI', 'PI'], '2.forest-pi-1.png', row, col)
    print("Policy Iterations - Forest")
    for index, utility in enumerate(pi_utilitys):
        print(index, np.amax(utility), pi_iterations[index])

    q_utilitys, q_policys, iteration, q_runtimes = run_qlearning(
        P, R, gamma, [10000, 50000, 100000, 500000], row, col)
    #q_utilitys, q_policys, iteration, q_runtimes = run_qlearning(P, R, gamma, [100, 100, 100, 100], row, col)
    draw_gridworld(q_utilitys, q_policys, iteration,
                   ['Q', 'Q', 'Q', 'Q'], '3.forest-q-1.png', row, col)
    print("Q learning Iterations - Forest")
    for index, utility in enumerate(q_utilitys):
        print(index, np.amax(utility), iteration[index])
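# NOTE: run_vi, run_pi, run_qlearning, and draw_gridworld are defined
# elsewhere in this project. For reference, a minimal sketch of what run_vi
# could look like, assuming the hiive mdptoolbox ValueIteration API (the
# name run_vi_sketch is illustrative, not the project's implementation):
import numpy as np
from hiive.mdptoolbox.mdp import ValueIteration

def run_vi_sketch(P, R, gamma, iteration_caps, epsilon, row, col):
    # One ValueIteration run per iteration cap; collect the value function,
    # policy, iterations used, and wall-clock time for each run.
    utilitys, policys, iterations, runtimes = [], [], [], []
    for max_iter in iteration_caps:
        vi = ValueIteration(P, R, gamma, epsilon=epsilon, max_iter=max_iter)
        vi.run()
        utilitys.append(np.asarray(vi.V))
        policys.append(np.asarray(vi.policy))
        iterations.append(vi.iter)
        runtimes.append(vi.time)
    return utilitys, policys, iterations, runtimes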
def test_qlearning_discounted_reward(discount_factor_range=(0.1, 0.3, 0.5, 0.9, 0.99), num_sim=50):
    dfs = []
    for factor in discount_factor_range:
        series = []
        for n in range(10000, 10000 + num_sim):
            P, R = forest(S=50, p=0.0, r1=50, r2=25)
            mdp = solve_mdp.solve_mdp_by_qlearning(P, R, discount=factor, max_iter=n)
            series.append(mdp)
        df = pd.concat(series, axis=1).T
        dfs.append(df)
    return pd.concat(dfs)
def test_discount_factor(discount_factor_range=(0.1, 0.2, 0.3, 0.5, 0.7, 0.9, 0.99), num_sim=50):
    dfs = []
    for factor in discount_factor_range:
        series = []
        for n in range(1, num_sim + 1):
            P, R = forest(S=50)
            vi = solve_mdp.solve_mdp_by_iteration(ValueIteration, P, R, discount=factor, max_iter=n)
            series.append(vi)
        df = pd.concat(series, axis=1).T
        dfs.append(df)
    return pd.concat(dfs)
def test_forest_age(forest_age_range=(3, 10, 50, 100), num_sim=50):
    dfs = []
    for age in forest_age_range:
        series = []
        for n in range(1, num_sim + 1):
            P, R = forest(S=age)
            vi = solve_mdp.solve_mdp_by_iteration(ValueIteration, P, R, max_iter=n)
            series.append(vi)
        df = pd.concat(series, axis=1).T
        dfs.append(df)
    return pd.concat(dfs)
def test_qlearning_algorithm(
    forest_states_size=50, fire_prob=0.01, r1=50, r2=25, discount=0.9,
    num_sim_range=(10000, 10050), verbose=False
):
    P, R = forest(S=forest_states_size, r1=r1, r2=r2, p=fire_prob)
    min_value, max_value = num_sim_range
    series = []
    for n in range(min_value, max_value):
        s = solve_mdp.solve_mdp_by_qlearning(P, R, discount=discount, max_iter=n, verbose=verbose)
        series.append(s)
    df = pd.concat(series, axis=1)
    return df.T
def test_fire_probability(fireprob_range=(0.01, 0.1, 0.2, 0.5, 0.8, 0.9, 0.99), num_sim=50):
    dfs = []
    for factor in fireprob_range:
        series = []
        for n in range(1, num_sim + 1):
            P, R = forest(S=50, p=factor)
            vi = solve_mdp.solve_mdp_by_iteration(ValueIteration, P, R, max_iter=n)
            # Series.append was removed in pandas 2.0; concat keeps this portable
            vi = pd.concat([vi, pd.Series(factor, index=["fire_probability"])])
            series.append(vi)
        df = pd.concat(series, axis=1).T
        dfs.append(df)
    return pd.concat(dfs)
def test_qlearning_deterministic(fireprob_range=(0.0, 0.1, 0.2, 0.5, 1.0), num_sim=50):
    dfs = []
    for factor in fireprob_range:
        series = []
        for n in range(10000, 10000 + num_sim):
            P, R = forest(S=50, p=factor, r1=50, r2=25)
            vi = solve_mdp.solve_mdp_by_qlearning(P, R, max_iter=n)
            vi = pd.concat([vi, pd.Series(factor, index=["fire_probability"])])
            series.append(vi)
        df = pd.concat(series, axis=1).T
        dfs.append(df)
    return pd.concat(dfs)
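# Hypothetical usage of the sweep functions above: run one sweep and plot
# the mean result per swept parameter. The "reward" column name depends on
# what solve_mdp returns and is assumed here purely for illustration.
import matplotlib.pyplot as plt

df = test_fire_probability()
df.groupby("fire_probability")["reward"].mean().plot(marker="o")
plt.xlabel("Fire probability p")
plt.ylabel("Mean reward")
plt.savefig("forest_fire_prob_sweep.png")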
# Plot V over time
avg_vs = []
for stat in q.run_stats:
    avg_v = stat['Mean V']
    avg_vs.append(avg_v)
plt.plot(avg_vs)
plt.title("Average V Value Over Time")
plt.xlabel("Iteration Number")
plt.ylabel("Average V")
plt.savefig("FL_large_q_conv.png")
# endregion

# Forest Management
P, R = example.forest(S=10, r1=2, r2=1, p=.1)

# region VI
# avg V, n_iter, time
ep_vals = [.1, .0001]
gamma_vals = [.2, .5, .8, .95, .999]
big_vs = []
big_n = []
big_t = []
big_p = []
for epsilon in ep_vals:
    avg_vs = []
    n_iters = []
    times = []
def learning_experiments():
    policy_iteration_times = np.zeros((1000, 10))
    n_iterations = np.zeros((1000, 10))
    for i, gamma in enumerate(np.linspace(0.1, 0.99, 10)):
        for states in range(2, 1000):
            P, R = example.forest(S=states)
            pi = mdp.mdp.PolicyIteration(P, R, gamma, max_iter=10000)
            pi.run()
            policy_iteration_times[states, i] = pi.time
            n_iterations[states, i] = pi.iter
    np.save(f'{PATH}/policy_iteration_times_forest.npy', policy_iteration_times)
    np.save(f'{PATH}/policy_iteration_n_iter_forest.npy', n_iterations)


# In[96]:

value_iteration_times = np.zeros((1000, 10, 10))
n_iterations = np.zeros((1000, 10, 10))
for j, epsilon in enumerate(np.linspace(0.1, 0.99, 10)):
    for i, gamma in enumerate(np.linspace(0.1, 0.99, 10)):
        for states in range(2, 1000):
            P, R = example.forest(S=states)
            vi = mdp.mdp.ValueIteration(P, R, discount=gamma, max_iter=10000, epsilon=epsilon)
            vi.run()
            value_iteration_times[states, i, j] = vi.time
            n_iterations[states, i, j] = vi.iter
np.save(f'{PATH}/value_iteration_times_forest.npy', value_iteration_times)
np.save(f'{PATH}/value_iteration_n_iter_forest.npy', n_iterations)


# In[108]:

Q_iteration_times = np.zeros((1000, 10))
n_iterations = np.zeros((1000, 10))
for i, gamma in enumerate(np.linspace(0.1, 0.99, 10)):
    for states in range(2, 1000):
        P, R = example.forest(S=states)
        ql = mdp.mdp.QLearning(P, R, discount=gamma, n_iter=10000)
        ql.run()
        Q_iteration_times[states, i] = ql.time
        # mean_discrepancy is a sequence; store its final value as a scalar
        n_iterations[states, i] = ql.mean_discrepancy[-1]
np.save(f'{PATH}/Q_iteration_times_forest.npy', Q_iteration_times)
np.save(f'{PATH}/Q_iteration_n_iter_forest.npy', n_iterations)


# ## MDP 2: FrozenLake

# In[98]:

# In[109]:

from gym.envs.toy_text.frozen_lake import generate_random_map

Q_iteration_times = np.zeros((100, 10, 10))
Q_rewards = np.zeros((100, 10, 10))
value_n_iterations = np.zeros((100, 10, 10))
policy_n_iterations = np.zeros((100, 10, 10))
total_states = np.zeros(100)
for size in range(2, 100, 5):
    for i, gamma in enumerate(np.linspace(0, 1, 10)):
        for j, epsilon in enumerate(np.linspace(0, 1, 10)):
            random_map = generate_random_map(size=size, p=0.8)
            environment = gym.make('FrozenLake-v0', desc=random_map)
            test = QLearner(0.1, gamma, epsilon, verbose=False)
            start = time.time()
            n = test.learn(50)
            Q_iteration_times[size, i, j] = time.time() - start
            Q_rewards[size, i, j] = n[-1]
np.save(f'{PATH}/Q_iteration_times_grid.npy', Q_iteration_times)
np.save(f'{PATH}/Q_iteration_rewards_grid.npy', Q_rewards)


# In[106]:

value_iteration_times = np.zeros((100, 10))
policy_iteration_times = np.zeros((100, 10))
value_n_iterations = np.zeros((100, 10))
policy_n_iterations = np.zeros((100, 10))
total_states = np.zeros(100)
for size in range(2, 100, 5):
    for i, gamma in enumerate(np.linspace(0, 1, 10)):
        random_map = generate_random_map(size=size, p=0.8)
        environment = gym.make('FrozenLake-v0', desc=random_map)
        total_states[size] = environment.nS
        agent = BasicLearner(environment, environment.nS, environment.nA, 5000, gamma)
        start = time.time()
        opt_v2, opt_policy2, value_iter = agent.value_iteration()
        value_iteration_times[size, i] = time.time() - start
        value_n_iterations[size, i] = value_iter
        start = time.time()
        opt_v2, opt_policy2, policy_iter = agent.policy_iteration()
        policy_iteration_times[size, i] = time.time() - start
        policy_n_iterations[size, i] = policy_iter
np.save(f'{PATH}/num_states_grid.npy', total_states)
np.save(f'{PATH}/policy_iteration_times_grid.npy', policy_iteration_times)
np.save(f'{PATH}/value_iteration_times_grid.npy', value_iteration_times)
np.save(f'{PATH}/value_iteration_n_iter_grid.npy', value_n_iterations)
np.save(f'{PATH}/policy_iteration_n_iter_grid.npy', policy_n_iterations)
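# Hypothetical follow-up: load one of the saved timing arrays and plot
# solver wall-clock time against the number of forest states, one curve
# per gamma. The plotting choices here are illustrative.
import numpy as np
import matplotlib.pyplot as plt

pi_times = np.load(f'{PATH}/policy_iteration_times_forest.npy')  # shape (states, gammas)
for i, gamma in enumerate(np.linspace(0.1, 0.99, 10)):
    plt.plot(range(2, 1000), pi_times[2:, i], label=f'gamma={gamma:.2f}')
plt.xlabel('Number of states')
plt.ylabel('Policy iteration time (s)')
plt.legend()
plt.savefig(f'{PATH}/policy_iteration_times_forest.png')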
            # Just dump policy
            f.write("Policy is:\n" + str(alg.policy) + "\n")
            if mdp == "grid":  # use '==' for string comparison, not 'is'
                # Dump reshaped policy and simulated rewards
                reshaped_policy = visualize_policy(alg.policy, dim)
                simulated_rewards = get_reward(P, R, alg.policy, 10)
                f.write("Policy is:\n" + str(reshaped_policy) + "\n")
                f.write("Simulated rewards are:" + str(simulated_rewards) + "\n")
            f.write("***End of " + vi_pi + " with Gamma=" + str(gamma) + "***\n\n")


if run_forest:
    for dim in forest_dims:
        P, R = forest(dim, 4, 1, 0.4, is_sparse=False)
        prob_str = str(dim) + '_' + str(prob_fire)
        run_gamma_sweep("forest", "vi", prob_str, P, R, gammas, dim)
        run_gamma_sweep("forest", "pi", prob_str, P, R, gammas, dim)

if run_grid:
    for dim in grid_dims:
        P, R = grid_world(X=dim, Y=dim, prob_desired_move=prob_desired_move,
                          prob_bad_state=prob_bad_state)
        prob_str = str(dim) + 'x' + str(dim) + '_' + str(prob_desired_move) + '_' + str(prob_bad_state)
        run_gamma_sweep("grid", "vi", prob_str, P, R, gammas, dim)
        run_gamma_sweep("grid", "pi", prob_str, P, R, gammas, dim)
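# get_reward above simulates rewards under a fixed policy but is not defined
# in this listing. A minimal sketch, assuming P has shape (A, S, S) and R
# has shape (S, A) as produced by the forest/grid generators; the signature
# and defaults are assumptions:
import numpy as np

def get_reward_sketch(P, R, policy, num_episodes, horizon=100, start_state=0):
    # Monte Carlo estimate of the mean undiscounted return of `policy`
    returns = []
    for _ in range(num_episodes):
        s, total = start_state, 0.0
        for _ in range(horizon):
            a = policy[s]
            total += R[s, a]
            s = np.random.choice(len(P[a][s]), p=P[a][s])
        returns.append(total)
    return np.mean(returns)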
def solve_forest_example(forest_states_size=50, r1=50, r2=25, fire_prob=0.1, num_simulations=50, discount=0.9):
    P, R = forest(S=forest_states_size, r1=r1, r2=r2, p=fire_prob)
    vi = solve_mdp.test_algorithm(ValueIteration, P, R, discount=discount, num_sim=num_simulations)
    pi = solve_mdp.test_algorithm(PolicyIteration, P, R, discount=discount, num_sim=num_simulations)
    df = pd.concat([vi, pi])
    return df
gamma = 0.5
dims = [20, 50, 100, 500, 1000, 5000, 10000]

# Output files
for num_states in dims:
    forest_pi_stats_file = 'output/csv/forest_' + str(num_states) + '_' + str(prob_fire) + '_gamma=' + str(gamma) + '_stats_pi.csv'
    forest_vi_stats_file = 'output/csv/forest_' + str(num_states) + '_' + str(prob_fire) + '_gamma=' + str(gamma) + '_stats_vi.csv'
    forest_ql_stats_file = 'output/csv/forest_' + str(num_states) + '_' + str(prob_fire) + '_gamma=' + str(gamma) + '_stats_ql.csv'
    forest_summary_file = 'output/forest_' + str(num_states) + '_' + str(prob_fire) + '_gamma=' + str(gamma) + '_summary.rpt'

    # Set a consistent seed
    np.random.seed(0)

    # MDP
    Trans_Prob, Rewards = forest(num_states, reward_wait, reward_cut, prob_fire, is_sparse=True)

    # Value Iteration
    # Convergence is judged by the change in the value function: compute
    # V - V_prev, take its span (max value minus min value), and declare
    # convergence once that span falls below a threshold.
    if run_vi:
        print("Running Value Iteration ...")
        vi = ValueIteration(Trans_Prob, Rewards, gamma)
        vi_stats = vi.run()
        vi_df = pd.DataFrame(vi_stats)
        vi_df.to_csv(forest_vi_stats_file, index_label="Iteration")
        with open(forest_summary_file, 'w') as f:
            f.write("***Value Iteration***\n")
            f.write("Num iters: " + str(vi.iter) + "\nRuntime: " + str(vi.time))
            f.write("Optimal value function:\n" + str(vi.V) + "\n")
            f.write("Optimal policy :\n" + str(vi.policy) + "\n")
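# For reference, the span-based stopping rule described in the comment above
# can be written as a small standalone check. The function name is
# illustrative; the threshold matches mdptoolbox's epsilon-optimality bound
# for discounted problems (gamma < 1):
import numpy as np

def has_converged(V, V_prev, epsilon, gamma):
    delta = np.asarray(V) - np.asarray(V_prev)
    span = delta.max() - delta.min()
    return span < epsilon * (1 - gamma) / gamma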
# Epsilon files
epsilon_low_file = base_path + 'epsilon_0.1_no_decay.csv'
epsilon_high_file = base_path + 'epsilon_0.9_no_decay.csv'
epsilon_decay_file = base_path + 'epsilon_1.0_decay.csv'
epsilon_sweep_file = base_sweep_path + 'epsilon_sweep.rpt'

# Gamma files
gamma_low_file = base_path + 'gamma_0.1.csv'
gamma_med_file = base_path + 'gamma_0.5.csv'
gamma_high_file = base_path + 'gamma_0.9.csv'
gamma_sweep_file = base_sweep_path + 'gamma_sweep.rpt'

# Build world
Trans_Prob, Rewards = forest(dim, 4, 1, 0.4, is_sparse=False)

if run_alpha_sweep:
    # Low
    ql = QLearning(Trans_Prob, Rewards, 0.9, n_iter=ql_iter, alpha=0.001, alpha_decay=1.00)
    ql_stats = ql.run()
    ql_df = pd.DataFrame(ql_stats)
    ql_df.to_csv(alpha_low_file, index_label="Iteration")
    with open(alpha_sweep_file, 'a') as f:
        f.write("***Alpha = 0.001 with No Decay***\n")
        f.write("Policy is:\n" + str(ql.policy) + "\n")
        f.write("***End of Alpha = 0.001 with No Decay***\n\n")

    # High
    ql = QLearning(Trans_Prob, Rewards, 0.9, n_iter=ql_iter, alpha=0.5, alpha_decay=1.00)
    ql_stats = ql.run()
        pgrad = policy_gradient(theta, mu, paths_states, paths_actions, paths_rewards)
        #mu_diff = np.linalg.norm(mu_policy(theta) - mu_policy(theta + pgrad))
        theta_diff_norm = np.linalg.norm(theta_diff)
        n = n + 1
    return theta


n_states = 5
n_actions = 2
fire_prob = 0.1
discount = 0.9
n_paths = 100
path_len = 100
path_len = 10
path_num = 10

P, R = mdp_ex.forest(S=n_states, p=fire_prob)
# P is the transition matrix of shape (action, state, state); entry (a, i, j)
# is the probability of moving from state i to state j when taking action a.
# R is the reward matrix of shape (state, action); entry (i, a) is the reward
# for taking action a in state i.

pi = mdptoolbox.mdp.PolicyIteration(P, R, discount=discount)
pi.policy0 = [1, 1, 1, 1, 1]
#vi.setVerbose()
pi.run()
policy_pi = pi.policy
print("Optimal policy (policy iteration):\n", policy_pi)

policy_pg = policy_gradient_algo(P, R, discount, path_len, n_paths, gamma=10, eps=0.01)
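# policy_gradient and policy_gradient_algo are only partially shown above.
# A minimal REINFORCE-style sketch under a softmax policy; the
# parameterization and all helper names are assumptions for illustration,
# not the original implementation:
import numpy as np

def softmax_probs(theta, s):
    # theta has shape (n_states, n_actions): per-state action preferences
    prefs = theta[s] - theta[s].max()  # shift for numerical stability
    e = np.exp(prefs)
    return e / e.sum()

def sample_path(P, R, theta, path_len, rng):
    # Roll out one trajectory from state 0 under the softmax policy
    s, states, actions, rewards = 0, [], [], []
    n_actions = theta.shape[1]
    for _ in range(path_len):
        a = rng.choice(n_actions, p=softmax_probs(theta, s))
        states.append(s); actions.append(a); rewards.append(R[s, a])
        s = rng.choice(P.shape[1], p=P[a][s])
    return states, actions, rewards

def reinforce_gradient(theta, discount, paths):
    # Monte Carlo estimate of sum_t discount^t * G_t * grad log pi(a_t | s_t)
    grad = np.zeros_like(theta)
    for states, actions, rewards in paths:
        G = 0.0
        for t in reversed(range(len(states))):
            G = rewards[t] + discount * G  # return from step t onward
            s, a = states[t], actions[t]
            grad_log = -softmax_probs(theta, s)
            grad_log[a] += 1.0  # grad of log softmax w.r.t. theta[s]
            grad[s] += (discount ** t) * G * grad_log
    return grad / len(paths)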