def main():        
    row, col = 50, 50 
    s_terminal = [0]               # Terminal state (goal to reach)
    s_goal = [0]                   # Goal state
    r_goal = [0]                   # Goal reward
    s_penalty = [0]                # State to avoid (penalty block)
    r_penalty = [0]                # Penalty reward
    prob = 0.1
    #epsilon = 0.01
    epsilon = 0.01
    #gamma = 0.9    # Does not perform as well and stops at 129 iterations
    gamma = 0.99    # Performs better, converging at around 250 iterations
    r = -0.1
    #r = -0.04
    S=2500
    #S=1000
    r1=400
    r2=1
    p=.1
    A = 2
    max_iterations=5000000
    #Set the environment
    #np.random.seed(1729)
    P, R = mdpTBEx.forest(S=S, r1=r1, r2=r2, p=p, is_sparse=False)
    print(P)
    print(R)
    
    q_utilitys, q_policys, iteration, q_runtimes = run_qlearning(P, R, gamma, [50000,50000000], row, col)
    draw_gridworld(q_utilitys, q_policys, iteration, ['Q','Q','Q','Q'], 'q-5B-forest.png',row, col)
Example #2
def main():        
    row, col = 50, 50 
    s_terminal = [0]               # Terminal state (goal to reach)
    s_goal = [0]                   # Goal state
    r_goal = [0]                   # Goal reward
    s_penalty = [0]                # State to avoid (penalty block)
    r_penalty = [0]                # Penalty reward
    prob = 0.1
    #epsilon = 0.01
    epsilon = 0.01
    #gamma = 0.9    # Does not perform as well and stops at 129 iterations
    gamma = 0.99    # Performs better, converging at around 250 iterations
    r = -0.1
    #r = -0.04
    S=2500
    #S=1000
    r1=400
    r2=1
    p=.1
    A = 2
    max_iterations=5000000
    #Set the environment
    #np.random.seed(1729)
    P, R = mdpTBEx.forest(S=S, r1=r1, r2=r2, p=p, is_sparse=False)
    #print(P)
    #print(R)
    
    params_iterations(P, R, max_iterations, "Forest")
    
    vi_utilitys, vi_policys, vi_iterations, vi_runtimes = run_vi(P, R, gamma, 
                                                                  [50,100,175,250],
                                                                  epsilon,row,col)
    
    draw_gridworld(vi_utilitys, vi_policys, vi_iterations, ['VI','VI','VI','VI'], '1.forest-vi-1.png',row, col)
    
        
    print("Value Iterations - Forest")
    for index, utility in enumerate(vi_utilitys):
        print(index,  np.amax(utility), vi_iterations[index])
    
    
    pi_utilitys, pi_policys, pi_iterations, pi_runtimes = run_pi(P, R, gamma, [5,20,50,75], epsilon, row, col)
    draw_gridworld(pi_utilitys, pi_policys, pi_iterations, ['PI','PI','PI','PI'], '2.forest-pi-1.png',row, col)
    
    print("Policy Iterations - Forest")
    for index, utility in enumerate(pi_utilitys):
        print(index, np.amax(utility), pi_iterations[index])

    q_utilitys, q_policys, iteration, q_runtimes = run_qlearning(P, R, gamma, [10000,50000,100000,500000], row, col)
    #q_utilitys, q_policys, iteration, q_runtimes = run_qlearning(P, R, gamma, [100,100,100,100], row, col)
    draw_gridworld(q_utilitys, q_policys, iteration, ['Q','Q','Q','Q'], '3.forest-q-1.png',row, col)
    
    print("Q learning Iterations - Forest")
    for index, utility in enumerate(q_utilitys):
        print(index, np.amax(utility), iteration[index])
Example #3
def test_qlearning_discounted_reward(discount_factor_range=(0.1, 0.3, 0.5, 0.9, 0.99), num_sim=50):
    dfs = []
    for factor in discount_factor_range:
        series = []
        for n in range(10000, 10000 + num_sim):
            P, R = forest(S=50, p=0.0, r1=50, r2=25)
            mdp = solve_mdp.solve_mdp_by_qlearning(P, R, discount=factor, max_iter=n)
            series.append(mdp)
        df = pd.concat(series, axis=1).T
        dfs.append(df)
    return pd.concat(dfs)
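# Possible usage (illustrative; assumes solve_mdp.solve_mdp_by_qlearning returns a
# pandas Series of run statistics per run, as the pd.concat(..., axis=1).T above implies):
#     results = test_qlearning_discounted_reward(discount_factor_range=(0.9, 0.99), num_sim=5)
#     results.to_csv("qlearning_discount_sweep.csv", index=False)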
Example #4
def test_discount_factor(discount_factor_range=(0.1, 0.2, 0.3, 0.5, 0.7, 0.9, 0.99), num_sim=50):
    dfs = []
    for factor in discount_factor_range:
        series = []
        for n in range(1, num_sim + 1):
            P, R = forest(S=50)
            vi = solve_mdp.solve_mdp_by_iteration(ValueIteration, P, R, discount=factor, max_iter=n)
            series.append(vi)
        df = pd.concat(series, axis=1).T
        dfs.append(df)
    return pd.concat(dfs)
Example #5
def test_forest_age(forest_age_range=(3, 10, 50, 100), num_sim=50):
    dfs = []
    for age in forest_age_range:
        series = []
        for n in range(1, num_sim + 1):
            P, R = forest(S=age)
            vi = solve_mdp.solve_mdp_by_iteration(ValueIteration, P, R, max_iter=n)
            series.append(vi)
        df = pd.concat(series, axis=1).T
        dfs.append(df)
    return pd.concat(dfs)
Example #6
def test_qlearning_algorithm(
    forest_states_size=50, fire_prob=0.01, r1=50, r2=25, discount=0.9, num_sim_range=(10000, 10050), verbose=False
):
    P, R = forest(S=forest_states_size, r1=r1, r2=r2, p=fire_prob)
    min_value, max_value = num_sim_range
    series = []
    for n in range(min_value, max_value):
        s = solve_mdp.solve_mdp_by_qlearning(P, R, discount=discount, max_iter=n, verbose=verbose)
        series.append(s)
    df = pd.concat(series, axis=1)
    return df.T
Example #7
def test_fire_probability(fireprob_range=(0.01, 0.1, 0.2, 0.5, 0.8, 0.9, 0.99), num_sim=50):
    dfs = []
    for factor in fireprob_range:
        series = []
        for n in range(1, num_sim + 1):
            P, R = forest(S=50, p=factor)
            vi = solve_mdp.solve_mdp_by_iteration(ValueIteration, P, R, max_iter=n)
            vi = vi.append(pd.Series(factor, index=["fire_probability"]))
            series.append(vi)
        df = pd.concat(series, axis=1).T
        dfs.append(df)
    return pd.concat(dfs)
Example #8
def test_qlearning_deterministic(fireprob_range=(0.0, 0.1, 0.2, 0.5, 1.0), num_sim=50):
    dfs = []
    for factor in fireprob_range:
        series = []
        for n in range(10000, 10000 + num_sim):
            P, R = forest(S=50, p=factor, r1=50, r2=25)
            vi = solve_mdp.solve_mdp_by_qlearning(P, R, max_iter=n)
            vi = vi.append(pd.Series(factor, index=["fire_probability"]))
            series.append(vi)
        df = pd.concat(series, axis=1).T
        dfs.append(df)
    return pd.concat(dfs)
Example #9
# Plot V over time
avg_vs = []
for stat in q.run_stats:
    avg_v = stat['Mean V']
    avg_vs.append(avg_v)
plt.plot(avg_vs)
plt.title("Average V Value Over Time")
plt.xlabel("Iteration Number")
plt.ylabel("Average V")
plt.savefig("FL_large_q_conv.png")

# endregion

# Forest Management

P, R = example.forest(S=10, r1=2, r2=1, p=.1)

# region VI

# avg V, n_iter, time
ep_vals = [.1, .0001]
gamma_vals = [.2, .5, .8, .95, .999]

big_vs = []
big_n = []
big_t = []
big_p = []
for epsilon in ep_vals:
    avg_vs = []
    n_iters = []
    times = []
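    # (The snippet is truncated here.) A minimal sketch of how the inner sweep could
    # fill these lists, assuming `import mdptoolbox` and `import numpy as np` are
    # available; this is illustrative only, not the author's original loop body:
    for gamma in gamma_vals:
        vi = mdptoolbox.mdp.ValueIteration(P, R, gamma, epsilon=epsilon)
        vi.run()
        avg_vs.append(np.mean(vi.V))   # average state value at convergence
        n_iters.append(vi.iter)        # iterations until the epsilon threshold
        times.append(vi.time)          # wall-clock runtime
        big_p.append(vi.policy)        # greedy policy for this (epsilon, gamma)
    big_vs.append(avg_vs)
    big_n.append(n_iters)
    big_t.append(times)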
Example #10
def learning_experiments():
    policy_iteration_times = np.zeros((1000, 10))
    n_iterations = np.zeros((1000, 10))
    for i, gamma in enumerate(np.linspace(0.1, 0.99, 10)):
        for states in range(2, 1000):
            P, R = example.forest(S=states)
            pi = mdp.mdp.PolicyIteration(P, R, gamma, max_iter=10000)
            pi.run()
            policy_iteration_times[states, i] = pi.time
            n_iterations[states, i] = pi.iter

    np.save(f'{PATH}/policy_iteration_times_forest.npy',
            policy_iteration_times)
    np.save(f'{PATH}/policy_iteration_n_iter_forest.npy', n_iterations)

    # In[96]:

    value_iteration_times = np.zeros((1000, 10, 10))
    n_iterations = np.zeros((1000, 10, 10))
    for j, epsilon in enumerate(np.linspace(0.1, 0.99, 10)):
        for i, gamma in enumerate(np.linspace(0.1, 0.99, 10)):
            for states in range(2, 1000):
                P, R = example.forest(S=states)
                pi = mdp.mdp.ValueIteration(P,
                                            R,
                                            discount=gamma,
                                            max_iter=10000,
                                            epsilon=epsilon)
                pi.run()
                value_iteration_times[states, i, j] = pi.time
                n_iterations[states, i, j] = pi.iter

    np.save(f'{PATH}/value_iteration_times_forest.npy', value_iteration_times)
    np.save(f'{PATH}/value_iteration_n_iter_forest.npy', n_iterations)

    # In[108]:

    Q_iteration_times = np.zeros((1000, 10))
    n_iterations = np.zeros((1000, 10))
    for i, gamma in enumerate(np.linspace(0.1, 0.99, 10)):
        for states in range(2, 1000):
            P, R = example.forest(S=states)
            pi = mdp.mdp.QLearning(P, R, discount=gamma, n_iter=10000)
            pi.run()
            Q_iteration_times[states, i] = pi.time
            n_iterations[states, i] = np.mean(pi.mean_discrepancy)  # mean_discrepancy is a list; store its mean

    np.save(f'{PATH}/Q_iteration_times_forest.npy', Q_iteration_times)
    np.save(f'{PATH}/Q_iteration_n_iter_forest.npy', n_iterations)

    # ## MDP 2: FrozenLake

    # In[98]:
    # In[109]:

    from gym.envs.toy_text.frozen_lake import generate_random_map

    Q_iteration_times = np.zeros((100, 10, 10))
    Q_rewards = np.zeros((100, 10, 10))

    value_n_iterations = np.zeros((100, 10, 10))
    policy_n_iterations = np.zeros((100, 10, 10))
    total_states = np.zeros(100)
    for size in range(2, 100, 5):
        for i, gamma in enumerate(np.linspace(0, 1, 10)):
            for j, epsilon in enumerate(np.linspace(0, 1, 10)):
                random_map = generate_random_map(size=size, p=0.8)
                environment = gym.make('FrozenLake-v0', desc=random_map)
                test = QLearner(0.1, gamma, epsilon, verbose=False)
                start = time.time()
                n = test.learn(50)
                Q_iteration_times[size, i, j] = time.time() - start
                Q_rewards[size, i, j] = n[-1]

    np.save(f'{PATH}/Q_iteration_times_grid.npy', Q_iteration_times)
    np.save(f'{PATH}/Q_iteration_rewards_grid.npy', Q_rewards)

    # In[106]:

    value_iteration_times = np.zeros((100, 10))
    policy_iteration_times = np.zeros((100, 10))

    value_n_iterations = np.zeros((100, 10))
    policy_n_iterations = np.zeros((100, 10))
    total_states = np.zeros(100)
    for size in range(2, 100, 5):
        for i, gamma in enumerate(np.linspace(0, 1, 10)):
            random_map = generate_random_map(size=size, p=0.8)
            environment = gym.make('FrozenLake-v0', desc=random_map)
            total_states[size] = environment.nS
            agent = BasicLearner(environment, environment.nS, environment.nA,
                                 5000, gamma)
            start = time.time()
            opt_v2, opt_policy2, value_iter = agent.value_iteration()
            value_iteration_times[size, i] = time.time() - start
            value_n_iterations[size, i] = value_iter

            start = time.time()
            opt_v2, opt_policy2, policy_iter = agent.policy_iteration()
            policy_iteration_times[size, i] = time.time() - start
            policy_n_iterations[size, i] = policy_iter

    np.save(f'{PATH}/num_states_grid.npy', total_states)
    np.save(f'{PATH}/policy_iteration_times_grid.npy', policy_iteration_times)
    np.save(f'{PATH}/value_iteration_times_grid.npy', value_iteration_times)
    np.save(f'{PATH}/value_iteration_n_iter_grid.npy', value_n_iterations)
    np.save(f'{PATH}/policy_iteration_n_iter_grid.npy', policy_n_iterations)
Example #11
                # Just dump policy
                f.write("Policy is:\n" + str(alg.policy) + "\n")
            if mdp == "grid":
                # Dump reshaped policy and simulated rewards
                reshaped_policy = visualize_policy(alg.policy, dim)
                simulated_rewards = get_reward(P, R, alg.policy, 10)
                f.write("Policy is:\n" + str(reshaped_policy) + "\n")
                f.write("Simulated rewards are:" + str(simulated_rewards) +
                        "\n")
            f.write("***End of " + vi_pi + " with Gamma=" + str(gamma) +
                    "***\n\n")


if run_forest:
    for dim in forest_dims:
        P, R = forest(dim, 4, 1, 0.4, is_sparse=False)
        prob_str = str(dim) + '_' + str(prob_fire)
        run_gamma_sweep("forest", "vi", prob_str, P, R, gammas, dim)
        run_gamma_sweep("forest", "pi", prob_str, P, R, gammas, dim)

if run_grid:
    for dim in grid_dims:
        P, R = grid_world(X=dim,
                          Y=dim,
                          prob_desired_move=prob_desired_move,
                          prob_bad_state=prob_bad_state)
        prob_str = str(dim) + 'x' + str(dim) + '_' + str(
            prob_desired_move) + '_' + str(prob_bad_state)
        run_gamma_sweep("grid", "vi", prob_str, P, R, gammas, dim)
        run_gamma_sweep("grid", "pi", prob_str, P, R, gammas, dim)
Example #12
def solve_forest_example(forest_states_size=50, r1=50, r2=25, fire_prob=0.1, num_simulations=50, discount=0.9):
    P, R = forest(S=forest_states_size, r1=r1, r2=r2, p=fire_prob)
    vi = solve_mdp.test_algorithm(ValueIteration, P, R, discount=discount, num_sim=num_simulations)
    pi = solve_mdp.test_algorithm(PolicyIteration, P, R, discount=discount, num_sim=num_simulations)
    df = pd.concat([vi, pi])
    return df
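# Example invocation (illustrative; assumes solve_mdp.test_algorithm returns one
# row of summary statistics per simulation, as the pd.concat above implies):
#     summary = solve_forest_example(forest_states_size=50, num_simulations=10)
#     print(summary.head())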
Example #13
gamma = 0.5

dims = [20, 50, 100, 500, 1000, 5000, 10000]

# Output files
for num_states in dims:
	forest_pi_stats_file = 'output/csv/forest_'+str(num_states)+'_'+str(prob_fire)+'_gamma='+str(gamma)+'_stats_pi.csv'
	forest_vi_stats_file = 'output/csv/forest_'+str(num_states)+'_'+str(prob_fire)+'_gamma='+str(gamma)+'_stats_vi.csv'
	forest_ql_stats_file = 'output/csv/forest_'+str(num_states)+'_'+str(prob_fire)+'_gamma='+str(gamma)+'_stats_ql.csv'
	forest_summary_file = 'output/forest_'+str(num_states)+'_'+str(prob_fire)+'_gamma='+str(gamma)+'_summary.rpt'

	# Sets up consistent seed
	np.random.seed(0)

	# MDP
	Trans_Prob, Rewards = forest(num_states, reward_wait, reward_cut, prob_fire, is_sparse=True)

	# Value Iteration
	# Convergence is based on the change in the value function: compute
	# V - V_prev, take its span (max value minus min value) as the error, and
	# stop once that span falls below a threshold.
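	# A minimal illustrative sketch of that span-based stopping rule (pymdptoolbox's
	# ValueIteration applies an equivalent check internally when gamma < 1):
	#     diff = V - V_prev
	#     span = diff.max() - diff.min()
	#     converged = span < epsilon * (1 - gamma) / gamma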
	if run_vi:
		print("Running Value Iteration ...")
		vi = ValueIteration(Trans_Prob, Rewards, gamma)
		vi_stats = vi.run()
		vi_df = pd.DataFrame(vi_stats)
		vi_df.to_csv(forest_vi_stats_file, index_label="Iteration")
		with open(forest_summary_file, 'w') as f:
			f.write("***Value Iteration***\n")
			f.write("Num iters: "+str(vi.iter)+"\nRuntime: "+str(vi.time))
			f.write("Optimal value function:\n"+str(vi.V)+"\n")
			f.write("Optimal policy :\n"+str(vi.policy)+"\n")
	# Epsilon files
	epsilon_low_file = base_path+'epsilon_0.1_no_decay.csv'
	epsilon_high_file = base_path+'epsilon_0.9_no_decay.csv'
	epsilon_decay_file = base_path+'epsilon_1.0_decay.csv'
	epsilon_sweep_file = base_sweep_path+'epsilon_sweep.rpt'

	# Gamma files
	gamma_low_file = base_path+'gamma_0.1.csv'
	gamma_med_file = base_path+'gamma_0.5.csv'
	gamma_high_file = base_path+'gamma_0.9.csv'
	gamma_sweep_file = base_sweep_path+'gamma_sweep.rpt'



	# Build world
	Trans_Prob, Rewards = forest(dim, 4, 1, 0.4, is_sparse=False)

	if run_alpha_sweep:
		# Low
		ql = QLearning(Trans_Prob, Rewards, 0.9, n_iter=ql_iter, alpha=0.001, alpha_decay=1.00)
		ql_stats = ql.run()
		ql_df = pd.DataFrame(ql_stats)
		ql_df.to_csv(alpha_low_file, index_label="Iteration")
		with open(alpha_sweep_file, 'a') as f:
			f.write("***Alpha = 0.001 with No Decay***\n")
			f.write("Policy is:\n"+str(ql.policy)+"\n")
			f.write("***End of Alpha = 0.001 with No Decay***\n\n")

		# High
		ql = QLearning(Trans_Prob, Rewards, 0.9, n_iter=ql_iter, alpha=0.5, alpha_decay=1.00)
		ql_stats = ql.run()
        # (The snippet switches here to a different script: the tail of the
        # gradient-ascent loop in the policy_gradient_algo helper used below.)
        pgrad = policy_gradient(theta, mu, paths_states, paths_actions, paths_rewards)
        #mu_diff = np.linalg.norm(mu_policy(theta) - mu_policy(theta+pgrad))
        theta_diff_norm = np.linalg.norm(theta_diff)
        n = n + 1
    return theta

n_states = 5
n_actions = 2
fire_prob = 0.1
discount = 0.9
n_paths = 100
path_len = 10
path_num = 10

P, R = mdp_ex.forest(S=n_states, p=fire_prob)
# P is the transition matrix with shape (action, state, state): entry (a, i, j)
# is the probability of moving from state i to state j when taking action a.
# R is the reward matrix with shape (state, action): entry (i, a) is the reward
# obtained for taking action a in state i.
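# Quick sanity check of those shapes (the forest example has 2 actions,
# 0 = "wait" and 1 = "cut"); purely illustrative:
assert P.shape == (2, n_states, n_states)
assert R.shape == (n_states, 2)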


# Start policy iteration from the all-"cut" policy (action 1 in every state)
pi = mdptoolbox.mdp.PolicyIteration(P, R, discount=discount, policy0=[1, 1, 1, 1, 1])
#pi.setVerbose()
pi.run()

policy_pi = pi.policy

print "Optimal policy (policy iteration) : \n" , policy_pi

policy_pg = policy_gradient_algo(P, R, discount, path_len, n_paths, gamma=10, eps=0.01)