def Q_learner(P, R, id=None):
    """Grid-search Q-learning hyper-parameters and save rewards, runtimes and policies.

    For every combination of gamma, epsilon decay and minimum alpha a
    ``QLearning`` instance is trained for 10,000,000 iterations, timed, and its
    policy is scored with ``test_policy``.  Results are written to text files
    and plots under ``figures/``.

    Args:
        P: Transition model, passed unchanged to ``QLearning`` and ``test_policy``.
        R: Reward model, passed unchanged to ``QLearning`` and ``test_policy``.
        id: Tag interpolated into every output file name.

    Returns:
        None.  The outputs are the files under ``figures/`` and console prints.
    """
    # Function-scope import: the module's import block is not part of this block.
    import os

    alpha_mins = [0.0001, 0.01]
    epsilon_decays = [0.99, 0.999]
    gammas = [0.99, 0.7]

    # Create the output directory before the expensive search so a missing
    # directory cannot throw away hours of training at write time.
    os.makedirs('figures', exist_ok=True)

    report_lines = []
    Rs = []
    runtimes = []
    params = []
    policies = []
    for g in gammas:
        print(g)
        for ed in epsilon_decays:
            for am in alpha_mins:
                start = time.time()
                Q = QLearning(P, R, gamma=g, alpha_min=am, epsilon_decay=ed,
                              n_iter=10000000)
                Q.run()
                end = time.time()
                elapsed = end - start
                runtimes.append(elapsed)

                r = test_policy(P, R, Q.policy)
                Rs.append(r)
                policies.append(Q.policy)
                params.append('gamma={}, a_min={}, e_dec={}'.format(g, am, ed))
                report_lines.append(
                    'gamma={}, alpha_min={}, eplison_dec={}: reward was {}, time was {}\n'.format(
                        g, am, ed, r, elapsed))

    # write
    with open('figures/Q_variables_forest_{}_mil.txt'.format(id), 'w') as f:
        f.write(''.join(report_lines))
    with open('figures/Q_policies_forest_{}_mil.txt'.format(id), 'w') as f:
        for label, policy in zip(params, policies):
            # One record per line; without the newline all policies ran together.
            f.write('{}: policy={}\n'.format(label, policy))

    # plot
    plt.plot(params, Rs)
    plt.title('Q learning params avg reward')
    plt.ylabel('Avg rewards')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig('figures/Q_rewards_forest_{}_mil.png'.format(id))
    plt.clf()

    plt.plot(params, runtimes)
    plt.title('Q learning runtimes')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig('figures/Q_runtimes_forest_{}_mil.png'.format(id))
    plt.clf()
    print('done')
class QL:
    """
    Thin wrapper that builds a QLearning solver from a config mapping and runs it
    """

    def __init__(self, name, transition, reward, config, outdir):
        """
        Store the experiment settings and construct the underlying QLearning
        instance.  ``config`` must provide the keys gamma, n_iter, alpha,
        alpha_decay, alpha_min, epsilon, epsilon_decay and epsilon_min.
        """
        self.name = name
        self.title = "QL"
        self.transition = transition
        self.reward = reward
        self.outdir = outdir
        self.config = config

        # Filled in by run().
        self.policy = None
        self.results = None
        self.dataframe = None

        # Keys are looked up in this order and forwarded under the same names.
        forwarded = ('gamma', 'n_iter', 'alpha', 'alpha_decay', 'alpha_min',
                     'epsilon', 'epsilon_decay', 'epsilon_min')
        solver_kwargs = {key: config[key] for key in forwarded}
        self.instance = QLearning(transition, reward, **solver_kwargs)

    def run(self):
        """
        Run the solver; keep its return value, a DataFrame built from it, and
        the resulting policy on the instance.
        """
        stats = self.instance.run()
        self.results = stats
        self.dataframe = pd.DataFrame(stats)
        self.policy = self.instance.policy
def qlearning():
    """Run Q-learning on forest MDPs of several sizes and plot error convergence.

    For each state count in ``[10, 20, 40, 80]`` a forest problem is built with
    ``forest(S=size, r1=1, r2=5, p=.1)``, solved with ``QLearning`` (third
    positional argument 0.90, ``epsilon_decay=.998``), and the per-entry
    ``'Error'`` and ``'Reward'`` values of ``run_stats`` are collected.  The
    policy of each run is printed, and the error series are handed to
    ``forest_plot.plot_ql_forest_convergence_size``.

    Returns:
        None.
    """
    deltas = {}
    # Collected alongside the errors; the plot call below only consumes deltas.
    rewards = {}
    for size in [10, 20, 40, 80]:
        P, R = forest(S=size, r1=1, r2=5, p=.1)
        ql = QLearning(P, R, 0.90, epsilon_decay=.998)
        ql.run()
        deltas[size] = [stat['Error'] for stat in ql.run_stats]
        rewards[size] = [stat['Reward'] for stat in ql.run_stats]
        print(ql.policy)
    forest_plot.plot_ql_forest_convergence_size(deltas)
def q_learning(P, R, epsilon, discount=(0.9,), n_iter=(1000000,)):
    """Train and score Q-learning over a grid of iteration counts and discounts.

    Args:
        P: Transition model, passed to ``QLearning`` and ``test_policy``.
        R: Reward model, passed to ``QLearning`` and ``test_policy``.
        epsilon: Forwarded to ``QLearning`` as ``epsilon``.
        discount: Iterable of discount values, each passed as the third
            positional argument of ``QLearning``.
        n_iter: Iterable of iteration counts, each forwarded as ``n_iter``.

    Returns:
        A DataFrame with one row per (iteration count, discount) pair and the
        columns Iterations, Discount, Reward, Time, Policy, Value Function and
        Training Rewards.  Empty grids yield an empty frame with those columns.
    """
    # Defaults are tuples rather than lists so they cannot be mutated between
    # calls; they are only iterated, so any iterable still works.
    df_ql = pd.DataFrame(columns=["Iterations", "Discount", "Reward", "Time",
                                  "Policy", "Value Function", "Training Rewards"])
    count = 0
    for iterations in n_iter:
        for disc in discount:
            q = QLearning(P, R, disc, epsilon=epsilon, n_iter=iterations)
            q.run()
            reward = test_policy(P, R, q.policy)
            count += 1
            print("{}: {}".format(count, reward))

            t_rewards = [s['Reward'] for s in q.run_stats]
            info = [iterations, disc, reward, q.time, q.policy, q.V, t_rewards]
            # Append as the next row; the label is the current row count.
            df_ql.loc[len(df_ql)] = info
    return df_ql
# Build the FrozenLake problem and pull out its transition / reward models.
ex = OpenAI_MDPToolbox('FrozenLake-v0', False)
P, R = ex.P, ex.R

# Candidate hyper-parameter grids; only `disc` is swept in this section.
disc = [0.1, 0.3, 0.5, 0.7, 0.9]
ep = [0.00099, 0.001, 0.005, 0.01, 0.03]
alpha = [1.0, 0.9, 0.5, 0.3, 0.1, 0.01]

# Learner settings shared by every run of the discount sweep.
ql_settings = dict(
    alpha=0.1,
    alpha_decay=0.99,
    alpha_min=0.001,
    epsilon=1.0,
    epsilon_min=0.1,
    epsilon_decay=0.99,
    n_iter=10000,
    skip_check=False,
    iter_callback=None,
    run_stat_frequency=None,
)

results = []
for d in disc:
    # Positional arguments: transitions, rewards, discount.
    ql = QLearning(P, R, d, **ql_settings)
    ql.run()
    # Optional diagnostics, left disabled:
    # print('q learning Q matrix:', ql.Q)
    print('q learning value function:', ql.V)
    # print('q learning mean discrepancy:', ql.mean_discrepancy)
    print('q learning best policy:', ql.policy)
    results.append(ql)

plot_rewards(disc, results, 'Q-Learning Discount/Rewards FrozenLake',
             'q_learning_discount_rewards_frozenlake', 'Discount')
# Plot the reward series collected earlier in the script (`iterations` and
# `reward` are defined before this section) and save it as a PNG.
fig, ax = plt.subplots()
ax.plot(iterations, reward)
ax.set(xlabel="Iterations", ylabel="Reward", title="Frozen Lake Policy Iteration")
ax.grid()
fig.savefig("frozen-lake.pi.png")

# Q-learning sweep: one run per epsilon_decay value in `values`.
print("== Q Learning ==")
values = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.99]
# One slot per swept value; each slot is replaced by a numpy array below.
resultRewards = [None] * len(values)
resultIterations = [None] * len(values)
i = 0  # index of the current sweep value into the result lists
for v in values:
    # NOTE(review): epsilon and epsilon_min are both 0.1. If the library clamps
    # epsilon at epsilon_min, sweeping epsilon_decay would have no effect — confirm.
    QL = QLearning(
        P, R, Gamma, n_iter=10000000, epsilon=0.1, epsilon_decay=v, epsilon_min=0.1
    )
    # run QL
    QL.setVerbose()
    QL.run()
    print("QL")
    print(QL.time)
    print(QL.run_stats[-1:])  # last run_stats entry only
    # One array element per run_stats entry.
    resultIterations[i] = np.zeros(len(QL.run_stats))
    resultRewards[i] = np.zeros(len(QL.run_stats))
    j = 0  # position within the current run's arrays
    sum = 0  # running total of 'Reward'; note this rebinds the builtin name `sum`
    for stat in QL.run_stats:
        sum += stat["Reward"]
        resultIterations[i][j] = stat["Iteration"]
# Tail of a construct that begins before this section: adds decision[2] to the
# reward entry for state s / action a.
R[s][a] += decision[2]

# Accumulators for the sweep below. `value_f` and `iters` are not written to
# in this section.
value_f = []
policy = []
iters = []
time_array = []
Q_table = []
rew_array = []

# Plots for variable iterations
niters = [10000, 25000, 50000, 100000, 250000, 500000]
for niter in niters:
    print("doing iteration ", niter)
    # 0.95 is the third positional argument of QLearning.
    ql = QLearning(P, R, 0.95, n_iter=niter)
    ql.run()
    # NOTE(review): this rebinds the name `time`; if the `time` module is
    # imported in this file it is shadowed from here on — confirm.
    time = ql.time
    # The value recorded per run is the maximum entry of ql.V.
    maxV = np.amax(ql.V)
    rew_array.append(maxV)
    Q_table.append(ql.Q)
    policy.append(ql.policy)
    time_array.append(time)

# Figure 1: max of V against the iteration budget.
# NOTE(review): the title says "average rewards" but the plotted series is
# max(ql.V), and the label says epsilon=0.95 although 0.95 is passed as the
# third positional argument of QLearning — confirm the intended wording.
plt.figure()
plt.plot(niters, rew_array, label='epsilon=0.95')
plt.title('Frozenlake QLearning: Iteration vs average rewards')
plt.savefig(plot_path + 'qlearning_iteration_rewards_analysis.png')

# Figure 2: ql.time against the iteration budget (continues past this section).
plt.figure()
plt.plot(niters, time_array, label='epsilon=0.95')
# Build two forest problems that differ only in the last positional argument
# (p_fire for the first, 0.8 for the second) and solve each with value
# iteration, policy iteration and Q-learning.
P, R = forest(num_states, r1, r2, p_fire)
# presumably 0.96 is the discount and 1e-20 the stopping threshold — confirm
# against the ValueIteration signature
vi = ValueIteration(P, R, 0.96, 1e-20)
vi.run()
P2, R2 = forest(num_states, r1, r2, 0.8)
vi2 = ValueIteration(P2, R2, 0.96, 1e-20)
vi2.run()
#
# calculate and plot the v_mean (currently disabled)
# iter_score(vi, vi2)
# gamma_iter_value()
#
#
#
# Policy iteration on the same two problems.
pi = PolicyIteration(P, R, 0.96)
pi.run()
pi2 = PolicyIteration(P2, R2, 0.96)
pi2.run()
# iter_score(pi, pi2)
#
#iter_policy(pi, pi2)
# gamma_iter_value_p()
# Q-learning on both problems with third positional argument 0.4, alpha=0.9
# and 100000 iterations; only this pair is passed to iter_score.
q = QLearning(P, R, 0.4, alpha=0.9, n_iter=100000)
q.run()
q2 = QLearning(P2, R2, 0.4, alpha=0.9, n_iter=100000)
q2.run()
iter_score(q, q2)
# Copy the per-entry 'Iteration' and 'Reward' values of PI.run_stats into
# arrays (`iterations` is allocated before this section) and plot them.
reward = np.zeros(len(PI.run_stats))
i = 0
for stat in PI.run_stats:
    iterations[i] = stat['Iteration']
    reward[i] = stat['Reward']
    i += 1

fig, ax = plt.subplots()
ax.plot(iterations, reward)
ax.set(xlabel='Iterations', ylabel='Reward', title='Forest Policy Iteration')
ax.grid()
fig.savefig("forest.pi.png")

# Q-learning on the same P / R with Gamma, 1,000,000 iterations and
# alpha_decay=0.1.
QL = QLearning(P, R, Gamma, n_iter=1000000, alpha_decay=0.1)
# run QL
QL.setVerbose()
QL.run()
print('QL')
print(QL.time)
print(QL.run_stats[-1:])  # last run_stats entry only
# Re-use `iterations` / `reward` for the Q-learning series, one element per
# run_stats entry.
iterations = np.zeros(len(QL.run_stats))
reward = np.zeros(len(QL.run_stats))
i = 0
sum = 0  # running total of 'Reward'; note this rebinds the builtin name `sum`
for stat in QL.run_stats:
    sum += stat['Reward']
    iterations[i] = stat['Iteration']
    # Unlike the policy-iteration loop above, this stores the cumulative reward.
    reward[i] = sum