def q_learning(self, gamma=0.9, alpha=0.1, alpha_decay=0.99, alpha_min=0.1,
               epsilon=1.0, epsilon_min=0.1, epsilon_decay=0.99, n_iter=10000,
               returnStats=False):
    ql = mdp.QLearning(self.prob, self.rewards, gamma, alpha=alpha,
                       alpha_decay=alpha_decay, alpha_min=alpha_min,
                       epsilon=epsilon, epsilon_min=epsilon_min,
                       epsilon_decay=epsilon_decay, n_iter=n_iter)
    run_stats = ql.run()
    # self.plot(run_stats, 'Frozen Lake - Q-Learning')
    expected_values = ql.V
    optimal_policy = ql.policy
    time = ql.time
    if not returnStats:
        return [
            expected_values, optimal_policy, len(run_stats), time,
            np.sum([rs['Mean V'] for rs in run_stats])
        ]
    return run_stats, optimal_policy
def runQItsByDiscount(transitions, rewards, envName, itRange):
    discountRange = np.linspace(.01, .99, 20)
    # discountRange = [.01, .1, .25, .5, .90, .98]
    stats = []
    for i in itRange:
        print()
        print('{0} Itt: {1}'.format(envName, i))
        for discount in discountRange:
            alg = mdp.QLearning(transitions, rewards, discount, n_iter=i)
            result = alg.run()
            runStats = alg.run_stats[-1]
            stat = [
                i, discount, alg.time, runStats['Iteration'],
                runStats['Error'], np.mean(alg.V), runStats['Reward'],
                alg.policy
            ]
            printStat(stat)
            stats.append(stat)
        save('{0} {1} discount policy'.format(envName, i), alg.policy)
    statsArr = np.array(stats)
    save('{0} discount stats'.format(envName), statsArr)
    roundedDiscounts = [round(x, 3) for x in discountRange]

    title = '{0} Q Learning - Time by Discount Factor'.format(envName)
    fig, ax = plt.subplots()
    for i in itRange:
        iStats = statsArr[statsArr[:, 0] == i]
        times = iStats[:, 2]
        plotQAx(ax, discountRange, times, title, 'Discount Factor', 'Time',
                'iterations {0}'.format(i))
    show(title, fig, ax)

    title = '{0} Q Learning - Error by Discount Factor'.format(envName)
    fig, ax = plt.subplots()
    for i in itRange:
        iStats = statsArr[statsArr[:, 0] == i]
        errors = iStats[:, 4]
        plotQAx(ax, discountRange, errors, title, 'Discount Factor', 'Error',
                'iterations {0}'.format(i))
    show(title, fig, ax)

    title = '{0} Q Learning - Reward by Discount Factor'.format(envName)
    fig, ax = plt.subplots()
    for i in itRange:
        iStats = statsArr[statsArr[:, 0] == i]
        # column 6 holds the final Reward stat collected above; renamed to avoid
        # shadowing the `rewards` parameter, and plot rewards rather than the
        # stale `errors` array the original passed to plotQAx here
        rewardVals = iStats[:, 6]
        plotQAx(ax, discountRange, rewardVals, title, 'Discount Factor',
                'Reward', 'iterations {0}'.format(i))
    show(title, fig, ax)

    stats = sorted(stats, key=lambda x: x[5], reverse=True)
    topPolicy = stats[0]
    topPolicy = topPolicy[-1]
    return topPolicy
def getQLFrames(env, P, R):
    # epsilon_min=0.1    # ln(0.1)/ln(epsilon_decay) == iteration of e_min
    # alpha_min=0.001    # ln(0.001)/ln(alpha_decay) == iteration of a_min
    np.random.seed(1)
    ql = mdp.QLearning(P, R, 0.9, n_iter=100000, alpha_decay=0.99999,
                       epsilon_decay=0.9)
    ql.setVerbose()
    run_stats = ql.run()
    return [step['Value'].reshape(env.nrow, env.ncol) for step in run_stats]
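# A worked version of the two comments above: a small helper (a sketch only,
# assuming the per-iteration multiplicative decay those comments imply, and
# numpy imported as np as elsewhere in this file) that estimates at which
# iteration a decayed parameter first hits its floor.
def decay_floor_iteration(start, floor, decay):
    # start * decay**k <= floor  =>  k >= ln(floor / start) / ln(decay)
    return int(np.ceil(np.log(floor / start) / np.log(decay)))

# Example: with epsilon starting at 1.0, epsilon_min=0.1 and epsilon_decay=0.9,
# decay_floor_iteration(1.0, 0.1, 0.9) -> 22, so exploration bottoms out almost
# immediately; with the default alpha=0.1, alpha_min=0.001 and
# alpha_decay=0.99999, roughly 460,000 steps are needed, so the learning rate
# never reaches its floor within the 100,000 iterations used above.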
def q_learning(P, R, gamma=0.99, alpha=0.1, alpha_decay=0.99, alpha_min=0.05,
               epsilon=1.0, e_min=0.1, e_decay=0.9999, n_iter=100000,
               plot=False, show=False, output="output", problem_name="Forest",
               callback=None):
    if alpha < alpha_min:
        alpha_min = alpha
    if epsilon < e_min:
        e_min = epsilon
    args = {
        "alpha": alpha,
        "alpha_decay": alpha_decay,
        "alpha_min": alpha_min,
        "epsilon": epsilon,
        "epsilon_min": e_min,
        "epsilon_decay": e_decay,
        "n_iter": n_iter,
        "iter_callback": callback if problem_name != "Forest" else None
    }
    ql = mdp.QLearning(P, R, gamma, **args)
    ql_results = ql.run()
    if plot:
        rewards = [i['Mean V'] for i in ql_results]
        iterations = [i['Iteration'] for i in ql_results]
        desc = 'Q-Learning'
        # plot and log results
        plt.clf()
        plt.plot(iterations, rewards)
        plt.title(f"{problem_name}: {desc}: Mean Utility over Iterations")
        plt.ylabel("Mean Utility")
        plt.xlabel("Iterations")
        plt.tight_layout()
        plt.savefig(f"{output}/{problem_name}-{desc}-utility.png")
        if show:
            plt.show()  # the original called plt.plot() here, which is a no-op
        else:
            plt.close()
    print(
        f'Q Learning time: {ql.time}\npolicy: {illustrate_policy(ql.policy, problem_name)}'
    )
    return ql, ql_results
def q_learing_rate_decay(P, R):
    decays = [0.99, 0.9, 0.7, 0.5]
    q_stats = []
    for decay in decays:
        q = mdp.QLearning(P, R, 0.9, alpha=0.5, alpha_decay=decay,
                          n_iter=max_iter).run()
        q_stats.append(q)
    return q_stats, decays
def q_learing_rate_init(P, R):
    rates = [0.01, 0.05, 0.1, 0.2, 0.3]
    q_stats = []
    for rate in rates:
        q = mdp.QLearning(P, R, 0.9, alpha=rate, epsilon_decay=1,
                          n_iter=max_iter).run()
        q_stats.append(q)
    return q_stats, rates
def q_gamma(P, R):
    gammas = np.linspace(0.05, 0.95, 3)
    q_stats = []
    for gamma in gammas:
        q = mdp.QLearning(P, R, gamma, alpha=0.2, alpha_decay=0.99,
                          epsilon_decay=0.99, n_iter=max_iter).run()
        q_stats.append(q)
    return q_stats, gammas
def q_decay_rate(P, R):
    decays = [0.99, 0.9, 0.7, 0.5]
    q_stats = []
    for decay in decays:
        q = mdp.QLearning(P, R, 0.9, alpha=0.01, alpha_decay=0.99,
                          epsilon_decay=decay, n_iter=max_iter).run()
        q_stats.append(q)
    return q_stats, decays
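# The four sweep helpers above repeat the same pattern with a different swept
# keyword. A generic version (a sketch only; assumes `mdp` and the module-level
# `max_iter` used above are in scope) keeps that pattern in one place.
def q_param_sweep(P, R, param_name, values, gamma=0.9, **fixed_kwargs):
    q_stats = []
    for value in values:
        kwargs = dict(fixed_kwargs)
        kwargs[param_name] = value
        # gamma can itself be the swept parameter
        g = kwargs.pop('gamma', gamma)
        q_stats.append(mdp.QLearning(P, R, g, n_iter=max_iter, **kwargs).run())
    return q_stats, values

# Example, equivalent to q_decay_rate(P, R):
# q_param_sweep(P, R, 'epsilon_decay', [0.99, 0.9, 0.7, 0.5],
#               alpha=0.01, alpha_decay=0.99)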
def run_QL_experiments(problem, name, load=True):
    data = []
    if load:
        with open(os.path.join(out, 'QL_results_' + name + '.pkl'), 'rb') as f:
            df = pickle.load(f)
        return df
    print("===========\nQ-Learning\n==========")
    for gamma in config['ql_discount']:
        for alpha in config['ql_alpha']:
            for eps in config['ql_epsilon']:
                for param in [config['ql_params'][name]]:
                    P, R = problem.p(**param)
                    mdp_util.check(P, R)
                    for n in config['ql_iters']:
                        ql = MDP.QLearning(P, R, gamma, alpha=alpha,
                                           epsilon=eps, n_iter=n)
                        run_stats = ql.run()
                        data.append([
                            problem.name, ql.S, ql.A, ql.gamma, alpha, eps,
                            [i['Time'] for i in run_stats], ql.max_iter,
                            [i['Mean V'] for i in run_stats], np.std(ql.V),
                            [i['Max V'] for i in run_stats],
                            [i['Reward'] for i in run_stats],
                            [i['Error'] for i in run_stats], ql.policy
                        ])
                        print(problem.name, ql.S, ql.A, gamma, n, alpha, eps)
    # column order now matches the order values are appended above
    # (std_V is recorded before max_V)
    df = pd.DataFrame(data, columns=[
        'name', '#states', '#actions', 'discount', 'alpha', 'epsilon', 'time',
        'iter', 'mean_V', 'std_V', 'max_V', 'reward', 'error_mean', 'policy'
    ])
    with open(os.path.join(out, 'QL_results_' + name + '.pkl'), 'wb') as f:
        pickle.dump(df, f)
    return df
def frozen_lake_ql(P, R, gamma_range, mapping, shape):
    print("== Q Learning Iteration ==")
    print("gamma #Iterations time (ms)")
    prev_policy = []
    prev_gamma = 0
    no_diff_list = []
    standard_policy = []
    for gamma in gamma_range:
        ql = mdp.QLearning(P, R, gamma, n_iter=10e4)
        ql.run()
        timestr = "%0.3f" % (ql.time * 1000)
        atab = " \t"
        spacing = 3
        gamma_str = "%0.2f" % gamma
        msg = gamma_str + atab * spacing + timestr
        print(msg)
        if gamma == 0.95:
            standard_policy.append((ql.policy, mapping, shape))
        if list(ql.policy) == list(prev_policy):
            no_diff_list.append([prev_gamma, gamma])
        prev_policy = ql.policy
        prev_gamma = gamma
    print()
    print("Q Learning Iteration Policy at Gamma = 0.95")
    contents = standard_policy.pop()
    print_policy(contents[0], contents[1], contents[2])
    print()
    no_diff_len = len(no_diff_list)
    str_list = ["No Policy Difference Between These Gammas: "] * no_diff_len
    policy_diffs = zip(str_list, no_diff_list)
    for diff in policy_diffs:
        print("%s %0.2f %0.2f" % (diff[0], diff[1][0], diff[1][1]))
    print()
def main(): print("Create a frozen lake of Size 10x10") p = generate_FrozenLake(size=10) num_states = len(p) num_actions = len(p[0]) print("Num of States:", num_states) print("Num of Actions:", num_actions) P = np.zeros((num_actions, num_states, num_states)) R = np.zeros((num_actions, num_states, num_states)) for i in range(num_states): for j in range(num_actions): sum = 0 for prob, next_state, rewards, done in p[i][j]: P[j][i][next_state] += prob R[j][i][next_state] = rewards sum += prob # VI for gamma in [.9, 0.6]: vi = mdp.ValueIteration(transitions=P, reward=R, gamma=gamma, epsilon=0.000001, max_iter=5000) stats_data = vi.run() plot_mpd_graph( stats_data, 'VI Frozen_Lake(10x10), Gamma={}, Reward plot'.format(gamma), 'Reward', 'Reward') plot_mpd_graph( stats_data, 'VI Frozen_Lake(10x10), Gamma={}, Time PLot'.format(gamma), 'Time(seconds)', 'Time') # PI for gamma in [.9, 0.6]: print('PI {}'.format(gamma)) pi = mdp.PolicyIteration(transitions=P, reward=R, gamma=gamma, max_iter=5000, eval_type=1) stats_data = pi.run() plot_mpd_graph( stats_data, 'PI Frozen_Lake(10x10), Gamma={}, error plot'.format(gamma), 'Error', 'Error') plot_mpd_graph( stats_data, 'PI Frozen_Lake(10x10), Gamma={}, Time PLot'.format(gamma), 'Time(seconds)', 'Time') # QLearning for alpha in [0.1, 0.4]: qlearn = mdp.QLearning(transitions=P, reward=R, gamma=0.6, alpha=alpha, alpha_decay=0.1, alpha_min=0.0001, epsilon=0.1, epsilon_min=0.9, epsilon_decay=0, n_iter=10000) stats_data = qlearn.run() plot_mpd_graph( stats_data, 'Qlearning Frozen_Lake(10x10), alpha={}, Error plot'.format( alpha), 'Error', 'Error') plot_mpd_graph( stats_data, 'Qlearning Frozen_Lake(10x10), alpha={}, Reward plot'.format( alpha), 'Reward', 'Reward') plot_mpd_graph( stats_data, 'Qlearning Frozen_Lake(10x10), alpha={}, Time PLot'.format(alpha), 'Time(seconds)', 'Time')
for discount in np.arange(.1, 1, .2):  # loop restored to match the PI and QL sweeps below
    ttt = mdp.ValueIteration(P, R, discount)
    ttt.setVerbose()
    start = clock()
    ttt.run()
    elapsed = clock() - start

for discount in np.arange(.1, 1, .2):
    ttt = mdp.PolicyIteration(P, R, discount)
    ttt.setVerbose()
    start = clock()
    ttt.run()
    elapsed = clock() - start

for discount in np.arange(.1, 1, .2):
    qlearner_stats = collections.defaultdict(list)
    ttt = hmdp.QLearning(P, R, discount)
    ttt.setVerbose()
    start = clock()
    ttt.run()
    elapsed = clock() - start
    for stats in ttt.run_stats:
        qlearner_stats['state'].append(stats['State'])
        qlearner_stats['action'].append(stats['Action'])
        qlearner_stats['reward'].append(stats['Reward'])
        qlearner_stats['error'].append(stats['Error'])
        qlearner_stats['time'].append(stats['Time'])
        qlearner_stats['alpha'].append(stats['Alpha'])
        qlearner_stats['epsilon'].append(stats['Epsilon'])
        qlearner_stats['max_v'].append(stats['Max V'])
        qlearner_stats['mean_v'].append(stats['Mean V'])
    qlearner_stats_df = pd.DataFrame(qlearner_stats)
def run_forest(): np.random.seed(0) P, R = example.forest(S=5, r1=3, r2=15, p=0.2) print("Transition Array: ") print(P.shape) print(P) # Transition array A x S x S print("Reward Array: ") print(R.shape) print(R) # Reward array S x A # TODO gamma_range = np.array([0.1, 0.9, 0.99]) alpha_range = np.array([0.01, 0.5, 0.99]) epsilon_range = np.array([0.1, 0.5, 0.95]) e_decay_range = np.array([0.1, 0.5, 0.999]) # gamma_range = np.append(np.linspace(0.1, 0.9, 9), np.linspace(0.91, 0.99, 9)) # alpha_range = np.append(np.linspace(0.01, 0.1, 9), np.linspace(0.2, 0.99, 4)) # epsilon_range = np.linspace(0.1, 1.0, 10) # e_decay_range = np.append(np.linspace(0.1, 0.9, 4), np.linspace(0.91, 0.99, 9)) difference_list = np.zeros(gamma_range.shape) value_iteration_list = np.zeros(gamma_range.shape) value_time_list = np.zeros(gamma_range.shape) value_reward_list = np.zeros(gamma_range.shape) value_error_list = np.zeros(gamma_range.shape) policy_iteration_list = np.zeros(gamma_range.shape) policy_time_list = np.zeros(gamma_range.shape) policy_reward_list = np.zeros(gamma_range.shape) policy_error_list = np.zeros(gamma_range.shape) for i, gamma in enumerate(gamma_range): print('Gamma %0.2f' % gamma) vi = mdp.ValueIteration(transitions=P, reward=R, gamma=gamma, epsilon=0.0001, max_iter=10000) vi.setVerbose() vi.run() vi_stats = vi.run_stats value_iteration_list[i] = vi_stats[-1:][0]['Iteration'] value_time_list[i] = vi_stats[-1:][0]['Time'] value_reward_list[i] = vi_stats[-1:][0]['Reward'] value_error_list[i] = vi_stats[-1:][0]['Error'] plot_stats(vi_stats, ('vi_forest_%0.2f' % gamma)) pi = mdp.PolicyIteration(transitions=P, reward=R, gamma=gamma, max_iter=10000, eval_type=1) pi.setVerbose() pi.run() stats = pi.run_stats policy_iteration_list[i] = stats[-1:][0]['Iteration'] policy_time_list[i] = stats[-1:][0]['Time'] policy_reward_list[i] = stats[-1:][0]['Reward'] policy_error_list[i] = stats[-1:][0]['Error'] plot_stats(stats, ('pi_forest_%0.2f' % gamma)) print('Policies Found') print('Value Iteration: ' + str(vi.policy)) print('Policy Iteration: ' + str(pi.policy)) difference1 = sum([abs(x - y) for x, y in zip(pi.policy, vi.policy)]) difference_list[i] = difference1 print("Discrepancy in Policy and Value Iteration: ", difference1) print() # Plotting # Error v Iteration plt.clf() plt.title('Value Iteration: Error v Iterations') plt.xlabel('Iterations') plt.ylabel('Error') plt.plot(list(value_iteration_list), list(value_error_list)) plt.tight_layout() plt.savefig('plots/forest_experiment/vi_error_v_iteration.png') # Reward v Gamma plt.clf() plt.title('Value Iteration: Reward v Gamma') plt.xlabel('Gamma') plt.ylabel('Reward') plt.plot(list(gamma_range), list(value_reward_list)) plt.tight_layout() plt.savefig('plots/forest_experiment/vi_reward_v_gamma.png') # Gamma v Iterations plt.clf() plt.title('Value Iteration: Gamma v Iterations') plt.xlabel('Iterations') plt.ylabel('Gamma') plt.plot(list(value_iteration_list), list(gamma_range)) plt.tight_layout() plt.savefig('plots/forest_experiment/vi_gamma_v_iterations.png') # Gamma v Time plt.clf() plt.title('Value Iteration: Gamma v Time') plt.xlabel('Time') plt.ylabel('Gamma') plt.plot(list(value_time_list), list(gamma_range)) plt.tight_layout() plt.savefig('plots/forest_experiment/vi_gamma_v_time.png') # Reward vs Iterations plt.clf() plt.title('Value Iteration: Reward v Iterations') plt.xlabel('Iterations') plt.ylabel('Reward') plt.plot(list(value_iteration_list), list(value_reward_list)) plt.tight_layout() 
plt.savefig('plots/forest_experiment/vi_reward_v_iterations.png') # Policy # Error v Iteration plt.clf() plt.title('Policy Iteration: Error v Iterations') plt.xlabel('Iterations') plt.ylabel('Error') plt.plot(list(policy_iteration_list), list(policy_error_list)) plt.tight_layout() plt.savefig('plots/forest_experiment/pi_error_v_iteration.png') # Gamma v Reward plt.clf() plt.title('Policy Iteration: Reward v Gamma') plt.xlabel('Gamma') plt.ylabel('Reward') plt.plot(list(gamma_range), list(policy_reward_list)) plt.tight_layout() plt.savefig('plots/forest_experiment/pi_reward_v_gamma.png') # Gamma v Iterations plt.clf() plt.title('Policy Iteration: Gamma v Iterations') plt.xlabel('Iterations') plt.ylabel('Gamma') plt.plot(list(policy_iteration_list), list(gamma_range)) plt.tight_layout() plt.savefig('plots/forest_experiment/pi_gamma_v_iterations.png') # Gamma v Time plt.clf() plt.title('Policy Iteration: Gamma v Time') plt.xlabel('Time') plt.ylabel('Gamma') plt.plot(list(policy_time_list), list(gamma_range)) plt.tight_layout() plt.savefig('plots/forest_experiment/pi_gamma_v_time.png') # Reward vs Iterations plt.clf() plt.title('Policy Iteration: Reward v Iterations') plt.xlabel('Iterations') plt.ylabel('Reward') plt.plot(list(policy_iteration_list), list(policy_reward_list)) plt.tight_layout() plt.savefig('plots/forest_experiment/pi_reward_v_iterations.png') # Gamma vs Policy Differences plt.clf() plt.title('Gamma v Policy Differences') plt.xlabel('Gamma') plt.ylabel('Policy Differences') plt.plot(list(gamma_range), list(difference_list)) plt.tight_layout() plt.savefig('plots/forest_experiment/gamma_v_differences.png') plt.close('all') prev_Q = None thresh = 1e-4 print('== Q Learning ==') for i, gamma in enumerate(gamma_range): for j, alpha in enumerate(alpha_range): for k, ep in enumerate(epsilon_range): for l, ed in enumerate(e_decay_range): # print('ql: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}'.format(gamma, alpha, ep, ed)) ql = mdp.QLearning(transitions=P, reward=R, gamma=gamma, alpha=alpha, alpha_decay=1.0, alpha_min=0.001, epsilon=ep, epsilon_min=0.1, epsilon_decay=ed, n_iter=10e4) stats = ql.run() plot_stats(stats, ('ql_forest_%0.2f_%0.2f_%0.2f_%0.2f' % (gamma, alpha, ep, ed))) # print('Policy: ') # print(ql.policy) # print(ql.run_stats) df = pd.DataFrame.from_records(ql.run_stats) iteration_list = df['Iteration'][-100:] windowed_reward = df['Reward'][-100:].mean() error_list = df['Error'][-100:].mean() if prev_Q is None: prev_Q = ql.Q else: variation = np.absolute(np.subtract(np.asarray(ql.Q), np.asarray(prev_Q))).max() res = np.abs(np.subtract(np.asarray(prev_Q), np.asarray(ql.Q))) print('Result: ') print(res) print('Variation: ') print(variation) print('Mean Reward for Last 100 Iterations:') print(windowed_reward) if np.all(res < thresh) or variation < thresh or windowed_reward > 1.0: print('Breaking! Below Thresh') print('Found at: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}'.format( gamma, alpha, ep, ed)) print('Optimal Policy: ') print(ql.policy) break
def comparing_mdps(P, R, mapping, shape): print("Comparing the Two Policies") vi = mdp.ValueIteration(P, R, 0.9, max_iter=10000) vi.run() print("Value Function: ") print(vi.V) print("Policy: ") print(vi.policy) print_policy(vi.policy, mapping, shape) print("Iter: ") print(vi.iter) print("Time: ") print(vi.time) # print(vi.run_stats) print() pi = mdp.PolicyIteration(P, R, 0.9, max_iter=100000) pi.run() print("Policy Function: ") print(pi.V) print("Policy: ") print(pi.policy) print_policy(pi.policy, mapping, shape) print("Iter: ") print(pi.iter) print("Time: ") print(pi.time) # print(pi.run_stats) print() pim = mdp.PolicyIterationModified(P, R, 0.9, max_iter=100000, epsilon=0.05) pim.run() print("Policy Modified Function: ") print(pim.V) print("Policy: ") print(pim.policy) print_policy(pim.policy, mapping, shape) print("Iter: ") print(pim.iter) print("Time: ") print(pim.time) # print(pi.run_stats) print() ql = mdp.QLearning( P, R, 0.9, n_iter=10e4, epsilon=0.1, epsilon_decay=0.1, epsilon_min=0.1, ) ql.run() print("Q Learning Function: ") print(ql.V) print("Policy: ") print(ql.policy) print_policy(ql.policy, mapping, shape) print("Mean Discrepancy: ") print(ql.error_mean) # print(ql.v_mean) print("Epsilon: ") print(ql.epsilon) difference1 = sum([abs(x - y) for x, y in zip(pi.policy, vi.policy)]) if difference1 > 0: print("Discrepancy in Policy and Value Iteration: ", difference1) print() difference2 = sum([abs(x - y) for x, y in zip(pim.policy, vi.policy)]) if difference2 > 0: print("Discrepancy in Policy Modified and Value Iteration: ", difference2) print() difference3 = sum([abs(x - y) for x, y in zip(pim.policy, pi.policy)]) if difference3 > 0: print("Discrepancy in Policy Modified and Policy Iteration: ", difference3) print() difference4 = sum([abs(x - y) for x, y in zip(vi.policy, ql.policy)]) if difference4 > 0: print("Discrepancy in Q Learning and Value Iteration: ", difference4) print() difference5 = sum([abs(x - y) for x, y in zip(pi.policy, ql.policy)]) if difference5 > 0: print("Discrepancy in Q Learning and Policy Iteration: ", difference5) print() difference6 = sum([abs(x - y) for x, y in zip(pim.policy, ql.policy)]) if difference6 > 0: print("Discrepancy in Q Learning and Policy Iteration Modified: ", difference6) print()
def run(verbose=False):
    # env = gym.make('FrozenLake-v0', is_slippery=True)
    env = gym.make('FrozenLake8x8-v0', is_slippery=True)
    # env = gym.make('FrozenLake-v0')
    # Debug
    # print('env.P')
    # pprint(env.P)
    # print('env.R')
    # print(env.R)
    P, R = transform_for_MDPToolbox(env)
    # print('Reward')
    # print(R)
    # return

    print('~~~~~~~~~~ FrozenLake-v0 – 4x4 Policy Iteration ~~~~~~~~~~')
    pi = mdp.PolicyIteration(P, R, 0.6, max_iter=100000)
    if verbose:
        pi.setVerbose()
    pi.run()
    util.print_debugs(pi)
    total_r_pi = render_env_policy(env, pi.policy, display=verbose)

    print('~~~~~~~~~~ FrozenLake-v0 – 4x4 Value Iteration ~~~~~~~~~~')
    vi = mdp.ValueIteration(P, R, 0.6, epsilon=0.005, max_iter=10000)
    if verbose:
        vi.setVerbose()
    vi.run()
    util.print_debugs(vi)
    total_r_vi = render_env_policy(env, vi.policy, display=verbose)  # was evaluating pi.policy
    if vi.policy == pi.policy:
        print('FrozenLake-v0 4x4 - Value and Policy Iteration policies are the same!')
    else:
        print('FrozenLake-v0 4x4 - Value and Policy Iteration policies are NOT the same.')

    print('~~~~~~~~~~ FrozenLake-v0 – Q-Learning ~~~~~~~~~~')
    ql = mdp.QLearning(P, R, 0.6, alpha=0.3, epsilon_min=0.005, n_iter=100000)
    if verbose:
        ql.setVerbose()
    start_t = time.process_time()
    ql.run()
    end_t = time.process_time()
    total_r_ql = render_env_policy(env, ql.policy, display=verbose)

    # Output
    print('~~~~~~~~~~ FrozenLake-v0 - Policy Iteration ~~~~~~~~~~')
    util.print_debugs(pi)
    print('Total Reward: %f' % total_r_pi)
    print('~~~~~~~~~~ FrozenLake-v0 - Value Iteration ~~~~~~~~~~')
    util.print_debugs(vi)
    print('Total Reward: %f' % total_r_vi)
    print('~~~~~~~~~~ FrozenLake-v0 - Q-Learning ~~~~~~~~~~')
    print('Clock time')
    print(end_t - start_t)
    print('Total Reward: %f' % total_r_ql)  # was printing the PI reward
    print(ql.policy)
    if vi.policy == pi.policy:
        print('FrozenLake-v0 - Value and Policy Iteration policies are the same!')
    else:
        print('FrozenLake-v0 - Value and Policy Iteration policies are NOT the same.')
    if vi.policy == ql.policy:
        print('FrozenLake-v0 – QL and VI Policies are the same!')
    else:
        print('FrozenLake-v0 – QL and VI Policies are NOT the same.')
    if pi.policy == ql.policy:
        print('FrozenLake-v0 – PI and QL Policies are the same!')
    else:
        print('FrozenLake-v0 – PI and QL Policies are NOT the same.')
    print('VI Policy')
    print_policy(vi.policy)
    # print('PI Policy')
    # print_policy(vi.policy)
    print('QL Policy')
    print_policy(ql.policy)

# Source:
# https://www.oreilly.com/radar/introduction-to-reinforcement-learning-and-openai-gym/
"""
def vi_pi_q_comp(P, R):
    vi = mdp.ValueIteration(P, R, 0.60, epsilon=0.001).run()
    pi = mdp.PolicyIteration(P, R, 0.60, eval_type=1).run()
    q = mdp.QLearning(P, R, 0.6, alpha=0.2).run()
    return vi, pi, q
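# In hiive-mdptoolbox, .run() returns the per-iteration run statistics, so the
# helper above hands back stats and the learned policies are not recoverable
# from its return value. A variant that keeps the solver objects (a sketch under
# the same imports and hyperparameters) makes the policies directly comparable:
def vi_pi_q_policy_comp(P, R, gamma=0.6):
    vi = mdp.ValueIteration(P, R, gamma, epsilon=0.001)
    pi = mdp.PolicyIteration(P, R, gamma, eval_type=1)
    q = mdp.QLearning(P, R, gamma, alpha=0.2)
    for solver in (vi, pi, q):
        solver.run()
    vi_pi_diff = sum(a != b for a, b in zip(vi.policy, pi.policy))
    vi_q_diff = sum(a != b for a, b in zip(vi.policy, q.policy))
    return vi, pi, q, vi_pi_diff, vi_q_diff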
if algo == 'pi':
    solver = mdp.PolicyIteration(T, R, 0.9, max_iter=5000)
elif algo == 'vi':
    solver = mdp.ValueIteration(T, R, 0.9, epsilon=1e-6, max_iter=5000,
                                initial_value=0)
elif algo == 'q':
    solver = mdp.QLearning(T, R, 0.99, alpha=1.0, alpha_decay=0.9999993,
                           alpha_min=0.1, epsilon=1.0, epsilon_min=0.2,
                           epsilon_decay=0.999999, n_iter=6e6,
                           run_stat_frequency=1e4)
solver.setVerbose()
start = time.time()
solver.run()
end = time.time()
if problem == 'forest':
    print(solver.policy)
elif problem == 'frozen':
def runQAlphaByIts(transitions, rewards, envName, itRange): alphas = np.linspace(.9, .01, 10) alphas = [.01, .05, .10, .20, .25] discounts = [.1, .5, .9] allStats = [] for discount in discounts: stats = [] for i in alphas: print() print('{0} Alpha: {1} Discount:{2}'.format(envName, i, discount)) for itt in itRange: alg = mdp.QLearning(transitions, rewards, discount, alpha=i, n_iter=itt) result = alg.run() runStats = alg.run_stats[-1] stat = [ i, discount, alg.time, runStats['Iteration'], runStats['Error'], np.mean(alg.V), runStats['Reward'] ] printStat(stat) stats.append(stat) save( '{0} {1} epsilon discount{2} policy'.format( envName, i, discount), alg.policy) statsArr = np.array(stats) save('{0} {1} alpha stats'.format(envName, discount), statsArr) roundedAlphas = [round(x, 3) for x in alphas] title = '{0} Q Learning - Time by Alpha Discount {1}'.format( envName, discount) fig, ax = plt.subplots() for i in alphas: iStats = statsArr[statsArr[:, 0] == i] times = iStats[:, 2] plotQAx(ax, itRange, times, title, 'Iterations', 'Time', 'alpha {0:0.3f}'.format(i)) show(title, fig, ax) title = '{0} Q Learning - Error by Alpha Discount {1}'.format( envName, discount) fig, ax = plt.subplots() for i in alphas: iStats = statsArr[statsArr[:, 0] == i] errors = iStats[:, 4] plotQAx(ax, itRange, errors, title, 'Iterations', 'Error', 'alpha {0:0.3f}'.format(i)) show(title, fig, ax) title = '{0} Q Learning - Reward by Alpha Discount {1}'.format( envName, discount) fig, ax = plt.subplots() for i in alphas: iStats = statsArr[statsArr[:, 0] == i] rewardsArr = iStats[:, 5] plotQAx(ax, itRange, rewardsArr, title, 'Iterations', 'Reward', 'alpha {0:0.3f}'.format(i)) show(title, fig, ax) allStats.extend(stats) allStats = sorted(allStats, key=lambda x: x[5], reverse=True) topPolicy = stats[0] topPolicy = topPolicy[-1] return topPolicy
def run_forest(size): seed_val = 42 np.random.seed(seed_val) random.seed(seed_val) S = size r1 = 10 # The reward when the forest is in its oldest state and action ‘Wait’ is performed r2 = 50 # The reward when the forest is in its oldest state and action ‘Cut’ is performed p = 0.1 P, R = mdptoolbox.example.forest(S=S, r1=r1, r2=r2, p=p) # Defaults left the same epsilons = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001] epsilons = [0.00001, 0.000001] gammas = [0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 0.999, 0.9999, 0.99999] learning_rates = [ 0.001, 0.01, 0.00001, 0.0001, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 0.9, 1.0 ] lr_decays = [ 1.0, 0.99, 0.9999, 0.999, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1 ] lr_mins = [0.00001, 0.0001, 0.001, 0.01, 0] epsilons = [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1] epsilon_decays = [0.99, 0.9999, 0.99999, 0.999999, 0.999, 0.9, 0.8, 0.7] epsilon_mins = [0.00001, 0.0001, 0.001, 0.01, 0.1, 0] best_lr, best_e, best_g, best_ed, best_em, best_rew = 0, 0, 0, 0, 0, -1 '''for em in epsilon_mins: for am in lr_mins: for ad in lr_decays: for e in epsilons: for g in gammas: for a in learning_rates: for ed in epsilon_decays: pi = mdp.QLearning(P, R, gamma=g, epsilon=e, epsilon_decay=ed, epsilon_min=em, n_iter=10000, alpha=a, alpha_min=am, alpha_decay=ad) pi.run() rew = run_episodes(pi.policy, S, R, p, 1000, 100) print(rew, '-', e, ed, em, a, ad, am, g)''' # g e ed em a ad am rew''' tests = [[0.1, 0.000001, 0.99, 0.0001, 0.6, 0.5, 0.001]] # g e ed em a ad am rew # 0.1 1.00E-06 0.99 0.0001 0.6 0.5 0.001 4032 # 0.1 1.00E-06 0.99 1.00E-05 0.001 0.5 0.01 429.2 if size < 100: tests = [[0.1, 1.0, 0.7, 0.00001, 0.0001, 1.0, 0.00001]] else: tests = [[0.6, 1.0, 0.999999, 0.00001, 0.8, 1.0, 0.01]] if 1 == 1: # print(e, ed, em, a, ad, am, g, rew, ) best_pol_arr = [] print(size) for t in tests: for e in epsilons: Q_qlearning = mdp.QLearning(P, R, gamma=t[0], epsilon=e, epsilon_decay=t[2], epsilon_min=t[3], n_iter=10000, alpha=t[4], alpha_min=t[5], alpha_decay=t[6]) Q_qlearning.run() best_pol_arr.append(list(Q_qlearning.policy)) #print(run_episodes(Q_qlearning.policy, S, R, p, 100000, 100)) # Plot out optimal policy # Citation: https://stackoverflow.com/questions/52566969/python-mapping-a-2d-array-to-a-grid-with-pyplot print(epsilons) cmap = colors.ListedColormap(['blue', 'red']) fig, ax = plt.subplots(figsize=(12, 3.5)) plt.title("Forest Q-Learning Policy - Red = Cut, Blue = Wait") epsilons.reverse() plt.xticks(fontsize=15) plt.xlabel('State', fontsize=15) plt.ylabel('Epsilon', fontsize=15) plt.pcolor(best_pol_arr[::-1], cmap=cmap, edgecolors='k', linewidths=0) ax.set_yticklabels(epsilons, fontsize=15) ax.tick_params(left=False) # remove the ticks plt.savefig('Images\\QL-Forest-Policy-' + str(size) + '.png') plt.show() mean_val = [i["Mean V"] for i in Q_qlearning.run_stats] error = [i["Error"] for i in Q_qlearning.run_stats] reward = [i["Reward"] for i in Q_qlearning.run_stats] # Plot Delta vs iterations fig, ax1 = plt.subplots() color = 'tab:blue' ax1.set_ylabel('Reward/Error', color=color) ax1.semilogy(error, color=color, label='Error') ax1.semilogy(reward, color='darkblue', label='Reward') ax1.legend() ax2 = ax1.twinx( ) # instantiate a second axes that shares the same x-axis color = 'tab:red' ax2.set_xlabel('Iterations') ax2.set_ylabel('Mean V', color=color) ax2.semilogy(mean_val, color=color) ax2.tick_params(axis='y', labelcolor=color) ax2.tick_params(axis='y', labelcolor=color) plt.title('V/Reward/Error vs. 
Iterations') plt.savefig('Images\\QL-Forest-RunStats' + str(size) + '.png') plt.show() best_rew = 0 for em in epsilon_mins: for am in lr_mins: for ad in lr_decays: for e in epsilons: for g in gammas: for a in learning_rates: for ed in epsilon_decays: Q_qlearning = mdp.QLearning( P, R, gamma=g, epsilon=e, epsilon_decay=ed, epsilon_min=em, n_iter=10000, alpha=a, alpha_min=am, alpha_decay=ad) Q_qlearning.run() rew = run_episodes(Q_qlearning.policy, S, R, p, 1000, 200) if rew > best_rew: best_rew = rew print( e, ed, em, a, ad, am, g, rew, ) for t in tests: num_seeds = 10 for g in gammas: tot_rew = 0 cnt = 0 for x in range(num_seeds): cnt += 1 seed_val = x np.random.seed(seed_val) random.seed(seed_val) Q_qlearning = mdp.QLearning(P, R, gamma=g, epsilon=t[1], epsilon_decay=t[2], epsilon_min=t[3], n_iter=10000, alpha=t[4], alpha_min=t[5], alpha_decay=t[6]) Q_qlearning.run() rew = run_episodes(Q_qlearning.policy, S, R, p, 1000, 100) tot_rew += rew print('g', x, g, rew, Q_qlearning.run_stats[-1]['Mean V']) for em in epsilon_mins: tot_rew = 0 cnt = 0 for x in range(num_seeds): cnt += 1 seed_val = x np.random.seed(seed_val) random.seed(seed_val) Q_qlearning = mdp.QLearning(P, R, gamma=t[0], epsilon=t[1], epsilon_decay=t[2], epsilon_min=em, n_iter=10000, alpha=t[4], alpha_min=t[5], alpha_decay=t[6]) Q_qlearning.run() rew = run_episodes(Q_qlearning.policy, S, R, p, 1000, 100) tot_rew += rew print('em', x, em, rew, Q_qlearning.run_stats[-1]['Mean V']) for e in epsilons: tot_rew = 0 cnt = 0 for x in range(num_seeds): cnt += 1 seed_val = x np.random.seed(seed_val) random.seed(seed_val) Q_qlearning = mdp.QLearning(P, R, gamma=t[0], epsilon=e, epsilon_decay=t[2], epsilon_min=t[3], n_iter=10000, alpha=t[4], alpha_min=t[5], alpha_decay=t[6]) Q_qlearning.run() #print(Q_qlearning.policy) rew = run_episodes(Q_qlearning.policy, S, R, p, 1000, 100) tot_rew += rew print('e', x, e, rew, Q_qlearning.run_stats[-1]['Mean V']) for lr in learning_rates: tot_rew = 0 cnt = 0 for x in range(num_seeds): cnt += 1 seed_val = x np.random.seed(seed_val) random.seed(seed_val) Q_qlearning = mdp.QLearning(P, R, gamma=t[0], epsilon=t[1], epsilon_decay=t[2], epsilon_min=t[3], n_iter=10000, alpha=lr, alpha_min=t[5], alpha_decay=t[6]) Q_qlearning.run() rew = run_episodes(Q_qlearning.policy, S, R, p, 1000, 100) tot_rew += rew print('lr', x, lr, rew, Q_qlearning.run_stats[-1]['Mean V']) for ld in lr_decays: tot_rew = 0 cnt = 0 for x in range(num_seeds): cnt += 1 seed_val = x np.random.seed(seed_val) random.seed(seed_val) Q_qlearning = mdp.QLearning(P, R, gamma=t[0], epsilon=t[1], epsilon_decay=t[2], epsilon_min=t[3], n_iter=10000, alpha=t[4], alpha_min=t[5], alpha_decay=ld) Q_qlearning.run() rew = run_episodes(Q_qlearning.policy, S, R, p, 1000, 100) tot_rew += rew print('ld', x, ld, rew, Q_qlearning.run_stats[-1]['Mean V']) for lm in lr_mins: tot_rew = 0 cnt = 0 for x in range(num_seeds): cnt += 1 seed_val = x np.random.seed(seed_val) random.seed(seed_val) Q_qlearning = mdp.QLearning(P, R, gamma=t[0], epsilon=t[1], epsilon_decay=t[2], epsilon_min=t[3], n_iter=10000, alpha=t[4], alpha_min=lm, alpha_decay=t[6]) Q_qlearning.run() rew = run_episodes(Q_qlearning.policy, S, R, p, 1000, 100) tot_rew += rew print('lm', x, lm, rew, Q_qlearning.run_stats[-1]['Mean V']) for e in epsilons: tot_rew = 0 cnt = 0 for x in range(num_seeds): cnt += 1 seed_val = x np.random.seed(seed_val) random.seed(seed_val) Q_qlearning = mdp.QLearning(P, R, gamma=t[0], epsilon=e, epsilon_decay=t[2], epsilon_min=t[3], n_iter=10000, alpha=t[4], alpha_min=t[5], 
alpha_decay=t[6]) Q_qlearning.run() rew = run_episodes(Q_qlearning.policy, S, R, p, 1000, 100) tot_rew += rew print('e', x, e, rew, Q_qlearning.run_stats[-1]['Mean V']) for ed in epsilon_decays: tot_rew = 0 cnt = 0 for x in range(num_seeds): cnt += 1 seed_val = x np.random.seed(seed_val) random.seed(seed_val) Q_qlearning = mdp.QLearning(P, R, gamma=t[0], epsilon=t[1], epsilon_decay=ed, epsilon_min=t[3], n_iter=10000, alpha=t[4], alpha_min=t[5], alpha_decay=t[6]) Q_qlearning.run() rew = run_episodes(Q_qlearning.policy, S, R, p, 1000, 100) tot_rew += rew print('ed', x, ed, rew, Q_qlearning.run_stats[-1]['Mean V']) for em in epsilon_mins: tot_rew = 0 cnt = 0 for x in range(num_seeds): cnt += 1 seed_val = x np.random.seed(seed_val) random.seed(seed_val) Q_qlearning = mdp.QLearning(P, R, gamma=t[0], epsilon=t[1], epsilon_decay=t[2], epsilon_min=em, n_iter=10000, alpha=t[4], alpha_min=t[5], alpha_decay=t[6]) Q_qlearning.run() rew = run_episodes(Q_qlearning.policy, S, R, p, 1000, 100) tot_rew += rew print('em', x, em, rew, Q_qlearning.run_stats[-1]['Mean V'])
def run(verbose=False):
    # MDP Forest Problem
    # transitions, reward = example.forest()
    nS = 1000
    # transitions, reward = example.forest(S=nS, r1=250, r2=120, p=0.01, is_sparse=False)
    transitions, reward = example.forest(S=nS, r1=1045, r2=1025, p=0.01,
                                         is_sparse=False)
    # print(transitions)
    # print(reward)
    # return

    print('~~~~~~~~~~ Forest - Policy Iteration ~~~~~~~~~~')
    pi = mdp.PolicyIteration(transitions, reward, 0.75, max_iter=10000)
    if verbose:
        pi.setVerbose()
    pi.run()
    util.print_debugs(pi)
    # print(pi.run_stats)
    # return

    print('~~~~~~~~~~ Forest - Value Iteration ~~~~~~~~~~')
    vi = mdp.ValueIteration(transitions, reward, 0.75, max_iter=100000)
    if verbose:
        vi.setVerbose()
    vi.run()
    util.print_debugs(vi)
    if vi.policy == pi.policy:
        print('Forest - Value and Policy Iteration policies are the same!')
    else:
        print('Forest - Value and Policy Iteration policies are NOT the same.')

    print('~~~~~~~~~~ Forest - Q-Learning ~~~~~~~~~~')
    # QLearning signature for reference:
    # transitions, reward, gamma,
    # alpha=0.1, alpha_decay=0.99, alpha_min=0.001,
    # epsilon=1.0, epsilon_min=0.1, epsilon_decay=0.99,
    # n_iter=10000, skip_check=False, iter_callback=None,
    # run_stat_frequency=None
    ql = mdp.QLearning(transitions, reward, 0.75, alpha=0.3, epsilon_min=0.005,
                       n_iter=500000)
    if verbose:
        ql.setVerbose()
    start_t = time.process_time()
    ql.run()
    end_t = time.process_time()

    # Output
    print('~~~~~~~~~~ Forest - Policy Iteration ~~~~~~~~~~')
    util.print_debugs(pi)
    print('~~~~~~~~~~ Forest - Value Iteration ~~~~~~~~~~')
    util.print_debugs(vi)
    print('~~~~~~~~~~ Forest - Q-Learning ~~~~~~~~~~')
    print(ql.policy)
    # q_counter was never defined in this snippet; report the iteration count
    # recorded in the run statistics instead
    print('Q-Learning # of Iterations: %i' % ql.run_stats[-1]['Iteration'])
    print('Clock time')
    print(end_t - start_t)
    if vi.policy == pi.policy:
        print('Forest - Value and Policy Iteration policies are the same!')
    else:
        print('Forest - Value and Policy Iteration policies are NOT the same.')
    if vi.policy == ql.policy:
        print('Forest – QL and VI Policies are the same!')
    else:
        print('Forest – QL and VI Policies are NOT the same.')
    if pi.policy == ql.policy:
        print('Forest – PI and QL Policies are the same!')
    else:
        print('Forest – PI and QL Policies are NOT the same.')

# A Q-Learning Algorithm
#
# Source:
# https://www.oreilly.com/radar/introduction-to-reinforcement-learning-and-openai-gym/
"""
tune_ql = False
if tune_ql:
    # max iter
    if False:
        iter_range = [
            10**4, 5 * (10**4), 10**5, 5 * (10**5), 10**6, 5 * (10**6)
        ]
        ql_time = []
        ql_max_v = []
        for iter in iter_range:
            ql = mdp.QLearning(transitions, rewards, gamma=0.99, epsilon=1.0,
                               n_iter=iter)
            ql.run()
            ql_time.append(ql.time)
            ql_max_v.append(np.max(ql.V))
        plt.figure()
        plt.plot(iter_range, ql_time, label="QL")
        plt.xlabel('iterations')
        plt.ylabel('time')
        plt.title('iteration vs time')
        plt.legend()
        plt.savefig("charts/lake_ql_iter_time")
        plt.figure()
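        # The excerpt above stops right after opening a new figure; ql_max_v is
        # collected but never plotted. A companion chart under the same variable
        # names (a sketch only; the output filename is an assumption following
        # the naming pattern above):
        plt.plot(iter_range, ql_max_v, label="QL")
        plt.xlabel('iterations')
        plt.ylabel('max V')
        plt.title('iteration vs max V')
        plt.legend()
        plt.savefig("charts/lake_ql_iter_max_v")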
def run_qlearn(envs, gamma=0.96, n_iters=10000, verbose=True): all_rewards = [] all_mean_discrepancies_dfs = [] all_error_dfs = [] time_per_run = [] num_episodes = len(envs) for env, episode in zip(envs, range(num_episodes)): P, R = env fm_qlearn = mdp.QLearning( transitions=P, reward=R, gamma=gamma, n_iter=n_iters, ) # if verbose: fm_qlearn.setVerbose() t0 = time() fm_qlearn.run() time_elapsed = time() - t0 time_per_run.append(time_elapsed) if verbose: print("Forest Management QLearning Episode", episode, "runtime (s):", time_elapsed) # add mean discrepancies for each episode v_means = [] for v_mean in fm_qlearn.v_mean: v_means.append(np.mean(v_mean)) v_mean_df = pd.DataFrame(v_means, columns=['v_mean']) # v_mean_df.iloc[0: n_iters / 100, :] = v_means all_mean_discrepancies_dfs.append(v_mean_df) if verbose: print("Forest Management QLearning Episode", episode, "mean discrepancy:", '\n', v_mean_df, '\n') error_over_iters = fm_qlearn.error_over_iters # print(error_over_iters) error_plot_df = pd.DataFrame(0, index=np.arange(1, n_iters + 1), columns=['error']) error_plot_df.iloc[0:len(error_over_iters), :] = error_over_iters all_error_dfs.append(error_plot_df) print_policy(fm_qlearn.policy) # rewards = calc_reward(fm_qlearn.policy, R) # total_reward = np.sum(rewards) # all_rewards.append(total_reward) # if verbose: print("Forest Management QLearning Episode", episode, "reward:", total_reward, '\n') # filename = "tmp/fm_qlearn_stats.csv" # rewards_df = pd.DataFrame(all_rewards) # rewards_df.to_csv(filename) combined_error_df = pd.concat(all_error_dfs, axis=1) mean_error_per_iter = combined_error_df.mean(axis=1) mean_error_per_iter.to_csv("tmp/fm_qlearn_error.csv") # plot the error over iterations title = "FM QL: error vs. iter (mean over " + str( num_episodes) + " episodes)" path = "graphs/fm_ql_error_iter.png" plotting.plot_error_over_iters(mean_error_per_iter, title, path) # show avg time per run avg_time_per_run = np.mean(np.array(time_per_run)) print("FM QL - avg seconds per run:", avg_time_per_run, '\n')
# avg V, n_iter, time
alpha_vals = [.1, .3, .5, .7, .9]
epsilon_vals = [.2, .4, .6, .8]  # fixed typo: was "epslion_vals"
big_vs = []
big_n = []
big_t = []
for epsilon in epsilon_vals:
    avg_vs = []
    n_iters = []
    times = []
    for alpha in alpha_vals:
        q = mdp.QLearning(P_small, R_small, gamma=.9999, alpha=alpha,
                          alpha_decay=1, epsilon=epsilon, epsilon_decay=.99)
        stats = q.run()
        avg_v = stats[-1]['Mean V']
        n_iter = len(stats)
        time = stats[-1]['Time']
        avg_vs.append(avg_v)
        n_iters.append(n_iter)
        times.append(time)
    big_vs.append(avg_vs)
    big_n.append(n_iters)
    big_t.append(times)  # big_t was declared but never filled in the original
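# One way to visualise the alpha/epsilon grids collected above (a sketch only;
# assumes numpy and matplotlib.pyplot are imported as np/plt, as elsewhere here).
def plot_param_grid(grid_rows, alpha_vals, epsilon_vals, title):
    grid = np.array(grid_rows)  # rows: epsilon values, columns: alpha values
    fig, ax = plt.subplots()
    im = ax.imshow(grid, aspect='auto')
    ax.set_xticks(range(len(alpha_vals)))
    ax.set_xticklabels([str(a) for a in alpha_vals])
    ax.set_yticks(range(len(epsilon_vals)))
    ax.set_yticklabels([str(e) for e in epsilon_vals])
    ax.set_xlabel('alpha')
    ax.set_ylabel('epsilon')
    ax.set_title(title)
    fig.colorbar(im, ax=ax)
    return fig

# Example: plot_param_grid(big_vs, alpha_vals, epsilon_vals, 'Mean V by alpha/epsilon')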
def frozen_lake_all(P, R, gamma_range, mapping, shape): vi_iteration_list = np.zeros(gamma_range.shape) vi_time_list = np.zeros(gamma_range.shape) vi_reward_list = np.zeros(gamma_range.shape) vi_error_list = np.zeros(gamma_range.shape) pi_iteration_list = np.zeros(gamma_range.shape) pi_time_list = np.zeros(gamma_range.shape) pi_reward_list = np.zeros(gamma_range.shape) pi_error_list = np.zeros(gamma_range.shape) diff_list = np.zeros(gamma_range.shape) expected_policy = None for i, gamma in enumerate(gamma_range): print('Gamma %0.2f' % gamma) vi = mdp.ValueIteration(transitions=P, reward=R, gamma=gamma, epsilon=0.0001, max_iter=5000) # vi.setVerbose() vi.run() vi_iteration_list[i] = vi.run_stats[-1:][0]['Iteration'] vi_time_list[i] = vi.run_stats[-1:][0]['Time'] vi_reward_list[i] = vi.run_stats[-1:][0]['Reward'] vi_error_list[i] = vi.run_stats[-1:][0]['Error'] pi = mdp.PolicyIteration(transitions=P, reward=R, gamma=gamma, max_iter=5000, eval_type=1) # pi.setVerbose() pi.run() pi_iteration_list[i] = pi.run_stats[-1:][0]['Iteration'] pi_time_list[i] = pi.run_stats[-1:][0]['Time'] pi_reward_list[i] = pi.run_stats[-1:][0]['Reward'] pi_error_list[i] = pi.run_stats[-1:][0]['Error'] print('Value Iteration Policy Found: ' + str(vi.policy)) print_policy(vi.policy, mapping, shape) print('Policy Iteration Policy Found: ' + str(pi.policy)) print_policy(pi.policy, mapping, shape) difference1 = sum([abs(x - y) for x, y in zip(pi.policy, vi.policy)]) diff_list[i] = difference1 print('Discrepancy in Policy and Value Iteration: ', difference1) if difference1 == 0: expected_policy = vi.policy print() # Plotting # Error v Iteration plt.clf() plt.title('Value Iteration: Error v Iterations') plt.xlabel('Iterations') plt.ylabel('Error') plt.plot(list(vi_iteration_list), list(vi_error_list)) plt.tight_layout() plt.savefig('plots/frozen_lakes/vi_error_v_iteration.png') # Reward v Gamma plt.clf() plt.title('Value Iteration: Reward v Gamma') plt.xlabel('Gamma') plt.ylabel('Reward') plt.plot(list(gamma_range), list(vi_reward_list)) plt.tight_layout() plt.savefig('plots/frozen_lakes/vi_reward_v_gamma.png') # Gamma v Iterations plt.clf() plt.title('Value Iteration: Gamma v Iterations') plt.xlabel('Iterations') plt.ylabel('Gamma') plt.plot(list(vi_iteration_list), list(gamma_range)) plt.tight_layout() plt.savefig('plots/frozen_lakes/vi_gamma_v_iterations.png') # Gamma v Time plt.clf() plt.title('Value Iteration: Gamma v Time') plt.xlabel('Time') plt.ylabel('Gamma') plt.plot(list(vi_time_list), list(gamma_range)) plt.tight_layout() plt.savefig('plots/frozen_lakes/vi_gamma_v_time.png') # Reward vs Iterations plt.clf() plt.title('Value Iteration: Reward v Iterations') plt.xlabel('Iterations') plt.ylabel('Reward') plt.plot(list(vi_iteration_list), list(vi_reward_list)) plt.tight_layout() plt.savefig('plots/frozen_lakes/vi_reward_v_iterations.png') # Policy # Error v Iteration plt.clf() plt.title('Policy Iteration: Error v Iterations') plt.xlabel('Iterations') plt.ylabel('Error') plt.scatter(list(pi_iteration_list), list(pi_error_list)) plt.tight_layout() plt.savefig('plots/frozen_lakes/pi_error_v_iteration.png') # Gamma v Reward plt.clf() plt.title('Policy Iteration: Reward v Gamma') plt.xlabel('Gamma') plt.ylabel('Reward') plt.scatter(list(gamma_range), list(pi_reward_list)) plt.tight_layout() plt.savefig('plots/frozen_lakes/pi_reward_v_gamma.png') # Gamma v Iterations plt.clf() plt.title('Policy Iteration: Gamma v Iterations') plt.xlabel('Iterations') plt.ylabel('Gamma') plt.scatter(list(pi_iteration_list), 
list(gamma_range)) plt.tight_layout() plt.savefig('plots/frozen_lakes/pi_gamma_v_iterations.png') # Gamma v Time plt.clf() plt.title('Policy Iteration: Gamma v Time') plt.xlabel('Time') plt.ylabel('Gamma') plt.scatter(list(pi_time_list), list(gamma_range)) plt.tight_layout() plt.savefig('plots/frozen_lakes/pi_gamma_v_time.png') # Reward vs Iterations plt.clf() plt.title('Policy Iteration: Reward v Iterations') plt.xlabel('Iterations') plt.ylabel('Reward') plt.scatter(list(pi_iteration_list), list(pi_reward_list)) plt.tight_layout() plt.savefig('plots/frozen_lakes/pi_reward_v_iterations.png') # Gamma vs Policy Differences plt.clf() plt.title('Gamma v Policy Differences') plt.xlabel('Gamma') plt.ylabel('Policy Differences') plt.scatter(list(gamma_range), list(diff_list)) plt.tight_layout() plt.savefig('plots/frozen_lakes/gamma_v_differences.png') # TODO gamma_range = np.array([0.8, 0.9, 0.99]) alpha_range = np.array([0.1, 0.9, 0.99]) epsilon_range = np.array([0.1, 0.5, 0.9, 0.999]) e_decay_range = np.array([0.1, 0.5, 0.9, 0.999]) # alpha_range = np.append(np.linspace(0.01, 0.1, 9), np.linspace(0.2, 0.99, 4)) # epsilon_range = np.linspace(0.1, 1.0, 10) # e_decay_range = np.append(np.linspace(0.1, 0.9, 4), np.linspace(0.91, 0.99, 9)) prev_Q = None thresh = 1e-4 print('== Q Learning ==') for i, gamma in enumerate(gamma_range): for j, alpha in enumerate(alpha_range): for k, ep in enumerate(epsilon_range): for l, ed in enumerate(e_decay_range): # print('ql: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}'.format(gamma, alpha, ep, ed)) ql = mdp.QLearning(transitions=P, reward=R, gamma=gamma, alpha=alpha, alpha_decay=1.0, alpha_min=0.001, epsilon=ep, epsilon_min=0.1, epsilon_decay=ed, n_iter=10e4) stats = ql.run() plot_stats(stats, ('ql_frozen_lake_%0.2f_%0.2f_%0.2f_%0.2f' % (gamma, alpha, ep, ed))) # print('Policy: ') # print(ql.policy) # print(ql.run_stats) df = pd.DataFrame.from_records(ql.run_stats) iteration_list = df['Iteration'][-100:] windowed_reward = df['Reward'][-100:].mean() error_list = df['Error'][-100:].mean() if prev_Q is None: prev_Q = ql.Q else: variation = np.absolute( np.subtract(np.asarray(ql.Q), np.asarray(prev_Q))).max() res = np.abs( np.subtract(np.asarray(prev_Q), np.asarray(ql.Q))) print('Result: ') print(res) print('Variation: ') print(variation) print('Mean Reward for Last 100 Iterations:') print(windowed_reward) if np.all( res < thresh ) or variation < thresh or windowed_reward > 45.0: print('Breaking! Below Thresh') print( 'Found at: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}' .format(gamma, alpha, ep, ed)) print('Optimal Policy: ') print(ql.policy) break