def pi_experiment_iter_linear(P, R):
    """Compare iterative (eval_type=1) and matrix (eval_type=0) policy evaluation in PolicyIteration."""
    pi_iterative = mdp.PolicyIteration(P, R, 0.6, eval_type=1).run()
    pi_matrix = mdp.PolicyIteration(P, R, 0.6, eval_type=0).run()
    print('Iterative evaluation stats: ', pi_iterative[-1])
    print('Matrix (linear-algebra) evaluation stats: ', pi_matrix[-1])
    return pi_iterative, pi_matrix
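# Minimal usage sketch for pi_experiment_iter_linear, assuming hiive.mdptoolbox
# (plain mdptoolbox.example works the same way); forest parameters are illustrative only.
from hiive.mdptoolbox import example

P, R = example.forest(S=10, r1=4, r2=2, p=0.1)  # toy forest-management MDP
iter_stats, matrix_stats = pi_experiment_iter_linear(P, R)
# Each run_stats entry is a dict with keys such as 'Iteration', 'Time',
# 'Reward', 'Error', 'Mean V', and 'Max V'.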
def policy_iteration(self, discount=0.999, save_policy=False, save_plot=False):
    pi = mdp.PolicyIteration(transitions=self.prob, reward=self.rewards, gamma=discount)
    run_stats = pi.run()
    self.plot(run_stats, 'Fire Management - Policy Iteration')
    expected_values = pi.V
    optimal_policy = pi.policy
    iterations = pi.iter
    time = pi.time
    return [expected_values, optimal_policy, iterations, time]
def pi_experiment_gamma(P, R):
    """Measure the effect of gamma on the number of iterations and on rewards.

    :param P: transition probabilities (A x S x S)
    :param R: rewards
    :return: (list of run_stats, gammas used)
    """
    gammas = np.linspace(0.05, 0.95, 10)
    pi_stats = []
    for gamma in gammas:
        pi = mdp.PolicyIteration(P, R, gamma)
        pi_stat = pi.run()
        pi_stats.append(pi_stat)
    return pi_stats, gammas
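# Hedged plotting sketch for the gamma sweep above, assuming matplotlib and
# hiive-style run_stats dictionaries; the figure layout is illustrative only.
import matplotlib.pyplot as plt

def plot_pi_gamma_sweep(pi_stats, gammas):
    iterations = [stats[-1]['Iteration'] for stats in pi_stats]  # final iteration count per gamma
    mean_vs = [stats[-1]['Mean V'] for stats in pi_stats]        # final mean value per gamma
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.plot(gammas, iterations, marker='o')
    ax1.set_xlabel('Gamma')
    ax1.set_ylabel('Iterations to converge')
    ax2.plot(gammas, mean_vs, marker='o')
    ax2.set_xlabel('Gamma')
    ax2.set_ylabel('Mean V at convergence')
    fig.tight_layout()
    return fig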
def run_PI_experiments(problem, name, load=True):
    data = []
    if load:
        with open(os.path.join(out, 'PI_results_' + name + '.pkl'), 'rb') as f:
            df = pickle.load(f)
        return df
    print("===========\nPolicy Iteration\n==========")
    for gamma in config['discount']:
        for param in problem.params:
            P, R = problem.p(**param)
            mdp_util.check(P, R)
            for eval_type in ['matrix']:
                pi = MDP.PolicyIteration(P, R, gamma, max_iter=1000, eval_type=eval_type, skip_check=False)
                run_stats = pi.run()
                data.append([
                    problem.name, pi.S, pi.A, pi.gamma,
                    [i['Time'] for i in run_stats],
                    pi.iter, pi.max_iter, pi.eval_type,
                    [i['Mean V'] for i in run_stats],
                    np.std(pi.V),
                    [i['Max V'] for i in run_stats],
                    [i['Reward'] for i in run_stats],
                    [i['Error'] for i in run_stats],
                    pi.policy
                ])
                print(problem.name, pi.S, pi.A, gamma, eval_type)
    # Column names follow the order values are appended above.
    df = pd.DataFrame(data, columns=[
        'name', '#states', '#actions', 'discount', 'time', 'iter', 'max_iter',
        'eval_type', 'mean_V', 'std_V', 'max_V', 'reward', 'error_mean', 'policy'
    ])
    with open(os.path.join(out, 'PI_results_' + name + '.pkl'), 'wb') as f:
        pickle.dump(df, f)
    return df
def runPolicy(transitions, rewards, envName, maxIters, discountRange):
    stats = []
    for discount in discountRange:
        alg = mdp.PolicyIteration(transitions, rewards, discount, max_iter=maxIters)
        result = alg.run()
        runStats = alg.run_stats[-1]
        stats.append([
            discount, alg.time, runStats['Iteration'], runStats['Error'],
            runStats['Reward'], np.mean(alg.V), alg.policy
        ])
    statsT = list(zip(*stats))
    discounts = statsT[0]
    times = statsT[1]
    iterations = statsT[2]
    errors = statsT[3]
    finalRewards = statsT[4]
    meanValues = statsT[5]
    # Keep the policy with the highest mean value.
    stats = sorted(stats, key=lambda x: x[5], reverse=True)
    topPolicy = stats[0][-1]
    title = '{0} Policy Iteration - Time'.format(envName)
    fig, ax = plotAx(discounts, times, title, 'Discount Factor', 'Time')
    show(title, fig, ax)
    title = '{0} Policy Iteration - Iterations'.format(envName)
    fig, ax = plotAx(discounts, iterations, title, 'Discount Factor', 'Iteration')
    show(title, fig, ax)
    title = '{0} Policy Iteration - Error'.format(envName)
    fig, ax = plotAx(discounts, errors, title, 'Discount Factor', 'Error')
    show(title, fig, ax)
    title = '{0} Policy Iteration - Reward'.format(envName)
    fig, ax = plotAx(discounts, finalRewards, title, 'Discount Factor', 'Rewards')
    show(title, fig, ax)
    return topPolicy
def frozen_lake_pi(P, R, gamma_range, mapping, shape):
    print("== Policy Iteration ==")
    print("gamma #Iterations time (ms)")
    prev_policy = []
    prev_gamma = 0
    no_diff_list = []
    standard_policy = []
    for gamma in gamma_range:
        pi = mdp.PolicyIteration(P, R, gamma, max_iter=10000)
        pi.run()
        timestr = "%0.3f" % (pi.time * 1000)
        atab = " \t"
        # Pad the iteration column so the timing column stays aligned.
        if pi.iter <= 99:
            spacing = 4
        else:
            spacing = 3
        gamma_str = "%0.2f" % gamma
        msg = gamma_str + atab + str(pi.iter) + atab * spacing + timestr
        print(msg)
        if gamma == 0.95:
            standard_policy.append((pi.policy, mapping, shape))
        if list(pi.policy) == list(prev_policy):
            no_diff_list.append([prev_gamma, gamma])
        prev_policy = pi.policy
        prev_gamma = gamma
    print()
    print("Policy Iteration Policy at Gamma = 0.95")
    contents = standard_policy.pop()
    print_policy(contents[0], contents[1], contents[2])
    print()
    no_diff_len = len(no_diff_list)
    str_list = ["No Policy Difference Between These Gammas: "] * no_diff_len
    policy_diffs = zip(str_list, no_diff_list)
    for diff in policy_diffs:
        print("%s %0.2f %0.2f" % (diff[0], diff[1][0], diff[1][1]))
    print()
def vi_pi_comp(P, R):
    vi = mdp.ValueIteration(P, R, 0.60, epsilon=0.001).run()
    pi = mdp.PolicyIteration(P, R, 0.60, eval_type=1).run()
    return vi, pi
def run_pi(envs, gamma=0.96, max_iters=1000, verbose=True):
    all_rewards = []
    all_iters = []
    all_error_means = []
    all_error_dfs = []
    time_per_run = []
    num_episodes = len(envs)
    for env, episode in zip(envs, range(num_episodes)):
        P, R = env
        fm_pi = mdp.PolicyIteration(
            transitions=P,
            reward=R,
            gamma=gamma,
        )
        # if verbose: fm_pi.setVerbose()
        t0 = time()
        fm_pi.run()
        time_elapsed = time() - t0
        time_per_run.append(time_elapsed)
        if verbose:
            print("Forest Management PI Episode", episode, "runtime (s):", time_elapsed)
        # add error means for each episode
        error_m = np.sum(fm_pi.error_mean)
        all_error_means.append(error_m)
        if verbose:
            print("Forest Management PI Episode", episode, "error mean:", error_m, '\n')
        # error_over_iters / variation_over_iters come from a locally modified PolicyIteration
        error_over_iters = fm_pi.error_over_iters
        variation_over_iters = fm_pi.variation_over_iters
        # print(error_over_iters)
        error_plot_df = pd.DataFrame(0, index=np.arange(1, max_iters + 1), columns=['error'])
        error_plot_df.iloc[0:len(error_over_iters), :] = error_over_iters
        all_error_dfs.append(error_plot_df)
        print_policy(fm_pi.policy)
        # print(fm_pi.policy, '\n', R, '\n')
        # rewards = calc_reward(fm_pi.policy, R)
        # total_reward = np.sum(rewards)
        # all_rewards.append(total_reward)
        # if verbose: print("Forest Management PI Episode", episode, "reward:", total_reward, '\n')
        all_iters.append(fm_pi.iter)
        if verbose:
            print("Forest Management PI Episode", episode, "last iter:", fm_pi.iter, '\n')
    filename = "fm_pi_stats.csv"
    output_to_csv(filename, all_iters, all_rewards)
    combined_error_df = pd.concat(all_error_dfs, axis=1)
    mean_error_per_iter = combined_error_df.mean(axis=1)
    mean_error_per_iter.to_csv("tmp/fm_pi_error.csv")
    # plot the error over iterations
    title = "FM PI: error vs. iter (mean over " + str(num_episodes) + " episodes)"
    path = "graphs/fm_pi_error_iter.png"
    plotting.plot_error_over_iters(mean_error_per_iter, title, path, xlim=200)
    # show avg time per run
    avg_time_per_run = np.mean(np.array(time_per_run))
    print("FM PI - avg seconds per run:", avg_time_per_run, '\n')
def findBestPolicyForGridWorlds(worlds, grid, starts, goals): qlearningIter = [1000, 10000] worldCntr = 1 for data in worlds: size = len(data) holesCoords = [] for row in range(0, data.shape[0]): for col in range(0, data.shape[1]): if data[row, col] == 1: # Obstacle holesCoords.append((row, col)) if data[row, col] == 2: # El roboto start = (row, col) if data[row, col] == 3: # Goal goal = (row, col) transitions, reward, discount, lake = get_environement( data, size, holesCoords, start, goal) #Policy iteration policy_iteration = mdp.PolicyIteration(transitions, reward, discount, policy0=None, max_iter=1000, eval_type=0) policy_iteration.run() print_as_grid(policy_iteration.policy, lake.lake, size) print(policy_iteration.time) print(policy_iteration.iter) actions = getActions(policy_iteration.policy, start, goal, size) svg = gv.gridworld(n=size, tile2classes=lake.tile2classes, actions=actions, extra_css='goal', start=start, policyList=policy_iteration.policy) svg.saveas("Figures/Grid/PI-Final-Path for World " + str(worldCntr) + ".svg", pretty=True) #Value iteration value_iteration = mdp.ValueIteration(transitions, reward, discount, epsilon=0.001, max_iter=1000, initial_value=0) value_iteration.run() print_as_grid(value_iteration.policy, lake.lake, size) print(value_iteration.time) print(value_iteration.iter) actions = getActions(value_iteration.policy, start, goal, size) svg = gv.gridworld(n=size, tile2classes=lake.tile2classes, actions=actions, extra_css='goal', start=start, policyList=value_iteration.policy) svg.saveas("Figures/Grid/VI-Final-Path for World " + str(worldCntr) + ".svg", pretty=True) #Q-Learning q_learning = QLearner.QLearningEx(transitions, reward, grid=grid[worldCntr - 1], start=starts[worldCntr - 1], goals=goals[worldCntr - 1], n_iter=qlearningIter[worldCntr - 1], n_restarts=1000, alpha=0.2, gamma=0.9, rar=0.1, radr=0.99) q_learning.run() print_as_grid(q_learning.policy, lake.lake, size) #print(q_learning.time) actions = getActions(q_learning.policy, start, goal, size) svg = gv.gridworld(n=size, tile2classes=lake.tile2classes, actions=actions, extra_css='goal', start=start, policyList=q_learning.policy) svg.saveas("Figures/Grid/QL-Final-Path for World " + str(worldCntr) + ".svg", pretty=True) worldCntr += 1
def comparing_mdps(P, R, mapping, shape):
    print("Comparing the Policies")
    vi = mdp.ValueIteration(P, R, 0.9, max_iter=10000)
    vi.run()
    print("VI Value Function: ")
    print(vi.V)
    print("Policy: ")
    print(vi.policy)
    print_policy(vi.policy, mapping, shape)
    print("Iter: ")
    print(vi.iter)
    print("Time: ")
    print(vi.time)
    # print(vi.run_stats)
    print()
    pi = mdp.PolicyIteration(P, R, 0.9, max_iter=100000)
    pi.run()
    print("PI Value Function: ")
    print(pi.V)
    print("Policy: ")
    print(pi.policy)
    print_policy(pi.policy, mapping, shape)
    print("Iter: ")
    print(pi.iter)
    print("Time: ")
    print(pi.time)
    # print(pi.run_stats)
    print()
    pim = mdp.PolicyIterationModified(P, R, 0.9, max_iter=100000, epsilon=0.05)
    pim.run()
    print("Modified PI Value Function: ")
    print(pim.V)
    print("Policy: ")
    print(pim.policy)
    print_policy(pim.policy, mapping, shape)
    print("Iter: ")
    print(pim.iter)
    print("Time: ")
    print(pim.time)
    print()
    ql = mdp.QLearning(
        P, R, 0.9,
        n_iter=100000,
        epsilon=0.1,
        epsilon_decay=0.1,
        epsilon_min=0.1,
    )
    ql.run()
    print("Q-Learning Value Function: ")
    print(ql.V)
    print("Policy: ")
    print(ql.policy)
    print_policy(ql.policy, mapping, shape)
    print("Mean Discrepancy: ")
    print(ql.error_mean)
    # print(ql.v_mean)
    print("Epsilon: ")
    print(ql.epsilon)
    # Pairwise policy discrepancies (sum of absolute action-index differences).
    pairs = [
        ('Policy and Value Iteration', pi.policy, vi.policy),
        ('Policy Modified and Value Iteration', pim.policy, vi.policy),
        ('Policy Modified and Policy Iteration', pim.policy, pi.policy),
        ('Q Learning and Value Iteration', ql.policy, vi.policy),
        ('Q Learning and Policy Iteration', ql.policy, pi.policy),
        ('Q Learning and Policy Iteration Modified', ql.policy, pim.policy),
    ]
    for label, p1, p2 in pairs:
        difference = sum(abs(x - y) for x, y in zip(p1, p2))
        if difference > 0:
            print("Discrepancy in %s: " % label, difference)
            print()
# region PI
# avg V, n_iter, time
ep_vals = [.0001]
gamma_vals = [.2, .5, .8, .95, .999]
big_vs = []
big_n = []
big_t = []
for epsilon in ep_vals:
    avg_vs = []
    n_iters = []
    times = []
    for gamma in gamma_vals:
        pi = mdp.PolicyIteration(P_small, R_small, gamma=gamma)
        stats = pi.run()
        avg_v = stats[-1]['Mean V']
        n_iter = len(stats)
        time = stats[-1]['Time']
        avg_vs.append(avg_v)
        n_iters.append(n_iter)
        times.append(time)
    big_vs.append(avg_vs)
    big_n.append(n_iters)
    big_t.append(times)
plt.plot(gamma_vals, big_vs[0], label="Epsilon=" + str(ep_vals[0]))
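# Hedged continuation of the gamma-sweep plot above: axis labels, legend, and an
# illustrative output filename (assumptions, not part of the original script).
plt.xlabel('Gamma')
plt.ylabel('Mean V at convergence')
plt.title('Policy Iteration: Mean V vs Gamma')
plt.legend()
plt.tight_layout()
plt.savefig('pi_mean_v_vs_gamma.png')  # assumed output path
plt.close()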
############################# if problem == 'frozen': np.random.seed(1) mapsize = 32 map = generate_random_map(size=mapsize, p=0.96) env = gym.make('FrozenLake-v0', desc=map) env._max_episode_steps = 1e6 elif problem == 'forest': env = gym.make('Forest-v0') env._max_episode_steps = 1e3 state = env.reset() R, T = evaluateRT(env) if algo == 'pi': solver = mdp.PolicyIteration(T, R, 0.9, max_iter=5000) elif algo == 'vi': solver = mdp.ValueIteration(T, R, 0.9, epsilon=1e-6, max_iter=5000, initial_value=0) elif algo == 'q': solver = mdp.QLearning(T, R, 0.99, alpha=1.0, alpha_decay=0.9999993, alpha_min=0.1, epsilon=1.0,
def run_forest(): np.random.seed(0) P, R = example.forest(S=5, r1=3, r2=15, p=0.2) print("Transition Array: ") print(P.shape) print(P) # Transition array A x S x S print("Reward Array: ") print(R.shape) print(R) # Reward array S x A # TODO gamma_range = np.array([0.1, 0.9, 0.99]) alpha_range = np.array([0.01, 0.5, 0.99]) epsilon_range = np.array([0.1, 0.5, 0.95]) e_decay_range = np.array([0.1, 0.5, 0.999]) # gamma_range = np.append(np.linspace(0.1, 0.9, 9), np.linspace(0.91, 0.99, 9)) # alpha_range = np.append(np.linspace(0.01, 0.1, 9), np.linspace(0.2, 0.99, 4)) # epsilon_range = np.linspace(0.1, 1.0, 10) # e_decay_range = np.append(np.linspace(0.1, 0.9, 4), np.linspace(0.91, 0.99, 9)) difference_list = np.zeros(gamma_range.shape) value_iteration_list = np.zeros(gamma_range.shape) value_time_list = np.zeros(gamma_range.shape) value_reward_list = np.zeros(gamma_range.shape) value_error_list = np.zeros(gamma_range.shape) policy_iteration_list = np.zeros(gamma_range.shape) policy_time_list = np.zeros(gamma_range.shape) policy_reward_list = np.zeros(gamma_range.shape) policy_error_list = np.zeros(gamma_range.shape) for i, gamma in enumerate(gamma_range): print('Gamma %0.2f' % gamma) vi = mdp.ValueIteration(transitions=P, reward=R, gamma=gamma, epsilon=0.0001, max_iter=10000) vi.setVerbose() vi.run() vi_stats = vi.run_stats value_iteration_list[i] = vi_stats[-1:][0]['Iteration'] value_time_list[i] = vi_stats[-1:][0]['Time'] value_reward_list[i] = vi_stats[-1:][0]['Reward'] value_error_list[i] = vi_stats[-1:][0]['Error'] plot_stats(vi_stats, ('vi_forest_%0.2f' % gamma)) pi = mdp.PolicyIteration(transitions=P, reward=R, gamma=gamma, max_iter=10000, eval_type=1) pi.setVerbose() pi.run() stats = pi.run_stats policy_iteration_list[i] = stats[-1:][0]['Iteration'] policy_time_list[i] = stats[-1:][0]['Time'] policy_reward_list[i] = stats[-1:][0]['Reward'] policy_error_list[i] = stats[-1:][0]['Error'] plot_stats(stats, ('pi_forest_%0.2f' % gamma)) print('Policies Found') print('Value Iteration: ' + str(vi.policy)) print('Policy Iteration: ' + str(pi.policy)) difference1 = sum([abs(x - y) for x, y in zip(pi.policy, vi.policy)]) difference_list[i] = difference1 print("Discrepancy in Policy and Value Iteration: ", difference1) print() # Plotting # Error v Iteration plt.clf() plt.title('Value Iteration: Error v Iterations') plt.xlabel('Iterations') plt.ylabel('Error') plt.plot(list(value_iteration_list), list(value_error_list)) plt.tight_layout() plt.savefig('plots/forest_experiment/vi_error_v_iteration.png') # Reward v Gamma plt.clf() plt.title('Value Iteration: Reward v Gamma') plt.xlabel('Gamma') plt.ylabel('Reward') plt.plot(list(gamma_range), list(value_reward_list)) plt.tight_layout() plt.savefig('plots/forest_experiment/vi_reward_v_gamma.png') # Gamma v Iterations plt.clf() plt.title('Value Iteration: Gamma v Iterations') plt.xlabel('Iterations') plt.ylabel('Gamma') plt.plot(list(value_iteration_list), list(gamma_range)) plt.tight_layout() plt.savefig('plots/forest_experiment/vi_gamma_v_iterations.png') # Gamma v Time plt.clf() plt.title('Value Iteration: Gamma v Time') plt.xlabel('Time') plt.ylabel('Gamma') plt.plot(list(value_time_list), list(gamma_range)) plt.tight_layout() plt.savefig('plots/forest_experiment/vi_gamma_v_time.png') # Reward vs Iterations plt.clf() plt.title('Value Iteration: Reward v Iterations') plt.xlabel('Iterations') plt.ylabel('Reward') plt.plot(list(value_iteration_list), list(value_reward_list)) plt.tight_layout() 
plt.savefig('plots/forest_experiment/vi_reward_v_iterations.png') # Policy # Error v Iteration plt.clf() plt.title('Policy Iteration: Error v Iterations') plt.xlabel('Iterations') plt.ylabel('Error') plt.plot(list(policy_iteration_list), list(policy_error_list)) plt.tight_layout() plt.savefig('plots/forest_experiment/pi_error_v_iteration.png') # Gamma v Reward plt.clf() plt.title('Policy Iteration: Reward v Gamma') plt.xlabel('Gamma') plt.ylabel('Reward') plt.plot(list(gamma_range), list(policy_reward_list)) plt.tight_layout() plt.savefig('plots/forest_experiment/pi_reward_v_gamma.png') # Gamma v Iterations plt.clf() plt.title('Policy Iteration: Gamma v Iterations') plt.xlabel('Iterations') plt.ylabel('Gamma') plt.plot(list(policy_iteration_list), list(gamma_range)) plt.tight_layout() plt.savefig('plots/forest_experiment/pi_gamma_v_iterations.png') # Gamma v Time plt.clf() plt.title('Policy Iteration: Gamma v Time') plt.xlabel('Time') plt.ylabel('Gamma') plt.plot(list(policy_time_list), list(gamma_range)) plt.tight_layout() plt.savefig('plots/forest_experiment/pi_gamma_v_time.png') # Reward vs Iterations plt.clf() plt.title('Policy Iteration: Reward v Iterations') plt.xlabel('Iterations') plt.ylabel('Reward') plt.plot(list(policy_iteration_list), list(policy_reward_list)) plt.tight_layout() plt.savefig('plots/forest_experiment/pi_reward_v_iterations.png') # Gamma vs Policy Differences plt.clf() plt.title('Gamma v Policy Differences') plt.xlabel('Gamma') plt.ylabel('Policy Differences') plt.plot(list(gamma_range), list(difference_list)) plt.tight_layout() plt.savefig('plots/forest_experiment/gamma_v_differences.png') plt.close('all') prev_Q = None thresh = 1e-4 print('== Q Learning ==') for i, gamma in enumerate(gamma_range): for j, alpha in enumerate(alpha_range): for k, ep in enumerate(epsilon_range): for l, ed in enumerate(e_decay_range): # print('ql: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}'.format(gamma, alpha, ep, ed)) ql = mdp.QLearning(transitions=P, reward=R, gamma=gamma, alpha=alpha, alpha_decay=1.0, alpha_min=0.001, epsilon=ep, epsilon_min=0.1, epsilon_decay=ed, n_iter=10e4) stats = ql.run() plot_stats(stats, ('ql_forest_%0.2f_%0.2f_%0.2f_%0.2f' % (gamma, alpha, ep, ed))) # print('Policy: ') # print(ql.policy) # print(ql.run_stats) df = pd.DataFrame.from_records(ql.run_stats) iteration_list = df['Iteration'][-100:] windowed_reward = df['Reward'][-100:].mean() error_list = df['Error'][-100:].mean() if prev_Q is None: prev_Q = ql.Q else: variation = np.absolute(np.subtract(np.asarray(ql.Q), np.asarray(prev_Q))).max() res = np.abs(np.subtract(np.asarray(prev_Q), np.asarray(ql.Q))) print('Result: ') print(res) print('Variation: ') print(variation) print('Mean Reward for Last 100 Iterations:') print(windowed_reward) if np.all(res < thresh) or variation < thresh or windowed_reward > 1.0: print('Breaking! Below Thresh') print('Found at: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}'.format( gamma, alpha, ep, ed)) print('Optimal Policy: ') print(ql.policy) break
def run_forest(size): seed_val = 42 np.random.seed(seed_val) random.seed(seed_val) S = size r1 = 10 # The reward when the forest is in its oldest state and action ‘Wait’ is performed r2 = 50 # The reward when the forest is in its oldest state and action ‘Cut’ is performed p = 0.1 P, R = mdptoolbox.example.forest(S=S, r1=r1, r2=r2, p=p) # Defaults left the same epsilons = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001] gammas = [0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99, 0.999] gammas = [0.999] epsilons = [0.0001] per_won_hm = np.zeros((len(gammas), len(epsilons))) iters_hm = np.zeros((len(gammas), len(epsilons))) time_hm = np.zeros((len(gammas), len(epsilons))) best_rew = -1 best_pol_arr = [] g_cnt = 0 e_cnt = 0 for g in gammas: e_cnt = 0 print(g) best_pol = [] best_rew = -1 for e in epsilons: pi = mdp.PolicyIteration(P, R, gamma=g) pi.run() rew = run_episodes(pi.policy, S, R, p, 1000, 100) if rew > best_rew: best_rew = rew best_pol = pi.policy per_won_hm[g_cnt][e_cnt] = rew iters_hm[g_cnt][e_cnt] = pi.iter time_hm[g_cnt][e_cnt] = pi.time * 1000 e_cnt += 1 best_pol_arr.append(list(best_pol)) g_cnt += 1 mean_val = [i["Mean V"] for i in pi.run_stats] error = [i["Error"] for i in pi.run_stats] reward = [i["Reward"] for i in pi.run_stats] fig, ax = plt.subplots() ax.plot(mean_val, label='Mean V') ax.plot(error, label='Error') ax.plot(reward, label='Reward') ax.legend() plt.xlabel('Iterations', fontsize=15) plt.ylabel('V/Error/Reward', fontsize=15) plt.title("Mean V/Error/ Reward vs iterations") plt.show() op_list = [list(best_pol)] print(best_pol_arr) # Plot Percent Games Won Heatmap fig, ax = plt.subplots() im, cbar = heatmap(per_won_hm, gammas, epsilons, ax=ax, cmap="YlGn", cbarlabel="Average Reward") texts = annotate_heatmap(im, valfmt="{x:.0f}") fig.tight_layout() plt.savefig('Images\\PI-Forest-Per_Heatmap-' + str(size) + '.png') plt.show() # Plot Iterations Heatmap fig, ax = plt.subplots() im, cbar = heatmap(iters_hm, gammas, epsilons, ax=ax, cmap="YlGn", cbarlabel="# of Iterations to Convergence") texts = annotate_heatmap(im, valfmt="{x:.0f}") fig.tight_layout() plt.savefig('Images\\PI-Forest-Iter_Heatmap-' + str(size) + '.png') plt.show() # Plot Run time Heatmap fig, ax = plt.subplots() im, cbar = heatmap(time_hm, gammas, epsilons, ax=ax, cmap="YlGn", cbarlabel="Runtime (ms)") texts = annotate_heatmap(im, valfmt="{x:.0f}") fig.tight_layout() plt.savefig('Images\\PI-Forest-Time_Heatmap-' + str(size) + '.png') plt.show() # Plot out optimal policy # Citation: https://stackoverflow.com/questions/52566969/python-mapping-a-2d-array-to-a-grid-with-pyplot cmap = colors.ListedColormap(['blue', 'red']) fig, ax = plt.subplots(figsize=(12, 4)) plt.title("Forest PI Policy - Red = Cut, Blue = Wait") gammas.reverse() ax.set_yticklabels(gammas, fontsize=15) plt.xticks(fontsize=15) ax.tick_params(left=False) # remove the ticks plt.xlabel('State', fontsize=15) plt.ylabel('Gamma', fontsize=15) plt.pcolor(best_pol_arr[::-1], cmap=cmap, edgecolors='k', linewidths=0) plt.savefig('Images\\PI-Forest-Heatmap-' + str(size) + '.png') plt.show()
def vi_pi_q_comp(P, R):
    vi = mdp.ValueIteration(P, R, 0.60, epsilon=0.001).run()
    pi = mdp.PolicyIteration(P, R, 0.60, eval_type=1).run()
    q = mdp.QLearning(P, R, 0.6, alpha=0.2).run()
    return vi, pi, q
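# Hedged comparison sketch for vi_pi_q_comp, assuming hiive.mdptoolbox-style run_stats;
# forest parameters and the printed keys are illustrative assumptions.
from hiive.mdptoolbox import example

P, R = example.forest(S=10, r1=4, r2=2, p=0.1)
vi_stats, pi_stats, q_stats = vi_pi_q_comp(P, R)
for label, stats in [('VI', vi_stats), ('PI', pi_stats), ('Q-Learning', q_stats)]:
    last = stats[-1]  # final run_stats entry for each solver
    print(label, '-> iterations:', last['Iteration'], 'mean V:', last['Mean V'], 'time:', last['Time'])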
def main(): print("Create a frozen lake of Size 10x10") p = generate_FrozenLake(size=10) num_states = len(p) num_actions = len(p[0]) print("Num of States:", num_states) print("Num of Actions:", num_actions) P = np.zeros((num_actions, num_states, num_states)) R = np.zeros((num_actions, num_states, num_states)) for i in range(num_states): for j in range(num_actions): sum = 0 for prob, next_state, rewards, done in p[i][j]: P[j][i][next_state] += prob R[j][i][next_state] = rewards sum += prob # VI for gamma in [.9, 0.6]: vi = mdp.ValueIteration(transitions=P, reward=R, gamma=gamma, epsilon=0.000001, max_iter=5000) stats_data = vi.run() plot_mpd_graph( stats_data, 'VI Frozen_Lake(10x10), Gamma={}, Reward plot'.format(gamma), 'Reward', 'Reward') plot_mpd_graph( stats_data, 'VI Frozen_Lake(10x10), Gamma={}, Time PLot'.format(gamma), 'Time(seconds)', 'Time') # PI for gamma in [.9, 0.6]: print('PI {}'.format(gamma)) pi = mdp.PolicyIteration(transitions=P, reward=R, gamma=gamma, max_iter=5000, eval_type=1) stats_data = pi.run() plot_mpd_graph( stats_data, 'PI Frozen_Lake(10x10), Gamma={}, error plot'.format(gamma), 'Error', 'Error') plot_mpd_graph( stats_data, 'PI Frozen_Lake(10x10), Gamma={}, Time PLot'.format(gamma), 'Time(seconds)', 'Time') # QLearning for alpha in [0.1, 0.4]: qlearn = mdp.QLearning(transitions=P, reward=R, gamma=0.6, alpha=alpha, alpha_decay=0.1, alpha_min=0.0001, epsilon=0.1, epsilon_min=0.9, epsilon_decay=0, n_iter=10000) stats_data = qlearn.run() plot_mpd_graph( stats_data, 'Qlearning Frozen_Lake(10x10), alpha={}, Error plot'.format( alpha), 'Error', 'Error') plot_mpd_graph( stats_data, 'Qlearning Frozen_Lake(10x10), alpha={}, Reward plot'.format( alpha), 'Reward', 'Reward') plot_mpd_graph( stats_data, 'Qlearning Frozen_Lake(10x10), alpha={}, Time PLot'.format(alpha), 'Time(seconds)', 'Time')
def run(verbose=False):
    # MDP Forest Problem
    # transitions, reward = example.forest()
    nS = 1000
    # transitions, reward = example.forest(S=nS, r1=250, r2=120, p=0.01, is_sparse=False)
    transitions, reward = example.forest(S=nS, r1=1045, r2=1025, p=0.01, is_sparse=False)
    # print(transitions)
    # print(reward)
    # return
    print('~~~~~~~~~~ Forest - Policy Iteration ~~~~~~~~~~')
    pi = mdp.PolicyIteration(transitions, reward, 0.75, max_iter=10000)
    if verbose:
        pi.setVerbose()
    pi.run()
    util.print_debugs(pi)
    # print(pi.run_stats)
    # return
    print('~~~~~~~~~~ Forest - Value Iteration ~~~~~~~~~~')
    vi = mdp.ValueIteration(transitions, reward, 0.75, max_iter=100000)
    if verbose:
        vi.setVerbose()
    vi.run()
    util.print_debugs(vi)
    if vi.policy == pi.policy:
        print('Forest - Value and Policy Iteration policies are the same! ')
    else:
        print('Forest - Value and Policy Iteration policies are NOT the same.')
    print('~~~~~~~~~~ Forest - Q-Learning ~~~~~~~~~~')
    # QLearning(transitions, reward, gamma,
    #           alpha=0.1, alpha_decay=0.99, alpha_min=0.001,
    #           epsilon=1.0, epsilon_min=0.1, epsilon_decay=0.99,
    #           n_iter=10000, skip_check=False, iter_callback=None,
    #           run_stat_frequency=None)
    ql = mdp.QLearning(transitions, reward, 0.75, alpha=0.3, epsilon_min=0.005, n_iter=500000)
    if verbose:
        ql.setVerbose()
    start_t = time.process_time()
    ql.run()
    end_t = time.process_time()
    # Output
    print('~~~~~~~~~~ Forest - Policy Iteration ~~~~~~~~~~')
    util.print_debugs(pi)
    print('~~~~~~~~~~ Forest - Value Iteration ~~~~~~~~~~')
    util.print_debugs(vi)
    print('~~~~~~~~~~ Forest - Q-Learning ~~~~~~~~~~')
    print(ql.policy)
    print('Q-Learning # of Iterations: %i' % ql.max_iter)
    print('Clock time')
    print(end_t - start_t)
    if vi.policy == pi.policy:
        print('Forest - Value and Policy Iteration policies are the same! ')
    else:
        print('Forest - Value and Policy Iteration policies are NOT the same.')
    if vi.policy == ql.policy:
        print('Forest – QL and VI Policies are the same!')
    else:
        print('Forest – QL and VI Policies are NOT the same.')
    if pi.policy == ql.policy:
        print('Forest – PI and QL Policies are the same!')
    else:
        print('Forest – PI and QL Policies are NOT the same.')
# Q-Learning reference:
# https://www.oreilly.com/radar/introduction-to-reinforcement-learning-and-openai-gym/
def run(verbose=False):
    # env = gym.make('FrozenLake-v0', is_slippery=True)
    env = gym.make('FrozenLake8x8-v0', is_slippery=True)
    # env = gym.make('FrozenLake-v0')
    # Debug
    # print('env.P')
    # pprint(env.P)
    # print('env.R')
    # print(env.R)
    P, R = transform_for_MDPToolbox(env)
    # print('Reward')
    # print(R)
    # return
    print('~~~~~~~~~~ FrozenLake8x8-v0 – Policy Iteration ~~~~~~~~~~')
    pi = mdp.PolicyIteration(P, R, 0.6, max_iter=100000)
    if verbose:
        pi.setVerbose()
    pi.run()
    util.print_debugs(pi)
    total_r_pi = render_env_policy(env, pi.policy, display=verbose)
    print('~~~~~~~~~~ FrozenLake8x8-v0 – Value Iteration ~~~~~~~~~~')
    vi = mdp.ValueIteration(P, R, 0.6, epsilon=0.005, max_iter=10000)
    if verbose:
        vi.setVerbose()
    vi.run()
    util.print_debugs(vi)
    total_r_vi = render_env_policy(env, vi.policy, display=verbose)
    if vi.policy == pi.policy:
        print('FrozenLake8x8-v0 - Value and Policy Iteration policies are the same! ')
    else:
        print('FrozenLake8x8-v0 - Value and Policy Iteration policies are NOT the same. ')
    print('~~~~~~~~~~ FrozenLake8x8-v0 – Q-Learning ~~~~~~~~~~')
    ql = mdp.QLearning(P, R, 0.6, alpha=0.3, epsilon_min=0.005, n_iter=100000)
    if verbose:
        ql.setVerbose()
    start_t = time.process_time()
    ql.run()
    end_t = time.process_time()
    total_r_ql = render_env_policy(env, ql.policy, display=verbose)
    # Output
    print('~~~~~~~~~~ FrozenLake8x8-v0 - Policy Iteration ~~~~~~~~~~')
    util.print_debugs(pi)
    print('Total Reward: %f' % total_r_pi)
    print('~~~~~~~~~~ FrozenLake8x8-v0 - Value Iteration ~~~~~~~~~~')
    util.print_debugs(vi)
    print('Total Reward: %f' % total_r_vi)
    print('~~~~~~~~~~ FrozenLake8x8-v0 - Q-Learning ~~~~~~~~~~')
    print('Clock time')
    print(end_t - start_t)
    print('Total Reward: %f' % total_r_ql)
    print(ql.policy)
    if vi.policy == pi.policy:
        print('FrozenLake8x8-v0 - Value and Policy Iteration policies are the same! ')
    else:
        print('FrozenLake8x8-v0 - Value and Policy Iteration policies are NOT the same.')
    if vi.policy == ql.policy:
        print('FrozenLake8x8-v0 – QL and VI Policies are the same!')
    else:
        print('FrozenLake8x8-v0 – QL and VI Policies are NOT the same.')
    if pi.policy == ql.policy:
        print('FrozenLake8x8-v0 – PI and QL Policies are the same!')
    else:
        print('FrozenLake8x8-v0 – PI and QL Policies are NOT the same.')
    print('VI Policy')
    print_policy(vi.policy)
    # print('PI Policy')
    # print_policy(pi.policy)
    print('QL Policy')
    print_policy(ql.policy)
# Source:
# https://www.oreilly.com/radar/introduction-to-reinforcement-learning-and-openai-gym/
def getPlotsForGridWorldViPi(worlds, grid, starts, goals): iters = [] iter = range(1, 21, 1) iters.append(iter) iter = range(1, 41, 1) iters.append(iter) qlearningIter = [100000, 100000000] worldCntr = 1 for data in worlds: pi_rewards = [] pi_error = [] pi_time = [] pi_iter = [] vi_rewards = [] vi_error = [] vi_time = [] vi_iter = [] size = len(data) holesCoords = [] for row in range(0, data.shape[0]): for col in range(0, data.shape[1]): if data[row, col] == 1: # Obstacle holesCoords.append((row, col)) if data[row, col] == 2: # El roboto start = (row, col) if data[row, col] == 3: # Goal goal = (row, col) transitions, reward, discount, lake = get_environement( data, size, holesCoords, start, goal) for iter in iters[worldCntr - 1]: # Policy iteration policy_iteration = mdp.PolicyIteration(transitions, reward, discount, policy0=None, max_iter=iter, eval_type=0) policy_iteration.run() print_as_grid(policy_iteration.policy, lake.lake, size) pi_rewards.append( policy_iteration.run_stats[len(policy_iteration.run_stats) - 1]['Reward']) pi_error.append( policy_iteration.run_stats[len(policy_iteration.run_stats) - 1]['Error']) pi_time.append( policy_iteration.run_stats[len(policy_iteration.run_stats) - 1]['Time']) pi_iter.append( policy_iteration.run_stats[len(policy_iteration.run_stats) - 1]['Iteration']) # Value iteration value_iteration = mdp.ValueIteration(transitions, reward, discount, epsilon=0.001, max_iter=iter, initial_value=0) value_iteration.run() print_as_grid(value_iteration.policy, lake.lake, size) vi_rewards.append( value_iteration.run_stats[len(value_iteration.run_stats) - 1]['Reward']) vi_error.append( value_iteration.run_stats[len(value_iteration.run_stats) - 1]['Error']) vi_time.append( value_iteration.run_stats[len(value_iteration.run_stats) - 1]['Time']) vi_iter.append( value_iteration.run_stats[len(value_iteration.run_stats) - 1]['Iteration']) plt.style.use('seaborn-whitegrid') plt.plot(iters[worldCntr - 1], pi_error, label='PI') plt.plot(iters[worldCntr - 1], vi_error, label='VI') plt.ylabel('Convergence', fontsize=12) plt.xlabel('Iter.', fontsize=12) plt.title('Convergence vs Iteration for Grid World no.' + str(worldCntr), fontsize=12, y=1.03) plt.legend() plt.savefig( 'Figures/Grid/Convergence vs Iteration for Grid World no.' + str(worldCntr) + '.png') plt.close() plt.style.use('seaborn-whitegrid') plt.plot(iters[worldCntr - 1], pi_rewards, label='PI') plt.plot(iters[worldCntr - 1], vi_rewards, label='VI') plt.ylabel('Reward', fontsize=12) plt.xlabel('Iter.', fontsize=12) plt.title('Reward vs Iteration for Grid World no.' + str(worldCntr), fontsize=12, y=1.03) plt.legend() plt.savefig('Figures/Grid/Reward vs Iteration for Grid World no.' + str(worldCntr) + '.png') plt.close() plt.style.use('seaborn-whitegrid') plt.plot(iters[worldCntr - 1], pi_time, label='PI') plt.plot(iters[worldCntr - 1], vi_time, label='VI') plt.ylabel('Time', fontsize=12) plt.xlabel('Iter.', fontsize=12) plt.title('Time vs Iteration for Grid World no.' + str(worldCntr), fontsize=12, y=1.03) plt.legend() plt.savefig('Figures/Grid/Time vs Iteration for Grid World no.' + str(worldCntr) + '.png') plt.close() worldCntr += 1
def findBestPolicyForForest(): cntr = 0 pi_rewards = [] pi_error = [] pi_time = [] pi_iter = [] vi_rewards = [] vi_error = [] vi_time = [] vi_iter = [] for size in [1000]: forest = ForestMng(states=size, reward_wait=4, reward_cut=2, prob_fire=0.3) # Policy iteration policy_iteration = mdp.PolicyIteration(forest.P, forest.R, gamma=0.9, policy0=None, max_iter=1000, eval_type=0) policy_iteration.run() print(policy_iteration.time) print(policy_iteration.iter) print(policy_iteration.policy) pi_rewards.append([sub['Reward'] for sub in policy_iteration.run_stats]) pi_error.append([ sub['Error'] for sub in policy_iteration.run_stats ]) pi_time.append([ sub['Time'] for sub in policy_iteration.run_stats ]) pi_iter.append([ sub['Iteration'] for sub in policy_iteration.run_stats ]) # Value iteration value_iteration = mdp.ValueIteration(forest.P, forest.R, gamma=0.9, max_iter=1000) value_iteration.run() print(value_iteration.time) print(value_iteration.iter) print(value_iteration.policy) vi_rewards.append([sub['Reward'] for sub in value_iteration.run_stats]) vi_error.append([sub['Error'] for sub in value_iteration.run_stats]) vi_time.append([sub['Time'] for sub in value_iteration.run_stats]) vi_iter.append([sub['Iteration'] for sub in value_iteration.run_stats]) if max(pi_iter[cntr]) < max(vi_iter[cntr]): for i in range(max(vi_iter[cntr]) - max(pi_iter[cntr])): pi_error[cntr].append(pi_error[cntr][len(pi_error[cntr])-1]) pi_rewards[cntr].append(pi_rewards[cntr][len(pi_rewards[cntr]) - 1]) pi_time[cntr].append(pi_time[cntr][len(pi_time[cntr]) - 1]) cntr += 1 plt.style.use('seaborn-whitegrid') plt.plot(vi_iter[0], pi_error[0], label='PI') plt.plot(vi_iter[0], vi_error[0], label='VI') plt.ylabel('Convergence', fontsize=12) plt.xlabel('Iter.', fontsize=12) plt.title('Convergence of Error vs Iteration for Forest Mng State 1000 p03', fontsize=12, y=1.03) plt.legend() plt.savefig('Figures/Forest/Error Convergence vs Iteration for Forest Mng state 1000 p03.png') plt.close() plt.style.use('seaborn-whitegrid') plt.plot(vi_iter[0], pi_rewards[0], label='PI') plt.plot(vi_iter[0], vi_rewards[0], label='VI') plt.ylabel('Reward', fontsize=12) plt.xlabel('Iter.', fontsize=12) plt.title('Rewards vs Iteration for Forest Mng state 1000 p03', fontsize=12, y=1.03) plt.legend() plt.savefig('Figures/Forest/Rewards vs Iteration for Forest Mng state 1000 p03.png') plt.close() plt.style.use('seaborn-whitegrid') plt.plot(vi_iter[0], pi_time[0], label='PI') plt.plot(vi_iter[0], vi_time[0], label='VI') plt.ylabel('Time', fontsize=12) plt.xlabel('Iter.', fontsize=12) plt.title('Time vs Iteration for Forest Mng state 1000 p03', fontsize=12, y=1.03) plt.legend() plt.savefig('Figures/Forest/Time vs Iteration for Forest Mng state 1000 p3.png') plt.close()
def frozen_lake_all(P, R, gamma_range, mapping, shape): vi_iteration_list = np.zeros(gamma_range.shape) vi_time_list = np.zeros(gamma_range.shape) vi_reward_list = np.zeros(gamma_range.shape) vi_error_list = np.zeros(gamma_range.shape) pi_iteration_list = np.zeros(gamma_range.shape) pi_time_list = np.zeros(gamma_range.shape) pi_reward_list = np.zeros(gamma_range.shape) pi_error_list = np.zeros(gamma_range.shape) diff_list = np.zeros(gamma_range.shape) expected_policy = None for i, gamma in enumerate(gamma_range): print('Gamma %0.2f' % gamma) vi = mdp.ValueIteration(transitions=P, reward=R, gamma=gamma, epsilon=0.0001, max_iter=5000) # vi.setVerbose() vi.run() vi_iteration_list[i] = vi.run_stats[-1:][0]['Iteration'] vi_time_list[i] = vi.run_stats[-1:][0]['Time'] vi_reward_list[i] = vi.run_stats[-1:][0]['Reward'] vi_error_list[i] = vi.run_stats[-1:][0]['Error'] pi = mdp.PolicyIteration(transitions=P, reward=R, gamma=gamma, max_iter=5000, eval_type=1) # pi.setVerbose() pi.run() pi_iteration_list[i] = pi.run_stats[-1:][0]['Iteration'] pi_time_list[i] = pi.run_stats[-1:][0]['Time'] pi_reward_list[i] = pi.run_stats[-1:][0]['Reward'] pi_error_list[i] = pi.run_stats[-1:][0]['Error'] print('Value Iteration Policy Found: ' + str(vi.policy)) print_policy(vi.policy, mapping, shape) print('Policy Iteration Policy Found: ' + str(pi.policy)) print_policy(pi.policy, mapping, shape) difference1 = sum([abs(x - y) for x, y in zip(pi.policy, vi.policy)]) diff_list[i] = difference1 print('Discrepancy in Policy and Value Iteration: ', difference1) if difference1 == 0: expected_policy = vi.policy print() # Plotting # Error v Iteration plt.clf() plt.title('Value Iteration: Error v Iterations') plt.xlabel('Iterations') plt.ylabel('Error') plt.plot(list(vi_iteration_list), list(vi_error_list)) plt.tight_layout() plt.savefig('plots/frozen_lakes/vi_error_v_iteration.png') # Reward v Gamma plt.clf() plt.title('Value Iteration: Reward v Gamma') plt.xlabel('Gamma') plt.ylabel('Reward') plt.plot(list(gamma_range), list(vi_reward_list)) plt.tight_layout() plt.savefig('plots/frozen_lakes/vi_reward_v_gamma.png') # Gamma v Iterations plt.clf() plt.title('Value Iteration: Gamma v Iterations') plt.xlabel('Iterations') plt.ylabel('Gamma') plt.plot(list(vi_iteration_list), list(gamma_range)) plt.tight_layout() plt.savefig('plots/frozen_lakes/vi_gamma_v_iterations.png') # Gamma v Time plt.clf() plt.title('Value Iteration: Gamma v Time') plt.xlabel('Time') plt.ylabel('Gamma') plt.plot(list(vi_time_list), list(gamma_range)) plt.tight_layout() plt.savefig('plots/frozen_lakes/vi_gamma_v_time.png') # Reward vs Iterations plt.clf() plt.title('Value Iteration: Reward v Iterations') plt.xlabel('Iterations') plt.ylabel('Reward') plt.plot(list(vi_iteration_list), list(vi_reward_list)) plt.tight_layout() plt.savefig('plots/frozen_lakes/vi_reward_v_iterations.png') # Policy # Error v Iteration plt.clf() plt.title('Policy Iteration: Error v Iterations') plt.xlabel('Iterations') plt.ylabel('Error') plt.scatter(list(pi_iteration_list), list(pi_error_list)) plt.tight_layout() plt.savefig('plots/frozen_lakes/pi_error_v_iteration.png') # Gamma v Reward plt.clf() plt.title('Policy Iteration: Reward v Gamma') plt.xlabel('Gamma') plt.ylabel('Reward') plt.scatter(list(gamma_range), list(pi_reward_list)) plt.tight_layout() plt.savefig('plots/frozen_lakes/pi_reward_v_gamma.png') # Gamma v Iterations plt.clf() plt.title('Policy Iteration: Gamma v Iterations') plt.xlabel('Iterations') plt.ylabel('Gamma') plt.scatter(list(pi_iteration_list), 
list(gamma_range)) plt.tight_layout() plt.savefig('plots/frozen_lakes/pi_gamma_v_iterations.png') # Gamma v Time plt.clf() plt.title('Policy Iteration: Gamma v Time') plt.xlabel('Time') plt.ylabel('Gamma') plt.scatter(list(pi_time_list), list(gamma_range)) plt.tight_layout() plt.savefig('plots/frozen_lakes/pi_gamma_v_time.png') # Reward vs Iterations plt.clf() plt.title('Policy Iteration: Reward v Iterations') plt.xlabel('Iterations') plt.ylabel('Reward') plt.scatter(list(pi_iteration_list), list(pi_reward_list)) plt.tight_layout() plt.savefig('plots/frozen_lakes/pi_reward_v_iterations.png') # Gamma vs Policy Differences plt.clf() plt.title('Gamma v Policy Differences') plt.xlabel('Gamma') plt.ylabel('Policy Differences') plt.scatter(list(gamma_range), list(diff_list)) plt.tight_layout() plt.savefig('plots/frozen_lakes/gamma_v_differences.png') # TODO gamma_range = np.array([0.8, 0.9, 0.99]) alpha_range = np.array([0.1, 0.9, 0.99]) epsilon_range = np.array([0.1, 0.5, 0.9, 0.999]) e_decay_range = np.array([0.1, 0.5, 0.9, 0.999]) # alpha_range = np.append(np.linspace(0.01, 0.1, 9), np.linspace(0.2, 0.99, 4)) # epsilon_range = np.linspace(0.1, 1.0, 10) # e_decay_range = np.append(np.linspace(0.1, 0.9, 4), np.linspace(0.91, 0.99, 9)) prev_Q = None thresh = 1e-4 print('== Q Learning ==') for i, gamma in enumerate(gamma_range): for j, alpha in enumerate(alpha_range): for k, ep in enumerate(epsilon_range): for l, ed in enumerate(e_decay_range): # print('ql: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}'.format(gamma, alpha, ep, ed)) ql = mdp.QLearning(transitions=P, reward=R, gamma=gamma, alpha=alpha, alpha_decay=1.0, alpha_min=0.001, epsilon=ep, epsilon_min=0.1, epsilon_decay=ed, n_iter=10e4) stats = ql.run() plot_stats(stats, ('ql_frozen_lake_%0.2f_%0.2f_%0.2f_%0.2f' % (gamma, alpha, ep, ed))) # print('Policy: ') # print(ql.policy) # print(ql.run_stats) df = pd.DataFrame.from_records(ql.run_stats) iteration_list = df['Iteration'][-100:] windowed_reward = df['Reward'][-100:].mean() error_list = df['Error'][-100:].mean() if prev_Q is None: prev_Q = ql.Q else: variation = np.absolute( np.subtract(np.asarray(ql.Q), np.asarray(prev_Q))).max() res = np.abs( np.subtract(np.asarray(prev_Q), np.asarray(ql.Q))) print('Result: ') print(res) print('Variation: ') print(variation) print('Mean Reward for Last 100 Iterations:') print(windowed_reward) if np.all( res < thresh ) or variation < thresh or windowed_reward > 45.0: print('Breaking! Below Thresh') print( 'Found at: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}' .format(gamma, alpha, ep, ed)) print('Optimal Policy: ') print(ql.policy) break