import matplotlib.pyplot as plt
import tqdm
import numpy as np

from research.estop.frozenlake import frozenlake


def build_env(l: frozenlake.Lake):
  return frozenlake.FrozenLakeWithEscapingEnv(l,
                                              hole_retention_probability=0.99)


if __name__ == "__main__":
  np.random.seed(0)

  lake_map = frozenlake.MAP_8x8
  gamma = 0.99
  lake = frozenlake.Lake(lake_map)
  env = build_env(lake)
  num_states_to_remove = 0.5 * lake.num_states
  num_random_policies = 1024

  Q, _ = frozenlake.value_iteration(env, gamma, tolerance=1e-6)

  def estop_map_optimal_policy_value(hp):
    # See https://stackoverflow.com/questions/5284646/rank-items-in-an-array-using-python-numpy.
    rank_hp2d = lake.reshape(np.argsort(np.argsort(hp)))
    estop_map = np.copy(lake_map)
    estop_map[rank_hp2d < num_states_to_remove] = "E"

    # Check that we haven't gotten rid of the start state yet.
    if (estop_map == "S").sum() == 0:
      # This could also be recorded as zero, depending on how you want to think
      # about it.
      return None

    estop_env = build_env(frozenlake.Lake(estop_map))
    return frozenlake.optimal_policy_reward(estop_env, gamma)
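# Aside (added illustration, not part of the original script): the double
# argsort in `estop_map_optimal_policy_value` is the standard rank trick from
# the linked Stack Overflow answer. Each entry of `np.argsort(np.argsort(hp))`
# is the rank of the corresponding hitting probability, so `rank < k` selects
# the k least-visited states. For example:
#
#   hp = np.array([0.2, 0.9, 0.1])
#   np.argsort(hp)               # array([2, 0, 1]): indices in ascending order
#   np.argsort(np.argsort(hp))   # array([1, 2, 0]): rank of each entry
#
# so `np.argsort(np.argsort(hp)) < 1` keeps only the least-visited state.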
def main():  # pylint: disable=too-many-statements
  np.random.seed(0)

  lake_map = frozenlake.MAP_8x8
  gamma = 0.99

  lake = frozenlake.Lake(lake_map)
  env = build_env(lake)
  state_action_values, policy_rewards_per_iter = frozenlake.value_iteration(
      env, gamma, tolerance=1e-6)
  policy_actions = np.argmax(state_action_values, axis=-1)
  state_values = np.max(state_action_values, axis=-1)

  # Show value function map.
  plt.figure()
  viz.plot_heatmap(lake, state_values)
  # plt.title("FrozenLake-v0 environment")
  plt.tick_params(
      axis="both",
      which="both",
      bottom=False,
      top=False,
      left=False,
      right=False,
      labelbottom=False,
      labeltop=False,
      labelleft=False,
      labelright=False,
  )
  plt.tight_layout()
  plt.savefig("figs/value_function_full_env.pdf")

  # Show hitting probability map.
  policy_transitions = np.array([
      env.transitions[i, policy_actions[i], :] for i in range(lake.num_states)
  ])
  hp, esta = frozenlake.markov_chain_stats(env, policy_transitions)
  hp2d = lake.reshape(hp)

  plt.figure()
  viz.plot_heatmap(lake, hp)
  plt.title("Hitting probabilities")
  plt.savefig("figs/hitting_probabilities.pdf")

  # Show estimated hitting probability map.
  estimated_hp = frozenlake.estimate_hitting_probabilities(
      env,
      frozenlake.deterministic_policy(env, policy_actions),
      num_rollouts=1000)

  plt.figure()
  viz.plot_heatmap(lake, estimated_hp)
  plt.title("Estimated hitting probabilities")

  plt.figure()
  viz.plot_heatmap(lake, esta)
  plt.title("Expected number of states to completion")

  # Show optimal policy on top of hitting probabilities.
  plt.figure()
  im = plt.imshow(hp2d)
  for s, a in zip(lake.ij_states, policy_actions):
    i, j = s
    if a == 0:
      arrow = "←"
    elif a == 1:
      arrow = "↓"
    elif a == 2:
      arrow = "→"
    elif a == 3:
      arrow = "↑"
    else:
      raise Exception("bad bad bad")
    im.axes.text(j, i, arrow, {
        "horizontalalignment": "center",
        "verticalalignment": "center"
    })
  plt.title("Optimal policy overlayed on hitting probabilities")
  plt.savefig("figs/optimal_policy.pdf")

  # Show value CDF.
  plt.figure()
  plt.hist(state_values, bins=100, histtype="step", cumulative=True)
  plt.xlabel("V(s)")
  plt.ylabel(f"Number of states (out of {lake.num_states})")
  plt.title("CDF of state values")
  plt.savefig("figs/value_function_cdf.pdf")

  #######
  # The new map has a hole everywhere with bad hitting probability.
  estop_map = np.copy(lake_map)
  percentile = 50
  threshold = np.percentile(estimated_hp, percentile)
  # Use less-than-or-equal because the estimated hitting probabilities can be
  # zero and the threshold can be zero, in which case nothing on the map would
  # change with a strict inequality.
  estop_map[lake.reshape(estimated_hp) <= threshold] = "E"
  estop_lake = frozenlake.Lake(estop_map)
  estop_env = build_env(estop_lake)
  estop_state_action_values, estop_policy_rewards_per_iter = frozenlake.value_iteration(
      estop_env, gamma, tolerance=1e-6)
  estop_state_values = np.max(estop_state_action_values, axis=-1)

  # Show value function map.
  plt.figure()
  viz.plot_heatmap(estop_lake, estop_state_values)
  plt.title(f"E-stop map ({percentile}% of states removed)")
  plt.savefig("figs/estop_map.pdf")

  # Show policy rewards per iteration.
  # There are 4 * S * A * S FLOPs in each iteration:
  #   * multiplying transitions with state_values
  #   * multiplying by gamma
  #   * adding expected_rewards
  #   * max'ing over state_action_values
  plt.figure()
  plt.plot(
      4 * (frozenlake.NUM_ACTIONS * (frozenlake.num_mdp_states(lake_map)**2)) *
      np.arange(len(policy_rewards_per_iter)), policy_rewards_per_iter)
  plt.plot(
      4 * (frozenlake.NUM_ACTIONS * (frozenlake.num_mdp_states(estop_map)**2)) *
      np.arange(len(estop_policy_rewards_per_iter)),
      estop_policy_rewards_per_iter)
  plt.xlabel("FLOPS")
  plt.ylabel("Policy reward")
  plt.legend(["Full MDP", "E-stop MDP"])
  plt.title("Convergence comparison")
  plt.savefig("figs/convergence_comparison.pdf")

  print(
      f"Exact solution, policy value: {np.dot(env.initial_state_distribution, state_values)}"
  )
  print(
      f"E-stop solution, policy value: {np.dot(env.initial_state_distribution, estop_state_values)}"
  )

  plt.show()
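# Aside (added sketch, not code from this repo): the 4 * S * A * S FLOP count
# used in the convergence comparison above assumes one synchronous value
# iteration backup per iteration, roughly
#
#   state_values = np.max(state_action_values, axis=-1)        # max over A
#   state_action_values = expected_rewards + gamma * np.einsum(
#       "sat,t->sa", transitions, state_values)                # S*A*S multiply-adds
#
# where `transitions` has shape (S, A, S) and `expected_rewards` has shape
# (S, A). The array names follow the comment above; frozenlake's actual
# internals may differ.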
def main():
  np.random.seed(0)

  def build_env(lake: frozenlake.Lake):
    # return frozenlake.FrozenLakeEnv(lake, infinite_time=True)
    return frozenlake.FrozenLakeWithEscapingEnv(
        lake, hole_retention_probability=0.99)

  lake_map = frozenlake.MAP_8x8
  policy_evaluation_frequency = 10
  gamma = 0.99
  num_random_seeds = 96

  results_dir = Path("results/frozenlake_qlearning")
  estop_results_dir = results_dir / "estop"
  full_results_dir = results_dir / "full"
  results_dir.mkdir()
  estop_results_dir.mkdir()
  full_results_dir.mkdir()

  # Build the full environment and run value iteration to calculate the optimal
  # policy.
  lake = frozenlake.Lake(lake_map)
  env = build_env(lake)
  state_action_values, _ = frozenlake.value_iteration(env,
                                                      gamma,
                                                      tolerance=1e-6)
  state_values = np.max(state_action_values, axis=-1)
  optimal_policy_reward = np.dot(state_values, env.initial_state_distribution)

  # Estimate hitting probabilities.
  optimal_policy = frozenlake.deterministic_policy(
      env, np.argmax(state_action_values, axis=-1))
  estimated_hp = frozenlake.estimate_hitting_probabilities(env,
                                                           optimal_policy,
                                                           num_rollouts=1000)
  estimated_hp2d = lake.reshape(estimated_hp)

  # Build e-stop environment.
  estop_map = np.copy(lake_map)
  percentile = 50
  threshold = np.percentile(estimated_hp, percentile)
  estop_map[estimated_hp2d <= threshold] = "E"
  estop_lake = frozenlake.Lake(estop_map)
  estop_env = build_env(estop_lake)

  # Pickle-dump the environment setup/metadata.
  pickle.dump(
      {
          "lake_map": lake_map,
          "policy_evaluation_frequency": policy_evaluation_frequency,
          "gamma": gamma,
          "num_random_seeds": num_random_seeds,
          "lake": lake,
          "env": env,
          "state_action_values": state_action_values,
          "state_values": state_values,
          "optimal_policy_reward": optimal_policy_reward,
          "optimal_policy": optimal_policy,
          "estimated_hp": estimated_hp,
          "estimated_hp2d": estimated_hp2d,
          "estop_map": estop_map,
          "percentile": percentile,
          "threshold": threshold,
          "estop_lake": estop_lake,
          "estop_env": estop_env,
      },
      (results_dir / "metadata.pkl").open(mode="wb"))

  pool = Pool()

  # Run Q-learning on the full environment.
  for _ in tqdm.tqdm(pool.imap_unordered(
      functools.partial(
          q_learning_job,
          env=env,
          gamma=gamma,
          policy_evaluation_frequency=policy_evaluation_frequency,
          folder=full_results_dir,
      ), range(num_random_seeds)),
                     desc="full",
                     total=num_random_seeds):
    pass

  # Run Q-learning on the e-stop environment.
  for _ in tqdm.tqdm(pool.imap_unordered(
      functools.partial(
          q_learning_job,
          env=estop_env,
          gamma=gamma,
          policy_evaluation_frequency=policy_evaluation_frequency,
          folder=estop_results_dir,
      ), range(num_random_seeds)),
                     desc="estop",
                     total=num_random_seeds):
    pass
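# Aside (added illustration): the metadata written above can be read back for
# later analysis with the standard pickle API, e.g.
#
#   metadata = pickle.load(
#       (Path("results/frozenlake_qlearning") / "metadata.pkl").open(mode="rb"))
#   print(metadata["optimal_policy_reward"])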
def main():
  np.random.seed(0)

  lake_map = frozenlake.MAP_8x8
  gamma = 0.99

  lake = frozenlake.Lake(lake_map)
  env = build_env(lake)
  state_action_values, _ = frozenlake.value_iteration(env,
                                                      gamma,
                                                      tolerance=1e-6)
  policy_actions = np.argmax(state_action_values, axis=-1)
  policy_transitions = np.array([
      env.transitions[i, policy_actions[i], :] for i in range(lake.num_states)
  ])
  hp, _ = frozenlake.markov_chain_stats(env, policy_transitions)

  # # Ensure that we don't remove the start state!
  # hp[lake.start_state] = 1.0

  # # Show hitting probability map.
  # plt.figure()
  # viz.plot_heatmap(lake, hp)
  # plt.title("Hitting probabilities")
  # plt.show()

  # Estimated hitting probabilities.
  # estimated_hp = frozenlake.estimate_hitting_probabilities(
  #     env,
  #     frozenlake.deterministic_policy(env, policy_actions),
  #     num_rollouts=1000)

  # Show hitting probability map.
  # plt.figure()
  # viz.plot_heatmap(lake, estimated_hp)
  # plt.title("Estimated hitting probabilities")
  # plt.show()

  # See https://stackoverflow.com/questions/5284646/rank-items-in-an-array-using-python-numpy.
  rank_hp2d = lake.reshape(np.argsort(np.argsort(hp)))

  def run(num_states_to_remove: int):
    estop_map = np.copy(lake_map)
    estop_map[rank_hp2d < num_states_to_remove] = "E"
    # print(num_states_to_remove)
    # print(estop_map)

    # Check that we haven't gotten rid of the start state yet.
    if (estop_map == "S").sum() == 0:
      return None

    estop_env = build_env(frozenlake.Lake(estop_map))
    _, policy_rewards_per_iter = frozenlake.value_iteration(
        estop_env,
        gamma,
        max_iterations=5000,
    )

    # plt.figure()
    # viz.plot_heatmap(frozenlake.Lake(estop_map), np.max(estop_state_action_values, axis=-1))
    # plt.title(f"V(s) with {num_states_to_remove} states removed")
    # plt.show()

    num_states = frozenlake.num_mdp_states(estop_map)

    # There are 4 * S * A * S FLOPs in each iteration:
    #   * multiplying transitions with state_values
    #   * multiplying by gamma
    #   * adding expected_rewards
    #   * max'ing over state_action_values
    flops_per_iter = 4 * (frozenlake.NUM_ACTIONS * (num_states**2)) * np.arange(
        len(policy_rewards_per_iter))
    return flops_per_iter, policy_rewards_per_iter

  results = [run(i) for i in tqdm.trange(lake.num_states)]

  # Some of the maps don't have a feasible path to the goal, so their reward is
  # just zero the whole time.
  noncrappy_results = [
      i for i, res in enumerate(results) if res is not None and res[1][-1] > 0
  ]

  plt.rcParams.update({"font.size": 16})
  cmap = plt.get_cmap("YlOrRd")

  plt.figure()
  for i, ix in enumerate(noncrappy_results):
    plt.plot(results[ix][0] / 1000.0,
             results[ix][1],
             color=cmap(i / len(noncrappy_results)))
  plt.xlim(0, 5e3)
  plt.xlabel("FLOPs (thousands)")
  plt.ylabel("Cumulative policy reward")
  colorbar = plt.colorbar(
      matplotlib.cm.ScalarMappable(
          cmap=cmap,
          norm=matplotlib.colors.Normalize(
              vmin=0, vmax=100 * max(noncrappy_results) / lake.num_states)))
  colorbar.set_label("E-stop states (%)", rotation=270, labelpad=25)
  plt.tight_layout()
  plt.savefig("figs/value_iteration_sweep.pdf")

  plt.figure()
  plt.plot(100 * np.array(noncrappy_results) / lake.num_states,
           [results[i][1][-1] for i in noncrappy_results])
  plt.xlabel("States removed (%)")
  plt.ylabel("Optimal policy cumulative reward")
  plt.tight_layout()
  plt.savefig("figs/num_removed_vs_policy_reward.pdf")
def main():
  np.random.seed(0)

  # lake_map = frozenlake.MAP_CORRIDOR_4x1
  lake_map = frozenlake.MAP_8x8
  policy_evaluation_frequency = 100
  gamma = 0.99

  lake = frozenlake.Lake(lake_map)
  env = build_env(lake)
  print(
      f"Optimal policy reward on full env: {frozenlake.optimal_policy_reward(env, gamma)}"
  )

  # Estimate hitting probabilities.
  state_action_values, _ = frozenlake.value_iteration(
      env,
      gamma,
      tolerance=1e-6,
  )
  optimal_policy = frozenlake.deterministic_policy(
      env, np.argmax(state_action_values, axis=-1))
  estimated_hp = frozenlake.estimate_hitting_probabilities(
      env,
      optimal_policy,
      num_rollouts=1000,
  )
  estimated_hp2d = lake.reshape(estimated_hp)

  # Build e-stop environment.
  estop_map = np.copy(lake_map)
  percentile = 50
  threshold = np.percentile(estimated_hp, percentile)
  estop_map[estimated_hp2d <= threshold] = "E"
  estop_lake = frozenlake.Lake(estop_map)
  estop_env = build_env(estop_lake)
  print(
      f"Optimal policy reward on e-stop: {frozenlake.optimal_policy_reward(estop_env, gamma)}"
  )

  plt.figure()
  viz.plot_heatmap(estop_lake, np.zeros(estop_lake.num_states))
  plt.title("E-stop map")

  plt.figure()
  viz.plot_heatmap(lake, np.zeros(lake.num_states))
  plt.title("Full map")

  plt.show()

  plt.figure()
  for seed in range(1):
    np.random.seed(seed)
    x0 = 1e-2 * np.random.randn(estop_env.lake.num_states,
                                frozenlake.NUM_ACTIONS)
    optimizer = optimizers.Adam(x0, learning_rate=1e-3)
    # optimizer = reinforce.Momentum(x0, learning_rate=1e-2, mass=0.0)
    states_seen, policy_rewards = reinforce.run_reinforce(
        estop_env,
        gamma,
        optimizer,
        num_episodes=50000,
        policy_evaluation_frequency=policy_evaluation_frequency)
    plt.plot(states_seen, policy_rewards)

  plt.axhline(frozenlake.optimal_policy_reward(env, gamma),
              color="grey",
              linestyle="--")
  plt.axhline(frozenlake.optimal_policy_reward(estop_env, gamma),
              color="grey",
              linestyle="--")
  plt.title(f"Learning rate={optimizer.learning_rate}")
  plt.show()