import matplotlib.pyplot as plt
import tqdm
import numpy as np

from research.estop.frozenlake import frozenlake

def build_env(l: frozenlake.Lake):
  return frozenlake.FrozenLakeWithEscapingEnv(l, hole_retention_probability=0.99)

if __name__ == "__main__":
  np.random.seed(0)

  lake_map = frozenlake.MAP_8x8
  gamma = 0.99

  lake = frozenlake.Lake(lake_map)
  env = build_env(lake)
  num_states_to_remove = 0.5 * lake.num_states
  num_random_policies = 1024

  Q, _ = frozenlake.value_iteration(env, gamma, tolerance=1e-6)

  def estop_map_optimal_policy_value(hp):
    # See https://stackoverflow.com/questions/5284646/rank-items-in-an-array-using-python-numpy.
    rank_hp2d = lake.reshape(np.argsort(np.argsort(hp)))

    estop_map = np.copy(lake_map)
    estop_map[rank_hp2d < num_states_to_remove] = "E"

    # Check that we haven't gotten rid of the start state yet.
    if (estop_map == "S").sum() == 0:
      # This could also be recorded as zero, depending on how you want to think
      # about it.
      return None

    estop_env = build_env(frozenlake.Lake(estop_map))
    return frozenlake.optimal_policy_reward(estop_env, gamma)
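
  # One possible driver for the setup above (a guess, not from the original
  # source): score `num_random_policies` random orderings of the states by the
  # optimal-policy value of the resulting e-stop MDP. Only
  # `estop_map_optimal_policy_value` and the variables defined above are used.
  random_policy_values = [
      estop_map_optimal_policy_value(np.random.rand(lake.num_states))
      for _ in range(num_random_policies)
  ]
  # Orderings that delete the start state return None; drop them.
  random_policy_values = [v for v in random_policy_values if v is not None]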

Example #4

def main():
    # pylint: disable=too-many-statements
    np.random.seed(0)

    lake_map = frozenlake.MAP_8x8
    gamma = 0.99

    lake = frozenlake.Lake(lake_map)
    env = build_env(lake)
    state_action_values, policy_rewards_per_iter = frozenlake.value_iteration(
        env, gamma, tolerance=1e-6)
    policy_actions = np.argmax(state_action_values, axis=-1)
    state_values = np.max(state_action_values, axis=-1)

    # Show value function map.
    plt.figure()
    viz.plot_heatmap(lake, state_values)
    # plt.title("FrozenLake-v0 environment")
    plt.tick_params(
        axis="both",
        which="both",
        bottom=False,
        top=False,
        left=False,
        right=False,
        labelbottom=False,
        labeltop=False,
        labelleft=False,
        labelright=False,
    )
    plt.tight_layout()
    plt.savefig("figs/value_function_full_env.pdf")

    # Show hitting probability map.
    policy_transitions = np.array([
        env.transitions[i, policy_actions[i], :]
        for i in range(lake.num_states)
    ])
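    # An equivalent vectorized form (assuming env.transitions has shape
    # (num_states, NUM_ACTIONS, num_states), as the loop above implies):
    #   policy_transitions = env.transitions[np.arange(lake.num_states), policy_actions, :]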
    hp, esta = frozenlake.markov_chain_stats(env, policy_transitions)
    hp2d = lake.reshape(hp)

    plt.figure()
    viz.plot_heatmap(lake, hp)
    plt.title("Hitting probabilities")
    plt.savefig("figs/hitting_probabilities.pdf")

    # Show estimated hitting probability map.
    estimated_hp = frozenlake.estimate_hitting_probabilities(
        env,
        frozenlake.deterministic_policy(env, policy_actions),
        num_rollouts=1000)
    plt.figure()
    viz.plot_heatmap(lake, estimated_hp)
    plt.title("Estimated hitting probabilities")

    plt.figure()
    viz.plot_heatmap(lake, esta)
    plt.title("Expected number of states to completion")

    # Show optimal policy on top of hitting probabilities.
    plt.figure()
    im = plt.imshow(hp2d)
    for s, a in zip(lake.ij_states, policy_actions):
        i, j = s
        if a == 0:
            arrow = "←"
        elif a == 1:
            arrow = "↓"
        elif a == 2:
            arrow = "→"
        elif a == 3:
            arrow = "↑"
        else:
            raise ValueError(f"unexpected action: {a}")

        im.axes.text(j, i, arrow, {
            "horizontalalignment": "center",
            "verticalalignment": "center"
        })
    plt.title("Optimal policy overlayed on hitting probabilities")
    plt.savefig("figs/optimal_policy.pdf")

    # Show value CDF.
    plt.figure()
    plt.hist(state_values, bins=100, histtype="step", cumulative=True)
    plt.xlabel("V(s)")
    plt.ylabel(f"Number of states (out of {lake.num_states})")
    plt.title("CDF of state values")
    plt.savefig("figs/value_function_cdf.pdf")

    #######

    # Build the e-stop map: mark every state with a low estimated hitting
    # probability as an early-stop ("E") state.
    estop_map = np.copy(lake_map)
    percentile = 50
    threshold = np.percentile(estimated_hp, percentile)
    # Use less-than-or-equal: the estimated hitting probabilities (and therefore
    # the threshold) can be zero, and with a strict less-than nothing on the map
    # would change.
    estop_map[lake.reshape(estimated_hp) <= threshold] = "E"
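    # For example (made-up values, not from this run): if more than half of the
    # estimated hitting probabilities are exactly zero, then
    #   np.percentile([0.0, 0.0, 0.0, 0.9], 50) == 0.0
    # and a strict "<" would remove nothing.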

    estop_lake = frozenlake.Lake(estop_map)
    estop_env = build_env(estop_lake)
    estop_state_action_values, estop_policy_rewards_per_iter = frozenlake.value_iteration(
        estop_env, gamma, tolerance=1e-6)
    estop_state_values = np.max(estop_state_action_values, axis=-1)

    # Show value function map.
    plt.figure()
    viz.plot_heatmap(estop_lake, estop_state_values)
    plt.title(f"E-stop map ({percentile}% of states removed)")
    plt.savefig("figs/estop_map.pdf")

    # Show the policy reward per iteration, with iterations converted to FLOPs.
    # Each value iteration sweep costs roughly 4 * S * A * S FLOPs:
    #   * multiplying transitions with state_values
    #   * multiplying by gamma
    #   * adding expected_rewards
    #   * max'ing over state_action_values
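    # For concreteness (a back-of-the-envelope figure, assuming NUM_ACTIONS == 4
    # and num_mdp_states(lake_map) == 64 for the 8x8 map):
    #   4 * 4 * 64**2 = 65,536 FLOPs per full-MDP iteration.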

    plt.figure()
    plt.plot(
        4 * (frozenlake.NUM_ACTIONS *
             (frozenlake.num_mdp_states(lake_map)**2)) *
        np.arange(len(policy_rewards_per_iter)), policy_rewards_per_iter)
    plt.plot(
        4 * (frozenlake.NUM_ACTIONS *
             (frozenlake.num_mdp_states(estop_map)**2)) *
        np.arange(len(estop_policy_rewards_per_iter)),
        estop_policy_rewards_per_iter)
    plt.xlabel("FLOPS")
    plt.ylabel("Policy reward")
    plt.legend(["Full MDP", "E-stop MDP"])
    plt.title("Convergence comparison")
    plt.savefig("figs/convergence_comparison.pdf")

    print(
        f"Exact solution, policy value: {np.dot(env.initial_state_distribution, state_values)}"
    )
    print(
        f"E-stop solution, policy value: {np.dot(env.initial_state_distribution, estop_state_values)}"
    )

    plt.show()

Example #5

def main():
    np.random.seed(0)

    def build_env(lake: frozenlake.Lake):
        # return frozenlake.FrozenLakeEnv(lake, infinite_time=True)
        return frozenlake.FrozenLakeWithEscapingEnv(
            lake, hole_retention_probability=0.99)

    lake_map = frozenlake.MAP_8x8
    policy_evaluation_frequency = 10
    gamma = 0.99
    num_random_seeds = 96

    results_dir = Path("results/frozenlake_qlearning")
    estop_results_dir = results_dir / "estop"
    full_results_dir = results_dir / "full"
    results_dir.mkdir()
    estop_results_dir.mkdir()
    full_results_dir.mkdir()

    # Build the full environment and run value iteration to calculate the optimal
    # policy.
    lake = frozenlake.Lake(lake_map)
    env = build_env(lake)
    state_action_values, _ = frozenlake.value_iteration(env,
                                                        gamma,
                                                        tolerance=1e-6)
    state_values = np.max(state_action_values, axis=-1)
    optimal_policy_reward = np.dot(state_values,
                                   env.initial_state_distribution)

    # Estimate hitting probabilities.
    optimal_policy = frozenlake.deterministic_policy(
        env, np.argmax(state_action_values, axis=-1))
    estimated_hp = frozenlake.estimate_hitting_probabilities(env,
                                                             optimal_policy,
                                                             num_rollouts=1000)
    estimated_hp2d = lake.reshape(estimated_hp)

    # Build e-stop environment.
    estop_map = np.copy(lake_map)
    percentile = 50
    threshold = np.percentile(estimated_hp, percentile)
    estop_map[estimated_hp2d <= threshold] = "E"

    estop_lake = frozenlake.Lake(estop_map)
    estop_env = build_env(estop_lake)

    # Pickle the environment setup/metadata.
    pickle.dump(
        {
            "lake_map": lake_map,
            "policy_evaluation_frequency": policy_evaluation_frequency,
            "gamma": gamma,
            "num_random_seeds": num_random_seeds,
            "lake": lake,
            "env": env,
            "state_action_values": state_action_values,
            "state_values": state_values,
            "optimal_policy_reward": optimal_policy_reward,
            "optimal_policy": optimal_policy,
            "estimated_hp": estimated_hp,
            "estimated_hp2d": estimated_hp2d,
            "estop_map": estop_map,
            "percentile": percentile,
            "threshold": threshold,
            "estop_lake": estop_lake,
            "estop_env": estop_env,
        }, (results_dir / "metadata.pkl").open(mode="wb"))
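    # To read this back later (a minimal sketch; "metadata.pkl" is the file
    # written above):
    #   with (results_dir / "metadata.pkl").open(mode="rb") as f:
    #       metadata = pickle.load(f)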

    pool = Pool()
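    # Each worker below ends up calling
    #   q_learning_job(seed, env=..., gamma=..., policy_evaluation_frequency=..., folder=...)
    # for seed in range(num_random_seeds). q_learning_job itself is defined
    # elsewhere in the project; its call pattern is inferred from the
    # functools.partial calls below.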

    # Run Q-learning on the full environment.
    for _ in tqdm.tqdm(pool.imap_unordered(
            functools.partial(
                q_learning_job,
                env=env,
                gamma=gamma,
                policy_evaluation_frequency=policy_evaluation_frequency,
                folder=full_results_dir,
            ), range(num_random_seeds)),
                       desc="full",
                       total=num_random_seeds):
        pass

    # Run Q-learning on the e-stop environment.
    for _ in tqdm.tqdm(pool.imap_unordered(
            functools.partial(
                q_learning_job,
                env=estop_env,
                gamma=gamma,
                policy_evaluation_frequency=policy_evaluation_frequency,
                folder=estop_results_dir,
            ), range(num_random_seeds)),
                       desc="estop",
                       total=num_random_seeds):
        pass

Example #6

def main():
    np.random.seed(0)

    lake_map = frozenlake.MAP_8x8
    gamma = 0.99

    lake = frozenlake.Lake(lake_map)
    env = build_env(lake)

    state_action_values, _ = frozenlake.value_iteration(env,
                                                        gamma,
                                                        tolerance=1e-6)
    policy_actions = np.argmax(state_action_values, axis=-1)

    policy_transitions = np.array([
        env.transitions[i, policy_actions[i], :]
        for i in range(lake.num_states)
    ])
    hp, _ = frozenlake.markov_chain_stats(env, policy_transitions)

    # # Ensure that we don't remove the start state!
    # hp[lake.start_state] = 1.0

    # # Show hitting probability map.
    # plt.figure()
    # viz.plot_heatmap(lake, hp)
    # plt.title("Hitting probabilities")
    # plt.show()

    # Estimated hitting probabilities
    # estimated_hp = frozenlake.estimate_hitting_probabilities(
    #     env,
    #     frozenlake.deterministic_policy(env, policy_actions),
    #     num_rollouts=1000)

    # Show hitting probability map.
    # plt.figure()
    # viz.plot_heatmap(lake, estimated_hp)
    # plt.title("Estimated hitting probabilities")
    # plt.show()

    # See https://stackoverflow.com/questions/5284646/rank-items-in-an-array-using-python-numpy.
    rank_hp2d = lake.reshape(np.argsort(np.argsort(hp)))
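    # Sanity check of the double-argsort ranking trick on made-up values:
    #   np.argsort(np.argsort(np.array([0.3, 0.1, 0.2])))  ->  array([2, 0, 1])
    # i.e. each entry is replaced by its rank (0 = smallest hitting probability).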

    def run(num_states_to_remove: int):
        estop_map = np.copy(lake_map)
        estop_map[rank_hp2d < num_states_to_remove] = "E"
        # print(num_states_to_remove)
        # print(estop_map)

        # Check that we haven't gotten rid of the start state yet.
        if (estop_map == "S").sum() == 0:
            return None

        estop_env = build_env(frozenlake.Lake(estop_map))
        _, policy_rewards_per_iter = frozenlake.value_iteration(
            estop_env,
            gamma,
            max_iterations=5000,
        )

        # plt.figure()
        # viz.plot_heatmap(frozenlake.Lake(estop_map), np.max(estop_state_action_values, axis=-1))
        # plt.title(f"V(s) with {num_states_to_remove} states removed")
        # plt.show()

        num_states = frozenlake.num_mdp_states(estop_map)
        # Each value iteration sweep costs roughly 4 * S * A * S FLOPs:
        #   * multiplying transitions with state_values
        #   * multiplying by gamma
        #   * adding expected_rewards
        #   * max'ing over state_action_values
        flops_per_iter = 4 * (frozenlake.NUM_ACTIONS *
                              (num_states**2)) * np.arange(
                                  len(policy_rewards_per_iter))
        return flops_per_iter, policy_rewards_per_iter

    results = [run(i) for i in tqdm.trange(lake.num_states)]

    # Some of the reduced maps have no feasible path left to the goal, so their
    # policy reward stays at zero for every iteration; keep only the runs that
    # end with a positive reward.
    noncrappy_results = [
        i for i, res in enumerate(results)
        if res is not None and res[1][-1] > 0
    ]

    plt.rcParams.update({"font.size": 16})
    cmap = plt.get_cmap("YlOrRd")

    plt.figure()
    for i, ix in enumerate(noncrappy_results):
        plt.plot(results[ix][0] / 1000.0,
                 results[ix][1],
                 color=cmap(i / len(noncrappy_results)))

    plt.xlim(0, 5e3)
    plt.xlabel("FLOPs (thousands)")
    plt.ylabel("Cumulative policy reward")
    colorbar = plt.colorbar(
        matplotlib.cm.ScalarMappable(
            cmap=cmap,
            norm=matplotlib.colors.Normalize(
                vmin=0, vmax=100 * max(noncrappy_results) / lake.num_states)))
    colorbar.set_label("E-stop states (%)", rotation=270, labelpad=25)
    plt.tight_layout()
    plt.savefig("figs/value_iteration_sweep.pdf")

    plt.figure()
    plt.plot(100 * np.array(noncrappy_results) / lake.num_states,
             [results[i][1][-1] for i in noncrappy_results])
    plt.xlabel("States removed (%)")
    plt.ylabel("Optimal policy cumulative reward")
    plt.tight_layout()
    plt.savefig("figs/num_removed_vs_policy_reward.pdf")

Example #7

def main():
  np.random.seed(0)

  # lake_map = frozenlake.MAP_CORRIDOR_4x1
  lake_map = frozenlake.MAP_8x8
  policy_evaluation_frequency = 100
  gamma = 0.99

  lake = frozenlake.Lake(lake_map)
  env = build_env(lake)
  print(
      f"Optimal policy reward on full env: {frozenlake.optimal_policy_reward(env, gamma)}"
  )

  # Estimate hitting probabilities.
  state_action_values, _ = frozenlake.value_iteration(
      env,
      gamma,
      tolerance=1e-6,
  )
  optimal_policy = frozenlake.deterministic_policy(
      env, np.argmax(state_action_values, axis=-1))
  estimated_hp = frozenlake.estimate_hitting_probabilities(
      env,
      optimal_policy,
      num_rollouts=1000,
  )
  estimated_hp2d = lake.reshape(estimated_hp)

  # Build e-stop environment.
  estop_map = np.copy(lake_map)
  percentile = 50
  threshold = np.percentile(estimated_hp, percentile)
  estop_map[estimated_hp2d <= threshold] = "E"

  estop_lake = frozenlake.Lake(estop_map)
  estop_env = build_env(estop_lake)
  print(
      f"Optimal policy reward on e-stop: {frozenlake.optimal_policy_reward(estop_env, gamma)}"
  )

  plt.figure()
  viz.plot_heatmap(estop_lake, np.zeros(estop_lake.num_states))
  plt.title("E-stop map")

  plt.figure()
  viz.plot_heatmap(lake, np.zeros(lake.num_states))
  plt.title("Full map")

  plt.show()

  plt.figure()
  for seed in range(1):
    np.random.seed(seed)

    x0 = 1e-2 * np.random.randn(estop_env.lake.num_states,
                                frozenlake.NUM_ACTIONS)
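    # x0 has shape (num_states, NUM_ACTIONS); presumably run_reinforce treats it
    # as a table of per-state action preferences (logits), with the small random
    # init breaking symmetry. This is inferred from usage here, not from the
    # reinforce module itself.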
    optimizer = optimizers.Adam(x0, learning_rate=1e-3)
    # optimizer = reinforce.Momentum(x0, learning_rate=1e-2, mass=0.0)
    states_seen, policy_rewards = reinforce.run_reinforce(
        estop_env,
        gamma,
        optimizer,
        num_episodes=50000,
        policy_evaluation_frequency=policy_evaluation_frequency)

    plt.plot(states_seen, policy_rewards)

  plt.axhline(frozenlake.optimal_policy_reward(env, gamma),
              color="grey",
              linestyle="--")
  plt.axhline(frozenlake.optimal_policy_reward(estop_env, gamma),
              color="grey",
              linestyle="--")
  plt.title(f"Learning rate={optimizer.learning_rate}")
  plt.show()