Example #1
  def one(noise_scale: float):
    policy_actions = np.argmax(Q + noise_scale * np.random.randn(*Q.shape),
                               axis=-1)
    policy = frozenlake.deterministic_policy(env, policy_actions)
    V, _ = frozenlake.iterative_policy_evaluation(env,
                                                  gamma,
                                                  policy,
                                                  tolerance=1e-6)
    policy_value = np.dot(V, env.initial_state_distribution)

    # Calculate the value of the optimal policy in the exact e-stop environment.
    policy_transitions = np.array([
        env.transitions[i, policy_actions[i], :]
        for i in range(lake.num_states)
    ])
    try:
      exact_hp, _ = frozenlake.markov_chain_stats(env, policy_transitions)
    except np.linalg.LinAlgError:
      # Sometimes the policy is bad and one of the matrices ends up singular.
      return None

    estop_policy_value = estop_map_optimal_policy_value(exact_hp)
    if estop_policy_value is None:
      return None

    return policy_value, estop_policy_value
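
Example #1 calls an `estop_map_optimal_policy_value` helper from its enclosing scope that is not shown. The sketch below is a plausible reconstruction, not the author's code: it mirrors the e-stop construction in Examples #4 and #5 (threshold the hitting probabilities at a percentile, mark the low-probability states as "E", run value iteration on the resulting environment) and assumes `lake_map`, `lake`, `percentile`, `build_env`, and `gamma` are available in the enclosing scope.

  def estop_map_optimal_policy_value(hp):
    # Hypothetical reconstruction: build an e-stop environment by thresholding
    # the hitting probabilities, then return its optimal policy value.
    estop_map = np.copy(lake_map)
    threshold = np.percentile(hp, percentile)
    estop_map[lake.reshape(hp) <= threshold] = "E"
    estop_lake = frozenlake.Lake(estop_map)
    if estop_lake.num_states == 0:
      # Degenerate map; the real helper evidently returns None in some cases.
      return None
    estop_env = build_env(estop_lake)
    estop_Q, _ = frozenlake.value_iteration(estop_env, gamma, tolerance=1e-6)
    estop_V = np.max(estop_Q, axis=-1)
    return np.dot(estop_V, estop_env.initial_state_distribution)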
Example #2
def run_q_learning(
    env: frozenlake.FrozenLakeEnv,
    gamma: float,
    num_episodes: int,
    policy_evaluation_frequency: int = 10,
    verbose: bool = True,
):
  # Initializing to random values is necessary to break ties, preventing the
  # agent from always picking the same action and never getting anywhere.
  Q = np.random.rand(env.lake.num_states, frozenlake.NUM_ACTIONS)

  # This is crucial! There is no positive or negative reward for taking any
  # action in a terminal state. See Sutton & Barto page 131.
  for s in env.terminal_states:
    # It's unclear why this disable is necessary; it only seems needed on
    # CircleCI, even though the pylint version there is exactly the same as
    # the local one.
    # pylint: disable=unsupported-assignment-operation
    Q[s, :] = 0.0

  # We use this to warm start iterative policy evaluation.
  V = None

  states_seen = 0
  states_seen_log = []
  policy_rewards_log = []
  for episode_num in range(num_episodes):
    Q, episode, _ = q_learning_episode(
        env,
        gamma,
        alpha=0.1,
        Q=Q,
        meta_policy=epsilon_greedy(epsilon=0.1),
        # meta_policy=epsilon_greedy_annealed(epsilon=1.0),
        max_episode_length=None)
    states_seen += len(episode)

    if episode_num % policy_evaluation_frequency == 0:
      policy = frozenlake.deterministic_policy(env, np.argmax(Q, axis=-1))
      V, _ = frozenlake.iterative_policy_evaluation(env,
                                                    gamma,
                                                    policy,
                                                    tolerance=1e-6,
                                                    init_V=V)
      policy_reward = np.dot(V, env.initial_state_distribution)

      if verbose:
        print(f"Episode {episode_num}, policy reward: {policy_reward}")

      states_seen_log.append(states_seen)
      policy_rewards_log.append(policy_reward)

    # if (episode_num + 1) % 1000 == 0:
    #   V = np.max(Q, axis=-1)
    #   plt.figure()
    #   viz.plot_heatmap(env, V)
    #   plt.title(f"Episode {episode_num}")
    #   plt.show()

  return states_seen_log, policy_rewards_log
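
`run_q_learning` passes `meta_policy=epsilon_greedy(epsilon=0.1)`, but `epsilon_greedy` itself is not shown. A minimal sketch of what such a factory could look like, assuming the meta-policy is invoked with the current Q table and the current state and returns an action (the actual call convention is not visible here):

def epsilon_greedy(epsilon: float):
  # Hypothetical sketch: explore uniformly with probability epsilon, otherwise
  # act greedily with respect to the current Q estimates.
  def policy(Q, state):
    if np.random.rand() < epsilon:
      return np.random.randint(frozenlake.NUM_ACTIONS)
    return int(np.argmax(Q[state, :]))

  return policy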
Example #3
    def run(num_rollouts: int):
        policy_values = []
        while len(policy_values) < 64:
            # Estimated hitting probabilities
            estimated_hp = frozenlake.estimate_hitting_probabilities(
                env, frozenlake.deterministic_policy(env, policy_actions),
                num_rollouts)
            v = estop_map_optimal_policy_value(estimated_hp)
            if v is not None:
                policy_values.append(v)

        return policy_values
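
One illustrative way to drive `run` (not from the original code; the rollout budgets below are arbitrary) is to sweep over `num_rollouts` and summarize the spread of the 64 Monte Carlo estimates:

    for num_rollouts in (10, 100, 1000):
        values = run(num_rollouts)
        print(f"num_rollouts={num_rollouts}: "
              f"mean={np.mean(values):.4f}, std={np.std(values):.4f}")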
Example #4
def main():
    # pylint: disable=too-many-statements
    np.random.seed(0)

    lake_map = frozenlake.MAP_8x8
    gamma = 0.99

    lake = frozenlake.Lake(lake_map)
    env = build_env(lake)
    state_action_values, policy_rewards_per_iter = frozenlake.value_iteration(
        env, gamma, tolerance=1e-6)
    policy_actions = np.argmax(state_action_values, axis=-1)
    state_values = np.max(state_action_values, axis=-1)

    # Show value function map.
    plt.figure()
    viz.plot_heatmap(lake, state_values)
    # plt.title("FrozenLake-v0 environment")
    plt.tick_params(
        axis="both",
        which="both",
        bottom=False,
        top=False,
        left=False,
        right=False,
        labelbottom=False,
        labeltop=False,
        labelleft=False,
        labelright=False,
    )
    plt.tight_layout()
    plt.savefig("figs/value_function_full_env.pdf")

    # Show hitting probability map.
    policy_transitions = np.array([
        env.transitions[i, policy_actions[i], :]
        for i in range(lake.num_states)
    ])
    hp, esta = frozenlake.markov_chain_stats(env, policy_transitions)
    hp2d = lake.reshape(hp)

    plt.figure()
    viz.plot_heatmap(lake, hp)
    plt.title("Hitting probabilities")
    plt.savefig("figs/hitting_probabilities.pdf")

    # Show estimated hitting probability map.
    estimated_hp = frozenlake.estimate_hitting_probabilities(
        env,
        frozenlake.deterministic_policy(env, policy_actions),
        num_rollouts=1000)
    plt.figure()
    viz.plot_heatmap(lake, estimated_hp)
    plt.title("Estimated hitting probabilities")

    plt.figure()
    viz.plot_heatmap(lake, esta)
    plt.title("Expected number of states to completion")

    # Show optimal policy on top of hitting probabilities.
    plt.figure()
    im = plt.imshow(hp2d)
    for s, a in zip(lake.ij_states, policy_actions):
        i, j = s
        if a == 0:
            arrow = "←"
        elif a == 1:
            arrow = "↓"
        elif a == 2:
            arrow = "→"
        elif a == 3:
            arrow = "↑"
        else:
            raise Exception("bad bad bad")

        im.axes.text(j, i, arrow, {
            "horizontalalignment": "center",
            "verticalalignment": "center"
        })
    plt.title("Optimal policy overlayed on hitting probabilities")
    plt.savefig("figs/optimal_policy.pdf")

    # Show value CDF.
    plt.figure()
    plt.hist(state_values, bins=100, histtype="step", cumulative=True)
    plt.xlabel("V(s)")
    plt.ylabel(f"Number of states (out of {lake.num_states})")
    plt.title("CDF of state values")
    plt.savefig("figs/value_function_cdf.pdf")

    #######

    # Build the e-stop map: states with low hitting probability become e-stop
    # ("E") states.
    estop_map = np.copy(lake_map)
    percentile = 50
    threshold = np.percentile(estimated_hp, percentile)
    # Use <= rather than < because both the estimated hitting probabilities and
    # the threshold can be zero, in which case a strict inequality would leave
    # the map unchanged.
    estop_map[lake.reshape(estimated_hp) <= threshold] = "E"

    estop_lake = frozenlake.Lake(estop_map)
    estop_env = build_env(estop_lake)
    estop_state_action_values, estop_policy_rewards_per_iter = frozenlake.value_iteration(
        estop_env, gamma, tolerance=1e-6)
    estop_state_values = np.max(estop_state_action_values, axis=-1)

    # Show value function map.
    plt.figure()
    viz.plot_heatmap(estop_lake, estop_state_values)
    plt.title(f"E-stop map ({percentile}% of states removed)")
    plt.savefig("figs/estop_map.pdf")

    # Show policy rewards per iteration.
    # Each iteration of value iteration costs 4 * |A| * |S|^2 FLOPs
    # (see the sanity-check sketch after this example):
    #   * multiplying transitions with state_values
    #   * multiplying by gamma
    #   * adding expected_rewards
    #   * max'ing over state_action_values

    plt.figure()
    plt.plot(
        4 * (frozenlake.NUM_ACTIONS *
             (frozenlake.num_mdp_states(lake_map)**2)) *
        np.arange(len(policy_rewards_per_iter)), policy_rewards_per_iter)
    plt.plot(
        4 * (frozenlake.NUM_ACTIONS *
             (frozenlake.num_mdp_states(estop_map)**2)) *
        np.arange(len(estop_policy_rewards_per_iter)),
        estop_policy_rewards_per_iter)
    plt.xlabel("FLOPS")
    plt.ylabel("Policy reward")
    plt.legend(["Full MDP", "E-stop MDP"])
    plt.title("Convergence comparison")
    plt.savefig("figs/convergence_comparison.pdf")

    print(
        f"Exact solution, policy value: {np.dot(env.initial_state_distribution, state_values)}"
    )
    print(
        f"E-stop solution, policy value: {np.dot(env.initial_state_distribution, estop_state_values)}"
    )

    plt.show()
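
The FLOP counts used for the convergence plot's x-axis can be sanity-checked in isolation. This sketch is not part of the original script; it only restates the 4 * |A| * |S|^2 accounting from the comment above and assumes `lake_map` and `estop_map` from `main()` are in scope:

def flops_per_iteration(a_map) -> int:
    # One value-iteration sweep: multiply transitions with state values,
    # multiply by gamma, add expected rewards, and max over actions.
    num_states = frozenlake.num_mdp_states(a_map)
    return 4 * frozenlake.NUM_ACTIONS * num_states**2

print("Full MDP FLOPs/iteration:  ", flops_per_iteration(lake_map))
print("E-stop MDP FLOPs/iteration:", flops_per_iteration(estop_map))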
Example #5
def main():
    np.random.seed(0)

    def build_env(lake: frozenlake.Lake):
        # return frozenlake.FrozenLakeEnv(lake, infinite_time=True)
        return frozenlake.FrozenLakeWithEscapingEnv(
            lake, hole_retention_probability=0.99)

    lake_map = frozenlake.MAP_8x8
    policy_evaluation_frequency = 10
    gamma = 0.99
    num_random_seeds = 96

    results_dir = Path("results/frozenlake_qlearning")
    estop_results_dir = results_dir / "estop"
    full_results_dir = results_dir / "full"
    results_dir.mkdir(parents=True)
    estop_results_dir.mkdir()
    full_results_dir.mkdir()

    # Build the full environment and run value iteration to calculate the optimal
    # policy.
    lake = frozenlake.Lake(lake_map)
    env = build_env(lake)
    state_action_values, _ = frozenlake.value_iteration(env,
                                                        gamma,
                                                        tolerance=1e-6)
    state_values = np.max(state_action_values, axis=-1)
    optimal_policy_reward = np.dot(state_values,
                                   env.initial_state_distribution)

    # Estimate hitting probabilities.
    optimal_policy = frozenlake.deterministic_policy(
        env, np.argmax(state_action_values, axis=-1))
    estimated_hp = frozenlake.estimate_hitting_probabilities(env,
                                                             optimal_policy,
                                                             num_rollouts=1000)
    estimated_hp2d = lake.reshape(estimated_hp)

    # Build e-stop environment.
    estop_map = np.copy(lake_map)
    percentile = 50
    threshold = np.percentile(estimated_hp, percentile)
    estop_map[estimated_hp2d <= threshold] = "E"

    estop_lake = frozenlake.Lake(estop_map)
    estop_env = build_env(estop_lake)

    # Pickle-dump the environment setup/metadata.
    pickle.dump(
        {
            "lake_map": lake_map,
            "policy_evaluation_frequency": policy_evaluation_frequency,
            "gamma": gamma,
            "num_random_seeds": num_random_seeds,
            "lake": lake,
            "env": env,
            "state_action_values": state_action_values,
            "state_values": state_values,
            "optimal_policy_reward": optimal_policy_reward,
            "optimal_policy": optimal_policy,
            "estimated_hp": estimated_hp,
            "estimated_hp2d": estimated_hp2d,
            "estop_map": estop_map,
            "percentile": percentile,
            "threshold": threshold,
            "estop_lake": estop_lake,
            "estop_env": estop_env,
        }, (results_dir / "metadata.pkl").open(mode="wb"))

    pool = Pool()

    # Run Q-learning on the full environment.
    for _ in tqdm.tqdm(pool.imap_unordered(
            functools.partial(
                q_learning_job,
                env=env,
                gamma=gamma,
                policy_evaluation_frequency=policy_evaluation_frequency,
                folder=full_results_dir,
            ), range(num_random_seeds)),
                       desc="full",
                       total=num_random_seeds):
        pass

    # Run Q-learning on the e-stop environment.
    for _ in tqdm.tqdm(pool.imap_unordered(
            functools.partial(
                q_learning_job,
                env=estop_env,
                gamma=gamma,
                policy_evaluation_frequency=policy_evaluation_frequency,
                folder=estop_results_dir,
            ), range(num_random_seeds)),
                       desc="estop",
                       total=num_random_seeds):
        pass
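
`q_learning_job` is mapped over the random seeds but defined elsewhere. A plausible sketch, assuming it seeds the run, calls `run_q_learning` from Example #2, and pickles the learning curves into the given folder (the episode budget and file layout are assumptions):

def q_learning_job(seed: int,
                   env: frozenlake.FrozenLakeEnv,
                   gamma: float,
                   policy_evaluation_frequency: int,
                   folder: Path):
    # Hypothetical sketch: one independent Q-learning run per random seed.
    np.random.seed(seed)
    states_seen, policy_rewards = run_q_learning(
        env,
        gamma,
        num_episodes=10000,  # Assumed budget; the original value is not shown.
        policy_evaluation_frequency=policy_evaluation_frequency,
        verbose=False)
    pickle.dump(
        {
            "seed": seed,
            "states_seen": states_seen,
            "policy_rewards": policy_rewards,
        }, (folder / f"seed_{seed}.pkl").open(mode="wb"))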
Example #6
def main():
  np.random.seed(0)

  # lake_map = frozenlake.MAP_CORRIDOR_4x1
  lake_map = frozenlake.MAP_8x8
  policy_evaluation_frequency = 100
  gamma = 0.99

  lake = frozenlake.Lake(lake_map)
  env = build_env(lake)
  print(
      f"Optimal policy reward on full env: {frozenlake.optimal_policy_reward(env, gamma)}"
  )

  # Estimate hitting probabilities.
  state_action_values, _ = frozenlake.value_iteration(
      env,
      gamma,
      tolerance=1e-6,
  )
  optimal_policy = frozenlake.deterministic_policy(
      env, np.argmax(state_action_values, axis=-1))
  estimated_hp = frozenlake.estimate_hitting_probabilities(
      env,
      optimal_policy,
      num_rollouts=1000,
  )
  estimated_hp2d = lake.reshape(estimated_hp)

  # Build e-stop environment.
  estop_map = np.copy(lake_map)
  percentile = 50
  threshold = np.percentile(estimated_hp, percentile)
  estop_map[estimated_hp2d <= threshold] = "E"

  estop_lake = frozenlake.Lake(estop_map)
  estop_env = build_env(estop_lake)
  print(
      f"Optimal policy reward on e-stop: {frozenlake.optimal_policy_reward(estop_env, gamma)}"
  )

  plt.figure()
  viz.plot_heatmap(estop_lake, np.zeros(estop_lake.num_states))
  plt.title("E-stop map")

  plt.figure()
  viz.plot_heatmap(lake, np.zeros(lake.num_states))
  plt.title("Full map")

  plt.show()

  plt.figure()
  for seed in range(1):
    np.random.seed(seed)

    x0 = 1e-2 * np.random.randn(estop_env.lake.num_states,
                                frozenlake.NUM_ACTIONS)
    optimizer = optimizers.Adam(x0, learning_rate=1e-3)
    # optimizer = reinforce.Momentum(x0, learning_rate=1e-2, mass=0.0)
    states_seen, policy_rewards = reinforce.run_reinforce(
        estop_env,
        gamma,
        optimizer,
        num_episodes=50000,
        policy_evaluation_frequency=policy_evaluation_frequency)

    plt.plot(states_seen, policy_rewards)

  plt.axhline(frozenlake.optimal_policy_reward(env, gamma),
              color="grey",
              linestyle="--")
  plt.axhline(frozenlake.optimal_policy_reward(estop_env, gamma),
              color="grey",
              linestyle="--")
  plt.title(f"Learning rate={optimizer.learning_rate}")
  plt.show()
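
Examples #4 and #6 call a module-level `build_env` that is not shown here; Example #5 defines a local one. A reasonable assumption is that the module-level helper mirrors that local definition:

def build_env(lake: frozenlake.Lake):
  # Assumed to match the local build_env in Example #5: holes retain the agent
  # with probability 0.99 instead of being strictly absorbing.
  return frozenlake.FrozenLakeWithEscapingEnv(lake,
                                              hole_retention_probability=0.99)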