Example #1
from typing import Optional

import numpy as np

import frozenlake
import utils


def reinforce_episode(env,
                      gamma: float,
                      optimizer,
                      max_episode_length: Optional[int] = None):
    raw_policy = utils.softmax(optimizer.get(), axis=-1)
    # epsilon_greedy_policy = 0.9 * raw_policy + 0.1 * np.ones(
    #     (env.lake.num_states, frozenlake.NUM_ACTIONS)) / frozenlake.NUM_ACTIONS
    episode, final_state = frozenlake.rollout(
        env, policy=raw_policy, max_episode_length=max_episode_length)
    # Discount each reward by gamma^t so the reverse cumulative sum below
    # gives Gs[t] = gamma^t * G_t, the discounted return from time t.
    weighted_rewards = [(gamma**t) * r for t, (_, _, r) in enumerate(episode)]

    # pylint: disable=line-too-long
    # See https://stackoverflow.com/questions/16541618/perform-a-reverse-cumulative-sum-on-a-numpy-array.
    Gs = np.cumsum(weighted_rewards[::-1])[::-1]

    grad = np.zeros((env.lake.num_states, frozenlake.NUM_ACTIONS))

    for t, (state, action, _) in enumerate(episode):
        # Reuse the gradient buffer in place rather than reallocating it.
        grad[:, :] = 0.0
        # Score-function gradient of log pi(action | state) for a softmax
        # policy: one-hot(action) minus pi(. | state) on the visited row.
        grad[state, :] -= utils.softmax(optimizer.get()[state, :])
        grad[state, action] += 1.0
        # Scale by the discounted return from time t (the REINFORCE weight).
        grad *= Gs[t]

        optimizer.step(-grad)

    return episode, final_state
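
The example only assumes that `optimizer` exposes `get()` and `step()`; the concrete optimizer class is not shown on this page. A minimal sketch of a compatible object, assuming a plain tabular SGD update with a minimization convention (which is why the code above passes `-grad` to perform gradient ascent), could look like the following; the class name and learning rate are hypothetical.

import numpy as np


class SGDOptimizer:
    """Hypothetical tabular SGD optimizer matching the get()/step() interface
    assumed above; step() follows a minimization convention."""

    def __init__(self, params: np.ndarray, learning_rate: float = 0.1):
        self._params = params.astype(float)
        self._lr = learning_rate

    def get(self) -> np.ndarray:
        # Current parameter table, e.g. policy logits of shape
        # (num_states, NUM_ACTIONS).
        return self._params

    def step(self, grad: np.ndarray) -> None:
        # Descend along `grad`; callers negate an ascent direction.
        self._params -= self._lr * grad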
Example #2
def actor_critic_episode(env,
                         gamma: float,
                         actor_optimizer,
                         critic_optimizer,
                         max_episode_length: Optional[int] = None):
    # Start off by sampling an initial state from the initial_state distribution.
    current_state = np.random.choice(env.lake.num_states,
                                     p=env.initial_state_distribution)
    episode = []

    # Preallocate gradient buffers; they are zeroed and reused every step.
    actor_grad = np.zeros((env.lake.num_states, frozenlake.NUM_ACTIONS))
    critic_grad = np.zeros((env.lake.num_states,))

    t = 0
    while max_episode_length is None or t < max_episode_length:
        # Take a step.
        action_probs = utils.softmax(actor_optimizer.get()[current_state, :],
                                     axis=-1)
        action = np.random.choice(frozenlake.NUM_ACTIONS, p=action_probs)
        next_state = np.random.choice(env.lake.num_states,
                                      p=env.transitions[current_state,
                                                        action, :])
        reward = env.rewards[current_state, action, next_state]

        # One-step TD error: used as the advantage estimate for the actor and
        # as the TD(0) update direction for the tabular critic.
        v = critic_optimizer.get()
        delta = reward + gamma * v[next_state] - v[current_state]

        # Actor: score-function gradient scaled by the TD error and by
        # gamma^t, the discount applied to time step t in the objective.
        actor_grad[:, :] = 0.0
        actor_grad[current_state, :] -= action_probs
        actor_grad[current_state, action] += 1.0
        actor_grad *= delta * (gamma**t)

        # Critic: tabular TD(0) update for the visited state only.
        critic_grad[:] = 0.0
        critic_grad[current_state] = delta

        actor_optimizer.step(-actor_grad)
        critic_optimizer.step(-critic_grad)

        # Continue...
        episode.append((current_state, action, reward))
        current_state = next_state
        t += 1

        if current_state in env.terminal_states:
            break

    return episode, current_state
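
Here the actor parameters are a table of per-state action logits and the critic is a tabular state-value estimate, as the gradient shapes above imply. A hedged usage sketch, reusing the hypothetical SGDOptimizer from Example #1 and assuming `env` is a frozenlake environment constructed elsewhere; the gamma and max_episode_length values are illustrative:

actor_optimizer = SGDOptimizer(
    np.zeros((env.lake.num_states, frozenlake.NUM_ACTIONS)))
critic_optimizer = SGDOptimizer(np.zeros((env.lake.num_states,)))

episode, final_state = actor_critic_episode(env,
                                             gamma=0.99,
                                             actor_optimizer=actor_optimizer,
                                             critic_optimizer=critic_optimizer,
                                             max_episode_length=200)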
Example #3
def run_reinforce(env,
                  gamma: float,
                  optimizer,
                  num_episodes: int,
                  policy_evaluation_frequency: int = 10,
                  verbose: bool = True):
    # We use this to warm start iterative policy evaluation.
    V = None

    states_seen = 0
    states_seen_log = []
    policy_rewards_log = []
    for episode_num in range(num_episodes):
        episode, _ = reinforce_episode(env,
                                       gamma,
                                       optimizer,
                                       max_episode_length=None)
        # print(f"episode length {len(episode)}")
        states_seen += len(episode)

        if episode_num % policy_evaluation_frequency == 0:
            policy = utils.softmax(optimizer.get(), axis=-1)
            V, _ = frozenlake.iterative_policy_evaluation(
                env,
                gamma,
                policy,
                tolerance=1e-6,
                init_V=V,
            )
            policy_reward = np.dot(V, env.initial_state_distribution)

            if verbose:
                print(f"Episode {episode_num}, policy reward: {policy_reward}")

            states_seen_log.append(states_seen)
            policy_rewards_log.append(policy_reward)

    return states_seen_log, policy_rewards_log
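
A hedged driver sketch, again using the hypothetical SGDOptimizer from Example #1; the discount factor, episode count, and learning rate are illustrative, and `env` is assumed to be a frozenlake environment built elsewhere (its constructor is not shown on this page):

optimizer = SGDOptimizer(
    np.zeros((env.lake.num_states, frozenlake.NUM_ACTIONS)),
    learning_rate=0.1)
states_seen_log, policy_rewards_log = run_reinforce(env,
                                                    gamma=0.99,
                                                    optimizer=optimizer,
                                                    num_episodes=2000,
                                                    verbose=False)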
Example #4
def run_actor_critic(env,
                     gamma: float,
                     actor_optimizer,
                     critic_optimizer,
                     num_episodes: int,
                     policy_evaluation_frequency: int = 10,
                     verbose: bool = True):
    # We use this to warm start iterative policy evaluation.
    V = None

    states_seen = 0
    states_seen_log = []
    policy_rewards_log = []
    for episode_num in range(num_episodes):
        episode, _ = actor_critic_episode(env,
                                          gamma,
                                          actor_optimizer,
                                          critic_optimizer,
                                          max_episode_length=None)
        # print(f"episode length {len(episode)}")
        states_seen += len(episode)

        if episode_num % policy_evaluation_frequency == 0:
            policy = utils.softmax(actor_optimizer.get(), axis=-1)
            V, _ = frozenlake.iterative_policy_evaluation(
                env,
                gamma,
                policy,
                tolerance=1e-6,
                init_V=V,
            )
            policy_reward = np.dot(V, env.initial_state_distribution)

            if verbose:
                print(f"Episode {episode_num}, policy reward: {policy_reward}")
                # print(env.lake.reshape(critic_optimizer.get()))

            states_seen_log.append(states_seen)
            policy_rewards_log.append(policy_reward)

    return states_seen_log, policy_rewards_log
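
Both drivers return parallel logs of cumulative states seen and evaluated policy reward, so the two methods can be compared on the same axes. A minimal plotting sketch, assuming matplotlib is available and that the two runs stored their results in the hypothetical variables named below:

import matplotlib.pyplot as plt

# reinforce_log and ac_log are hypothetical names for the
# (states_seen_log, policy_rewards_log) pairs returned by
# run_reinforce and run_actor_critic.
plt.plot(reinforce_log[0], reinforce_log[1], label="REINFORCE")
plt.plot(ac_log[0], ac_log[1], label="actor-critic")
plt.xlabel("states seen")
plt.ylabel("policy reward (V weighted by initial state distribution)")
plt.legend()
plt.show()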