Example #1
def REINFORCE(env, alpha, gamma, n_episodes):
    # Monte Carlo policy gradient (REINFORCE) with a linear softmax policy
    # over binary state-action features.
    policy = ExponentialSoftmax(env.observation_space_size *
                                env.action_space_size)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size,
                                       env.action_space_size)
                        for a in range(env.action_space_size)]
        a = policy.sample_action(all_sa_pairs)
        states = [obs]
        actions = [a]
        rewards = [None]

        while not done:
            obs, reward, done = env.step(a)
            all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size,
                                           env.action_space_size)
                            for a in range(env.action_space_size)]
            a = policy.sample_action(all_sa_pairs)
            states.append(obs)
            actions.append(a)
            rewards.append(reward)

        for t in range(len(states)):
            # Discounted return from step t: G_t = sum_k gamma^k * R_{t+1+k}.
            G_t = sum((gamma ** k) * r for k, r in enumerate(rewards[t + 1:]))
            all_sa_pairs = [encode_sa_pair(states[t], a, env.observation_space_size,
                                           env.action_space_size)
                            for a in range(env.action_space_size)]
            policy.weights += alpha * (gamma ** t) * G_t * \
                              policy.eligibility_vector(actions[t], all_sa_pairs)

        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy
def one_step_actor_critic(env, alpha_th, alpha_w, gamma, n_episodes):
    # One-step actor-critic: a softmax policy (actor) and a linear state-value
    # function (critic) updated from the one-step TD error.
    policy = ExponentialSoftmax(env.observation_space_size *
                                env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.observation_space_size)
        I = 1

        while not done:
            sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
                       env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(sa_pairs)
            obs_prime, reward, done = env.step(a)
            obs_prime_vec = encode_state(obs_prime, env.observation_space_size)
            delta = reward + gamma * v.evaluate(obs_prime_vec) - v.evaluate(obs_vec)
            v.weights += alpha_w * I * delta * obs_vec
            policy.weights += (alpha_th * I * delta *
                               policy.eligibility_vector(a, sa_pairs))
            # I accumulates the discount gamma^t over the episode.
            I *= gamma
            obs_vec = obs_prime_vec
            obs = obs_prime

        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy
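
A minimal usage sketch for the two routines above. The environment object and all hyperparameter values are assumptions for illustration; the code only relies on env exposing observation_space_size, action_space_size, reset(), and step(action) returning (obs, reward, done), as in the examples.

# Hypothetical invocation; GridWorld and the step sizes are placeholders.
env = GridWorld()
reinforce_policy = REINFORCE(env, alpha=0.05, gamma=0.99, n_episodes=2000)
ac_policy = one_step_actor_critic(env, alpha_th=0.05, alpha_w=0.1,
                                  gamma=0.99, n_episodes=2000)
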
Example #3
import numpy as np


def actor_critic_eligibility_traces(env, eta, alpha_th, alpha_w, lambda_th,
                                    lambda_w, gamma, n_episodes):
    # Actor-critic with eligibility traces in the average-reward (continuing)
    # formulation: R_bar estimates the average reward, so gamma is not used
    # inside the update.
    policy = ExponentialSoftmax(env.observation_space_size *
                                env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)
    z_th = np.zeros(env.observation_space_size * env.action_space_size)
    z_w = np.zeros(env.observation_space_size)
    R_bar = 0

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.observation_space_size)

        while not done:
            sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
                        env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(sa_pairs)
            obs_prime, reward, done = env.step(a)
            obs_prime_vec = encode_state(obs_prime, env.observation_space_size)
            delta = (reward - R_bar + v.evaluate(obs_prime_vec) -
                     v.evaluate(obs_vec))
            R_bar += eta * delta
            z_w = lambda_w * z_w + obs_vec
            z_th = lambda_th * z_th + policy.eligibility_vector(a, sa_pairs)
            v.weights += alpha_w * delta * z_w
            policy.weights += alpha_th * delta * z_th
            obs_vec = obs_prime_vec
            obs = obs_prime

        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy
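
A hedged example call for the trace-based actor-critic above; the environment and every hyperparameter value are placeholders, not values taken from the original code (gamma is accepted but unused by this variant).

# Hypothetical invocation; env and the step sizes are assumptions.
policy = actor_critic_eligibility_traces(env, eta=0.01, alpha_th=0.05,
                                         alpha_w=0.1, lambda_th=0.9,
                                         lambda_w=0.9, gamma=1.0,
                                         n_episodes=2000)
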
Example #4
import numpy as np


def sarsa_lambda(env, lamda, alpha, gamma, epsilon, n_episodes):
    # Sarsa(lambda) with binary state-action features, linear function
    # approximation and accumulating eligibility traces.
    # Initialize state-action value function.
    q = LinearPolicy(env.observation_space_size * env.action_space_size, 0,
                     env.action_space_size)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        action = eps_greedy_policy_bin_features(q, obs, epsilon, \
                 env.observation_space_size, env.action_space_size)
        z = np.zeros(env.observation_space_size * env.action_space_size)

        while not done:
            obs_prime, reward, done = env.step(action)
            delta = reward
            sa_vec = encode_sa_pair(obs, action, env.observation_space_size, \
                                    env.action_space_size)
            idx_active = np.argwhere(sa_vec == 1)
            delta -= np.sum(q.weights[idx_active])
            # Accumulating traces.
            z[idx_active] += 1

            if done:
                # Update weights.
                q.weights += alpha * delta * z
            else:
                action_prime = eps_greedy_policy_bin_features(q, obs_prime, epsilon, \
                               env.observation_space_size, env.action_space_size)
                sa_prime_vec = encode_sa_pair(obs_prime, action_prime, \
                               env.observation_space_size, env.action_space_size)
                idx_active = np.argwhere(sa_prime_vec == 1)
                delta += gamma * np.sum(q.weights[idx_active])
                # Update weights.
                q.weights += alpha * delta * z
                # Update accumulating traces.
                z = gamma * lamda * z
                obs = obs_prime
                action = action_prime
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return q
def REINFORCE_baseline(env, alpha_th, alpha_w, gamma, n_episodes):
    # REINFORCE with a learned state-value baseline: the critic v reduces the
    # variance of the Monte Carlo policy-gradient update.
    policy = ExponentialSoftmax(env.observation_space_size *
                                env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)

    returns = []
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size,
                                       env.action_space_size)
                        for a in range(env.action_space_size)]
        a = policy.sample_action(all_sa_pairs)
        states = [obs]
        actions = [a]
        rewards = [None]

        while not done:
            obs, reward, done = env.step(a)
            all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size,
                                           env.action_space_size)
                            for a in range(env.action_space_size)]
            a = policy.sample_action(all_sa_pairs)
            states.append(obs)
            actions.append(a)
            rewards.append(reward)

        for t in range(len(states)):
            # Discounted return from step t: G_t = sum_k gamma^k * R_{t+1+k}.
            G_t = sum((gamma ** k) * r for k, r in enumerate(rewards[t + 1:]))
            x_t = encode_state(states[t], env.observation_space_size)
            delta = G_t - v.evaluate(x_t)
            v.weights += alpha_w * (gamma**t) * delta * x_t
            all_sa_pairs = [encode_sa_pair(states[t], a, env.observation_space_size,
                                           env.action_space_size)
                            for a in range(env.action_space_size)]
            # Policy step scaled by the baseline-corrected return delta = G_t - v(x_t).
            policy.weights += alpha_th * (gamma ** t) * delta * \
                              policy.eligibility_vector(actions[t], all_sa_pairs)

        returns.append(sum(rewards[1:]))
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return (policy, np.array(returns))
def online_sarsa_lambda(env, lamda, alpha, gamma, epsilon, n_episodes):
    # True online Sarsa(lambda) with binary state-action features and dutch
    # eligibility traces.
    # Initialize state-action value function.
    q = LinearPolicy(env.observation_space_size * env.action_space_size, 0,
                     env.action_space_size)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        a = eps_greedy_policy_bin_features(q, obs, epsilon, env.observation_space_size, \
                                           env.action_space_size)
        x = encode_sa_pair(obs, a, env.observation_space_size,
                           env.action_space_size)
        z = np.zeros(env.observation_space_size * env.action_space_size)
        Q_old = 0

        while not done:
            obs_prime, reward, done = env.step(a)
            a_prime = eps_greedy_policy_bin_features(q, obs_prime, epsilon, \
                      env.observation_space_size, env.action_space_size)
            x_prime = encode_sa_pair(obs_prime, a_prime, env.observation_space_size, \
                                     env.action_space_size)
            Q = q.evaluate(x)
            Q_prime = q.evaluate(x_prime)
            delta = reward + gamma * Q_prime - Q
            # Update eligibility traces.
            z = (gamma * lamda * z +
                 (1 - alpha * gamma * lamda * np.dot(z, x)) * x)
            # Update weights.
            q.weights += (alpha * (delta + Q - Q_old) * z -
                          alpha * (Q - Q_old) * x)
            Q_old = Q_prime
            x = x_prime
            a = a_prime
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return q
def test_policy(env, policy, n_tests):
    # Roll out the greedy policy for n_tests episodes, rendering each step.
    # TODO: Move this function.
    import time
    input('Press any key to begin tests.')
    for i in range(n_tests):
        done = False
        obs = env.reset()
        env.render()
        time.sleep(0.3)
        while not done:
            all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size,
                                           env.action_space_size)
                            for a in range(env.action_space_size)]
            a = policy.greedy_action(all_sa_pairs)
            obs, _, done = env.step(a)
            env.render()
            time.sleep(0.3)
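
A sketch of how training and evaluation might be wired together; env, the hyperparameters, and the episode count below are assumptions. Only the interfaces used above are relied on: REINFORCE_baseline returns (policy, returns), and test_policy needs a policy object exposing greedy_action and an env with render.

# Hypothetical end-to-end run; all values are placeholders.
policy, returns = REINFORCE_baseline(env, alpha_th=0.05, alpha_w=0.1,
                                     gamma=0.99, n_episodes=2000)
test_policy(env, policy, n_tests=5)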