import numpy as np
from itertools import product

# LinearPolicy, eps_greedy_func_policy, eps_greedy_policy, print_episode,
# create_line_plot, encode_state, encode_sa_pair, ShortCorridor and
# REINFORCE_baseline are helpers defined elsewhere in this project.


def semi_gradient_n_step_sarsa(env, n, alpha, gamma, epsilon, n_episodes, \
                               tile_coder, action_len, stop_threshold):
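    # Episodic semi-gradient n-step Sarsa with a linear approximator over
    # tile-coded (state, action) features (Sutton & Barto, Ch. 10). The update
    # target is the n-step return
    #   G = sum_{i=tau+1}^{min(tau+n, T)} gamma^(i-tau-1) * R_i
    #       + gamma^n * q(S_(tau+n), A_(tau+n), w)   (bootstrap only if tau+n < T)
    # and the weights move towards it along the gradient, which for a linear
    # approximator is simply the feature vector x.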
    # Initialization.
    q = LinearPolicy(tile_coder.total_n_tiles, action_len, env.action_space_size)
    # Buffers hold n+1 entries so that S_tau/A_tau are still available when
    # S_(tau+n)/A_(tau+n) are written (indices are taken modulo n+1).
    states = [None] * (n + 1)
    actions = np.zeros(n + 1, dtype=int)
    rewards = np.zeros(n + 1)
    all_steps = []

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        states[0] = obs
        a = eps_greedy_func_policy(q, obs, epsilon, tile_coder, \
                                   env.action_space_size)
        actions[0] = a
        t = 0
        tau = -1
        T = np.inf

        while not done or tau != T-1:
            if t < T:
                obs_prime, reward, done = env.step(a)
                rewards[(t+1) % (n+1)] = reward
                states[(t+1) % (n+1)] = obs_prime
                if done:
                    T = t+1
                else:
                    a = eps_greedy_func_policy(q, obs_prime, epsilon, \
                        tile_coder, env.action_space_size)
                    actions[(t+1) % (n+1)] = a
            tau = t-n+1
            if tau > -1:
                # Calculate the n-step return over R_(tau+1) ... R_min(tau+n, T).
                G = np.sum([gamma**(i-tau-1) * rewards[i % (n+1)] \
                    for i in range(tau+1, min(tau+n, T)+1)])
                if tau + n < T:
                    # Bootstrap from the estimated value of (S_(tau+n), A_(tau+n)).
                    x_n = tile_coder.get_feature_vector(states[(tau+n) % (n+1)],
                                                        actions[(tau+n) % (n+1)])
                    G += gamma**n * q.evaluate(x_n)
                # Use separate names so the behaviour action `a` is not overwritten.
                s_tau = states[tau % (n+1)]
                a_tau = actions[tau % (n+1)]
                x = tile_coder.get_feature_vector(s_tau, a_tau)
                # Semi-gradient update: w += alpha * (G - q(S_tau, A_tau, w)) * x.
                q.weights += alpha * (G - q.evaluate(x)) * x
            t += 1
        print_episode(episode, n_episodes)
        # Stop training if state-action value function has converged.
        if len(all_steps) > 10 and sum(all_steps[-10:]) < stop_threshold:
            break
        # Store steps for plotting.
        all_steps.append(env.steps)
    # Plot agent performance during training.
    create_line_plot(range(len(all_steps)), all_steps, 'Episode number:', \
    'Number of steps:', 'Number of steps required to reach goal during training:')
    print_episode(n_episodes, n_episodes)
    return q
def differential_semi_gradient_n_step_sarsa(env, n, alpha, beta, epsilon, \
                        n_episodes, tile_coder, action_vec_dim, stop_threshold):
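    # Differential semi-gradient n-step Sarsa for the average-reward setting
    # (Sutton & Barto, Ch. 10): instead of discounting, rewards are measured
    # relative to the running average-reward estimate r_bar, and the TD error
    #   delta = sum_{i=tau+1}^{tau+n} (R_i - r_bar)
    #           + q(S_(tau+n), A_(tau+n), w) - q(S_tau, A_tau, w)
    # drives both the weight update and the r_bar update.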
    # Initialization.
    q = LinearPolicy(tile_coder.total_n_tiles, action_vec_dim,
                     env.action_space_size)
    r_bar = 0
    # Buffers of length n+1, indexed modulo n+1 (see semi_gradient_n_step_sarsa).
    states = [None] * (n + 1)
    actions = np.zeros(n + 1, dtype=int)
    rewards = np.zeros(n + 1)
    all_steps = []

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        states[0] = obs
        a = eps_greedy_func_policy(q, obs, epsilon, tile_coder, \
                                   env.action_space_size)
        actions[0] = a
        t = 0
        tau = -1

        while not done:
            obs, reward, done = env.step(a)
            states[(t + 1) % (n+1)] = obs
            rewards[(t + 1) % (n+1)] = reward
            a = eps_greedy_func_policy(q, obs, epsilon, tile_coder, \
                                       env.action_space_size)
            actions[(t + 1) % (n+1)] = a
            tau = t - n + 1
            if tau > -1:
                x = tile_coder.get_feature_vector(states[tau % (n+1)],
                                                  actions[tau % (n+1)])
                x_n = tile_coder.get_feature_vector(states[(tau+n) % (n+1)], \
                                                    actions[(tau+n) % (n+1)])
                # n-step TD error relative to the average-reward estimate.
                summ = np.sum(
                    [rewards[i % (n+1)] - r_bar for i in range(tau + 1, tau + n + 1)])
                delta = summ + q.evaluate(x_n) - q.evaluate(x)
                r_bar += beta * delta
                q.weights += alpha * delta * x
            t += 1
        # Stop training if state-action value function has converged.
        if len(all_steps) > 10 and sum(all_steps[-10:]) < stop_threshold:
            break
        # Store steps for plotting.
        all_steps.append(env.steps)
        print_episode(episode, n_episodes)
    # Plot agent performance during training.
    create_line_plot(range(len(all_steps)), all_steps, 'Episode number:', \
    'Number of steps:', 'Number of steps required to reach goal during training:')
    print_episode(n_episodes, n_episodes)
    return q
def Q_learning(env, alpha, gamma, epsilon, n_episodes):
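    # Tabular Q-learning on a racetrack-style grid environment. The table Q is
    # a dict keyed by (((pos_x, pos_y), (dy, dx)), action) with 9 discrete
    # actions (presumably the 3 x 3 combinations of velocity changes); the
    # update uses the off-policy target reward + gamma * max_a Q(s', a).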
    # Initialize state-action value function.
    Q = {}
    curr_row = 0
    for row, col in env.state_space:
        for i in range(curr_row, curr_row + row):
            positions = product([i], range(col))
            velocities = product(range(-3, 1), range(-2, 3))
            states = product(positions, velocities)
            sa_pairs = product(states, range(9))
            # Key: (((pos_x, pos_y), (dy, dx)), action)
            for pair in sa_pairs:
                Q[pair] = 0
        curr_row += row

    # Store rewards for plot.
    rewards = []
    # Linearly decay epsilon each episode, but never below a floor of 0.1.
    decay = lambda x: max(x - 2 / n_episodes, 0.1)

    for episode in range(n_episodes):
        done = False
        val = 0
        obs = env.reset()

        while not done:
            action = eps_greedy_policy(Q, obs, epsilon, env.action_space_size)
            obs_prime, reward, done = env.step(action)
            val += reward
            if done:
                # Terminal transition: do not bootstrap from the next state.
                target = reward
            else:
                action_values = [Q[obs_prime, i] for i in range(9)]
                opt_a = np.argmax(action_values)
                target = reward + gamma * Q[obs_prime, opt_a]
            # Update state-action value estimate.
            Q[obs, action] += alpha * (target - Q[obs, action])
            obs = obs_prime
        epsilon = decay(epsilon)
        rewards.append(val)
        if episode % 10 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)

    # Plot rewards over training process.
    create_line_plot(range(len(rewards)), rewards, 'Episode number:', \
                    'Return:', 'Agent returns over training:')
    return Q
def semi_gradient_sarsa(env, alpha, gamma, epsilon, n_episodes, tile_coder,
                        action_len):
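    # One-step episodic semi-gradient Sarsa with linear function approximation
    # over tile-coded features. On terminal transitions the target is just the
    # reward; otherwise it is reward + gamma * q(S', A', w).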
    # Initialization.
    q = LinearPolicy(tile_coder.total_n_tiles, action_len,
                     env.action_space_size)
    all_steps = []

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        a = eps_greedy_func_policy(q, obs, epsilon, tile_coder, \
                                   env.action_space_size)

        while not done:
            obs_prime, reward, done = env.step(a)
            x = tile_coder.get_feature_vector(obs, a)
            if done:
                # Terminal transition: the target is just the reward.
                q.weights += alpha * (reward - q.evaluate(x)) * x
            else:
                a_prime = eps_greedy_func_policy(q, obs_prime, epsilon, \
                          tile_coder, env.action_space_size)
                x_prime = tile_coder.get_feature_vector(obs_prime, a_prime)
                # Semi-gradient Sarsa update towards the one-step target.
                q.weights += alpha * (reward + \
                             gamma * q.evaluate(x_prime) - q.evaluate(x)) * x
                obs = obs_prime
                a = a_prime
        # Store steps for plotting.
        all_steps.append(env.steps)
        print_episode(episode, n_episodes)
    # Plot agent performance over training.
    create_line_plot(range(len(all_steps)), all_steps, 'Episode number:', \
    'Number of steps:', 'Number of steps required to reach goal during training:')
    print_episode(n_episodes, n_episodes)
    return q
def differential_semi_gradient_sarsa(env, alpha, beta, epsilon, n_episodes,\
                                     tile_coder, action_vec_dim, stop_threshold):
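    # One-step differential semi-gradient Sarsa (average-reward setting):
    # delta = R - r_bar + q(S', A', w) - q(S, A, w), with r_bar updated by
    # beta * delta and the weights by alpha * delta * x.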
    # Initialization.
    q = LinearPolicy(tile_coder.total_n_tiles, action_vec_dim, env.action_space_size)
    r_bar = 0
    all_steps = []

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        a = eps_greedy_func_policy(q, obs, epsilon, tile_coder, \
                                   env.action_space_size)

        while not done:
            obs_prime, reward, done = env.step(a)
            a_prime = eps_greedy_func_policy(q, obs_prime, epsilon, tile_coder,\
                                       env.action_space_size)
            x = tile_coder.get_feature_vector(obs, a)
            x_prime = tile_coder.get_feature_vector(obs_prime, a_prime)
            delta = reward - r_bar + q.evaluate(x_prime) - q.evaluate(x)
            r_bar += beta * delta
            # Update weights.
            q.weights += alpha * delta * x
            obs = obs_prime
            a = a_prime
        # Stop training if state-action value function has converged.
        if len(all_steps) > 10 and sum(all_steps[-10:]) < stop_threshold:
            break
        # Store steps for plotting.
        all_steps.append(env.steps)
        print_episode(episode, n_episodes)
    # Plot agent performance during training.
    create_line_plot(range(len(all_steps)), all_steps, 'Episode number:', \
    'Number of steps:', 'Number of steps required to reach goal during training:')
    print_episode(n_episodes, n_episodes)
    return q
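
# Per-time-step update from REINFORCE with baseline. The surrounding
# REINFORCE_baseline(env, alpha_th, alpha_w, gamma, n_episodes) definition and
# the code that generates `states`, `actions` and `rewards` for one episode
# are not shown in this excerpt; `v` is a linear state-value baseline and
# `policy` presumably a softmax policy whose eligibility_vector returns the
# gradient of ln pi(A_t | S_t, theta).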
        for t in range(len(states)):
            # Return following time t (gamma = 1 reduces this to a plain sum).
            G_t = sum(gamma**(k - t - 1) * rewards[k]
                      for k in range(t + 1, len(rewards)))
            x_t = encode_state(states[t], env.observation_space_size)
            delta = G_t - v.evaluate(x_t)
            # Baseline (state-value) update.
            v.weights += alpha_w * (gamma**t) * delta * x_t
            all_sa_pairs = [encode_sa_pair(states[t], a, env.observation_space_size, \
            env.action_space_size) for a in range(env.action_space_size)]
            # Policy update: theta += alpha_th * gamma^t * delta * grad(ln pi).
            policy.weights += alpha_th * (gamma ** t) * delta * \
                              policy.eligibility_vector(actions[t], all_sa_pairs)

        returns.append(sum(rewards[1:]))
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return (policy, np.array(returns))


if __name__ == '__main__':
    gamma = 1
    alpha_w = 0.001
    alpha_th = 0.000001
    n_episodes = 1000
    env = ShortCorridor()

    all_returns = np.array([REINFORCE_baseline(env, alpha_th, alpha_w, gamma, \
                  n_episodes)[1] for i in range(150)])
    # Average the per-episode returns over the independent runs.
    all_returns = np.mean(all_returns, axis=0)
    create_line_plot(range(all_returns.shape[0]), all_returns, 'Episode number:', \
    'Average return:', 'Returns averaged over 150 independent runs:')