Example #1
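# Shared dependencies: the examples below use numpy and assume the helper classes
# and functions they reference (ExponentialSoftmax, LinearValueFunction,
# encode_state, encode_sa_pair, print_episode, and the tile coder) are defined
# elsewhere in the accompanying code.
import numpy as np
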
def actor_critic_eligibility_traces(env, eta, alpha_th, alpha_w, lambda_th, lambda_w, \
                                    gamma, n_episodes):
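    """Actor-critic with eligibility traces for the continuing (average-reward)
    setting: the average-reward TD error drives a linear critic and an
    exponential-softmax actor, with traces decayed by lambda_w and lambda_th.
    Note that gamma is not used by this average-reward formulation."""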

    policy = ExponentialSoftmax(env.observation_space_size *
                                env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)
    z_th = np.zeros(env.observation_space_size * env.action_space_size)
    z_w = np.zeros(env.observation_space_size)
    R_bar = 0

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.observation_space_size)

        while not done:
            sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
                        env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(sa_pairs)
            obs_prime, reward, done = env.step(a)
            obs_prime_vec = encode_state(obs_prime, env.observation_space_size)
            delta = reward - R_bar + v.evaluate(obs_prime_vec) - v.evaluate(
                obs_vec)
            R_bar += eta * delta
            z_w = lambda_w * z_w + obs_vec
            z_th = lambda_th * z_th + policy.eligibility_vector(a, sa_pairs)
            v.weights += alpha_w * delta * z_w
            policy.weights += alpha_th * delta * z_th
            obs_vec = obs_prime_vec
            obs = obs_prime

        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy

def one_step_actor_critic(env, alpha_th, alpha_w, gamma, n_episodes):
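    """One-step actor-critic (episodic): the TD error delta updates both the
    linear critic and the softmax policy, each scaled by I = gamma**t."""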
    policy = ExponentialSoftmax(env.observation_space_size*env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.observation_space_size)
        I = 1

        while not done:
            sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
                       env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(sa_pairs)
            obs_prime, reward, done = env.step(a)
            obs_prime_vec = encode_state(obs_prime, env.observation_space_size)
            delta = reward + gamma * v.evaluate(obs_prime_vec) - v.evaluate(obs_vec)
            v.weights += alpha_w * I * delta * obs_vec
            policy.weights += alpha_th * I * delta * \
                              policy.eligibility_vector(a, sa_pairs)
            # I accumulates the discount: gamma**t.
            I *= gamma
            obs_vec = obs_prime_vec
            obs = obs_prime

        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy

Example #3
def least_squares_td(env, policy, epsilon, alpha, gamma, n_episodes, tile_coder):
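    """Least-squares TD (LSTD): accumulates A = (1/epsilon)*I + sum x(x - gamma*x')^T
    and b = sum r*x along greedy trajectories, maintains inv(A) incrementally via
    the Sherman-Morrison identity, and returns a linear value function with
    weights inv(A) @ b."""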
    # Initialization.
    n = tile_coder.total_n_tiles
    A = (1/epsilon) * np.eye(n)
    b = np.zeros((n,1))
    d = np.zeros((n,1))

    for episode in range(n_episodes):
        done = False
        obs = env.reset()

        while not done:
            feature_vectors = tile_coder.get_feature_vectors_for_actions(obs, \
                              env.action_space_size)
            a = policy.greedy_action(feature_vectors)
            obs_prime, reward, done = env.step(a)
            x = np.array(tile_coder.get_tile_code(obs)).reshape(-1,1)
            x_prime = np.array(tile_coder.get_tile_code(obs_prime)).reshape(-1,1)
            b = b + reward * x
            d = (x - gamma * x_prime)
            A = A + x @ d.T
            if env.steps == 2:
                inv_A = np.linalg.inv(A)
            else:
                # Maintain inv_A incrementally (Sherman-Morrison) instead of
                # re-inverting A on every step.
                t = np.eye(n) - ((x @ d.T) / (1 + (d.T @ inv_A) @ x)) @ inv_A
                inv_A = inv_A @ t
            theta = inv_A @ b
            obs = obs_prime
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    v = LinearValueFunction(tile_coder.total_n_tiles)
    v.weights = theta.flatten()
    return v

def online_td_lambda(env, lamda, alpha, gamma, n_episodes):
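    """True online TD(lambda): linear value-function prediction with dutch-style
    eligibility traces and the V_old correction term."""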
    # Initialize value function.
    v = LinearValueFunction(env.n_states)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.n_states)
        z = np.zeros(env.n_states)
        V_old = 0

        while not done:
            obs_prime, reward, done = env.step()
            obs_prime_vec = encode_state(obs_prime, env.n_states)
            V = v.evaluate(obs_vec)
            V_prime = v.evaluate(obs_prime_vec)
            delta = reward + gamma * V_prime - V
            # Update eligibility traces.
            z = gamma * lamda * z + (
                1 - alpha * gamma * lamda * np.dot(z, obs_vec)) * obs_vec
            # Update weights.
            v.weights += alpha * (delta + V -
                                  V_old) * z - alpha * (V - V_old) * obs_vec
            V_old = V_prime
            obs_vec = obs_prime_vec
    return v

Example #5
def gradient_mc_prediction(env, policy, alpha, n_episodes, tile_coder):
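    """Gradient Monte Carlo prediction: after each episode, every visited state's
    value is moved toward the (undiscounted) return observed from that state,
    using tile-coded features and a linear value function."""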
    # Initialization.
    v = LinearValueFunction(tile_coder.total_n_tiles)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        # Reset the trajectory buffers at the start of each episode.
        states = []
        rewards = [None]
        # Store the feature vector representation of the state.
        states.append(tile_coder.get_tile_code(obs))
        feature_vectors = tile_coder.get_feature_vectors_for_actions(obs, \
                          env.action_space_size)
        a = policy.greedy_action(feature_vectors)

        while not done:
            obs, reward, done = env.step(a)
            feature_vectors = tile_coder.get_feature_vectors_for_actions(obs, \
                              env.action_space_size)
            a = policy.greedy_action(feature_vectors)
            rewards.append(reward)
            states.append(tile_coder.get_tile_code(obs))

        for i in range(len(states)):
            G = np.sum(rewards[i + 1:])
            # Update weights.
            v.weights += alpha * np.dot((G - v.evaluate(states[i])), states[i])
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return v

Example #6
def semi_gradient_td_lambda(env, lamda, alpha, gamma, n_episodes):
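    """Semi-gradient TD(lambda) prediction: a linear value function updated from
    the one-step TD error scaled by accumulating eligibility traces."""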
    # Initialize value function.
    v = LinearValueFunction(env.n_states)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.n_states)
        z = np.zeros(env.n_states)

        while not done:
            obs_prime, reward, done = env.step()
            obs_prime_vec = encode_state(obs_prime, env.n_states)
            # Update eligibility traces.
            z = gamma * lamda * z + obs_vec
            delta = reward + gamma * v.evaluate(obs_prime_vec) - v.evaluate(obs_vec)
            # Update weights.
            v.weights += alpha * delta * z
            obs_vec = obs_prime_vec
    return v

Example #7
def semi_gradient_td_zero(env, policy, alpha, gamma, n_episodes, tile_coder):
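    """Semi-gradient TD(0) prediction over tile-coded features, with actions
    chosen greedily by the given policy."""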
    # Initialization.
    v = LinearValueFunction(tile_coder.total_n_tiles)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        while not done:
            feature_vectors = tile_coder.get_feature_vectors_for_actions(obs, \
                              env.action_space_size)
            a = policy.greedy_action(feature_vectors)
            obs_prime, reward, done = env.step(a)
            s = tile_coder.get_tile_code(obs)
            s_prime = tile_coder.get_tile_code(obs_prime)
            # Semi-gradient TD(0) update.
            delta = reward + gamma * v.evaluate(s_prime) - v.evaluate(s)
            v.weights += alpha * np.dot(delta, s)
            obs = obs_prime
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return v

def REINFORCE_baseline(env, alpha_th, alpha_w, gamma, n_episodes):
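    """REINFORCE with a learned state-value baseline: Monte Carlo policy gradient
    in which both the critic and the softmax policy are updated from
    delta = G_t - v(S_t), scaled by gamma**t."""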
    policy = ExponentialSoftmax(env.observation_space_size *
                                env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)

    returns = []
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size,
                        env.action_space_size) for a in range(env.action_space_size)]
        a = policy.sample_action(all_sa_pairs)
        states = [obs]
        actions = [a]
        rewards = [None]

        while not done:
            obs, reward, done = env.step(a)
            all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size,
                            env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(all_sa_pairs)
            states.append(obs)
            actions.append(a)
            rewards.append(reward)

        for t in range(len(states)):
            # Discounted return following time step t.
            G_t = sum(gamma**(k - t - 1) * rewards[k]
                      for k in range(t + 1, len(rewards)))
            x_t = encode_state(states[t], env.observation_space_size)
            delta = G_t - v.evaluate(x_t)
            v.weights += alpha_w * (gamma**t) * delta * x_t
            all_sa_pairs = [encode_sa_pair(states[t], a, env.observation_space_size,
                            env.action_space_size) for a in range(env.action_space_size)]
            policy.weights += alpha_th * (gamma**t) * delta * \
                              policy.eligibility_vector(actions[t], all_sa_pairs)

        returns.append(sum(rewards[1:]))
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return (policy, np.array(returns))

Example #9
def semi_gradient_n_step_td(env, policy, n, alpha, gamma, n_episodes,
                            tile_coder):
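    """n-step semi-gradient TD prediction with tile-coded features, using
    circular buffers that hold the most recent n + 1 states and rewards."""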
    # Initialization.
    v = LinearValueFunction(tile_coder.total_n_tiles)
    # Buffers need n + 1 slots so S_tau is not overwritten by S_{tau+n}.
    states = [None] * (n + 1)
    rewards = np.zeros(n + 1)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        states[0] = tile_coder.get_tile_code(obs)
        t = 0
        tau = -1
        T = np.inf

        while not done or tau != T - 1:
            if t < T:
                feature_vectors = tile_coder.get_feature_vectors_for_actions(obs, \
                                  env.action_space_size)
                a = policy.greedy_action(feature_vectors)
                obs, reward, done = env.step(a)
                states[(t + 1) % (n + 1)] = tile_coder.get_tile_code(obs)
                rewards[(t + 1) % (n + 1)] = reward
                if done:
                    T = t + 1
            tau = t - n + 1
            if tau > -1:
                # Calculate n-step return.
                G = np.sum([gamma**(i - tau - 1) * rewards[i % (n + 1)]
                            for i in range(tau + 1, min(tau + n, T) + 1)])
                if tau + n < T:
                    G += gamma**n * v.evaluate(states[(tau + n) % (n + 1)])
                # Update weights.
                v.weights += alpha * np.dot(G - v.evaluate(states[tau % (n + 1)]),
                                            states[tau % (n + 1)])
            t += 1
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return v