Example #1
def test_dagger(filename='imitation_output.txt', dataname='dagger_data.csv'):
    """Get metrics for DAGGER algorithm.

    Gets necessary data to answer q1 and q2 in the extra credit portion (DAGGER)
    in Question 2.

    Parameters
    ----------
    filename: str
      Name of file to append DAGGER performance on wrapper environment to.
    dataname: str
      Name of file to write evaluation data on base env to.
    """
    with tf.Session() as sess:
        # Load expert
        expert = imit.load_model('CartPole-v0_config.yaml',
                                 'CartPole-v0_weights.h5f')

        # Initialize environments
        env = gym.make('CartPole-v0')
        eval_env = gym.make('CartPole-v0')
        eval_env = imit.wrap_cartpole(eval_env)

        # Initialize policy model.
        policy = imit.load_model('CartPole-v0_config.yaml')
        policy.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['accuracy'])

        # Run DAGGER
        mean_rewards, min_rewards, max_rewards = imit.dagger(
            expert, policy, env, eval_env)

        # Test on wrapper environment.
        rewards = imit.test_cloned_policy(eval_env, policy, render=False)
        hard_mean = np.mean(rewards)
        hard_std = np.std(rewards)

        # append to file
        with open(filename, 'a+') as f:
            f.write(DAGGER_OUTPUT % (hard_mean, hard_std))

        # convert data to .csv format
        data_string = "Mean,Min,Max\n"
        for i in range(len(mean_rewards)):
            data_string += "%.4f,%.4f,%.4f\n" % (
                mean_rewards[i], min_rewards[i], max_rewards[i])

        # write data to file
        with open(dataname, 'w') as f:
            f.write(data_string)
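
A note on context: this snippet relies on module-level imports and a DAGGER_OUTPUT format string that are not shown. A minimal sketch of the assumed preamble, based on imports used elsewhere in these examples (the DAGGER_OUTPUT value below is a placeholder, not the original):

import gym
import numpy as np
import tensorflow as tf

from deeprl_hw3 import imitation as imit

# Placeholder only; the original DAGGER_OUTPUT format string is not shown in the snippet.
DAGGER_OUTPUT = 'DAgger on wrapped CartPole-v0: mean reward %.4f, std %.4f\n'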
Example #2
def test_policy(num_episodes):
    """Train and test imitation-based policy.

    Parameters
    ----------
    num_episodes: int
      Number of episodes to generate data for imitation policy.

    Returns
    -------
    tuple of float
      (final loss, final accuracy, mean reward, reward std,
       wrapper mean reward, wrapper reward std)
    """
    with tf.Session() as sess:
        # load expert and policy model
        expert = imit.load_model('CartPole-v0_config.yaml',
                                 'CartPole-v0_weights.h5f')
        policy = imit.load_model('CartPole-v0_config.yaml')
        policy.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['accuracy'])

        # initialize environment
        env = gym.make('CartPole-v0')

        # generate data from expert
        states, actions = imit.generate_expert_training_data(
            expert, env, num_episodes=num_episodes, render=False)

        # train policy
        history = policy.fit(states, actions, epochs=50, verbose=2)

        # get performance values
        final_loss = history.history['loss'][-1]
        final_accuracy = history.history['acc'][-1]
        rewards = imit.test_cloned_policy(env, policy, render=False)
        mean = np.mean(rewards)
        std = np.std(rewards)

        env = imit.wrap_cartpole(env)
        hard_rewards = imit.test_cloned_policy(env, policy, render=False)
        hard_mean = np.mean(hard_rewards)
        hard_std = np.std(hard_rewards)

        return final_loss, final_accuracy, mean, std, hard_mean, hard_std
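
A hypothetical driver for test_policy, mirroring the episode sweep used in Example #5 (the episode list comes from that example; the printing format is an assumption):

if __name__ == '__main__':
    # Sweep the number of expert episodes, as in Example #5.
    for n in [1, 10, 50, 100]:
        loss, acc, mean, std, hard_mean, hard_std = test_policy(n)
        print('episodes=%d  loss=%.4f  acc=%.4f  '
              'reward=%.1f (std %.1f)  wrapped reward=%.1f (std %.1f)'
              % (n, loss, acc, mean, std, hard_mean, hard_std))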
Example #3
def main():
    model_config_path = "CartPole-v0_config.yaml"
    model_weight_path = "CartPole-v0_weights.h5f"
    env = gym.make('CartPole-v0')
    #env = wrap_cartpole(env)

    # Load an untrained clone and the pretrained expert from the same config.
    clone_model = load_model(model_config_path=model_config_path)
    expert_model = load_model(model_config_path=model_config_path,
                              model_weights_path=model_weight_path)

    # Roll out the expert to collect (state, action) training pairs.
    states, actions = generate_expert_training_data(expert_model,
                                                    env,
                                                    num_episodes=100,
                                                    render=True)

    # Behavior cloning: fit the clone to the expert's actions.
    optimizer = keras.optimizers.Adam()
    clone_model.compile(optimizer,
                        loss='binary_crossentropy',
                        metrics=['accuracy'])
    clone_model.fit(states, actions, epochs=50)

    # Evaluate the expert and the clone on the same environment.
    test_cloned_policy(env, expert_model, num_episodes=5, render=False)
    test_cloned_policy(env, clone_model, num_episodes=5, render=False)
Example #4
def evaluate_expert():
    """Evaluate expert on the wrapper environment.
    Return
    -----
    mean(rewards), std(rewards)
    """
    with tf.Session() as sess:
        env = gym.make('CartPole-v0')
        env = imit.wrap_cartpole(env)
        expert = imit.load_model('CartPole-v0_config.yaml',
                                 'CartPole-v0_weights.h5f')
        rewards = imit.test_cloned_policy(env, expert, render=False)
        return np.mean(rewards), np.std(rewards)
Example #5
        f.write("%s\n" % each_thing)


if __name__ == '__main__':
    # fancy printing
    RED = '\033[91m'
    BOLD = '\033[1m'
    ENDC = '\033[0m'
    LINE = "%s%s##############################################################################%s" % (
        RED, BOLD, ENDC)

    env = gym.make('CartPole-v0')
    env_wrap = gym.make('CartPole-v0')
    env_wrap = imitation.wrap_cartpole(env_wrap)

    expert = imitation.load_model('CartPole-v0_config.yaml',
                                  'CartPole-v0_weights.h5f')
    # test_cloned_policy(env, cloned_policy)
    episode_length_list = [1, 10, 50, 100]
    loss_all, accuracy_all = [], []
    mean_reward_clones_list, mean_reward_clones_wrap_list = [], []
    std_reward_clones_list, std_reward_clones_wrap_list = [], []

    for curr_num_episodes in episode_length_list:
        str_1 = "Imitator with number of episodes = {}".format(
            curr_num_episodes)
        msg = "\n%s\n" % (LINE) + "%s%s\n" % (BOLD, str_1) + "%s\n" % (LINE)
        print(msg)

        # train on vanilla env
        states_arr, actions_arr = imitation.generate_expert_training_data(
            expert, env, num_episodes=curr_num_episodes, render=False)
Example #6
import os
import time

import gym
import tensorflow as tf

from deeprl_hw3.imitation import load_model
from deeprl_hw3.reinforce2 import run_one_episode, train_nn, get_total_reward
from keras.optimizers import Adam
from keras import backend as K

MAX_TRAIN_EPOCHS = 10000
EVA_INTERVAL = 10
gamma = 0.99
LR = 0.001
STEP_SIZE = 0.001
env = gym.make('CartPole-v0')
EVAL_EPISODES = 100
sess = K.get_session()
nn = load_model('CartPole-v0_config.yaml', None)
nn.compile('SGD', 'mse', metrics=['accuracy'])
sess.run(tf.global_variables_initializer())
file_path = 'Q3.txt'
f = open(file_path, 'w')

train_cnt = 0
eval_cnt = 0
end_cnt = 0
train_flag = True
while train_flag:
    train_start = time.time()
    states, actions, rewards, softmaxes, dones = run_one_episode(env, nn)

    #print(len(states))
    '''
Example #7
from deeprl_hw3 import imitation
import gym
import argparse
import os

expert_yaml = os.path.join(os.getcwd(), 'CartPole-v0_config.yaml')
expert_h5f = os.path.join(os.getcwd(), 'CartPole-v0_weights.h5f')

expert = imitation.load_model(expert_yaml, expert_h5f)
env = gym.make('CartPole-v0')
cmdline = argparse.ArgumentParser()
cmdline.add_argument("-e",
                     "--episodes",
                     dest="num_episodes",
                     default=100,
                     help="Number of episodes from expert")

if __name__ == '__main__':
    args = cmdline.parse_args()
    # Problem 2.
    print("===== Problem 2.1 =====")
    obz, act = imitation.generate_expert_training_data(expert,
                                                       env,
                                                       num_episodes=int(
                                                           args.num_episodes),
                                                       render=False)
    model = imitation.load_model(expert_yaml)
    imitation.behavior_cloning(model, obz, act)

    print("===== Problem 2.2 =====")
    imitation.test_cloned_policy(env, model, render=False)
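
imitation.behavior_cloning is not defined in any of these snippets; based on the compile-and-fit pattern in Examples #2, #3, and #9, a minimal sketch of what it might do (the signature and hyperparameters are assumptions):

def behavior_cloning(model, states, actions, epochs=50):
    # Hypothetical implementation: fit the clone on expert (state, action) pairs.
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.fit(states, actions, epochs=epochs, verbose=2)
    return model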
Example #8
def reinforce(env, sess, gamma=0.98, alpha=0.00025, callback=None):
    """Policy gradient algorithm

    Parameters
    ----------
    env: gym.core.Env
      Environment being run on.
    sess: tf.Session
      Tensorflow session for convenience.
    gamma: float
      Gamma value used for discounting and in the policy gradient algorithm.
    alpha: float
      Alpha value used in the policy gradient algorithm.
    callback: function
      Callback used to log learning metrics

    Returns
    -------
    model: Keras model trained with the policy gradient algorithm.
    """

    # Initialize model
    model = imit.load_model('CartPole-v0_config.yaml')
    sess.run(tf.global_variables_initializer())

    # Create gradient
    action_input = tf.placeholder(dtype=tf.int32, shape=(1,))
    loss = tf.log(tf.gather(tf.reshape(model.output, (2,)), [action_input]))
    grads = tf.gradients(loss, model.weights)

    # Helper to compute gradient based on state-action pair
    def get_gradient(state, action):
        return sess.run(grads, feed_dict={
            model.input: state,
            action_input: [action]
        })

    rewards = []
    iteration = 0
    while True:
        # Run model for an episode
        S, A, R = run_episode(env, model)

        reward = sum(R)
        rewards.append(reward)

        if callback is not None:
            callback(iteration, reward, model)

        print("REWARD (%d): %.4f" % (iteration, reward))

        # Convergence condition: a sufficiently small std over the past 20
        # rewards and a sufficiently large mean over the past 5.
        if len(rewards) > 20 and np.std(np.array(rewards[-20:])) < 3. and np.mean(np.array(rewards[-5:])) > 50.:
            print("CONVERGED")
            return model

        # Get discounted rewards.
        G = process_rewards(R, gamma)

        # Update weights w.r.t. gradients.
        weights = model.get_weights()
        for t in range(len(S)):
            gradients = get_gradient(S[t].reshape((1,4)), A[t])

            assert len(weights) == len(gradients)

            for i in range(len(weights)):
                weights[i] += alpha * G[t] * (gamma ** t) * gradients[i]

        model.set_weights(weights)
        iteration += 1
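
run_episode and process_rewards are not shown in this snippet. A minimal sketch of the discounted-return helper, consistent with how G[t] weights the per-step gradient above (the name comes from the call site; the body is an assumption):

import numpy as np

def process_rewards(R, gamma):
    # Hypothetical helper: return-to-go G[t] = R[t] + gamma * R[t+1] + gamma^2 * R[t+2] + ...
    G = np.zeros(len(R))
    running = 0.0
    for t in reversed(range(len(R))):
        running = R[t] + gamma * running
        G[t] = running
    return G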
Example #9
def run_Q2(env, env_hard, EXPERT_EPISODES, TRAIN_EPOCHS, folder_path):

    file_path = folder_path + 'Q2_' + str(EXPERT_EPISODES) + '_' + str(
        TRAIN_EPOCHS) + '.txt'
    f = open(file_path, 'w')
    f.write('Parameters:\n')
    f.write('EXPERT_EPISODES:' + str(EXPERT_EPISODES) + '\n')
    f.write('TRAIN_EPOCHS:' + str(TRAIN_EPOCHS) + '\n')
    #test all parameters
    expert = load_model('CartPole-v0_config.yaml', 'CartPole-v0_weights.h5f')
    learner = load_model('CartPole-v0_config.yaml', None)
    adam = Adam()
    expert.compile(adam, 'binary_crossentropy', metrics=['accuracy'])
    learner.compile(adam, 'binary_crossentropy', metrics=['accuracy'])
    print('Prepare expert data with episodes num:', EXPERT_EPISODES)
    expert_states, expert_actions = generate_expert_training_data(
        expert, env, num_episodes=EXPERT_EPISODES, render=False)

    print('Expert data is ready. Start to train learner with epoch num:',
          TRAIN_EPOCHS)
    history = LossHistory()
    learner.fit(expert_states,
                expert_actions,
                epochs=TRAIN_EPOCHS,
                callbacks=[history])
    weights_path = folder_path + 'Q2_' + str(EXPERT_EPISODES) + '_' + str(
        TRAIN_EPOCHS) + '.h5'
    learner.save_weights(weights_path)
    print('Test expert in normal env.........................................')
    expert_reward_summary, expert_reward_avg, expert_reward_std = test_cloned_policy(
        env, expert, num_episodes=100, render=False)
    print(
        'Test learner in normal env.........................................')
    learner_reward_summary, learner_reward_avg, learner_reward_std = test_cloned_policy(
        env, learner, num_episodes=100, render=False)

    print('Test expert in hard Env.........................................')
    hard_expert_reward_summary, hard_expert_reward_avg, hard_expert_reward_std = test_cloned_policy(
        env_hard, expert, num_episodes=100, render=False)
    print('Test learner in hard Env.........................................')
    hard_learner_reward_summary, hard_learner_reward_avg, hard_learner_reward_std = test_cloned_policy(
        env_hard, learner, num_episodes=100, render=False)

    f.write('Expert Test in Normal Env:\n')
    f.write(str(expert_reward_avg) + '    ' + str(expert_reward_std) + '\n')
    f.write('Learner Test in Normal Env:\n')
    f.write(str(learner_reward_avg) + '    ' + str(learner_reward_std) + '\n')
    f.write('Expert Test in Hard Env:\n')
    f.write(
        str(hard_expert_reward_avg) + '    ' + str(hard_expert_reward_std) +
        '\n')
    f.write('Learner Test in Hard Env:\n')
    f.write(
        str(hard_learner_reward_avg) + '    ' + str(hard_learner_reward_std) +
        '\n')
    f.write('Learner Training History:\n')
    for i in range(TRAIN_EPOCHS):
        f.write(
            str(history.losses[i]) + '    ' + str(history.accues[i]) + '\n')

    f.write('Evaluate History:\n')
    for i in range(100):
        f.write(
            str(expert_reward_summary[i]) + ';' +
            str(learner_reward_summary[i]) + ';' +
            str(hard_expert_reward_summary[i]) + ';' +
            str(hard_learner_reward_summary[i]) + '\n')

    f.close()
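
LossHistory is not defined in these snippets. Given that .losses and .accues are indexed per training epoch here and accumulated across repeated fit() calls in Example #10, a minimal Keras callback sketch consistent with that usage (the class body is an assumption):

import keras

class LossHistory(keras.callbacks.Callback):
    # Hypothetical callback: record per-epoch loss and accuracy, accumulating
    # across multiple fit() calls.
    def __init__(self):
        super(LossHistory, self).__init__()
        self.losses = []
        self.accues = []

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        self.losses.append(logs.get('loss'))
        self.accues.append(logs.get('acc'))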
Example #10
def run_Q2(env, env_hard, EXPERT_EPISODES, TRAIN_EPOCHS, folder_path):

    file_path = folder_path + 'Q2_' + str(EXPERT_EPISODES) + '_' + str(
        TRAIN_EPOCHS) + '.txt'
    f = open(file_path, 'w')
    f.write('Parameters:\n')
    f.write('EXPERT_EPISODES:' + str(EXPERT_EPISODES) + '\n')
    f.write('TRAIN_EPOCHS:' + str(TRAIN_EPOCHS) + '\n')

    #test all parameters

    expert = load_model('CartPole-v0_config.yaml', 'CartPole-v0_weights.h5f')
    learner = load_model('CartPole-v0_config.yaml', None)
    adam = Adam()
    expert.compile(adam, 'binary_crossentropy', metrics=['accuracy'])
    learner.compile(adam, 'binary_crossentropy', metrics=['accuracy'])
    print('Generate initial data from learner')
    data, _ = generate_expert_training_data(learner,
                                            env,
                                            num_episodes=1,
                                            render=False)
    print('Query expert for labels')
    q_values = expert.predict(data)
    labels = np.argmax(q_values, axis=1)
    onehot_labels = np.zeros((labels.shape[0], 2))
    for i in range(labels.shape[0]):
        onehot_labels[i, labels[i]] = 1
    #print(onehot_labels)
    #print(onehot_labels.shape)
    print('Expert query is ready. Start to train learner with epoch num:',
          TRAIN_EPOCHS)

    history = LossHistory()
    train_cnt = 0
    while train_cnt < TRAIN_EPOCHS:
        learner.fit(data, onehot_labels, epochs=1, callbacks=[history])
        # Generate new data for DAgger: roll out the current learner policy
        # using the same helper that generates expert data.
        new_data, _ = generate_expert_training_data(learner,
                                                    env,
                                                    num_episodes=1,
                                                    render=False)
        print('Query expert for labels')
        new_q_values = expert.predict(new_data)
        new_labels = np.argmax(new_q_values, axis=1)
        new_onehot_labels = np.zeros((new_labels.shape[0], 2))
        for i in range(new_labels.shape[0]):
            new_onehot_labels[i, new_labels[i]] = 1
        data = np.vstack((data, new_data))
        onehot_labels = np.vstack((onehot_labels, new_onehot_labels))
        print(onehot_labels.shape)
        train_cnt = train_cnt + 1

    weights_path = folder_path + 'Q2_' + str(EXPERT_EPISODES) + '_' + str(
        TRAIN_EPOCHS) + '.h5'
    learner.save_weights(weights_path)

    print('Test expert in normal env.........................................')
    expert_reward_summary, expert_reward_avg, expert_reward_std = test_cloned_policy(
        env, expert, num_episodes=100, render=False)
    print(
        'Test learner in normal env.........................................')
    learner_reward_summary, learner_reward_avg, learner_reward_std = test_cloned_policy(
        env, learner, num_episodes=100, render=False)

    print('Test expert in hard Env.........................................')
    hard_expert_reward_summary, hard_expert_reward_avg, hard_expert_reward_std = test_cloned_policy(
        env_hard, expert, num_episodes=100, render=False)
    print('Test learner in hard Env.........................................')
    hard_learner_reward_summary, hard_learner_reward_avg, hard_learner_reward_std = test_cloned_policy(
        env_hard, learner, num_episodes=100, render=False)

    f.write('Expert Test in Normal Env:\n')
    f.write(str(expert_reward_avg) + '    ' + str(expert_reward_std) + '\n')
    f.write('Learner Test in Normal Env:\n')
    f.write(str(learner_reward_avg) + '    ' + str(learner_reward_std) + '\n')
    f.write('Expert Test in Hard Env:\n')
    f.write(
        str(hard_expert_reward_avg) + '    ' + str(hard_expert_reward_std) +
        '\n')
    f.write('Learner Test in Hard Env:\n')
    f.write(
        str(hard_learner_reward_avg) + '    ' + str(hard_learner_reward_std) +
        '\n')
    f.write('Learner Training History:\n')
    for i in range(TRAIN_EPOCHS):
        f.write(
            str(history.losses[i]) + '    ' + str(history.accues[i]) + '\n')

    f.write('Evaluate History:\n')
    for i in range(100):
        f.write(
            str(expert_reward_summary[i]) + '    ' +
            str(learner_reward_summary[i]) + '    ' +
            str(hard_expert_reward_summary[i]) + '    ' +
            str(hard_learner_reward_summary[i]) + '\n')

    f.close()
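
As a design note, the explicit one-hot loops in this example could be written more compactly with NumPy indexing; the following is an equivalent alternative, not a change to the original logic:

# Equivalent to the element-wise one-hot loop over labels above.
onehot_labels = np.eye(2)[labels]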