Exemplo n.º 1
0
def test_policy(num_episodes):
    """Clone the CartPole expert and measure how good the clone is.

    An untrained copy of the expert architecture is fitted for 50 epochs on
    expert demonstrations, then evaluated on the plain environment and on
    the environment returned by ``imit.wrap_cartpole``.

    Parameters
    ----------
    num_episodes: int
      Number of expert episodes used to build the training set.

    Returns
    -------
    final loss, final accuracy, mean reward, reward std, wrapper mean reward, wrapper reward std
    """
    config_file = 'CartPole-v0_config.yaml'

    with tf.Session():
        # The expert gets pretrained weights; the policy starts untrained.
        expert = imit.load_model(config_file, 'CartPole-v0_weights.h5f')
        policy = imit.load_model(config_file)
        policy.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['accuracy'])

        base_env = gym.make('CartPole-v0')

        # Roll out the expert to collect (state, action) training pairs.
        states, actions = imit.generate_expert_training_data(
            expert, base_env, num_episodes=num_episodes, render=False)

        # Fit the clone and keep only the last epoch's training metrics.
        fit_metrics = policy.fit(states, actions, epochs=50, verbose=2).history
        final_loss = fit_metrics['loss'][-1]
        final_accuracy = fit_metrics['acc'][-1]

        # Evaluate on the plain environment first, then on the wrapped one.
        base_rewards = imit.test_cloned_policy(base_env, policy, render=False)
        base_mean, base_std = np.mean(base_rewards), np.std(base_rewards)

        wrapped_env = imit.wrap_cartpole(base_env)
        wrapped_rewards = imit.test_cloned_policy(wrapped_env,
                                                  policy,
                                                  render=False)
        wrapped_mean = np.mean(wrapped_rewards)
        wrapped_std = np.std(wrapped_rewards)

        return (final_loss, final_accuracy, base_mean, base_std,
                wrapped_mean, wrapped_std)
Exemplo n.º 2
0
def main():
    """Train a behaviour-cloning model on CartPole and compare it to the expert.

    Collects 100 rendered expert episodes, fits an untrained copy of the
    expert architecture on them for 50 epochs, then runs 5 evaluation
    episodes for the expert and 5 for the clone on the same environment.
    """
    config_file = "CartPole-v0_config.yaml"
    weights_file = "CartPole-v0_weights.h5f"

    env = gym.make('CartPole-v0')

    # Same architecture twice: the clone is left untrained, the expert
    # additionally loads the pretrained weights.
    clone_model = load_model(model_config_path=config_file)
    expert_model = load_model(model_config_path=config_file,
                              model_weights_path=weights_file)

    states, actions = generate_expert_training_data(
        expert_model, env, num_episodes=100, render=True)

    clone_model.compile(keras.optimizers.Adam(),
                        loss='binary_crossentropy',
                        metrics=['accuracy'])
    clone_model.fit(states, actions, epochs=50)

    # Evaluate the expert first, then the clone.
    for candidate in (expert_model, clone_model):
        test_cloned_policy(env, candidate, num_episodes=5, render=False)
Exemplo n.º 3
0
def test_dagger(filename='imitation_output.txt', dataname='dagger_data.csv'):
    """Get metrics for DAGGER algorithm.

    Gets necessary data to answer q1 and q2 in the extra credit portion (DAGGER)
    in Question 2. Runs ``imit.dagger`` with a freshly initialised policy,
    evaluates the resulting policy on the wrapped environment, appends a
    summary line to ``filename`` and writes the statistics returned by
    ``imit.dagger`` to ``dataname`` as CSV.

    Parameters
    ----------
    filename: str
      Name of file to append DAGGER performance on wrapper environment to.
    dataname: str
      Name of file to write evaluation data on base env to. Overwritten.
    """
    with tf.Session():
        # Load expert
        expert = imit.load_model('CartPole-v0_config.yaml',
                                 'CartPole-v0_weights.h5f')

        # Initialize environments: a plain one passed to DAGGER as `env`
        # and a separate wrapped instance used for evaluation.
        env = gym.make('CartPole-v0')
        eval_env = imit.wrap_cartpole(gym.make('CartPole-v0'))

        # Initialize policy model (same config as the expert, no weights).
        policy = imit.load_model('CartPole-v0_config.yaml')
        policy.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['accuracy'])

        # Run DAGGER
        mean_rewards, min_rewards, max_rewards = imit.dagger(
            expert, policy, env, eval_env)

        # Test on wrapper environment.
        rewards = imit.test_cloned_policy(eval_env, policy, render=False)
        hard_mean = np.mean(rewards)
        hard_std = np.std(rewards)

        # Append the summary; the context manager closes the handle even
        # if formatting or writing fails.
        with open(filename, 'a+') as summary_file:
            summary_file.write(DAGGER_OUTPUT % (hard_mean, hard_std))

        # Convert the statistics to CSV rows.
        # NOTE(review): assumes the three sequences returned by imit.dagger
        # have equal length; zip stops at the shortest one.
        csv_rows = ["Mean,Min,Max\n"]
        csv_rows.extend(
            "%.4f,%.4f,%.4f\n" % stats
            for stats in zip(mean_rewards, min_rewards, max_rewards))

        # Write data to file (replaces any previous content).
        with open(dataname, 'w') as data_file:
            data_file.writelines(csv_rows)
Exemplo n.º 4
0
def evaluate_expert():
    """Measure the pretrained expert on the wrapped CartPole environment.

    Return
    -----
    mean(rewards), std(rewards)
    """
    with tf.Session():
        # Build the wrapped environment before loading the expert weights.
        wrapped_env = imit.wrap_cartpole(gym.make('CartPole-v0'))
        expert = imit.load_model('CartPole-v0_config.yaml',
                                 'CartPole-v0_weights.h5f')
        episode_rewards = imit.test_cloned_policy(wrapped_env,
                                                  expert,
                                                  render=False)
        reward_mean = np.mean(episode_rewards)
        reward_std = np.std(episode_rewards)
        return reward_mean, reward_std
Exemplo n.º 5
0
 def callback(iteration, reward, model):
     """Log evaluation statistics of ``model`` every tenth iteration.

     On iteration 0 a CSV header is appended to ``output``. On every
     iteration divisible by 10 (including 0) the model is evaluated for 100
     episodes on ``env`` and one ``iteration,mean,min,max`` row is appended.
     ``output`` and ``env`` are taken from the enclosing scope.

     Parameters
     ----------
     iteration: int
       Index of the current training iteration.
     reward:
       Not used by this callback.
     model:
       Policy passed on to ``imit.test_cloned_policy``.
     """
     if iteration == 0:
         # Context managers close the file even if the write fails.
         with open(output, 'a+') as f:
             f.write("Iteration,Mean,Min,Max\n")
     if iteration % 10 == 0:
         rewards = imit.test_cloned_policy(env,
                                           model,
                                           num_episodes=100,
                                           render=False)
         with open(output, 'a+') as f:
             f.write("%d,%.4f,%.4f,%.4f\n" % (iteration, np.mean(rewards),
                                              np.min(rewards),
                                              np.max(rewards)))
Exemplo n.º 6
0
def test_reinforce(output='reinforce_data.csv'):
    """Get metrics for REINFORCE algorithm.

    Gets necessary data to answer q1 and q2 in Question 3. Trains a model
    with ``reinforce.reinforce`` on the base environment (passing it the
    callback built by ``create_callback(env, output)``), then evaluates the
    trained model on the wrapped environment, prints the result and appends
    it to ``reinforce_output.txt``.

    Parameters
    ----------
    output: str
      Name of file to write evaluation data on base env to.
    """
    env = gym.make('CartPole-v0')
    cb = create_callback(env, output)
    with tf.Session() as sess:
        model = reinforce.reinforce(env, sess, callback=cb)

        # Final evaluation runs on the wrapped environment.
        hard_env = imit.wrap_cartpole(env)
        rewards = imit.test_cloned_policy(hard_env, model, render=False)

        # Compute the statistics once and reuse them for both outputs.
        reward_stats = (np.mean(rewards), np.std(rewards))
        print("Hard Reward: %.4f +/- %.4f" % reward_stats)

        # The context manager closes the file even if the write fails.
        with open("reinforce_output.txt", 'a+') as f:
            f.write("REINFORCE:\n - Hard Reward: %.4f +/- %.4f\n" %
                    reward_stats)
Exemplo n.º 7
0
            expert, env, num_episodes=curr_num_episodes, render=False)
        cloned_policy = Model.from_config(expert.get_config())
        cloned_policy.compile(optimizer='Adam',
                              loss='binary_crossentropy',
                              metrics=['accuracy'])
        # print states_arr.shape, actions_arr.shape
        result_metrics = cloned_policy.fit(states_arr,
                                           actions_arr,
                                           batch_size=32,
                                           epochs=50)

        # dump metrics into various lists
        loss_all.append(result_metrics.history['loss'][-1])
        accuracy_all.append(result_metrics.history['acc'][-1])

        mean_reward_cloned_curr, std_reward_cloned_curr = imitation.test_cloned_policy(
            env, cloned_policy, num_episodes=50, render=False)
        mean_reward_clones_list.append(mean_reward_cloned_curr)
        std_reward_clones_list.append(std_reward_cloned_curr)

        mean_reward_cloned_curr_wrap, std_reward_cloned_curr_wrap = imitation.test_cloned_policy(
            env_wrap, cloned_policy, num_episodes=50, render=False)
        mean_reward_clones_wrap_list.append(mean_reward_cloned_curr_wrap)
        std_reward_clones_wrap_list.append(std_reward_cloned_curr_wrap)

    # test expert
    mean_reward_expert, std_reward_expert = imitation.test_cloned_policy(
        env, expert, num_episodes=50, render=False)
    mean_reward_expert_wrap, std_reward_expert_wrap = imitation.test_cloned_policy(
        env_wrap, expert, num_episodes=50, render=False)

    print "\n\nExpert stats"
Exemplo n.º 8
0
# Set-up shared by all experiments: the base CartPole environment, the
# environment returned by imitation.wrap_cartpole for it, and the expert
# model loaded from its config and weights files.
env = gym.make('CartPole-v0')
wrapped_env = imitation.wrap_cartpole(env)
expert = imitation.load_model('CartPole-v0_config.yaml',
                              'CartPole-v0_weights.h5f')

# Keep the raw YAML text of the model config; it is passed to train_model
# for each cloning experiment below.
model_config_path = 'CartPole-v0_config.yaml'
with open(model_config_path, 'r') as f:
    model_config_yaml = f.read()

num_epochs = 50
# Behaviour cloning experiments: maps experiment name -> dict of metrics.
expts = {}
# Baseline experiment: evaluate the expert policy itself on both
# environments. The first element returned by test_cloned_policy is
# discarded; the remaining two are stored as mean and std of the rewards.
expt_name = 'expert'
_, mean_rewards_env, std_rewards_env = imitation.test_cloned_policy(
    env, expert, render=False)
_, mean_rewards_wrapped_env, std_rewards_wrapped_env = imitation.test_cloned_policy(
    wrapped_env, expert, render=False)
# The expert is not trained here, so 'loss' and 'acc' are fixed constants
# (presumably placeholders meaning "perfect" -- confirm with the consumer
# of expts).
expts[expt_name] = {
    'loss': 0,
    'acc': 1,
    'mean_rewards_env': mean_rewards_env,
    'std_rewards_env': std_rewards_env,
    'mean_rewards_wrapped_env': mean_rewards_wrapped_env,
    'std_rewards_wrapped_env': std_rewards_wrapped_env
}
# Expts with cloning policy
for num_eps in [1, 10, 50, 100]:
    expt_name = "clone_policy_%deps" % num_eps
    train_data = generate_training_data(env, expert, num_eps)
    cloned_policy, final_info = train_model(model_config_yaml, train_data,
Exemplo n.º 9
0
                     "--episodes",
                     dest="num_episodes",
                     default=100,
                     help="Number of episodes from expert")

# Script entry point: runs the Problem 2 experiments in order, then DAGGER.
# `cmdline`, `expert`, `env` and `expert_yaml` are defined earlier in the
# file.
if __name__ == '__main__':
    args = cmdline.parse_args()
    # Problem 2.
    print("===== Problem 2.1 =====")
    # Collect expert demonstrations. num_episodes is converted with int()
    # because a value given on the command line arrives as a string.
    obz, act = imitation.generate_expert_training_data(expert,
                                                       env,
                                                       num_episodes=int(
                                                           args.num_episodes),
                                                       render=False)
    # Load a model from the expert's config and train it on the
    # demonstrations.
    model = imitation.load_model(expert_yaml)
    imitation.behavior_cloning(model, obz, act)

    print("===== Problem 2.2 =====")
    # Evaluate the cloned model on the base environment.
    imitation.test_cloned_policy(env, model, render=False)

    print("===== Problem 2.3 =====")
    # Compare cloned model and expert on the wrapped environment.
    harder_env = imitation.wrap_cartpole(env)
    print("> evaluate cloned model")
    imitation.test_cloned_policy(harder_env, model, render=False)
    print("> evaluate expert model")
    imitation.test_cloned_policy(harder_env, expert, render=False)

    print("===== DAGGER =====")
    # DAGGER starts from a newly loaded model rather than the clone above.
    model = imitation.load_model(expert_yaml)
    imitation.dagger(env, model, expert)
Exemplo n.º 10
0
def run_Q2(env, env_hard, EXPERT_EPISODES, TRAIN_EPOCHS, folder_path):
    """Run one behaviour-cloning experiment and write a text report.

    Generates ``EXPERT_EPISODES`` episodes of expert data on ``env``, trains
    an untrained copy of the expert architecture on it for ``TRAIN_EPOCHS``
    epochs, saves the learner weights, evaluates expert and learner for 100
    episodes each on ``env`` and ``env_hard`` and writes everything to
    ``<folder_path>Q2_<EXPERT_EPISODES>_<TRAIN_EPOCHS>.txt``.

    Parameters
    ----------
    env:
      Environment used for data generation and "normal" evaluation.
    env_hard:
      Environment used for the "hard" evaluation.
    EXPERT_EPISODES: int
      Number of expert episodes to generate as training data.
    TRAIN_EPOCHS: int
      Number of training epochs; also the number of history rows written.
    folder_path: str
      Prefix prepended to the output file names (plain string
      concatenation, so it needs a trailing separator to name a directory).
    """
    run_prefix = folder_path + 'Q2_' + str(EXPERT_EPISODES) + '_' + str(
        TRAIN_EPOCHS)
    # Single source of truth for the evaluation length: used both as the
    # num_episodes argument and as the number of per-episode rows written.
    num_test_episodes = 100

    # The report is created up front, as before; the context manager makes
    # sure it is closed even when training or evaluation raises.
    with open(run_prefix + '.txt', 'w') as f:
        f.write('Parameters:\n')
        # Label spelling kept as-is so existing reports stay comparable.
        f.write('EXPET_EPISODES:' + str(EXPERT_EPISODES) + '\n')
        f.write('TRAIN_EPOCHS:' + str(TRAIN_EPOCHS) + '\n')

        # Expert loads pretrained weights; learner starts from the config
        # only.
        expert = load_model('CartPole-v0_config.yaml',
                            'CartPole-v0_weights.h5f')
        learner = load_model('CartPole-v0_config.yaml', None)
        adam = Adam()
        expert.compile(adam, 'binary_crossentropy', metrics=['accuracy'])
        learner.compile(adam, 'binary_crossentropy', metrics=['accuracy'])

        print('Prepare expert data with episodes num:', EXPERT_EPISODES)
        expert_states, expert_actions = generate_expert_training_data(
            expert, env, num_episodes=EXPERT_EPISODES, render=False)

        print('Expert data is ready. Start to train learner with epoch num:',
              TRAIN_EPOCHS)
        history = LossHistory()
        learner.fit(expert_states,
                    expert_actions,
                    epochs=TRAIN_EPOCHS,
                    callbacks=[history])
        learner.save_weights(run_prefix + '.h5')

        print('Test expert in normal env.........................................')
        expert_reward_summary, expert_reward_avg, expert_reward_std = test_cloned_policy(
            env, expert, num_episodes=num_test_episodes, render=False)
        print(
            'Test learner in normal env.........................................')
        learner_reward_summary, learner_reward_avg, learner_reward_std = test_cloned_policy(
            env, learner, num_episodes=num_test_episodes, render=False)

        print('Test expert in hard Env.........................................')
        hard_expert_reward_summary, hard_expert_reward_avg, hard_expert_reward_std = test_cloned_policy(
            env_hard, expert, num_episodes=num_test_episodes, render=False)
        print('Test learner in hard Env.........................................')
        hard_learner_reward_summary, hard_learner_reward_avg, hard_learner_reward_std = test_cloned_policy(
            env_hard, learner, num_episodes=num_test_episodes, render=False)

        # Summary section: "<avg>    <std>" per policy/environment pair.
        f.write('Expert Test in Normal Env:\n')
        f.write(str(expert_reward_avg) + '    ' + str(expert_reward_std) + '\n')
        f.write('Learner Test in Normal Env:\n')
        f.write(
            str(learner_reward_avg) + '    ' + str(learner_reward_std) + '\n')
        f.write('Expert Test in Hard Env:\n')
        f.write(
            str(hard_expert_reward_avg) + '    ' +
            str(hard_expert_reward_std) + '\n')
        f.write('Learner Test in Hard Env:\n')
        f.write(
            str(hard_learner_reward_avg) + '    ' +
            str(hard_learner_reward_std) + '\n')

        # First TRAIN_EPOCHS entries recorded by the LossHistory callback.
        f.write('Learner Training  History:\n')
        for i in range(TRAIN_EPOCHS):
            f.write(
                str(history.losses[i]) + '    ' + str(history.accues[i]) +
                '\n')

        # One row per evaluation episode, ';'-separated.
        f.write('Evaluate History:\n')
        for i in range(num_test_episodes):
            f.write(
                str(expert_reward_summary[i]) + ';' +
                str(learner_reward_summary[i]) + ';' +
                str(hard_expert_reward_summary[i]) + ';' +
                str(hard_learner_reward_summary[i]) + '\n')
Exemplo n.º 11
0
def _expert_onehot_labels(expert, states, num_actions=2):
    """Return one-hot rows marking the expert's argmax output per state."""
    best_actions = np.argmax(expert.predict(states), axis=1)
    onehot = np.zeros((best_actions.shape[0], num_actions))
    # Vectorised form of "onehot[i, best_actions[i]] = 1" for every row i.
    onehot[np.arange(best_actions.shape[0]), best_actions] = 1
    return onehot


def run_Q2(env, env_hard, EXPERT_EPISODES, TRAIN_EPOCHS, folder_path):
    """Run one DAGGER-style experiment and write a text report.

    Starts from one episode of states collected with the untrained learner,
    labelled by the expert. Then, ``TRAIN_EPOCHS`` times, it fits the
    learner for one epoch on the aggregated data, collects one more learner
    episode, labels it with the expert and appends it to the data set.
    Finally the learner weights are saved, expert and learner are evaluated
    for 100 episodes each on ``env`` and ``env_hard``, and the report is
    written to ``<folder_path>Q2_<EXPERT_EPISODES>_<TRAIN_EPOCHS>.txt``.

    Parameters
    ----------
    env:
      Environment used for data collection and "normal" evaluation.
    env_hard:
      Environment used for the "hard" evaluation.
    EXPERT_EPISODES: int
      Only used in the output file names and the report header.
    TRAIN_EPOCHS: int
      Number of fit/aggregate rounds; also the number of history rows
      written.
    folder_path: str
      Prefix prepended to the output file names (plain string
      concatenation, so it needs a trailing separator to name a directory).
    """
    run_prefix = folder_path + 'Q2_' + str(EXPERT_EPISODES) + '_' + str(
        TRAIN_EPOCHS)
    # Single source of truth for the evaluation length: used both as the
    # num_episodes argument and as the number of per-episode rows written.
    num_test_episodes = 100

    # The report is created up front, as before; the context manager makes
    # sure it is closed even when training or evaluation raises.
    with open(run_prefix + '.txt', 'w') as f:
        f.write('Parameters:\n')
        # Label spelling kept as-is so existing reports stay comparable.
        f.write('EXPET_EPISODES:' + str(EXPERT_EPISODES) + '\n')
        f.write('TRAIN_EPOCHS:' + str(TRAIN_EPOCHS) + '\n')

        # Expert loads pretrained weights; learner starts from the config
        # only.
        expert = load_model('CartPole-v0_config.yaml',
                            'CartPole-v0_weights.h5f')
        learner = load_model('CartPole-v0_config.yaml', None)
        adam = Adam()
        expert.compile(adam, 'binary_crossentropy', metrics=['accuracy'])
        learner.compile(adam, 'binary_crossentropy', metrics=['accuracy'])

        # The data-generation helper is reused with the learner as the
        # acting model; its own action output is discarded because the
        # labels come from the expert instead.
        print('Generate initial data from learner')
        data, _ = generate_expert_training_data(learner,
                                                env,
                                                num_episodes=1,
                                                render=False)
        print('Qurey expert for labels ')
        onehot_labels = _expert_onehot_labels(expert, data)
        print('Expert qurey is ready. Start to train learner with epoch num:',
              TRAIN_EPOCHS)

        history = LossHistory()
        for _ in range(TRAIN_EPOCHS):
            learner.fit(data, onehot_labels, epochs=1, callbacks=[history])

            # Roll out the updated learner, relabel with the expert and
            # aggregate with all data gathered so far.
            new_data, _ = generate_expert_training_data(learner,
                                                        env,
                                                        num_episodes=1,
                                                        render=False)
            print('Qurey expert for labels ')
            new_onehot_labels = _expert_onehot_labels(expert, new_data)
            data = np.vstack((data, new_data))
            onehot_labels = np.vstack((onehot_labels, new_onehot_labels))
            print(onehot_labels.shape)

        learner.save_weights(run_prefix + '.h5')

        print('Test expert in normal env.........................................')
        expert_reward_summary, expert_reward_avg, expert_reward_std = test_cloned_policy(
            env, expert, num_episodes=num_test_episodes, render=False)
        print(
            'Test learner in normal env.........................................')
        learner_reward_summary, learner_reward_avg, learner_reward_std = test_cloned_policy(
            env, learner, num_episodes=num_test_episodes, render=False)

        print('Test expert in hard Env.........................................')
        hard_expert_reward_summary, hard_expert_reward_avg, hard_expert_reward_std = test_cloned_policy(
            env_hard, expert, num_episodes=num_test_episodes, render=False)
        print('Test learner in hard Env.........................................')
        hard_learner_reward_summary, hard_learner_reward_avg, hard_learner_reward_std = test_cloned_policy(
            env_hard, learner, num_episodes=num_test_episodes, render=False)

        # Summary section: "<avg>    <std>" per policy/environment pair.
        f.write('Expert Test in Normal Env:\n')
        f.write(str(expert_reward_avg) + '    ' + str(expert_reward_std) + '\n')
        f.write('Learner Test in Normal Env:\n')
        f.write(
            str(learner_reward_avg) + '    ' + str(learner_reward_std) + '\n')
        f.write('Expert Test in Hard Env:\n')
        f.write(
            str(hard_expert_reward_avg) + '    ' +
            str(hard_expert_reward_std) + '\n')
        f.write('Learner Test in Hard Env:\n')
        f.write(
            str(hard_learner_reward_avg) + '    ' +
            str(hard_learner_reward_std) + '\n')

        # First TRAIN_EPOCHS entries recorded by the LossHistory callback.
        f.write('Learner Training  History:\n')
        for i in range(TRAIN_EPOCHS):
            f.write(
                str(history.losses[i]) + '    ' + str(history.accues[i]) +
                '\n')

        # One row per evaluation episode, separated by four spaces.
        f.write('Evaluate History:\n')
        for i in range(num_test_episodes):
            f.write(
                str(expert_reward_summary[i]) + '    ' +
                str(learner_reward_summary[i]) + '    ' +
                str(hard_expert_reward_summary[i]) + '    ' +
                str(hard_learner_reward_summary[i]) + '\n')