예제 #1
0
def test_policy(num_episodes):
    """Train an imitation (behavior-cloning) policy and evaluate it.

    Parameters
    ----------
    num_episodes: int
      Number of episodes to generate data for imitation policy.

    Returns
    -------
    final loss, final accuracy, mean reward, reward std, wrapper mean reward, wrapper reward std
    """
    with tf.Session() as sess:
        # The expert ships with pretrained weights; the clone starts fresh
        # from the same architecture config.
        expert = imit.load_model('CartPole-v0_config.yaml',
                                 'CartPole-v0_weights.h5f')
        policy = imit.load_model('CartPole-v0_config.yaml')
        policy.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['accuracy'])

        env = gym.make('CartPole-v0')

        # Roll out the expert to collect (state, action) supervision pairs.
        states, actions = imit.generate_expert_training_data(
            expert, env, num_episodes=num_episodes, render=False)

        # Supervised fit of the clone on the expert demonstrations.
        history = policy.fit(states, actions, epochs=50, verbose=2)

        # NOTE(review): the 'acc' key matches older Keras versions; newer
        # releases report 'accuracy' — confirm against the installed version.
        final_loss = history.history['loss'][-1]
        final_accuracy = history.history['acc'][-1]

        # Evaluate on the vanilla environment...
        rewards = imit.test_cloned_policy(env, policy, render=False)
        mean, std = np.mean(rewards), np.std(rewards)

        # ...and again on the harder, wrapped environment.
        env = imit.wrap_cartpole(env)
        hard_rewards = imit.test_cloned_policy(env, policy, render=False)
        hard_mean, hard_std = np.mean(hard_rewards), np.std(hard_rewards)

        return final_loss, final_accuracy, mean, std, hard_mean, hard_std
예제 #2
0
def main():
    """Behavior-clone the CartPole expert, then evaluate both policies."""
    config_path = "CartPole-v0_config.yaml"
    weights_path = "CartPole-v0_weights.h5f"

    env = gym.make('CartPole-v0')
    #env = wrap_cartpole(env)

    # Clone starts untrained; expert is restored from pretrained weights.
    clone_model = load_model(model_config_path=config_path)
    expert_model = load_model(model_config_path=config_path,
                              model_weights_path=weights_path)

    # Roll out the expert to collect (state, action) training pairs.
    states, actions = generate_expert_training_data(expert_model,
                                                    env,
                                                    num_episodes=100,
                                                    render=True)

    clone_model.compile(keras.optimizers.Adam(),
                        loss='binary_crossentropy',
                        metrics=['accuracy'])
    clone_model.fit(states, actions, epochs=50)

    # Evaluate expert and clone on the same environment for comparison.
    test_cloned_policy(env, expert_model, num_episodes=5, render=False)
    test_cloned_policy(env, clone_model, num_episodes=5, render=False)
예제 #3
0
    expert = imitation.load_model('CartPole-v0_config.yaml',
                                  'CartPole-v0_weights.h5f')
    # test_cloned_policy(env, cloned_policy)
    episode_length_list = [1, 10, 50, 100]
    loss_all, accuracy_all = [], []
    mean_reward_clones_list, mean_reward_clones_wrap_list = [], []
    std_reward_clones_list, std_reward_clones_wrap_list = [], []

    for curr_num_episodes in episode_length_list:
        str_1 = "Imitator with number of episodes = {}".format(
            curr_num_episodes)
        msg = "\n%s\n" % (LINE) + "%s%s\n" % (BOLD, str_1) + "%s\n" % (LINE)
        print(str(msg))

        # train on vanilla env
        states_arr, actions_arr = imitation.generate_expert_training_data(
            expert, env, num_episodes=curr_num_episodes, render=False)
        cloned_policy = Model.from_config(expert.get_config())
        cloned_policy.compile(optimizer='Adam',
                              loss='binary_crossentropy',
                              metrics=['accuracy'])
        # print states_arr.shape, actions_arr.shape
        result_metrics = cloned_policy.fit(states_arr,
                                           actions_arr,
                                           batch_size=32,
                                           epochs=50)

        # dump metrics into various lists
        loss_all.append(result_metrics.history['loss'][-1])
        accuracy_all.append(result_metrics.history['acc'][-1])

        mean_reward_cloned_curr, std_reward_cloned_curr = imitation.test_cloned_policy(
예제 #4
0
File: q2.py  Project: afcarl/LQR_iLQR_MPC
def generate_training_data(env, expert, num_episodes):
    """Collect (states, actions) by rolling out the expert policy.

    Thin wrapper around ``imitation.generate_expert_training_data`` that
    fixes rendering off.
    """
    training_pairs = imitation.generate_expert_training_data(
        expert, env, num_episodes, render=False)
    return training_pairs
예제 #5
0
expert = imitation.load_model(expert_yaml, expert_h5f)
env = gym.make('CartPole-v0')

# CLI: -e/--episodes controls how much expert data is collected.
cmdline = argparse.ArgumentParser()
cmdline.add_argument("-e",
                     "--episodes",
                     dest="num_episodes",
                     type=int,  # parse straight to int; the original cast later
                     default=100,
                     help="Number of episodes from expert")

if __name__ == '__main__':
    args = cmdline.parse_args()

    # Problem 2.1: behavior-clone the expert from its own rollouts.
    print("===== Problem 2.1 =====")
    obz, act = imitation.generate_expert_training_data(expert,
                                                       env,
                                                       num_episodes=args.num_episodes,
                                                       render=False)
    model = imitation.load_model(expert_yaml)
    imitation.behavior_cloning(model, obz, act)

    # Problem 2.2: evaluate the clone on the vanilla environment.
    print("===== Problem 2.2 =====")
    imitation.test_cloned_policy(env, model, render=False)

    # Problem 2.3: compare clone vs. expert on the harder wrapped environment.
    print("===== Problem 2.3 =====")
    harder_env = imitation.wrap_cartpole(env)
    print("> evaluate cloned model")
    imitation.test_cloned_policy(harder_env, model, render=False)
    print("> evaluate expert model")
    imitation.test_cloned_policy(harder_env, expert, render=False)
예제 #6
0
def run_Q2(env, env_hard, EXPERT_EPISODES, TRAIN_EPOCHS, folder_path):
    """Behavior cloning: train a learner on expert rollouts and log results.

    Parameters
    ----------
    env : gym.Env
        Vanilla CartPole environment.
    env_hard : gym.Env
        Harder (wrapped) CartPole environment.
    EXPERT_EPISODES : int
        Number of expert episodes used to generate training data.
    TRAIN_EPOCHS : int
        Number of epochs to train the learner.
    folder_path : str
        Directory prefix (with trailing separator) for the report/weight files.
    """
    file_path = folder_path + 'Q2_' + str(EXPERT_EPISODES) + '_' + str(
        TRAIN_EPOCHS) + '.txt'
    # `with` guarantees the report file is closed even if training or
    # evaluation raises (the original leaked the handle on exceptions).
    with open(file_path, 'w') as f:
        f.write('Parameters:\n')
        # Bug fix: label was misspelled 'EXPET_EPISODES' in the report file.
        f.write('EXPERT_EPISODES:' + str(EXPERT_EPISODES) + '\n')
        f.write('TRAIN_EPOCHS:' + str(TRAIN_EPOCHS) + '\n')
        #test all parameters
        # Expert has pretrained weights; learner starts from scratch.
        expert = load_model('CartPole-v0_config.yaml', 'CartPole-v0_weights.h5f')
        learner = load_model('CartPole-v0_config.yaml', None)
        adam = Adam()
        expert.compile(adam, 'binary_crossentropy', metrics=['accuracy'])
        learner.compile(adam, 'binary_crossentropy', metrics=['accuracy'])
        print('Prepare expert data with episodes num:', EXPERT_EPISODES)
        expert_states, expert_actions = generate_expert_training_data(
            expert, env, num_episodes=EXPERT_EPISODES, render=False)

        print('Expert data is ready. Start to train learner with epoch num:',
              TRAIN_EPOCHS)
        history = LossHistory()
        learner.fit(expert_states,
                    expert_actions,
                    epochs=TRAIN_EPOCHS,
                    callbacks=[history])
        weights_path = folder_path + 'Q2_' + str(EXPERT_EPISODES) + '_' + str(
            TRAIN_EPOCHS) + '.h5'
        learner.save_weights(weights_path)

        # Evaluate all four (policy, environment) combinations.
        print('Test expert in normal env.........................................')
        expert_reward_summary, expert_reward_avg, expert_reward_std = test_cloned_policy(
            env, expert, num_episodes=100, render=False)
        print(
            'Test learner in normal env.........................................')
        learner_reward_summary, learner_reward_avg, learner_reward_std = test_cloned_policy(
            env, learner, num_episodes=100, render=False)

        print('Test expert in hard Env.........................................')
        hard_expert_reward_summary, hard_expert_reward_avg, hard_expert_reward_std = test_cloned_policy(
            env_hard, expert, num_episodes=100, render=False)
        print('Test learner in hard Env.........................................')
        hard_learner_reward_summary, hard_learner_reward_avg, hard_learner_reward_std = test_cloned_policy(
            env_hard, learner, num_episodes=100, render=False)

        # Dump mean/std for each combination into the report.
        f.write('Expert Test in Normal Env:\n')
        f.write(str(expert_reward_avg) + '    ' + str(expert_reward_std) + '\n')
        f.write('Learner Test in Normal Env:\n')
        f.write(str(learner_reward_avg) + '    ' + str(learner_reward_std) + '\n')
        f.write('Expert Test in Hard Env:\n')
        f.write(
            str(hard_expert_reward_avg) + '    ' + str(hard_expert_reward_std) +
            '\n')
        f.write('Learner Test in Hard Env:\n')
        f.write(
            str(hard_learner_reward_avg) + '    ' + str(hard_learner_reward_std) +
            '\n')
        f.write('Learner Training  History:\n')
        for i in range(TRAIN_EPOCHS):
            f.write(
                str(history.losses[i]) + '    ' + str(history.accues[i]) + '\n')

        # Per-episode rewards, semicolon-separated: expert;learner;hard_expert;hard_learner
        f.write('Evaluate History:\n')
        for i in range(100):
            f.write(
                str(expert_reward_summary[i]) + ';' +
                str(learner_reward_summary[i]) + ';' +
                str(hard_expert_reward_summary[i]) + ';' +
                str(hard_learner_reward_summary[i]) + '\n')
예제 #7
0
def _expert_onehot_labels(expert, states):
    """Query the expert for actions on `states`; return them one-hot encoded.

    Vectorized replacement for the original element-wise fill loop:
    row-indexing an identity matrix builds the (N, 2) one-hot array in a
    single C-level operation with the same float64 dtype as np.zeros.
    """
    q_values = expert.predict(states)
    labels = np.argmax(q_values, axis=1)
    return np.eye(2)[labels]


def run_Q2(env, env_hard, EXPERT_EPISODES, TRAIN_EPOCHS, folder_path):
    """DAgger-style training: iteratively relabel learner rollouts with the expert.

    Parameters
    ----------
    env : gym.Env
        Vanilla CartPole environment.
    env_hard : gym.Env
        Harder (wrapped) CartPole environment.
    EXPERT_EPISODES : int
        Recorded in the report/file names (data is gathered 1 episode per round).
    TRAIN_EPOCHS : int
        Number of DAgger rounds (one fit epoch per round).
    folder_path : str
        Directory prefix (with trailing separator) for the report/weight files.
    """
    file_path = folder_path + 'Q2_' + str(EXPERT_EPISODES) + '_' + str(
        TRAIN_EPOCHS) + '.txt'
    # `with` guarantees the report file is closed even on exceptions
    # (the original leaked the handle if anything below raised).
    with open(file_path, 'w') as f:
        f.write('Parameters:\n')
        # Bug fix: label was misspelled 'EXPET_EPISODES' in the report file.
        f.write('EXPERT_EPISODES:' + str(EXPERT_EPISODES) + '\n')
        f.write('TRAIN_EPOCHS:' + str(TRAIN_EPOCHS) + '\n')

        # Expert has pretrained weights; learner starts from scratch.
        expert = load_model('CartPole-v0_config.yaml', 'CartPole-v0_weights.h5f')
        learner = load_model('CartPole-v0_config.yaml', None)
        adam = Adam()
        expert.compile(adam, 'binary_crossentropy', metrics=['accuracy'])
        learner.compile(adam, 'binary_crossentropy', metrics=['accuracy'])

        # Seed the dataset with one rollout of the (untrained) learner,
        # labeled by the expert.
        print('Generate initial data from learner')
        data, _ = generate_expert_training_data(learner,
                                                env,
                                                num_episodes=1,
                                                render=False)
        print('Query expert for labels ')  # fixed typo: 'Qurey'
        onehot_labels = _expert_onehot_labels(expert, data)
        print('Expert query is ready. Start to train learner with epoch num:',
              TRAIN_EPOCHS)

        history = LossHistory()
        for _ in range(TRAIN_EPOCHS):
            learner.fit(data, onehot_labels, epochs=1, callbacks=[history])
            # DAgger aggregation: roll out the *learner* for new states
            # (reusing the expert-data generator with the learner model),
            # relabel them with the expert, and grow the dataset.
            new_data, _ = generate_expert_training_data(learner,
                                                        env,
                                                        num_episodes=1,
                                                        render=False)
            print('Query expert for labels ')
            new_onehot_labels = _expert_onehot_labels(expert, new_data)
            data = np.vstack((data, new_data))
            onehot_labels = np.vstack((onehot_labels, new_onehot_labels))
            print(onehot_labels.shape)

        weights_path = folder_path + 'Q2_' + str(EXPERT_EPISODES) + '_' + str(
            TRAIN_EPOCHS) + '.h5'
        learner.save_weights(weights_path)

        # Evaluate all four (policy, environment) combinations.
        print('Test expert in normal env.........................................')
        expert_reward_summary, expert_reward_avg, expert_reward_std = test_cloned_policy(
            env, expert, num_episodes=100, render=False)
        print(
            'Test learner in normal env.........................................')
        learner_reward_summary, learner_reward_avg, learner_reward_std = test_cloned_policy(
            env, learner, num_episodes=100, render=False)

        print('Test expert in hard Env.........................................')
        hard_expert_reward_summary, hard_expert_reward_avg, hard_expert_reward_std = test_cloned_policy(
            env_hard, expert, num_episodes=100, render=False)
        print('Test learner in hard Env.........................................')
        hard_learner_reward_summary, hard_learner_reward_avg, hard_learner_reward_std = test_cloned_policy(
            env_hard, learner, num_episodes=100, render=False)

        # Dump mean/std for each combination into the report.
        f.write('Expert Test in Normal Env:\n')
        f.write(str(expert_reward_avg) + '    ' + str(expert_reward_std) + '\n')
        f.write('Learner Test in Normal Env:\n')
        f.write(str(learner_reward_avg) + '    ' + str(learner_reward_std) + '\n')
        f.write('Expert Test in Hard Env:\n')
        f.write(
            str(hard_expert_reward_avg) + '    ' + str(hard_expert_reward_std) +
            '\n')
        f.write('Learner Test in Hard Env:\n')
        f.write(
            str(hard_learner_reward_avg) + '    ' + str(hard_learner_reward_std) +
            '\n')
        f.write('Learner Training  History:\n')
        for i in range(TRAIN_EPOCHS):
            f.write(
                str(history.losses[i]) + '    ' + str(history.accues[i]) + '\n')

        # Per-episode rewards, space-separated: expert learner hard_expert hard_learner
        f.write('Evaluate History:\n')
        for i in range(100):
            f.write(
                str(expert_reward_summary[i]) + '    ' +
                str(learner_reward_summary[i]) + '    ' +
                str(hard_expert_reward_summary[i]) + '    ' +
                str(hard_learner_reward_summary[i]) + '\n')