def test_policy(num_episodes):
    """Train and test imitation-based policy.

    An untrained copy of the CartPole network is fitted for 50 epochs on
    state/action pairs produced by the expert, then evaluated on the plain
    environment and on the environment returned by ``imit.wrap_cartpole``.

    Parameters
    ----------
    num_episodes: int
        Number of episodes to generate data for imitation policy.

    Returns
    -------
    final loss, final accuracy, mean reward, reward std,
    wrapper mean reward, wrapper reward std
    """
    # The session is only entered so that it is the default session for
    # the Keras calls below; the handle itself is never used.
    with tf.Session():
        # load expert and policy model (the policy gets no weights file)
        expert = imit.load_model('CartPole-v0_config.yaml',
                                 'CartPole-v0_weights.h5f')
        policy = imit.load_model('CartPole-v0_config.yaml')
        policy.compile(optimizer='adam', loss='binary_crossentropy',
                       metrics=['accuracy'])

        # initialize environment
        env = gym.make('CartPole-v0')

        # generate data from expert
        states, actions = imit.generate_expert_training_data(
            expert, env, num_episodes=num_episodes, render=False)

        # train policy
        history = policy.fit(states, actions, epochs=50, verbose=2)

        # get performance values
        metrics = history.history
        final_loss = metrics['loss'][-1]
        # Older Keras reports metrics=['accuracy'] under 'acc'; Keras 2.3+
        # reports it under 'accuracy'. Accept whichever key is present.
        accuracy_key = 'acc' if 'acc' in metrics else 'accuracy'
        final_accuracy = metrics[accuracy_key][-1]

        rewards = imit.test_cloned_policy(env, policy, render=False)
        mean = np.mean(rewards)
        std = np.std(rewards)

        # Re-evaluate the same trained policy on the wrapped environment.
        env = imit.wrap_cartpole(env)
        hard_rewards = imit.test_cloned_policy(env, policy, render=False)
        hard_mean = np.mean(hard_rewards)
        hard_std = np.std(hard_rewards)

        return final_loss, final_accuracy, mean, std, hard_mean, hard_std
def main():
    """Clone the CartPole expert by supervised learning and run both policies.

    Steps, in order: build the environment, load the network twice from the
    same config (once without and once with the expert weights file), record
    100 rendered expert episodes, fit the weight-less copy on them for 50
    epochs, then run 5 unrendered evaluation episodes for the expert and for
    the clone. The evaluation results are not returned or stored here.
    """
    config_file = "CartPole-v0_config.yaml"
    expert_weights_file = "CartPole-v0_weights.h5f"

    cartpole = gym.make('CartPole-v0')

    # Same architecture for both; only the teacher is given a weights file.
    student = load_model(model_config_path=config_file)
    teacher = load_model(model_config_path=config_file,
                         model_weights_path=expert_weights_file)

    demo_states, demo_actions = generate_expert_training_data(
        teacher, cartpole, num_episodes=100, render=True)

    student.compile(keras.optimizers.Adam(),
                    loss='binary_crossentropy',
                    metrics=['accuracy'])
    student.fit(demo_states, demo_actions, epochs=50)

    # Expert first, then the clone, on the same environment.
    for candidate in (teacher, student):
        test_cloned_policy(cartpole, candidate, num_episodes=5, render=False)
expert = imitation.load_model('CartPole-v0_config.yaml', 'CartPole-v0_weights.h5f') # test_cloned_policy(env, cloned_policy) episode_length_list = [1, 10, 50, 100] loss_all, accuracy_all = [], [] mean_reward_clones_list, mean_reward_clones_wrap_list = [], [] std_reward_clones_list, std_reward_clones_wrap_list = [], [] for curr_num_episodes in episode_length_list: str_1 = "Imitator with number of episodes = {}".format( curr_num_episodes) msg = "\n%s\n" % (LINE) + "%s%s\n" % (BOLD, str_1) + "%s\n" % (LINE) print(str(msg)) # train on vanilla env states_arr, actions_arr = imitation.generate_expert_training_data( expert, env, num_episodes=curr_num_episodes, render=False) cloned_policy = Model.from_config(expert.get_config()) cloned_policy.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy']) # print states_arr.shape, actions_arr.shape result_metrics = cloned_policy.fit(states_arr, actions_arr, batch_size=32, epochs=50) # dump metrics into various lists loss_all.append(result_metrics.history['loss'][-1]) accuracy_all.append(result_metrics.history['acc'][-1]) mean_reward_cloned_curr, std_reward_cloned_curr = imitation.test_cloned_policy(
def generate_training_data(env, expert, num_episodes):
    """Collect expert demonstrations with rendering switched off.

    Thin adapter around ``imitation.generate_expert_training_data``. Note the
    argument order: this function takes ``env`` first, while the wrapped call
    receives ``expert`` first.

    Parameters
    ----------
    env
        Environment the expert is rolled out in.
    expert
        Model used to produce the demonstrations.
    num_episodes
        Number of episodes forwarded to the wrapped call.

    Returns
    -------
    Whatever ``imitation.generate_expert_training_data`` returns, unchanged.
    """
    demonstrations = imitation.generate_expert_training_data(
        expert, env, num_episodes, render=False)
    return demonstrations
# Module-level setup: the expert model, the environment and the argument
# parser are created at import time and shared by the script body below.
expert = imitation.load_model(expert_yaml, expert_h5f)
env = gym.make('CartPole-v0')

cmdline = argparse.ArgumentParser()
# type=int lets argparse validate the value and report a usage error for
# non-numeric input, instead of failing later with a ValueError traceback.
cmdline.add_argument("-e", "--episodes", dest="num_episodes", type=int,
                     default=100, help="Number of episodes from expert")

if __name__ == '__main__':
    args = cmdline.parse_args()

    # Problem 2.
    print("===== Problem 2.1 =====")
    # Record expert rollouts and fit a model loaded without a weights file.
    obz, act = imitation.generate_expert_training_data(
        expert, env, num_episodes=args.num_episodes, render=False)
    model = imitation.load_model(expert_yaml)
    imitation.behavior_cloning(model, obz, act)

    print("===== Problem 2.2 =====")
    imitation.test_cloned_policy(env, model, render=False)

    print("===== Problem 2.3 =====")
    # Evaluate both the clone and the expert on the wrapped environment.
    harder_env = imitation.wrap_cartpole(env)
    print("> evaluate cloned model")
    imitation.test_cloned_policy(harder_env, model, render=False)
    print("> evaluate expert model")
    imitation.test_cloned_policy(harder_env, expert, render=False)
def run_Q2(env, env_hard, EXPERT_EPISODES, TRAIN_EPOCHS, folder_path):
    """Run one behaviour-cloning experiment and write a text report.

    The learner is fitted on ``EXPERT_EPISODES`` episodes of expert data for
    ``TRAIN_EPOCHS`` epochs, its weights are saved, and both expert and
    learner are evaluated for 100 episodes on ``env`` and on ``env_hard``.

    Parameters
    ----------
    env
        Environment used for data generation and the "normal" evaluation.
    env_hard
        Environment used for the "hard" evaluation.
    EXPERT_EPISODES : int
        Number of expert episodes used as training data.
    TRAIN_EPOCHS : int
        Number of epochs passed to ``learner.fit``; also the number of
        training-history rows written to the report.
    folder_path : str
        Output location. It is concatenated directly with the file name, so
        it must already end with a path separator.

    Side effects
    ------------
    Writes ``Q2_<EXPERT_EPISODES>_<TRAIN_EPOCHS>.txt`` (report) and
    ``Q2_<EXPERT_EPISODES>_<TRAIN_EPOCHS>.h5`` (learner weights) under
    ``folder_path``.
    """
    run_tag = 'Q2_' + str(EXPERT_EPISODES) + '_' + str(TRAIN_EPOCHS)
    file_path = folder_path + run_tag + '.txt'
    weights_path = folder_path + run_tag + '.h5'
    # Evaluation length; also the number of per-episode rows in the report.
    eval_episodes = 100

    # The context manager closes the report even when training or
    # evaluation raises; previously the handle leaked on any exception.
    with open(file_path, 'w') as f:
        f.write('Parameters:\n')
        # The misspelled 'EXPET_EPISODES' label is kept on purpose so that
        # reports stay identical to those written by earlier runs.
        f.write('EXPET_EPISODES:' + str(EXPERT_EPISODES) + '\n')
        f.write('TRAIN_EPOCHS:' + str(TRAIN_EPOCHS) + '\n')

        # The learner is loaded from the same config but without weights.
        expert = load_model('CartPole-v0_config.yaml',
                            'CartPole-v0_weights.h5f')
        learner = load_model('CartPole-v0_config.yaml', None)
        adam = Adam()
        expert.compile(adam, 'binary_crossentropy', metrics=['accuracy'])
        learner.compile(adam, 'binary_crossentropy', metrics=['accuracy'])

        print('Prepare expert data with episodes num:', EXPERT_EPISODES)
        expert_states, expert_actions = generate_expert_training_data(
            expert, env, num_episodes=EXPERT_EPISODES, render=False)

        print('Expert data is ready. Start to train learner with epoch num:',
              TRAIN_EPOCHS)
        history = LossHistory()
        learner.fit(expert_states, expert_actions, epochs=TRAIN_EPOCHS,
                    callbacks=[history])
        learner.save_weights(weights_path)

        # Each evaluation yields (per-episode rewards, average, std).
        print('Test expert in normal env.........................................')
        expert_reward_summary, expert_reward_avg, expert_reward_std = \
            test_cloned_policy(env, expert, num_episodes=eval_episodes,
                               render=False)
        print('Test learner in normal env.........................................')
        learner_reward_summary, learner_reward_avg, learner_reward_std = \
            test_cloned_policy(env, learner, num_episodes=eval_episodes,
                               render=False)
        print('Test expert in hard Env.........................................')
        hard_expert_reward_summary, hard_expert_reward_avg, hard_expert_reward_std = \
            test_cloned_policy(env_hard, expert, num_episodes=eval_episodes,
                               render=False)
        print('Test learner in hard Env.........................................')
        hard_learner_reward_summary, hard_learner_reward_avg, hard_learner_reward_std = \
            test_cloned_policy(env_hard, learner, num_episodes=eval_episodes,
                               render=False)

        f.write('Expert Test in Normal Env:\n')
        f.write(str(expert_reward_avg) + ' ' + str(expert_reward_std) + '\n')
        f.write('Learner Test in Normal Env:\n')
        f.write(str(learner_reward_avg) + ' ' + str(learner_reward_std) + '\n')
        f.write('Expert Test in Hard Env:\n')
        f.write(str(hard_expert_reward_avg) + ' '
                + str(hard_expert_reward_std) + '\n')
        f.write('Learner Test in Hard Env:\n')
        f.write(str(hard_learner_reward_avg) + ' '
                + str(hard_learner_reward_std) + '\n')

        # One row per epoch: the values recorded by the LossHistory callback.
        f.write('Learner Training History:\n')
        for i in range(TRAIN_EPOCHS):
            f.write(str(history.losses[i]) + ' '
                    + str(history.accues[i]) + '\n')

        # One ';'-separated row per evaluation episode.
        f.write('Evaluate History:\n')
        for i in range(eval_episodes):
            f.write(str(expert_reward_summary[i]) + ';'
                    + str(learner_reward_summary[i]) + ';'
                    + str(hard_expert_reward_summary[i]) + ';'
                    + str(hard_learner_reward_summary[i]) + '\n')
def _label_states_with_expert(expert, states):
    """Return one-hot expert labels (two columns) for a batch of states.

    The label for each row is the argmax over axis 1 of ``expert.predict``.
    """
    print('Qurey expert for labels ')
    labels = np.argmax(expert.predict(states), axis=1)
    onehot = np.zeros((labels.shape[0], 2))
    # Vectorised equivalent of setting onehot[i, labels[i]] = 1 row by row.
    onehot[np.arange(labels.shape[0]), labels] = 1
    return onehot


def run_Q2(env, env_hard, EXPERT_EPISODES, TRAIN_EPOCHS, folder_path):
    """Train the learner on its own rollouts labelled by the expert.

    Each of the ``TRAIN_EPOCHS`` iterations fits the learner for one epoch on
    the aggregated dataset, rolls the *learner* out for one more episode,
    asks the expert for the action in every visited state, and appends the
    new pairs to the dataset. Afterwards the learner weights are saved and
    both models are evaluated for 100 episodes on ``env`` and ``env_hard``.

    Parameters
    ----------
    env
        Environment used for rollouts and the "normal" evaluation.
    env_hard
        Environment used for the "hard" evaluation.
    EXPERT_EPISODES : int
        Only used in the output file names and the report header here.
    TRAIN_EPOCHS : int
        Number of fit/aggregate iterations; also the number of
        training-history rows written to the report.
    folder_path : str
        Output location. It is concatenated directly with the file name, so
        it must already end with a path separator.

    Side effects
    ------------
    Writes ``Q2_<EXPERT_EPISODES>_<TRAIN_EPOCHS>.txt`` (report) and
    ``Q2_<EXPERT_EPISODES>_<TRAIN_EPOCHS>.h5`` (learner weights) under
    ``folder_path``.
    """
    run_tag = 'Q2_' + str(EXPERT_EPISODES) + '_' + str(TRAIN_EPOCHS)
    file_path = folder_path + run_tag + '.txt'
    weights_path = folder_path + run_tag + '.h5'
    # Evaluation length; also the number of per-episode rows in the report.
    eval_episodes = 100

    # The context manager closes the report even when training or
    # evaluation raises; previously the handle leaked on any exception.
    with open(file_path, 'w') as f:
        f.write('Parameters:\n')
        # The misspelled 'EXPET_EPISODES' label is kept on purpose so that
        # reports stay identical to those written by earlier runs.
        f.write('EXPET_EPISODES:' + str(EXPERT_EPISODES) + '\n')
        f.write('TRAIN_EPOCHS:' + str(TRAIN_EPOCHS) + '\n')

        # The learner is loaded from the same config but without weights.
        expert = load_model('CartPole-v0_config.yaml',
                            'CartPole-v0_weights.h5f')
        learner = load_model('CartPole-v0_config.yaml', None)
        adam = Adam()
        expert.compile(adam, 'binary_crossentropy', metrics=['accuracy'])
        learner.compile(adam, 'binary_crossentropy', metrics=['accuracy'])

        # The rollout helper is reused with the learner as the acting
        # model; its own action output is discarded in favour of the
        # expert's labels.
        print('Generate initial data from learner')
        data, _ = generate_expert_training_data(learner, env, num_episodes=1,
                                                render=False)
        onehot_labels = _label_states_with_expert(expert, data)

        print('Expert qurey is ready. Start to train learner with epoch num:',
              TRAIN_EPOCHS)
        history = LossHistory()
        for _ in range(TRAIN_EPOCHS):
            learner.fit(data, onehot_labels, epochs=1, callbacks=[history])

            # Roll out the freshly updated learner and relabel with the
            # expert, then grow the aggregated dataset.
            new_data, _ = generate_expert_training_data(
                learner, env, num_episodes=1, render=False)
            new_onehot_labels = _label_states_with_expert(expert, new_data)
            data = np.vstack((data, new_data))
            onehot_labels = np.vstack((onehot_labels, new_onehot_labels))
            print(onehot_labels.shape)

        learner.save_weights(weights_path)

        # Each evaluation yields (per-episode rewards, average, std).
        print('Test expert in normal env.........................................')
        expert_reward_summary, expert_reward_avg, expert_reward_std = \
            test_cloned_policy(env, expert, num_episodes=eval_episodes,
                               render=False)
        print('Test learner in normal env.........................................')
        learner_reward_summary, learner_reward_avg, learner_reward_std = \
            test_cloned_policy(env, learner, num_episodes=eval_episodes,
                               render=False)
        print('Test expert in hard Env.........................................')
        hard_expert_reward_summary, hard_expert_reward_avg, hard_expert_reward_std = \
            test_cloned_policy(env_hard, expert, num_episodes=eval_episodes,
                               render=False)
        print('Test learner in hard Env.........................................')
        hard_learner_reward_summary, hard_learner_reward_avg, hard_learner_reward_std = \
            test_cloned_policy(env_hard, learner, num_episodes=eval_episodes,
                               render=False)

        f.write('Expert Test in Normal Env:\n')
        f.write(str(expert_reward_avg) + ' ' + str(expert_reward_std) + '\n')
        f.write('Learner Test in Normal Env:\n')
        f.write(str(learner_reward_avg) + ' ' + str(learner_reward_std) + '\n')
        f.write('Expert Test in Hard Env:\n')
        f.write(str(hard_expert_reward_avg) + ' '
                + str(hard_expert_reward_std) + '\n')
        f.write('Learner Test in Hard Env:\n')
        f.write(str(hard_learner_reward_avg) + ' '
                + str(hard_learner_reward_std) + '\n')

        # One row per iteration: values recorded by the LossHistory callback.
        f.write('Learner Training History:\n')
        for i in range(TRAIN_EPOCHS):
            f.write(str(history.losses[i]) + ' '
                    + str(history.accues[i]) + '\n')

        # One space-separated row per evaluation episode.
        f.write('Evaluate History:\n')
        for i in range(eval_episodes):
            f.write(str(expert_reward_summary[i]) + ' '
                    + str(learner_reward_summary[i]) + ' '
                    + str(hard_expert_reward_summary[i]) + ' '
                    + str(hard_learner_reward_summary[i]) + '\n')