def test_policy(num_episodes):
    """Fit an imitation policy on expert rollouts and measure how it performs.

    Parameters
    ----------
    num_episodes: int
      Number of expert episodes used to build the imitation data set.

    Returns
    -------
    final loss, final accuracy, mean reward, reward std,
    wrapper mean reward, wrapper reward std
    """
    with tf.Session():
        # The expert is loaded with the weights file; the learner is built
        # from the same config without weights and compiled for training.
        expert = imit.load_model('CartPole-v0_config.yaml',
                                 'CartPole-v0_weights.h5f')
        learner = imit.load_model('CartPole-v0_config.yaml')
        learner.compile(optimizer='adam',
                        loss='binary_crossentropy',
                        metrics=['accuracy'])

        base_env = gym.make('CartPole-v0')

        # Roll out the expert to collect (state, action) training pairs.
        states, actions = imit.generate_expert_training_data(
            expert, base_env, num_episodes=num_episodes, render=False)

        fit_log = learner.fit(states, actions, epochs=50, verbose=2).history

        # Metrics reported by Keras for the last training epoch.
        last_loss = fit_log['loss'][-1]
        last_accuracy = fit_log['acc'][-1]

        # Evaluate on the plain environment first, then on the wrapped one.
        base_rewards = imit.test_cloned_policy(base_env, learner, render=False)
        base_mean, base_std = np.mean(base_rewards), np.std(base_rewards)

        wrapped_env = imit.wrap_cartpole(base_env)
        wrapped_rewards = imit.test_cloned_policy(
            wrapped_env, learner, render=False)
        wrapped_mean = np.mean(wrapped_rewards)
        wrapped_std = np.std(wrapped_rewards)

        return (last_loss, last_accuracy,
                base_mean, base_std,
                wrapped_mean, wrapped_std)
def main():
    """Clone the CartPole expert and run both expert and clone for 5 episodes.

    Collects 100 rendered expert episodes, fits a freshly built model on
    them for 50 epochs, then evaluates the expert followed by the clone on
    the same environment.
    """
    config_path = "CartPole-v0_config.yaml"
    weights_path = "CartPole-v0_weights.h5f"

    # The unwrapped environment is used throughout; the wrap_cartpole(env)
    # variant is intentionally left disabled here.
    env = gym.make('CartPole-v0')

    # The clone is built from the config alone; the expert also loads weights.
    learner = load_model(model_config_path=config_path)
    expert = load_model(model_config_path=config_path,
                        model_weights_path=weights_path)

    states, actions = generate_expert_training_data(
        expert, env, num_episodes=100, render=True)

    learner.compile(keras.optimizers.Adam(),
                    loss='binary_crossentropy',
                    metrics=['accuracy'])
    learner.fit(states, actions, epochs=50)

    # Expert first, then the clone, each without rendering.
    for policy in (expert, learner):
        test_cloned_policy(env, policy, num_episodes=5, render=False)
def test_dagger(filename='imitation_output.txt', dataname='dagger_data.csv'):
    """Get metrics for DAGGER algorithm.

    Gets necessary data to answer q1 and q2 in the extra credit portion
    (DAGGER) in Question 2.

    Parameters
    ----------
    filename: str
      Name of file to append DAGGER performance on wrapper environment to.
    dataname: str
      Name of file to write evaluation data on base env to. Any existing
      file of that name is overwritten.
    """
    with tf.Session():
        # Load expert
        expert = imit.load_model('CartPole-v0_config.yaml',
                                 'CartPole-v0_weights.h5f')

        # Initialize environments: a plain one and a separately created
        # instance passed through the wrapper for evaluation.
        env = gym.make('CartPole-v0')
        eval_env = imit.wrap_cartpole(gym.make('CartPole-v0'))

        # Initialize policy model.
        policy = imit.load_model('CartPole-v0_config.yaml')
        policy.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['accuracy'])

        # Run DAGGER
        mean_rewards, min_rewards, max_rewards = imit.dagger(
            expert, policy, env, eval_env)

        # Test on wrapper environment.
        rewards = imit.test_cloned_policy(eval_env, policy, render=False)
        hard_mean = np.mean(rewards)
        hard_std = np.std(rewards)

        # Append the summary; `with` guarantees the handle is closed even if
        # the write raises.
        with open(filename, 'a+') as f:
            f.write(DAGGER_OUTPUT % (hard_mean, hard_std))

        # Build the CSV as a list of lines instead of repeated string
        # concatenation. zip stops at the shortest of the three sequences.
        csv_lines = ["Mean,Min,Max\n"]
        csv_lines.extend(
            "%.4f,%.4f,%.4f\n" % stats
            for stats in zip(mean_rewards, min_rewards, max_rewards))

        # write data to file
        with open(dataname, 'w') as f:
            f.writelines(csv_lines)
def evaluate_expert():
    """Run the expert model on the wrapped CartPole environment.

    Return
    ------
    mean(rewards), std(rewards)
    """
    with tf.Session():
        # Build the environment and pass it through the wrapper before the
        # expert is loaded, matching the original construction order.
        wrapped_env = imit.wrap_cartpole(gym.make('CartPole-v0'))
        expert = imit.load_model('CartPole-v0_config.yaml',
                                 'CartPole-v0_weights.h5f')

        episode_rewards = imit.test_cloned_policy(
            wrapped_env, expert, render=False)
        return np.mean(episode_rewards), np.std(episode_rewards)
def callback(iteration, reward, model):
    """Log evaluation statistics for `model` every 10th iteration.

    Reads `output` (path of the CSV file) and `env` (evaluation
    environment) from an enclosing scope that is not part of this block.

    Parameters
    ----------
    iteration: int
      Current training iteration. On iteration 0 the CSV header is written;
      on every multiple of 10 (including 0) one data row is appended.
    reward:
      Unused; accepted so the signature matches what the caller passes.
    model:
      Model handed to imit.test_cloned_policy for evaluation.
    """
    if iteration == 0:
        # `with` closes the handle even if the write fails.
        with open(output, 'a+') as f:
            f.write("Iteration,Mean,Min,Max\n")

    if iteration % 10 == 0:
        rewards = imit.test_cloned_policy(env, model, num_episodes=100,
                                          render=False)
        with open(output, 'a+') as f:
            f.write("%d,%.4f,%.4f,%.4f\n" % (iteration, np.mean(rewards),
                                             np.min(rewards),
                                             np.max(rewards)))
def test_reinforce(output='reinforce_data.csv'):
    """Get metrics for REINFORCE algorithm.

    Gets necessary data to answer q1 and q2 in Question 3. Besides the
    callback output, the final wrapped-environment result is printed and
    appended to "reinforce_output.txt".

    Parameters
    ----------
    output: str
      Name of file to write evaluation data on base env to.
    """
    env = gym.make('CartPole-v0')
    # The callback receives the unwrapped environment created above.
    cb = create_callback(env, output)

    with tf.Session() as sess:
        model = reinforce.reinforce(env, sess, callback=cb)

        # Final evaluation runs on the wrapped environment.
        hard_env = imit.wrap_cartpole(env)
        rewards = imit.test_cloned_policy(hard_env, model, render=False)

        # Compute the statistics once and reuse them for both outputs.
        reward_mean = np.mean(rewards)
        reward_std = np.std(rewards)
        print("Hard Reward: %.4f +/- %.4f" % (reward_mean, reward_std))

        # `with` closes the file even if the write raises.
        with open("reinforce_output.txt", 'a+') as f:
            f.write("REINFORCE:\n - Hard Reward: %.4f +/- %.4f\n" %
                    (reward_mean, reward_std))
expert, env, num_episodes=curr_num_episodes, render=False) cloned_policy = Model.from_config(expert.get_config()) cloned_policy.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy']) # print states_arr.shape, actions_arr.shape result_metrics = cloned_policy.fit(states_arr, actions_arr, batch_size=32, epochs=50) # dump metrics into various lists loss_all.append(result_metrics.history['loss'][-1]) accuracy_all.append(result_metrics.history['acc'][-1]) mean_reward_cloned_curr, std_reward_cloned_curr = imitation.test_cloned_policy( env, cloned_policy, num_episodes=50, render=False) mean_reward_clones_list.append(mean_reward_cloned_curr) std_reward_clones_list.append(std_reward_cloned_curr) mean_reward_cloned_curr_wrap, std_reward_cloned_curr_wrap = imitation.test_cloned_policy( env_wrap, cloned_policy, num_episodes=50, render=False) mean_reward_clones_wrap_list.append(mean_reward_cloned_curr_wrap) std_reward_clones_wrap_list.append(std_reward_cloned_curr_wrap) # test expert mean_reward_expert, std_reward_expert = imitation.test_cloned_policy( env, expert, num_episodes=50, render=False) mean_reward_expert_wrap, std_reward_expert_wrap = imitation.test_cloned_policy( env_wrap, expert, num_episodes=50, render=False) print "\n\nExpert stats"
env = gym.make('CartPole-v0') wrapped_env = imitation.wrap_cartpole(env) expert = imitation.load_model('CartPole-v0_config.yaml', 'CartPole-v0_weights.h5f') model_config_path = 'CartPole-v0_config.yaml' with open(model_config_path, 'r') as f: model_config_yaml = f.read() num_epochs = 50 # Behaviour cloning experiments expts = {} # Expts with expert policy expt_name = 'expert' _, mean_rewards_env, std_rewards_env = imitation.test_cloned_policy( env, expert, render=False) _, mean_rewards_wrapped_env, std_rewards_wrapped_env = imitation.test_cloned_policy( wrapped_env, expert, render=False) expts[expt_name] = { 'loss': 0, 'acc': 1, 'mean_rewards_env': mean_rewards_env, 'std_rewards_env': std_rewards_env, 'mean_rewards_wrapped_env': mean_rewards_wrapped_env, 'std_rewards_wrapped_env': std_rewards_wrapped_env } # Expts with cloning policy for num_eps in [1, 10, 50, 100]: expt_name = "clone_policy_%deps" % num_eps train_data = generate_training_data(env, expert, num_eps) cloned_policy, final_info = train_model(model_config_yaml, train_data,
"--episodes", dest="num_episodes", default=100, help="Number of episodes from expert") if __name__ == '__main__': args = cmdline.parse_args() # Problem 2. print("===== Problem 2.1 =====") obz, act = imitation.generate_expert_training_data(expert, env, num_episodes=int( args.num_episodes), render=False) model = imitation.load_model(expert_yaml) imitation.behavior_cloning(model, obz, act) print("===== Problem 2.2 =====") imitation.test_cloned_policy(env, model, render=False) print("===== Problem 2.3 =====") harder_env = imitation.wrap_cartpole(env) print("> evaluate cloned model") imitation.test_cloned_policy(harder_env, model, render=False) print("> evaluate expert model") imitation.test_cloned_policy(harder_env, expert, render=False) print("===== DAGGER =====") model = imitation.load_model(expert_yaml) imitation.dagger(env, model, expert)
def run_Q2(env, env_hard, EXPERT_EPISODES, TRAIN_EPOCHS, folder_path):
    """Train a learner on expert data and write a text report of the run.

    The learner is fit on data generated from the expert, its weights are
    saved, and expert and learner are each evaluated for 100 episodes on
    both environments.

    Parameters
    ----------
    env:
      Environment used to generate expert data and for the "normal" tests.
    env_hard:
      Environment used for the "hard" tests.
    EXPERT_EPISODES: int
      Number of expert episodes to generate training data from.
    TRAIN_EPOCHS: int
      Number of epochs to fit the learner for; also the number of training
      history rows written to the report.
    folder_path: str
      Prefix prepended verbatim to the output file names (no separator is
      inserted).

    Outputs
    -------
    <folder_path>Q2_<EXPERT_EPISODES>_<TRAIN_EPOCHS>.txt : report
    <folder_path>Q2_<EXPERT_EPISODES>_<TRAIN_EPOCHS>.h5  : learner weights
    """
    num_eval_episodes = 100
    run_prefix = (folder_path + 'Q2_' + str(EXPERT_EPISODES) + '_' +
                  str(TRAIN_EPOCHS))
    file_path = run_prefix + '.txt'

    # The report is opened up front (as before) so the parameters are on
    # disk early; `with` guarantees it is closed if anything below raises.
    with open(file_path, 'w') as f:
        f.write('Parameters:\n')
        # NOTE: the 'EXPET_EPISODES' spelling is part of the existing report
        # format and is kept unchanged.
        f.write('EXPET_EPISODES:' + str(EXPERT_EPISODES) + '\n')
        f.write('TRAIN_EPOCHS:' + str(TRAIN_EPOCHS) + '\n')

        expert = load_model('CartPole-v0_config.yaml',
                            'CartPole-v0_weights.h5f')
        learner = load_model('CartPole-v0_config.yaml', None)
        adam = Adam()
        expert.compile(adam, 'binary_crossentropy', metrics=['accuracy'])
        learner.compile(adam, 'binary_crossentropy', metrics=['accuracy'])

        print('Prepare expert data with episodes num:', EXPERT_EPISODES)
        expert_states, expert_actions = generate_expert_training_data(
            expert, env, num_episodes=EXPERT_EPISODES, render=False)

        print('Expert data is ready. Start to train learner with epoch num:',
              TRAIN_EPOCHS)
        history = LossHistory()
        learner.fit(expert_states, expert_actions, epochs=TRAIN_EPOCHS,
                    callbacks=[history])

        learner.save_weights(run_prefix + '.h5')

        print('Test expert in normal env.........................................')
        expert_reward_summary, expert_reward_avg, expert_reward_std = \
            test_cloned_policy(env, expert, num_episodes=num_eval_episodes,
                               render=False)

        print('Test learner in normal env.........................................')
        learner_reward_summary, learner_reward_avg, learner_reward_std = \
            test_cloned_policy(env, learner, num_episodes=num_eval_episodes,
                               render=False)

        print('Test expert in hard Env.........................................')
        (hard_expert_reward_summary, hard_expert_reward_avg,
         hard_expert_reward_std) = test_cloned_policy(
             env_hard, expert, num_episodes=num_eval_episodes, render=False)

        print('Test learner in hard Env.........................................')
        (hard_learner_reward_summary, hard_learner_reward_avg,
         hard_learner_reward_std) = test_cloned_policy(
             env_hard, learner, num_episodes=num_eval_episodes, render=False)

        f.write('Expert Test in Normal Env:\n')
        f.write(str(expert_reward_avg) + ' ' + str(expert_reward_std) + '\n')
        f.write('Learner Test in Normal Env:\n')
        f.write(str(learner_reward_avg) + ' ' + str(learner_reward_std) + '\n')
        f.write('Expert Test in Hard Env:\n')
        f.write(str(hard_expert_reward_avg) + ' ' +
                str(hard_expert_reward_std) + '\n')
        f.write('Learner Test in Hard Env:\n')
        f.write(str(hard_learner_reward_avg) + ' ' +
                str(hard_learner_reward_std) + '\n')

        # One "loss accuracy" row per training epoch.
        f.write('Learner Training History:\n')
        for i in range(TRAIN_EPOCHS):
            f.write(str(history.losses[i]) + ' ' + str(history.accues[i]) +
                    '\n')

        # One ';'-separated row per evaluation episode.
        f.write('Evaluate History:\n')
        for i in range(num_eval_episodes):
            f.write(str(expert_reward_summary[i]) + ';' +
                    str(learner_reward_summary[i]) + ';' +
                    str(hard_expert_reward_summary[i]) + ';' +
                    str(hard_learner_reward_summary[i]) + '\n')
def _expert_onehot_labels(expert, states):
    """Return one-hot rows (two columns) marking the expert's argmax output."""
    chosen = np.argmax(expert.predict(states), axis=1)
    onehot = np.zeros((chosen.shape[0], 2))
    # Set column `chosen[i]` of row i to 1 for every row in one indexing step.
    onehot[np.arange(chosen.shape[0]), chosen] = 1
    return onehot


def run_Q2(env, env_hard, EXPERT_EPISODES, TRAIN_EPOCHS, folder_path):
    """Train a learner with expert-relabelled learner rollouts and report.

    Each round fits the learner for one epoch on the aggregated data set,
    then generates one more episode with the learner, labels those states
    with the expert's argmax output, and appends them to the data set.
    Afterwards the weights are saved and expert and learner are each
    evaluated for 100 episodes on both environments.

    Parameters
    ----------
    env:
      Environment used to generate learner data and for the "normal" tests.
    env_hard:
      Environment used for the "hard" tests.
    EXPERT_EPISODES: int
      Only used in the output file names and the report header.
    TRAIN_EPOCHS: int
      Number of train-then-aggregate rounds; also the number of training
      history rows written to the report.
    folder_path: str
      Prefix prepended verbatim to the output file names (no separator is
      inserted).

    Outputs
    -------
    <folder_path>Q2_<EXPERT_EPISODES>_<TRAIN_EPOCHS>.txt : report
    <folder_path>Q2_<EXPERT_EPISODES>_<TRAIN_EPOCHS>.h5  : learner weights
    """
    num_eval_episodes = 100
    run_prefix = (folder_path + 'Q2_' + str(EXPERT_EPISODES) + '_' +
                  str(TRAIN_EPOCHS))
    file_path = run_prefix + '.txt'

    # The report is opened up front (as before) so the parameters are on
    # disk early; `with` guarantees it is closed if anything below raises.
    with open(file_path, 'w') as f:
        f.write('Parameters:\n')
        # NOTE: the 'EXPET_EPISODES' spelling is part of the existing report
        # format and is kept unchanged.
        f.write('EXPET_EPISODES:' + str(EXPERT_EPISODES) + '\n')
        f.write('TRAIN_EPOCHS:' + str(TRAIN_EPOCHS) + '\n')

        expert = load_model('CartPole-v0_config.yaml',
                            'CartPole-v0_weights.h5f')
        learner = load_model('CartPole-v0_config.yaml', None)
        adam = Adam()
        expert.compile(adam, 'binary_crossentropy', metrics=['accuracy'])
        learner.compile(adam, 'binary_crossentropy', metrics=['accuracy'])

        # The data-generation helper is driven by the learner here, so the
        # visited states come from the learner; its returned actions are
        # discarded and replaced by expert labels.
        print('Generate initial data from learner')
        data, _ = generate_expert_training_data(learner, env, num_episodes=1,
                                                render=False)
        print('Qurey expert for labels ')
        onehot_labels = _expert_onehot_labels(expert, data)

        print('Expert qurey is ready. Start to train learner with epoch num:',
              TRAIN_EPOCHS)
        history = LossHistory()
        for _ in range(TRAIN_EPOCHS):
            learner.fit(data, onehot_labels, epochs=1, callbacks=[history])

            # Generate one more learner episode and label it with the expert.
            new_data, _ = generate_expert_training_data(
                learner, env, num_episodes=1, render=False)
            print('Qurey expert for labels ')
            new_onehot_labels = _expert_onehot_labels(expert, new_data)

            # Aggregate: the data set only ever grows.
            data = np.vstack((data, new_data))
            onehot_labels = np.vstack((onehot_labels, new_onehot_labels))
            print(onehot_labels.shape)

        learner.save_weights(run_prefix + '.h5')

        print('Test expert in normal env.........................................')
        expert_reward_summary, expert_reward_avg, expert_reward_std = \
            test_cloned_policy(env, expert, num_episodes=num_eval_episodes,
                               render=False)

        print('Test learner in normal env.........................................')
        learner_reward_summary, learner_reward_avg, learner_reward_std = \
            test_cloned_policy(env, learner, num_episodes=num_eval_episodes,
                               render=False)

        print('Test expert in hard Env.........................................')
        (hard_expert_reward_summary, hard_expert_reward_avg,
         hard_expert_reward_std) = test_cloned_policy(
             env_hard, expert, num_episodes=num_eval_episodes, render=False)

        print('Test learner in hard Env.........................................')
        (hard_learner_reward_summary, hard_learner_reward_avg,
         hard_learner_reward_std) = test_cloned_policy(
             env_hard, learner, num_episodes=num_eval_episodes, render=False)

        f.write('Expert Test in Normal Env:\n')
        f.write(str(expert_reward_avg) + ' ' + str(expert_reward_std) + '\n')
        f.write('Learner Test in Normal Env:\n')
        f.write(str(learner_reward_avg) + ' ' + str(learner_reward_std) + '\n')
        f.write('Expert Test in Hard Env:\n')
        f.write(str(hard_expert_reward_avg) + ' ' +
                str(hard_expert_reward_std) + '\n')
        f.write('Learner Test in Hard Env:\n')
        f.write(str(hard_learner_reward_avg) + ' ' +
                str(hard_learner_reward_std) + '\n')

        # One "loss accuracy" row per training round.
        f.write('Learner Training History:\n')
        for i in range(TRAIN_EPOCHS):
            f.write(str(history.losses[i]) + ' ' + str(history.accues[i]) +
                    '\n')

        # One space-separated row per evaluation episode.
        f.write('Evaluate History:\n')
        for i in range(num_eval_episodes):
            f.write(str(expert_reward_summary[i]) + ' ' +
                    str(learner_reward_summary[i]) + ' ' +
                    str(hard_expert_reward_summary[i]) + ' ' +
                    str(hard_learner_reward_summary[i]) + '\n')