def evaluate_layout_loss_for_pbt_models(pbt_model_paths, layout_name, trajs, eps, seeds, best=True):
    layout_losses = defaultdict(dict)
    pbt_save_dir = PBT_DATA_DIR + pbt_model_paths[layout_name] + "/"
    pbt_config = load_dict_from_txt(pbt_save_dir + "config")

    for seed in seeds:
        reset_tf()
        agent_pbt = get_pbt_agent_from_config(pbt_save_dir, pbt_config["sim_threads"], seed=seed, agent_idx=0, best=best)
        agent_pbt.action_probs = True
        agent_pbt.set_mdp(OvercookedGridworld.from_layout_name(**pbt_config["mdp_params"]))

        losses, accuracies = get_trajs_losses_for_model(trajs, agent_pbt, eps)
        layout_losses["{}_seed{}".format(layout_name, seed)]['losses'] = losses
        layout_losses["{}_seed{}".format(layout_name, seed)]['accuracies'] = accuracies
    return layout_losses
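
# Hedged usage sketch for the evaluator above. The model-path entry, eps value,
# and seeds below are hypothetical placeholders; `trajs` would come from saved
# human trajectories, as elsewhere in this codebase.
def example_evaluate_pbt_layout_loss(trajs):
    pbt_model_paths = {"simple": "pbt_simple"}  # hypothetical dir under PBT_DATA_DIR
    layout_losses = evaluate_layout_loss_for_pbt_models(
        pbt_model_paths, "simple", trajs, eps=0.1, seeds=[0, 1], best=True)
    for run_name, metrics in layout_losses.items():
        print(run_name, np.mean(metrics['losses']), np.mean(metrics['accuracies']))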
def test_running_ppo_bc_train(self):
    # Check that the model exists and has the right params
    layout_name = 'simple'
    best_bc_model_paths = load_pickle(BEST_BC_MODELS_PATH)
    bc_model_path = best_bc_model_paths["train"][layout_name]
    print("LOADING BC MODEL FROM: {}".format(bc_model_path))
    _, bc_params = get_bc_agent_from_saved(bc_model_path)

    expected_bc_params = {
        'data_params': {
            'train_mdps': ['simple'],
            'ordered_trajs': True,
            'human_ai_trajs': False,
            'data_path': 'data/human/clean_train_trials.pkl'
        },
        'mdp_params': {'layout_name': 'simple', 'start_order_list': None},
        'env_params': {'horizon': 400},
        'mdp_fn_params': {}
    }
    self.assertDictEqual(expected_bc_params, bc_params)

    # Run twice with the same seed and compare output dicts. Not done as in
    # test_running_ppo_sp because of the additional dependency on the human model.
    reset_tf()
    run = ex_ppo.run(config_updates={'LOCAL_TESTING': True, 'layout_name': layout_name,
                                     'OTHER_AGENT_TYPE': 'bc_train', 'SEEDS': [10]})
    train_info0 = run.result[0]

    reset_tf()
    run = ex_ppo.run(config_updates={'LOCAL_TESTING': True, 'layout_name': layout_name,
                                     'OTHER_AGENT_TYPE': 'bc_train', 'SEEDS': [10]})
    train_info1 = run.result[0]
    self.assertDictEqual(train_info0, train_info1)

    # Uncomment to make the current output the standard output to check against
    # save_pickle(train_info1, 'data/testing/ppo_bc_train_info')
    expected_dict = load_pickle('data/testing/ppo_bc_train_info')
    for k, v in train_info1.items():
        for found_item, expected_item in zip(v, expected_dict[k]):
            self.assertAlmostEqual(found_item, expected_item, places=5)
def train_bc_models(all_params, seeds):
    """Train len(seeds) models (one per seed) for each layout"""
    for params in all_params:
        for seed_idx, seed in enumerate(seeds):
            set_global_seed(seed)
            model = train_bc_agent_from_hh_data(agent_name="bc_train_seed{}".format(seed_idx), model='train', **params)
            plot_bc_run(model.bc_info, params['num_epochs'])
            model = train_bc_agent_from_hh_data(agent_name="bc_test_seed{}".format(seed_idx), model='test', **params)
            plot_bc_run(model.bc_info, params['num_epochs'])
            reset_tf()
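
# Hedged usage sketch for train_bc_models. The param keys below are assumptions
# about what train_bc_agent_from_hh_data accepts (only 'num_epochs' is confirmed
# by the plot_bc_run call above); the seeds are arbitrary examples.
def example_train_bc_models():
    seeds = [5415, 2652]  # arbitrary example seeds
    all_params = [{"layout_name": "simple", "num_epochs": 120}]  # 'layout_name' is an assumed key
    train_bc_models(all_params, seeds)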
def ppo_run(params):
    create_dir_if_not_exists(params["SAVE_DIR"])
    save_pickle(params, params["SAVE_DIR"] + "config")

    #############
    # PPO SETUP #
    #############

    train_infos = []

    for seed in params["SEEDS"]:
        reset_tf()
        set_global_seed(seed)

        curr_seed_dir = params["SAVE_DIR"] + "seed" + str(seed) + "/"
        create_dir_if_not_exists(curr_seed_dir)
        save_pickle(params, curr_seed_dir + "config")

        print("Creating env with params", params)
        # Configure mdp
        mdp = OvercookedGridworld.from_layout_name(**params["mdp_params"])
        env = OvercookedEnv(mdp, **params["env_params"])
        mlp = MediumLevelPlanner.from_pickle_or_compute(mdp, NO_COUNTERS_PARAMS, force_compute=True)

        # Configure gym env
        gym_env = get_vectorized_gym_env(
            env, 'Overcooked-v0', featurize_fn=lambda x: mdp.lossless_state_encoding(x), **params
        )
        gym_env.self_play_randomization = 0 if params["SELF_PLAY_HORIZON"] is None else 1
        gym_env.trajectory_sp = params["TRAJECTORY_SELF_PLAY"]
        gym_env.update_reward_shaping_param(1 if params["mdp_params"]["rew_shaping_params"] != 0 else 0)

        configure_other_agent(params, gym_env, mlp, mdp)

        # Create model
        with tf.device('/device:GPU:{}'.format(params["GPU_ID"])):
            model = create_model(gym_env, "ppo_agent", **params)

        # Train model
        params["CURR_SEED"] = seed
        train_info = update_model(gym_env, model, **params)

        # Save model
        save_ppo_model(model, curr_seed_dir + model.agent_name)
        print("Saved training info at", curr_seed_dir + "training_info")
        save_pickle(train_info, curr_seed_dir + "training_info")
        train_infos.append(train_info)

    return train_infos
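
# Sketch of reading back what ppo_run saves: the per-seed config and
# training-info pickles live under SAVE_DIR/seed<seed>/ per the paths written
# above. The helper name is ours, not part of the repo's API.
def load_ppo_run_results(save_dir, seed):
    curr_seed_dir = save_dir + "seed" + str(seed) + "/"
    config = load_pickle(curr_seed_dir + "config")
    train_info = load_pickle(curr_seed_dir + "training_info")
    return config, train_info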
def test_running_ppo_sp(self):
    reset_tf()
    run = ex_ppo.run(config_updates={'LOCAL_TESTING': True, 'layout_name': 'simple', 'OTHER_AGENT_TYPE': 'sp'})

    # Just making sure seeding is working correctly and not changing actual outputs
    train_info = run.result[0]

    # Uncomment to make the current output the standard output to check against
    # save_pickle(train_info, 'data/testing/ppo_sp_train_info')
    expected_sp_dict = load_pickle('data/testing/ppo_sp_train_info')
    for k, v in train_info.items():
        for found_item, expected_item in zip(v, expected_sp_dict[k]):
            self.assertAlmostEqual(found_item, expected_item, places=5)
def evaluate_layout_loss_for_ppo_models(ppo_path, layout_name, trajs, eps, seeds):
    layout_losses = defaultdict(dict)
    for seed in seeds:
        reset_tf()
        agent_ppo, bc_params = get_ppo_agent(ppo_path, seed, best=True)
        agent_ppo.action_probs = True
        agent_ppo.set_mdp(OvercookedGridworld.from_layout_name(**bc_params["mdp_params"]))

        losses, accuracies = get_trajs_losses_for_model(trajs, agent_ppo, eps)
        layout_losses["{}_seed{}".format(layout_name, seed)]['losses'] = losses
        layout_losses["{}_seed{}".format(layout_name, seed)]['accuracies'] = accuracies
    return layout_losses
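
# Small sketch that aggregates either evaluator's output across seeds. It
# relies only on the dict structure built above ("{layout}_seed{seed}" ->
# lists of per-trajectory losses/accuracies); the helper name is ours.
def summarize_layout_losses(layout_losses):
    return {run_name: {"mean_loss": np.mean(metrics['losses']),
                       "mean_accuracy": np.mean(metrics['accuracies'])}
            for run_name, metrics in layout_losses.items()}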
def run_all_ppo_hm_experiments(best_bc_model_paths):
    reset_tf()

    seeds = [8355, 5748, 1352, 3325, 8611]

    ppo_hm_model_paths = {
        "simple": "ppo_hm_simple",
        "unident_s": "ppo_hm_unident_s",
        "random1": "ppo_hm_random1",
        "random3": "ppo_hm_random3"
    }

    plot_ppo_hm_training_curves(ppo_hm_model_paths, seeds)

    set_global_seed(124)
    num_rounds = 50
    ppo_hm_performance = evaluate_all_ppo_hm_models(ppo_hm_model_paths, best_bc_model_paths['test'], num_rounds, seeds, best=True)
    save_pickle(ppo_hm_performance, PPO_DATA_DIR + "ppo_hm_models_performance")
def run_all_ppo_bc_experiments(best_bc_model_paths):
    reset_tf()

    seeds = {
        "bc_train": [9456, 1887, 5578, 5987, 516],
        "bc_test": [2888, 7424, 7360, 4467, 184]
    }

    ppo_bc_model_paths = {
        'bc_train': {
            "simple": "ppo_bc_train_simple",
            "unident_s": "ppo_bc_train_unident_s",
            "random1": "ppo_bc_train_random1",
            "random0": "ppo_bc_train_random0",
            "random3": "ppo_bc_train_random3"
        },
        'bc_test': {
            "simple": "ppo_bc_test_simple",
            "unident_s": "ppo_bc_test_unident_s",
            "random1": "ppo_bc_test_random1",
            "random0": "ppo_bc_test_random0",
            "random3": "ppo_bc_test_random3"
        }
    }

    plot_runs_training_curves(ppo_bc_model_paths, seeds, save=True)

    set_global_seed(248)
    num_rounds = 100
    ppo_bc_performance = evaluate_all_ppo_bc_models(ppo_bc_model_paths, best_bc_model_paths, num_rounds, seeds, best=True)
    ppo_bc_performance = prepare_nested_default_dict_for_pickle(ppo_bc_performance)
    save_pickle(ppo_bc_performance, PPO_DATA_DIR + "ppo_bc_models_performance")
def run_all_ppo_sp_experiments(best_bc_model_paths):
    reset_tf()

    seeds = [2229, 7649, 7225, 9807, 386]

    ppo_sp_model_paths = {
        "simple": "ppo_sp_simple",
        "unident_s": "ppo_sp_unident_s",
        "random1": "ppo_sp_random1",
        "random0": "ppo_sp_random0",
        "random3": "ppo_sp_random3"
    }

    plot_ppo_sp_training_curves(ppo_sp_model_paths, seeds, save=True)

    set_global_seed(124)
    num_rounds = 100
    ppo_sp_performance = evaluate_all_sp_ppo_models(ppo_sp_model_paths, best_bc_model_paths['test'], num_rounds, seeds, best=True)
    save_pickle(ppo_sp_performance, PPO_DATA_DIR + "ppo_sp_models_performance")
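
# Sketch of reloading the evaluation results saved by the three run_all_*
# functions above (assumes those pickles already exist under PPO_DATA_DIR):
def load_all_ppo_performance():
    return {
        "ppo_sp": load_pickle(PPO_DATA_DIR + "ppo_sp_models_performance"),
        "ppo_bc": load_pickle(PPO_DATA_DIR + "ppo_bc_models_performance"),
        "ppo_hm": load_pickle(PPO_DATA_DIR + "ppo_hm_models_performance"),
    }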
def setUp(self):
    reset_tf()
    set_global_seed(0)
def pbt_one_run(params, seed):
    # Iterating noptepochs times over the same batch data, shuffled differently
    # each time: each batch is divided into `nminibatches`, with one gradient
    # step per minibatch
    create_dir_if_not_exists(params["SAVE_DIR"])
    save_dict_to_file(params, params["SAVE_DIR"] + "config")

    #######
    # pbt #
    #######

    mdp = OvercookedGridworld.from_layout_name(**params["mdp_params"])
    overcooked_env = OvercookedEnv(mdp, **params["env_params"])

    print("Sample training environments:")
    for _ in range(5):
        overcooked_env.reset()
        print(overcooked_env)

    gym_env = get_vectorized_gym_env(
        overcooked_env, 'Overcooked-v0', featurize_fn=lambda x: mdp.lossless_state_encoding(x), **params)
    gym_env.update_reward_shaping_param(1.0)  # Start reward shaping from 1
    annealer = LinearAnnealer(horizon=params["REW_SHAPING_HORIZON"])

    # ppo_expert_model = load_model("data/expert_agent/", "agent0", actual_agent_name="agent0")
    # pbt_expert_model = load_model("data/expert_agent/", "agent2", actual_agent_name="agent2")

    # AGENT POPULATION INITIALIZATION
    population_size = params["POPULATION_SIZE"]
    pbt_population = []
    pbt_agent_names = ['agent' + str(i) for i in range(population_size)]
    for agent_name in pbt_agent_names:
        agent = PBTAgent(agent_name, params, gym_env=gym_env)
        # overwrite_model(ppo_expert_model, model)
        pbt_population.append(agent)

    print("Initialized agent models")

    all_pairs = []
    for i in range(population_size):
        for j in range(i + 1, population_size):
            all_pairs.append((i, j))

    # MAIN LOOP
    def pbt_training():
        best_sparse_rew_avg = [-np.Inf] * population_size

        print(params['NUM_PBT_ITER'])
        for pbt_iter in range(1, params["NUM_PBT_ITER"] + 1):
            print("\n\n\nPBT ITERATION NUM {}".format(pbt_iter))

            # TRAINING PHASE
            assert params["ITER_PER_SELECTION"] == population_size**2
            pairs_to_train = list(itertools.product(range(population_size), range(population_size)))

            for sel_iter in range(params["ITER_PER_SELECTION"]):
                # Randomly select agents to be trained
                pair_idx = np.random.choice(len(pairs_to_train))
                idx0, idx1 = pairs_to_train.pop(pair_idx)
                pbt_agent0, pbt_agent1 = pbt_population[idx0], pbt_population[idx1]

                # Training agent 1, leaving agent 0 fixed
                print("Training agent {} ({}) with agent {} ({}) fixed (pbt #{}/{}, sel #{}/{})"
                      .format(idx1, pbt_agent1.num_ppo_runs, idx0, pbt_agent0.num_ppo_runs,
                              pbt_iter, params["NUM_PBT_ITER"], sel_iter, params["ITER_PER_SELECTION"]))

                agent_env_steps = pbt_agent1.num_ppo_runs * params["PPO_RUN_TOT_TIMESTEPS"]
                reward_shaping_param = annealer.param_value(agent_env_steps)
                print("Current reward shaping:", reward_shaping_param, "\t Save_dir", params["SAVE_DIR"])
                pbt_agent1.logs["reward_shaping"].append(reward_shaping_param)
                gym_env.update_reward_shaping_param(reward_shaping_param)

                gym_env.other_agent = pbt_agent0.get_agent()
                pbt_agent1.update(gym_env)

                save_folder = params["SAVE_DIR"] + pbt_agent1.agent_name + '/'
                pbt_agent1.save(save_folder)

                agent_pair = AgentPair(pbt_agent0.get_agent(), pbt_agent1.get_agent())
                overcooked_env.get_rollouts(agent_pair, num_games=1, final_state=True, reward_shaping=reward_shaping_param)

            assert len(pairs_to_train) == 0

            # SELECTION PHASE
            # Overwrite the worst agent with the best agent's model, according to
            # a proxy for generalization performance (avg dense reward across the population)
            print("\nSELECTION PHASE\n")

            # Dictionary with average returns for each agent when matched with each other agent
            avg_ep_returns_dict = defaultdict(list)
            avg_ep_returns_sparse_dict = defaultdict(list)

            for i, pbt_agent in enumerate(pbt_population):
                pbt_agent.update_pbt_iter_logs()

                # Saving each agent model at the end of the final pbt iteration
                if pbt_iter == params["NUM_PBT_ITER"]:
                    save_folder = params["SAVE_DIR"] + pbt_agent.agent_name + '/'
                    pbt_agent.save_predictor(save_folder + "pbt_iter{}/".format(pbt_iter))
                    pbt_agent.save(save_folder + "pbt_iter{}/".format(pbt_iter))

                for j in range(i, population_size):
                    # Pair each agent with all other agents, including itself,
                    # when assessing generalization performance
                    print("Evaluating agent {} and {}".format(i, j))
                    pbt_agent_other = pbt_population[j]

                    agent_pair = AgentPair(pbt_agent.get_agent(), pbt_agent_other.get_agent())
                    trajs = overcooked_env.get_rollouts(agent_pair, params["NUM_SELECTION_GAMES"], reward_shaping=reward_shaping_param)
                    dense_rews, sparse_rews, lens = trajs["ep_returns"], trajs["ep_returns_sparse"], trajs["ep_lengths"]
                    rew_per_step = np.sum(dense_rews) / np.sum(lens)
                    avg_ep_returns_dict[i].append(rew_per_step)
                    avg_ep_returns_sparse_dict[i].append(sparse_rews)

                    if j != i:
                        avg_ep_returns_dict[j].append(rew_per_step)
                        avg_ep_returns_sparse_dict[j].append(sparse_rews)

            print("AVG ep rewards dict", avg_ep_returns_dict)

            for i, pbt_agent in enumerate(pbt_population):
                pbt_agent.update_avg_rew_per_step_logs(avg_ep_returns_dict[i])

                avg_sparse_rew = np.mean(avg_ep_returns_sparse_dict[i])
                if avg_sparse_rew > best_sparse_rew_avg[i]:
                    best_sparse_rew_avg[i] = avg_sparse_rew
                    agent_name = pbt_agent.agent_name
                    print("New best avg sparse rews {} for agent {}, saving...".format(best_sparse_rew_avg, agent_name))
                    best_save_folder = params["SAVE_DIR"] + agent_name + '/best/'
                    delete_dir_if_exists(best_save_folder, verbose=True)
                    pbt_agent.save_predictor(best_save_folder)
                    pbt_agent.save(best_save_folder)

            # Get the best and worst agents when averaging rew per step across all pairings
            best_agent_idx = max(avg_ep_returns_dict, key=lambda key: np.mean(avg_ep_returns_dict[key]))
            worst_agent_idx = min(avg_ep_returns_dict, key=lambda key: np.mean(avg_ep_returns_dict[key]))

            # MUTATION PHASE
            pbt_population[worst_agent_idx].explore_from(pbt_population[best_agent_idx])
            print("Overwrote worst model {} ({} rew) with best model {} ({} rew)"
                  .format(worst_agent_idx, avg_ep_returns_dict[worst_agent_idx],
                          best_agent_idx, avg_ep_returns_dict[best_agent_idx]))

            best_agent = pbt_population[best_agent_idx].get_agent()
            best_agent_copy = pbt_population[best_agent_idx].get_agent()
            agent_pair = AgentPair(best_agent, best_agent_copy)
            overcooked_env.get_rollouts(agent_pair, num_games=1, final_state=True, display=True, reward_shaping=reward_shaping_param)

    pbt_training()
    reset_tf()
    print(params["SAVE_DIR"])
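
# Hedged sketch of driving pbt_one_run across multiple seeds, mirroring how
# ppo_run loops over params["SEEDS"]. The per-seed SAVE_DIR naming here is an
# illustration, not necessarily the repo's convention.
def pbt_run(params, seeds):
    base_save_dir = params["SAVE_DIR"]
    for seed in seeds:
        set_global_seed(seed)
        params["SAVE_DIR"] = base_save_dir + "seed_{}/".format(seed)
        pbt_one_run(params, seed)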
def setUp(self):
    reset_tf()