def evaluate_layout_loss_for_pbt_models(pbt_model_paths,
                                        layout_name,
                                        trajs,
                                        eps,
                                        seeds,
                                        best=True):
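    """Compute per-trajectory action-prediction losses and accuracies on the given
    trajectories for the saved PBT agents of `layout_name`, one model per seed."""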
    layout_losses = defaultdict(dict)

    pbt_save_dir = PBT_DATA_DIR + pbt_model_paths[layout_name] + "/"
    pbt_config = load_dict_from_txt(pbt_save_dir + "config")

    for seed in seeds:
        reset_tf()
        agent_pbt = get_pbt_agent_from_config(pbt_save_dir,
                                              pbt_config["sim_threads"],
                                              seed=seed,
                                              agent_idx=0,
                                              best=best)
        agent_pbt.action_probs = True
        agent_pbt.set_mdp(
            OvercookedGridworld.from_layout_name(**pbt_config["mdp_params"]))

        losses, accuracies = get_trajs_losses_for_model(trajs, agent_pbt, eps)
        layout_losses["{}_seed{}".format(layout_name, seed)]['losses'] = losses
        layout_losses["{}_seed{}".format(layout_name,
                                         seed)]['accuracies'] = accuracies
    return layout_losses
Example #2
    def test_running_ppo_bc_train(self):
        # Check model exists and has right params
        layout_name = 'simple'
        best_bc_model_paths = load_pickle(BEST_BC_MODELS_PATH)
        bc_model_path = best_bc_model_paths["train"][layout_name]

        print("LOADING BC MODEL FROM: {}".format(bc_model_path))
        _, bc_params = get_bc_agent_from_saved(bc_model_path)

        expected_bc_params = {
            'data_params': {
                'train_mdps': ['simple'],
                'ordered_trajs': True,
                'human_ai_trajs': False,
                'data_path': 'data/human/clean_train_trials.pkl'
            },
            'mdp_params': {'layout_name': 'simple', 'start_order_list': None},
            'env_params': {'horizon': 400},
            'mdp_fn_params': {}
        }
        self.assertDictEqual(expected_bc_params, bc_params)

        # Run the experiment twice with the same seed and compare the output dicts.
        # This is not done exactly as above because of the additional dependency on the human model.

        reset_tf()
        run = ex_ppo.run(config_updates={'LOCAL_TESTING': True, 'layout_name': layout_name, 'OTHER_AGENT_TYPE': 'bc_train', 'SEEDS': [10]})
        train_info0 = run.result[0]

        reset_tf()
        run = ex_ppo.run(config_updates={'LOCAL_TESTING': True, 'layout_name': layout_name, 'OTHER_AGENT_TYPE': 'bc_train', 'SEEDS': [10]})
        train_info1 = run.result[0]

        self.assertDictEqual(train_info0, train_info1)

        # Uncomment to overwrite the saved reference output that the test checks against
        # save_pickle(train_info1, 'data/testing/ppo_bc_train_info')

        expected_dict = load_pickle('data/testing/ppo_bc_train_info')
        for k, v in train_info1.items():
            for found_item, expected_item in zip(v, expected_dict[k]):
                self.assertAlmostEqual(found_item, expected_item, places=5)
Example #3
def train_bc_models(all_params, seeds):
    """Train len(seeds) num of models for each layout"""
    for params in all_params:
        for seed_idx, seed in enumerate(seeds):
            set_global_seed(seed)
            model = train_bc_agent_from_hh_data(agent_name="bc_train_seed{}".format(seed_idx), model='train', **params)
            plot_bc_run(model.bc_info, params['num_epochs'])
            model = train_bc_agent_from_hh_data(agent_name="bc_test_seed{}".format(seed_idx), model='test', **params)
            plot_bc_run(model.bc_info, params['num_epochs'])
            reset_tf()
Example #4
def ppo_run(params):
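    """Run PPO training once per seed in params["SEEDS"], saving the config,
    the trained model, and the training info under params["SAVE_DIR"]."""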

    create_dir_if_not_exists(params["SAVE_DIR"])
    save_pickle(params, params["SAVE_DIR"] + "config")

    #############
    # PPO SETUP #
    #############

    train_infos = []

    for seed in params["SEEDS"]:
        reset_tf()
        set_global_seed(seed)

        curr_seed_dir = params["SAVE_DIR"] + "seed" + str(seed) + "/"
        create_dir_if_not_exists(curr_seed_dir)

        save_pickle(params, curr_seed_dir + "config")

        print("Creating env with params", params)
        # Configure mdp
        
        mdp = OvercookedGridworld.from_layout_name(**params["mdp_params"])
        env = OvercookedEnv(mdp, **params["env_params"])
        mlp = MediumLevelPlanner.from_pickle_or_compute(mdp, NO_COUNTERS_PARAMS, force_compute=True) 

        # Configure gym env
        gym_env = get_vectorized_gym_env(
            env, 'Overcooked-v0', featurize_fn=lambda x: mdp.lossless_state_encoding(x), **params
        )
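        # Self-play randomization is only enabled when a self-play horizon is specified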
        gym_env.self_play_randomization = 0 if params["SELF_PLAY_HORIZON"] is None else 1
        gym_env.trajectory_sp = params["TRAJECTORY_SELF_PLAY"]
        gym_env.update_reward_shaping_param(1 if params["mdp_params"]["rew_shaping_params"] != 0 else 0)

        configure_other_agent(params, gym_env, mlp, mdp)

        # Create model
        with tf.device('/device:GPU:{}'.format(params["GPU_ID"])):
            model = create_model(gym_env, "ppo_agent", **params)

        # Train model
        params["CURR_SEED"] = seed
        train_info = update_model(gym_env, model, **params)
        
        # Save model
        save_ppo_model(model, curr_seed_dir + model.agent_name)
        print("Saved training info at", curr_seed_dir + "training_info")
        save_pickle(train_info, curr_seed_dir + "training_info")
        train_infos.append(train_info)
    
    return train_infos
Example #5
    def test_running_ppo_sp(self):
        reset_tf()

        run = ex_ppo.run(config_updates={'LOCAL_TESTING': True, 'layout_name': 'simple', 'OTHER_AGENT_TYPE': 'sp'})
        # Just making sure seeding is working correctly and not changing actual outputs
        train_info = run.result[0]

        # Uncomment to overwrite the saved reference output that the test checks against
        # save_pickle(train_info, 'data/testing/ppo_sp_train_info')

        expected_sp_dict = load_pickle('data/testing/ppo_sp_train_info')
        for k, v in train_info.items():
            for found_item, expected_item in zip(v, expected_sp_dict[k]):
                self.assertAlmostEqual(found_item, expected_item, places=5)
def evaluate_layout_loss_for_ppo_models(ppo_path, layout_name, trajs, eps,
                                        seeds):
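    """Compute per-trajectory action-prediction losses and accuracies on the given
    trajectories for the saved PPO model at `ppo_path`, one model per seed."""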
    layout_losses = defaultdict(dict)
    for seed in seeds:
        reset_tf()
        agent_ppo, bc_params = get_ppo_agent(ppo_path, seed, best=True)
        agent_ppo.action_probs = True
        agent_ppo.set_mdp(
            OvercookedGridworld.from_layout_name(**bc_params["mdp_params"]))

        losses, accuracies = get_trajs_losses_for_model(trajs, agent_ppo, eps)
        layout_losses["{}_seed{}".format(layout_name, seed)]['losses'] = losses
        layout_losses["{}_seed{}".format(layout_name,
                                         seed)]['accuracies'] = accuracies
    return layout_losses
Example #7
def run_all_ppo_hm_experiments(best_bc_model_paths):
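    """Plot training curves for the saved PPO_HM runs and evaluate them using the
    best BC test model paths, pickling the results to PPO_DATA_DIR."""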
    reset_tf()

    seeds = [8355, 5748, 1352, 3325, 8611]

    ppo_hm_model_paths = {
        "simple": "ppo_hm_simple",
        "unident_s": "ppo_hm_unident_s",
        "random1": "ppo_hm_random1",
        "random3": "ppo_hm_random3"
    }

    plot_ppo_hm_training_curves(ppo_hm_model_paths, seeds)

    set_global_seed(124)
    num_rounds = 50
    ppo_hm_performance = evaluate_all_ppo_hm_models(
        ppo_hm_model_paths,
        best_bc_model_paths['test'],
        num_rounds,
        seeds,
        best=True)
    save_pickle(ppo_hm_performance, PPO_DATA_DIR + "ppo_hm_models_performance")
Example #8
def run_all_ppo_bc_experiments(best_bc_model_paths):
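    """Plot training curves for the saved PPO_BC runs and evaluate them using the
    best BC model paths, pickling the results to PPO_DATA_DIR."""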
    reset_tf()

    seeds = {
        "bc_train": [9456, 1887, 5578, 5987, 516],
        "bc_test": [2888, 7424, 7360, 4467, 184]
    }

    ppo_bc_model_paths = {
        'bc_train': {
            "simple": "ppo_bc_train_simple",
            "unident_s": "ppo_bc_train_unident_s",
            "random1": "ppo_bc_train_random1",
            "random0": "ppo_bc_train_random0",
            "random3": "ppo_bc_train_random3"
        },
        'bc_test': {
            "simple": "ppo_bc_test_simple",
            "unident_s": "ppo_bc_test_unident_s",
            "random1": "ppo_bc_test_random1",
            "random0": "ppo_bc_test_random0",
            "random3": "ppo_bc_test_random3"
        }
    }

    plot_runs_training_curves(ppo_bc_model_paths, seeds, save=True)

    set_global_seed(248)
    num_rounds = 100
    ppo_bc_performance = evaluate_all_ppo_bc_models(ppo_bc_model_paths,
                                                    best_bc_model_paths,
                                                    num_rounds,
                                                    seeds,
                                                    best=True)
    ppo_bc_performance = prepare_nested_default_dict_for_pickle(
        ppo_bc_performance)
    save_pickle(ppo_bc_performance, PPO_DATA_DIR + "ppo_bc_models_performance")
Example #9
def run_all_ppo_sp_experiments(best_bc_model_paths):
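    """Plot training curves for the saved self-play PPO runs and evaluate them using
    the best BC test model paths, pickling the results to PPO_DATA_DIR."""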
    reset_tf()

    seeds = [2229, 7649, 7225, 9807, 386]

    ppo_sp_model_paths = {
        "simple": "ppo_sp_simple",
        "unident_s": "ppo_sp_unident_s",
        "random1": "ppo_sp_random1",
        "random0": "ppo_sp_random0",
        "random3": "ppo_sp_random3"
    }

    plot_ppo_sp_training_curves(ppo_sp_model_paths, seeds, save=True)

    set_global_seed(124)
    num_rounds = 100
    ppo_sp_performance = evaluate_all_sp_ppo_models(
        ppo_sp_model_paths,
        best_bc_model_paths['test'],
        num_rounds,
        seeds,
        best=True)
    save_pickle(ppo_sp_performance, PPO_DATA_DIR + "ppo_sp_models_performance")
    def setUp(self):
        reset_tf()
        set_global_seed(0)
Example #11
def pbt_one_run(params, seed):
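    """Run one Population Based Training experiment: train a population of PPO agents
    against each other, and periodically overwrite the worst agent with a mutated
    copy of the best one."""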
    # PPO iterates `noptepochs` times over the same batch of data (shuffled differently
    # each time), dividing each batch into `nminibatches` and taking a gradient step per minibatch
    create_dir_if_not_exists(params["SAVE_DIR"])
    save_dict_to_file(params, params["SAVE_DIR"] + "config")

    #######
    # pbt #
    #######

    mdp = OvercookedGridworld.from_layout_name(**params["mdp_params"])
    overcooked_env = OvercookedEnv(mdp, **params["env_params"])

    print("Sample training environments:")
    for _ in range(5):
        overcooked_env.reset()
        print(overcooked_env)

    gym_env = get_vectorized_gym_env(
        overcooked_env,
        'Overcooked-v0',
        featurize_fn=lambda x: mdp.lossless_state_encoding(x),
        **params)
    gym_env.update_reward_shaping_param(1.0)  # Start reward shaping from 1

    annealer = LinearAnnealer(horizon=params["REW_SHAPING_HORIZON"])

    # ppo_expert_model = load_model("data/expert_agent/", "agent0", actual_agent_name="agent0")
    # pbt_expert_model = load_model("data/expert_agent/", "agent2", actual_agent_name="agent2")

    # AGENT POPULATION INITIALIZATION
    population_size = params["POPULATION_SIZE"]
    pbt_population = []
    pbt_agent_names = ['agent' + str(i) for i in range(population_size)]
    for agent_name in pbt_agent_names:
        agent = PBTAgent(agent_name, params, gym_env=gym_env)

        # overwrite_model(ppo_expert_model, model)

        pbt_population.append(agent)

    print("Initialized agent models")

    all_pairs = []
    for i in range(population_size):
        for j in range(i + 1, population_size):
            all_pairs.append((i, j))

    # MAIN LOOP

    def pbt_training():
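        # Each PBT iteration: (1) train all ordered agent pairs, (2) evaluate the
        # population and checkpoint each agent's best model, (3) overwrite the worst
        # agent with an exploration (mutation) of the best agent.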
        best_sparse_rew_avg = [-np.Inf] * population_size
        print(params['NUM_PBT_ITER'])
        for pbt_iter in range(1, params["NUM_PBT_ITER"] + 1):
            print("\n\n\nPBT ITERATION NUM {}".format(pbt_iter))

            # TRAINING PHASE
            assert params["ITER_PER_SELECTION"] == population_size**2
            pairs_to_train = list(
                itertools.product(range(population_size),
                                  range(population_size)))

            for sel_iter in range(params["ITER_PER_SELECTION"]):
                # Randomly select agents to be trained
                pair_idx = np.random.choice(len(pairs_to_train))
                idx0, idx1 = pairs_to_train.pop(pair_idx)
                pbt_agent0, pbt_agent1 = pbt_population[idx0], pbt_population[
                    idx1]

                # Training agent 1, leaving agent 0 fixed
                print(
                    "Training agent {} ({}) with agent {} ({}) fixed (pbt #{}/{}, sel #{}/{})"
                    .format(idx1, pbt_agent1.num_ppo_runs, idx0,
                            pbt_agent0.num_ppo_runs, pbt_iter,
                            params["NUM_PBT_ITER"], sel_iter,
                            params["ITER_PER_SELECTION"]))

                agent_env_steps = pbt_agent1.num_ppo_runs * params[
                    "PPO_RUN_TOT_TIMESTEPS"]
                reward_shaping_param = annealer.param_value(agent_env_steps)
                print("Current reward shaping:", reward_shaping_param,
                      "\t Save_dir", params["SAVE_DIR"])
                pbt_agent1.logs["reward_shaping"].append(reward_shaping_param)
                gym_env.update_reward_shaping_param(reward_shaping_param)

                gym_env.other_agent = pbt_agent0.get_agent()
                pbt_agent1.update(gym_env)

                save_folder = params["SAVE_DIR"] + pbt_agent1.agent_name + '/'
                pbt_agent1.save(save_folder)

                agent_pair = AgentPair(pbt_agent0.get_agent(),
                                       pbt_agent1.get_agent())
                overcooked_env.get_rollouts(
                    agent_pair,
                    num_games=1,
                    final_state=True,
                    reward_shaping=reward_shaping_param)

            assert len(pairs_to_train) == 0

            # SELECTION PHASE
            # Overwrite the worst agent with the best agent, according to a proxy for
            # generalization performance (average dense reward across the population)
            print("\nSELECTION PHASE\n")

            # Dictionary with average returns for each agent when matched with each other agent
            avg_ep_returns_dict = defaultdict(list)
            avg_ep_returns_sparse_dict = defaultdict(list)

            for i, pbt_agent in enumerate(pbt_population):
                # Saving each agent model at the end of the pbt iteration
                pbt_agent.update_pbt_iter_logs()

                if pbt_iter == params["NUM_PBT_ITER"]:
                    save_folder = params[
                        "SAVE_DIR"] + pbt_agent.agent_name + '/'
                    pbt_agent.save_predictor(save_folder +
                                             "pbt_iter{}/".format(pbt_iter))
                    pbt_agent.save(save_folder +
                                   "pbt_iter{}/".format(pbt_iter))

                for j in range(i, population_size):
                    # Pair each agent with every other agent (including itself) to assess generalization performance
                    print("Evaluating agent {} and {}".format(i, j))
                    pbt_agent_other = pbt_population[j]

                    agent_pair = AgentPair(pbt_agent.get_agent(),
                                           pbt_agent_other.get_agent())
                    trajs = overcooked_env.get_rollouts(
                        agent_pair,
                        params["NUM_SELECTION_GAMES"],
                        reward_shaping=reward_shaping_param)
                    dense_rews, sparse_rews, lens = trajs["ep_returns"], trajs[
                        "ep_returns_sparse"], trajs["ep_lengths"]
                    rew_per_step = np.sum(dense_rews) / np.sum(lens)
                    avg_ep_returns_dict[i].append(rew_per_step)
                    avg_ep_returns_sparse_dict[i].append(sparse_rews)
                    if j != i:
                        avg_ep_returns_dict[j].append(rew_per_step)
                        avg_ep_returns_sparse_dict[j].append(sparse_rews)

            print("AVG ep rewards dict", avg_ep_returns_dict)

            for i, pbt_agent in enumerate(pbt_population):
                pbt_agent.update_avg_rew_per_step_logs(avg_ep_returns_dict[i])

                avg_sparse_rew = np.mean(avg_ep_returns_sparse_dict[i])
                if avg_sparse_rew > best_sparse_rew_avg[i]:
                    best_sparse_rew_avg[i] = avg_sparse_rew
                    agent_name = pbt_agent.agent_name
                    print(
                        "New best avg sparse rews {} for agent {}, saving...".
                        format(best_sparse_rew_avg, agent_name))
                    best_save_folder = params[
                        "SAVE_DIR"] + agent_name + '/best/'
                    delete_dir_if_exists(best_save_folder, verbose=True)
                    pbt_agent.save_predictor(best_save_folder)
                    pbt_agent.save(best_save_folder)

            # Get the best and worst agents by average reward per step across all pairings
            best_agent_idx = max(
                avg_ep_returns_dict,
                key=lambda key: np.mean(avg_ep_returns_dict[key]))
            worst_agent_idx = min(
                avg_ep_returns_dict,
                key=lambda key: np.mean(avg_ep_returns_dict[key]))

            # MUTATION PHASE

            pbt_population[worst_agent_idx].explore_from(
                pbt_population[best_agent_idx])
            print(
                "Overwrote worst model {} ({} rew) with best model {} ({} rew)"
                .format(worst_agent_idx, avg_ep_returns_dict[worst_agent_idx],
                        best_agent_idx, avg_ep_returns_dict[best_agent_idx]))

            best_agent = pbt_population[best_agent_idx].get_agent()
            best_agent_copy = pbt_population[best_agent_idx].get_agent()
            agent_pair = AgentPair(best_agent, best_agent_copy)
            overcooked_env.get_rollouts(agent_pair,
                                        num_games=1,
                                        final_state=True,
                                        display=True,
                                        reward_shaping=reward_shaping_param)

    pbt_training()
    reset_tf()
    print(params["SAVE_DIR"])
Example #12
    def setUp(self):
        reset_tf()