def get_agent_pair_trajs(self,
                          a0,
                          a1=None,
                          num_games=100,
                          game_length=None,
                          start_state_fn=None,
                          display=False,
                          info=True):
     """Evaluate agent pair on both indices, and return trajectories by index"""
     if a1 is None:
         ap = AgentPair(a0, a0, allow_duplicate_agents=True)
         trajs_0 = trajs_1 = self.evaluate_agent_pair(
             ap,
             num_games=num_games,
             game_length=game_length,
             start_state_fn=start_state_fn,
             display=display,
             info=info)
     else:
         trajs_0 = self.evaluate_agent_pair(AgentPair(a0, a1),
                                            num_games=num_games,
                                            game_length=game_length,
                                            start_state_fn=start_state_fn,
                                            display=display,
                                            info=info)
         trajs_1 = self.evaluate_agent_pair(AgentPair(a1, a0),
                                            num_games=num_games,
                                            game_length=game_length,
                                            start_state_fn=start_state_fn,
                                            display=display,
                                            info=info)
     return trajs_0, trajs_1
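
A minimal usage sketch (not from the source): it assumes get_agent_pair_trajs is a method of AgentEvaluator, as its call to evaluate_agent_pair suggests, and that the import paths below match the installed overcooked_ai_py version.

# Hypothetical usage sketch; the import paths are assumptions and may differ
# between overcooked_ai_py versions.
from overcooked_ai_py.agents.agent import RandomAgent
from overcooked_ai_py.agents.benchmarking import AgentEvaluator

# Construct the evaluator the same way the EmbeddedPlanningAgent example below does.
ae = AgentEvaluator({"layout_name": "cramped_room"}, {"horizon": 100})

# Evaluate a random pair at both player indices and inspect the returns.
trajs_0, trajs_1 = ae.get_agent_pair_trajs(RandomAgent(all_actions=True),
                                           RandomAgent(all_actions=True),
                                           num_games=2)
print(trajs_0["ep_returns"], trajs_1["ep_returns"])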
Example #2
def evaluate_ppo_hm_and_bc(layout,
                           ppo_hm_path,
                           bc_test_path,
                           num_rounds,
                           seeds,
                           best=False,
                           display=False):
    ppo_hm_performance = defaultdict(lambda: defaultdict(list))

    agent_bc_test, bc_params = get_bc_agent_from_saved(bc_test_path)
    del bc_params["data_params"]
    del bc_params["mdp_fn_params"]
    evaluator = AgentEvaluator(**bc_params)

    for seed in seeds:
        agent_ppo, _ = get_ppo_agent(ppo_hm_path, seed, best=best)

        ppo_and_bc = evaluator.evaluate_agent_pair(AgentPair(
            agent_ppo, agent_bc_test),
                                                   num_games=num_rounds,
                                                   display=display)
        avg_ppo_and_bc = np.mean(ppo_and_bc['ep_returns'])
        ppo_hm_performance[layout]["PPO_HM+BC_test_0"].append(avg_ppo_and_bc)

        bc_and_ppo = evaluator.evaluate_agent_pair(AgentPair(
            agent_bc_test, agent_ppo),
                                                   num_games=num_rounds,
                                                   display=display)
        avg_bc_and_ppo = np.mean(bc_and_ppo['ep_returns'])
        ppo_hm_performance[layout]["PPO_HM+BC_test_1"].append(avg_bc_and_ppo)

    return ppo_hm_performance
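
As a hypothetical follow-up (not part of the source), the returned structure maps layout -> metric name -> list of per-seed average returns, so it can be reduced to per-metric means with a small helper; numpy is assumed to be imported as np, as in the snippet above.

# Hypothetical consumer of the returned layout -> metric -> per-seed averages dict.
def summarize_performance(performance):
    return {
        layout: {metric: np.mean(values) for metric, values in metrics.items()}
        for layout, metrics in performance.items()
    }

# e.g. summarize_performance(ppo_hm_performance)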
Example #3
    def test_mdp_serialization(self):
        # Where to store serialized states -- will be overwritten each timestep
        dummy_path = os.path.join(TESTING_DATA_DIR, 'test_mdp_serialization',
                                  'dummy.json')

        # Get starting seed and random agent pair
        seed = 47
        random_pair = AgentPair(RandomAgent(all_actions=True),
                                RandomAgent(all_actions=True))

        # Run rollouts with different seeds until sparse reward is achieved
        sparse_reward = 0
        while sparse_reward <= 0:
            np.random.seed(seed)
            state = self.base_mdp.get_standard_start_state()
            for _ in range(1500):
                # Ensure serialization and deserialization are inverses
                reconstructed_state = OvercookedState.from_dict(
                    load_from_json(save_as_json(state.to_dict(), dummy_path)))
                self.assertEqual(
                    state, reconstructed_state,
                    "\nState: \t\t\t{}\nReconstructed State: \t{}".format(
                        state, reconstructed_state))

                # Advance state
                joint_action, _ = zip(*random_pair.joint_action(state))
                state, infos = self.base_mdp.get_state_transition(
                    state, joint_action)
                sparse_reward += sum(infos['sparse_reward_by_agent'])
            seed += 1
Example #4
    def test_scenario_1_s(self):
        # Smaller version of the corridor collisions scenario above
        # to facilitate DRL training
        scenario_1_mdp = OvercookedGridworld.from_layout_name(
            'scenario1_s', start_order_list=['any'], cook_time=5)
        mlp = MediumLevelPlanner.from_pickle_or_compute(
            scenario_1_mdp, NO_COUNTERS_PARAMS, force_compute=force_compute)
        a0 = GreedyHumanModel(mlp)
        a1 = CoupledPlanningAgent(mlp)
        agent_pair = AgentPair(a0, a1)
        start_state = OvercookedState(
            [P((2, 1), s, Obj('onion', (2, 1))),
             P((4, 2), s)], {},
            order_list=['onion'])
        env = OvercookedEnv.from_mdp(scenario_1_mdp,
                                     start_state_fn=lambda: start_state)
        trajectory, time_taken_hr, _, _ = env.run_agents(
            agent_pair, include_final_state=True, display=DISPLAY)
        env.reset()

        print("\n" * 5)
        print("-" * 50)

        a0 = CoupledPlanningAgent(mlp)
        a1 = CoupledPlanningAgent(mlp)
        agent_pair = AgentPair(a0, a1)
        trajectory, time_taken_rr, _, _ = env.run_agents(
            agent_pair, include_final_state=True, display=DISPLAY)

        print("H+R time taken: ", time_taken_hr)
        print("R+R time taken: ", time_taken_rr)
        self.assertGreater(time_taken_hr, time_taken_rr)
Example #5
def evaluate_bc_models(bc_model_paths, num_rounds):
    """
    Evaluate BC models passed in over `num_rounds` rounds
    """
    best_bc_models_performance = {}

    # Evaluate best
    for layout_name in bc_model_paths['train'].keys():
        print(layout_name)
        best_bc_models_performance[layout_name] = {}
        
        eval_trajs = eval_with_benchmarking_from_saved(num_rounds, bc_model_paths['train'][layout_name])
        best_bc_models_performance[layout_name]["BC_train+BC_train"] = mean_and_std_err(eval_trajs['ep_returns'])
        
        eval_trajs = eval_with_benchmarking_from_saved(num_rounds, bc_model_paths['test'][layout_name])
        best_bc_models_performance[layout_name]["BC_test+BC_test"] = mean_and_std_err(eval_trajs['ep_returns'])

        bc_train, bc_params_train = get_bc_agent_from_saved(bc_model_paths['train'][layout_name])
        bc_test, bc_params_test = get_bc_agent_from_saved(bc_model_paths['test'][layout_name])
        del bc_params_train["data_params"]
        del bc_params_test["data_params"]
        assert common_keys_equal(bc_params_train, bc_params_test)
        ae = AgentEvaluator(mdp_params=bc_params_train["mdp_params"], env_params=bc_params_train["env_params"])
        
        train_and_test = ae.evaluate_agent_pair(AgentPair(bc_train, bc_test), num_games=num_rounds)
        best_bc_models_performance[layout_name]["BC_train+BC_test_0"] = mean_and_std_err(train_and_test['ep_returns'])

        test_and_train = ae.evaluate_agent_pair(AgentPair(bc_test, bc_train), num_games=num_rounds)
        best_bc_models_performance[layout_name]["BC_train+BC_test_1"] = mean_and_std_err(test_and_train['ep_returns'])
    
    return best_bc_models_performance
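
The expected shape of bc_model_paths follows from the lookups above ('train' and 'test' sub-dicts keyed by layout name); a hypothetical example, with placeholder paths that are not real checkpoints:

# Hypothetical input for evaluate_bc_models; the paths are illustrative placeholders.
bc_model_paths = {
    "train": {"cramped_room": "path/to/bc_train/cramped_room"},
    "test":  {"cramped_room": "path/to/bc_test/cramped_room"},
}
# best_bc_models_performance = evaluate_bc_models(bc_model_paths, num_rounds=5)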
Example #6
File: ppo.py, Project: 51616/human_aware_rl
def match_ppo_with_other_agent(save_dir, other_agent, n=1, display=False):
    agent, agent_eval = get_ppo_agent(save_dir)
    ap0 = AgentPair(agent, other_agent)
    agent_eval.evaluate_agent_pair(ap0, display=display, num_games=n)

    # Switch agent indices and evaluate again
    ap1 = AgentPair(other_agent, agent)
    agent_eval.evaluate_agent_pair(ap1, display=display, num_games=n)
Example #7
 def evaluate_one_optimal_one_greedy_human(self,
                                           num_games,
                                           h_idx=0,
                                           display=True):
     h = GreedyHumanModel(self.mlp)
     r = CoupledPlanningAgent(self.mlp)
     agent_pair = AgentPair(h, r) if h_idx == 0 else AgentPair(r, h)
     return self.evaluate_agent_pair(agent_pair,
                                     num_games=num_games,
                                     display=display)
Example #8
 def get_agent_pair_trajs(self, a0, a1=None, num_games=100, display=False):
     """Evaluate agent pair on both indices, and return trajectories by index"""
     if a1 is None:
         ap = AgentPair(a0, a0, allow_duplicate_agents=True)
         trajs_0 = trajs_1 = self.evaluate_agent_pair(ap,
                                                      num_games=num_games,
                                                      display=display)
     else:
         trajs_0 = self.evaluate_agent_pair(AgentPair(a0, a1),
                                            num_games=num_games,
                                            display=display)
         trajs_1 = self.evaluate_agent_pair(AgentPair(a1, a0),
                                            num_games=num_games,
                                            display=display)
     return trajs_0, trajs_1
Example #9
 def evaluate_optimal_pair(self, display=True, delivery_horizon=2):
     a0 = CoupledPlanningAgent(self.mlp, delivery_horizon=delivery_horizon)
     a1 = CoupledPlanningAgent(self.mlp, delivery_horizon=delivery_horizon)
     a0.mlp.env = self.env
     a1.mlp.env = self.env
     agent_pair = AgentPair(a0, a1)
     return self.evaluate_agent_pair(agent_pair, display=display)
Example #10
 def evaluate_human_model_pair(self, display=True, num_games=1):
     a0 = GreedyHumanModel(self.mlp)
     a1 = GreedyHumanModel(self.mlp)
     agent_pair = AgentPair(a0, a1)
     return self.evaluate_agent_pair(agent_pair,
                                     display=display,
                                     num_games=num_games)
Example #11
    def test_slowed_down_agent(self):
        def should_stop(step_num, stop_every_n_steps):
            # currently SlowedDownAgent always stops at the 2nd step
            return not bool((step_num - 1) % stop_every_n_steps)

        horizon = 100
        # NOTE: if stop_every_n_steps is 3 this would not work because of a rounding error
        #   (ok for practical purposes, the agent would just skip a turn later, but it would fail the test below)
        for stop_every_n_steps in [2, 4]:
            slowdown_rate = 1 - 1/stop_every_n_steps
            
            agent_pair = AgentPair(
                SlowedDownAgent(RandomAgent(), slowdown_rate), 
                SlowedDownAgent(RandomAgent(), slowdown_rate)
                )
            skip_action_probs = SlowedDownAgent(RandomAgent()).skip_action[1]["action_probs"].tolist()
            env = OvercookedEnv.from_mdp(large_mdp, horizon=horizon)
            trajectory, time_taken, _, _ = env.run_agents(agent_pair, include_final_state=True, display=DISPLAY)

            for i, traj_step in enumerate(trajectory):
                (s_t, a_t, r_t, done, info) = traj_step 
                if not done:
                    agent_0_probs = info["agent_infos"][0]["action_probs"]
                    agent_1_probs = info["agent_infos"][1]["action_probs"]
                    if should_stop(i, stop_every_n_steps):
                        self.assertEqual(agent_0_probs.tolist(), skip_action_probs)
                        self.assertEqual(agent_1_probs.tolist(), skip_action_probs)
                    else:
                        self.assertNotEqual(agent_0_probs.tolist(), skip_action_probs)
                        self.assertNotEqual(agent_1_probs.tolist(), skip_action_probs)
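
To make the skipping pattern concrete, here is a standalone sketch (not from the source) of the same arithmetic used by should_stop; it only illustrates which step indices the assertions above treat as skipped.

# Step indices i for which should_stop(i, stop_every_n_steps) is True,
# i.e. (i - 1) % stop_every_n_steps == 0.
def expected_skip_steps(stop_every_n_steps, horizon=10):
    return [i for i in range(horizon) if (i - 1) % stop_every_n_steps == 0]

print(expected_skip_steps(2))  # [1, 3, 5, 7, 9]
print(expected_skip_steps(4))  # [1, 5, 9]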
Example #12
 def test_scenario_1(self):
     # Myopic corridor collision
     #
     # X X X X X O X D X X X X X
     # X   ↓Ho     X           X
     # X     X X X X X X X ↓R  X
     # X                       X
     # X S X X X X X X X X P P X
     #
     # H is on the left with an onion, further away from the tunnel entrance than R.
     # The optimal planner tells R to go first and assumes that H will wait
     # for R to pass. H, however, starts going through the tunnel
     # and they get stuck. The H plan is a bit extreme (it would probably
     # realize that it should retrace its steps at some point).
     scenario_1_mdp = OvercookedGridworld.from_layout_name(
         'small_corridor', start_order_list=['any'], cook_time=5)
     mlp = MediumLevelPlanner.from_pickle_or_compute(
         scenario_1_mdp, NO_COUNTERS_PARAMS, force_compute=force_compute)
     a0 = GreedyHumanModel(mlp)
     a1 = CoupledPlanningAgent(mlp)
     agent_pair = AgentPair(a0, a1)
     start_state = OvercookedState(
         [P((2, 1), s, Obj('onion', (2, 1))),
          P((10, 2), s)], {},
         order_list=['onion'])
     env = OvercookedEnv.from_mdp(scenario_1_mdp,
                                  start_state_fn=lambda: start_state)
     env.run_agents(agent_pair, include_final_state=True, display=DISPLAY)
Example #13
 def evaluate_one_optimal_one_random(self, num_games, display=True):
     a0 = CoupledPlanningAgent(self.mlp)
     a1 = RandomAgent()
     agent_pair = AgentPair(a0, a1)
     return self.evaluate_agent_pair(agent_pair,
                                     num_games=num_games,
                                     display=display)
Example #14
 def test_rollouts(self):
     ap = AgentPair(RandomAgent(), RandomAgent())
     trajs = self.agent_eval.evaluate_agent_pair(ap, num_games=5)
     try:
         AgentEvaluator.check_trajectories(trajs)
     except AssertionError as e:
         self.fail("Trajectories were not returned in standard format:\n{}".format(e))
Example #15
 def setUp(self):
     self.base_mdp = OvercookedGridworld.from_layout_name("cramped_room")
     self.mlp = MediumLevelPlanner.from_pickle_or_compute(
         self.base_mdp, NO_COUNTERS_PARAMS, force_compute=True)
     self.env = OvercookedEnv(self.base_mdp, **DEFAULT_ENV_PARAMS)
     self.rnd_agent_pair = AgentPair(GreedyHumanModel(self.mlp),
                                     GreedyHumanModel(self.mlp))
     np.random.seed(0)
Example #16
 def evaluate_random_pair(self,
                          num_games=1,
                          all_actions=True,
                          display=False):
     agent_pair = AgentPair(RandomAgent(all_actions=all_actions),
                            RandomAgent(all_actions=all_actions))
     return self.evaluate_agent_pair(agent_pair,
                                     num_games=num_games,
                                     display=display)
Example #17
def eval_with_benchmarking_from_model(n_games, model, bc_params, no_waits, display=False):
    bc_params = copy.deepcopy(bc_params)
    a0 = get_bc_agent_from_model(model, bc_params, no_waits)
    a1 = get_bc_agent_from_model(model, bc_params, no_waits)
    del bc_params["data_params"], bc_params["mdp_fn_params"]
    a_eval = AgentEvaluator(**bc_params)
    ap = AgentPair(a0, a1)
    trajectories = a_eval.evaluate_agent_pair(ap, num_games=n_games, display=display)
    return trajectories
Example #18
 def test_fixed_plan_agents(self):
     a0 = FixedPlanAgent([s, e, n, w])
     a1 = FixedPlanAgent([s, w, n, e])
     agent_pair = AgentPair(a0, a1)
     env = OvercookedEnv.from_mdp(large_mdp, horizon=10)
     trajectory, time_taken, _, _ = env.run_agents(agent_pair, include_final_state=True, display=DISPLAY)
     end_state = trajectory[-1][0]
     self.assertEqual(time_taken, 10)
     self.assertEqual(env.mdp.get_standard_start_state().player_positions, end_state.player_positions)
Example #19
 def test_one_coupled_one_fixed(self):
     a0 = CoupledPlanningAgent(self.mlp_large)
     a1 = FixedPlanAgent([s, e, n, w])
     agent_pair = AgentPair(a0, a1)
     env = OvercookedEnv.from_mdp(large_mdp, horizon=10)
     trajectory, time_taken, _, _ = env.run_agents(agent_pair,
                                                   include_final_state=True,
                                                   display=DISPLAY)
     self.assertEqual(time_taken, 10)
Example #20
    def test_agents_on_open_map(self):
        scenario_2_mdp = OvercookedGridworld.from_layout_name('scenario2')
        mlam = MediumLevelActionManager.from_pickle_or_compute(scenario_2_mdp, NO_COUNTERS_PARAMS, force_compute=force_compute)
        agent_pairs = [
            AgentPair(GreedyHumanModel(mlam), GreedyHumanModel(mlam)),
            AgentPair(SimpleGreedyHumanModel(mlam), SimpleGreedyHumanModel(mlam)),
            AgentPair(RandomAgent(all_actions=True), RandomAgent(all_actions=True)),
            AgentPair(RandomAgent(all_actions=False), RandomAgent(all_actions=False))
        ]

        start_state = OvercookedState(
            [P((8, 1), s),
             P((1, 1), s)],
            {},
            all_orders=scenario_2_mdp.start_all_orders
        )
        for agent_pair in agent_pairs:
            env = OvercookedEnv.from_mdp(scenario_2_mdp, start_state_fn=lambda: start_state, horizon=100)
            trajectory, time_taken, _, _ = env.run_agents(agent_pair, include_final_state=True, display=DISPLAY)
Example #21
 def test_embedded_planning_agent(self):
     agent_evaluator = AgentEvaluator({"layout_name": "cramped_room"},
                                      {"horizon": 100})
     other_agent = GreedyHumanModel(agent_evaluator.mlp)
     epa = EmbeddedPlanningAgent(other_agent,
                                 agent_evaluator.mlp,
                                 agent_evaluator.env,
                                 delivery_horizon=1)
     ap = AgentPair(epa, other_agent)
     agent_evaluator.evaluate_agent_pair(ap, num_games=1, display=DISPLAY)
Example #22
    def repetative_runs(self, evaluator, num_games=10):
        trajectory_0 = evaluator.evaluate_human_model_pair(num_games=num_games,
                                                           native_eval=True)
        trajectory_1 = evaluator.evaluate_human_model_pair(num_games=num_games,
                                                           native_eval=True)

        h0 = GreedyHumanModel(evaluator.env.mlam)
        h1 = GreedyHumanModel(evaluator.env.mlam)
        ap_hh_2 = AgentPair(h0, h1)
        trajectory_2 = evaluator.evaluate_agent_pair(agent_pair=ap_hh_2,
                                                     num_games=num_games,
                                                     native_eval=True)

        h3 = GreedyHumanModel(evaluator.env.mlam)
        h4 = GreedyHumanModel(evaluator.env.mlam)
        ap_hh_3 = AgentPair(h3, h4)
        trajectory_3 = evaluator.evaluate_agent_pair(agent_pair=ap_hh_3,
                                                     num_games=num_games,
                                                     native_eval=True)
Example #23
 def evaluate_human_model_pair(self,
                               num_games=1,
                               display=False,
                               native_eval=False):
     a0 = GreedyHumanModel(self.env.mlam)
     a1 = GreedyHumanModel(self.env.mlam)
     agent_pair = AgentPair(a0, a1)
     return self.evaluate_agent_pair(agent_pair,
                                     num_games=num_games,
                                     display=display,
                                     native_eval=native_eval)
Example #24
def eval_pbt_over_seeds(pbt_agents, bc_agent, layout_name, num_rounds,
                        pbt_performance, agent_evaluator):
    ae = agent_evaluator
    for i in range(len(pbt_agents)):
        pbt_and_pbt = ae.evaluate_agent_pair(AgentPair(
            pbt_agents[i], pbt_agents[i], allow_duplicate_agents=True),
                                             num_games=num_rounds)
        avg_pbt_and_pbt = np.mean(pbt_and_pbt['ep_returns'])
        pbt_performance[layout_name]["PBT+PBT"].append(avg_pbt_and_pbt)

        pbt_and_bc = ae.evaluate_agent_pair(AgentPair(pbt_agents[i], bc_agent),
                                            num_games=num_rounds)
        avg_pbt_and_bc = np.mean(pbt_and_bc['ep_returns'])
        pbt_performance[layout_name]["PBT+BC_0"].append(avg_pbt_and_bc)

        bc_and_pbt = ae.evaluate_agent_pair(AgentPair(bc_agent, pbt_agents[i]),
                                            num_games=num_rounds)
        avg_bc_and_pbt = np.mean(bc_and_pbt['ep_returns'])
        pbt_performance[layout_name]["PBT+BC_1"].append(avg_bc_and_pbt)
    return pbt_performance
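
Note that pbt_performance must already map each layout name to lists that can be appended to; a minimal compatible accumulator (mirroring the defaultdict pattern used in the PPO_HM example above) would be:

from collections import defaultdict

# Compatible accumulator: layout -> metric name -> list of per-seed averages.
pbt_performance = defaultdict(lambda: defaultdict(list))
# pbt_performance = eval_pbt_over_seeds(pbt_agents, bc_agent, layout_name,
#                                       num_rounds, pbt_performance, agent_evaluator)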
Example #25
def evaluate(eval_params,
             mdp_params,
             outer_shape,
             agent_0_policy,
             agent_1_policy,
             agent_0_featurize_fn=None,
             agent_1_featurize_fn=None,
             verbose=False):
    """
    Used to visualize rollouts of trained policies

    eval_params (dict): Contains configurations such as the rollout length, number of games, and whether to display rollouts
    mdp_params (dict): OvercookedMDP compatible configuration used to create environment used for evaluation
    outer_shape (list): a list of 2 items specifying the outer shape of the evaluation layout
    agent_0_policy (rllib.Policy): Policy instance used to map states to action logits for agent 0
    agent_1_policy (rllib.Policy): Policy instance used to map states to action logits for agent 1
    agent_0_featurize_fn (func): Used to preprocess states for agent 0; defaults to lossless_state_encoding if None
    agent_1_featurize_fn (func): Used to preprocess states for agent 1; defaults to lossless_state_encoding if None
    """
    if verbose:
        print("eval mdp params", mdp_params)
    evaluator = get_base_ae(mdp_params, {
        "horizon": eval_params['ep_length'],
        "num_mdp": 1
    }, outer_shape)

    # Override pre-processing functions with defaults if necessary
    agent_0_featurize_fn = agent_0_featurize_fn if agent_0_featurize_fn else evaluator.env.lossless_state_encoding_mdp
    agent_1_featurize_fn = agent_1_featurize_fn if agent_1_featurize_fn else evaluator.env.lossless_state_encoding_mdp

    # Wrap rllib policies in overcooked agents to be compatible with Evaluator code
    agent0 = RlLibAgent(agent_0_policy,
                        agent_index=0,
                        featurize_fn=agent_0_featurize_fn)
    agent1 = RlLibAgent(agent_1_policy,
                        agent_index=1,
                        featurize_fn=agent_1_featurize_fn)

    # Compute rollouts
    if 'store_dir' not in eval_params:
        eval_params['store_dir'] = None
    if 'display_phi' not in eval_params:
        eval_params['display_phi'] = False
    results = evaluator.evaluate_agent_pair(
        AgentPair(agent0, agent1),
        num_games=eval_params['num_games'],
        display=eval_params['display'],
        dir=eval_params['store_dir'],
        display_phi=eval_params['display_phi'],
        info=verbose)

    return results
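
The eval_params keys read above are 'ep_length', 'num_games', 'display', and the optional 'store_dir' and 'display_phi'; a hypothetical configuration sketch (the values are illustrative, and the trained rllib policies are assumed to come from elsewhere):

# Hypothetical eval_params for evaluate(); only keys read by the function are set.
eval_params = {
    "ep_length": 400,   # horizon handed to the evaluator
    "num_games": 1,     # number of rollouts
    "display": False,   # whether to display rollouts
    # "store_dir" and "display_phi" are optional and default to None / False
}
# results = evaluate(eval_params, mdp_params, outer_shape,
#                    agent_0_policy, agent_1_policy, verbose=True)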
Example #26
 def test_two_coupled_agents(self):
     a0 = CoupledPlanningAgent(self.mlp_large)
     a1 = CoupledPlanningAgent(self.mlp_large)
     agent_pair = AgentPair(a0, a1)
     start_state = OvercookedState([P(
         (2, 2), n), P((2, 1), n)], {},
                                   order_list=['any'])
     env = OvercookedEnv(large_mdp, start_state_fn=lambda: start_state)
     trajectory, time_taken, _, _ = env.run_agents(agent_pair,
                                                   include_final_state=True,
                                                   display=DISPLAY)
     end_state = trajectory[-1][0]
     self.assertEqual(end_state.order_list, [])
Example #27
 def test_two_greedy_human_open_map(self):
     scenario_2_mdp = OvercookedGridworld.from_layout_name('scenario2')
     mlam = MediumLevelActionManager.from_pickle_or_compute(
         scenario_2_mdp, NO_COUNTERS_PARAMS, force_compute=force_compute)
     a0 = GreedyHumanModel(mlam)
     a1 = GreedyHumanModel(mlam)
     agent_pair = AgentPair(a0, a1)
     start_state = OvercookedState(
         [P((8, 1), s), P((1, 1), s)], {},
         all_orders=scenario_2_mdp.start_all_orders)
     env = OvercookedEnv.from_mdp(scenario_2_mdp,
                                  start_state_fn=lambda: start_state,
                                  horizon=100)
     trajectory, time_taken, _, _ = env.run_agents(agent_pair,
                                                   include_final_state=True,
                                                   display=DISPLAY)
Example #28
 def test_one_coupled_one_greedy_human(self):
     # Even though in the first ~10 timesteps it seems like agent 1 is wasting time
     # it turns out that this is actually not suboptimal as the true bottleneck is
     # going to be agent 0 later on (when it goes to get the 3rd onion)
     a0 = GreedyHumanModel(self.mlp_large)
     a1 = CoupledPlanningAgent(self.mlp_large)
     agent_pair = AgentPair(a0, a1)
     start_state = OvercookedState([P(
         (2, 1), s), P((1, 1), s)], {},
                                   order_list=['onion'])
     env = OvercookedEnv(large_mdp, start_state_fn=lambda: start_state)
     trajectory, time_taken, _, _ = env.run_agents(agent_pair,
                                                   include_final_state=True,
                                                   display=DISPLAY)
     end_state = trajectory[-1][0]
     self.assertEqual(end_state.order_list, [])
Example #29
 def test_two_greedy_human_open_map(self):
     scenario_2_mdp = OvercookedGridworld.from_layout_name(
         'scenario2', start_order_list=['any'], cook_time=5)
     mlp = MediumLevelPlanner.from_pickle_or_compute(
         scenario_2_mdp, NO_COUNTERS_PARAMS, force_compute=force_compute)
     a0 = GreedyHumanModel(mlp)
     a1 = GreedyHumanModel(mlp)
     agent_pair = AgentPair(a0, a1)
     start_state = OvercookedState([P(
         (8, 1), s), P((1, 1), s)], {},
                                   order_list=['onion'])
     env = OvercookedEnv.from_mdp(scenario_2_mdp,
                                  start_state_fn=lambda: start_state,
                                  horizon=100)
     trajectory, time_taken, _, _ = env.run_agents(agent_pair,
                                                   include_final_state=True,
                                                   display=DISPLAY)
     end_state = trajectory[-1][0]
     self.assertEqual(len(end_state.order_list), 0)
Example #30
def evaluate(eval_params, mdp_params, outer_shape, policies, featurize_fns):
    """
    Used to visualize rollouts of trained policies

    eval_params (dict): Contains configurations such as the rollout length, number of games, and whether to display rollouts
    mdp_params (dict): OvercookedMDP compatible configuration used to create environment used for evaluation
    outer_shape (list): a list of 2 items specifying the outer shape of the evaluation layout
    policies (list(rllib.Policy or str(non_ml_agent_name))): Policy instances used to map states to action logits for the agents, or names of non-ML agents
    featurize_fns (list(func)): Used to preprocess states for the agents; each defaults to lossless_state_encoding if None.
        Only used when the corresponding entry in `policies` is an rllib Policy.
    """
    assert len(policies) == len(featurize_fns), "featurize_fns needs to have same length as policies"
    evaluator = get_base_ae(mdp_params, {"horizon" : eval_params['ep_length'], "num_mdp":1, "mlam_params": eval_params.get("mlam_params")}, outer_shape)

    agents = []
    # Wrap rllib policies in overcooked agents to be compatible with Evaluator code
    for i, (policy, featurize_fn) in enumerate(zip(policies, featurize_fns)):
        if isinstance(policy, RllibPolicy):
            agent = RlLibAgent(policy, agent_index=i, 
                featurize_fn=featurize_fn or evaluator.env.lossless_state_encoding_mdp)
        else:
            agent = OvercookedMultiAgent.create_non_ml_agent(policy, eval_params["non_ml_agents_params"], evaluator.env)
            agent.set_agent_index(i)
        agents.append(agent)
   
    # Compute rollouts
    if 'store_dir' not in eval_params:
        eval_params['store_dir'] = None
    if 'display_phi' not in eval_params:
        eval_params['display_phi'] = False
    
    results = evaluator.evaluate_agent_pair(AgentPair(*agents),
                                            num_games=eval_params['num_games'],
                                            display=eval_params['display'],
                                            dir=eval_params['store_dir'],
                                            display_phi=eval_params['display_phi'],
                                            native_eval=True)
    return results