def get_rollouts(self, agent_pair, num_games, display=False, dir=None, final_state=False,
                 display_phi=False, display_until=np.Inf, metadata_fn=None, metadata_info_fn=None, info=True):
    """
    Simulate `num_games` rollouts with the current agent_pair and return processed trajectories.

    Returns detailed information so that trajectories can be converted to any required format
    (baselines, stable_baselines, etc.).

    metadata_fn returns metadata computed at the end of each trajectory, based on some of the
    trajectory data.

    NOTE: this is the standard trajectories format used throughout the codebase
    """
    trajectories = {k: [] for k in self.DEFAULT_TRAJ_KEYS}
    metadata_fn = (lambda x: {}) if metadata_fn is None else metadata_fn
    metadata_info_fn = (lambda x: "") if metadata_info_fn is None else metadata_info_fn
    range_iterator = tqdm.trange(num_games, desc="", leave=True) if info else range(num_games)

    for i in range_iterator:
        agent_pair.set_mdp(self.mdp)

        rollout_info = self.run_agents(agent_pair, display=display, dir=dir,
                                       include_final_state=final_state,
                                       display_phi=display_phi, display_until=display_until)
        trajectory, time_taken, tot_rews_sparse, _tot_rews_shaped = rollout_info
        obs, actions, rews, dones, infos = (trajectory.T[0], trajectory.T[1], trajectory.T[2],
                                            trajectory.T[3], trajectory.T[4])
        trajectories["ep_states"].append(obs)
        trajectories["ep_actions"].append(actions)
        trajectories["ep_rewards"].append(rews)
        trajectories["ep_dones"].append(dones)
        trajectories["ep_infos"].append(infos)
        trajectories["ep_returns"].append(tot_rews_sparse)
        trajectories["ep_lengths"].append(time_taken)
        trajectories["mdp_params"].append(self.mdp.mdp_params)
        trajectories["env_params"].append(self.env_params)
        trajectories["metadatas"].append(metadata_fn(rollout_info))

        # We do not need to regenerate the MDP when generating a series of rollouts on the same MDP.
        # regen_mdp=False means we keep using the same layout and starting positions
        # (if regen_mdp were True, resetting would call mdp_gen_fn to generate another layout & starting positions).
        self.reset(regen_mdp=False)
        agent_pair.reset()

        if info:
            mu, se = mean_and_std_err(trajectories["ep_returns"])
            description = "Avg rew: {:.2f} (std: {:.2f}, se: {:.2f}); avg len: {:.2f}; ".format(
                mu, np.std(trajectories["ep_returns"]), se, np.mean(trajectories["ep_lengths"]))
            description += metadata_info_fn(trajectories["metadatas"])
            range_iterator.set_description(description)
            range_iterator.refresh()

    # Converting to numpy arrays
    trajectories = {k: np.array(v) for k, v in trajectories.items()}

    # Merging all metadata dictionaries, assumes same keys throughout all
    trajectories["metadatas"] = append_dictionaries(trajectories["metadatas"])

    # TODO: should probably transfer check methods over to Env class
    from overcooked_ai_py.agents.benchmarking import AgentEvaluator
    AgentEvaluator.check_trajectories(trajectories)
    return trajectories
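# Minimal usage sketch for get_rollouts above. The import paths and the OvercookedEnv.from_mdp
# constructor are assumptions that may differ across versions of the codebase (older versions
# construct the env as OvercookedEnv(mdp, horizon=...)).
from overcooked_ai_py.mdp.overcooked_mdp import OvercookedGridworld
from overcooked_ai_py.mdp.overcooked_env import OvercookedEnv
from overcooked_ai_py.agents.agent import AgentPair, RandomAgent

mdp = OvercookedGridworld.from_layout_name("cramped_room")
env = OvercookedEnv.from_mdp(mdp, horizon=100)
trajs = env.get_rollouts(AgentPair(RandomAgent(), RandomAgent()), num_games=2, info=False)

# Each "ep_*" entry holds one element per rollout, in the standard trajectories format described above.
print(trajs["ep_returns"], trajs["ep_lengths"])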
def evaluate_ppo_hm_and_bc(layout, ppo_hm_path, bc_test_path, num_rounds, seeds, best=False, display=False):
    ppo_hm_performance = defaultdict(lambda: defaultdict(list))

    agent_bc_test, bc_params = get_bc_agent_from_saved(bc_test_path)
    del bc_params["data_params"]
    del bc_params["mdp_fn_params"]
    evaluator = AgentEvaluator(**bc_params)

    for seed in seeds:
        agent_ppo, _ = get_ppo_agent(ppo_hm_path, seed, best=best)

        ppo_and_bc = evaluator.evaluate_agent_pair(AgentPair(agent_ppo, agent_bc_test),
                                                   num_games=num_rounds, display=display)
        avg_ppo_and_bc = np.mean(ppo_and_bc['ep_returns'])
        ppo_hm_performance[layout]["PPO_HM+BC_test_0"].append(avg_ppo_and_bc)

        bc_and_ppo = evaluator.evaluate_agent_pair(AgentPair(agent_bc_test, agent_ppo),
                                                   num_games=num_rounds, display=display)
        avg_bc_and_ppo = np.mean(bc_and_ppo['ep_returns'])
        ppo_hm_performance[layout]["PPO_HM+BC_test_1"].append(avg_bc_and_ppo)

    return ppo_hm_performance
def test_schelling_s(self):
    # Schelling failure scenario
    #
    # X X S P-D X X
    # X     ↓R     X
    # X     X      X
    # O            O
    # X     X      X
    # X     ↓H     X
    # X X D P-S X X
    #
    # The layout is completely symmetric. Both pots need 2 more onions,
    # and only one delivery is left. The best thing to do would be to split up
    # towards the different pots, but the agents must somehow coordinate on the
    # first step. In the H+R case, this doesn't work, but in the R+R it does.
    eva = AgentEvaluator(
        {
            "layout_name": "schelling_s",
            "start_order_list": ["any", "any"],
            "cook_time": 5
        },
        force_compute=force_compute)
    start_state = eva.env.mdp.get_standard_start_state()
    start_state.objects = {
        (2, 0): Obj('soup', (2, 0), ('onion', 2, 5)),
        (2, 4): Obj('soup', (2, 4), ('onion', 2, 5))
    }
    eva.start_state = start_state
    self.compare_times(eva, h_idx=1)
def evaluate_bc_models(bc_model_paths, num_rounds):
    """
    Evaluate BC models passed in over `num_rounds` rounds
    """
    best_bc_models_performance = {}

    # Evaluate best
    for layout_name in bc_model_paths['train'].keys():
        print(layout_name)
        best_bc_models_performance[layout_name] = {}

        eval_trajs = eval_with_benchmarking_from_saved(num_rounds, bc_model_paths['train'][layout_name])
        best_bc_models_performance[layout_name]["BC_train+BC_train"] = mean_and_std_err(eval_trajs['ep_returns'])

        eval_trajs = eval_with_benchmarking_from_saved(num_rounds, bc_model_paths['test'][layout_name])
        best_bc_models_performance[layout_name]["BC_test+BC_test"] = mean_and_std_err(eval_trajs['ep_returns'])

        bc_train, bc_params_train = get_bc_agent_from_saved(bc_model_paths['train'][layout_name])
        bc_test, bc_params_test = get_bc_agent_from_saved(bc_model_paths['test'][layout_name])
        del bc_params_train["data_params"]
        del bc_params_test["data_params"]
        assert common_keys_equal(bc_params_train, bc_params_test)
        ae = AgentEvaluator(mdp_params=bc_params_train["mdp_params"], env_params=bc_params_train["env_params"])

        train_and_test = ae.evaluate_agent_pair(AgentPair(bc_train, bc_test), num_games=num_rounds)
        best_bc_models_performance[layout_name]["BC_train+BC_test_0"] = mean_and_std_err(train_and_test['ep_returns'])

        test_and_train = ae.evaluate_agent_pair(AgentPair(bc_test, bc_train), num_games=num_rounds)
        best_bc_models_performance[layout_name]["BC_train+BC_test_1"] = mean_and_std_err(test_and_train['ep_returns'])

    return best_bc_models_performance
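# Illustrative call of evaluate_bc_models (a sketch only; the model names below are hypothetical,
# but the nested 'train'/'test' dict mirrors how the function indexes bc_model_paths above).
bc_model_paths = {
    "train": {"cramped_room": "bc_train_cramped_room"},
    "test": {"cramped_room": "bc_test_cramped_room"},
}
results = evaluate_bc_models(bc_model_paths, num_rounds=50)
# results["cramped_room"]["BC_train+BC_test_0"] is the (mean, standard error) pair returned by mean_and_std_err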
class TestAgentEvaluator(unittest.TestCase):

    def setUp(self):
        self.agent_eval = AgentEvaluator({"layout_name": "cramped_room"}, {"horizon": 100})

    def test_human_model_pair(self):
        trajs = self.agent_eval.evaluate_human_model_pair()
        try:
            AgentEvaluator.check_trajectories(trajs)
        except AssertionError as e:
            self.fail("Trajectories were not returned in standard format:\n{}".format(e))

    def test_rollouts(self):
        ap = AgentPair(RandomAgent(), RandomAgent())
        trajs = self.agent_eval.evaluate_agent_pair(ap, num_games=5)
        try:
            AgentEvaluator.check_trajectories(trajs)
        except AssertionError as e:
            self.fail("Trajectories were not returned in standard format:\n{}".format(e))

    def test_mlp_computation(self):
        try:
            self.agent_eval.mlp
        except Exception as e:
            self.fail("Failed to compute MediumLevelPlanner:\n{}".format(e))
def get_base_ae(mdp_params, env_params, outer_shape=None, mdp_params_schedule_fn=None):
    """
    mdp_params: one set of fixed mdp parameters used by the environment
    env_params: env parameters (horizon, etc.)
    outer_shape: outer shape of the environment
    mdp_params_schedule_fn: the schedule for varying mdp params

    return: the base agent evaluator
    """
    assert mdp_params is None or mdp_params_schedule_fn is None, \
        "at least one of mdp_params and mdp_params_schedule_fn must be None"
    if type(mdp_params) == dict and "layout_name" in mdp_params:
        ae = AgentEvaluator.from_layout_name(mdp_params=mdp_params, env_params=env_params)
    elif 'num_mdp' in env_params:
        if np.isinf(env_params['num_mdp']):
            ae = AgentEvaluator.from_mdp_params_infinite(mdp_params=mdp_params,
                                                         env_params=env_params,
                                                         outer_shape=outer_shape,
                                                         mdp_params_schedule_fn=mdp_params_schedule_fn)
        else:
            ae = AgentEvaluator.from_mdp_params_finite(mdp_params=mdp_params,
                                                       env_params=env_params,
                                                       outer_shape=outer_shape,
                                                       mdp_params_schedule_fn=mdp_params_schedule_fn)
    else:
        # should not reach this case
        raise NotImplementedError()
    return ae
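# Usage sketch for get_base_ae (defined above); the parameter values are hypothetical.
# Fixed-layout case: mdp_params names a layout, so AgentEvaluator.from_layout_name is used.
ae_fixed = get_base_ae({"layout_name": "cramped_room"}, {"horizon": 400})

# Variable-layout case: mdp_params holds layout-generation parameters (no "layout_name" key) and
# env_params carries "num_mdp"; np.inf routes to from_mdp_params_infinite, a finite value to
# from_mdp_params_finite. mdp_gen_params and the outer_shape value here are placeholders.
ae_varied = get_base_ae(mdp_gen_params, {"horizon": 400, "num_mdp": np.inf}, outer_shape=(5, 4))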
def get_data(self):
    if self.save_trajectory:
        file_path = os.path.join(TRAJECTORIES_DIR, self._create_trajectory_filename())
        traj_dict = self._get_trajectory_dict()
        AgentEvaluator.save_traj_as_json(traj_dict, file_path)
        self.trajectory = []
    return super(OvercookedGame, self).get_data()
def test_mdp_dynamics(self):
    traj_path = os.path.join(TESTING_DATA_DIR, 'test_mdp_dynamics', 'expected.json')

    # NOTE: uncomment the following line to recompute trajectories if MDP dynamics were deliberately updated
    # generate_serialized_trajectory(self.base_mdp, traj_path)

    test_trajectory = AgentEvaluator.load_traj_from_json(traj_path)
    AgentEvaluator.check_trajectories(test_trajectory, from_json=True)
def eval_with_benchmarking_from_model(n_games, model, bc_params, no_waits, display=False):
    bc_params = copy.deepcopy(bc_params)
    a0 = get_bc_agent_from_model(model, bc_params, no_waits)
    a1 = get_bc_agent_from_model(model, bc_params, no_waits)
    del bc_params["data_params"], bc_params["mdp_fn_params"]
    a_eval = AgentEvaluator(**bc_params)
    ap = AgentPair(a0, a1)
    trajectories = a_eval.evaluate_agent_pair(ap, num_games=n_games, display=display)
    return trajectories
def test_embedded_planning_agent(self):
    agent_evaluator = AgentEvaluator({"layout_name": "cramped_room"}, {"horizon": 100})
    other_agent = GreedyHumanModel(agent_evaluator.mlp)
    epa = EmbeddedPlanningAgent(other_agent, agent_evaluator.mlp, agent_evaluator.env, delivery_horizon=1)
    ap = AgentPair(epa, other_agent)
    agent_evaluator.evaluate_agent_pair(ap, num_games=1, display=DISPLAY)
def test_from_mdp_params_variable_across(self):
    for mdp_gen_params in self.mdp_gen_params_lst:
        ae0 = AgentEvaluator.from_mdp_params_infinite(mdp_params=mdp_gen_params,
                                                      env_params={"horizon": 400, "num_mdp": np.inf},
                                                      outer_shape=self.outer_shape)
        ae1 = AgentEvaluator.from_mdp_params_infinite(mdp_params=mdp_gen_params,
                                                      env_params={"horizon": 400, "num_mdp": np.inf},
                                                      outer_shape=self.outer_shape)
        self.assertFalse(ae0.env.mdp == ae1.env.mdp,
                         "2 randomly generated layouts across 2 evaluators are the same, which is wrong")
def test_common_mdp_jsons(self):
    traj_test_json_paths = iterate_over_files_in_dir("../common_tests/trajectory_tests/")
    for test_json_path in traj_test_json_paths:
        test_trajectory = AgentEvaluator.load_traj_from_json(test_json_path)
        try:
            AgentEvaluator.check_trajectories(test_trajectory)
        except AssertionError as e:
            self.fail("File {} failed with error:\n{}".format(test_json_path, e))
def generate_serialized_trajectory(mdp, save_path):
    # Saving trajectory for dynamics consistency test
    seed = 0
    sparse_reward = 0
    while sparse_reward <= 0:
        np.random.seed(seed)
        ae = AgentEvaluator.from_mdp(mdp, env_params={"horizon": 1500})
        test_trajs = ae.evaluate_random_pair(all_actions=True, num_games=1)
        sparse_reward = np.mean(test_trajs["ep_returns"])
        seed += 1
    AgentEvaluator.save_traj_as_json(test_trajs, save_path)
def test_from_mdp_params_variable_finite(self):
    for mdp_gen_params in self.mdp_gen_params_lst:
        ae = AgentEvaluator.from_mdp_params_finite(mdp_params=mdp_gen_params,
                                                   env_params={"horizon": 400, "num_mdp": 2},
                                                   outer_shape=self.outer_shape)
        mdp_0 = ae.env.mdp.copy()
        seen = [mdp_0]
        for _ in range(20):
            ae.env.reset(regen_mdp=True)
            mdp_i = ae.env.mdp
            if len(seen) == 1:
                if mdp_i != seen[0]:
                    seen.append(mdp_i.copy())
            elif len(seen) == 2:
                mdp_0, mdp_1 = seen
                self.assertTrue(mdp_i == mdp_0 or mdp_i == mdp_1,
                                "more than 2 distinct MDPs were created; the generator failed to respect num_mdp=2")
            else:
                self.assertTrue(False, "theoretically unreachable statement")
def test_from_mdp_lst_biased(self):
    mdp_lst = [OvercookedGridworld.from_layout_name(name) for name in self.layout_name_short_lst]
    ae = AgentEvaluator.from_mdp_lst(mdp_lst=mdp_lst,
                                     env_params={"horizon": 400},
                                     sampling_freq=self.biased)
    counts = {}
    for _ in range(self.num_reset):
        ae.env.reset(regen_mdp=True)
        if ae.env.mdp.layout_name in counts:
            counts[ae.env.mdp.layout_name] += 1
        else:
            counts[ae.env.mdp.layout_name] = 1

    # construct the ground truth
    gt = {self.layout_name_short_lst[i]: self.biased[i] for i in range(len(self.layout_name_short_lst))}

    for k, v in counts.items():
        self.assertAlmostEqual(gt[k], v / self.num_reset, 2, "more than 2 decimal places off for " + k)
def test_scenario_2(self):
    # Simple asymmetric advantages scenario
    #
    # X X X X X O X X X X
    # S                  O
    # D         ↑H  ↑R   X
    # X X X X X X P=X X X
    #
    # Worse version of scenario 3 (probably to be deleted)
    #
    # The optimal thing to do for the human is to go and get a dish
    # so that by the time it gets back to the pot, the soup will be ready.
    # However, H goes to get the onion, and so does R initially, as it
    # assumes H will go and get the dish. Once H has picked up the onion,
    # R realizes that it should go and get the dish itself. This leads to
    # a couple of timesteps lost (the difference could be made bigger with a
    # better thought-through map)

    start_state = OvercookedState(
        [P((5, 2), n), P((7, 2), n)],
        {(6, 3): Obj('soup', (6, 3), ('onion', 2, 0))},
        order_list=['onion'])
    mdp_params = {"layout_name": "scenario2", "cook_time": 5}
    env_params = {"start_state_fn": lambda: start_state}
    eva = AgentEvaluator(mdp_params, env_params)
    self.compare_times(eva)
def evaluate_pbt_for_layout(layout_name, num_rounds, pbt_performance, pbt_model_paths,
                            best_test_bc_models, seeds, best=False):
    bc_agent, bc_params = get_bc_agent_from_saved(model_name=best_test_bc_models[layout_name])
    ae = AgentEvaluator(mdp_params=bc_params["mdp_params"], env_params=bc_params["env_params"])

    pbt_save_dir = PBT_DATA_DIR + pbt_model_paths[layout_name] + "/"
    pbt_config = load_dict_from_txt(pbt_save_dir + "config")
    assert common_keys_equal(bc_params["mdp_params"], pbt_config["mdp_params"]), \
        "Mdp params differed between PBT and BC models training"
    assert common_keys_equal(bc_params["env_params"], pbt_config["env_params"]), \
        "Env params differed between PBT and BC models training"

    pbt_agents = [get_pbt_agent_from_config(pbt_save_dir, pbt_config["sim_threads"],
                                            seed=seed, agent_idx=0, best=best)
                  for seed in seeds]
    eval_pbt_over_seeds(pbt_agents, bc_agent, layout_name, num_rounds, pbt_performance, ae)
    return pbt_performance
def test_scenario_3_yes_counter(self):
    # Asymmetric advantage scenario
    #
    # X X X X X O X X X X
    # S           X X P X
    # X     ↑H          X
    # D   X X X X!X X X
    # X           →R    O
    # X X X X X X X X X X
    #
    # This test allows (5, 3) as the only counter that can be used

    mdp_params = {"layout_name": "scenario3"}
    mdp = OvercookedGridworld.from_layout_name(**mdp_params)
    start_state = mdp.get_standard_start_state()

    valid_counters = [(5, 3)]
    one_counter_params = {
        'start_orientations': False,
        'wait_allowed': False,
        'counter_goals': valid_counters,
        'counter_drop': valid_counters,
        'counter_pickup': [],
        'same_motion_goals': True
    }

    env_params = {"start_state_fn": lambda: start_state, "horizon": 1000}
    eva = AgentEvaluator.from_layout_name(mdp_params, env_params,
                                          mlam_params=one_counter_params,
                                          force_compute=force_compute)
    self.repetative_runs(eva)
def test_scenario_4(self):
    # Yet another asymmetric advantage scenario
    #
    # X X X X X O X X X X
    # S             X P=X
    # D         ↑H      X
    # X     X X X X X X X
    # X X X X X X →R    O
    # X X X X X X X X X X
    #
    # Similar to scenario 3, just keeping for reference for now.
    # In this case we only have human suboptimality, and R
    # assuming H optimality does not end up being a problem

    mdp_params = {"layout_name": "scenario4", "cook_time": 5}
    mdp = OvercookedGridworld.from_layout_name(**mdp_params)
    start_state = mdp.get_standard_start_state()
    start_state.objects = {(8, 1): Obj('soup', (8, 1), ('onion', 2, 5))}
    start_state.order_list = ['onion']
    env_params = {"start_state_fn": lambda: start_state, "horizon": 1000}
    eva = AgentEvaluator(mdp_params, env_params, force_compute=force_compute)
    self.compare_times(eva)
def test_trajectory_visualization(self):
    # We don't have a good way to check the slider automatically, so this mostly tests basic things
    # like the number of output images and whether calling the method raises an error.
    traj_path = os.path.join(TESTING_DATA_DIR, 'test_state_visualizer', 'test_trajectory.json')
    test_trajectory = AgentEvaluator.load_traj_from_json(traj_path)
    expected_images_num = len(test_trajectory["ep_states"][0])
    assert expected_images_num == 10
    action_probs = [
        [RandomAgent(all_actions=True).action(state)[1]["action_probs"]] * 2
        for state in test_trajectory["ep_states"][0]
    ]

    result_img_directory_path = StateVisualizer().display_rendered_trajectory(
        test_trajectory, action_probs=action_probs, ipython_display=False)
    self.assertEqual(get_file_count(result_img_directory_path), expected_images_num)

    custom_img_directory_path = generate_temporary_file_path(
        prefix="overcooked_visualized_trajectory", extension="")
    self.assertNotEqual(custom_img_directory_path, result_img_directory_path)
    result_img_directory_path = StateVisualizer().display_rendered_trajectory(
        test_trajectory, img_directory_path=custom_img_directory_path, ipython_display=False)
    self.assertEqual(custom_img_directory_path, result_img_directory_path)
    self.assertEqual(get_file_count(result_img_directory_path), expected_images_num)
def setUp(self):
    Recipe.configure({})
    trajectory_path = os.path.join(TESTING_DATA_DIR, "test_visualizations", "trajectory.json")
    events_path = os.path.join(TESTING_DATA_DIR, "test_visualizations", "expected_extracted_events.json")
    self.trajectory1 = AgentEvaluator.load_traj_from_json(trajectory_path)
    self.extracted_events1 = load_from_json(events_path)
def test_unidentifiable_s(self):
    # Same as above, but smaller layout to facilitate DRL training
    eva = AgentEvaluator(
        {
            "layout_name": "asymmetric_advantages",
            "start_order_list": ["any", "any"],
            "cook_time": 5
        },
        force_compute=force_compute)
    start_state = eva.env.mdp.get_standard_start_state()
    start_state.objects = {
        (4, 2): Obj('soup', (4, 2), ('onion', 2, 0)),
        (4, 3): Obj('soup', (4, 3), ('onion', 3, 5))
    }
    eva.start_state = start_state
    self.compare_times(eva, h_idx=0)
def test_from_mdp(self):
    for layout_name in self.layout_name_lst:
        original_mdp = OvercookedGridworld.from_layout_name(layout_name)
        ae = AgentEvaluator.from_mdp(mdp=original_mdp, env_params={"horizon": 400})
        ae_mdp = ae.env.mdp
        self.assertEqual(original_mdp, ae_mdp,
                         "mdp with name " + layout_name + " experienced an inconsistency")
def df_traj_to_python_joint_traj(traj_df, complete_traj=True):
    if len(traj_df) == 0:
        return None

    datapoint = traj_df.iloc[0]
    python_layout_name = JS_LAYOUT_NAME_TO_PYTHON_NAME[datapoint['layout_name']]
    # python_layout_name = datapoint['layout_name']
    agent_evaluator = AgentEvaluator(
        mdp_params={"layout_name": python_layout_name},
        env_params={"horizon": 1250}
    )
    mdp = agent_evaluator.env.mdp
    env = agent_evaluator.env

    overcooked_states = [json_state_to_python_state(mdp, s) for s in traj_df.state]
    overcooked_actions = [json_joint_action_to_python_action(joint_action) for joint_action in traj_df.joint_action]
    overcooked_rewards = list(traj_df.reward_norm)

    assert sum(overcooked_rewards) == datapoint.reward_norm_total, \
        "Rewards didn't sum up to cumulative rewards. Probably trajectory df is corrupted / not complete"

    trajectories = {
        "ep_observations": [overcooked_states],
        "ep_actions": [overcooked_actions],
        "ep_rewards": [overcooked_rewards],              # Individual (dense) reward values
        "ep_dones": [[False] * len(overcooked_states)],  # Individual done values
        "ep_returns": [sum(overcooked_rewards)],         # Sum of dense rewards across each episode
        "ep_returns_sparse": [sum(overcooked_rewards)],  # Sum of sparse rewards across each episode
        "ep_lengths": [len(overcooked_states)],          # Lengths of each episode
        "mdp_params": [mdp.mdp_params],
        "env_params": [env.env_params]
    }
    trajectories = {k: np.array(v) if k != "ep_actions" else v for k, v in trajectories.items()}

    if complete_traj:
        agent_evaluator.check_trajectories(trajectories)

    traj_metadata = {
        'worker_id': datapoint['workerid_num'],
        'round_num': datapoint['round_num'],
        'mdp': agent_evaluator.env.mdp
    }
    return trajectories, traj_metadata
def test_from_mdp_params_variable_infinite_specified(self):
    for mdp_gen_params in self.mdp_gen_params_lst:
        ae = AgentEvaluator.from_mdp_params_infinite(mdp_params=mdp_gen_params,
                                                     env_params={"horizon": 400, "num_mdp": np.inf},
                                                     outer_shape=self.outer_shape)
        mdp_0 = ae.env.mdp.copy()
        for _ in range(5):
            ae.env.reset(regen_mdp=True)
            mdp_1 = ae.env.mdp
            self.assertFalse(mdp_0 == mdp_1,
                             "with infinite layout generator and regen_mdp=True, the 2 layouts should not be the same")
def _get_trajectory_dict(self):
    trajectories = {k: [] for k in self.env.DEFAULT_TRAJ_KEYS}
    trajectory = np.array(self.trajectory)
    obs, actions, rews, dones, infos = (trajectory.T[0], trajectory.T[1], trajectory.T[2],
                                        trajectory.T[3], trajectory.T[4])
    infos[-1] = self.env._add_episode_info(infos[-1])
    trajectories["ep_states"].append(obs)
    trajectories["ep_actions"].append(actions)
    trajectories["ep_rewards"].append(rews)
    trajectories["ep_dones"].append(dones)
    trajectories["ep_infos"].append(infos)
    trajectories["ep_returns"].append(self.score)
    trajectories["ep_lengths"].append(self.env.state.timestep)
    trajectories["mdp_params"].append(self.env.mdp.mdp_params)
    trajectories["env_params"].append(self.env.env_params)
    trajectories["metadatas"].append({})
    trajectories = {k: np.array(v) for k, v in trajectories.items()}
    AgentEvaluator.check_trajectories(trajectories)
    return trajectories
def test_save_load(self):
    # Train a quick self play agent for 2 iterations
    ex.run(
        config_updates={
            # Please feel free to modify the parameters below
            "results_dir": self.temp_results_dir,
            "experiment_name": "save_load_test",
            "layout_name": "cramped_room",
            "num_workers": 1,
            "train_batch_size": 800,
            "sgd_minibatch_size": 800,
            "num_training_iters": 2,
            "evaluation_interval": 10,
            "entropy_coeff_start": 0.0,
            "entropy_coeff_end": 0.0,
            "use_phi": False,
            "evaluation_display": False,
            "verbose": False
        },
        options={'--loglevel': 'ERROR'})

    # Kill all ray processes to ensure loading works in a vacuum
    ray.shutdown()

    # Where the agent is stored (this is kind of hardcoded; it would be nice for it to be more easily obtainable)
    load_path = os.path.join(
        glob.glob(os.path.join(self.temp_results_dir, "save_load_test*"))[0],
        'checkpoint_2', 'checkpoint-2')

    # Load a dummy state
    mdp = OvercookedGridworld.from_layout_name("cramped_room")
    state = mdp.get_standard_start_state()

    # Ensure simple single-agent loading works
    agent_0 = load_agent(load_path)
    agent_0.reset()
    agent_1 = load_agent(load_path)
    agent_1.reset()

    # Ensure forward pass of policy network still works
    _, _ = agent_0.action(state)
    _, _ = agent_1.action(state)

    # Now let's load an agent pair and evaluate it
    agent_pair = load_agent_pair(load_path)
    ae = AgentEvaluator.from_layout_name(
        mdp_params={"layout_name": "cramped_room"},
        env_params={"horizon": 400})

    # We assume no runtime errors => success, no performance consistency check for now
    ae.evaluate_agent_pair(agent_pair, 1, info=False)
def df_traj_to_python_joint_traj(traj_df, complete_traj=True):
    if len(traj_df) == 0:
        return None

    datapoint = traj_df.iloc[0]
    layout_name = datapoint['layout_name']
    agent_evaluator = AgentEvaluator.from_layout_name(
        mdp_params={"layout_name": layout_name},
        env_params={"horizon": 1250}  # Defining the horizon of the mdp of origin of the trajectories
    )
    mdp = agent_evaluator.env.mdp
    env = agent_evaluator.env

    overcooked_states = [json_state_to_python_state(s) for s in traj_df.state]
    overcooked_actions = [json_joint_action_to_python_action(joint_action) for joint_action in traj_df.joint_action]
    overcooked_rewards = list(traj_df.reward)

    assert sum(overcooked_rewards) == datapoint.score_total, \
        "Rewards didn't sum up to cumulative rewards. Probably trajectory df is corrupted / not complete"

    trajectories = {
        "ep_observations": [overcooked_states],
        "ep_actions": [overcooked_actions],
        "ep_rewards": [overcooked_rewards],              # Individual (dense) reward values
        "ep_dones": [[False] * len(overcooked_states)],  # Individual done values
        "ep_infos": [{}] * len(overcooked_states),
        "ep_returns": [sum(overcooked_rewards)],         # Sum of dense rewards across each episode
        "ep_lengths": [len(overcooked_states)],          # Lengths of each episode
        "mdp_params": [mdp.mdp_params],
        "env_params": [env.env_params],
        "metadatas": {
            'player_0_id': [datapoint['player_0_id']],
            'player_1_id': [datapoint['player_1_id']],
            'env': [agent_evaluator.env]
        }
    }
    trajectories = {
        k: np.array(v) if k not in ["ep_actions", "metadatas"] else v
        for k, v in trajectories.items()
    }

    if complete_traj:
        agent_evaluator.check_trajectories(trajectories)
    return trajectories
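# Minimal sketch of converting a dataframe of logged human games with df_traj_to_python_joint_traj above.
# Assumes a pandas DataFrame with the columns the function reads ('layout_name', 'state', 'joint_action',
# 'reward', 'score_total', 'player_0_id', 'player_1_id'); the CSV path and the 'trial_id' grouping column
# are hypothetical placeholders.
import pandas as pd

all_games_df = pd.read_csv("human_data.csv")
joint_trajs = [
    df_traj_to_python_joint_traj(single_traj_df, complete_traj=True)
    for _, single_traj_df in all_games_df.groupby("trial_id")
]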