def get_agent_infos_for_trajectories(trajectories, agent_idx):
    """
    Returns a dictionary of the form
    {
        "[agent_info_0]": [ [episode_values], [], ... ],
        "[agent_info_1]": [ [], [], ... ],
        ...
    }
    whose keys are the keys returned by the agent in its agent_info dictionary

    NOTE: deprecated
    """
    agent_infos = []
    for traj_idx in range(len(trajectories["ep_lengths"])):
        ep_infos = trajectories["ep_infos"][traj_idx]
        traj_agent_infos = [step_info["agent_infos"][agent_idx] for step_info in ep_infos]

        # Merge the per-step dictionaries into one dictionary of lists for this episode
        traj_agent_infos = append_dictionaries(traj_agent_infos)
        agent_infos.append(traj_agent_infos)

    # Merge the per-episode dictionaries into one dictionary of lists of episode values
    agent_infos = append_dictionaries(agent_infos)
    agent_infos = {k: np.array(v) for k, v in agent_infos.items()}
    return agent_infos
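# Usage sketch (illustrative only, not part of the original source): given `trajectories`
# in the standard format produced by get_rollouts below, the per-step agent_info entries
# for agent 0 could be gathered as
#
#   agent_infos = get_agent_infos_for_trajectories(trajectories, agent_idx=0)
#
# where each value (e.g. a hypothetical agent_infos["action_probs"]) is an array with
# one entry per episode.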
def get_rollouts(self, agent_pair, num_games, display=False, dir=None, final_state=False,
                 display_phi=False, display_until=np.Inf, metadata_fn=None,
                 metadata_info_fn=None, info=True):
    """
    Simulate `num_games` rollouts with the current agent_pair and return processed trajectories.

    Deliberately returns more information than strictly necessary, so that trajectories can be
    converted to any required format (baselines, stable_baselines, etc.)

    metadata_fn returns metadata computed at the end of each trajectory from some of the
    trajectory data.

    NOTE: this is the standard trajectories format used throughout the codebase
    """
    trajectories = {k: [] for k in self.DEFAULT_TRAJ_KEYS}
    metadata_fn = (lambda x: {}) if metadata_fn is None else metadata_fn
    metadata_info_fn = (lambda x: "") if metadata_info_fn is None else metadata_info_fn
    range_iterator = (
        tqdm.trange(num_games, desc="", leave=True) if info else range(num_games)
    )
    for i in range_iterator:
        agent_pair.set_mdp(self.mdp)

        rollout_info = self.run_agents(agent_pair, display=display, dir=dir,
                                       include_final_state=final_state,
                                       display_phi=display_phi,
                                       display_until=display_until)
        trajectory, time_taken, tot_rews_sparse, _tot_rews_shaped = rollout_info
        obs, actions, rews, dones, infos = (
            trajectory.T[0], trajectory.T[1], trajectory.T[2], trajectory.T[3], trajectory.T[4]
        )
        trajectories["ep_states"].append(obs)
        trajectories["ep_actions"].append(actions)
        trajectories["ep_rewards"].append(rews)
        trajectories["ep_dones"].append(dones)
        trajectories["ep_infos"].append(infos)
        trajectories["ep_returns"].append(tot_rews_sparse)
        trajectories["ep_lengths"].append(time_taken)
        trajectories["mdp_params"].append(self.mdp.mdp_params)
        trajectories["env_params"].append(self.env_params)
        trajectories["metadatas"].append(metadata_fn(rollout_info))

        # We do not need to regenerate the MDP when generating a series of rollouts on the same MDP.
        # regen_mdp=False means we keep the same layout and starting positions
        # (if regen_mdp were True, resetting would call mdp_gen_fn to generate another layout & starting position)
        self.reset(regen_mdp=False)
        agent_pair.reset()

        if info:
            mu, se = mean_and_std_err(trajectories["ep_returns"])
            description = "Avg rew: {:.2f} (std: {:.2f}, se: {:.2f}); avg len: {:.2f}; ".format(
                mu, np.std(trajectories["ep_returns"]), se, np.mean(trajectories["ep_lengths"]))
            description += metadata_info_fn(trajectories["metadatas"])
            range_iterator.set_description(description)
            range_iterator.refresh()

    # Converting to numpy arrays
    trajectories = {k: np.array(v) for k, v in trajectories.items()}

    # Merging all metadata dictionaries, assumes same keys throughout all
    trajectories["metadatas"] = append_dictionaries(trajectories["metadatas"])

    # TODO: should probably transfer check methods over to Env class
    from overcooked_ai_py.agents.benchmarking import AgentEvaluator
    AgentEvaluator.check_trajectories(trajectories)
    return trajectories
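    # Usage sketch (illustrative assumption, not part of the original source): collecting
    # rollouts with a custom metadata_fn, assuming `env` is an instance of this environment
    # class and `agent_pair` is an already-constructed agent pair.
    #
    #   def sparse_return_metadata(rollout_info):
    #       # rollout_info is (trajectory, time_taken, tot_rews_sparse, tot_rews_shaped)
    #       _, _, tot_rews_sparse, _ = rollout_info
    #       return {"sparse_return": tot_rews_sparse}
    #
    #   trajectories = env.get_rollouts(agent_pair, num_games=5,
    #                                   metadata_fn=sparse_return_metadata)
    #
    #   # After the final append_dictionaries call, trajectories["metadatas"]["sparse_return"]
    #   # holds one value per game.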