def get_agent_infos_for_trajectories(trajectories, agent_idx):
    """
    Returns a dictionary of the form
    {
        "[agent_info_0]": [ [episode_values], [], ... ],
        "[agent_info_1]": [ [], [], ... ],
        ...
    }
    whose keys are the keys returned by the agent in its agent_info dictionary
    """
    agent_infos = []
    for traj_idx in range(len(trajectories["ep_lengths"])):
        ep_infos = trajectories["ep_infos"][traj_idx]
        traj_agent_infos = [
            step_info["agent_infos"][agent_idx] for step_info in ep_infos
        ]

        # Merge all per-step agent_info dictionaries of this episode into one
        traj_agent_infos = merge_dictionaries(traj_agent_infos)
        agent_infos.append(traj_agent_infos)

    # Merge the per-episode dictionaries into a single dictionary
    agent_infos = merge_dictionaries(agent_infos)
    agent_infos = {k: np.array(v) for k, v in agent_infos.items()}
    return agent_infos
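
# Illustrative sketch only: `merge_dictionaries` is imported from elsewhere in the
# codebase and is assumed here to collect values from a list of same-keyed dicts
# into a single dict of lists, roughly like the hypothetical helper below. The name
# `_merge_dictionaries_sketch` and its append-based semantics are assumptions for
# illustration, not the library's actual implementation.
def _merge_dictionaries_sketch(dictionaries):
    from collections import defaultdict
    merged = defaultdict(list)
    for d in dictionaries:
        for key, value in d.items():
            # Append each dict's value under its key, preserving input order
            merged[key].append(value)
    return dict(merged)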
def merge_trajs(trajs_n):
    """
    Takes in multiple trajectory objects and appends all the information into
    one trajectory object

    [trajs0, trajs1] -> trajs
    """
    metadatas_merged = merge_dictionaries(
        [trajs["metadatas"] for trajs in trajs_n])
    merged_trajs = merge_dictionaries(trajs_n)
    merged_trajs["metadatas"] = metadatas_merged
    return merged_trajs
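
# Hedged usage sketch (assumed call pattern, not taken from the original source):
# rollout batches collected separately can be combined into one trajectories dict,
# e.g. before evaluation or plotting. `env` and `agent_pair` are placeholder names.
#
#   trajs_a = env.get_rollouts(agent_pair, num_games=5)
#   trajs_b = env.get_rollouts(agent_pair, num_games=5)
#   combined = merge_trajs([trajs_a, trajs_b])  # data for all 10 episodes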
def get_rollouts(self,
                 agent_pair,
                 num_games,
                 display=False,
                 final_state=False,
                 agent_idx=0,
                 reward_shaping=0.0,
                 display_until=np.Inf,
                 info=True,
                 metadata_fn=lambda x: {}):
    """
    Simulate `num_games` rollouts with the current agent_pair and return the
    processed trajectories.

    Only returns the trajectories for one of the agents (the actions _that_
    agent took), namely the one indicated by `agent_idx`.

    Returns deliberately redundant information so that the trajectories can be
    converted to any required format (baselines, stable_baselines, etc.).

    `metadata_fn` returns metadata computed at the end of each trajectory,
    based on some of the trajectory data.

    NOTE: this is the standard trajectories format used throughout the codebase
    """
    trajectories = {
        # With shape (n_episodes, game_len), where game_len might vary across games:
        "ep_observations": [],
        "ep_actions": [],
        "ep_rewards": [],  # Individual dense (= sparse + shaped * rew_shaping) reward values
        "ep_dones": [],  # Individual done values
        "ep_infos": [],

        # With shape (n_episodes, ):
        "ep_returns": [],  # Sum of sparse rewards across each episode
        "ep_lengths": [],  # Lengths of each episode
        "mdp_params": [],  # Custom MDP params for each episode
        "env_params": [],  # Custom Env params for each episode

        # Custom metadata key value pairs
        "metadatas": []  # Final data type is a dictionary of similar format to trajectories
    }

    range_fn = tqdm.trange if info else range
    for i in range_fn(num_games):
        agent_pair.set_mdp(self.mdp)

        rollout_info = self.run_agents(agent_pair,
                                       display=display,
                                       include_final_state=final_state,
                                       display_until=display_until)
        trajectory, time_taken, tot_rews_sparse, tot_rews_shaped = rollout_info
        obs, actions, rews, dones, infos = (trajectory.T[0], trajectory.T[1],
                                            trajectory.T[2], trajectory.T[3],
                                            trajectory.T[4])
        trajectories["ep_observations"].append(obs)
        trajectories["ep_actions"].append(actions)
        trajectories["ep_rewards"].append(rews)
        trajectories["ep_dones"].append(dones)
        trajectories["ep_infos"].append(infos)
        trajectories["ep_returns"].append(tot_rews_sparse)
        trajectories["ep_lengths"].append(time_taken)
        trajectories["mdp_params"].append(self.mdp.mdp_params)
        trajectories["env_params"].append(self.env_params)
        trajectories["metadatas"].append(metadata_fn(rollout_info))

        self.reset()
        agent_pair.reset()

    mu, se = mean_and_std_err(trajectories["ep_returns"])
    if info:
        print(
            "Avg reward {:.2f} (std: {:.2f}, se: {:.2f}) over {} games of avg length {}"
            .format(mu, np.std(trajectories["ep_returns"]), se, num_games,
                    np.mean(trajectories["ep_lengths"])))

    # Converting to numpy arrays
    trajectories = {k: np.array(v) for k, v in trajectories.items()}

    # Merging all metadata dictionaries, assumes same keys throughout all
    trajectories["metadatas"] = merge_dictionaries(
        trajectories["metadatas"])

    # TODO: should probably transfer check methods over to Env class
    from overcooked_ai_py.agents.benchmarking import AgentEvaluator
    AgentEvaluator.check_trajectories(trajectories)
    return trajectories
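
# Hedged usage sketch (assumed setup, not from the original source): collect a few
# rollouts and attach a custom per-episode metadata entry. `env`, `AgentPair` and
# `RandomAgent` are used here as plausible stand-ins; the exact import paths and
# constructors are assumptions for illustration.
#
#   agent_pair = AgentPair(RandomAgent(), RandomAgent())
#   trajs = env.get_rollouts(
#       agent_pair,
#       num_games=5,
#       info=False,
#       # rollout_info is (trajectory, time_taken, tot_rews_sparse, tot_rews_shaped);
#       # values are kept as lists so the per-episode metadata merges cleanly later
#       metadata_fn=lambda rollout_info: {"timesteps": [rollout_info[1]]},
#   )
#   trajs["ep_returns"]              # shape (5,): sparse return of each episode
#   trajs["metadatas"]["timesteps"]  # per-episode lengths recorded by metadata_fn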