def add_means_and_stds_from_df(data, main_trials, algo_name):
    """Calculate means and SEs for each layout, and add them to the data
    dictionary under the algorithm name `algo_name`"""
    layouts = ['asymmetric_advantages', 'coordination_ring', 'cramped_room', 'random0', 'random3']
    for layout in layouts:
        layout_trials = main_trials[main_trials['layout_name'] == layout]

        # Collect the worker ids that played as the human in each player slot
        idx_1_workers = []
        idx_0_workers = []
        for worker_id in layout_trials['player_0_id'].unique():
            if layout_trials[layout_trials['player_0_id'] == worker_id]['player_0_is_human'].iloc[0]:
                idx_0_workers.append(worker_id)

        for worker_id in layout_trials['player_1_id'].unique():
            if layout_trials[layout_trials['player_1_id'] == worker_id]['player_1_is_human'].iloc[0]:
                idx_1_workers.append(worker_id)

        idx_0_trials = layout_trials[layout_trials['player_0_id'].isin(idx_0_workers)]
        data[layout][algo_name + "_0"] = mean_and_std_err(
            idx_0_trials.groupby('player_0_id')['score_total'].mean())

        idx_1_trials = layout_trials[layout_trials['player_1_id'].isin(idx_1_workers)]
        data[layout][algo_name + "_1"] = mean_and_std_err(
            idx_1_trials.groupby('player_1_id')['score_total'].mean())
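
# `mean_and_std_err` is used throughout this file but defined elsewhere; a minimal
# sketch of what it presumably computes, assuming it returns a (mean, standard
# error of the mean) tuple over a sequence of values. The name below is a
# hypothetical stand-in, not the codebase's actual helper:
import numpy as np

def mean_and_std_err_sketch(values):
    """Hypothetical stand-in: mean and standard error of the mean (std / sqrt(n))."""
    values = np.asarray(values, dtype=float)
    return np.mean(values), np.std(values) / np.sqrt(len(values))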
def evaluate_bc_models(bc_model_paths, num_rounds):
    """
    Evaluate the BC models passed in over `num_rounds` rounds
    """
    best_bc_models_performance = {}

    # Evaluate each best BC model in self-play, then in train/test cross-play
    for layout_name in bc_model_paths['train'].keys():
        print(layout_name)
        best_bc_models_performance[layout_name] = {}

        eval_trajs = eval_with_benchmarking_from_saved(num_rounds, bc_model_paths['train'][layout_name])
        best_bc_models_performance[layout_name]["BC_train+BC_train"] = mean_and_std_err(eval_trajs['ep_returns'])

        eval_trajs = eval_with_benchmarking_from_saved(num_rounds, bc_model_paths['test'][layout_name])
        best_bc_models_performance[layout_name]["BC_test+BC_test"] = mean_and_std_err(eval_trajs['ep_returns'])

        bc_train, bc_params_train = get_bc_agent_from_saved(bc_model_paths['train'][layout_name])
        bc_test, bc_params_test = get_bc_agent_from_saved(bc_model_paths['test'][layout_name])
        del bc_params_train["data_params"]
        del bc_params_test["data_params"]
        assert common_keys_equal(bc_params_train, bc_params_test)
        ae = AgentEvaluator(mdp_params=bc_params_train["mdp_params"], env_params=bc_params_train["env_params"])

        train_and_test = ae.evaluate_agent_pair(AgentPair(bc_train, bc_test), num_games=num_rounds)
        best_bc_models_performance[layout_name]["BC_train+BC_test_0"] = mean_and_std_err(train_and_test['ep_returns'])

        test_and_train = ae.evaluate_agent_pair(AgentPair(bc_test, bc_train), num_games=num_rounds)
        best_bc_models_performance[layout_name]["BC_train+BC_test_1"] = mean_and_std_err(test_and_train['ep_returns'])

    return best_bc_models_performance
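
# A hypothetical usage sketch for evaluate_bc_models. The checkpoint paths below
# are illustrative placeholders, not real files from the repo; the expected input
# shape is inferred from how bc_model_paths is indexed above:
def _example_evaluate_bc_models():
    example_paths = {
        "train": {"cramped_room": "path/to/bc_train/cramped_room"},
        "test": {"cramped_room": "path/to/bc_test/cramped_room"},
    }
    performance = evaluate_bc_models(example_paths, num_rounds=50)
    # Each entry is a (mean, standard_error) tuple, e.g.:
    # performance["cramped_room"]["BC_train+BC_test_0"]
    return performance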
def get_rollouts(self, agent_pair, num_games, display=False, dir=None, final_state=False,
                 display_phi=False, display_until=np.inf, metadata_fn=None,
                 metadata_info_fn=None, info=True):
    """
    Simulate `num_games` rollouts with the current agent_pair and return processed
    trajectories.

    Returns excessive information in order to be able to convert the trajectories
    to any required format (baselines, stable_baselines, etc).

    metadata_fn returns some metadata information computed at the end of each
    trajectory, based on some of the trajectory data.

    NOTE: this is the standard trajectories format used throughout the codebase
    """
    trajectories = {k: [] for k in self.DEFAULT_TRAJ_KEYS}
    metadata_fn = (lambda x: {}) if metadata_fn is None else metadata_fn
    metadata_info_fn = (lambda x: "") if metadata_info_fn is None else metadata_info_fn
    range_iterator = tqdm.trange(num_games, desc="", leave=True) if info else range(num_games)
    for i in range_iterator:
        agent_pair.set_mdp(self.mdp)

        rollout_info = self.run_agents(agent_pair, display=display, dir=dir,
                                       include_final_state=final_state,
                                       display_phi=display_phi,
                                       display_until=display_until)
        trajectory, time_taken, tot_rews_sparse, _tot_rews_shaped = rollout_info
        obs, actions, rews, dones, infos = (trajectory.T[0], trajectory.T[1], trajectory.T[2],
                                            trajectory.T[3], trajectory.T[4])
        trajectories["ep_states"].append(obs)
        trajectories["ep_actions"].append(actions)
        trajectories["ep_rewards"].append(rews)
        trajectories["ep_dones"].append(dones)
        trajectories["ep_infos"].append(infos)
        trajectories["ep_returns"].append(tot_rews_sparse)
        trajectories["ep_lengths"].append(time_taken)
        trajectories["mdp_params"].append(self.mdp.mdp_params)
        trajectories["env_params"].append(self.env_params)
        trajectories["metadatas"].append(metadata_fn(rollout_info))

        # We do not need to regenerate the MDP when generating a series of rollouts
        # using the same MDP. The False here means we keep the same layout and
        # starting positions (if regen_mdp were True, resetting would call
        # mdp_gen_fn to generate another layout & starting positions).
        self.reset(regen_mdp=False)
        agent_pair.reset()

        if info:
            mu, se = mean_and_std_err(trajectories["ep_returns"])
            description = "Avg rew: {:.2f} (std: {:.2f}, se: {:.2f}); avg len: {:.2f}; ".format(
                mu, np.std(trajectories["ep_returns"]), se,
                np.mean(trajectories["ep_lengths"]))
            description += metadata_info_fn(trajectories["metadatas"])
            range_iterator.set_description(description)
            range_iterator.refresh()

    # Converting to numpy arrays
    trajectories = {k: np.array(v) for k, v in trajectories.items()}

    # Merging all metadata dictionaries, assumes same keys throughout all
    trajectories["metadatas"] = append_dictionaries(trajectories["metadatas"])

    # TODO: should probably transfer check methods over to the Env class
    from overcooked_ai_py.agents.benchmarking import AgentEvaluator
    AgentEvaluator.check_trajectories(trajectories)
    return trajectories
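
# A hypothetical metadata_fn / metadata_info_fn pair for the get_rollouts variant
# above. metadata_fn receives the full rollout_info tuple returned by run_agents;
# metadata_info_fn receives the list of per-episode metadata dicts (the in-loop,
# pre-merge form). Both names are illustrative, not part of the codebase:
def example_metadata_fn(rollout_info):
    # rollout_info = (trajectory, time_taken, tot_rews_sparse, tot_rews_shaped)
    _trajectory, _time_taken, tot_rews_sparse, tot_rews_shaped = rollout_info
    return {"sparse_return": tot_rews_sparse, "shaped_return": tot_rews_shaped}

def example_metadata_info_fn(metadatas):
    # metadatas is a list of the per-episode dicts produced by metadata_fn
    sparse = [m["sparse_return"] for m in metadatas]
    return "avg sparse: {:.2f}".format(sum(sparse) / len(sparse))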
def get_rollouts(self, agent_pair, num_games, display=False, final_state=False, agent_idx=0,
                 reward_shaping=0.0, display_until=np.inf, info=True, metadata_fn=lambda x: {}):
    """
    Simulate `num_games` rollouts with the current agent_pair and return processed
    trajectories.

    Only returns the trajectories for one of the agents (the actions _that_ agent
    took), namely the one indicated by `agent_idx`.

    Returns excessive information in order to be able to convert the trajectories
    to any required format (baselines, stable_baselines, etc).

    metadata_fn returns some metadata information computed at the end of each
    trajectory, based on some of the trajectory data.

    NOTE: standard trajectories format used throughout the codebase
    """
    trajectories = {
        # With shape (n_episodes, game_len), where game_len might vary across games:
        "ep_observations": [],
        "ep_actions": [],
        "ep_rewards": [],  # Individual dense (= sparse + shaped * rew_shaping) reward values
        "ep_dones": [],  # Individual done values
        "ep_infos": [],

        # With shape (n_episodes, ):
        "ep_returns": [],  # Sum of sparse rewards across each episode
        "ep_lengths": [],  # Lengths of each episode
        "mdp_params": [],  # Custom MDP params for each episode
        "env_params": [],  # Custom Env params for each episode

        # Custom metadata key-value pairs; final data type is a dictionary of
        # similar format to trajectories
        "metadatas": []
    }

    range_fn = tqdm.trange if info else range
    for i in range_fn(num_games):
        agent_pair.set_mdp(self.mdp)

        rollout_info = self.run_agents(agent_pair, display=display,
                                       include_final_state=final_state,
                                       display_until=display_until)
        trajectory, time_taken, tot_rews_sparse, tot_rews_shaped = rollout_info
        obs, actions, rews, dones, infos = (trajectory.T[0], trajectory.T[1], trajectory.T[2],
                                            trajectory.T[3], trajectory.T[4])
        trajectories["ep_observations"].append(obs)
        trajectories["ep_actions"].append(actions)
        trajectories["ep_rewards"].append(rews)
        trajectories["ep_dones"].append(dones)
        trajectories["ep_infos"].append(infos)
        trajectories["ep_returns"].append(tot_rews_sparse)
        trajectories["ep_lengths"].append(time_taken)
        trajectories["mdp_params"].append(self.mdp.mdp_params)
        trajectories["env_params"].append(self.env_params)
        trajectories["metadatas"].append(metadata_fn(rollout_info))

        self.reset()
        agent_pair.reset()

    mu, se = mean_and_std_err(trajectories["ep_returns"])
    if info:
        print("Avg reward {:.2f} (std: {:.2f}, se: {:.2f}) over {} games of avg length {}".format(
            mu, np.std(trajectories["ep_returns"]), se, num_games,
            np.mean(trajectories["ep_lengths"])))

    # Converting to numpy arrays
    trajectories = {k: np.array(v) for k, v in trajectories.items()}

    # Merging all metadata dictionaries, assumes same keys throughout all
    trajectories["metadatas"] = merge_dictionaries(trajectories["metadatas"])

    # TODO: should probably transfer check methods over to the Env class
    from overcooked_ai_py.agents.benchmarking import AgentEvaluator
    AgentEvaluator.check_trajectories(trajectories)
    return trajectories
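
# `merge_dictionaries` (and `append_dictionaries` in the variant above) is defined
# elsewhere; presumably it turns the per-episode list of metadata dicts into a
# single dict of lists, matching the "similar format to trajectories" comment.
# A minimal sketch under that assumption, with a hypothetical name:
def merge_dictionaries_sketch(dicts):
    """Hypothetical stand-in: [{"a": 1}, {"a": 2}] -> {"a": [1, 2]}.
    Assumes every dict shares the same keys."""
    if len(dicts) == 0:
        return {}
    return {k: [d[k] for d in dicts] for k in dicts[0]}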
def get_rollouts(self, agent_pair, num_games, display=False, final_state=False, agent_idx=0,
                 reward_shaping=0.0, display_until=np.inf, info=True):
    """
    Simulate `num_games` rollouts with the current agent_pair and return processed
    trajectories.

    Only returns the trajectories for one of the agents (the actions _that_ agent
    took), namely the one indicated by `agent_idx`.

    Returns excessive information in order to be able to convert the trajectories
    to any required format (baselines, stable_baselines, etc).

    NOTE: standard trajectories format used throughout the codebase
    """
    trajectories = {
        # With shape (n_episodes, game_len), where game_len might vary across games:
        "ep_observations": [],
        "ep_actions": [],
        "ep_rewards": [],  # Individual dense (= sparse + shaped * rew_shaping) reward values
        "ep_dones": [],  # Individual done values

        # With shape (n_episodes, ):
        "ep_returns": [],  # Sum of dense and sparse rewards across each episode
        "ep_returns_sparse": [],  # Sum of sparse rewards across each episode
        "ep_lengths": [],  # Lengths of each episode
        "mdp_params": [],  # Custom MDP params for each episode
        "env_params": []  # Custom Env params for each episode
    }

    for _ in tqdm.trange(num_games):
        agent_pair.set_mdp(self.mdp)

        trajectory, time_taken, tot_rews_sparse, tot_rews_shaped = self.run_agents(
            agent_pair, display=display, include_final_state=final_state,
            display_until=display_until)
        obs, actions, rews, dones = (trajectory.T[0], trajectory.T[1],
                                     trajectory.T[2], trajectory.T[3])
        trajectories["ep_observations"].append(obs)
        trajectories["ep_actions"].append(actions)
        trajectories["ep_rewards"].append(rews)
        trajectories["ep_dones"].append(dones)
        # Dense episode return: sparse return plus the shaped return scaled by
        # the reward_shaping coefficient
        trajectories["ep_returns"].append(tot_rews_sparse + tot_rews_shaped * reward_shaping)
        trajectories["ep_returns_sparse"].append(tot_rews_sparse)
        trajectories["ep_lengths"].append(time_taken)
        trajectories["mdp_params"].append(self.mdp.mdp_params)
        trajectories["env_params"].append(self.env_params)

        self.reset()
        agent_pair.reset()

    mu, se = mean_and_std_err(trajectories["ep_returns"])
    if info:
        print("Avg reward {:.2f} (std: {:.2f}, se: {:.2f}) over {} games of avg length {}".format(
            mu, np.std(trajectories["ep_returns"]), se, num_games,
            np.mean(trajectories["ep_lengths"])))

    # Converting to numpy arrays
    trajectories = {k: np.array(v) for k, v in trajectories.items()}
    return trajectories
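
# Hypothetical usage of the legacy get_rollouts variant above; the env and agent
# construction are illustrative, not a confirmed invocation from the repo. With
# reward_shaping=0.5, an episode with sparse return 100 and shaped return 40 is
# recorded in ep_returns as 100 + 40 * 0.5 = 120, while ep_returns_sparse keeps
# the raw 100:
#
#   agent_pair = AgentPair(some_agent, other_agent)
#   trajs = env.get_rollouts(agent_pair, num_games=10, reward_shaping=0.5)
#   trajs["ep_returns"][0], trajs["ep_returns_sparse"][0]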