def test_get_encoding_function(self):
    mdp = OvercookedGridworld.from_layout_name("cramped_room")
    mdp_params = mdp.mdp_params
    env_params = {"horizon": 100}
    env = OvercookedEnv.from_mdp(mdp, **env_params)
    state = mdp.get_standard_start_state()

    example_encoding_fns_names = [
        "mdp.multi_hot_orders_encoding",
        "env.featurize_state_mdp",
        "env.lossless_state_encoding_mdp",
    ]
    example_encoding_fns = [
        mdp.multi_hot_orders_encoding,
        env.featurize_state_mdp,
        env.lossless_state_encoding_mdp,
    ]

    for encoding_fn_name, encoding_fn in zip(example_encoding_fns_names,
                                             example_encoding_fns):
        encoding_fn_from_name = get_encoding_function(encoding_fn_name, env=env)
        self.assertEqual(encoding_fn_from_name, encoding_fn)

        if encoding_fn_name.split(".")[0] == "mdp":
            encoding_fn_from_name = get_encoding_function(encoding_fn_name, mdp=mdp)
            self.assertEqual(encoding_fn_from_name, encoding_fn)

            encoding_fn_from_name = get_encoding_function(encoding_fn_name,
                                                          mdp_params=mdp_params)
            # compare names as a new instance of the mdp is created
            self.assertEqual(encoding_fn_from_name.__name__, encoding_fn.__name__)
        else:
            encoding_fn_from_name = get_encoding_function(encoding_fn_name,
                                                          env_params=env_params,
                                                          mdp_params=mdp_params)
            # compare names as a new instance of the env is created
            self.assertEqual(encoding_fn_from_name.__name__, encoding_fn.__name__)

    expected_encoded_state_dict = {
        str(i): fn(state) for i, fn in enumerate(example_encoding_fns)
    }
    actual_encoded_state_dict = get_encoding_function(
        {str(i): fn_name for i, fn_name in enumerate(example_encoding_fns_names)},
        env=env)(state)
    self.assertEqual(expected_encoded_state_dict.keys(),
                     actual_encoded_state_dict.keys())
    for k in expected_encoded_state_dict.keys():
        self.assertTrue(np.array_equal(expected_encoded_state_dict[k],
                                       actual_encoded_state_dict[k]))
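# A minimal sketch (not the repo's implementation) of the contract the test above
# exercises, assuming get_encoding_function resolves dotted names such as
# "env.featurize_state_mdp" / "mdp.multi_hot_orders_encoding" to bound methods and
# turns a dict of names into a single function returning a dict of encodings.
# The branch that rebuilds an env/mdp from mdp_params/env_params is omitted here.
def _get_encoding_function_sketch(name_or_dict, env=None, mdp=None):
    if isinstance(name_or_dict, dict):
        fns = {key: _get_encoding_function_sketch(name, env=env, mdp=mdp)
               for key, name in name_or_dict.items()}
        return lambda state: {key: fn(state) for key, fn in fns.items()}
    prefix, attr = name_or_dict.split(".", 1)
    if prefix == "mdp" and mdp is None and env is not None:
        mdp = env.mdp
    owner = env if prefix == "env" else mdp
    return getattr(owner, attr)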
def _evaluate(trainer, evaluation_workers):
    print("Computing rollout of current trained policy")
    assert len(eval_params["agents"]) == 2, \
        "currently only evaluation for 2 agents is supported"

    # Randomize starting indices
    policies_names = copy.deepcopy(eval_params["agents"])
    if shuffle:
        np.random.shuffle(policies_names)

    # Get the corresponding rllib policy object for each policy string name,
    # or keep the name so an Overcooked Agent object can be created for it
    policies = []
    base_ae = get_base_ae(eval_mdp_params, env_params, outer_shape)
    base_env = base_ae.env
    for policy_name in policies_names:
        if OvercookedMultiAgent.is_ml_agent(policy_name):
            policies.append(trainer.get_policy(policy_name))
        else:
            policies.append(policy_name)

    ppo_featurization = get_encoding_function(featurize_fns_map["ppo"], env=base_env)
    featurize_fns = [ppo_featurization] * len(policies)
    if "bc" in policies_names:
        bc_featurization = get_encoding_function(featurize_fns_map["bc"], env=base_env)
        for i, policy_name in enumerate(policies_names):
            if policy_name == "bc":
                featurize_fns[i] = bc_featurization

    # Compute the evaluation rollout. Note this doesn't use the evaluation_workers
    # passed in by rllib, so this computation all happens on the CPU. Could change
    # this if evaluation becomes a bottleneck
    results = evaluate(eval_params, eval_mdp_params, outer_shape, policies,
                       featurize_fns)

    # Log any metrics we care about for rllib tensorboard visualization
    metrics = {}
    metrics["average_sparse_reward"] = np.mean(results["ep_returns"])
    return metrics
def _setup_featurize_fns(self, featurize_fns=None):
    if featurize_fns is None:
        featurize_fns = self.featurize_fns
    featurize_fns = copy.deepcopy(featurize_fns)
    # Fill in any missing agent types with the default featurization
    for agent_name, fn in OvercookedMultiAgent.default_featurize_fns.items():
        if agent_name not in featurize_fns:
            featurize_fns[agent_name] = fn
    # Resolve encoding-function names into callables bound to the base env
    self.featurize_fns = {
        agent_name: get_encoding_function(fn, env=self.base_env)
        for agent_name, fn in featurize_fns.items()
    }
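# Hypothetical illustration of the mapping _setup_featurize_fns expects, using the
# "ppo"/"bc" agent keys seen elsewhere in this codebase and dotted encoder names
# that get_encoding_function can resolve against self.base_env. Any agent type
# missing from the dict falls back to OvercookedMultiAgent.default_featurize_fns.
example_featurize_fns = {
    "ppo": "env.lossless_state_encoding_mdp",
    "bc": "env.featurize_state_mdp",
}
# e.g. multi_agent_env._setup_featurize_fns(example_featurize_fns)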
def _get_observation_shape(bc_params):
    """
    Helper function for creating a dummy environment from "mdp_params" and
    "env_params" specified in bc_params and returning the shape of the
    observation space
    """
    base_ae = _get_base_ae(bc_params)
    base_env = base_ae.env
    dummy_state = base_env.mdp.get_standard_start_state()
    encoding_f = get_encoding_function(
        bc_params["data_params"]["state_processing_function"], env=base_env)
    obs_shape = encoding_f(dummy_state)[0].shape
    return obs_shape
def _get_orders_shape(bc_params):
    """
    Helper function for creating a dummy environment from "mdp_params" and
    "env_params" specified in bc_params and returning the shape of the order space

    NOTE: only works when the output logit layer shape is the same as the orders
    shape (does not work for sparse encodings)
    """
    base_ae = _get_base_ae(bc_params)
    base_env = base_ae.env
    dummy_state = base_env.mdp.get_standard_start_state()
    encoding_f = get_encoding_function(
        bc_params["data_params"]["orders_processing_function"], env=base_env)
    orders_shape = encoding_f(dummy_state)[0].shape
    return orders_shape
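# Hedged sketch of how the two shape helpers above could be used to size a BC
# model's input and an order-prediction head. The architecture, layer sizes and
# the assumption of 6 discrete Overcooked actions are illustrative only, not the
# repo's actual model-building code.
import numpy as np
import tensorflow as tf

def _build_bc_model_sketch(bc_params, num_actions=6):
    obs_shape = _get_observation_shape(bc_params)
    orders_shape = _get_orders_shape(bc_params)
    inputs = tf.keras.Input(shape=obs_shape, name="observations")
    hidden = tf.keras.layers.Dense(64, activation="relu")(
        tf.keras.layers.Flatten()(inputs))
    action_logits = tf.keras.layers.Dense(num_actions, name="action_logits")(hidden)
    # Only valid when the orders encoding is dense, matching the NOTE above
    orders_logits = tf.keras.layers.Dense(int(np.prod(orders_shape)),
                                          name="orders_logits")(hidden)
    return tf.keras.Model(inputs, [action_logits, orders_logits])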
def evaluate_bc_model(model, bc_params):
    """
    Creates an AgentPair object containing two instances of BC Agents, whose
    policies are specified by `model`. Runs a rollout using the AgentEvaluator
    class in an environment specified by bc_params

    Arguments

    - model (tf.keras.Model)    A function that maps featurized overcooked states
                                to action logits
    - bc_params (dict)          Specifies the environment in which to evaluate the
                                agent (i.e. layout, reward_shaping_param) as well
                                as the configuration for the rollout (rollout_length)

    Returns

    - reward (float)            Mean sparse reward achieved by the AgentPair across
                                rollouts
    """
    evaluation_params = bc_params["evaluation_params"]
    mdp_params = bc_params["mdp_params"]

    # Get reference to state encoding function used by bc agents, with compatible signature
    base_ae = _get_base_ae(bc_params)
    base_env = base_ae.env
    featurize_fn = get_encoding_function(
        bc_params["data_params"]["state_processing_function"], env=base_env)

    # Wrap Keras models in rllib policies
    policies = [
        BehaviorCloningPolicy.from_model(model, bc_params, stochastic=True),
        BehaviorCloningPolicy.from_model(model, bc_params, stochastic=True)
    ]
    featurize_fns = [featurize_fn] * len(policies)

    # Compute the results of the rollout(s)
    results = evaluate(eval_params=evaluation_params,
                       mdp_params=mdp_params,
                       outer_shape=None,
                       policies=policies,
                       featurize_fns=featurize_fns)

    # Compute the average sparse return obtained in each rollout
    reward = np.mean(results["ep_returns"])
    return reward
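# Hedged usage sketch: load a saved BC model and score it with evaluate_bc_model.
# The directory argument and the use of tf.keras.models.load_model are illustrative;
# the repo may use its own save/load helpers instead.
def _score_saved_bc_model_sketch(model_dir, bc_params):
    import tensorflow as tf
    model = tf.keras.models.load_model(model_dir)
    mean_sparse_reward = evaluate_bc_model(model, bc_params)
    print("Mean sparse reward over evaluation rollouts:", mean_sparse_reward)
    return mean_sparse_reward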
def joint_state_trajectory_to_single(
        trajectories,
        joint_traj_data,
        player_indices_to_convert,
        processed=True,
        silent=False,
        include_orders=DEFAULT_BC_PARAMS["predict_orders"],
        state_processing_function=DEFAULT_DATA_PARAMS["state_processing_function"],
        action_processing_function=DEFAULT_DATA_PARAMS["action_processing_function"],
        orders_processing_function=DEFAULT_DATA_PARAMS["orders_processing_function"]):
    """
    Take a joint trajectory and split it into two single-agent trajectories,
    adding data to the `trajectories` dictionary

    player_indices_to_convert: player indices whose trajectories should be returned
    """
    env = joint_traj_data['metadatas']['env'][0]

    assert len(joint_traj_data['ep_observations']) == 1, \
        "This method only takes in one trajectory"
    states, joint_actions = joint_traj_data['ep_observations'][0], \
        joint_traj_data['ep_actions'][0]
    rewards, length = joint_traj_data['ep_rewards'][0], \
        joint_traj_data['ep_lengths'][0]

    if processed:
        state_processing_function = get_encoding_function(
            state_processing_function, env=env)
        action_processing_function = get_encoding_function(
            action_processing_function, env=env)
        orders_processing_function = get_encoding_function(
            orders_processing_function, env=env)
    else:
        # identity functions (replicated per player so indexing by agent_idx works)
        state_processing_function = lambda state: [state] * (
            max(player_indices_to_convert) + 1)
        action_processing_function = lambda action: action
        orders_processing_function = lambda state: [state.orders_list] * (
            max(player_indices_to_convert) + 1)

    # Getting trajectory for each agent
    for agent_idx in player_indices_to_convert:
        ep_obs, ep_acts, ep_dones, ep_orders = [], [], [], []
        for i in range(len(states)):
            state, joint_action = states[i], joint_actions[i]
            action = joint_action[agent_idx]

            ep_obs.append(state_processing_function(state)[agent_idx])
            ep_acts.append(action_processing_function(joint_action)[agent_idx])
            ep_dones.append(False)
            if include_orders:
                ep_orders.append(orders_processing_function(state)[agent_idx])

        ep_dones[-1] = True

        trajectories["ep_observations"].append(ep_obs)
        trajectories["ep_actions"].append(ep_acts)
        trajectories["ep_rewards"].append(rewards)
        trajectories["ep_dones"].append(ep_dones)
        trajectories["ep_infos"].append([{}] * len(rewards))
        trajectories["ep_returns"].append(sum(rewards))
        trajectories["ep_lengths"].append(length)
        trajectories["mdp_params"].append(env.mdp.mdp_params)
        trajectories["env_params"].append({})
        trajectories["metadatas"]["ep_agent_idxs"].append(agent_idx)
        if include_orders:
            trajectories["ep_orders"].append(ep_orders)
def process_trajs_from_json_obj(
        trajectories,
        processed,
        agent_idxs,
        include_orders=DEFAULT_BC_PARAMS["predict_orders"],
        state_processing_function=DEFAULT_DATA_PARAMS["state_processing_function"],
        action_processing_function=DEFAULT_DATA_PARAMS["action_processing_function"],
        orders_processing_function=DEFAULT_DATA_PARAMS["orders_processing_function"]):
    """
    Split joint trajectories loaded from a json object into single-agent
    trajectories for each index in agent_idxs, optionally encoding states,
    actions and orders with the configured processing functions
    """
    if processed:
        state_processing_function = get_encoding_function(
            state_processing_function,
            mdp_params=trajectories["mdp_params"][0],
            env_params=trajectories["env_params"][0])
        action_processing_function = get_encoding_function(
            action_processing_function,
            mdp_params=trajectories["mdp_params"][0],
            env_params=trajectories["env_params"][0])
        orders_processing_function = get_encoding_function(
            orders_processing_function,
            mdp_params=trajectories["mdp_params"][0],
            env_params=trajectories["env_params"][0])
    else:
        # identity functions (replicated per player so indexing by agent_idx works)
        state_processing_function = lambda state: [state] * (max(agent_idxs) + 1)
        action_processing_function = lambda action: action
        orders_processing_function = lambda state: [state.orders_list] * (
            max(agent_idxs) + 1)

    all_observations = []
    all_actions = []
    all_rewards = []
    all_orders = []
    for states, actions, rewards in zip(trajectories["ep_states"],
                                        trajectories["ep_actions"],
                                        trajectories["ep_rewards"]):
        for agent_idx in agent_idxs:
            single_agent_episode_observations = []
            single_agent_episode_actions = []
            single_agent_episode_rewards = []
            single_agent_episode_orders = []
            for state, action, reward in zip(states, actions, rewards):
                single_agent_episode_observations.append(
                    state_processing_function(state)[agent_idx])
                single_agent_episode_actions.append(
                    action_processing_function(action)[agent_idx])
                single_agent_episode_rewards.append([reward])
                if include_orders:
                    single_agent_episode_orders.append(
                        orders_processing_function(state)[agent_idx])
            all_observations.append(single_agent_episode_observations)
            all_actions.append(single_agent_episode_actions)
            all_rewards.append(single_agent_episode_rewards)
            if include_orders:
                all_orders.append(single_agent_episode_orders)

    if not trajectories.get("metadatas"):
        trajectories["metadatas"] = {}
    trajectories["metadatas"]["ep_agent_idxs"] = agent_idxs
    trajectories["ep_observations"] = all_observations
    trajectories["ep_actions"] = all_actions
    trajectories["ep_rewards"] = all_rewards
    if include_orders:
        trajectories["ep_orders"] = all_orders
    return trajectories, agent_idxs
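# Hedged usage sketch of the function above: split an already-loaded-and-deserialized
# trajectories dict (i.e. "ep_states" holding OvercookedState objects, with
# "mdp_params"/"env_params" present) into encoded per-agent training data.
# The helper name and the default of converting both players are illustrative.
def _split_loaded_trajs_sketch(loaded_trajectories, agent_idxs=(0, 1)):
    processed_trajs, used_idxs = process_trajs_from_json_obj(
        loaded_trajectories, processed=True, agent_idxs=list(agent_idxs))
    return processed_trajs["ep_observations"], processed_trajs["ep_actions"], used_idxs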