Example #1
def test_get_encoding_function(self):
    mdp = OvercookedGridworld.from_layout_name("cramped_room")
    mdp_params = mdp.mdp_params
    env_params = {"horizon": 100}
    env = OvercookedEnv.from_mdp(mdp, **env_params)
    state = mdp.get_standard_start_state()
    example_encoding_fns_names = [
        "mdp.multi_hot_orders_encoding",
        "env.featurize_state_mdp",
        "env.lossless_state_encoding_mdp",
    ]
    example_encoding_fns = [
        mdp.multi_hot_orders_encoding,
        env.featurize_state_mdp,
        env.lossless_state_encoding_mdp,
    ]
    for encoding_fn_name, encoding_fn in zip(example_encoding_fns_names,
                                             example_encoding_fns):
        encoding_fn_from_name = get_encoding_function(encoding_fn_name, env=env)
        self.assertEqual(encoding_fn_from_name, encoding_fn)
        if encoding_fn_name.split(".")[0] == "mdp":
            encoding_fn_from_name = get_encoding_function(encoding_fn_name, mdp=mdp)
            self.assertEqual(encoding_fn_from_name, encoding_fn)
            encoding_fn_from_name = get_encoding_function(encoding_fn_name,
                                                          mdp_params=mdp_params)
            # compare names as new instance of mdp is created
            self.assertEqual(encoding_fn_from_name.__name__, encoding_fn.__name__)
        else:
            encoding_fn_from_name = get_encoding_function(encoding_fn_name,
                                                          env_params=env_params,
                                                          mdp_params=mdp_params)
            # compare names as new instance of env is created
            self.assertEqual(encoding_fn_from_name.__name__, encoding_fn.__name__)

    expected_encoded_state_dict = {
        str(i): fn(state) for i, fn in enumerate(example_encoding_fns)
    }
    actual_encoded_state_dict = get_encoding_function(
        {str(i): fn_name for i, fn_name in enumerate(example_encoding_fns_names)},
        env=env)(state)
    self.assertEqual(expected_encoded_state_dict.keys(),
                     actual_encoded_state_dict.keys())
    for k in expected_encoded_state_dict.keys():
        self.assertTrue(np.array_equal(expected_encoded_state_dict[k],
                                       actual_encoded_state_dict[k]))
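The test above pins down the contract of get_encoding_function: a "mdp."- or "env."-prefixed name (or a dict of such names) is resolved to the corresponding bound method, with a fresh mdp/env built from params when no instance is supplied. Below is a minimal sketch of a resolver that satisfies this contract; it is not the project's actual implementation, and the fallback construction from mdp_params assumes the params include a "layout_name" key.

from overcooked_ai_py.mdp.overcooked_mdp import OvercookedGridworld
from overcooked_ai_py.mdp.overcooked_env import OvercookedEnv

def get_encoding_function_sketch(name_or_dict, env=None, mdp=None,
                                 mdp_params=None, env_params=None):
    # Dict input: resolve each value and return a function that applies every
    # encoding to the state, keyed the same way as the input dict.
    if isinstance(name_or_dict, dict):
        fns = {k: get_encoding_function_sketch(v, env=env, mdp=mdp,
                                               mdp_params=mdp_params,
                                               env_params=env_params)
               for k, v in name_or_dict.items()}
        return lambda state: {k: fn(state) for k, fn in fns.items()}

    prefix, attr = name_or_dict.split(".", 1)
    if prefix == "mdp":
        if mdp is None:
            # Assumption: mdp_params carries a "layout_name" usable here.
            mdp = env.mdp if env is not None else \
                OvercookedGridworld.from_layout_name(**mdp_params)
        return getattr(mdp, attr)
    if prefix == "env":
        if env is None:
            mdp = mdp or OvercookedGridworld.from_layout_name(**mdp_params)
            env = OvercookedEnv.from_mdp(mdp, **env_params)
        return getattr(env, attr)
    raise ValueError("Unknown encoding function prefix: " + prefix)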
Example #2
    def _evaluate(trainer, evaluation_workers):
        print("Computing rollout of current trained policy")
        assert len(eval_params["agents"]) == 2, "currently only evaluation for 2 agents is supported"
        # Randomize starting indices by shuffling the order of the agent policies
        policies_names = copy.deepcopy(eval_params["agents"])
        if shuffle:
            np.random.shuffle(policies_names)
        # Get the corresponding rllib policy object for each policy string name,
        # or keep the string so an Overcooked Agent object can be created later
        policies = []
        base_ae = get_base_ae(eval_mdp_params, env_params, outer_shape)
        base_env = base_ae.env
        for policy_name in policies_names:
            if OvercookedMultiAgent.is_ml_agent(policy_name):
                policies.append(trainer.get_policy(policy_name))
            else:
                policies.append(policy_name)
        ppo_featurization = get_encoding_function(featurize_fns_map["ppo"], env=base_env)
        featurize_fns = [ppo_featurization] * len(policies)
        if 'bc' in policies_names:
            bc_featurization = get_encoding_function(featurize_fns_map["bc"], env=base_env)
            for i, policy_name in enumerate(policies_names):
                if policy_name == 'bc':
                    featurize_fns[i] = bc_featurization
        
        # Compute the evaluation rollout. Note this doesn't use the rllib-provided evaluation_workers,
        # so this computation all happens on the CPU. Could change this if evaluation becomes a bottleneck.
        results = evaluate(eval_params, eval_mdp_params, outer_shape, policies, featurize_fns)

        # Log any metrics we care about for rllib tensorboard visualization
        metrics = {}
        metrics['average_sparse_reward'] = np.mean(results['ep_returns'])
        return metrics
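The closure above looks featurization names up in featurize_fns_map and resolves them with get_encoding_function against the base env. A hedged sketch of the inputs it expects; the specific encoding-function names are borrowed from the other examples in this file, and the num_games/ep_length keys and values are illustrative assumptions rather than this repository's defaults.

# Hedged sketch of the inputs the `_evaluate` closure consumes; values are
# illustrative assumptions, not this repository's defaults.
featurize_fns_map = {
    "ppo": "env.lossless_state_encoding_mdp",  # encoding used by PPO policies
    "bc": "env.featurize_state_mdp",           # encoding used by BC policies
}
eval_params = {
    "agents": ["ppo", "bc"],  # exactly two agents are supported above
    "num_games": 5,           # assumed rollout-count key
    "ep_length": 400,         # assumed horizon key
}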
Example #3
def _setup_featurize_fns(self, featurize_fns=None):
    if featurize_fns is None:
        featurize_fns = self.featurize_fns
    featurize_fns = copy.deepcopy(featurize_fns)
    # Fill in defaults for any agent types that were not explicitly configured
    for agent_name, fn in OvercookedMultiAgent.default_featurize_fns.items():
        if agent_name not in featurize_fns:
            featurize_fns[agent_name] = fn
    # Resolve string names (or dicts of names) into callables bound to the base env
    self.featurize_fns = {
        agent_name: get_encoding_function(fn, env=self.base_env)
        for agent_name, fn in featurize_fns.items()
    }
def _get_observation_shape(bc_params):
    """
    Helper function for creating a dummy environment from "mdp_params" and "env_params" specified
    in bc_params and returning the shape of the observation space
    """
    base_ae = _get_base_ae(bc_params)
    base_env = base_ae.env
    dummy_state = base_env.mdp.get_standard_start_state()
    encoding_f = get_encoding_function(
        bc_params["data_params"]["state_processing_function"], env=base_env)
    obs_shape = encoding_f(dummy_state)[0].shape
    return obs_shape
def _get_orders_shape(bc_params):
    """
    Helper function for creating a dummy environment from "mdp_params" and "env_params" specified
    in bc_params and returning the shape of the order space
    NOTE: only works when the output logit layer shape is the same as the orders shape (does not work for sparse encodings)
    """
    base_ae = _get_base_ae(bc_params)
    base_env = base_ae.env
    dummy_state = base_env.mdp.get_standard_start_state()

    encoding_f = get_encoding_function(
        bc_params["data_params"]["orders_processing_function"], env=base_env)
    orders_shape = encoding_f(dummy_state)[0].shape
    return orders_shape
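Both helpers exist to size a model's input and output layers. A minimal hedged sketch of how the returned shapes might be consumed when building a Keras network; the layer widths, the 6-action output, and the overall structure are illustrative assumptions, not this repository's architecture.

import numpy as np
import tensorflow as tf

def build_dummy_bc_model(bc_params, num_actions=6):
    # Assumed architecture for illustration only: a small MLP with an action
    # head and an orders head sized from the helpers above.
    obs_shape = _get_observation_shape(bc_params)
    orders_shape = _get_orders_shape(bc_params)

    inputs = tf.keras.Input(shape=obs_shape, name="observations")
    x = tf.keras.layers.Flatten()(inputs)
    x = tf.keras.layers.Dense(64, activation="relu")(x)
    action_logits = tf.keras.layers.Dense(num_actions, name="action_logits")(x)
    # Only sensible when the logit layer matches orders_shape; see the NOTE
    # on _get_orders_shape above.
    order_logits = tf.keras.layers.Dense(int(np.prod(orders_shape)),
                                         name="order_logits")(x)
    return tf.keras.Model(inputs, [action_logits, order_logits])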
def evaluate_bc_model(model, bc_params):
    """
    Creates an AgentPair object containing two instances of BC Agents, whose policies are specified by `model`. Runs
    rollouts using the AgentEvaluator class in an environment specified by bc_params

    Arguments

        - model (tf.keras.Model)        A function that maps featurized overcooked states to action logits
        - bc_params (dict)              Specifies the environment in which to evaluate the agent (i.e. layout, reward_shaping_param)
                                            as well as the configuration for the rollout (rollout_length)

    Returns

        - reward (float)                Average sparse reward achieved by the AgentPair across rollouts
    """
    evaluation_params = bc_params['evaluation_params']
    mdp_params = bc_params['mdp_params']

    # Get reference to state encoding function used by bc agents, with compatible signature
    base_ae = _get_base_ae(bc_params)
    base_env = base_ae.env
    featurize_fn = get_encoding_function(
        bc_params["data_params"]["state_processing_function"], env=base_env)

    # Wrap Keras models in rllib policies
    policies = [
        BehaviorCloningPolicy.from_model(model, bc_params, stochastic=True),
        BehaviorCloningPolicy.from_model(model, bc_params, stochastic=True)
    ]
    featurize_fns = [featurize_fn] * len(policies)
    # Compute the results of the rollout(s)
    results = evaluate(eval_params=evaluation_params,
                       mdp_params=mdp_params,
                       outer_shape=None,
                       policies=policies,
                       featurize_fns=featurize_fns)

    # Compute the average sparse return obtained in each rollout
    reward = np.mean(results['ep_returns'])
    return reward
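A hedged usage sketch for evaluate_bc_model; the saved-model path and the get_bc_params-style helper used to build bc_params are assumptions, not necessarily this repository's API.

import tensorflow as tf

# Hedged usage sketch: the model path and the config helper are assumptions.
bc_params = get_bc_params()                             # assumed helper returning a bc_params dict
model = tf.keras.models.load_model("path/to/bc_model")  # hypothetical saved BC model
avg_sparse_reward = evaluate_bc_model(model, bc_params)
print("Average sparse return per rollout:", avg_sparse_reward)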
Example #7
def joint_state_trajectory_to_single(
    trajectories,
    joint_traj_data,
    player_indices_to_convert,
    processed=True,
    silent=False,
    include_orders=DEFAULT_BC_PARAMS["predict_orders"],
    state_processing_function=DEFAULT_DATA_PARAMS["state_processing_function"],
    action_processing_function=DEFAULT_DATA_PARAMS[
        "action_processing_function"],
    orders_processing_function=DEFAULT_DATA_PARAMS[
        "orders_processing_function"]):
    """
    Takes a joint trajectory and splits it into single-agent trajectories (one per requested player),
    appending the data to the `trajectories` dictionary.
    player_indices_to_convert: indices of the players whose trajectories should be extracted
    """
    env = joint_traj_data['metadatas']['env'][0]

    assert len(joint_traj_data['ep_observations']) == 1, \
        "This method only takes in one trajectory"
    states, joint_actions = (joint_traj_data['ep_observations'][0],
                             joint_traj_data['ep_actions'][0])
    rewards, length = (joint_traj_data['ep_rewards'][0],
                       joint_traj_data['ep_lengths'][0])

    if processed:
        state_processing_function = get_encoding_function(
            state_processing_function, env=env)
        action_processing_function = get_encoding_function(
            action_processing_function, env=env)
        orders_processing_function = get_encoding_function(
            orders_processing_function, env=env)
    else:
        # identity functions
        state_processing_function = \
            lambda state: [state] * (max(player_indices_to_convert) + 1)
        action_processing_function = lambda action: action
        orders_processing_function = \
            lambda state: [state.orders_list] * (max(player_indices_to_convert) + 1)

    # Getting trajectory for each agent
    for agent_idx in player_indices_to_convert:
        ep_obs, ep_acts, ep_dones, ep_orders = [], [], [], []
        for i in range(len(states)):
            state, joint_action = states[i], joint_actions[i]

            action = joint_action[agent_idx]
            ep_obs.append(state_processing_function(state)[agent_idx])
            ep_acts.append(action_processing_function(joint_action)[agent_idx])
            ep_dones.append(False)
            if include_orders:
                ep_orders.append(orders_processing_function(state)[agent_idx])

        ep_dones[-1] = True

        trajectories["ep_observations"].append(ep_obs)
        trajectories["ep_actions"].append(ep_acts)
        trajectories["ep_rewards"].append(rewards)
        trajectories["ep_dones"].append(ep_dones)
        trajectories["ep_infos"].append([{}] * len(rewards))
        trajectories["ep_returns"].append(sum(rewards))
        trajectories["ep_lengths"].append(length)
        trajectories["mdp_params"].append(env.mdp.mdp_params)
        trajectories["env_params"].append({})
        trajectories["metadatas"]["ep_agent_idxs"].append(agent_idx)
        if include_orders: trajectories["ep_orders"].append(ep_orders)
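The function above only appends; it assumes the caller has already created the accumulator. Below is a sketch of the empty scaffold it expects, with the keys inferred directly from the appends in the code above; the helper name and the default for include_orders are assumptions.

def empty_single_agent_trajectories(include_orders=True):
    # Keys mirror exactly what joint_state_trajectory_to_single appends to.
    trajectories = {
        "ep_observations": [],
        "ep_actions": [],
        "ep_rewards": [],
        "ep_dones": [],
        "ep_infos": [],
        "ep_returns": [],
        "ep_lengths": [],
        "mdp_params": [],
        "env_params": [],
        "metadatas": {"ep_agent_idxs": []},
    }
    if include_orders:
        trajectories["ep_orders"] = []
    return trajectories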
Example #8
def process_trajs_from_json_obj(
    trajectories,
    processed,
    agent_idxs,
    include_orders=DEFAULT_BC_PARAMS["predict_orders"],
    state_processing_function=DEFAULT_DATA_PARAMS["state_processing_function"],
    action_processing_function=DEFAULT_DATA_PARAMS[
        "action_processing_function"],
    orders_processing_function=DEFAULT_DATA_PARAMS[
        "orders_processing_function"]):
    if processed:
        state_processing_function = get_encoding_function(
            state_processing_function,
            mdp_params=trajectories["mdp_params"][0],
            env_params=trajectories["env_params"][0])
        action_processing_function = get_encoding_function(
            action_processing_function,
            mdp_params=trajectories["mdp_params"][0],
            env_params=trajectories["env_params"][0])
        orders_processing_function = get_encoding_function(
            orders_processing_function,
            mdp_params=trajectories["mdp_params"][0],
            env_params=trajectories["env_params"][0])
    else:
        # identity functions
        state_processing_function = \
            lambda state: [state] * (max(agent_idxs) + 1)
        action_processing_function = lambda action: action
        orders_processing_function = \
            lambda state: [state.orders_list] * (max(agent_idxs) + 1)

    all_observations = []
    all_actions = []
    all_rewards = []
    all_orders = []
    for states, actions, rewards in zip(trajectories["ep_states"],
                                        trajectories["ep_actions"],
                                        trajectories["ep_rewards"]):
        for agent_idx in agent_idxs:
            single_agent_episode_observations = []
            single_agent_episode_actions = []
            single_agent_episode_rewards = []
            single_agent_episode_orders = []
            for state, action, reward in zip(states, actions, rewards):
                single_agent_episode_observations.append(
                    state_processing_function(state)[agent_idx])
                single_agent_episode_actions.append(
                    action_processing_function(action)[agent_idx])
                single_agent_episode_rewards.append([reward])
                if include_orders:
                    single_agent_episode_orders.append(
                        orders_processing_function(state)[agent_idx])
            all_observations.append(single_agent_episode_observations)
            all_actions.append(single_agent_episode_actions)
            all_rewards.append(single_agent_episode_rewards)
            if include_orders: all_orders.append(single_agent_episode_orders)

    if not trajectories.get("metadatas"):
        trajectories["metadatas"] = {}
    trajectories["metadatas"]["ep_agent_idxs"] = agent_idxs

    trajectories["ep_observations"] = all_observations
    trajectories["ep_actions"] = all_actions
    trajectories["ep_rewards"] = all_rewards
    if include_orders: trajectories["ep_orders"] = all_orders

    return trajectories, agent_idxs
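A hedged usage sketch; raw_trajectories is assumed to be a trajectories dict with "ep_states", "ep_actions", "ep_rewards", "mdp_params" and "env_params" entries whose states have already been deserialized into OvercookedState objects (how that dict is loaded is outside this sketch).

# Hedged usage sketch: `raw_trajectories` is assumed to already exist (see
# the note above); the agent index choice is illustrative.
processed_trajs, agent_idxs = process_trajs_from_json_obj(
    raw_trajectories,
    processed=True,
    agent_idxs=[0, 1],  # split out both players' data
)
observations = processed_trajs["ep_observations"]  # per-agent encoded states
actions = processed_trajs["ep_actions"]            # per-agent processed actions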