Example #1
    def __init__(
        self,
        seed: int,
        behavior_spec: BehaviorSpec,
        trainer_settings: TrainerSettings,
        model_path: str,
        load: bool = False,
    ):
        """
        Initializes the policy.
        :param seed: Random seed to use for TensorFlow.
        :param behavior_spec: The corresponding BehaviorSpec for this policy.
        :param trainer_settings: The trainer parameters.
        :param model_path: Where to load/save the model.
        :param load: If True, load model from model_path. Otherwise, create new model.
        """

        self.m_size = 0
        self.trainer_settings = trainer_settings
        self.network_settings: NetworkSettings = trainer_settings.network_settings
        # for ghost trainer save/load snapshots
        self.assign_phs: List[tf.Tensor] = []
        self.assign_ops: List[tf.Operation] = []

        self.inference_dict: Dict[str, tf.Tensor] = {}
        self.update_dict: Dict[str, tf.Tensor] = {}
        self.sequence_length = 1
        self.seed = seed
        self.behavior_spec = behavior_spec

        self.act_size = (list(behavior_spec.discrete_action_branches)
                         if behavior_spec.is_action_discrete() else
                         [behavior_spec.action_size])
        self.vec_obs_size = sum(shape[0]
                                for shape in behavior_spec.observation_shapes
                                if len(shape) == 1)
        self.vis_obs_size = sum(1 for shape in behavior_spec.observation_shapes
                                if len(shape) == 3)

        self.use_recurrent = self.network_settings.memory is not None
        self.memory_dict: Dict[str, np.ndarray] = {}
        self.num_branches = self.behavior_spec.action_size
        self.previous_action_dict: Dict[str, np.ndarray] = {}
        self.normalize = self.network_settings.normalize
        self.use_continuous_act = behavior_spec.is_action_continuous()
        self.model_path = model_path
        self.initialize_path = self.trainer_settings.init_path
        self.keep_checkpoints = self.trainer_settings.keep_checkpoints
        self.graph = tf.Graph()
        self.sess = tf.Session(config=tf_utils.generate_session_config(),
                               graph=self.graph)
        self.saver: Optional[tf.Operation] = None
        if self.network_settings.memory is not None:
            self.m_size = self.network_settings.memory.memory_size
            self.sequence_length = self.network_settings.memory.sequence_length
        self._initialize_tensorflow_references()
        self.load = load
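A minimal sketch of how the memory settings above drive m_size and sequence_length, assuming the NetworkSettings dataclass (with its nested MemorySettings) from mlagents.trainers.settings; exact names and defaults are version-dependent:

# Hedged sketch: assumes mlagents.trainers.settings exposes NetworkSettings
# with a nested MemorySettings attrs class, as in ML-Agents release 1-10.
from mlagents.trainers.settings import NetworkSettings

settings = NetworkSettings(
    memory=NetworkSettings.MemorySettings(memory_size=128, sequence_length=16)
)
use_recurrent = settings.memory is not None  # mirrors the __init__ above
m_size = settings.memory.memory_size if use_recurrent else 0
sequence_length = settings.memory.sequence_length if use_recurrent else 1
assert (use_recurrent, m_size, sequence_length) == (True, 128, 16)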
Example #2
    def __init__(
        self,
        seed: int,
        behavior_spec: BehaviorSpec,
        trainer_settings: TrainerSettings,
        model_path: str,
        load: bool = False,
        tanh_squash: bool = False,
        reparameterize: bool = False,
        condition_sigma_on_obs: bool = True,
    ):
        self.behavior_spec = behavior_spec
        self.trainer_settings = trainer_settings
        self.network_settings: NetworkSettings = trainer_settings.network_settings
        self.seed = seed
        self.act_size = (
            list(behavior_spec.discrete_action_branches)
            if behavior_spec.is_action_discrete()
            else [behavior_spec.action_size]
        )
        self.vec_obs_size = sum(
            shape[0] for shape in behavior_spec.observation_shapes if len(shape) == 1
        )
        self.vis_obs_size = sum(
            1 for shape in behavior_spec.observation_shapes if len(shape) == 3
        )
        self.model_path = model_path
        self.initialize_path = self.trainer_settings.init_path
        self._keep_checkpoints = self.trainer_settings.keep_checkpoints
        self.use_continuous_act = behavior_spec.is_action_continuous()
        self.num_branches = self.behavior_spec.action_size
        self.previous_action_dict: Dict[str, np.ndarray] = {}
        self.memory_dict: Dict[str, np.ndarray] = {}
        self.normalize = trainer_settings.network_settings.normalize
        self.use_recurrent = self.network_settings.memory is not None
        self.load = load
        self.h_size = self.network_settings.hidden_units
        num_layers = self.network_settings.num_layers
        if num_layers < 1:
            num_layers = 1
        self.num_layers = num_layers

        self.vis_encode_type = self.network_settings.vis_encode_type
        self.tanh_squash = tanh_squash
        self.reparameterize = reparameterize
        self.condition_sigma_on_obs = condition_sigma_on_obs

        self.m_size = 0
        self.sequence_length = 1
        if self.network_settings.memory is not None:
            self.m_size = self.network_settings.memory.memory_size
            self.sequence_length = self.network_settings.memory.sequence_length

        # Non-exposed parameters; these aren't exposed because they don't have a
        # good explanation and usually shouldn't be touched.
        self.log_std_min = -20
        self.log_std_max = 2
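For context, log_std_min and log_std_max are the conventional SAC-style bounds on a Gaussian policy's log standard deviation. A hedged numpy illustration of the clamp they imply (the actual TF graph op built from these fields is not shown in this snippet):

import numpy as np

# Hypothetical illustration: a raw log-std output is clipped to
# [log_std_min, log_std_max] before exponentiating into sigma, which
# keeps the Gaussian numerically well-behaved.
log_std_min, log_std_max = -20, 2
raw_log_std = np.array([-30.0, -5.0, 0.0, 5.0])
sigma = np.exp(np.clip(raw_log_std, log_std_min, log_std_max))
print(sigma)  # extreme raw values are bounded at exp(-20) and exp(2)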
Example #3
def test_action_generator():
    # Continuous
    action_len = 30
    specs = BehaviorSpec(
        observation_shapes=[(5, )],
        action_type=ActionType.CONTINUOUS,
        action_shape=action_len,
    )
    zero_action = specs.create_empty_action(4)
    assert np.array_equal(zero_action,
                          np.zeros((4, action_len), dtype=np.float32))
    random_action = specs.create_random_action(4)
    assert random_action.dtype == np.float32
    assert random_action.shape == (4, action_len)
    assert np.min(random_action) >= -1
    assert np.max(random_action) <= 1

    # Discrete
    action_shape = (10, 20, 30)
    specs = BehaviorSpec(
        observation_shapes=[(5, )],
        action_type=ActionType.DISCRETE,
        action_shape=action_shape,
    )
    zero_action = specs.create_empty_action(4)
    assert np.array_equal(zero_action,
                          np.zeros((4, len(action_shape)), dtype=np.int32))

    random_action = specs.create_random_action(4)
    assert random_action.dtype == np.int32
    assert random_action.shape == (4, len(action_shape))
    assert np.min(random_action) >= 0
    for index, branch_size in enumerate(action_shape):
        assert np.max(random_action[:, index]) < branch_size
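In more recent mlagents_envs releases the same sampling helpers live on ActionSpec rather than BehaviorSpec. A hedged sketch of the equivalent checks, assuming release 12+ where random_action returns an ActionTuple:

from mlagents_envs.base_env import ActionSpec

# Continuous branch: values are sampled uniformly in [-1, 1].
continuous_spec = ActionSpec.create_continuous(30)
action = continuous_spec.random_action(4)
assert action.continuous.shape == (4, 30)

# Discrete branches: column i is bounded by branch size i.
discrete_spec = ActionSpec.create_discrete((10, 20, 30))
action = discrete_spec.random_action(4)
assert action.discrete.shape == (4, 3)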
Example #4
def create_behavior_spec(num_visual, num_vector, vector_size):
    behavior_spec = BehaviorSpec(
        [(84, 84, 3)] * int(num_visual) + [(vector_size, )] * int(num_vector),
        ActionType.DISCRETE,
        (1, ),
    )
    return behavior_spec
def test_empty_terminal_steps():
    specs = BehaviorSpec(observation_shapes=[(3, 2), (5, )],
                         action_spec=ActionSpec.create_continuous(3))
    ts = TerminalSteps.empty(specs)
    assert len(ts.obs) == 2
    assert ts.obs[0].shape == (0, 3, 2)
    assert ts.obs[1].shape == (0, 5)
def test_empty_decision_steps():
    specs = BehaviorSpec(observation_shapes=[(3, 2), (5, )],
                         action_spec=ActionSpec.create_continuous(3))
    ds = DecisionSteps.empty(specs)
    assert len(ds.obs) == 2
    assert ds.obs[0].shape == (0, 3, 2)
    assert ds.obs[1].shape == (0, 5)
Example #7
def behavior_spec_from_proto(brain_param_proto: BrainParametersProto,
                             agent_info: AgentInfoProto) -> BehaviorSpec:
    """
    Converts brain parameter and agent info proto to BehaviorSpec object.
    :param brain_param_proto: protobuf object.
    :param agent_info: protobuf object.
    :return: BehaviorSpec object.
    """
    observation_shape = [tuple(obs.shape) for obs in agent_info.observations]
    dim_props = [
        tuple(DimensionProperty(dim) for dim in obs.dimension_properties)
        for obs in agent_info.observations
    ]
    sensor_specs = [
        SensorSpec(obs_shape, dim_p)
        for obs_shape, dim_p in zip(observation_shape, dim_props)
    ]
    # proto from communicator < v1.3 does not set action spec, use deprecated fields instead
    if (brain_param_proto.action_spec.num_continuous_actions == 0
            and brain_param_proto.action_spec.num_discrete_actions == 0):
        if brain_param_proto.vector_action_space_type_deprecated == 1:
            action_spec = ActionSpec(
                brain_param_proto.vector_action_size_deprecated[0], ())
        else:
            action_spec = ActionSpec(
                0, tuple(brain_param_proto.vector_action_size_deprecated))
    else:
        action_spec_proto = brain_param_proto.action_spec
        action_spec = ActionSpec(
            action_spec_proto.num_continuous_actions,
            tuple(branch
                  for branch in action_spec_proto.discrete_branch_sizes),
        )
    return BehaviorSpec(sensor_specs, action_spec)
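The version fallback above is the subtle part: a proto from a communicator older than v1.3 carries no action spec, so the deprecated fields decide between a continuous and a discrete ActionSpec. A self-contained sketch of that rule, using a hypothetical stand-in dataclass instead of the real BrainParametersProto:

from dataclasses import dataclass
from typing import Tuple

@dataclass
class FakeBrainParams:  # hypothetical stand-in for BrainParametersProto
    vector_action_space_type_deprecated: int  # 1 == continuous, else discrete
    vector_action_size_deprecated: Tuple[int, ...]

def legacy_action_sizes(proto: FakeBrainParams) -> Tuple[int, Tuple[int, ...]]:
    """Returns (num_continuous, discrete_branches), mirroring the fallback."""
    if proto.vector_action_space_type_deprecated == 1:
        return proto.vector_action_size_deprecated[0], ()
    return 0, tuple(proto.vector_action_size_deprecated)

assert legacy_action_sizes(FakeBrainParams(1, (3,))) == (3, ())
assert legacy_action_sizes(FakeBrainParams(0, (2, 5))) == (0, (2, 5))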
Example #8
def create_mock_group_spec(
    number_visual_observations=0,
    vector_action_space_type="continuous",
    vector_observation_space_size=3,
    vector_action_space_size=None,
):
    """
    Creates a mock BrainParameters object with parameters.
    """
    # Avoid using mutable object as default param
    act_type = ActionType.DISCRETE
    if vector_action_space_type == "continuous":
        act_type = ActionType.CONTINUOUS
        if vector_action_space_size is None:
            vector_action_space_size = 2
        else:
            vector_action_space_size = vector_action_space_size[0]
    else:
        if vector_action_space_size is None:
            vector_action_space_size = (2, )
        else:
            vector_action_space_size = tuple(vector_action_space_size)
    obs_shapes = [(vector_observation_space_size, )]
    for _ in range(number_visual_observations):
        obs_shapes += [(8, 8, 3)]
    return BehaviorSpec(obs_shapes, act_type, vector_action_space_size)
Example #9
def create_mock_group_spec(
    number_visual_observations=0,
    vector_action_space_type="continuous",
    vector_observation_space_size=3,
    vector_action_space_size=None,
):
    """
    Creates a mock BrainParameters object with parameters.
    """
    # Avoid using mutable object as default param
    if vector_action_space_type == "continuous":
        if vector_action_space_size is None:
            vector_action_space_size = 2
        else:
            vector_action_space_size = vector_action_space_size[0]
        action_spec = ActionSpec.create_continuous(vector_action_space_size)
    else:
        if vector_action_space_size is None:
            vector_action_space_size = (2, )
        else:
            vector_action_space_size = tuple(vector_action_space_size)
        action_spec = ActionSpec.create_discrete(vector_action_space_size)
    obs_shapes = [(vector_observation_space_size, )]
    for _ in range(number_visual_observations):
        obs_shapes += [(8, 8, 3)]
    obs_spec = create_observation_specs_with_shapes(obs_shapes)
    return BehaviorSpec(obs_spec, action_spec)
def test_batched_step_result_from_proto():
    n_agents = 10
    shapes = [(3, ), (4, )]
    spec = BehaviorSpec(create_observation_specs_with_shapes(shapes),
                        ActionSpec.create_continuous(3))
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, spec)
    for agent_id in range(n_agents):
        if agent_id in decision_steps:
            # we set the reward equal to the agent id in generate_list_agent_proto
            assert decision_steps[agent_id].reward == agent_id
        elif agent_id in terminal_steps:
            assert terminal_steps[agent_id].reward == agent_id
        else:
            raise Exception("Missing agent from the steps")
    # We sort the AgentId since they are split between DecisionSteps and TerminalSteps
    combined_agent_id = list(decision_steps.agent_id) + list(
        terminal_steps.agent_id)
    combined_agent_id.sort()
    assert combined_agent_id == list(range(n_agents))
    for agent_id in range(n_agents):
        assert (agent_id in terminal_steps) == (agent_id % 2 == 0)
        if agent_id in terminal_steps:
            assert terminal_steps[agent_id].interrupted == (agent_id % 4 == 0)
    assert decision_steps.obs[0].shape[1] == shapes[0][0]
    assert decision_steps.obs[1].shape[1] == shapes[1][0]
    assert terminal_steps.obs[0].shape[1] == shapes[0][0]
    assert terminal_steps.obs[1].shape[1] == shapes[1][0]
Example #11
def create_agent_buffer(behavior_spec: BehaviorSpec,
                        number: int,
                        reward: float = 0.0) -> AgentBuffer:
    buffer = AgentBuffer()
    curr_observations = [
        np.random.normal(size=shape)
        for shape in behavior_spec.observation_shapes
    ]
    next_observations = [
        np.random.normal(size=shape)
        for shape in behavior_spec.observation_shapes
    ]
    action = behavior_spec.create_random_action(1)[0, :]
    for _ in range(number):
        curr_split_obs = SplitObservations.from_observations(curr_observations)
        next_split_obs = SplitObservations.from_observations(next_observations)
        for i, _ in enumerate(curr_split_obs.visual_observations):
            buffer["visual_obs%d" % i].append(
                curr_split_obs.visual_observations[i])
            buffer["next_visual_obs%d" % i].append(
                next_split_obs.visual_observations[i])
        buffer["vector_obs"].append(curr_split_obs.vector_observations)
        buffer["next_vector_in"].append(next_split_obs.vector_observations)
        buffer["actions"].append(action)
        buffer["done"].append(np.zeros(1, dtype=np.float32))
        buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
        buffer["masks"].append(np.ones(1, dtype=np.float32))
    return buffer
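A hedged usage sketch for create_agent_buffer, assuming the older three-argument BehaviorSpec (the observation_shapes/ActionType era) that create_random_action requires:

from mlagents_envs.base_env import ActionType, BehaviorSpec

# Older-API spec: one 5-dim vector observation, 2 continuous actions.
spec = BehaviorSpec(
    observation_shapes=[(5,)],
    action_type=ActionType.CONTINUOUS,
    action_shape=2,
)
buffer = create_agent_buffer(spec, number=10, reward=1.0)
assert len(buffer["actions"]) == 10
assert buffer["reward"][0][0] == 1.0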
Example #12
def test_take_action_returns_empty_with_no_agents():
    test_seed = 3
    policy = FakePolicy(test_seed, basic_mock_brain(), basic_params())
    # Doesn't really matter what this is
    dummy_groupspec = BehaviorSpec([(1, )], "continuous", 1)
    no_agent_step = DecisionSteps.empty(dummy_groupspec)
    result = policy.get_action(no_agent_step)
    assert result == ActionInfo.empty()
Example #13
def test_action_masking_continuous():
    n_agents = 10
    shapes = [(3, ), (4, )]
    behavior_spec = BehaviorSpec(shapes, ActionType.CONTINUOUS, 10)
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
    masks = decision_steps.action_mask
    assert masks is None
Example #14
def test_empty_terminal_steps():
    specs = BehaviorSpec(
        sensor_specs=create_sensor_specs_with_shapes([(3, 2), (5, )]),
        action_spec=ActionSpec.create_continuous(3),
    )
    ts = TerminalSteps.empty(specs)
    assert len(ts.obs) == 2
    assert ts.obs[0].shape == (0, 3, 2)
    assert ts.obs[1].shape == (0, 5)
def test_action_masking_continuous():
    n_agents = 10
    shapes = [(3, ), (4, )]
    behavior_spec = BehaviorSpec(create_observation_specs_with_shapes(shapes),
                                 ActionSpec.create_continuous(10))
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
    masks = decision_steps.action_mask
    assert masks is None
Example #16
def test_batched_step_result_from_proto_raises_on_infinite():
    n_agents = 10
    shapes = [(3, ), (4, )]
    behavior_spec = BehaviorSpec(shapes, ActionSpec.create_continuous(3))
    ap_list = generate_list_agent_proto(n_agents,
                                        shapes,
                                        infinite_rewards=True)
    with pytest.raises(RuntimeError):
        steps_from_proto(ap_list, behavior_spec)
Example #17
def setup_test_behavior_specs(
    use_discrete=True, use_visual=False, vector_action_space=2, vector_obs_space=8
):
    behavior_spec = BehaviorSpec(
        [(84, 84, 3)] * int(use_visual) + [(vector_obs_space,)],
        ActionType.DISCRETE if use_discrete else ActionType.CONTINUOUS,
        tuple(vector_action_space) if use_discrete else vector_action_space,
    )
    return behavior_spec
Example #18
def create_steps_from_behavior_spec(
        behavior_spec: BehaviorSpec,
        num_agents: int = 1) -> Tuple[DecisionSteps, TerminalSteps]:
    return create_mock_steps(
        num_agents=num_agents,
        observation_shapes=behavior_spec.observation_shapes,
        action_shape=behavior_spec.action_shape,
        discrete=behavior_spec.is_action_discrete(),
    )
Example #19
def test_empty_decision_steps():
    specs = BehaviorSpec(
        sensor_specs=create_sensor_specs_with_shapes([(3, 2), (5, )]),
        action_spec=ActionSpec.create_continuous(3),
    )
    ds = DecisionSteps.empty(specs)
    assert len(ds.obs) == 2
    assert ds.obs[0].shape == (0, 3, 2)
    assert ds.obs[1].shape == (0, 5)
Example #20
def test_batched_step_result_from_proto_raises_on_nan():
    n_agents = 10
    shapes = [(3, ), (4, )]
    behavior_spec = BehaviorSpec(shapes, ActionType.CONTINUOUS, 3)
    ap_list = generate_list_agent_proto(n_agents,
                                        shapes,
                                        nan_observations=True)
    with pytest.raises(RuntimeError):
        steps_from_proto(ap_list, behavior_spec)
Example #21
def test_empty_decision_steps():
    specs = BehaviorSpec(
        observation_shapes=[(3, 2), (5, )],
        action_type=ActionType.CONTINUOUS,
        action_shape=3,
    )
    ds = DecisionSteps.empty(specs)
    assert len(ds.obs) == 2
    assert ds.obs[0].shape == (0, 3, 2)
    assert ds.obs[1].shape == (0, 5)
def test_batched_step_result_from_proto_raises_on_nan():
    n_agents = 10
    shapes = [(3, ), (4, )]
    behavior_spec = BehaviorSpec(create_observation_specs_with_shapes(shapes),
                                 ActionSpec.create_continuous(3))
    ap_list = generate_list_agent_proto(n_agents,
                                        shapes,
                                        nan_observations=True)
    with pytest.raises(RuntimeError):
        steps_from_proto(ap_list, behavior_spec)
Example #23
def test_empty_terminal_steps():
    specs = BehaviorSpec(
        observation_shapes=[(3, 2), (5, )],
        action_type=ActionType.CONTINUOUS,
        action_shape=3,
    )
    ts = TerminalSteps.empty(specs)
    assert len(ts.obs) == 2
    assert ts.obs[0].shape == (0, 3, 2)
    assert ts.obs[1].shape == (0, 5)
Example #24
def create_mock_steps(
    num_agents: int = 1,
    num_vector_observations: int = 0,
    num_vis_observations: int = 0,
    action_shape: List[int] = None,
    discrete: bool = False,
    done: bool = False,
) -> Tuple[DecisionSteps, TerminalSteps]:
    """
    Creates a mock Tuple[DecisionSteps, TerminalSteps] with observations.
    Imitates constant vector/visual observations, rewards, dones, and agents.

    :int num_agents: Number of "agents" to imitate.
    :int num_vector_observations: Size of the vector observation, if any.
    :int num_vis_observations: Number of visual observations, if any.
    :List[int] action_shape: Shape of the action space; branch sizes if discrete.
    :bool discrete: Whether or not the action space is discrete.
    :bool done: Whether all the agents in the batch are done.
    """
    if action_shape is None:
        action_shape = [2]

    obs_list = []
    for _ in range(num_vis_observations):
        obs_list.append(np.ones((num_agents, 84, 84, 3), dtype=np.float32))
    if num_vector_observations >= 1:
        obs_list.append(
            np.array(num_agents * [num_vector_observations * [1]],
                     dtype=np.float32))
    action_mask = None
    if discrete:
        action_mask = [
            np.array(num_agents * [action_size * [False]])
            for action_size in action_shape
        ]

    reward = np.array(num_agents * [1.0], dtype=np.float32)
    interrupted = np.array(num_agents * [False], dtype=bool)  # np.bool is removed in modern NumPy
    agent_id = np.arange(num_agents, dtype=np.int32)
    behavior_spec = BehaviorSpec(
        [(84, 84, 3)] * num_vis_observations + [(num_vector_observations,)],
        ActionType.DISCRETE if discrete else ActionType.CONTINUOUS,
        action_shape if discrete else action_shape[0],
    )
    if done:
        return (
            DecisionSteps.empty(behavior_spec),
            TerminalSteps(obs_list, reward, interrupted, agent_id),
        )
    else:
        return (
            DecisionSteps(obs_list, reward, agent_id, action_mask),
            TerminalSteps.empty(behavior_spec),
        )
Example #25
def setup_test_behavior_specs(
    use_discrete=True, use_visual=False, vector_action_space=2, vector_obs_space=8
):
    if use_discrete:
        action_spec = ActionSpec.create_discrete(tuple(vector_action_space))
    else:
        action_spec = ActionSpec.create_continuous(vector_action_space)
    observation_shapes = [(84, 84, 3)] * int(use_visual) + [(vector_obs_space,)]
    obs_spec = create_observation_specs_with_shapes(observation_shapes)
    behavior_spec = BehaviorSpec(obs_spec, action_spec)
    return behavior_spec
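A short usage sketch for this newer-API helper; it assumes ActionSpec exposes continuous_size and discrete_branches, as in recent mlagents_envs releases:

# Discrete variant: vector_action_space is a sequence of branch sizes
# (note the helper calls tuple() on it, so a bare int would fail here).
spec = setup_test_behavior_specs(use_discrete=True, vector_action_space=[3, 3])
assert spec.action_spec.discrete_branches == (3, 3)

# Continuous variant: vector_action_space is a single int.
spec = setup_test_behavior_specs(use_discrete=False, vector_action_space=4)
assert spec.action_spec.continuous_size == 4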
Example #26
def test_mismatch_observations_raise_in_step_result_from_proto():
    n_agents = 10
    shapes = [(3, ), (4, )]
    spec = BehaviorSpec(create_observation_specs_with_shapes(shapes),
                        ActionSpec.create_continuous(3))
    ap_list = generate_list_agent_proto(n_agents, shapes)
    # Hack an observation to be larger, we should get an exception
    ap_list[0].observations[0].shape[0] += 1
    ap_list[0].observations[0].float_data.data.append(0.42)
    with pytest.raises(UnityObservationException):
        steps_from_proto(ap_list, spec)
Example #27
def test_action_masking_discrete_1():
    n_agents = 10
    shapes = [(3, ), (4, )]
    behavior_spec = BehaviorSpec(shapes, ActionType.DISCRETE, (10, ))
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
    masks = decision_steps.action_mask
    assert isinstance(masks, list)
    assert len(masks) == 1
    assert masks[0].shape == (n_agents / 2, 10)
    assert masks[0][0, 0]
Example #28
def test_action_masking_discrete_2():
    n_agents = 10
    shapes = [(3, ), (4, )]
    behavior_spec = BehaviorSpec(shapes, ActionSpec.create_discrete((2, 2, 6)))
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
    masks = decision_steps.action_mask
    assert isinstance(masks, list)
    assert len(masks) == 3
    assert masks[0].shape == (n_agents / 2, 2)
    assert masks[1].shape == (n_agents / 2, 2)
    assert masks[2].shape == (n_agents / 2, 6)
    assert masks[0][0, 0]
Example #29
    def __init__(
            self,
            brain_names,
            step_size=STEP_SIZE,
            num_visual=0,
            num_vector=1,
            num_var_len=0,
            vis_obs_size=VIS_OBS_SIZE,
            vec_obs_size=OBS_SIZE,
            var_len_obs_size=VAR_LEN_SIZE,
            action_sizes=(1, 0),
    ):
        super().__init__()
        self.num_visual = num_visual
        self.num_vector = num_vector
        self.num_var_len = num_var_len
        self.vis_obs_size = vis_obs_size
        self.vec_obs_size = vec_obs_size
        self.var_len_obs_size = var_len_obs_size
        continuous_action_size, discrete_action_size = action_sizes
        discrete_tuple = tuple(2 for _ in range(discrete_action_size))
        action_spec = ActionSpec(continuous_action_size, discrete_tuple)
        self.total_action_size = (
            continuous_action_size + discrete_action_size
        )  # to set the goals/positions
        self.action_spec = action_spec
        self.behavior_spec = BehaviorSpec(self._make_observation_specs(),
                                          action_spec)
        self.names = brain_names
        self.positions: Dict[str, List[float]] = {}
        self.step_count: Dict[str, float] = {}
        self.random = random.Random(str(self.behavior_spec))
        self.goal: Dict[str, int] = {}
        self.action = {}
        self.rewards: Dict[str, float] = {}
        self.final_rewards: Dict[str, List[float]] = {}
        self.step_result: Dict[str, Tuple[DecisionSteps, TerminalSteps]] = {}
        self.agent_id: Dict[str, int] = {}
        self.step_size = step_size  # defines the difficulty of the test
        # Allow to be used as a UnityEnvironment during tests
        self.academy_capabilities = None

        for name in self.names:
            self.agent_id[name] = 0
            self.goal[name] = self.random.choice([-1, 1])
            self.rewards[name] = 0
            self.final_rewards[name] = []
            self._reset_agent(name)
            self.action[name] = None
            self.step_result[name] = None
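The action_sizes=(continuous, discrete) convention in the __init__ above produces a hybrid ActionSpec. A hedged sketch of what such a spec samples, assuming ActionTuple semantics from mlagents_envs release 12+:

from mlagents_envs.base_env import ActionSpec

# Mirrors the __init__ above: 1 continuous action, 2 binary discrete branches.
continuous_size, discrete_size = 1, 2
spec = ActionSpec(continuous_size, tuple(2 for _ in range(discrete_size)))
action = spec.random_action(n_agents=1)
assert action.continuous.shape == (1, 1)
assert action.discrete.shape == (1, 2)  # each entry is 0 or 1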
def test_action_masking_discrete():
    n_agents = 10
    shapes = [(3, ), (4, )]
    behavior_spec = BehaviorSpec(create_observation_specs_with_shapes(shapes),
                                 ActionSpec.create_discrete((7, 3)))
    ap_list = generate_list_agent_proto(n_agents, shapes)
    decision_steps, terminal_steps = steps_from_proto(ap_list, behavior_spec)
    masks = decision_steps.action_mask
    assert isinstance(masks, list)
    assert len(masks) == 2
    assert masks[0].shape == (n_agents / 2, 7)  # half agents are done
    assert masks[1].shape == (n_agents / 2, 3)  # half agents are done
    assert masks[0][0, 0]
    assert not masks[1][0, 0]
    assert masks[1][0, 1]