Example #1
from typing import List, Tuple

import numpy as np

# Import paths assume a recent ml-agents release.
from mlagents_envs.base_env import (
    ActionSpec,
    BehaviorSpec,
    DecisionSteps,
    ObservationSpec,
    TerminalSteps,
)


def create_mock_steps(
    num_agents: int,
    observation_specs: List[ObservationSpec],
    action_spec: ActionSpec,
    done: bool = False,
    grouped: bool = False,
) -> Tuple[DecisionSteps, TerminalSteps]:
    """
    Creates a mock Tuple[DecisionSteps, TerminalSteps] with observations.
    Imitates constant vector/visual observations, rewards, dones, and agents.

    :param num_agents: Number of "agents" to imitate.
    :param observation_specs: A List of the observation specs in your steps.
    :param action_spec: ActionSpec for the agents.
    :param done: Whether all the agents in the batch are done.
    :param grouped: Whether the agents share a nonzero group id (multi-agent group).
    """
    obs_list = []
    for obs_spec in observation_specs:
        obs_list.append(np.ones((num_agents,) + obs_spec.shape, dtype=np.float32))
    action_mask = None
    if action_spec.is_discrete():
        action_mask = [
            np.array(num_agents * [action_size * [False]])
            for action_size in action_spec.discrete_branches  # type: ignore
        ]  # type: ignore

    reward = np.array(num_agents * [1.0], dtype=np.float32)
    # np.bool was removed in NumPy 1.24; use the builtin bool instead.
    interrupted = np.array(num_agents * [False], dtype=bool)
    agent_id = np.arange(num_agents, dtype=np.int32)
    _gid = 1 if grouped else 0
    group_id = np.array(num_agents * [_gid], dtype=np.int32)
    group_reward = np.array(num_agents * [0.0], dtype=np.float32)
    behavior_spec = BehaviorSpec(observation_specs, action_spec)
    if done:
        return (
            DecisionSteps.empty(behavior_spec),
            TerminalSteps(
                obs_list, reward, interrupted, agent_id, group_id, group_reward
            ),
        )
    else:
        return (
            DecisionSteps(
                obs_list, reward, agent_id, action_mask, group_id, group_reward
            ),
            TerminalSteps.empty(behavior_spec),
        )
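
A minimal usage sketch of the function above. The ObservationSpec field order (shape, dimension_property, observation_type, name) and the DimensionProperty/ObservationType enums are assumptions based on recent mlagents_envs releases; older versions may differ.

from mlagents_envs.base_env import DimensionProperty, ObservationType

obs_specs = [
    ObservationSpec(
        shape=(8,),
        dimension_property=(DimensionProperty.NONE,),
        observation_type=ObservationType.DEFAULT,
        name="vector_obs",
    )
]
decision_steps, terminal_steps = create_mock_steps(
    num_agents=4,
    observation_specs=obs_specs,
    action_spec=ActionSpec.create_discrete((2, 3)),
)
# All four mock agents are still deciding; none have terminated.
assert len(decision_steps) == 4 and len(terminal_steps) == 0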
Example #2
from typing import List

import numpy as np

from mlagents_envs.base_env import ActionSpec, ActionTuple, ObservationSpec
from mlagents.trainers.trajectory import AgentExperience, AgentStatus, Trajectory
# LogProbsTuple lives here in recent ml-agents releases; adjust if your version differs.
from mlagents.trainers.torch.action_log_probs import LogProbsTuple


def make_fake_trajectory(
    length: int,
    observation_specs: List[ObservationSpec],
    action_spec: ActionSpec,
    max_step_complete: bool = False,
    memory_size: int = 10,
    num_other_agents_in_group: int = 0,
) -> Trajectory:
    """
    Makes a fake trajectory of the given length (must be at least 2, since the
    final step reuses values from the last loop iteration). If max_step_complete,
    the trajectory is terminated by a max step (interruption) rather than a done.
    """
    steps_list = []

    action_size = action_spec.discrete_size + action_spec.continuous_size
    for _i in range(length - 1):
        obs = []
        for obs_spec in observation_specs:
            obs.append(np.ones(obs_spec.shape, dtype=np.float32))
        reward = 1.0
        done = False
        action = ActionTuple(
            continuous=np.zeros(action_spec.continuous_size, dtype=np.float32),
            discrete=np.zeros(action_spec.discrete_size, dtype=np.int32),
        )
        action_probs = LogProbsTuple(
            continuous=np.ones(action_spec.continuous_size, dtype=np.float32),
            discrete=np.ones(action_spec.discrete_size, dtype=np.float32),
        )
        action_mask = (
            [
                [False for _ in range(branch)]
                for branch in action_spec.discrete_branches
            ]  # type: ignore
            if action_spec.is_discrete()
            else None
        )
        if action_spec.is_discrete():
            prev_action = np.ones(action_size, dtype=np.int32)
        else:
            prev_action = np.ones(action_size, dtype=np.float32)

        max_step = False
        memory = np.ones(memory_size, dtype=np.float32)
        agent_id = "test_agent"
        behavior_id = "test_brain"
        group_status = []
        for _ in range(num_other_agents_in_group):
            group_status.append(AgentStatus(obs, reward, action, done))
        experience = AgentExperience(
            obs=obs,
            reward=reward,
            done=done,
            action=action,
            action_probs=action_probs,
            action_mask=action_mask,
            prev_action=prev_action,
            interrupted=max_step,
            memory=memory,
            group_status=group_status,
            group_reward=0,
        )
        steps_list.append(experience)
    # The final step reuses reward/action/etc. from the last loop iteration,
    # which is why length must be at least 2.
    obs = []
    for obs_spec in observation_specs:
        obs.append(np.ones(obs_spec.shape, dtype=np.float32))
    last_experience = AgentExperience(
        obs=obs,
        reward=reward,
        done=not max_step_complete,
        action=action,
        action_probs=action_probs,
        action_mask=action_mask,
        prev_action=prev_action,
        interrupted=max_step_complete,
        memory=memory,
        group_status=group_status,
        group_reward=0,
    )
    steps_list.append(last_experience)
    return Trajectory(
        steps=steps_list,
        agent_id=agent_id,
        behavior_id=behavior_id,
        next_obs=obs,
        next_group_obs=[obs] * num_other_agents_in_group,
    )
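
A short usage sketch, reusing the obs_specs list built in the Example #1 sketch:

traj = make_fake_trajectory(
    length=5,
    observation_specs=obs_specs,
    action_spec=ActionSpec.create_continuous(2),
    max_step_complete=True,
    num_other_agents_in_group=2,
)
assert len(traj.steps) == 5
# A max-step termination is recorded as interrupted, not done.
assert traj.steps[-1].interrupted and not traj.steps[-1].done
# Each step carries one AgentStatus per other agent in the group.
assert len(traj.steps[0].group_status) == 2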
Example #3
from typing import List, Tuple

import numpy as np

# This variant targets an older ml-agents API, where actions are flat float
# arrays and AgentExperience still carries the pre-clipped action_pre field.
from mlagents_envs.base_env import ActionSpec
from mlagents.trainers.trajectory import AgentExperience, Trajectory


def make_fake_trajectory(
    length: int,
    observation_shapes: List[Tuple],
    action_spec: ActionSpec,
    max_step_complete: bool = False,
    memory_size: int = 10,
) -> Trajectory:
    """
    Makes a fake trajectory of the given length (must be at least 2, since the
    final step reuses values from the last loop iteration). If max_step_complete,
    the trajectory is terminated by a max step (interruption) rather than a done.
    """
    steps_list = []

    action_size = action_spec.discrete_size + action_spec.continuous_size
    action_probs = np.ones(
        int(np.sum(action_spec.discrete_branches) + action_spec.continuous_size),
        dtype=np.float32,
    )
    for _i in range(length - 1):
        obs = []
        for _shape in observation_shapes:
            obs.append(np.ones(_shape, dtype=np.float32))
        reward = 1.0
        done = False
        action = np.zeros(action_size, dtype=np.float32)
        action_pre = np.zeros(action_size, dtype=np.float32)
        action_mask = (
            [
                [False for _ in range(branch)]
                for branch in action_spec.discrete_branches
            ]  # type: ignore
            if action_spec.is_discrete()
            else None
        )
        prev_action = np.ones(action_size, dtype=np.float32)
        max_step = False
        memory = np.ones(memory_size, dtype=np.float32)
        agent_id = "test_agent"
        behavior_id = "test_brain"
        experience = AgentExperience(
            obs=obs,
            reward=reward,
            done=done,
            action=action,
            action_probs=action_probs,
            action_pre=action_pre,
            action_mask=action_mask,
            prev_action=prev_action,
            interrupted=max_step,
            memory=memory,
        )
        steps_list.append(experience)
    # The final step reuses reward/action/etc. from the last loop iteration,
    # which is why length must be at least 2.
    obs = []
    for _shape in observation_shapes:
        obs.append(np.ones(_shape, dtype=np.float32))
    last_experience = AgentExperience(
        obs=obs,
        reward=reward,
        done=not max_step_complete,
        action=action,
        action_probs=action_probs,
        action_pre=action_pre,
        action_mask=action_mask,
        prev_action=prev_action,
        interrupted=max_step_complete,
        memory=memory,
    )
    steps_list.append(last_experience)
    return Trajectory(
        steps=steps_list,
        agent_id=agent_id,
        behavior_id=behavior_id,
        next_obs=obs,
    )
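
A usage sketch for this older variant; ActionSpec.create_continuous is assumed to exist in the targeted release.

traj = make_fake_trajectory(
    length=3,
    observation_shapes=[(8,), (84, 84, 3)],
    action_spec=ActionSpec.create_continuous(2),
)
# Not a max-step termination, so the final experience is marked done.
assert traj.steps[-1].done and not traj.steps[-1].interrupted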