def create_mock_steps(
    num_agents: int,
    observation_specs: List[ObservationSpec],
    action_spec: ActionSpec,
    done: bool = False,
    grouped: bool = False,
) -> Tuple[DecisionSteps, TerminalSteps]:
    """
    Creates a mock Tuple[DecisionSteps, TerminalSteps] filled with constant data.

    Every observation is an all-ones float32 array, every reward is 1.0, every
    group reward is 0.0 and agent ids are 0..num_agents-1.

    :param num_agents: Number of "agents" to imitate (size of the batch axis).
    :param observation_specs: Observation specs; one observation array of shape
        (num_agents,) + spec.shape is created per spec.
    :param action_spec: ActionSpec for the agents. An all-False action mask is
        created per discrete branch only when action_spec.is_discrete() is true;
        otherwise the mask is None.
    :param done: If True, all agents are placed in the TerminalSteps and the
        DecisionSteps is empty; if False, the opposite.
    :param grouped: If True, every agent gets group id 1, otherwise 0.
    :return: A (DecisionSteps, TerminalSteps) pair, exactly one of which is
        populated.
    """
    obs_list = [
        np.ones((num_agents,) + obs_spec.shape, dtype=np.float32)
        for obs_spec in observation_specs
    ]

    action_mask = None
    if action_spec.is_discrete():
        action_mask = [
            np.array(num_agents * [action_size * [False]])
            for action_size in action_spec.discrete_branches  # type: ignore
        ]  # type: ignore

    reward = np.ones(num_agents, dtype=np.float32)
    # The `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `bool` yields the same dtype on every NumPy version.
    interrupted = np.zeros(num_agents, dtype=bool)
    agent_id = np.arange(num_agents, dtype=np.int32)
    group_id = np.full(num_agents, 1 if grouped else 0, dtype=np.int32)
    group_reward = np.zeros(num_agents, dtype=np.float32)
    behavior_spec = BehaviorSpec(observation_specs, action_spec)

    if done:
        return (
            DecisionSteps.empty(behavior_spec),
            TerminalSteps(
                obs_list, reward, interrupted, agent_id, group_id, group_reward
            ),
        )
    return (
        DecisionSteps(
            obs_list, reward, agent_id, action_mask, group_id, group_reward
        ),
        TerminalSteps.empty(behavior_spec),
    )
def make_fake_trajectory(
    length: int,
    observation_specs: List[ObservationSpec],
    action_spec: ActionSpec,
    max_step_complete: bool = False,
    memory_size: int = 10,
    num_other_agents_in_group: int = 0,
) -> Trajectory:
    """
    Makes a fake trajectory of length length. If max_step_complete,
    the trajectory is terminated by a max step rather than a done.

    All observations, log-probs, previous actions and memories are all-ones
    arrays, actions are all-zeros, every reward is 1.0 and every group reward
    is 0.

    :param length: Requested number of steps. A terminal step is always
        appended, so the result holds at least one step even if length < 1.
    :param observation_specs: Observation specs; one all-ones array of shape
        spec.shape is created per spec for every step.
    :param action_spec: ActionSpec providing the continuous/discrete sizes and,
        when discrete, the branches used to build an all-False action mask.
    :param max_step_complete: If True the last step has done=False and
        interrupted=True; otherwise done=True and interrupted=False.
    :param memory_size: Length of the all-ones memory vector of each step.
    :param num_other_agents_in_group: Number of AgentStatus entries placed in
        each step's group_status, and of entries in next_group_obs.
    :return: The assembled Trajectory.
    """
    action_size = action_spec.discrete_size + action_spec.continuous_size
    reward = 1.0
    agent_id = "test_agent"
    behavior_id = "test_brain"

    def _make_obs() -> List[np.ndarray]:
        # One fresh all-ones observation per spec.
        return [
            np.ones(obs_spec.shape, dtype=np.float32)
            for obs_spec in observation_specs
        ]

    def _make_step_fields(step_obs: List[np.ndarray]) -> dict:
        # Builds every AgentExperience field except obs/done/interrupted.
        action = ActionTuple(
            continuous=np.zeros(action_spec.continuous_size, dtype=np.float32),
            discrete=np.zeros(action_spec.discrete_size, dtype=np.int32),
        )
        action_probs = LogProbsTuple(
            continuous=np.ones(action_spec.continuous_size, dtype=np.float32),
            discrete=np.ones(action_spec.discrete_size, dtype=np.float32),
        )
        action_mask = (
            [
                [False for _ in range(branch)]
                for branch in action_spec.discrete_branches
            ]  # type: ignore
            if action_spec.is_discrete()
            else None
        )
        prev_action_dtype = np.int32 if action_spec.is_discrete() else np.float32
        prev_action = np.ones(action_size, dtype=prev_action_dtype)
        memory = np.ones(memory_size, dtype=np.float32)
        group_status = [
            AgentStatus(step_obs, reward, action, False)
            for _ in range(num_other_agents_in_group)
        ]
        return dict(
            reward=reward,
            action=action,
            action_probs=action_probs,
            action_mask=action_mask,
            prev_action=prev_action,
            memory=memory,
            group_status=group_status,
            group_reward=0,
        )

    steps_list = []
    step_fields = None
    for _ in range(length - 1):
        obs = _make_obs()
        step_fields = _make_step_fields(obs)
        steps_list.append(
            AgentExperience(obs=obs, done=False, interrupted=False, **step_fields)
        )

    obs = _make_obs()
    if step_fields is None:
        # length <= 1: the loop never ran, so there is no previous step whose
        # fields can be reused. Build them here instead of failing on
        # unassigned locals.
        step_fields = _make_step_fields(obs)
    # The terminal step reuses the field objects of the step before it and only
    # gets fresh observations plus the done/interrupted flags.
    steps_list.append(
        AgentExperience(
            obs=obs,
            done=not max_step_complete,
            interrupted=max_step_complete,
            **step_fields,
        )
    )
    return Trajectory(
        steps=steps_list,
        agent_id=agent_id,
        behavior_id=behavior_id,
        next_obs=obs,
        next_group_obs=[obs] * num_other_agents_in_group,
    )
def make_fake_trajectory(
    length: int,
    observation_shapes: List[Tuple],
    action_spec: ActionSpec,
    max_step_complete: bool = False,
    memory_size: int = 10,
) -> Trajectory:
    """
    Makes a fake trajectory of length length. If max_step_complete,
    the trajectory is terminated by a max step rather than a done.

    Observations, action probabilities, previous actions and memories are
    all-ones float32 arrays; action and action_pre are all-zeros float32
    arrays; every reward is 1.0.

    :param length: Requested number of steps. A terminal step is always
        appended, so the result holds at least one step even if length < 1.
    :param observation_shapes: One shape per observation; an all-ones array of
        that shape is created for every step.
    :param action_spec: ActionSpec providing the continuous/discrete sizes and,
        when discrete, the branches used to build an all-False action mask.
    :param max_step_complete: If True the last step has done=False and
        interrupted=True; otherwise done=True and interrupted=False.
    :param memory_size: Length of the all-ones memory vector of each step.
    :return: The assembled Trajectory.
    """
    action_size = action_spec.discrete_size + action_spec.continuous_size
    # A single action_probs array is shared by every step.
    action_probs = np.ones(
        int(np.sum(action_spec.discrete_branches) + action_spec.continuous_size),
        dtype=np.float32,
    )
    reward = 1.0
    agent_id = "test_agent"
    behavior_id = "test_brain"

    def _new_obs() -> List[np.ndarray]:
        # One fresh all-ones observation per requested shape.
        return [np.ones(_shape, dtype=np.float32) for _shape in observation_shapes]

    def _new_step_fields() -> dict:
        # Builds every AgentExperience field except obs/done/interrupted.
        action_mask = (
            [
                [False for _ in range(branch)]
                for branch in action_spec.discrete_branches
            ]  # type: ignore
            if action_spec.is_discrete()
            else None
        )
        return dict(
            reward=reward,
            action=np.zeros(action_size, dtype=np.float32),
            action_probs=action_probs,
            action_pre=np.zeros(action_size, dtype=np.float32),
            action_mask=action_mask,
            prev_action=np.ones(action_size, dtype=np.float32),
            memory=np.ones(memory_size, dtype=np.float32),
        )

    steps_list = []
    step_fields = None
    for _ in range(length - 1):
        step_fields = _new_step_fields()
        steps_list.append(
            AgentExperience(
                obs=_new_obs(), done=False, interrupted=False, **step_fields
            )
        )

    if step_fields is None:
        # length <= 1: the loop never ran, so there is no previous step whose
        # fields can be reused. Build them here instead of failing on
        # unassigned locals.
        step_fields = _new_step_fields()
    obs = _new_obs()
    # The terminal step reuses the field objects of the step before it and only
    # gets fresh observations plus the done/interrupted flags.
    steps_list.append(
        AgentExperience(
            obs=obs,
            done=not max_step_complete,
            interrupted=max_step_complete,
            **step_fields,
        )
    )
    return Trajectory(
        steps=steps_list,
        agent_id=agent_id,
        behavior_id=behavior_id,
        next_obs=obs,
    )