def get_expected_parallel_timesteps_1() -> TimeStep:
    """Return the expected first `TimeStep` of the three-agent parallel fixture.

    Reward, discount and observation are dicts keyed by "agent_0", "agent_1"
    and "agent_2". Every agent has reward 0.0, discount 1.0 and its own `OLT`
    holding the same observation, legal-action and terminal values.
    """
    agent_names = ("agent_0", "agent_1", "agent_2")

    rewards = {name: 0.0 for name in agent_names}
    discounts = {name: 1.0 for name in agent_names}
    # A fresh OLT (and fresh lists) is created for each agent.
    observations = {
        name: OLT(
            observation=[0.1, 0.3, 0.7],
            legal_actions=[1],
            terminal=[0.0],
        )
        for name in agent_names
    }

    return TimeStep(
        step_type=StepType.FIRST,
        reward=rewards,
        discount=discounts,
        observation=observations,
    )
def _convert_observations(
    self, observes: Dict[str, np.ndarray], dones: Dict[str, bool]
) -> Dict[str, OLT]:
    """Wrap each agent's raw observation in an `OLT`.

    Args:
        observes: raw observation per agent. An entry may be a dict holding
            "action_mask" and "observation" keys, in which case the mask is
            used as the legal actions.
        dones: done flag per agent; must contain every agent in `observes`.

    Returns:
        A dict mapping each agent to an `OLT` whose observation is a float32
        array, whose legal actions are the supplied mask or an all-ones
        array, and whose terminal is a one-element float32 array built from
        the agent's done flag.
    """
    converted: Dict[str, OLT] = {}

    for agent_id, raw in observes.items():
        carries_mask = isinstance(raw, dict) and "action_mask" in raw

        if not carries_mask:
            # TODO Handle legal actions better for continuous envs,
            # maybe have min and max for each action and clip the agents actions
            # accordingly
            action_space = self._environment.action_spaces[agent_id]
            legal_mask = np.ones(
                _convert_to_spec(action_space).shape,
                dtype=action_space.dtype,
            )
            features = raw
        else:
            legal_mask = raw["action_mask"]
            features = raw["observation"]

        converted[agent_id] = OLT(
            observation=np.array(features, dtype=np.float32),
            legal_actions=legal_mask,
            terminal=np.asarray([dones[agent_id]], dtype=np.float32),
        )

    return converted
def get_seq_timesteps_1() -> TimeStep:
    """Return the first `TimeStep` of the sequential fixture.

    The timestep has step type FIRST, reward 0.0, discount 1.0 and a single
    `OLT` observation (not a per-agent dict).
    """
    first_observation = OLT(
        observation=[0.1, 0.3, 0.7],
        legal_actions=[1],
        terminal=[0.0],
    )
    return TimeStep(
        step_type=StepType.FIRST,
        reward=0.0,
        discount=1.0,
        observation=first_observation,
    )
def get_seq_timesteps_dict_2() -> Dict[str, SeqTimestepDict]:
    """Return a per-agent fixture of FIRST timesteps paired with actions.

    Each value is a dict with a "timestep" entry (a `TimeStep` carrying that
    agent's reward, discount and `OLT` observation) and an "action" entry.
    Agents appear in the order "agent_0", "agent_1", "agent_2".
    """
    # Columns: agent name, reward, discount, observation values, action.
    fixture_rows = (
        ("agent_0", -1, 0.8, [0.1, 0.5, 0.7], 0),
        ("agent_1", 0.0, 0.8, [0.8, 0.3, 0.7], 2),
        ("agent_2", 1, 1.0, [0.9, 0.9, 0.8], 1),
    )

    per_agent: Dict[str, SeqTimestepDict] = {}
    for name, reward, discount, obs_values, action in fixture_rows:
        timestep = TimeStep(
            step_type=StepType.FIRST,
            reward=reward,
            discount=discount,
            observation=OLT(
                observation=obs_values,
                legal_actions=[1],
                terminal=[0.0],
            ),
        )
        per_agent[name] = {"timestep": timestep, "action": action}

    return per_agent
def observation_spec(self) -> Dict[str, OLT]:
    """Return the observation spec of every agent in the wrapped environment.

    Returns:
        A dict keyed by the environment's agent ids. Each value is an `OLT`
        whose observation and legal-action specs are obtained by passing the
        agent's observation space and action space through
        `_convert_to_spec`, and whose terminal spec is a float32 array of
        shape (1,).
    """
    env = self._environment
    return {
        agent_id: OLT(
            observation=_convert_to_spec(env.observation_spaces[agent_id]),
            legal_actions=_convert_to_spec(env.action_spaces[agent_id]),
            terminal=specs.Array((1,), np.float32),
        )
        for agent_id in env.agent_ids
    }
def observation_spec(self) -> Dict[str, OLT]:
    """Return the observation spec of every agent in `self.agents`.

    When `self._include_agent_info` is truthy, an agent's observation spec is
    a 2-tuple of the converted observation space and `agent_info_spec()`;
    otherwise it is the converted observation space alone.

    Returns:
        A dict mapping each agent to an `OLT` of observation spec, converted
        action-space spec (as legal actions) and a float32 terminal spec of
        shape (1,).
    """
    specs_by_agent = {}
    for agent_id in self.agents:
        if self._include_agent_info:
            obs_spec = (
                _convert_to_spec(self.observation_spaces[agent_id]),
                agent_info_spec(),
            )
        else:
            obs_spec = _convert_to_spec(self.observation_spaces[agent_id])

        specs_by_agent[agent_id] = OLT(
            observation=obs_spec,
            legal_actions=_convert_to_spec(self.action_spaces[agent_id]),
            terminal=specs.Array((1,), np.float32),
        )
    return specs_by_agent
def observation_spec(self) -> OLT:
    """Return a single `OLT` spec for the currently relevant agent.

    The agent is `self.agent_selection` when that attribute exists, otherwise
    the first entry of `self.agents`.

    Returns:
        An `OLT` holding the parent class's observation spec, that agent's
        entry of `self.action_spec()` as legal actions, and a float32
        terminal spec of shape (1,).
    """
    # The fallback is only evaluated when `agent_selection` is missing.
    active_agent = (
        self.agent_selection
        if hasattr(self, "agent_selection")
        else self.agents[0]
    )

    parent_spec = super().observation_spec()
    legal_spec = self.action_spec()[active_agent]
    terminal_spec = specs.Array((1,), np.float32)

    return OLT(
        observation=parent_spec,
        legal_actions=legal_spec,
        terminal=terminal_spec,
    )
def create_variables(
    network: snt.Module,
    input_spec: List[OLT],
) -> Optional[tf.TensorSpec]:
    """Create the variables of `network` by calling it once on dummy inputs.

    As a side effect, the batched signature of the dummy inputs is stored on
    the network as `_input_signature`.

    Args:
        network: Sonnet Module whose variables are to be created.
        input_spec: list of input specs to the network. The length of this
            list should match the number of arguments expected by `network`.

    Returns:
        output_spec: the structure of the network output with every
        `tf.Tensor` replaced by its `tf.TensorSpec` (batch dimension removed
        for non-scalar tensors) and every other leaf replaced by None; e.g.
        None if the output is a tfp.distributions.Distribution.
    """

    def _output_spec(output: tf.Tensor) -> tf.TensorSpec:
        # A spec is ill-defined for anything that is not a Tensor.
        if not isinstance(output, tf.Tensor):
            return None
        # Only non-scalar Tensors carry a batch dim that must be dropped.
        unbatched = squeeze_batch_dim(output) if tf.rank(output) > 0 else output
        return tf.TensorSpec(unbatched.shape, unbatched.dtype)

    def _batched_signature(t):  # type: ignore
        # Prepend an unknown batch dimension to the unbatched dummy input.
        return tf.TensorSpec((None, ) + t.shape, t.dtype)

    # One dummy observation per input spec, without a batch dimension.
    dummy_input = []
    for in_spec in input_spec:
        dummy_input.append(
            OLT(
                observation=zeros_like(in_spec.observation),
                legal_actions=ones_like(in_spec.legal_actions),
                terminal=zeros_like(in_spec.terminal),
            )
        )

    # A recurrent core takes its hidden state as one more input.
    if isinstance(network, snt.RNNCore):
        dummy_input.append(squeeze_batch_dim(network.initial_state(1)))

    # The forward pass is what actually creates the variables.
    dummy_output = network(*add_batch_dim(dummy_input))

    # Record the input signature on the network so that snapshot code, which
    # may not know the precise form of the inputs, can use it later.
    network._input_signature = tree.map_structure(  # pylint: disable=protected-access
        _batched_signature, dummy_input
    )

    return tree.map_structure(_output_spec, dummy_output)
def observation_spec(self) -> Observation:
    """Return an `OLT` spec for every agent in `self.agents`.

    Returns:
        A dict mapping each agent to an `OLT` holding the parent class's
        observation spec, the agent's entry of `self.action_spec()` as legal
        actions, and a float32 terminal spec of shape (1,).
    """
    specs_by_agent = {}
    # A plain loop is kept so the zero-argument super() call stays in method
    # scope.
    for agent_id in self.agents:
        agent_legals = self.action_spec()[agent_id]
        terminal_spec = specs.Array((1,), np.float32)
        parent_spec = super().observation_spec()
        specs_by_agent[agent_id] = OLT(
            observation=parent_spec,
            legal_actions=agent_legals,
            terminal=terminal_spec,
        )
    return specs_by_agent