@pytest.mark.parametrize("with_actions", [True, False], ids=["actions", "no_actions"])
def test_multinetworkbody_lstm(with_actions):
    torch.manual_seed(0)
    obs_size = 4
    act_size = 2
    seq_len = 16
    n_agents = 3
    network_settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings(sequence_length=seq_len, memory_size=12)
    )
    obs_shapes = [(obs_size,)]
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    networkbody = MultiAgentNetworkBody(
        create_observation_specs_with_shapes(obs_shapes), network_settings, action_spec
    )
    optimizer = torch.optim.Adam(networkbody.parameters(), lr=3e-4)
    sample_obs = [[0.1 * torch.ones((seq_len, obs_size))] for _ in range(n_agents)]
    # simulate baseline in POCA
    sample_act = [
        AgentAction(
            0.1 * torch.ones((seq_len, 2)),
            [0.1 * torch.ones(seq_len) for _ in range(act_size)],
        )
        for _ in range(n_agents - 1)
    ]

    for _ in range(300):
        if with_actions:
            encoded, _ = networkbody(
                obs_only=sample_obs[:1],
                obs=sample_obs[1:],
                actions=sample_act,
                memories=torch.ones(1, 1, 12),
                sequence_length=seq_len,
            )
        else:
            encoded, _ = networkbody(
                obs_only=sample_obs,
                obs=[],
                actions=[],
                memories=torch.ones(1, 1, 12),
                sequence_length=seq_len,
            )
        # Try to force output to 1
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # In the last step, values should be close to 1
    for _enc in encoded.flatten().tolist():
        assert _enc == pytest.approx(1.0, abs=0.1)
@pytest.mark.parametrize("with_actions", [True, False], ids=["actions", "no_actions"])
def test_multinetworkbody_num_agents(with_actions):
    torch.manual_seed(0)
    act_size = 2
    obs_size = 4
    network_settings = NetworkSettings()
    obs_shapes = [(obs_size,)]
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    networkbody = MultiAgentNetworkBody(
        create_observation_specs_with_shapes(obs_shapes), network_settings, action_spec
    )
    sample_obs = [[0.1 * torch.ones((1, obs_size))]]
    # simulate baseline in POCA
    sample_act = [
        AgentAction(
            0.1 * torch.ones((1, 2)), [0.1 * torch.ones(1) for _ in range(act_size)]
        )
    ]

    for n_agent, max_so_far in [(1, 1), (5, 5), (4, 5), (10, 10), (5, 10), (1, 10)]:
        if with_actions:
            encoded, _ = networkbody(
                obs_only=sample_obs * (n_agent - 1), obs=sample_obs, actions=sample_act
            )
        else:
            encoded, _ = networkbody(obs_only=sample_obs * n_agent, obs=[], actions=[])
        # look at the last value of the hidden units (the number of agents)
        target = (n_agent * 1.0 / max_so_far) * 2 - 1
        assert abs(encoded[0, -1].item() - target) < 1e-6
        assert encoded[0, -1].item() <= 1
        assert encoded[0, -1].item() >= -1
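# --- Illustrative sketch (not part of the test file) --------------------------
# The assertions above check the agent-count feature appended to the encoding:
# the current group size, normalized by a running maximum and mapped into
# [-1, 1]. A minimal, self-contained sketch of just that formula as exercised
# by the test; the class name below is hypothetical, not the library's API.
class _AgentCountEncoder:
    def __init__(self) -> None:
        self.max_so_far = 1

    def encode(self, n_agent: int) -> float:
        # Track the largest group size seen so far, then rescale to [-1, 1].
        self.max_so_far = max(self.max_so_far, n_agent)
        return (n_agent * 1.0 / self.max_so_far) * 2 - 1


# Example values, mirroring the (n_agent, max_so_far) pairs in the test above:
#   encode(1) -> 1.0, encode(5) -> 1.0, encode(4) -> 0.6, encode(1) -> -0.6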
@pytest.mark.parametrize("with_actions", [True, False], ids=["actions", "no_actions"])
def test_multinetworkbody_visual(with_actions):
    torch.manual_seed(0)
    act_size = 2
    n_agents = 3
    obs_size = 4
    vis_obs_size = (84, 84, 3)
    network_settings = NetworkSettings()
    obs_shapes = [(obs_size,), vis_obs_size]
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    networkbody = MultiAgentNetworkBody(
        create_observation_specs_with_shapes(obs_shapes), network_settings, action_spec
    )
    optimizer = torch.optim.Adam(networkbody.parameters(), lr=3e-3)
    sample_obs = [
        [0.1 * torch.ones((1, obs_size))] + [0.1 * torch.ones((1, 84, 84, 3))]
        for _ in range(n_agents)
    ]
    # simulate baseline in POCA
    sample_act = [
        AgentAction(
            0.1 * torch.ones((1, 2)), [0.1 * torch.ones(1) for _ in range(act_size)]
        )
        for _ in range(n_agents - 1)
    ]

    for _ in range(300):
        if with_actions:
            encoded, _ = networkbody(
                obs_only=sample_obs[:1], obs=sample_obs[1:], actions=sample_act
            )
        else:
            encoded, _ = networkbody(obs_only=sample_obs, obs=[], actions=[])
        assert encoded.shape == (1, network_settings.hidden_units)
        # Try to force output to 1
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # In the last step, values should be close to 1
    for _enc in encoded.flatten().tolist():
        assert _enc == pytest.approx(1.0, abs=0.1)
class POCAValueNetwork(torch.nn.Module, Critic):
    """
    The POCAValueNetwork uses the MultiAgentNetworkBody to compute the value
    and POCA baseline for a variable number of agents in a group that all
    share the same observation and action space.
    """

    def __init__(
        self,
        stream_names: List[str],
        observation_specs: List[ObservationSpec],
        network_settings: NetworkSettings,
        action_spec: ActionSpec,
    ):
        torch.nn.Module.__init__(self)
        self.network_body = MultiAgentNetworkBody(
            observation_specs, network_settings, action_spec
        )
        if network_settings.memory is not None:
            encoding_size = network_settings.memory.memory_size // 2
        else:
            encoding_size = network_settings.hidden_units
        self.value_heads = ValueHeads(stream_names, encoding_size, 1)

    @property
    def memory_size(self) -> int:
        return self.network_body.memory_size

    def update_normalization(self, buffer: AgentBuffer) -> None:
        self.network_body.update_normalization(buffer)

    def baseline(
        self,
        obs_without_actions: List[torch.Tensor],
        obs_with_actions: Tuple[List[List[torch.Tensor]], List[AgentAction]],
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
    ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
        """
        The POCA baseline marginalizes the action of the agent associated with
        obs_without_actions. It calls the forward pass of the MultiAgentNetworkBody
        with the state-action pairs of the groupmates but just the state of the
        agent in question.
        :param obs_without_actions: The obs of the agent for which to compute the baseline.
        :param obs_with_actions: Tuple of observations and actions for all groupmates.
        :param memories: If using memory, a Tensor of initial memories.
        :param sequence_length: If using memory, the sequence length.
        :return: A Tuple of Dict of reward stream to tensor and critic memories.
        """
        (obs, actions) = obs_with_actions
        encoding, memories = self.network_body(
            obs_only=[obs_without_actions],
            obs=obs,
            actions=actions,
            memories=memories,
            sequence_length=sequence_length,
        )
        value_outputs, critic_mem_out = self.forward(
            encoding, memories, sequence_length
        )
        return value_outputs, critic_mem_out

    def critic_pass(
        self,
        obs: List[List[torch.Tensor]],
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
    ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
        """
        A centralized value function. It calls the forward pass of
        MultiAgentNetworkBody with just the states of all agents.
        :param obs: List of observations for all agents in group.
        :param memories: If using memory, a Tensor of initial memories.
        :param sequence_length: If using memory, the sequence length.
        :return: A Tuple of Dict of reward stream to tensor and critic memories.
        """
        encoding, memories = self.network_body(
            obs_only=obs,
            obs=[],
            actions=[],
            memories=memories,
            sequence_length=sequence_length,
        )
        value_outputs, critic_mem_out = self.forward(
            encoding, memories, sequence_length
        )
        return value_outputs, critic_mem_out

    def forward(
        self,
        encoding: torch.Tensor,
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        output = self.value_heads(encoding)
        return output, memories
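# --- Illustrative usage sketch (not part of the trainer code) ------------------
# How the two entry points above differ, using the same tensor shapes as the
# tests earlier in this section. It reuses torch, NetworkSettings, ActionSpec,
# AgentAction and create_observation_specs_with_shapes from the surrounding
# imports; the "extrinsic" stream name is just an example.
def _poca_value_network_usage_sketch() -> None:
    obs_size, act_size, n_agents = 4, 2, 3
    network_settings = NetworkSettings()
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    obs_specs = create_observation_specs_with_shapes([(obs_size,)])
    critic = POCAValueNetwork(["extrinsic"], obs_specs, network_settings, action_spec)

    group_obs = [[0.1 * torch.ones((1, obs_size))] for _ in range(n_agents)]
    groupmate_acts = [
        AgentAction(
            0.1 * torch.ones((1, act_size)),
            [0.1 * torch.ones(1) for _ in range(act_size)],
        )
        for _ in range(n_agents - 1)
    ]

    # Centralized value: observations of every agent in the group.
    values, _ = critic.critic_pass(group_obs)

    # Baseline for agent 0: its own observations without an action, plus the
    # (obs, action) pairs of its groupmates.
    baselines, _ = critic.baseline(group_obs[0], (group_obs[1:], groupmate_acts))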