Example #1
@pytest.mark.parametrize("with_actions", [True, False])
def test_multinetworkbody_lstm(with_actions):
    torch.manual_seed(0)
    obs_size = 4
    act_size = 2
    seq_len = 16
    n_agents = 3
    network_settings = NetworkSettings(memory=NetworkSettings.MemorySettings(
        sequence_length=seq_len, memory_size=12))

    obs_shapes = [(obs_size, )]
    action_spec = ActionSpec(act_size,
                             tuple(act_size for _ in range(act_size)))
    networkbody = MultiAgentNetworkBody(
        create_observation_specs_with_shapes(obs_shapes), network_settings,
        action_spec)
    optimizer = torch.optim.Adam(networkbody.parameters(), lr=3e-4)
    sample_obs = [[0.1 * torch.ones((seq_len, obs_size))]
                  for _ in range(n_agents)]
    # simulate baseline in POCA
    sample_act = [
        AgentAction(
            0.1 * torch.ones((seq_len, 2)),
            [0.1 * torch.ones(seq_len) for _ in range(act_size)],
        ) for _ in range(n_agents - 1)
    ]

    for _ in range(300):
        if with_actions:
            encoded, _ = networkbody(
                obs_only=sample_obs[:1],
                obs=sample_obs[1:],
                actions=sample_act,
                memories=torch.ones(1, 1, 12),
                sequence_length=seq_len,
            )
        else:
            encoded, _ = networkbody(
                obs_only=sample_obs,
                obs=[],
                actions=[],
                memories=torch.ones(1, 1, 12),
                sequence_length=seq_len,
            )
        # Try to force output to 1
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # In the last step, values should be close to 1
    for _enc in encoded.flatten().tolist():
        assert _enc == pytest.approx(1.0, abs=0.1)
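Note: the hard-coded memory tensor torch.ones(1, 1, 12) above has to agree with the memory_size=12 configured in NetworkSettings.MemorySettings. A minimal sketch of deriving it from the settings instead of repeating the literal, assuming the network_settings object defined in this test:

# Sketch: build the initial memories from the configured memory size rather than hard-coding 12.
mem_size = network_settings.memory.memory_size  # 12 in this example
initial_memories = torch.ones(1, 1, mem_size)   # same shape as the literal used above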
Example #2
@pytest.mark.parametrize("with_actions", [True, False])
def test_multinetworkbody_num_agents(with_actions):
    torch.manual_seed(0)
    act_size = 2
    obs_size = 4
    network_settings = NetworkSettings()
    obs_shapes = [(obs_size,)]
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    networkbody = MultiAgentNetworkBody(
        create_observation_specs_with_shapes(obs_shapes), network_settings, action_spec
    )
    sample_obs = [[0.1 * torch.ones((1, obs_size))]]
    # simulate baseline in POCA
    sample_act = [
        AgentAction(
            0.1 * torch.ones((1, 2)), [0.1 * torch.ones(1) for _ in range(act_size)]
        )
    ]
    for n_agent, max_so_far in [(1, 1), (5, 5), (4, 5), (10, 10), (5, 10), (1, 10)]:
        if with_actions:
            encoded, _ = networkbody(
                obs_only=sample_obs * (n_agent - 1), obs=sample_obs, actions=sample_act
            )
        else:
            encoded, _ = networkbody(obs_only=sample_obs * n_agent, obs=[], actions=[])
        # look at the last value of the hidden units (the number of agents)
        target = (n_agent * 1.0 / max_so_far) * 2 - 1
        assert abs(encoded[0, -1].item() - target) < 1e-6
        assert encoded[0, -1].item() <= 1
        assert encoded[0, -1].item() >= -1
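The final assertions check that the last hidden unit encodes the agent count, normalized by the running maximum and rescaled from [0, 1] to [-1, 1]; for n_agent=4 and max_so_far=5 the expected value is (4 / 5) * 2 - 1 = 0.6. A small sketch of that formula (the helper name is illustrative, not part of ml-agents):

def expected_agent_count_feature(n_agent: int, max_so_far: int) -> float:
    # Mirrors the `target` computation in the test: rescale n_agent / max_so_far to [-1, 1].
    return (n_agent * 1.0 / max_so_far) * 2 - 1

assert abs(expected_agent_count_feature(4, 5) - 0.6) < 1e-9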
Example #3
        # Constructor of POCAValueNetwork; the full class appears in Example #5.
        def __init__(
            self,
            stream_names: List[str],
            observation_specs: List[ObservationSpec],
            network_settings: NetworkSettings,
            action_spec: ActionSpec,
        ):
            torch.nn.Module.__init__(self)
            self.network_body = MultiAgentNetworkBody(observation_specs,
                                                      network_settings,
                                                      action_spec)
            if network_settings.memory is not None:
                encoding_size = network_settings.memory.memory_size // 2
            else:
                encoding_size = network_settings.hidden_units

            self.value_heads = ValueHeads(stream_names, encoding_size, 1)
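The branch above selects the size of the encoding fed to the value heads: with a memory (LSTM) configuration only half of memory_size is used, presumably because the memory tensor carries both halves of the recurrent state; otherwise the dense hidden_units size applies. A small sketch of the same branch (the helper name is illustrative):

def poca_encoding_size(settings: NetworkSettings) -> int:
    # Mirrors the constructor logic above.
    if settings.memory is not None:
        return settings.memory.memory_size // 2
    return settings.hidden_units

assert poca_encoding_size(
    NetworkSettings(memory=NetworkSettings.MemorySettings(memory_size=12))
) == 6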
Example #4
@pytest.mark.parametrize("with_actions", [True, False])
def test_multinetworkbody_visual(with_actions):
    torch.manual_seed(0)
    act_size = 2
    n_agents = 3
    obs_size = 4
    vis_obs_size = (84, 84, 3)
    network_settings = NetworkSettings()
    obs_shapes = [(obs_size, ), vis_obs_size]
    action_spec = ActionSpec(act_size,
                             tuple(act_size for _ in range(act_size)))
    networkbody = MultiAgentNetworkBody(
        create_observation_specs_with_shapes(obs_shapes), network_settings,
        action_spec)
    optimizer = torch.optim.Adam(networkbody.parameters(), lr=3e-3)
    sample_obs = [[0.1 * torch.ones(
        (1, obs_size))] + [0.1 * torch.ones((1, 84, 84, 3))]
                  for _ in range(n_agents)]
    # simulate baseline in POCA
    sample_act = [
        AgentAction(0.1 * torch.ones((1, 2)),
                    [0.1 * torch.ones(1) for _ in range(act_size)])
        for _ in range(n_agents - 1)
    ]
    for _ in range(300):
        if with_actions:
            encoded, _ = networkbody(obs_only=sample_obs[:1],
                                     obs=sample_obs[1:],
                                     actions=sample_act)
        else:
            encoded, _ = networkbody(obs_only=sample_obs, obs=[], actions=[])

        assert encoded.shape == (1, network_settings.hidden_units)
        # Try to force output to 1
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # In the last step, values should be close to 1
    for _enc in encoded.flatten().tolist():
        assert _enc == pytest.approx(1.0, abs=0.1)
Example #5
    class POCAValueNetwork(torch.nn.Module, Critic):
        """
        The POCAValueNetwork uses the MultiAgentNetworkBody to compute the value
        and POCA baseline for a variable number of agents in a group that all
        share the same observation and action space.
        """
        def __init__(
            self,
            stream_names: List[str],
            observation_specs: List[ObservationSpec],
            network_settings: NetworkSettings,
            action_spec: ActionSpec,
        ):
            torch.nn.Module.__init__(self)
            self.network_body = MultiAgentNetworkBody(observation_specs,
                                                      network_settings,
                                                      action_spec)
            if network_settings.memory is not None:
                encoding_size = network_settings.memory.memory_size // 2
            else:
                encoding_size = network_settings.hidden_units

            self.value_heads = ValueHeads(stream_names, encoding_size, 1)

        @property
        def memory_size(self) -> int:
            return self.network_body.memory_size

        def update_normalization(self, buffer: AgentBuffer) -> None:
            self.network_body.update_normalization(buffer)

        def baseline(
            self,
            obs_without_actions: List[torch.Tensor],
            obs_with_actions: Tuple[List[List[torch.Tensor]],
                                    List[AgentAction]],
            memories: Optional[torch.Tensor] = None,
            sequence_length: int = 1,
        ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
            """
            The POCA baseline marginalizes the action of the agent associated with self_obs.
            It calls the forward pass of the MultiAgentNetworkBody with the state action
            pairs of groupmates but just the state of the agent in question.
            :param obs_without_actions: The obs of the agent for which to compute the baseline.
            :param obs_with_actions: Tuple of observations and actions for all groupmates.
            :param memories: If using memory, a Tensor of initial memories.
            :param sequence_length: If using memory, the sequence length.

            :return: A Tuple of Dict of reward stream to tensor and critic memories.
            """
            (obs, actions) = obs_with_actions
            encoding, memories = self.network_body(
                obs_only=[obs_without_actions],
                obs=obs,
                actions=actions,
                memories=memories,
                sequence_length=sequence_length,
            )
            value_outputs, critic_mem_out = self.forward(
                encoding, memories, sequence_length)
            return value_outputs, critic_mem_out

        def critic_pass(
            self,
            obs: List[List[torch.Tensor]],
            memories: Optional[torch.Tensor] = None,
            sequence_length: int = 1,
        ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:
            """
            A centralized value function. It calls the forward pass of MultiAgentNetworkBody
            with just the states of all agents.
            :param obs: List of observations for all agents in group
            :param memories: If using memory, a Tensor of initial memories.
            :param sequence_length: If using memory, the sequence length.
            :return: A Tuple of Dict of reward stream to tensor and critic memories.
            """
            encoding, memories = self.network_body(
                obs_only=obs,
                obs=[],
                actions=[],
                memories=memories,
                sequence_length=sequence_length,
            )
            value_outputs, critic_mem_out = self.forward(
                encoding, memories, sequence_length)
            return value_outputs, critic_mem_out

        def forward(
            self,
            encoding: torch.Tensor,
            memories: Optional[torch.Tensor] = None,
            sequence_length: int = 1,
        ) -> Tuple[torch.Tensor, torch.Tensor]:

            # Apply the value heads to the encoded group observation; memories pass through unchanged.
            output = self.value_heads(encoding)
            return output, memories
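
For reference, a minimal usage sketch of POCAValueNetwork, reusing the test helpers from the earlier examples; the "extrinsic" stream name, the observation shape, and the group size are illustrative assumptions:

# Usage sketch (stream name, shapes, and group size are illustrative assumptions).
obs_specs = create_observation_specs_with_shapes([(4,)])
action_spec = ActionSpec(2, (2, 2))
critic = POCAValueNetwork(["extrinsic"], obs_specs, NetworkSettings(), action_spec)

group_obs = [[0.1 * torch.ones((1, 4))] for _ in range(3)]  # one observation list per agent
values, _ = critic.critic_pass(group_obs)                   # centralized value for the whole group
extrinsic_value = values["extrinsic"]                       # one tensor per reward stream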