예제 #1
0
    def __init__(self, input_size, output_size, hyper_input_size, layer_size,
                 num_layers):
        """
        Hypernetwork that maps a conditioning tensor (the hyper input) to the
        weight matrix of a single fully connected main-network layer.
        :param input_size: Input width of the generated main-network layer.
        :param output_size: Output width of the generated main-network layer.
        :param hyper_input_size: Width of the tensor that conditions the
        generated weights.
        :param layer_size: Number of hidden units per hypernetwork layer.
        :param num_layers: Number of hidden layers in the hypernetwork.
        """
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size

        hidden_modules = []
        in_features = hyper_input_size
        for _ in range(num_layers):
            hidden_modules.append(
                linear_layer(
                    in_features,
                    layer_size,
                    kernel_init=Initialization.KaimingHeNormal,
                    kernel_gain=1.0,
                    bias_init=Initialization.Zero,
                ))
            hidden_modules.append(Swish())
            in_features = layer_size
        flat_output = linear_layer(
            layer_size,
            input_size * output_size,
            kernel_init=Initialization.KaimingHeNormal,
            kernel_gain=0.1,
            bias_init=Initialization.Zero,
        )

        # Re-initialize the last layer with a fan-in-scaled uniform bound so
        # the generated weights start small.
        bound = math.sqrt(1 / (layer_size * self.input_size))
        flat_output.weight.data.uniform_(-bound, bound)

        self.hypernet = torch.nn.Sequential(*hidden_modules, LayerNorm(), flat_output)

        # Only the weight matrix is generated; the main layer's bias is a
        # plain learned parameter.
        self.bias = torch.nn.Parameter(torch.zeros(output_size))
예제 #2
0
def test_initialization_layer():
    torch.manual_seed(0)
    # A layer built with Zero initialization must have all-zero parameters.
    layer = linear_layer(
        3, 4, kernel_init=Initialization.Zero, bias_init=Initialization.Zero
    )
    assert torch.equal(layer.weight.data, torch.zeros_like(layer.weight.data))
    assert torch.equal(layer.bias.data, torch.zeros_like(layer.bias.data))
예제 #3
0
    def __init__(self,
                 stream_names: List[str],
                 input_size: int,
                 output_size: int = 1):
        """
        Builds one independent linear value head per reward stream, keyed by
        stream name.
        """
        super().__init__()
        self.stream_names = stream_names
        self.value_heads = nn.ModuleDict(
            {name: linear_layer(input_size, output_size)
             for name in stream_names}
        )
예제 #4
0
 def _create_policy_branches(self, hidden_size: int) -> nn.ModuleList:
     """
     Create one output linear layer per discrete action branch, sized by
     the corresponding entry in self.act_sizes.
     """
     return nn.ModuleList(
         linear_layer(
             hidden_size,
             branch_size,
             kernel_init=Initialization.KaimingHeNormal,
             kernel_gain=0.1,
             bias_init=Initialization.Zero,
         )
         for branch_size in self.act_sizes
     )
예제 #5
0
    def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None:
        """
        GAIL discriminator. Encodes observations (optionally concatenated with
        flattened actions and a done flag) through a NetworkBody and outputs a
        value in (0, 1) via a sigmoid estimator head. When VAIL is enabled, a
        variational bottleneck of size z_size sits between encoder and
        estimator.
        :param specs: BehaviorSpec of the behavior being imitated.
        :param settings: GAIL-specific hyperparameters.
        """
        super().__init__()
        self._policy_specs = specs
        self._use_vail = settings.use_vail
        self._settings = settings

        # Fixed 2-layer simple visual encoder; normalization is disabled for
        # the discriminator.
        encoder_settings = NetworkSettings(
            normalize=False,
            hidden_units=settings.encoding_size,
            num_layers=2,
            vis_encode_type=EncoderType.SIMPLE,
            memory=None,
        )
        self._action_flattener = ModelUtils.ActionFlattener(specs)
        # The conditional covers the whole sum: (flattened actions + 1 done
        # flag) when use_actions is set, otherwise no extra input at all.
        unencoded_size = (
            self._action_flattener.flattened_size + 1 if settings.use_actions else 0
        )  # +1 is for dones
        self.encoder = NetworkBody(
            specs.observation_shapes, encoder_settings, unencoded_size
        )

        estimator_input_size = settings.encoding_size
        if settings.use_vail:
            # VAIL: the estimator consumes the sampled latent z instead of the
            # raw encoding. self.z_size and self.initial_beta are presumably
            # class-level attributes defined elsewhere — TODO confirm.
            estimator_input_size = self.z_size
            self._z_sigma = torch.nn.Parameter(
                torch.ones((self.z_size), dtype=torch.float), requires_grad=True
            )
            self._z_mu_layer = linear_layer(
                settings.encoding_size,
                self.z_size,
                kernel_init=Initialization.KaimingHeNormal,
                kernel_gain=0.1,
            )
            # Beta weights the KL bottleneck term; requires_grad=False since
            # it is updated outside of gradient descent.
            self._beta = torch.nn.Parameter(
                torch.tensor(self.initial_beta, dtype=torch.float), requires_grad=False
            )

        self._estimator = torch.nn.Sequential(
            linear_layer(estimator_input_size, 1), torch.nn.Sigmoid()
        )
예제 #6
0
 def __init__(self, height: int, width: int, initial_channels: int,
              output_size: int):
     """
     Fully connected encoder over a flattened visual observation: one
     linear layer followed by LeakyReLU.
     """
     super().__init__()
     self.output_size = output_size
     # Each observation is flattened to height * width * channels inputs.
     self.input_size = height * width * initial_channels
     flat_to_hidden = linear_layer(
         self.input_size,
         self.output_size,
         kernel_init=Initialization.KaimingHeNormal,
         kernel_gain=1.41,  # Use ReLU gain
     )
     self.dense = nn.Sequential(flat_to_hidden, nn.LeakyReLU())
예제 #7
0
    def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None:
        """
        GAIL discriminator. Encodes observations (optionally concatenated with
        flattened actions and a done flag) through a NetworkBody and outputs a
        value in (0, 1) via a sigmoid estimator head. When VAIL is enabled, a
        variational bottleneck of size z_size sits between encoder and
        estimator.
        :param specs: BehaviorSpec of the behavior being imitated.
        :param settings: GAIL-specific hyperparameters.
        """
        super().__init__()
        self._use_vail = settings.use_vail
        self._settings = settings

        encoder_settings = settings.network_settings
        # Memory (LSTM) is not supported here; silently dropping it would be
        # confusing, so warn the user.
        if encoder_settings.memory is not None:
            encoder_settings.memory = None
            logger.warning(
                "memory was specified in network_settings but is not supported by GAIL. It is being ignored."
            )

        self._action_flattener = ActionFlattener(specs.action_spec)
        # The conditional covers the whole sum: (flattened actions + 1 done
        # flag) when use_actions is set, otherwise no extra input at all.
        unencoded_size = (self._action_flattener.flattened_size +
                          1 if settings.use_actions else 0)  # +1 is for dones
        self.encoder = NetworkBody(specs.observation_specs, encoder_settings,
                                   unencoded_size)

        estimator_input_size = encoder_settings.hidden_units
        if settings.use_vail:
            # VAIL: the estimator consumes the sampled latent z instead of the
            # raw encoding. self.z_size and self.initial_beta are presumably
            # class-level attributes defined elsewhere — TODO confirm.
            estimator_input_size = self.z_size
            self._z_sigma = torch.nn.Parameter(torch.ones((self.z_size),
                                                          dtype=torch.float),
                                               requires_grad=True)
            self._z_mu_layer = linear_layer(
                encoder_settings.hidden_units,
                self.z_size,
                kernel_init=Initialization.KaimingHeNormal,
                kernel_gain=0.1,
            )
            # Beta weights the KL bottleneck term; requires_grad=False since
            # it is updated outside of gradient descent.
            self._beta = torch.nn.Parameter(torch.tensor(self.initial_beta,
                                                         dtype=torch.float),
                                            requires_grad=False)

        self._estimator = torch.nn.Sequential(
            linear_layer(estimator_input_size, 1, kernel_gain=0.2),
            torch.nn.Sigmoid())
예제 #8
0
    def __init__(
        self,
        embedding_size: int,
        entity_num_max_elements: List[int],
        num_heads: int = 4,
    ):
        """
        Residual self-attention block: multi-head attention over entity
        embeddings with identically initialized query/key/value/output
        projections.
        """
        super().__init__()
        self.entity_num_max_elements: List[int] = entity_num_max_elements
        self.max_num_ent = sum(entity_num_max_elements)
        self.attention = MultiHeadAttention(num_heads=num_heads,
                                            embedding_size=embedding_size)

        # All four projections share the same scaled-normal initialization.
        proj_gain = (0.125 / embedding_size) ** 0.5

        def _projection():
            return linear_layer(
                embedding_size,
                embedding_size,
                kernel_init=Initialization.Normal,
                kernel_gain=proj_gain,
            )

        self.fc_q = _projection()
        self.fc_k = _projection()
        self.fc_v = _projection()
        self.fc_out = _projection()
예제 #9
0
    def __init__(self, specs: BehaviorSpec,
                 settings: CuriositySettings) -> None:
        """
        Curiosity (ICM-style) module: a state encoder, an inverse model that
        predicts the action taken from two consecutive state encodings, and a
        forward model that predicts the next state encoding from the current
        encoding plus the flattened action.
        :param specs: BehaviorSpec providing observation and action specs.
        :param settings: Curiosity-specific hyperparameters.
        """
        super().__init__()
        self._action_spec = specs.action_spec

        state_encoder_settings = settings.network_settings
        # Memory (LSTM) is not supported here; warn rather than silently drop.
        if state_encoder_settings.memory is not None:
            state_encoder_settings.memory = None
            logger.warning(
                "memory was specified in network_settings but is not supported by Curiosity. It is being ignored."
            )

        self._state_encoder = NetworkBody(specs.observation_specs,
                                          state_encoder_settings)

        self._action_flattener = ActionFlattener(self._action_spec)

        # Inverse model trunk: takes the concatenation of two state encodings
        # (hence 2 * hidden_units) and maps them to a 256-wide feature.
        self.inverse_model_action_encoding = torch.nn.Sequential(
            LinearEncoder(2 * state_encoder_settings.hidden_units, 1, 256))

        # Separate prediction heads per action type; an action spec can have
        # both continuous and discrete components.
        if self._action_spec.continuous_size > 0:
            self.continuous_action_prediction = linear_layer(
                256, self._action_spec.continuous_size)
        if self._action_spec.discrete_size > 0:
            self.discrete_action_prediction = linear_layer(
                256, sum(self._action_spec.discrete_branches))

        # Forward model: (state encoding + flattened action) -> predicted
        # next state encoding.
        self.forward_model_next_state_prediction = torch.nn.Sequential(
            LinearEncoder(
                state_encoder_settings.hidden_units +
                self._action_flattener.flattened_size,
                1,
                256,
            ),
            linear_layer(256, state_encoder_settings.hidden_units),
        )
예제 #10
0
 def __init__(
     self,
     input_size: int,
     goal_size: int,
     hidden_size: int,
     num_layers: int,
     num_conditional_layers: int,
     kernel_init: Initialization = Initialization.KaimingHeNormal,
     kernel_gain: float = 1.0,
 ):
     """
     ConditionalEncoder module. A fully connected network of which some of the
     weights are generated by a goal conditioning. Uses the HyperNetwork module to
     generate the weights of the network. Only the weights of the last
     "num_conditional_layers" layers will be generated by HyperNetworks, the others
     will use regular parameters.
     :param input_size: The size of the input of the encoder
     :param goal_size: The size of the goal tensor that will condition the encoder
     :param hidden_size: The number of hidden units in the encoder
     :param num_layers: The total number of layers of the encoder (both regular and
     generated by HyperNetwork)
     :param num_conditional_layers: The number of layers generated with hypernetworks
     :param kernel_init: The Initialization to use for the weights of the layer
     :param kernel_gain: The multiplier for the weights of the kernel.
     """
     super().__init__()
     layers: List[torch.nn.Module] = []
     # The first layer also consumes the goal, hence input_size + goal_size.
     prev_size = input_size + goal_size
     for i in range(num_layers):
         if num_layers - i <= num_conditional_layers:
             # This means layer i is a conditional layer since the conditional
             # layers are the last num_conditional_layers
             layers.append(
                 HyperNetwork(prev_size, hidden_size, goal_size,
                              hidden_size, 2))
         else:
             # Regular (non-conditioned) fully connected layer.
             layers.append(
                 linear_layer(
                     prev_size,
                     hidden_size,
                     kernel_init=kernel_init,
                     kernel_gain=kernel_gain,
                 ))
         # Swish activation after every layer, conditional or not.
         layers.append(Swish())
         prev_size = hidden_size
     self.layers = torch.nn.ModuleList(layers)
예제 #11
0
def test_simple_transformer_training():
    # Trains a small attention stack to pick, for each query point, the
    # closest of n_k candidate keys, and checks that the loss both decreases
    # and converges below a threshold.
    np.random.seed(1336)
    torch.manual_seed(1336)
    size, n_k, = 3, 5
    embedding_size = 64
    entity_embeddings = EntityEmbeddings(size, [size], [n_k], embedding_size)
    transformer = ResidualSelfAttention(embedding_size, [n_k])
    l_layer = linear_layer(embedding_size, size)
    optimizer = torch.optim.Adam(list(transformer.parameters()) +
                                 list(l_layer.parameters()),
                                 lr=0.001)
    batch_size = 200
    point_range = 3
    init_error = -1.0
    for _ in range(250):
        # Random points uniformly drawn in [-point_range, point_range).
        center = torch.rand((batch_size, size)) * point_range * 2 - point_range
        key = torch.rand(
            (batch_size, n_k, size)) * point_range * 2 - point_range
        with torch.no_grad():
            # create the target : The key closest to the query in euclidean distance
            distance = torch.sum((center.reshape(
                (batch_size, 1, size)) - key)**2,
                                 dim=2)
            argmin = torch.argmin(distance, dim=1)
            target = []
            for i in range(batch_size):
                target += [key[i, argmin[i], :]]
            target = torch.stack(target, dim=0)
            target = target.detach()

        embeddings = entity_embeddings(center, [key])
        masks = EntityEmbeddings.get_masks([key])
        prediction = transformer.forward(embeddings, masks)
        prediction = l_layer(prediction)
        prediction = prediction.reshape((batch_size, size))
        # Half mean squared error between predicted and closest key.
        error = torch.mean((prediction - target)**2, dim=1)
        error = torch.mean(error) / 2
        if init_error == -1.0:
            init_error = error.item()
        else:
            # NOTE(review): asserting error < init_error on *every* step can
            # be flaky for noisy mini-batches; it holds here only because the
            # seeds are fixed.
            assert error.item() < init_error
        print(error.item())
        optimizer.zero_grad()
        error.backward()
        optimizer.step()
    assert error.item() < 0.3
예제 #12
0
def test_predict_closest_training():
    # Trains embedding + attention + linear head to regress, for each query
    # point, the closest of n_k candidate keys; asserts final convergence.
    np.random.seed(1336)
    torch.manual_seed(1336)
    size, n_k, = 3, 5
    embedding_size = 64
    entity_embeddings = EntityEmbedding(size, n_k, embedding_size)
    entity_embeddings.add_self_embedding(size)
    transformer = ResidualSelfAttention(embedding_size, n_k)
    l_layer = linear_layer(embedding_size, size)
    optimizer = torch.optim.Adam(
        list(entity_embeddings.parameters()) + list(transformer.parameters()) +
        list(l_layer.parameters()),
        lr=0.001,
        weight_decay=1e-6,
    )
    batch_size = 200
    for _ in range(200):
        # Random points uniformly drawn in [0, 1).
        center = torch.rand((batch_size, size))
        key = torch.rand((batch_size, n_k, size))
        with torch.no_grad():
            # create the target : The key closest to the query in euclidean distance
            distance = torch.sum((center.reshape(
                (batch_size, 1, size)) - key)**2,
                                 dim=2)
            argmin = torch.argmin(distance, dim=1)
            target = []
            for i in range(batch_size):
                target += [key[i, argmin[i], :]]
            target = torch.stack(target, dim=0)
            target = target.detach()

        embeddings = entity_embeddings(center, key)
        masks = get_zero_entities_mask([key])
        prediction = transformer.forward(embeddings, masks)
        prediction = l_layer(prediction)
        prediction = prediction.reshape((batch_size, size))
        # Half mean squared error between predicted and closest key.
        error = torch.mean((prediction - target)**2, dim=1)
        error = torch.mean(error) / 2
        print(error.item())
        optimizer.zero_grad()
        error.backward()
        optimizer.step()
    assert error.item() < 0.02
예제 #13
0
def test_all_masking(mask_value):
    # We make sure that a mask of all zeros or all ones will not trigger an error
    # (mask_value is parametrized by the caller; no accuracy assertion here —
    # the test passes if the forward/backward passes complete).
    np.random.seed(1336)
    torch.manual_seed(1336)
    size, n_k, = 3, 5
    embedding_size = 64
    entity_embeddings = EntityEmbedding(size, n_k, embedding_size)
    entity_embeddings.add_self_embedding(size)
    transformer = ResidualSelfAttention(embedding_size, n_k)
    l_layer = linear_layer(embedding_size, size)
    optimizer = torch.optim.Adam(
        list(entity_embeddings.parameters()) + list(transformer.parameters()) +
        list(l_layer.parameters()),
        lr=0.001,
        weight_decay=1e-6,
    )
    batch_size = 20
    for _ in range(5):
        center = torch.rand((batch_size, size))
        key = torch.rand((batch_size, n_k, size))
        with torch.no_grad():
            # create the target : The key closest to the query in euclidean distance
            distance = torch.sum((center.reshape(
                (batch_size, 1, size)) - key)**2,
                                 dim=2)
            argmin = torch.argmin(distance, dim=1)
            target = []
            for i in range(batch_size):
                target += [key[i, argmin[i], :]]
            target = torch.stack(target, dim=0)
            target = target.detach()

        embeddings = entity_embeddings(center, key)
        # Constant mask (all zeros or all ones depending on mask_value).
        masks = [torch.ones_like(key[:, :, 0]) * mask_value]
        prediction = transformer.forward(embeddings, masks)
        prediction = l_layer(prediction)
        prediction = prediction.reshape((batch_size, size))
        error = torch.mean((prediction - target)**2, dim=1)
        error = torch.mean(error) / 2
        optimizer.zero_grad()
        error.backward()
        optimizer.step()
예제 #14
0
 def __init__(self, height, width, initial_channels, final_hidden):
     """
     ResNet-style visual encoder: three conv/pool stacks, each followed by
     residual blocks, then a Swish activation and a dense projection to
     final_hidden.
     :param height: Input image height.
     :param width: Input image width.
     :param initial_channels: Number of input channels.
     :param final_hidden: Width of the final dense projection.
     """
     super().__init__()
     n_channels = [16, 32, 32]  # channel for each stack
     n_blocks = 2  # number of residual blocks
     layers = []
     last_channel = initial_channels
     for channel in n_channels:
         layers.append(
             nn.Conv2d(last_channel, channel, [3, 3], [1, 1], padding=1))
         layers.append(nn.MaxPool2d([3, 3], [2, 2]))
         # Track the spatial size through each pooling stage for the final
         # flatten.
         height, width = pool_out_shape((height, width), 3)
         for _ in range(n_blocks):
             layers.append(ResNetBlock(channel))
         last_channel = channel
     layers.append(Swish())
     # Bug fix: a plain Python list does not register its contents as
     # submodules, so the conv/residual parameters were invisible to
     # .parameters(), .to(), and state_dict(). nn.ModuleList registers them
     # while still iterating/indexing like a list.
     self.layers = nn.ModuleList(layers)
     self.dense = linear_layer(
         n_channels[-1] * height * width,
         final_hidden,
         kernel_init=Initialization.KaimingHeNormal,
         kernel_gain=1.0,
     )
예제 #15
0
    def __init__(self, height: int, width: int, initial_channels: int,
                 output_size: int):
        """
        Two-layer CNN encoder (8x8/stride 4 then 4x4/stride 2) followed by a
        dense projection to output_size, with LeakyReLU activations.
        """
        super().__init__()
        self.h_size = output_size
        # Track the spatial size through the two strided convolutions to
        # compute the flattened width.
        hw_after_conv1 = conv_output_shape((height, width), 8, 4)
        hw_after_conv2 = conv_output_shape(hw_after_conv1, 4, 2)
        self.final_flat = hw_after_conv2[0] * hw_after_conv2[1] * 32

        self.conv_layers = nn.Sequential(
            nn.Conv2d(initial_channels, 16, [8, 8], [4, 4]),
            nn.LeakyReLU(),
            nn.Conv2d(16, 32, [4, 4], [2, 2]),
            nn.LeakyReLU(),
        )
        projection = linear_layer(
            self.final_flat,
            self.h_size,
            kernel_init=Initialization.KaimingHeNormal,
            kernel_gain=1.0,
        )
        self.dense = nn.Sequential(projection, nn.LeakyReLU())
예제 #16
0
    def __init__(self, height: int, width: int, initial_channels: int,
                 output_size: int):
        """
        Two-layer CNN encoder with 3x3/stride-1 convolutions followed by a
        dense projection to output_size, with LeakyReLU activations.
        """
        super().__init__()
        self.h_size = output_size
        # Track the spatial size through the two convolutions to compute the
        # flattened width.
        hw_after_conv1 = conv_output_shape((height, width), 3, 1)
        hw_after_conv2 = conv_output_shape(hw_after_conv1, 3, 1)
        self.final_flat = hw_after_conv2[0] * hw_after_conv2[1] * 144

        self.conv_layers = nn.Sequential(
            nn.Conv2d(initial_channels, 35, [3, 3], [1, 1]),
            nn.LeakyReLU(),
            nn.Conv2d(35, 144, [3, 3], [1, 1]),
            nn.LeakyReLU(),
        )
        projection = linear_layer(
            self.final_flat,
            self.h_size,
            kernel_init=Initialization.KaimingHeNormal,
            kernel_gain=1.41,  # Use ReLU gain
        )
        self.dense = nn.Sequential(projection, nn.LeakyReLU())
예제 #17
0
 def __init__(self, height: int, width: int, initial_channels: int,
              output_size: int):
     """
     ResNet-style visual encoder: three conv/pool stacks with residual
     blocks, a Swish activation, and a dense projection to output_size.
     """
     super().__init__()
     stack_channels = [16, 32, 32]  # channel for each stack
     blocks_per_stack = 2  # number of residual blocks
     module_list = []
     in_channels = initial_channels
     for out_channels in stack_channels:
         module_list.append(
             nn.Conv2d(in_channels, out_channels, [3, 3], [1, 1], padding=1))
         module_list.append(nn.MaxPool2d([3, 3], [2, 2]))
         # Track spatial size through each pooling stage for the final
         # flatten.
         height, width = pool_out_shape((height, width), 3)
         for _ in range(blocks_per_stack):
             module_list.append(ResNetBlock(out_channels))
         in_channels = out_channels
     module_list.append(Swish())
     self.dense = linear_layer(
         stack_channels[-1] * height * width,
         output_size,
         kernel_init=Initialization.KaimingHeNormal,
         kernel_gain=1.41,  # Use ReLU gain
     )
     self.sequential = nn.Sequential(*module_list)