def __init__(self, input_size, output_size, hyper_input_size, layer_size, num_layers):
    """
    Hyper Network module. This module will use the hyper_input tensor to generate
    the weights of the main network. The main network is a single fully connected
    layer.
    :param input_size: The size of the input of the main network
    :param output_size: The size of the output of the main network
    :param hyper_input_size: The size of the input of the hypernetwork that will
        generate the main network.
    :param layer_size: The number of hidden units in the layers of the hypernetwork
    :param num_layers: The number of layers of the hypernetwork
    """
    super().__init__()
    self.input_size = input_size
    self.output_size = output_size
    layer_in_size = hyper_input_size
    layers = []
    for _ in range(num_layers):
        layers.append(
            linear_layer(
                layer_in_size,
                layer_size,
                kernel_init=Initialization.KaimingHeNormal,
                kernel_gain=1.0,
                bias_init=Initialization.Zero,
            )
        )
        layers.append(Swish())
        layer_in_size = layer_size
    flat_output = linear_layer(
        layer_size,
        input_size * output_size,
        kernel_init=Initialization.KaimingHeNormal,
        kernel_gain=0.1,
        bias_init=Initialization.Zero,
    )
    # Re-initializing the weights of the last layer of the hypernetwork
    bound = math.sqrt(1 / (layer_size * self.input_size))
    flat_output.weight.data.uniform_(-bound, bound)
    self.hypernet = torch.nn.Sequential(*layers, LayerNorm(), flat_output)
    # The hypernetwork will not generate the bias of the main network layer
    self.bias = torch.nn.Parameter(torch.zeros(output_size))
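A minimal sketch of how such generated weights could be consumed, assuming the module's forward receives both the main-network input and the hyper input; the reshape/bmm pattern below is an assumption, not the module's actual code:

# Hypothetical forward pass for the module above: reshape the flat weights to
# a per-sample weight matrix and batch-multiply with the main-network input.
def hypernetwork_forward_sketch(module, input_activation, hyper_input):
    flat_weights = module.hypernet(hyper_input)  # (batch, input_size * output_size)
    weights = flat_weights.reshape(-1, module.input_size, module.output_size)
    # (batch, 1, input_size) @ (batch, input_size, output_size) -> (batch, output_size)
    return torch.bmm(input_activation.unsqueeze(1), weights).squeeze(1) + module.bias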
def test_initialization_layer():
    torch.manual_seed(0)
    # Test Zero
    layer = linear_layer(
        3, 4, kernel_init=Initialization.Zero, bias_init=Initialization.Zero
    )
    assert torch.all(torch.eq(layer.weight.data, torch.zeros_like(layer.weight.data)))
    assert torch.all(torch.eq(layer.bias.data, torch.zeros_like(layer.bias.data)))
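For context, a rough sketch of what a linear_layer factory like the one under test might do; the behavior and flag names below are assumptions inferred from the calls in this section, not reproduced from the real helper:

# Hypothetical factory: wraps nn.Linear and applies the requested
# weight/bias initialization (boolean flags stand in for the Initialization enum).
def linear_layer_sketch(
    input_size: int,
    output_size: int,
    zero_weights: bool = False,
    kernel_gain: float = 1.0,
    zero_bias: bool = True,
) -> torch.nn.Linear:
    layer = torch.nn.Linear(input_size, output_size)
    if zero_weights:
        layer.weight.data.zero_()
    else:
        torch.nn.init.kaiming_normal_(layer.weight.data)
        layer.weight.data *= kernel_gain
    if zero_bias:
        layer.bias.data.zero_()
    return layer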
def __init__(self, stream_names: List[str], input_size: int, output_size: int = 1):
    super().__init__()
    self.stream_names = stream_names
    _value_heads = {}
    for name in stream_names:
        value = linear_layer(input_size, output_size)
        _value_heads[name] = value
    self.value_heads = nn.ModuleDict(_value_heads)
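A hedged sketch of the matching forward pass (assumed, not shown above): each named head maps the shared hidden state to one value estimate per reward stream.

# Hypothetical forward for the value heads above.
def value_heads_forward_sketch(value_heads: nn.ModuleDict, hidden: torch.Tensor):
    return {stream_name: head(hidden) for stream_name, head in value_heads.items()}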
def _create_policy_branches(self, hidden_size: int) -> nn.ModuleList:
    branches = []
    for size in self.act_sizes:
        branch_output_layer = linear_layer(
            hidden_size,
            size,
            kernel_init=Initialization.KaimingHeNormal,
            kernel_gain=0.1,
            bias_init=Initialization.Zero,
        )
        branches.append(branch_output_layer)
    return nn.ModuleList(branches)
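A short usage sketch (an assumption, not the class's actual code): each branch produces the logits for one discrete action branch from the same hidden state.

# Hypothetical consumption of the branches above: one logits tensor per branch.
def branch_logits_sketch(branches: nn.ModuleList, hidden: torch.Tensor):
    return [branch(hidden) for branch in branches]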
def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None:
    super().__init__()
    self._policy_specs = specs
    self._use_vail = settings.use_vail
    self._settings = settings

    encoder_settings = NetworkSettings(
        normalize=False,
        hidden_units=settings.encoding_size,
        num_layers=2,
        vis_encode_type=EncoderType.SIMPLE,
        memory=None,
    )
    self._action_flattener = ModelUtils.ActionFlattener(specs)
    unencoded_size = (
        self._action_flattener.flattened_size + 1 if settings.use_actions else 0
    )  # +1 is for dones
    self.encoder = NetworkBody(
        specs.observation_shapes, encoder_settings, unencoded_size
    )

    estimator_input_size = settings.encoding_size
    if settings.use_vail:
        estimator_input_size = self.z_size
        self._z_sigma = torch.nn.Parameter(
            torch.ones((self.z_size), dtype=torch.float), requires_grad=True
        )
        self._z_mu_layer = linear_layer(
            settings.encoding_size,
            self.z_size,
            kernel_init=Initialization.KaimingHeNormal,
            kernel_gain=0.1,
        )
        self._beta = torch.nn.Parameter(
            torch.tensor(self.initial_beta, dtype=torch.float), requires_grad=False
        )

    self._estimator = torch.nn.Sequential(
        linear_layer(estimator_input_size, 1), torch.nn.Sigmoid()
    )
def __init__(self, height: int, width: int, initial_channels: int, output_size: int):
    super().__init__()
    self.output_size = output_size
    self.input_size = height * width * initial_channels
    self.dense = nn.Sequential(
        linear_layer(
            self.input_size,
            self.output_size,
            kernel_init=Initialization.KaimingHeNormal,
            kernel_gain=1.41,  # Use ReLU gain
        ),
        nn.LeakyReLU(),
    )
def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None:
    super().__init__()
    self._use_vail = settings.use_vail
    self._settings = settings

    encoder_settings = settings.network_settings
    if encoder_settings.memory is not None:
        encoder_settings.memory = None
        logger.warning(
            "memory was specified in network_settings but is not supported by GAIL. It is being ignored."
        )

    self._action_flattener = ActionFlattener(specs.action_spec)
    unencoded_size = (
        self._action_flattener.flattened_size + 1 if settings.use_actions else 0
    )  # +1 is for dones
    self.encoder = NetworkBody(
        specs.observation_specs, encoder_settings, unencoded_size
    )

    estimator_input_size = encoder_settings.hidden_units
    if settings.use_vail:
        estimator_input_size = self.z_size
        self._z_sigma = torch.nn.Parameter(
            torch.ones((self.z_size), dtype=torch.float), requires_grad=True
        )
        self._z_mu_layer = linear_layer(
            encoder_settings.hidden_units,
            self.z_size,
            kernel_init=Initialization.KaimingHeNormal,
            kernel_gain=0.1,
        )
        self._beta = torch.nn.Parameter(
            torch.tensor(self.initial_beta, dtype=torch.float), requires_grad=False
        )

    self._estimator = torch.nn.Sequential(
        linear_layer(estimator_input_size, 1, kernel_gain=0.2), torch.nn.Sigmoid()
    )
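A minimal sketch of the variational bottleneck implied by the _z_mu_layer / _z_sigma pair, assuming the usual VAIL reparameterization (mean from the encoder output, learned per-dimension sigma); this is an assumption, not the class's actual method:

# Hypothetical VAIL latent sampling for the discriminator above.
def sample_z_sketch(module, hidden: torch.Tensor, use_noise: bool = True) -> torch.Tensor:
    z_mu = module._z_mu_layer(hidden)
    noise = torch.randn_like(z_mu) if use_noise else torch.zeros_like(z_mu)
    # Reparameterization trick: z = mu + sigma * eps
    return z_mu + module._z_sigma * noise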
def __init__(
    self,
    embedding_size: int,
    entity_num_max_elements: List[int],
    num_heads: int = 4,
):
    super().__init__()
    self.entity_num_max_elements: List[int] = entity_num_max_elements
    self.max_num_ent = sum(entity_num_max_elements)
    self.attention = MultiHeadAttention(
        num_heads=num_heads, embedding_size=embedding_size
    )
    self.fc_q = linear_layer(
        embedding_size,
        embedding_size,
        kernel_init=Initialization.Normal,
        kernel_gain=(0.125 / embedding_size) ** 0.5,
    )
    self.fc_k = linear_layer(
        embedding_size,
        embedding_size,
        kernel_init=Initialization.Normal,
        kernel_gain=(0.125 / embedding_size) ** 0.5,
    )
    self.fc_v = linear_layer(
        embedding_size,
        embedding_size,
        kernel_init=Initialization.Normal,
        kernel_gain=(0.125 / embedding_size) ** 0.5,
    )
    self.fc_out = linear_layer(
        embedding_size,
        embedding_size,
        kernel_init=Initialization.Normal,
        kernel_gain=(0.125 / embedding_size) ** 0.5,
    )
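For intuition, a self-contained single-head sketch of the scaled dot-product step these Q/K/V projections feed; the real module delegates to MultiHeadAttention, whose signature is not shown here, so treat this as an illustration only:

# Hypothetical single-head attention using the projections above, with the
# residual connection that gives the module its name.
def residual_attention_sketch(module, inp: torch.Tensor, embedding_size: int) -> torch.Tensor:
    q, k, v = module.fc_q(inp), module.fc_k(inp), module.fc_v(inp)
    # (batch, n_ent, n_ent) attention scores, scaled by sqrt(embedding_size)
    scores = torch.matmul(q, k.transpose(-2, -1)) / embedding_size ** 0.5
    weights = torch.softmax(scores, dim=-1)
    return module.fc_out(torch.matmul(weights, v)) + inp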
def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None:
    super().__init__()
    self._action_spec = specs.action_spec

    state_encoder_settings = settings.network_settings
    if state_encoder_settings.memory is not None:
        state_encoder_settings.memory = None
        logger.warning(
            "memory was specified in network_settings but is not supported by Curiosity. It is being ignored."
        )

    self._state_encoder = NetworkBody(specs.observation_specs, state_encoder_settings)
    self._action_flattener = ActionFlattener(self._action_spec)

    self.inverse_model_action_encoding = torch.nn.Sequential(
        LinearEncoder(2 * state_encoder_settings.hidden_units, 1, 256)
    )
    if self._action_spec.continuous_size > 0:
        self.continuous_action_prediction = linear_layer(
            256, self._action_spec.continuous_size
        )
    if self._action_spec.discrete_size > 0:
        self.discrete_action_prediction = linear_layer(
            256, sum(self._action_spec.discrete_branches)
        )

    self.forward_model_next_state_prediction = torch.nn.Sequential(
        LinearEncoder(
            state_encoder_settings.hidden_units
            + self._action_flattener.flattened_size,
            1,
            256,
        ),
        linear_layer(256, state_encoder_settings.hidden_units),
    )
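A hedged sketch of the inverse-model path these layers imply (assumed wiring, not the class's actual method): encode consecutive states, concatenate, and predict the action taken in between.

# Hypothetical inverse-model prediction for the curiosity module above
# (continuous branch only, for brevity).
def predict_action_sketch(module, state_enc: torch.Tensor, next_state_enc: torch.Tensor) -> torch.Tensor:
    inverse_input = torch.cat([state_enc, next_state_enc], dim=1)
    hidden = module.inverse_model_action_encoding(inverse_input)
    return module.continuous_action_prediction(hidden)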
def __init__(
    self,
    input_size: int,
    goal_size: int,
    hidden_size: int,
    num_layers: int,
    num_conditional_layers: int,
    kernel_init: Initialization = Initialization.KaimingHeNormal,
    kernel_gain: float = 1.0,
):
    """
    ConditionalEncoder module. A fully connected network in which some of the
    weights are generated by a goal conditioning. Uses the HyperNetwork module to
    generate the weights of the network. Only the weights of the last
    "num_conditional_layers" layers will be generated by HyperNetworks, the others
    will use regular parameters.
    :param input_size: The size of the input of the encoder
    :param goal_size: The size of the goal tensor that will condition the encoder
    :param hidden_size: The number of hidden units in the encoder
    :param num_layers: The total number of layers of the encoder (both regular and
        generated by HyperNetwork)
    :param num_conditional_layers: The number of layers generated with hypernetworks
    :param kernel_init: The Initialization to use for the weights of the layer
    :param kernel_gain: The multiplier for the weights of the kernel.
    """
    super().__init__()
    layers: List[torch.nn.Module] = []
    prev_size = input_size + goal_size
    for i in range(num_layers):
        if num_layers - i <= num_conditional_layers:
            # Layer i is a conditional layer, since the conditional layers are
            # the last num_conditional_layers layers
            layers.append(
                HyperNetwork(prev_size, hidden_size, goal_size, hidden_size, 2)
            )
        else:
            layers.append(
                linear_layer(
                    prev_size,
                    hidden_size,
                    kernel_init=kernel_init,
                    kernel_gain=kernel_gain,
                )
            )
        layers.append(Swish())
        prev_size = hidden_size
    self.layers = torch.nn.ModuleList(layers)
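A sketch of how this mixed layer list might be consumed in forward, assuming HyperNetwork layers take the goal as a second argument while plain layers do not (the call signature is an assumption):

# Hypothetical forward for the conditional encoder above.
def conditional_forward_sketch(module, input_tensor: torch.Tensor, goal_tensor: torch.Tensor) -> torch.Tensor:
    activation = torch.cat([input_tensor, goal_tensor], dim=-1)
    for layer in module.layers:
        if isinstance(layer, HyperNetwork):
            activation = layer(activation, goal_tensor)
        else:
            activation = layer(activation)
    return activation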
def test_simple_transformer_training():
    np.random.seed(1336)
    torch.manual_seed(1336)
    size, n_k = 3, 5
    embedding_size = 64
    entity_embeddings = EntityEmbeddings(size, [size], [n_k], embedding_size)
    transformer = ResidualSelfAttention(embedding_size, [n_k])
    l_layer = linear_layer(embedding_size, size)
    optimizer = torch.optim.Adam(
        list(transformer.parameters()) + list(l_layer.parameters()), lr=0.001
    )
    batch_size = 200
    point_range = 3
    init_error = -1.0
    for _ in range(250):
        center = torch.rand((batch_size, size)) * point_range * 2 - point_range
        key = torch.rand((batch_size, n_k, size)) * point_range * 2 - point_range
        with torch.no_grad():
            # Create the target: the key closest to the query in Euclidean distance
            distance = torch.sum(
                (center.reshape((batch_size, 1, size)) - key) ** 2, dim=2
            )
            argmin = torch.argmin(distance, dim=1)
            target = []
            for i in range(batch_size):
                target += [key[i, argmin[i], :]]
            target = torch.stack(target, dim=0)
            target = target.detach()

        embeddings = entity_embeddings(center, [key])
        masks = EntityEmbeddings.get_masks([key])
        prediction = transformer.forward(embeddings, masks)
        prediction = l_layer(prediction)
        prediction = prediction.reshape((batch_size, size))
        error = torch.mean((prediction - target) ** 2, dim=1)
        error = torch.mean(error) / 2
        if init_error == -1.0:
            init_error = error.item()
        else:
            assert error.item() < init_error
        print(error.item())
        optimizer.zero_grad()
        error.backward()
        optimizer.step()
    assert error.item() < 0.3
def test_predict_closest_training():
    np.random.seed(1336)
    torch.manual_seed(1336)
    size, n_k = 3, 5
    embedding_size = 64
    entity_embeddings = EntityEmbedding(size, n_k, embedding_size)
    entity_embeddings.add_self_embedding(size)
    transformer = ResidualSelfAttention(embedding_size, n_k)
    l_layer = linear_layer(embedding_size, size)
    optimizer = torch.optim.Adam(
        list(entity_embeddings.parameters())
        + list(transformer.parameters())
        + list(l_layer.parameters()),
        lr=0.001,
        weight_decay=1e-6,
    )
    batch_size = 200
    for _ in range(200):
        center = torch.rand((batch_size, size))
        key = torch.rand((batch_size, n_k, size))
        with torch.no_grad():
            # Create the target: the key closest to the query in Euclidean distance
            distance = torch.sum(
                (center.reshape((batch_size, 1, size)) - key) ** 2, dim=2
            )
            argmin = torch.argmin(distance, dim=1)
            target = []
            for i in range(batch_size):
                target += [key[i, argmin[i], :]]
            target = torch.stack(target, dim=0)
            target = target.detach()

        embeddings = entity_embeddings(center, key)
        masks = get_zero_entities_mask([key])
        prediction = transformer.forward(embeddings, masks)
        prediction = l_layer(prediction)
        prediction = prediction.reshape((batch_size, size))
        error = torch.mean((prediction - target) ** 2, dim=1)
        error = torch.mean(error) / 2
        print(error.item())
        optimizer.zero_grad()
        error.backward()
        optimizer.step()
    assert error.item() < 0.02
def test_all_masking(mask_value):
    # We make sure that a mask of all zeros or all ones will not trigger an error
    np.random.seed(1336)
    torch.manual_seed(1336)
    size, n_k = 3, 5
    embedding_size = 64
    entity_embeddings = EntityEmbedding(size, n_k, embedding_size)
    entity_embeddings.add_self_embedding(size)
    transformer = ResidualSelfAttention(embedding_size, n_k)
    l_layer = linear_layer(embedding_size, size)
    optimizer = torch.optim.Adam(
        list(entity_embeddings.parameters())
        + list(transformer.parameters())
        + list(l_layer.parameters()),
        lr=0.001,
        weight_decay=1e-6,
    )
    batch_size = 20
    for _ in range(5):
        center = torch.rand((batch_size, size))
        key = torch.rand((batch_size, n_k, size))
        with torch.no_grad():
            # Create the target: the key closest to the query in Euclidean distance
            distance = torch.sum(
                (center.reshape((batch_size, 1, size)) - key) ** 2, dim=2
            )
            argmin = torch.argmin(distance, dim=1)
            target = []
            for i in range(batch_size):
                target += [key[i, argmin[i], :]]
            target = torch.stack(target, dim=0)
            target = target.detach()

        embeddings = entity_embeddings(center, key)
        masks = [torch.ones_like(key[:, :, 0]) * mask_value]
        prediction = transformer.forward(embeddings, masks)
        prediction = l_layer(prediction)
        prediction = prediction.reshape((batch_size, size))
        error = torch.mean((prediction - target) ** 2, dim=1)
        error = torch.mean(error) / 2
        optimizer.zero_grad()
        error.backward()
        optimizer.step()
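The mask_value argument and the comment about all-zero or all-one masks suggest the test is parametrized along these lines (an assumption; the decorator is not shown in the source):

# Hypothetical parametrization for the test above.
import pytest

@pytest.mark.parametrize("mask_value", [0, 1])  # all-zero and all-one masks
def test_all_masking(mask_value):
    ...  # body as defined above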
def __init__(self, height, width, initial_channels, final_hidden):
    super().__init__()
    n_channels = [16, 32, 32]  # channel for each stack
    n_blocks = 2  # number of residual blocks
    layers = []
    last_channel = initial_channels
    for channel in n_channels:
        layers.append(nn.Conv2d(last_channel, channel, [3, 3], [1, 1], padding=1))
        layers.append(nn.MaxPool2d([3, 3], [2, 2]))
        height, width = pool_out_shape((height, width), 3)
        for _ in range(n_blocks):
            layers.append(ResNetBlock(channel))
        last_channel = channel
    layers.append(Swish())
    # Use nn.ModuleList rather than a plain Python list so the layers'
    # parameters are registered with the module
    self.layers = nn.ModuleList(layers)
    self.dense = linear_layer(
        n_channels[-1] * height * width,
        final_hidden,
        kernel_init=Initialization.KaimingHeNormal,
        kernel_gain=1.0,
    )
def __init__(self, height: int, width: int, initial_channels: int, output_size: int):
    super().__init__()
    self.h_size = output_size
    conv_1_hw = conv_output_shape((height, width), 8, 4)
    conv_2_hw = conv_output_shape(conv_1_hw, 4, 2)
    self.final_flat = conv_2_hw[0] * conv_2_hw[1] * 32
    self.conv_layers = nn.Sequential(
        nn.Conv2d(initial_channels, 16, [8, 8], [4, 4]),
        nn.LeakyReLU(),
        nn.Conv2d(16, 32, [4, 4], [2, 2]),
        nn.LeakyReLU(),
    )
    self.dense = nn.Sequential(
        linear_layer(
            self.final_flat,
            self.h_size,
            kernel_init=Initialization.KaimingHeNormal,
            kernel_gain=1.0,
        ),
        nn.LeakyReLU(),
    )
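A minimal sketch of the matching forward pass, assuming channels-last visual observations that are permuted to NCHW before the conv stack (the permute, like the helper's name, is an assumption):

# Hypothetical forward for the convolutional visual encoder above.
def visual_forward_sketch(encoder, visual_obs: torch.Tensor) -> torch.Tensor:
    hidden = encoder.conv_layers(visual_obs.permute(0, 3, 1, 2))
    hidden = hidden.reshape(-1, encoder.final_flat)
    return encoder.dense(hidden)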
def __init__(self, height: int, width: int, initial_channels: int, output_size: int):
    super().__init__()
    self.h_size = output_size
    conv_1_hw = conv_output_shape((height, width), 3, 1)
    conv_2_hw = conv_output_shape(conv_1_hw, 3, 1)
    self.final_flat = conv_2_hw[0] * conv_2_hw[1] * 144
    self.conv_layers = nn.Sequential(
        nn.Conv2d(initial_channels, 35, [3, 3], [1, 1]),
        nn.LeakyReLU(),
        nn.Conv2d(35, 144, [3, 3], [1, 1]),
        nn.LeakyReLU(),
    )
    self.dense = nn.Sequential(
        linear_layer(
            self.final_flat,
            self.h_size,
            kernel_init=Initialization.KaimingHeNormal,
            kernel_gain=1.41,  # Use ReLU gain
        ),
        nn.LeakyReLU(),
    )
def __init__(self, height: int, width: int, initial_channels: int, output_size: int):
    super().__init__()
    n_channels = [16, 32, 32]  # channel for each stack
    n_blocks = 2  # number of residual blocks
    layers = []
    last_channel = initial_channels
    for channel in n_channels:
        layers.append(nn.Conv2d(last_channel, channel, [3, 3], [1, 1], padding=1))
        layers.append(nn.MaxPool2d([3, 3], [2, 2]))
        height, width = pool_out_shape((height, width), 3)
        for _ in range(n_blocks):
            layers.append(ResNetBlock(channel))
        last_channel = channel
    layers.append(Swish())
    self.dense = linear_layer(
        n_channels[-1] * height * width,
        output_size,
        kernel_init=Initialization.KaimingHeNormal,
        kernel_gain=1.41,  # Use ReLU gain
    )
    self.sequential = nn.Sequential(*layers)
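And a comparable hedged sketch for the ResNet variant: run the stacks, flatten, then project through the dense layer (the NCHW permute and the trailing ReLU are assumptions, not taken from the source):

# Hypothetical forward for the ResNet visual encoder above.
def resnet_forward_sketch(encoder, visual_obs: torch.Tensor) -> torch.Tensor:
    hidden = encoder.sequential(visual_obs.permute(0, 3, 1, 2))
    hidden = hidden.reshape(hidden.shape[0], -1)
    return torch.relu(encoder.dense(hidden))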