def test_LayerNorm():
    import torch
    from speechbrain.nnet.normalization import LayerNorm

    # 3-dim input: normalization over the feature dimension.
    input = torch.randn(4, 101, 256) + 2.0
    norm = LayerNorm(input_shape=input.shape)
    output = norm(input)
    assert input.shape == output.shape

    # The normalized output should have ~zero mean and ~unit std per frame.
    current_mean = output.mean(dim=2).mean()
    assert torch.abs(current_mean) < 1e-06
    current_std = output.std(dim=2).mean()
    assert torch.abs(1.0 - current_std) < 0.01

    # 4-dim input: normalization over the last two dimensions.
    input = torch.randn(100, 101, 16, 32) + 2.0
    norm = LayerNorm(input_shape=input.shape)
    output = norm(input)
    assert input.shape == output.shape
    current_mean = output.mean(dim=[2, 3]).mean()
    assert torch.abs(current_mean) < 1e-06
    current_std = output.std(dim=[2, 3]).mean()
    assert torch.abs(1.0 - current_std) < 0.01

    # The module should be traceable with TorchScript.
    assert torch.jit.trace(norm, input)
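# A minimal cross-check of the same invariant with plain torch.nn.LayerNorm
# (an illustrative sketch, not part of the SpeechBrain test suite): normalizing
# over the last dimension should yield ~zero mean and ~unit std per frame.
import torch

x = torch.randn(4, 101, 256) + 2.0
ln = torch.nn.LayerNorm(256)
y = ln(x)
assert y.shape == x.shape
assert torch.abs(y.mean(dim=-1).mean()) < 1e-06
assert torch.abs(1.0 - y.std(dim=-1).mean()) < 0.01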
def __init__(
    self,
    vocab,
    d_model=512,
    nhead=8,
    num_encoder_layers=12,
    num_decoder_layers=0,
    d_ffn=2048,
    dropout=0.1,
    activation=nn.ReLU,
    positional_encoding="fixed_abs_sine",
    normalize_before=False,
    d_embedding=None,
    max_length=2500,
    causal=True,
    attention_type="regularMHA",
):
    super().__init__(
        d_model=d_model,
        nhead=nhead,
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        d_ffn=d_ffn,
        dropout=dropout,
        activation=activation,
        positional_encoding=positional_encoding,
        normalize_before=normalize_before,
        max_length=max_length,
        causal=causal,
        attention_type=attention_type,
    )

    # If no embedding dimension is given, embed tokens directly in d_model.
    self.d_embedding = d_embedding
    if d_embedding is None:
        self.d_embedding = d_model

    self.custom_src_module = NormalizedEmbedding(self.d_embedding, vocab)

    # Project embeddings up to d_model only when their sizes differ.
    self.embedding_proj = None
    if d_embedding is not None:
        self.embedding_proj = Linear(
            input_size=self.d_embedding, n_neurons=d_model
        )

    self.output_proj = ModuleList(
        Linear(input_size=d_model, n_neurons=d_model),
        LayerNorm(d_model, eps=1e-6),
        Linear(input_size=d_model, n_neurons=vocab),
    )

    self.num_encoder_layers = num_encoder_layers
    self.num_decoder_layers = num_decoder_layers

    # reset the params of the transformer model
    self._reset_params()
def __init__(
    self,
    d_model,
    d_ffn,
    nhead,
    kernel_size,
    kdim=None,
    vdim=None,
    activation=Swish,
    bias=True,
    dropout=0.1,
):
    super().__init__()

    self.Multihead_attn = MultiheadAttention(
        nhead=nhead,
        d_model=d_model,
        dropout=dropout,
        kdim=kdim,
        vdim=vdim,
    )

    self.convolution_module = ConvolutionModule(
        d_model, kernel_size, bias, activation, dropout
    )

    self.ffn_module = nn.Sequential(
        nn.LayerNorm(d_model),
        PositionalwiseFeedForward(
            d_ffn=d_ffn,
            input_size=d_model,
            dropout=dropout,
            activation=activation,
        ),
        nn.Dropout(dropout),
    )

    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.drop = nn.Dropout(dropout)
def __init__(
    self,
    num_layers,
    nhead,
    d_ffn,
    input_shape=None,
    d_model=None,
    kdim=None,
    vdim=None,
    dropout=0.1,
    activation=Swish,
    kernel_size=31,
    bias=True,
):
    super().__init__()

    if input_shape is None and d_model is None:
        raise ValueError("Expected one of input_shape or d_model")

    # Infer d_model from the input shape, which must be (batch, time, fea).
    if input_shape is not None and d_model is None:
        if len(input_shape) != 3:
            msg = (
                "Input shape of the Transformer must be (batch, time, fea). "
                "Please revise the forward function in TransformerInterface "
                "to handle arbitrary input shapes."
            )
            raise ValueError(msg)
        d_model = input_shape[-1]

    self.layers = torch.nn.ModuleList(
        [
            ConformerEncoderLayer(
                d_ffn=d_ffn,
                nhead=nhead,
                d_model=d_model,
                kdim=kdim,
                vdim=vdim,
                dropout=dropout,
                activation=activation,
                kernel_size=kernel_size,
                bias=bias,
            )
            for _ in range(num_layers)
        ]
    )
    self.norm = LayerNorm(d_model, eps=1e-6)
def __init__(
    self,
    num_layers,
    d_model,
    d_ffn,
    nhead,
    kernel_size=31,
    kdim=None,
    vdim=None,
    activation=Swish,
    bias=True,
    dropout=0.0,
    causal=False,
    attention_type="RelPosMHAXL",
):
    super().__init__()

    self.layers = torch.nn.ModuleList(
        [
            ConformerEncoderLayer(
                d_ffn=d_ffn,
                nhead=nhead,
                d_model=d_model,
                kdim=kdim,
                vdim=vdim,
                dropout=dropout,
                activation=activation,
                kernel_size=kernel_size,
                bias=bias,
                causal=causal,
                attention_type=attention_type,
            )
            for _ in range(num_layers)
        ]
    )
    self.norm = LayerNorm(d_model, eps=1e-6)
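# Usage sketch (assumptions flagged): the constructor order is
# ConformerEncoder(num_layers, d_model, d_ffn, nhead, ...), and with the default
# attention_type="RelPosMHAXL" the encoder expects relative positional embeddings
# of shape (1, 2*T - 1, d_model). The forward signature and the (output, attentions)
# return value follow the TransformerInterface convention; verify against your
# installed SpeechBrain version.
import torch

x = torch.rand((8, 60, 512))
pos_embs = torch.rand((1, 2 * 60 - 1, 512))
net = ConformerEncoder(num_layers=1, d_model=512, d_ffn=512, nhead=8)
output, attn_list = net(x, pos_embs=pos_embs)
assert output.shape == x.shape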
def __init__(
    self,
    d_model,
    d_ffn,
    nhead,
    kernel_size,
    kdim=None,
    vdim=None,
    activation=Swish,
    bias=True,
    dropout=0.0,
    causal=True,
    attention_type="RelPosMHAXL",
):
    super().__init__()

    if not causal:
        warnings.warn(
            "Decoder is not causal; in most applications it should be causal."
        )

    if attention_type == "regularMHA":
        self.mha_layer = MultiheadAttention(
            nhead=nhead,
            d_model=d_model,
            dropout=dropout,
            kdim=kdim,
            vdim=vdim,
        )
    elif attention_type == "RelPosMHAXL":
        # Transformer-XL style relative positional encoding.
        self.mha_layer = RelPosMHAXL(
            num_heads=nhead,
            embed_dim=d_model,
            dropout=dropout,
            mask_pos_future=causal,
        )

    self.convolution_module = ConvolutionModule(
        d_model, kernel_size, bias, activation, dropout, causal=causal
    )

    self.ffn_module1 = nn.Sequential(
        nn.LayerNorm(d_model),
        PositionalwiseFeedForward(
            d_ffn=d_ffn,
            input_size=d_model,
            dropout=dropout,
            activation=activation,
        ),
        nn.Dropout(dropout),
    )
    self.ffn_module2 = nn.Sequential(
        nn.LayerNorm(d_model),
        PositionalwiseFeedForward(
            d_ffn=d_ffn,
            input_size=d_model,
            dropout=dropout,
            activation=activation,
        ),
        nn.Dropout(dropout),
    )

    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.drop = nn.Dropout(dropout)