def __init__(
    self,
    d_model,
    d_ff,
    cov_kernel_size,
    n_heads,
    slf_attn_dropout=0.0,
    ffn_dropout=0.0,
    residual_dropout=0.1,
    conv_dropout=0.0,
    macaron_style=True,
    conv_first=False,
    ffn_scale=0.5,
    conv_bias=True,
    relative_positional=True,
    activation="glu",
):
    super(ConformerEncoderBlock, self).__init__()
    self.conv_first = conv_first
    self.macaron_style = macaron_style
    self.ffn_scale = ffn_scale
    self.relative_positional = relative_positional
    self.residual_dropout = residual_dropout

    if self.macaron_style:
        self.pre_ffn = PositionwiseFeedForward(d_model, d_ff, ffn_dropout, activation=activation)
        self.macaron_ffn_norm = nn.LayerNorm(d_model)

    if self.relative_positional:
        self.mha = MultiHeadedSelfAttentionWithRelPos(n_heads, d_model, slf_attn_dropout)
    else:
        self.mha = MultiHeadedSelfAttention(n_heads, d_model, slf_attn_dropout)
    self.mha_norm = nn.LayerNorm(d_model)

    self.conv = ConformerConvolutionModule(d_model, cov_kernel_size, conv_bias, conv_dropout)
    self.conv_norm = nn.LayerNorm(d_model)

    self.post_ffn = PositionwiseFeedForward(d_model, d_ff, ffn_dropout, activation=activation)
    self.post_ffn_norm = nn.LayerNorm(d_model)

    self.final_norm = nn.LayerNorm(d_model)
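# A minimal forward() sketch for the block above -- an assumption, not the
# original implementation. It follows the standard Conformer ordering
# (macaron half-step FFN -> self-attention -> convolution -> half-step FFN
# -> final norm) with pre-layer-norm residuals. It assumes
# `import torch.nn.functional as F`, assumes the attention modules take
# (x, mask[, pos_emb]), and ignores `conv_first` for brevity.
def forward(self, x, mask=None, pos_emb=None):
    if self.macaron_style:
        residual = x
        x = self.pre_ffn(self.macaron_ffn_norm(x))
        x = residual + self.ffn_scale * F.dropout(x, self.residual_dropout, self.training)

    residual = x
    x = self.mha_norm(x)
    x = self.mha(x, mask, pos_emb) if self.relative_positional else self.mha(x, mask)
    x = residual + F.dropout(x, self.residual_dropout, self.training)

    residual = x
    x = residual + F.dropout(self.conv(self.conv_norm(x)), self.residual_dropout, self.training)

    residual = x
    x = self.post_ffn(self.post_ffn_norm(x))
    x = residual + self.ffn_scale * F.dropout(x, self.residual_dropout, self.training)

    return self.final_norm(x), mask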
def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
    super().__init__()
    self.n_head = n_head
    self.d_k = d_k
    self.d_v = d_v

    self.w_qs = nn.Linear(d_model, n_head * d_k)
    self.w_ks = nn.Linear(d_model, n_head * d_k)
    self.w_vs = nn.Linear(d_model, n_head * d_v)
    nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
    nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
    nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))

    self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5), attn_dropout=dropout)
    self.layer_norm = nn.LayerNorm(d_model)

    self.fc = nn.Linear(n_head * d_v, d_model)
    nn.init.xavier_normal_(self.fc.weight)

    self.dropout = nn.Dropout(dropout)
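# Hedged sketch of the matching forward(), in the style of the classic
# "Attention Is All You Need" PyTorch implementations this constructor
# mirrors (assumed, not verified against the original file): project,
# split into heads, run scaled dot-product attention, merge heads, then
# apply the residual connection with post-layer-norm.
def forward(self, q, k, v, mask=None):
    d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
    sz_b, len_q, _ = q.size()
    sz_b, len_k, _ = k.size()
    sz_b, len_v, _ = v.size()
    residual = q

    q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
    k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
    v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)

    # fold the head dimension into the batch dimension
    q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k)
    k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k)
    v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v)

    if mask is not None:
        mask = mask.repeat(n_head, 1, 1)
    output, attn = self.attention(q, k, v, mask=mask)

    output = output.view(n_head, sz_b, len_q, d_v)
    output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1)

    output = self.dropout(self.fc(output))
    output = self.layer_norm(output + residual)
    return output, attn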
def __init__(
    self,
    vocab_size,
    d_model=256,
    n_heads=4,
    d_ff=2048,
    memory_dim=256,
    n_blocks=6,
    pos_dropout=0.0,
    slf_attn_dropout=0.0,
    src_attn_dropout=0.0,
    ffn_dropout=0.0,
    residual_dropout=0.1,
    activation="relu",
    normalize_before=True,
    concat_after=False,
    share_embedding=False,
):
    super(TransformerDecoder, self).__init__()
    self.decoder_type = "transformer"
    self.normalize_before = normalize_before
    self.relative_positional = False
    self.d_model = d_model

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_emb = PositionalEncoding(d_model, pos_dropout)

    self.blocks = nn.ModuleList([
        TransformerDecoderLayer(
            n_heads,
            d_model,
            d_ff,
            memory_dim,
            slf_attn_dropout,
            src_attn_dropout,
            ffn_dropout,
            residual_dropout,
            normalize_before=normalize_before,
            concat_after=concat_after,
            relative_positional=False,
            activation=activation,
        )
        for _ in range(n_blocks)
    ])

    if self.normalize_before:
        self.after_norm = nn.LayerNorm(d_model)

    self.output_layer = nn.Linear(d_model, vocab_size)

    if share_embedding:
        assert self.embedding.weight.size() == self.output_layer.weight.size()
        self.output_layer.weight = self.embedding.weight
        logger.info("Tie the weights between the embedding and output layer.")
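# Assumed forward() sketch for the decoder above: embed the target tokens,
# add positional encoding, run the blocks against the encoder memory, then
# project to vocabulary logits. `make_causal_mask` is a hypothetical
# helper, and the `pos_emb`/block call signatures are assumptions.
def forward(self, targets, memory, memory_mask=None):
    x = self.pos_emb(self.embedding(targets))
    tgt_mask = make_causal_mask(targets)  # hypothetical: lower-triangular self-attention mask
    for block in self.blocks:
        x = block(x, tgt_mask, memory, memory_mask)
    if self.normalize_before:
        x = self.after_norm(x)
    return self.output_layer(x)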
def __init__(
    self,
    n_heads,
    d_model,
    d_ff,
    memory_dim,
    slf_attn_dropout=0.0,
    src_attn_dropout=0.0,
    ffn_dropout=0.0,
    residual_dropout=0.1,
    normalize_before=False,
    concat_after=False,
    relative_positional=False,
    activation="relu",
):
    super(TransformerDecoderLayer, self).__init__()
    self.relative_positional = relative_positional

    if self.relative_positional:
        self.slf_attn = MultiHeadedSelfAttentionWithRelPos(n_heads, d_model, slf_attn_dropout)
    else:
        self.slf_attn = MultiHeadedSelfAttention(n_heads, d_model, slf_attn_dropout)
    self.src_attn = MultiHeadedCrossAttention(n_heads, d_model, memory_dim, src_attn_dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, ffn_dropout, activation)

    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.norm3 = nn.LayerNorm(d_model)

    self.dropout1 = nn.Dropout(residual_dropout)
    self.dropout2 = nn.Dropout(residual_dropout)
    self.dropout3 = nn.Dropout(residual_dropout)

    self.normalize_before = normalize_before
    self.concat_after = concat_after
    if self.concat_after:
        self.concat_linear1 = nn.Linear(d_model * 2, d_model)
        self.concat_linear2 = nn.Linear(d_model * 2, d_model)
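# Assumed forward() sketch for the pre-norm path of the layer above; the
# post-norm and `concat_after` branches are omitted for brevity, and the
# attention-module call signatures are assumptions.
def forward(self, x, tgt_mask, memory, memory_mask=None):
    # 1. masked self-attention with residual
    residual = x
    x = self.norm1(x) if self.normalize_before else x
    x = residual + self.dropout1(self.slf_attn(x, tgt_mask))
    # 2. cross-attention over the encoder memory with residual
    residual = x
    x = self.norm2(x) if self.normalize_before else x
    x = residual + self.dropout2(self.src_attn(x, memory, memory_mask))
    # 3. position-wise feed-forward with residual
    residual = x
    x = self.norm3(x) if self.normalize_before else x
    x = residual + self.dropout3(self.feed_forward(x))
    return x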
def __init__(self, hidden_size, intermediate_size, layer_norm_eps=1e-5, dropout=0):
    super(BertOutput, self).__init__()
    self.dense = nn.Linear(intermediate_size, hidden_size)
    self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
    self.dropout = nn.Dropout(dropout)
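# Assumed forward() for BertOutput, matching the standard BERT residual
# sub-layer: project the intermediate activation back down, apply dropout,
# then add-and-normalize against the sub-layer input.
def forward(self, hidden_states, input_tensor):
    hidden_states = self.dense(hidden_states)
    hidden_states = self.dropout(hidden_states)
    return self.LayerNorm(hidden_states + input_tensor)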
def __init__(
    self,
    input_size,
    output_size,
    in_channel=1,
    mid_channel=32,
    out_channel=128,
    kernel_size=[[3, 3], [3, 3]],
    stride=[2, 2],
    dropout=0.0,
    act_func_type="relu",
    front_end_layer_norm=False,
):
    super(ConvFrontEnd, self).__init__()
    self.kernel_size = kernel_size
    self.stride = stride
    self.output_size = output_size
    self.act_func_type = act_func_type
    self.front_end_layer_norm = front_end_layer_norm

    assert isinstance(self.kernel_size, list) and len(self.kernel_size) == 2
    assert isinstance(self.stride, list) and len(self.stride) == 2

    self.conv1 = Conv2dLayer(
        input_size=input_size,
        in_channel=in_channel,
        out_channel=mid_channel,
        kernel_size=self.kernel_size[0],
        stride=self.stride[0],
        dropout=dropout,
        batch_norm=False,
        residual=False,
        act_func_type=act_func_type,
    )
    self.conv2 = Conv2dLayer(
        self.conv1.output_size,
        in_channel=mid_channel,
        out_channel=out_channel,
        kernel_size=self.kernel_size[1],
        stride=self.stride[1],
        dropout=dropout,
        batch_norm=False,
        residual=False,
        act_func_type=act_func_type,
    )

    self.conv_output_size = self.conv2.output_size * self.conv2.out_channel
    self.output_layer = nn.Linear(self.conv_output_size, self.output_size)

    if self.front_end_layer_norm:
        self.layer_norm = nn.LayerNorm(self.output_size)
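# Assumed forward() sketch for the front end above: treat the features as a
# 2-D image (batch, channel, time, freq), run the two strided conv layers
# (each stride of 2 subsampling the time axis), flatten channels into the
# feature axis, and project to `output_size`. The Conv2dLayer call
# signature (x, mask) is an assumption.
def forward(self, x, mask=None):
    x = x.unsqueeze(1)  # (b, t, f) -> (b, 1, t, f)
    x, mask = self.conv1(x, mask)
    x, mask = self.conv2(x, mask)
    b, c, t, f = x.size()
    x = x.transpose(1, 2).contiguous().view(b, t, c * f)
    x = self.output_layer(x)
    if self.front_end_layer_norm:
        x = self.layer_norm(x)
    return x, mask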
def __init__(self, config: Callable[..., None]) -> None:
    super().__init__()
    self.token_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
    self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
    self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
    self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=1e-12)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
def __init__(
    self,
    vocab_size,
    type_vocab_size,
    max_position_embeddings,
    hidden_size,
    hidden_dropout_prob,
    seq_length,
):
    super().__init__()
    self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
    self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
    self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)
    self.LayerNorm = nn.LayerNorm(hidden_size)
    self.dropout = nn.Dropout(hidden_dropout_prob, inplace=True)
    self.register_buffer(
        "position_ids", flow.arange(max_position_embeddings).unsqueeze(0)
    )
    self.seq_length = seq_length
def __init__(
    self,
    d_model=256,
    n_heads=4,
    d_ff=2048,
    n_blocks=6,
    pos_dropout=0.0,
    slf_attn_dropout=0.0,
    ffn_dropout=0.0,
    residual_dropout=0.1,
    normalize_before=False,
    concat_after=False,
    relative_positional=False,
    activation="relu",
):
    super(TransformerEncoder, self).__init__()
    self.normalize_before = normalize_before
    self.relative_positional = relative_positional

    self.pos_emb = PositionalEncoding(d_model, pos_dropout)

    self.blocks = nn.ModuleList([
        TransformerEncoderLayer(
            n_heads,
            d_model,
            d_ff,
            slf_attn_dropout,
            ffn_dropout,
            residual_dropout=residual_dropout,
            normalize_before=normalize_before,
            concat_after=concat_after,
            relative_positional=relative_positional,
            activation=activation,
        )
        for _ in range(n_blocks)
    ])

    if self.normalize_before:
        self.norm = nn.LayerNorm(d_model)
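# Assumed forward() sketch for the encoder above: add positional
# information, run each encoder layer, then apply the final norm when
# pre-norm is enabled. The `pos_emb` and block call signatures are
# assumptions; relative attention typically consumes the positional
# encoding as a separate tensor rather than adding it to the input.
def forward(self, x, mask=None):
    if self.relative_positional:
        pos = self.pos_emb(x)  # hypothetical: returns the encoding, not x + encoding
        for block in self.blocks:
            x = block(x, mask, pos)
    else:
        x = self.pos_emb(x)
        for block in self.blocks:
            x = block(x, mask)
    if self.normalize_before:
        x = self.norm(x)
    return x, mask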
def __init__(self, params):
    super(TransformerLanguageModel, self).__init__(params)
    self.model_type = "transformer_lm"
    self.normalize_before = False
    self.smoothing = params["smoothing"]
    self.vocab_size = params["vocab_size"]
    self.num_blocks = params["num_blocks"]

    self.embedding = nn.Embedding(self.vocab_size, params["d_model"])
    self.pos_embedding = PositionalEncoding(params["d_model"], 0.0)

    self.blocks = nn.ModuleList([
        TransformerEncoderLayer(
            params["n_heads"],
            params["d_model"],
            params["d_ff"],
            slf_attn_dropout=0.0,
            ffn_dropout=0.0,
            residual_dropout=params["residual_dropout"],
            normalize_before=False,
            concat_after=False,
            activation="glu",
        )
        for _ in range(self.num_blocks)
    ])

    if self.normalize_before:
        self.after_norm = nn.LayerNorm(params["d_model"])

    self.output_project = nn.Linear(params["d_model"], self.vocab_size)

    if params["share_embedding"]:
        self.output_project.weight = self.embedding.weight
        print("Sharing the embedding weights with the output projection layer!")

    self.crit = LabelSmoothingLoss(size=self.vocab_size, smoothing=self.smoothing, padding_idx=PAD)
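# Assumed forward() sketch for the language model above: embed, add
# positions, run the causal encoder stack, project to logits, and score
# against the targets with the label-smoothing criterion.
# `make_causal_mask` is a hypothetical helper, and the block call
# signature is an assumption.
def forward(self, inputs, targets):
    mask = make_causal_mask(inputs)  # hypothetical: lower-triangular mask
    x = self.pos_embedding(self.embedding(inputs))
    for block in self.blocks:
        x = block(x, mask)
    if self.normalize_before:
        x = self.after_norm(x)
    logits = self.output_project(x)
    loss = self.crit(logits.view(-1, self.vocab_size), targets.view(-1))
    return loss, logits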
def __init__(
    self,
    vocab_size,
    max_position_embeddings,
    type_vocab_size,
    hidden_size,
    layer_norm_eps=1e-5,
    dropout=0,
    pad_token_id=0,
    position_embedding_type="absolute",
):
    super(BertEmbeddings, self).__init__()
    self.word_embeddings = nn.Embedding(vocab_size, hidden_size, padding_idx=pad_token_id)
    self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
    self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)
    self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
    self.dropout = nn.Dropout(dropout)

    # position_ids (1, len position emb) is contiguous in memory and exported when serialized
    self.position_embedding_type = position_embedding_type
    self.register_buffer(
        "position_ids", flow.arange(max_position_embeddings).expand((1, -1))
    )
    self.register_buffer(
        "token_type_ids",
        flow.zeros(
            self.position_ids.size(),
            dtype=flow.int64,
            device=self.position_ids.device,
        ),
        persistent=False,
    )
    self.padding_idx = pad_token_id
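# Assumed forward() for the embedding module above, following the standard
# Hugging Face BertEmbeddings pattern its constructor mirrors: default
# the position and token-type ids from the registered buffers, sum the
# three embeddings, then layer-norm and dropout.
def forward(self, input_ids, token_type_ids=None, position_ids=None):
    input_shape = input_ids.size()
    seq_length = input_shape[1]
    if position_ids is None:
        position_ids = self.position_ids[:, :seq_length]
    if token_type_ids is None:
        token_type_ids = self.token_type_ids[:, :seq_length].expand(input_shape[0], seq_length)
    embeddings = self.word_embeddings(input_ids) + self.token_type_embeddings(token_type_ids)
    if self.position_embedding_type == "absolute":
        embeddings = embeddings + self.position_embeddings(position_ids)
    return self.dropout(self.LayerNorm(embeddings))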
def __init__(
    self,
    d_input,
    n_layers,
    n_head,
    d_k,
    d_v,
    d_model,
    d_inner,
    dropout=0.1,
    pe_maxlen=5000,
):
    super(Encoder, self).__init__()
    # parameters
    self.d_input = d_input
    self.n_layers = n_layers
    self.n_head = n_head
    self.d_k = d_k
    self.d_v = d_v
    self.d_model = d_model
    self.d_inner = d_inner
    self.dropout_rate = dropout
    self.pe_maxlen = pe_maxlen

    # use a linear transformation with layer norm in place of an input embedding
    self.linear_in = nn.Linear(d_input, d_model)
    self.layer_norm_in = nn.LayerNorm(d_model)
    self.positional_encoding = PositionalEncoding(d_model, max_len=pe_maxlen)
    self.dropout = nn.Dropout(dropout)

    self.layer_stack = nn.ModuleList([
        EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
        for _ in range(n_layers)
    ])
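# Assumed forward() sketch for the encoder above, in the style of the
# Speech-Transformer reference code this constructor resembles: the
# linear-in + layer norm replaces the token embedding, positional encoding
# is added, then the layer stack runs. `get_attn_pad_mask` is a
# hypothetical helper building a (batch, T, T) padding mask.
def forward(self, padded_input, input_lengths):
    enc_output = self.dropout(
        self.layer_norm_in(self.linear_in(padded_input))
        + self.positional_encoding(padded_input)
    )
    slf_attn_mask = get_attn_pad_mask(padded_input, input_lengths, padded_input.size(1))
    for enc_layer in self.layer_stack:
        enc_output, _ = enc_layer(enc_output, slf_attn_mask=slf_attn_mask)
    return enc_output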
def __init__(self, d_in, d_hid, dropout=0.1):
    super(PositionwiseFeedForwardUseConv, self).__init__()
    self.w_1 = nn.Conv1d(d_in, d_hid, 1)
    self.w_2 = nn.Conv1d(d_hid, d_in, 1)
    self.layer_norm = nn.LayerNorm(d_in)
    self.dropout = nn.Dropout(dropout)
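# Assumed forward() for the conv-based FFN above: nn.Conv1d expects
# (batch, channels, time), so the input is transposed in and out, with a
# residual connection and post-layer-norm. Assumes
# `import torch.nn.functional as F`.
def forward(self, x):
    residual = x
    output = x.transpose(1, 2)
    output = self.w_2(F.relu(self.w_1(output)))
    output = output.transpose(1, 2)
    output = self.dropout(output)
    return self.layer_norm(output + residual)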
def __init__(self, intermediate_size, hidden_size, hidden_dropout_prob=0.1):
    super().__init__()
    self.dense = nn.Linear(intermediate_size, hidden_size)
    self.LayerNorm = nn.LayerNorm(hidden_size)
    self.dropout = nn.Dropout(hidden_dropout_prob, inplace=True)
def __init__(self, config: Callable[..., None]) -> None:
    super().__init__()
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    self.transform_act_fn = get_activation(config.hidden_act)
    self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=1e-12)
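# Assumed forward() for the prediction-head transform above, matching the
# standard BERT pattern: dense projection, activation, then layer norm.
def forward(self, hidden_states):
    hidden_states = self.dense(hidden_states)
    hidden_states = self.transform_act_fn(hidden_states)
    return self.layer_norm(hidden_states)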
def __init__(self, hidden_size, hidden_act=nn.GELU()):
    super().__init__()
    self.dense = nn.Linear(hidden_size, hidden_size)
    self.transform_act_fn = hidden_act
    self.LayerNorm = nn.LayerNorm(hidden_size)
def __init__(self, d_model, d_ff, dropout=0.1):
    super(PositionwiseFeedForward, self).__init__()
    self.w_1 = nn.Linear(d_model, d_ff)
    self.w_2 = nn.Linear(d_ff, d_model)
    self.dropout = nn.Dropout(dropout)
    self.layer_norm = nn.LayerNorm(d_model)
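# Assumed forward() for the linear FFN above, mirroring the conv variant:
# expand, activate, contract, then residual add with post-layer-norm.
# Assumes `import torch.nn.functional as F`; ReLU is the conventional
# choice here but is itself an assumption.
def forward(self, x):
    residual = x
    output = self.w_2(F.relu(self.w_1(x)))
    output = self.dropout(output)
    return self.layer_norm(output + residual)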
def __init__(self, hidden_size: int, hidden_dropout_prob: float = 0.1) -> None:
    super().__init__()
    self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-12)
    self.dropout = nn.Dropout(hidden_dropout_prob)
def __init__(self, hidden_size, layer_norm_eps=1e-5, dropout=0):
    super().__init__()
    self.dense = nn.Linear(hidden_size, hidden_size)
    self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
    self.dropout = nn.Dropout(dropout)