# Constructor of TransformerLanguageModel (the class header and base class are not part of this excerpt).
# Assumes: import torch.nn as nn, plus the project-level PositionalEncoding,
# TransformerEncoderLayer, LabelSmoothingLoss and PAD symbols.
def __init__(self, params):
    super(TransformerLanguageModel, self).__init__(params)
    self.model_type = 'transformer_lm'
    self.normalize_before = False
    self.smoothing = params['smoothing']
    self.vocab_size = params['vocab_size']
    self.num_blocks = params['num_blocks']

    self.embedding = nn.Embedding(self.vocab_size, params['d_model'])
    self.pos_embedding = PositionalEncoding(params['d_model'], 0.0)

    self.blocks = nn.ModuleList([
        TransformerEncoderLayer(
            params['n_heads'], params['d_model'], params['d_ff'],
            slf_attn_dropout=0.0, ffn_dropout=0.0,
            residual_dropout=params['residual_dropout'],
            normalize_before=False, concat_after=False, activation='glu')
        for _ in range(self.num_blocks)
    ])

    if self.normalize_before:
        self.after_norm = nn.LayerNorm(params['d_model'])

    self.output_project = nn.Linear(params['d_model'], self.vocab_size)

    if params['share_embedding']:
        # Weight tying between the input embedding and the output projection.
        self.output_project.weight = self.embedding.weight
        print('Tie the embedding weights to the output projection layer.')

    self.crit = LabelSmoothingLoss(size=self.vocab_size, smoothing=self.smoothing, padding_idx=PAD)
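# A minimal sketch of the configuration dict this constructor expects. The key names
# are taken from the __init__ body above; the concrete values are illustrative only,
# and constructing the model requires the full class (base class not shown here).
lm_params = {
    'vocab_size': 5000,
    'd_model': 512,
    'n_heads': 8,
    'd_ff': 2048,
    'num_blocks': 6,
    'residual_dropout': 0.1,
    'smoothing': 0.1,
    'share_embedding': True,
}
# lm = TransformerLanguageModel(lm_params)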
# Conformer encoder (PyTorch variant).
# Assumes: import torch; import torch.nn as nn, plus the project-level
# PositionalEncoding, ConformerEncoderBlock and BaseEncoder.
class ConformerEncoder(BaseEncoder):
    def __init__(self, d_model, d_ff, cov_kernel_size, n_heads, nblocks=12,
                 pos_dropout=0.0, slf_attn_dropout=0.0, ffn_dropout=0.0,
                 residual_dropout=0.1, conv_dropout=0.0, macaron_style=True,
                 ffn_scale=0.5, conv_bias=True, relative_positional=True,
                 activation='glu'):
        super(ConformerEncoder, self).__init__()

        self.relative_positional = relative_positional
        self.pos_emb = PositionalEncoding(d_model, pos_dropout)

        self.blocks = nn.ModuleList([
            ConformerEncoderBlock(d_model, d_ff, cov_kernel_size, n_heads,
                                  slf_attn_dropout, ffn_dropout, residual_dropout,
                                  conv_dropout, macaron_style, ffn_scale, conv_bias,
                                  relative_positional, activation)
            for _ in range(nblocks)
        ])

        self.output_size = d_model

    def forward(self, inputs, mask):
        if self.relative_positional:
            enc_output = inputs
            # Relative positions in [-(T - 1), T - 1], shape [1, 2T - 1].
            position = torch.arange(-(inputs.size(1) - 1), inputs.size(1),
                                    device=inputs.device).reshape(1, -1)
            pos = self.pos_emb._embedding_from_positions(position)
        else:
            enc_output, pos = self.pos_emb(inputs)

        # Zero out padded frames before the first block.
        enc_output.masked_fill_(~mask.unsqueeze(2), 0.0)

        attn_weights = {}
        for i, block in enumerate(self.blocks):
            enc_output, attn_weight = block(enc_output, mask, pos)
            attn_weights['enc_block_%d' % i] = attn_weight

        return enc_output, mask, attn_weights
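# A minimal usage sketch for the PyTorch ConformerEncoder above. The shapes are
# assumptions: inputs are already projected to d_model features per frame, and
# mask is a boolean padding mask with True on valid frames.
import torch

encoder = ConformerEncoder(d_model=256, d_ff=1024, cov_kernel_size=15, n_heads=4, nblocks=12)
inputs = torch.randn(8, 100, 256)              # [batch, frames, d_model]
mask = torch.ones(8, 100, dtype=torch.bool)    # [batch, frames], True = valid frame
enc_output, mask, attn_weights = encoder(inputs, mask)
print(enc_output.shape)  # expected: torch.Size([8, 100, 256])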
# Transformer encoder (OneFlow variant; `flow` is assumed to be `import oneflow as flow`
# and `nn` is `oneflow.nn`). PositionalEncoding and TransformerEncoderLayer come from
# the surrounding project.
class TransformerEncoder(nn.Module):
    def __init__(
        self,
        d_model=256,
        n_heads=4,
        d_ff=2048,
        n_blocks=6,
        pos_dropout=0.0,
        slf_attn_dropout=0.0,
        ffn_dropout=0.0,
        residual_dropout=0.1,
        normalize_before=False,
        concat_after=False,
        relative_positional=False,
        activation="relu",
    ):
        super(TransformerEncoder, self).__init__()

        self.normalize_before = normalize_before
        self.relative_positional = relative_positional
        self.pos_emb = PositionalEncoding(d_model, pos_dropout)

        self.blocks = nn.ModuleList([
            TransformerEncoderLayer(
                n_heads,
                d_model,
                d_ff,
                slf_attn_dropout,
                ffn_dropout,
                residual_dropout=residual_dropout,
                normalize_before=normalize_before,
                concat_after=concat_after,
                relative_positional=relative_positional,
                activation=activation,
            )
            for _ in range(n_blocks)
        ])

        if self.normalize_before:
            self.norm = nn.LayerNorm(d_model)

    def forward(self, inputs, mask):
        if self.relative_positional:
            enc_output = inputs
            # Relative positions in [-(T - 1), T - 1], shape [1, 2T - 1].
            position = flow.arange(-(inputs.size(1) - 1), inputs.size(1),
                                   device=inputs.device).reshape(1, -1)
            pos = self.pos_emb._embedding_from_positions(position)
        else:
            enc_output, pos = self.pos_emb(inputs)

        attn_weights = {}
        for i, block in enumerate(self.blocks):
            enc_output, attn_weight = block(enc_output, mask.unsqueeze(1), pos)
            attn_weights["enc_block_%d" % i] = attn_weight

        if self.normalize_before:
            enc_output = self.norm(enc_output)

        return enc_output, mask, attn_weights
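# A minimal usage sketch for the OneFlow TransformerEncoder above; the input and
# mask shapes are assumptions (inputs already projected to d_model, boolean mask
# with True on valid frames).
import oneflow as flow

encoder = TransformerEncoder(d_model=256, n_heads=4, d_ff=2048, n_blocks=6)
inputs = flow.randn(8, 100, 256)               # [batch, frames, d_model]
mask = flow.ones(8, 100, dtype=flow.bool)      # [batch, frames], True = valid frame
enc_output, mask, attn_weights = encoder(inputs, mask)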
# Conformer encoder (OneFlow variant with optional positional encoding and a
# configurable conv-first block order). Assumes `import oneflow as flow`, `nn` is
# `oneflow.nn`, plus the project-level PositionalEncoding, ConformerEncoderBlock
# and BaseEncoder.
class ConformerEncoder(BaseEncoder):
    def __init__(
        self,
        d_model,
        d_ff,
        cov_kernel_size,
        n_heads,
        nblocks=12,
        pos_dropout=0.0,
        slf_attn_dropout=0.0,
        ffn_dropout=0.0,
        residual_dropout=0.1,
        conv_dropout=0.0,
        macaron_style=True,
        ffn_scale=0.5,
        conv_bias=True,
        positional_encoding=True,
        relative_positional=True,
        conv_first=False,
        activation="glu",
    ):
        super(ConformerEncoder, self).__init__()

        self.positional_encoding = positional_encoding
        self.relative_positional = relative_positional
        self.output_size = d_model

        if self.positional_encoding:
            self.pos_emb = PositionalEncoding(d_model, pos_dropout)

        self.blocks = nn.ModuleList([
            ConformerEncoderBlock(
                d_model,
                d_ff,
                cov_kernel_size,
                n_heads,
                slf_attn_dropout,
                ffn_dropout,
                residual_dropout,
                conv_dropout,
                macaron_style,
                conv_first,
                ffn_scale,
                conv_bias,
                relative_positional,
                activation,
            )
            for _ in range(nblocks)
        ])

    def _pos_encoding(self, inputs):
        if self.relative_positional:
            enc_output = inputs
            # Relative positions in [-(T - 1), T - 1], shape [1, 2T - 1].
            position = flow.arange(-(inputs.size(1) - 1), inputs.size(1),
                                   device=inputs.device).reshape(1, -1)
            pos = self.pos_emb._embedding_from_positions(position)
        else:
            enc_output, pos = self.pos_emb(inputs)
        return enc_output, pos

    def forward(self, inputs, mask):
        if self.positional_encoding:
            enc_output, pos = self._pos_encoding(inputs)
        else:
            enc_output = inputs
            pos = None

        attn_weights = {}
        for i, block in enumerate(self.blocks):
            enc_output, attn_weight = block(enc_output, mask, pos)
            attn_weights["enc_block_%d" % i] = attn_weight

        return enc_output, mask, attn_weights
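# A brief sketch of the options exposed by this OneFlow variant (no positional
# encoding, convolution module before self-attention). The shapes and the flag
# combination are illustrative assumptions.
import oneflow as flow

encoder = ConformerEncoder(d_model=256, d_ff=1024, cov_kernel_size=15, n_heads=4,
                           nblocks=6, positional_encoding=False,
                           relative_positional=False, conv_first=True)
inputs = flow.randn(4, 50, 256)                # [batch, frames, d_model]
mask = flow.ones(4, 50, dtype=flow.bool)       # [batch, frames], True = valid frame
enc_output, mask, attn_weights = encoder(inputs, mask)   # pos is None inside forward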
# Transformer decoder (PyTorch variant).
# Assumes: import torch; import torch.nn as nn; import torch.nn.functional as F,
# plus the project-level PositionalEncoding, TransformerDecoderLayer,
# get_transformer_decoder_mask and logger.
class TransformerDecoder(nn.Module):
    def __init__(self, vocab_size, d_model=256, n_heads=4, d_ff=2048, memory_dim=256,
                 n_blocks=6, pos_dropout=0.0, slf_attn_dropout=0.0, src_attn_dropout=0.0,
                 ffn_dropout=0.0, residual_dropout=0.1, activation='relu',
                 normalize_before=True, concat_after=False, share_embedding=False):
        super(TransformerDecoder, self).__init__()

        self.decoder_type = 'transformer'
        self.normalize_before = normalize_before
        self.relative_positional = False
        self.d_model = d_model

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model, pos_dropout)

        self.blocks = nn.ModuleList([
            TransformerDecoderLayer(
                n_heads, d_model, d_ff, memory_dim, slf_attn_dropout, src_attn_dropout,
                ffn_dropout, residual_dropout, normalize_before=normalize_before,
                concat_after=concat_after, relative_positional=False, activation=activation)
            for _ in range(n_blocks)
        ])

        if self.normalize_before:
            self.after_norm = nn.LayerNorm(d_model)

        self.output_layer = nn.Linear(d_model, vocab_size)

        if share_embedding:
            assert self.embedding.weight.size() == self.output_layer.weight.size()
            self.output_layer.weight = self.embedding.weight
            logger.info('Tie the weights between the embedding and output layer.')

    def forward(self, targets, memory, memory_mask):
        dec_output = self.embedding(targets)

        if self.relative_positional:
            # Relative positions in [-(T - 1), T - 1], shape [1, 2T - 1].
            position = torch.arange(-(dec_output.size(1) - 1), dec_output.size(1),
                                    device=dec_output.device).reshape(1, -1)
            pos = self.pos_emb._embedding_from_positions(position)
        else:
            dec_output, pos = self.pos_emb(dec_output)

        dec_mask = get_transformer_decoder_mask(targets)

        attn_weights = {}
        for i, block in enumerate(self.blocks):
            dec_output, attn_weight = block(dec_output, dec_mask, memory,
                                            memory_mask.unsqueeze(1), pos)
            attn_weights['dec_block_%d' % i] = attn_weight

        if self.normalize_before:
            dec_output = self.after_norm(dec_output)

        logits = self.output_layer(dec_output)

        return logits, attn_weights

    def inference(self, preds, memory, memory_mask=None, cache=None):
        assert preds.dim() == 2
        # An earlier incremental (cached) decoding path was left commented out in the source:
        # dec_output = self.embedding(preds)
        # dec_output, pos = self.pos_encoding.inference(dec_output)
        # mask = get_transformer_decoder_mask(preds)
        # new_caches = []
        # attn_weights = {}
        # for i, block in enumerate(self.blocks):
        #     block_cache = cache[i] if cache is not None else {'slf': None, 'src': None}
        #     dec_output, attn_weight, block_cache = block.inference(
        #         dec_output, mask, memory, memory_mask.unsqueeze(1), pos, cache=block_cache)
        #     attn_weights['dec_block_%d' % i] = attn_weight
        #     new_caches.append(block_cache)
        # if self.normalize_before:
        #     dec_output = self.after_norm(dec_output)
        # logits = self.output_layer(dec_output)

        # Instead, re-run the full forward pass over the current prefix.
        logits, attn_weights = self.forward(preds, memory, memory_mask)
        # Scores for the last position only: [batch_size, vocab_size].
        log_probs = F.log_softmax(logits[:, -1, :], dim=-1)
        return log_probs, cache, attn_weights
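# A minimal usage sketch for the PyTorch TransformerDecoder above. Target token ids,
# encoder memory, and the memory padding-mask shapes are assumptions.
import torch

decoder = TransformerDecoder(vocab_size=5000, d_model=256, n_heads=4, memory_dim=256, n_blocks=6)
targets = torch.randint(0, 5000, (8, 20))             # [batch, target_len] token ids
memory = torch.randn(8, 100, 256)                     # encoder output [batch, frames, memory_dim]
memory_mask = torch.ones(8, 100, dtype=torch.bool)    # True on valid encoder frames
logits, attn_weights = decoder(targets, memory, memory_mask)
print(logits.shape)  # expected: torch.Size([8, 20, 5000])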
# Transformer decoder (OneFlow variant; `flow` is assumed to be `import oneflow as flow`
# and `nn` is `oneflow.nn`). PositionalEncoding, TransformerDecoderLayer,
# get_transformer_decoder_mask and logger come from the surrounding project.
class TransformerDecoder(nn.Module):
    def __init__(
        self,
        vocab_size,
        d_model=256,
        n_heads=4,
        d_ff=2048,
        memory_dim=256,
        n_blocks=6,
        pos_dropout=0.0,
        slf_attn_dropout=0.0,
        src_attn_dropout=0.0,
        ffn_dropout=0.0,
        residual_dropout=0.1,
        activation="relu",
        normalize_before=True,
        concat_after=False,
        share_embedding=False,
    ):
        super(TransformerDecoder, self).__init__()

        self.decoder_type = "transformer"
        self.normalize_before = normalize_before
        self.relative_positional = False
        self.d_model = d_model

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model, pos_dropout)

        self.blocks = nn.ModuleList([
            TransformerDecoderLayer(
                n_heads,
                d_model,
                d_ff,
                memory_dim,
                slf_attn_dropout,
                src_attn_dropout,
                ffn_dropout,
                residual_dropout,
                normalize_before=normalize_before,
                concat_after=concat_after,
                relative_positional=False,
                activation=activation,
            )
            for _ in range(n_blocks)
        ])

        if self.normalize_before:
            self.after_norm = nn.LayerNorm(d_model)

        self.output_layer = nn.Linear(d_model, vocab_size)

        if share_embedding:
            assert self.embedding.weight.size() == self.output_layer.weight.size()
            self.output_layer.weight = self.embedding.weight
            logger.info("Tie the weights between the embedding and output layer.")

    def forward(self, targets, memory, memory_mask):
        dec_output = self.embedding(targets)

        if self.relative_positional:
            # Relative positions in [-(T - 1), T - 1], shape [1, 2T - 1].
            position = flow.arange(
                -(dec_output.size(1) - 1), dec_output.size(1), device=dec_output.device
            ).reshape(1, -1)
            pos = self.pos_emb._embedding_from_positions(position)
        else:
            dec_output, pos = self.pos_emb(dec_output)

        dec_mask = get_transformer_decoder_mask(targets)

        attn_weights = {}
        for i, block in enumerate(self.blocks):
            dec_output, attn_weight = block(
                dec_output, dec_mask, memory, memory_mask.unsqueeze(1), pos
            )
            attn_weights["dec_block_%d" % i] = attn_weight

        if self.normalize_before:
            dec_output = self.after_norm(dec_output)

        logits = self.output_layer(dec_output)

        return logits, attn_weights

    def inference(self, preds, memory, memory_mask=None, cache=None):
        assert preds.dim() == 2
        # Re-run the full forward pass over the current prefix and keep the last step.
        logits, attn_weights = self.forward(preds, memory, memory_mask)
        logsoftmax = nn.LogSoftmax(dim=-1)
        log_probs = logsoftmax(logits[:, -1, :])  # [batch_size, vocab_size]
        return log_probs, cache, attn_weights
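# A minimal greedy-decoding sketch for the OneFlow decoder's `inference` method above.
# The BOS id of 0, the vocabulary size, and all shapes are assumptions.
import oneflow as flow

decoder = TransformerDecoder(vocab_size=5000, d_model=256, memory_dim=256)
memory = flow.randn(1, 100, 256)                    # encoder output for one utterance
memory_mask = flow.ones(1, 100, dtype=flow.bool)    # True on valid encoder frames
preds = flow.zeros(1, 1, dtype=flow.int64)          # start from an assumed BOS id of 0
for _ in range(10):
    log_probs, cache, _ = decoder.inference(preds, memory, memory_mask)
    next_token = log_probs.argmax(dim=-1, keepdim=True)
    preds = flow.cat([preds, next_token], dim=1)    # append the greedy choice and repeat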