def __init__(self, params):
    super(TransformerLanguageModel, self).__init__()

    self.model_type = 'transformer_lm'
    self.normalize_before = False
    self.smoothing = params['smoothing']
    self.vocab_size = params['vocab_size']
    self.num_blocks = params['num_blocks']

    # Token embedding followed by positional encoding (no dropout on positions).
    self.embedding = nn.Embedding(self.vocab_size, params['d_model'])
    self.pos_embedding = PositionalEncoding(params['d_model'], 0.0)

    # Stack of self-attention encoder layers forming the language-model body.
    self.blocks = nn.ModuleList([
        TransformerEncoderLayer(
            params['n_heads'], params['d_model'], params['ffn_units'],
            slf_attn_dropout_rate=0.0,
            ffn_dropout_rate=0.0,
            residual_dropout_rate=params['residual_dropout_rate'],
            normalize_before=False,
            concat_after=False,
            activation='glu',
            drop_head_rate=params['enc_drop_head'])
        for _ in range(self.num_blocks)
    ])

    # Final LayerNorm is only needed for pre-norm layers (disabled here).
    if self.normalize_before:
        self.after_norm = nn.LayerNorm(params['d_model'])

    # Project hidden states back to the vocabulary.
    self.output_project = nn.Linear(params['d_model'], self.vocab_size)

    # Optionally tie the input embedding and the output projection weights.
    if params['share_embedding']:
        self.output_project.weight = self.embedding.weight
        print('Sharing the embedding weights with the output projection layer!')

    self.crit = LabelSmoothingLoss(
        size=self.vocab_size, smoothing=self.smoothing, padding_idx=PAD)
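
# Usage sketch (not part of the original file): a minimal `params` dict covering
# every key the constructor above reads. The concrete values are illustrative
# assumptions, not defaults taken from the repository.
#
# params = {
#     'vocab_size': 5000,            # size of the LM vocabulary
#     'd_model': 256,                # embedding / hidden dimension
#     'n_heads': 4,                  # attention heads per encoder layer
#     'ffn_units': 2048,             # feed-forward inner dimension
#     'num_blocks': 6,               # number of stacked encoder layers
#     'residual_dropout_rate': 0.1,  # dropout on residual connections
#     'enc_drop_head': 0.0,          # attention-head drop rate
#     'smoothing': 0.1,              # label-smoothing factor for the loss
#     'share_embedding': True,       # tie input embedding and output projection
# }
# lm = TransformerLanguageModel(params)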
def __init__(self, output_size, d_model=256, attention_heads=4, linear_units=2048,
             num_blocks=6, pos_dropout_rate=0.0, slf_attn_dropout_rate=0.0,
             src_attn_dropout_rate=0.0, ffn_dropout_rate=0.0,
             residual_dropout_rate=0.1, activation='relu', normalize_before=True,
             concat_after=False, share_embedding=False, weight_sharing=False):
    super(TransformerDecoder, self).__init__()

    self.normalize_before = normalize_before
    self.weight_sharing = weight_sharing
    self.num_blocks = num_blocks

    # Target-token embedding followed by positional encoding with dropout.
    self.embedding = torch.nn.Embedding(output_size, d_model)
    self.pos_encoding = PositionalEncoding(d_model, pos_dropout_rate)

    # With weight sharing, only a single decoder layer is instantiated;
    # self.num_blocks still records the nominal depth.
    if weight_sharing:
        num_blocks = 1

    self.blocks = nn.ModuleList([
        TransformerDecoderLayer(
            attention_heads, d_model, linear_units,
            slf_attn_dropout_rate, src_attn_dropout_rate,
            ffn_dropout_rate, residual_dropout_rate,
            normalize_before=normalize_before,
            concat_after=concat_after,
            activation=activation)
        for _ in range(num_blocks)
    ])

    # Final LayerNorm for pre-norm configurations.
    if self.normalize_before:
        self.after_norm = LayerNorm(d_model)

    # Project decoder states back to the output vocabulary.
    self.output_layer = nn.Linear(d_model, output_size)

    # Optionally tie the embedding and output projection weights.
    if share_embedding:
        assert self.embedding.weight.size() == self.output_layer.weight.size()
        self.output_layer.weight = self.embedding.weight
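
# Usage sketch (illustrative, not from the original file): only `output_size`
# is required; the remaining arguments fall back to the defaults shown above.
# `share_embedding=True` ties the target embedding to the output layer, and
# `weight_sharing=True` builds a single decoder layer instead of `num_blocks`.
#
# decoder = TransformerDecoder(output_size=5000, d_model=256, attention_heads=4,
#                              num_blocks=6, share_embedding=True)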