Example No. 1
# Constructor of a Transformer language model (PyTorch). PositionalEncoding,
# TransformerEncoderLayer, LabelSmoothingLoss and the PAD constant are assumed
# to be defined elsewhere in the same project.
import torch.nn as nn


class TransformerLanguageModel(nn.Module):
    def __init__(self, params):
        super(TransformerLanguageModel, self).__init__()

        self.model_type = 'transformer_lm'
        self.normalize_before = False
        self.smoothing = params['smoothing']
        self.vocab_size = params['vocab_size']
        self.num_blocks = params['num_blocks']

        # Token embedding followed by positional encoding (no embedding dropout).
        self.embedding = nn.Embedding(self.vocab_size, params['d_model'])
        self.pos_embedding = PositionalEncoding(params['d_model'], 0.0)

        # Stack of post-norm Transformer encoder layers with GLU activation.
        self.blocks = nn.ModuleList([
            TransformerEncoderLayer(
                params['n_heads'], params['d_model'], params['ffn_units'],
                slf_attn_dropout_rate=0.0, ffn_dropout_rate=0.0,
                residual_dropout_rate=params['residual_dropout_rate'],
                normalize_before=False, concat_after=False, activation='glu',
                drop_head_rate=params['enc_drop_head']) for _ in range(self.num_blocks)
        ])

        # A final LayerNorm is only needed for pre-norm models; normalize_before is
        # hard-coded to False above, so this branch is never taken here.
        if self.normalize_before:
            self.after_norm = nn.LayerNorm(params['d_model'])

        self.output_project = nn.Linear(params['d_model'], self.vocab_size)

        # Optionally tie the input embedding weights to the output projection.
        if params['share_embedding']:
            self.output_project.weight = self.embedding.weight
            print('Share the weight of embedding to the output project layer!')

        self.crit = LabelSmoothingLoss(size=self.vocab_size, smoothing=self.smoothing, padding_idx=PAD)
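
Below is a minimal, hypothetical instantiation sketch for the language model above. The dictionary keys mirror exactly what __init__ reads; the concrete values are illustrative assumptions, not values taken from the original project.

# Hypothetical usage sketch (not from the original project); all values are illustrative.
params = {
    'smoothing': 0.1,
    'vocab_size': 5000,
    'num_blocks': 6,
    'd_model': 512,
    'n_heads': 8,
    'ffn_units': 2048,
    'residual_dropout_rate': 0.1,
    'enc_drop_head': 0.0,
    'share_embedding': True,
}
lm = TransformerLanguageModel(params)
print(sum(p.numel() for p in lm.parameters()))  # total number of parameters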
Example No. 2
# Constructor of a Transformer speech encoder (PyTorch). LinearWithPosEmbedding,
# Conv2dSubsampling, Conv2dSubsamplingV2, TransformerEncoderLayer and LayerNorm
# are assumed to be defined elsewhere in the same project.
import torch.nn as nn


class TransformerEncoder(nn.Module):
    def __init__(self,
                 input_size,
                 d_model=256,
                 attention_heads=4,
                 linear_units=2048,
                 num_blocks=6,
                 pos_dropout_rate=0.0,
                 slf_attn_dropout_rate=0.0,
                 ffn_dropout_rate=0.0,
                 residual_dropout_rate=0.1,
                 input_layer="conv2d",
                 normalize_before=True,
                 concat_after=False,
                 activation='relu',
                 type='transformer',
                 weight_sharing=False):
        super(TransformerEncoder, self).__init__()

        self.normalize_before = normalize_before
        self.weight_sharing = weight_sharing
        self.num_blocks = num_blocks

        # Input front-end: project features to d_model and add positional encoding,
        # optionally with 2-D convolutional subsampling.
        if input_layer == "linear":
            self.embed = LinearWithPosEmbedding(input_size, d_model,
                                                pos_dropout_rate)
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsampling(input_size, d_model,
                                           pos_dropout_rate)
        elif input_layer == 'conv2dv2':
            self.embed = Conv2dSubsamplingV2(input_size, d_model,
                                             pos_dropout_rate)
        else:
            # Guard against unsupported values so self.embed is always defined.
            raise ValueError("unknown input_layer: " + str(input_layer))

        # With weight sharing enabled, only one layer is built; the forward pass is
        # expected to reuse it self.num_blocks times.
        if weight_sharing:
            num_blocks = 1
        self.blocks = nn.ModuleList([
            TransformerEncoderLayer(
                attention_heads,
                d_model,
                linear_units,
                slf_attn_dropout_rate,
                ffn_dropout_rate,
                residual_dropout_rate=residual_dropout_rate,
                normalize_before=normalize_before,
                concat_after=concat_after,
                activation=activation) for _ in range(num_blocks)
        ])

        # Pre-norm models apply a final LayerNorm after the last block.
        if self.normalize_before:
            self.after_norm = LayerNorm(d_model)
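
Below is a minimal, hypothetical construction sketch for the encoder above, assuming 80-dimensional filterbank features as input; all argument values are illustrative and mostly restate the defaults from the signature, nothing here comes from the original project.

# Hypothetical usage sketch (not from the original project).
encoder = TransformerEncoder(
    input_size=80,          # assumed feature dimension, e.g. 80-dim filterbanks
    d_model=256,
    attention_heads=4,
    num_blocks=6,
    input_layer="conv2d",
    normalize_before=True,
)
print(len(encoder.blocks))  # 6 layers (would be 1 if weight_sharing=True)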