Example No. 1
    def __init__(self,
                 vocab_size,
                 emb_dim=512,
                 hidden_size=512,
                 n_layers=8,
                 n_heads=8,
                 padding_idx=0,
                 dropout_rate=0.1):
        """
        __init__
        """
        super(TransformerEncoderModel, self).__init__()
        self.padding_idx = padding_idx
        self.token_embedding = nn.Embedding(vocab_size,
                                            emb_dim,
                                            padding_idx=padding_idx)
        max_pos_len = 3000
        self.pos_embedding = nn.Embedding(max_pos_len,
                                          emb_dim,
                                          padding_idx=padding_idx)

        self.dropout = nn.Dropout(p=dropout_rate)
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(
            emb_dim,
            n_heads,
            dim_feedforward=hidden_size * 4,
            dropout=0.1,
            activation='gelu',
            attn_dropout=0.1,
            act_dropout=0)
        self.transformer_encoder = nn.TransformerEncoder(
            self.transformer_encoder_layer, n_layers)
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.apply(self.init_weights)
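A minimal forward-pass sketch for this encoder (an assumption, since the original forward method is not shown): sum the token and position embeddings, apply dropout, and build an additive padding mask before running the encoder stack.

import paddle

def encode(model, token_ids):
    # token_ids: int64 tensor of shape [batch_size, seq_len]
    seq_len = token_ids.shape[1]
    positions = paddle.arange(seq_len, dtype='int64').unsqueeze(0)   # [1, seq_len]
    x = model.token_embedding(token_ids) + model.pos_embedding(positions)
    x = model.dropout(x)
    # Padding positions get a large negative bias so softmax assigns them
    # near-zero attention weight.
    pad_mask = (token_ids == model.padding_idx).astype('float32')
    attn_mask = pad_mask.unsqueeze([1, 2]) * -1e4    # [batch_size, 1, 1, seq_len]
    return model.transformer_encoder(x, src_mask=attn_mask)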
Example No. 2
 def __init__(self,
              vocab_size,
              hidden_size=768,
              num_hidden_layers=12,
              num_attention_heads=12,
              hidden_act="gelu",
              hidden_dropout_prob=0.1,
              attention_probs_dropout_prob=0.1,
              max_position_embeddings=514,
              initializer_range=0.02,
              pad_token_id=1):
     super(ErnieMModel, self).__init__()
     self.pad_token_id = pad_token_id
     self.initializer_range = initializer_range
     self.embeddings = ErnieMEmbeddings(vocab_size, hidden_size,
                                        hidden_dropout_prob,
                                        max_position_embeddings)
     encoder_layer = nn.TransformerEncoderLayer(
         hidden_size,
         num_attention_heads,
         dim_feedforward=4 * hidden_size,
         dropout=hidden_dropout_prob,
         activation=hidden_act,
         attn_dropout=attention_probs_dropout_prob,
         act_dropout=0,
         normalize_before=False)
     self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
     self.pooler = ErnieMPooler(hidden_size)
     self.apply(self.init_weights)
Example No. 3
 def __init__(self,
              vocab_size,
              hidden_size=768,
              num_hidden_layers=12,
              num_attention_heads=12,
              intermediate_size=3072,
              hidden_act="relu",
              hidden_dropout_prob=0.1,
              attention_probs_dropout_prob=0.1,
              max_position_embeddings=512,
              type_vocab_size=2,
              pad_token_id=0,
              rotary_value=False,
              use_bias=False):
     super(RoFormerv2Model, self).__init__()
     self.pad_token_id = pad_token_id
     self.num_hidden_layers = num_hidden_layers
     self.use_bias = use_bias
     self.embeddings = RoFormerv2Embeddings(
         vocab_size,
         hidden_size,
         hidden_dropout_prob,
         type_vocab_size, )
     encoder_layer = TransformerEncoderLayerWithRotary(
         hidden_size,
         num_attention_heads,
         intermediate_size,
         dropout=hidden_dropout_prob,
         activation=hidden_act,
         attn_dropout=attention_probs_dropout_prob,
         act_dropout=0,
         rotary_value=rotary_value,
         max_position_embeddings=max_position_embeddings)
     self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
     self.apply(self.init_weights)
Example No. 4
 def __init__(self,
              vocab_size,
              hidden_size=768,
              num_hidden_layers=12,
              num_attention_heads=12,
              intermediate_size=3072,
              hidden_act="gelu",
              hidden_dropout_prob=0.1,
              attention_probs_dropout_prob=0.1,
              max_position_embeddings=512,
              type_vocab_size=2,
              initializer_range=0.02,
              pad_token_id=0):
     super(ErnieModel, self).__init__()
     self.pad_token_id = pad_token_id
     self.initializer_range = initializer_range
     weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
         mean=0.0, std=self.initializer_range))
     self.embeddings = ErnieEmbeddings(
         vocab_size, hidden_size, hidden_dropout_prob,
         max_position_embeddings, type_vocab_size, pad_token_id, weight_attr)
     encoder_layer = nn.TransformerEncoderLayer(
         hidden_size,
         num_attention_heads,
         intermediate_size,
         dropout=hidden_dropout_prob,
         activation=hidden_act,
         attn_dropout=attention_probs_dropout_prob,
         act_dropout=0,
         weight_attr=weight_attr, )
     self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
     self.pooler = ErniePooler(hidden_size, weight_attr)
     self.apply(self.init_weights)
Example No. 5
 def __init__(self,
              vocab_size,
              hidden_size=768,
              num_hidden_layers=12,
              num_attention_heads=12,
              intermediate_size=3072,
              hidden_act="gelu",
              hidden_dropout_prob=0.1,
              attention_probs_dropout_prob=0.1,
              max_position_embeddings=512,
              type_vocab_size=16,
              initializer_range=0.02,
              pad_token_id=0):
     super(BertModel, self).__init__()
     self.pad_token_id = pad_token_id
     self.initializer_range = initializer_range
     self.embeddings = BertEmbeddings(vocab_size, hidden_size, hidden_dropout_prob, max_position_embeddings,
                                      type_vocab_size)
     encoder_layer = nn.TransformerEncoderLayer(
         hidden_size,
         num_attention_heads,
         intermediate_size,
         dropout=hidden_dropout_prob,
         activation=hidden_act,
         attn_dropout=attention_probs_dropout_prob,
         act_dropout=0)
     self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
     self.pooler = BertPooler(hidden_size)
     self.apply(self.init_weights)
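A hedged sketch (not taken from the source) of how such models typically derive the additive attention mask from pad_token_id before calling self.encoder:

import paddle

def build_attention_mask(input_ids, pad_token_id=0):
    # [batch_size, seq_len] -> [batch_size, 1, 1, seq_len]; padding positions
    # receive a large negative bias and are effectively ignored by attention.
    pad_positions = (input_ids == pad_token_id).astype('float32')
    return pad_positions.unsqueeze([1, 2]) * -1e4

The [batch_size, 1, 1, seq_len] shape broadcasts across attention heads and query positions inside the encoder's multi-head attention.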
Example No. 6
    def __init__(self, vocab_size, embedding_size, hidden_size,
                 num_hidden_layers, num_attention_heads, intermediate_size,
                 hidden_act, hidden_dropout_prob, attention_probs_dropout_prob,
                 max_position_embeddings, type_vocab_size, initializer_range,
                 pad_token_id):
        super(ElectraModel, self).__init__()
        self.pad_token_id = pad_token_id
        self.initializer_range = initializer_range
        self.embeddings = ElectraEmbeddings(vocab_size, embedding_size,
                                            hidden_dropout_prob,
                                            max_position_embeddings,
                                            type_vocab_size)

        if embedding_size != hidden_size:
            self.embeddings_project = nn.Linear(embedding_size, hidden_size)

        encoder_layer = nn.TransformerEncoderLayer(
            hidden_size,
            num_attention_heads,
            intermediate_size,
            dropout=hidden_dropout_prob,
            activation=hidden_act,
            attn_dropout=attention_probs_dropout_prob,
            act_dropout=0)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)

        self.init_weights()
Example No. 7
    def __init__(
            self,
            vocab_size,
            vocab_file,
            hidden_size=768,
            num_hidden_layers=12,
            num_attention_heads=12,
            intermediate_size=3072,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            max_position_embeddings=512,
            type_vocab_size=2,
            initializer_range=0.02,
            pad_token_id=0,
            do_lower_case=True,
            is_split_into_words=False,
            max_seq_len=512, ):
        super(FasterErnieModel, self).__init__()
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the "
                "vocabulary from a pretrained model please use "
                "`model = FasterErnieModel.from_pretrained(PRETRAINED_MODEL_NAME)`"
                .format(vocab_file))
        self.do_lower_case = do_lower_case
        self.vocab = self.load_vocabulary(vocab_file)
        self.max_seq_len = max_seq_len

        self.tokenizer = FasterTokenizer(
            self.vocab,
            do_lower_case=self.do_lower_case,
            is_split_into_words=is_split_into_words)
        self.pad_token_id = pad_token_id
        self.initializer_range = initializer_range
        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
            mean=0.0, std=self.initializer_range))
        self.embeddings = ErnieEmbeddings(
            vocab_size, hidden_size, hidden_dropout_prob,
            max_position_embeddings, type_vocab_size, pad_token_id, weight_attr)
        # Import FusedTransformerEncoderLayer locally so that importing this module
        # does not fail on paddle versions older than 2.2.0.
        # FusedTransformerEncoderLayer is available in paddlepaddle 2.2.0 and later,
        # so ensure the installed version is >= 2.2.0.
        from paddle.incubate.nn import FusedTransformerEncoderLayer
        encoder_layer = FusedTransformerEncoderLayer(
            hidden_size,
            num_attention_heads,
            intermediate_size,
            dropout_rate=hidden_dropout_prob,
            activation=hidden_act,
            attn_dropout_rate=attention_probs_dropout_prob,
            act_dropout_rate=0,
            weight_attr=weight_attr, )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
        self.pooler = ErniePooler(hidden_size, weight_attr)
        self.apply(self.init_weights)
Example No. 8
 def __init__(self):
     super(TestModel, self).__init__()
     encoder_layer = nn.TransformerEncoderLayer(312,
                                                12,
                                                1024,
                                                dropout=0.1,
                                                activation='gelu',
                                                attn_dropout=0.1,
                                                act_dropout=0)
     self.encoder = nn.TransformerEncoder(encoder_layer, 3)
     self.fc = nn.Linear(312, 3)
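A hypothetical usage sketch for this small test model; since its forward method is not shown, the encoder and classifier are called directly:

import paddle

model = TestModel()
features = paddle.rand([2, 16, 312])   # [batch_size, seq_len, d_model]
encoded = model.encoder(features)      # [2, 16, 312]
logits = model.fc(encoded[:, 0])       # classify from the first position -> [2, 3]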
Example No. 9
 def __init__(self,
              vocab_size,
              vocab_file,
              hidden_size=768,
              num_hidden_layers=12,
              num_attention_heads=12,
              intermediate_size=3072,
              hidden_act="gelu",
              hidden_dropout_prob=0.1,
              attention_probs_dropout_prob=0.1,
              max_position_embeddings=512,
              type_vocab_size=2,
              initializer_range=0.02,
              pad_token_id=0,
              do_lower_case=True,
              is_split_into_words=False,
              max_seq_len=128,
              pad_to_max_seq_len=False):
     super(PPMiniLMModel, self).__init__()
     if not os.path.isfile(vocab_file):
         raise ValueError(
             "Can't find a vocabulary file at path '{}'. To load the "
             "vocabulary from a pretrained model please use "
             "`model = PPMiniLMModel.from_pretrained(PRETRAINED_MODEL_NAME)`"
             .format(vocab_file))
     self.vocab = self.load_vocabulary(vocab_file)
     self.do_lower_case = do_lower_case
     self.max_seq_len = max_seq_len
     self.is_split_into_words = is_split_into_words
     self.pad_token_id = pad_token_id
     self.pad_to_max_seq_len = pad_to_max_seq_len
     self.initializer_range = initializer_range
     weight_attr = paddle.ParamAttr(
         initializer=nn.initializer.TruncatedNormal(
             mean=0.0, std=self.initializer_range))
     self.embeddings = PPMiniLMEmbeddings(vocab_size, hidden_size,
                                          hidden_dropout_prob,
                                          max_position_embeddings,
                                          type_vocab_size, pad_token_id,
                                          weight_attr)
     encoder_layer = nn.TransformerEncoderLayer(
         hidden_size,
         num_attention_heads,
         intermediate_size,
         dropout=hidden_dropout_prob,
         activation=hidden_act,
         attn_dropout=attention_probs_dropout_prob,
         act_dropout=0,
         weight_attr=weight_attr,
         normalize_before=False)
     self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
     self.pooler = PPMiniLMPooler(hidden_size, weight_attr)
     self.apply(self.init_weights)
Example No. 10
    def __init__(self,
                 d_model=512,
                 nhead=8,
                 num_encoder_layers=6,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False,
                 weight_attr=None,
                 bias_attr=None):
        """TransformerEncoder"""
        super(TransformerEncoder, self).__init__()

        if isinstance(bias_attr, (list, tuple)):
            if len(bias_attr) == 1:
                encoder_bias_attr = [bias_attr[0]] * 2
            elif len(bias_attr) == 2:
                encoder_bias_attr = bias_attr
            elif len(bias_attr) == 3:
                encoder_bias_attr = [bias_attr[0], bias_attr[-1]]
            else:
                assert False, (
                    "length of bias_attr should be 1 or 2 or 3 when it is a list/tuple"
                )
        else:
            encoder_bias_attr = bias_attr

        if isinstance(weight_attr, (list, tuple)):
            if len(weight_attr) == 1:
                encoder_weight_attr = [weight_attr[0]] * 2
            elif len(weight_attr) == 2:
                encoder_weight_attr = weight_attr
            elif len(weight_attr) == 3:
                encoder_weight_attr = [weight_attr[0], weight_attr[-1]]
            else:
                assert False, (
                    "length of weight_attr should be 1 or 2 or 3 when it is a list/tuple"
                )
        else:
            encoder_weight_attr = weight_attr

        encoder_layer = nn.TransformerEncoderLayer(
            d_model, nhead, dim_feedforward, dropout, activation, attn_dropout,
            act_dropout, normalize_before, encoder_weight_attr,
            encoder_bias_attr)
        encoder_norm = nn.LayerNorm(d_model)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers,
                                             encoder_norm)

        self.d_model = d_model
        self.nhead = nhead
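A brief usage sketch, assuming the wrapper is used through its self.encoder attribute; a one- or two-element bias_attr list is normalized to the two-element form nn.TransformerEncoderLayer expects (self-attention sublayer first, feed-forward sublayer second):

import paddle

enc = TransformerEncoder(d_model=256, nhead=4, num_encoder_layers=2,
                         bias_attr=[False, True])   # no attention bias, keep FFN bias
src = paddle.rand([2, 10, 256])                     # [batch_size, seq_len, d_model]
out = enc.encoder(src)                              # [2, 10, 256]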
Example No. 11
    def __init__(self,
                 vocab_size,
                 embedding_size=128,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=16,
                 initializer_range=0.02,
                 pad_token_id=0,
                 use_content_summary=True,
                 content_summary_index=1,
                 cls_num=2):
        super(ErnieCtmModel, self).__init__()

        self.pad_token_id = pad_token_id
        self.content_summary_index = content_summary_index
        self.initializer_range = initializer_range
        self.embeddings = ErnieCtmEmbeddings(
            vocab_size,
            embedding_size,
            hidden_dropout_prob=hidden_dropout_prob,
            max_position_embeddings=max_position_embeddings,
            type_vocab_size=type_vocab_size,
            padding_idx=pad_token_id,
            cls_num=cls_num)
        self.embedding_hidden_mapping_in = nn.Linear(embedding_size,
                                                     hidden_size)
        encoder_layer = nn.TransformerEncoderLayer(
            hidden_size,
            num_attention_heads,
            intermediate_size,
            dropout=hidden_dropout_prob,
            activation="gelu",
            attn_dropout=attention_probs_dropout_prob,
            act_dropout=0)
        encoder_layer.activation = nn.GELU(approximate=True)

        self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
        self.pooler = ErnieCtmPooler(hidden_size)

        self.use_content_summary = use_content_summary
        self.content_summary_index = content_summary_index
        if use_content_summary is True:
            self.feature_fuse = nn.Linear(hidden_size * 2, intermediate_size)
            self.feature_output = nn.Linear(intermediate_size, hidden_size)

        self.apply(self.init_weights)
Example No. 12
    def __init__(
        self,
        vocab_size,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act='relu',
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        normalize_before=False,
        max_position_embeddings=513,
        type_vocab_size=4,
        initializer_range=0.02,
        unk_token_id=17963,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=3,
        mask_token_id=3,
    ):
        super(UNIMOModel, self).__init__()
        self.unk_token_id = unk_token_id
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.mask_token_id = mask_token_id
        self.initializer_range = initializer_range

        self.embeddings = UNIMOEmbeddings(vocab_size, hidden_size,
                                          hidden_dropout_prob,
                                          max_position_embeddings,
                                          type_vocab_size)
        encoder_layer = nn.TransformerEncoderLayer(
            hidden_size,
            num_attention_heads,
            intermediate_size,
            dropout=hidden_dropout_prob,
            activation=hidden_act,
            attn_dropout=attention_probs_dropout_prob,
            act_dropout=0,
            normalize_before=normalize_before)

        self.encoder_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(hidden_dropout_prob)
        self.encoder = nn.TransformerEncoder(
            encoder_layer,
            num_hidden_layers,
        )

        self.apply(self.init_weights)
Example No. 13
 def __init__(
     self,
     vocab_size=23236,
     hidden_size=768,
     num_hidden_layers=12,
     num_attention_heads=12,
     intermediate_size=3072,
     hidden_act="gelu",
     hidden_dropout_prob=0.1,
     attention_probs_dropout_prob=0.1,
     max_position_embeddings=512,
     type_vocab_size=2,
     initializer_range=0.02,
     pad_token_id=0,
     pool_act="tanh",
     layer_norm_eps=1e-12,
     glyph_embedding_dim=1728,
     pinyin_map_len=32,
 ):
     super(ChineseBertModel, self).__init__()
     self.pad_token_id = pad_token_id
     self.layer_norm_eps = layer_norm_eps
     self.initializer_range = initializer_range
     self.embeddings = FusionBertEmbeddings(
         vocab_size,
         hidden_size,
         pad_token_id,
         type_vocab_size,
         max_position_embeddings,
         pinyin_map_len,
         glyph_embedding_dim,
         layer_norm_eps,
         hidden_dropout_prob,
     )
     encoder_layer = nn.TransformerEncoderLayer(
         hidden_size,
         num_attention_heads,
         intermediate_size,
         dropout=hidden_dropout_prob,
         activation=hidden_act,
         attn_dropout=attention_probs_dropout_prob,
         act_dropout=0,
     )
     self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
     self.pooler = BertPooler(hidden_size, pool_act)
     self.apply(self.init_weights)
Example No. 14
    def __init__(self,
                 vocab_size,
                 embed_tokens=None,
                 pad_token_id=0,
                 d_model=1280,
                 num_encoder_layers=2,
                 encoder_attention_heads=32,
                 encoder_ffn_dim=5120,
                 dropout=0.1,
                 activation_function='gelu',
                 attention_dropout=0.0,
                 activation_dropout=0.0,
                 max_position_embeddings=128,
                 init_std=0.02,
                 scale_embedding=True,
                 normalize_before=True):
        super().__init__()
        self.init_std = init_std
        self.pad_token_id = pad_token_id
        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = nn.Embedding(num_embeddings=vocab_size,
                                             embedding_dim=d_model,
                                             padding_idx=pad_token_id)
        self.embed_scale = math.sqrt(d_model) if scale_embedding else 1.0
        self.encoder_embed_positions = BlenderbotLearnedPositionalEmbedding(
            num_embeddings=max_position_embeddings, embedding_dim=d_model)

        self.encoder_dropout = nn.Dropout(dropout)
        self.encoder_layernorm = nn.LayerNorm(normalized_shape=d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=encoder_attention_heads,
            dim_feedforward=encoder_ffn_dim,
            dropout=dropout,
            activation=activation_function,
            attn_dropout=attention_dropout,
            act_dropout=activation_dropout,
            normalize_before=normalize_before)
        self.encoder = nn.TransformerEncoder(encoder_layer=encoder_layer,
                                             num_layers=num_encoder_layers)

        self.apply(self.init_weights)
Example No. 15
    def __init__(self,
                 vocab_size,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 normalize_before=True,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 initializer_range=0.02,
                 unk_token_id=0,
                 pad_token_id=0,
                 bos_token_id=1,
                 eos_token_id=2,
                 mask_token_id=30000,
                 role_type_size=None):
        super(UnifiedTransformerModel, self).__init__()
        self.unk_token_id = unk_token_id
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.mask_token_id = mask_token_id
        self.initializer_range = initializer_range

        self.embeddings = UnifiedTransformerEmbeddings(
            vocab_size, hidden_size, hidden_dropout_prob,
            max_position_embeddings, type_vocab_size, role_type_size)
        encoder_layer = nn.TransformerEncoderLayer(
            hidden_size,
            num_attention_heads,
            intermediate_size,
            dropout=hidden_dropout_prob,
            activation=hidden_act,
            attn_dropout=attention_probs_dropout_prob,
            act_dropout=0,
            normalize_before=normalize_before)
        encoder_norm = nn.LayerNorm(hidden_size)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers,
                                             encoder_norm)
        self.apply(self.init_weights)
Example No. 16
 def __init__(
     self,
     vocab_size,
     embedding_size=768,
     hidden_size=768,
     num_hidden_layers=12,
     num_attention_heads=12,
     intermediate_size=3072,
     hidden_act="gelu",
     hidden_dropout_prob=0.1,
     attention_probs_dropout_prob=0.1,
     max_position_embeddings=1536,
     type_vocab_size=2,
     initializer_range=0.02,
     pad_token_id=0,
     pool_act="tanh",
     rotary_value=False,
 ):
     super(RoFormerModel, self).__init__()
     self.pad_token_id = pad_token_id
     self.initializer_range = initializer_range
     if embedding_size != hidden_size:
         self.embeddings_project = nn.Linear(embedding_size, hidden_size)
     self.embeddings = RoFormerEmbeddings(
         vocab_size,
         embedding_size,
         hidden_dropout_prob,
         type_vocab_size,
     )
     encoder_layer = TransformerEncoderLayerWithRotary(
         hidden_size,
         num_attention_heads,
         intermediate_size,
         dropout=hidden_dropout_prob,
         activation=hidden_act,
         attn_dropout=attention_probs_dropout_prob,
         act_dropout=0,
         rotary_value=rotary_value,
     )
     self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
     self.pooler = RoFormerPooler(hidden_size, pool_act)
     self.apply(self.init_weights)
Example No. 17
    def __init__(self,
                 vocab_size,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=16,
                 initializer_range=0.02,
                 pad_token_id=0,
                 fit_size=768):
        super(TinyBertModel, self).__init__()
        self.pad_token_id = pad_token_id
        self.initializer_range = initializer_range
        self.embeddings = BertEmbeddings(vocab_size, hidden_size,
                                         hidden_dropout_prob,
                                         max_position_embeddings,
                                         type_vocab_size)

        encoder_layer = nn.TransformerEncoderLayer(
            hidden_size,
            num_attention_heads,
            intermediate_size,
            dropout=hidden_dropout_prob,
            activation=hidden_act,
            attn_dropout=attention_probs_dropout_prob,
            act_dropout=0)

        self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
        self.pooler = BertPooler(hidden_size)
        # `fit_dense`(s) project the student's hidden states to the teacher's hidden size.
        # `fit_denses` is used by the v2 model; `fit_dense` is used by the other pretrained models.
        self.fit_denses = nn.LayerList([
            nn.Linear(hidden_size, fit_size)
            for i in range(num_hidden_layers + 1)
        ])
        self.fit_dense = nn.Linear(hidden_size, fit_size)
        self.apply(self.init_weights)
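An illustrative distillation step (an assumption about how fit_dense is used, not code from the library): each student hidden state is projected to the teacher's hidden size before computing an MSE loss.

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

# Hypothetical widths: a 312-wide student distilled toward a 768-wide teacher.
fit_dense = nn.Linear(312, 768)
student_hidden = paddle.rand([2, 16, 312])
teacher_hidden = paddle.rand([2, 16, 768])
loss = F.mse_loss(fit_dense(student_hidden), teacher_hidden)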
Example No. 18
    def __init__(
        self,
        vocab_size,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        layer_norm_eps=1e-12,
        max_position_embeddings=512,
        max_2d_position_embeddings=1024,
        type_vocab_size=16,
        initializer_range=0.02,
        pad_token_id=0,
        pool_act="tanh",
    ):
        super(LayoutLMModel, self).__init__()
        #self.config = kwargs
        self.num_hidden_layers = num_hidden_layers
        self.pad_token_id = pad_token_id
        self.initializer_range = initializer_range
        self.embeddings = LayoutLMEmbeddings(vocab_size, hidden_size,
                                             hidden_dropout_prob,
                                             max_position_embeddings,
                                             max_2d_position_embeddings,
                                             layer_norm_eps, pad_token_id,
                                             type_vocab_size)

        encoder_layer = nn.TransformerEncoderLayer(
            hidden_size,
            num_attention_heads,
            intermediate_size,
            dropout=hidden_dropout_prob,
            activation=hidden_act,
            attn_dropout=attention_probs_dropout_prob,
            act_dropout=0)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
        self.pooler = LayoutLMPooler(hidden_size, pool_act)
        self.apply(self.init_weights)
Example No. 19
    def __init__(self,
                 embed_tokens,
                 vocab_size,
                 pad_token_id=1,
                 d_model=768,
                 num_encoder_layers=6,
                 encoder_attention_heads=12,
                 encoder_ffn_dim=3072,
                 dropout=0.1,
                 activation_function='gelu',
                 attention_dropout=0.1,
                 activation_dropout=0.1,
                 max_position_embeddings=1024,
                 init_std=0.02):
        super().__init__()
        self.d_model = d_model
        self.init_std = init_std
        self.pad_token_id = pad_token_id
        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = nn.Embedding(vocab_size, d_model, pad_token_id)

        self.encoder_embed_positions = MBartLearnedPositionalEmbedding(
            max_position_embeddings, d_model, pad_token_id)

        self.encoder_dropout = nn.Dropout(dropout)
        self.encoder_layernorm_embedding = nn.LayerNorm(d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=encoder_attention_heads,
            dim_feedforward=encoder_ffn_dim,
            dropout=dropout,
            activation=activation_function,
            attn_dropout=attention_dropout,
            act_dropout=activation_dropout,
            normalize_before=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers,
                                             nn.LayerNorm(d_model))
        self.apply(self.init_weights)
Example No. 20
    def __init__(self, vocab_size, type_size, max_position_seq_len, num_layers,
                 n_head, hidden_size, attn_dropout, act_dropout):
        super(NSP, self).__init__()

        self.n_head = n_head
        self.hidden_size = hidden_size

        self.word_embedding_layer = nn.Embedding(vocab_size, hidden_size)
        self.sent_embedding_layer = nn.Embedding(type_size, hidden_size)
        self.pos_embedding_layer = nn.Embedding(max_position_seq_len,
                                                hidden_size)

        encoder_layer = nn.TransformerEncoderLayer(
            hidden_size, n_head, hidden_size * 4, dropout=act_dropout,
            activation='gelu', attn_dropout=attn_dropout, act_dropout=act_dropout,
            normalize_before=True)
        encoder_norm = nn.LayerNorm(hidden_size)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers,
                                             encoder_norm)
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 2)

        self.dropout_layer = nn.Dropout(act_dropout)
        self.tanh_layer = nn.Tanh()
        self.softmax = nn.Softmax()
Example No. 21
 def __init__(self,
              vocab_size,
              hidden_size=768,
              num_hidden_layers=12,
              num_attention_heads=12,
              intermediate_size=3072,
              hidden_act="gelu",
              hidden_dropout_prob=0.1,
              attention_probs_dropout_prob=0.1,
              max_position_embeddings=512,
              type_vocab_size=16,
              initializer_range=0.02,
              pad_token_id=0,
              num_partitions=1):
     super(BertModel, self).__init__()
     self.pad_token_id = pad_token_id
     self.initializer_range = initializer_range
     self.embeddings = BertEmbeddings(
         vocab_size, hidden_size, hidden_dropout_prob,
         max_position_embeddings, type_vocab_size, num_partitions)
      # NOTE: `paddle.fluid.initializer` is the legacy API; `nn.initializer.Constant`
      # is the current equivalent.
      self.weight_attr = paddle.ParamAttr(
          initializer=paddle.fluid.initializer.ConstantInitializer(value=0.01))
     encoder_layer = TransformerEncoderLayer(
         hidden_size,
         num_attention_heads,
         intermediate_size,
         dropout=hidden_dropout_prob,
         activation=hidden_act,
         attn_dropout=attention_probs_dropout_prob,
         act_dropout=0,
         weight_attr=self.weight_attr,
         bias_attr=False,
         num_partitions=num_partitions)
     self.num_partitions = num_partitions
     self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers)
     self.pooler = BertPooler(hidden_size)
Example No. 22
    def __init__(self,
                 src_vocab_size,
                 trg_vocab_size,
                 max_length=256,
                 n_layer=6,
                 n_head=8,
                 d_model=512,
                 d_inner_hid=2048,
                 dropout=0.1,
                 weight_sharing=False,
                 bos_id=0,
                 eos_id=1,
                 waitk=-1):
        super(SimultaneousTransformer, self).__init__()
        self.trg_vocab_size = trg_vocab_size
        self.emb_dim = d_model
        self.bos_id = bos_id
        self.eos_id = eos_id
        self.dropout = dropout
        self.waitk = waitk
        self.n_layer = n_layer
        self.n_head = n_head
        self.d_model = d_model

        self.src_word_embedding = WordEmbedding(
            vocab_size=src_vocab_size, emb_dim=d_model, bos_id=self.bos_id)
        self.src_pos_embedding = PositionalEmbedding(
            emb_dim=d_model, max_length=max_length+1)
        if weight_sharing:
            assert src_vocab_size == trg_vocab_size, (
                "Vocabularies in source and target should be same for weight sharing."
            )
            self.trg_word_embedding = self.src_word_embedding
            self.trg_pos_embedding = self.src_pos_embedding
        else:
            self.trg_word_embedding = WordEmbedding(
                vocab_size=trg_vocab_size, emb_dim=d_model, bos_id=self.bos_id)
            self.trg_pos_embedding = PositionalEmbedding(
                emb_dim=d_model, max_length=max_length+1)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_head,
            dim_feedforward=d_inner_hid,
            dropout=dropout,
            activation='relu',
            normalize_before=True,
            bias_attr=[False, True])
        encoder_norm = nn.LayerNorm(d_model)
        self.encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layer, num_layers=n_layer, norm=encoder_norm)

        decoder_layer = DecoderLayer(
            d_model=d_model,
            nhead=n_head,
            dim_feedforward=d_inner_hid,
            dropout=dropout,
            activation='relu',
            normalize_before=True,
            bias_attr=[False, False, True])
        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = Decoder(
            decoder_layer=decoder_layer, num_layers=n_layer, norm=decoder_norm)

        if weight_sharing:
            self.linear = lambda x: paddle.matmul(
                x=x, y=self.trg_word_embedding.word_embedding.weight, transpose_y=True)
        else:
            self.linear = nn.Linear(
                in_features=d_model,
                out_features=trg_vocab_size,
                bias_attr=False)
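When weight_sharing is enabled, the output projection reuses the target word-embedding matrix instead of a separate nn.Linear; a hedged illustration of the equivalent computation with made-up shapes:

import paddle

hidden = paddle.rand([2, 10, 512])       # decoder output, d_model=512
emb_weight = paddle.rand([32000, 512])   # trg word_embedding.weight (vocab=32000)
logits = paddle.matmul(hidden, emb_weight, transpose_y=True)   # [2, 10, 32000]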