Example No. 1
    def __init__(self, args):
        super(TransformerLayer, self).__init__()

        self.layernorm_positioning = args.layernorm_positioning

        if hasattr(args, "attention_head_size"):
            attention_head_size = args.attention_head_size
        else:
            attention_head_size = args.hidden_size // args.heads_num

        has_bias = bool(1 - args.remove_transformer_bias)

        # Multi-headed self-attention.
        self.self_attn = MultiHeadedAttention(args.hidden_size,
                                              args.heads_num,
                                              attention_head_size,
                                              args.dropout,
                                              has_bias=has_bias)
        self.dropout_1 = nn.Dropout(args.dropout)
        self.layer_norm_1 = LayerNorm(args.hidden_size, has_bias=has_bias)
        # Feed forward layer.
        if args.feed_forward == "gated":
            self.feed_forward = GatedFeedForward(args.hidden_size,
                                                 args.feedforward_size,
                                                 args.hidden_act, has_bias)
        else:
            self.feed_forward = PositionwiseFeedForward(
                args.hidden_size, args.feedforward_size, args.hidden_act,
                has_bias)
        self.dropout_2 = nn.Dropout(args.dropout)
        self.layer_norm_2 = LayerNorm(args.hidden_size, has_bias=has_bias)
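
A layer like this only needs an attribute-style args object. The following is a minimal usage sketch, not part of the original snippet: the hyperparameter values and the SimpleNamespace container are illustrative, and it assumes TransformerLayer and its dependencies are importable from the surrounding package.

from types import SimpleNamespace

# Illustrative hyperparameters; any object exposing these attributes works,
# because the constructor only reads plain attributes such as args.hidden_size.
args = SimpleNamespace(
    layernorm_positioning="post",
    hidden_size=768,
    heads_num=12,
    dropout=0.1,
    feed_forward="dense",       # any value other than "gated" selects PositionwiseFeedForward
    feedforward_size=3072,
    hidden_act="gelu",
    remove_transformer_bias=0,  # bool(1 - 0) -> has_bias is True
)
layer = TransformerLayer(args)  # assumes the class above is importable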
Example No. 2
    def __init__(self, args):
        super(GptBlock, self).__init__()

        # Multi-headed self-attention.
        self.self_attn = MultiHeadedAttention(args.hidden_size, args.heads_num,
                                              args.dropout)
        self.layer_norm_1 = LayerNorm(args.hidden_size)
        # Feed forward layer.
        self.feed_forward = PositionwiseFeedForward(args.hidden_size,
                                                    args.feedforward_size,
                                                    args.hidden_act)
        self.layer_norm_2 = LayerNorm(args.hidden_size)
Example No. 3
    def __init__(self, args):
        super(TransformerDecoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(args.hidden_size, args.heads_num,
                                              args.dropout)
        self.context_attn = MultiHeadedAttention(args.hidden_size,
                                                 args.heads_num, args.dropout)
        self.layer_norm_1 = LayerNorm(args.hidden_size)
        self.layer_norm_2 = LayerNorm(args.hidden_size)
        self.layer_norm_3 = LayerNorm(args.hidden_size)
        self.feed_forward = PositionwiseFeedForward(args.hidden_size,
                                                    args.feedforward_size,
                                                    args.hidden_act)
Example No. 4
    def __init__(self, args):
        super(ISynthesizer, self).__init__()

        self.att = None
        self.dropout_1 = nn.Dropout(args.dropout)
        self.layer_norm_1 = LayerNorm(args.hidden_size)
        # Feed forward layer.
        self.feed_forward = PositionwiseFeedForward(args.hidden_size,
                                                    args.feedforward_size)
        self.dropout_2 = nn.Dropout(args.dropout)
        self.layer_norm_2 = LayerNorm(args.hidden_size)

        if self.__class__.__name__ == 'ISynthesizer':
            raise Exception("ISynthesizer cannot be instantiated.")
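
The final check makes ISynthesizer behave like an abstract base class: constructing it directly raises, while subclasses pass through because their __class__.__name__ differs. Below is a hedged, standalone illustration of the same guard pattern; the Base and DenseSynthesizer names are hypothetical and not taken from the snippet.

import torch.nn as nn

class Base(nn.Module):
    def __init__(self):
        super(Base, self).__init__()
        # Same guard as above: refuse direct construction of the base class.
        if self.__class__.__name__ == "Base":
            raise Exception("Base cannot be instantiated.")

class DenseSynthesizer(Base):   # hypothetical subclass, for illustration only
    def __init__(self):
        super(DenseSynthesizer, self).__init__()

DenseSynthesizer()              # fine: __class__.__name__ is "DenseSynthesizer"
# Base()                        # would raise the exception above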
Example No. 5
    def __init__(self, args):
        super(RelationAwareTransformerLayer, self).__init__()

        # Multi-headed self-attention.
        self.self_attn = RelationAwareMultiHeadedAttention(
            args.hidden_size, args.heads_num, args.dropout
        )
        self.dropout_1 = nn.Dropout(args.dropout)
        self.layer_norm_1 = LayerNorm(args.hidden_size)
        # Feed forward layer.
        self.feed_forward = PositionwiseFeedForward(
            args.hidden_size, args.feedforward_size
        )
        self.dropout_2 = nn.Dropout(args.dropout)
        self.layer_norm_2 = LayerNorm(args.hidden_size)
Example No. 6
    def __init__(self, args):
        super(TransformerDecoder, self).__init__()
        self.layers_num = args.layers_num
        self.layernorm_positioning = args.layernorm_positioning
        self.relative_position_embedding = args.relative_position_embedding
        self.transformer_decoder = nn.ModuleList(
            [TransformerDecoderLayer(args) for _ in range(self.layers_num)]
        )
        if "deepspeed_checkpoint_activations" in args:
            self.deepspeed_checkpoint_activations = args.deepspeed_checkpoint_activations
            self.deepspeed_checkpoint_layers_num = args.deepspeed_checkpoint_layers_num
        else:
            self.deepspeed_checkpoint_activations = False

        has_bias = bool(1 - args.remove_transformer_bias)

        if self.layernorm_positioning == "pre":
            if args.layernorm == "t5":
                self.layer_norm = T5LayerNorm(args.hidden_size)
            else:
                self.layer_norm = LayerNorm(args.hidden_size)

        if self.relative_position_embedding:
            self.self_pos_emb = RelativePositionEmbedding(bidirectional=False, heads_num=args.heads_num,
                                                          num_buckets=args.relative_attention_buckets_num)
Example No. 7
    def __init__(self, args):
        super(TransformerEncoder, self).__init__()
        self.mask = args.mask
        self.layers_num = args.layers_num
        self.parameter_sharing = args.parameter_sharing
        self.factorized_embedding_parameterization = args.factorized_embedding_parameterization
        self.layernorm_positioning = args.layernorm_positioning
        self.relative_position_embedding = args.relative_position_embedding

        has_bias = bool(1 - args.remove_transformer_bias)

        if self.factorized_embedding_parameterization:
            self.linear = nn.Linear(args.emb_size, args.hidden_size)

        if self.parameter_sharing:
            self.transformer = TransformerLayer(args)
        else:
            self.transformer = nn.ModuleList(
                [TransformerLayer(args) for _ in range(self.layers_num)])
        if self.layernorm_positioning == "pre":
            self.layer_norm = LayerNorm(args.hidden_size, has_bias=has_bias)

        if self.relative_position_embedding:
            self.relative_pos_emb = RelativePositionEmbedding(
                bidirectional=True, heads_num=args.heads_num)
Example No. 8
    def __init__(self, args):
        super(TransformerEncoder, self).__init__()
        self.mask = args.mask
        self.layers_num = args.layers_num
        self.parameter_sharing = args.parameter_sharing
        self.factorized_embedding_parameterization = args.factorized_embedding_parameterization
        self.layernorm_positioning = args.layernorm_positioning
        self.relative_position_embedding = args.relative_position_embedding
        self.has_residual_attention = args.has_residual_attention
        if "deepspeed_checkpoint_activations" in args:
            self.deepspeed_checkpoint_activations = args.deepspeed_checkpoint_activations
            self.deepspeed_checkpoint_layers_num = args.deepspeed_checkpoint_layers_num
        else:
            self.deepspeed_checkpoint_activations = False

        has_bias = bool(1 - args.remove_transformer_bias)

        if self.factorized_embedding_parameterization:
            self.linear = nn.Linear(args.emb_size, args.hidden_size)

        if self.parameter_sharing:
            self.transformer = TransformerLayer(args)
        else:
            self.transformer = nn.ModuleList(
                [TransformerLayer(args) for _ in range(self.layers_num)]
            )
        if self.layernorm_positioning == "pre":
            if args.layernorm == "t5":
                self.layer_norm = T5LayerNorm(args.hidden_size)
            else:
                self.layer_norm = LayerNorm(args.hidden_size)

        if self.relative_position_embedding:
            self.relative_pos_emb = RelativePositionEmbedding(bidirectional=True, heads_num=args.heads_num,
                                                              num_buckets=args.relative_attention_buckets_num)
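
The "deepspeed_checkpoint_activations" in args test works because argparse.Namespace implements membership testing over its attributes. A small standalone check, unrelated to the repository itself:

from argparse import Namespace

args = Namespace(deepspeed_checkpoint_activations=True,
                 deepspeed_checkpoint_layers_num=1)
print("deepspeed_checkpoint_activations" in args)  # True
print("relative_position_embedding" in args)       # False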
Example No. 9
    def __init__(self, args, vocab_size):
        super(WordEmbedding, self).__init__()
        self.remove_embedding_layernorm = args.remove_embedding_layernorm
        self.dropout = nn.Dropout(args.dropout)
        self.word_embedding = nn.Embedding(vocab_size, args.emb_size)
        if not self.remove_embedding_layernorm:
            self.layer_norm = LayerNorm(args.emb_size)
Example No. 10
    def __init__(self, args, vocab_size):
        super(TabEmbedding, self).__init__()
        self.dropout = nn.Dropout(args.dropout)
        self.max_length = 512
        self.word_embedding = nn.Embedding(vocab_size, args.emb_size)
        self.segment_embedding = nn.Embedding(3, args.emb_size)
        self.position_embedding = nn.Embedding(self.max_length, args.emb_size)
        self.layer_norm = LayerNorm(args.emb_size)
Example No. 11
    def __init__(self, args):
        super(TransformerDecoder, self).__init__()
        self.layers_num = args.layers_num
        self.layernorm_positioning = args.layernorm_positioning
        self.transformer_decoder = nn.ModuleList(
            [TransformerDecoderLayer(args) for _ in range(self.layers_num)])
        if self.layernorm_positioning == "pre":
            self.layer_norm = LayerNorm(args.hidden_size)
Example No. 12
    def __init__(self, args, vocab_size):
        super(WordPosEmbedding, self).__init__()
        self.remove_embedding_layernorm = args.remove_embedding_layernorm
        self.dropout = nn.Dropout(args.dropout)
        self.max_seq_length = args.max_seq_length
        self.word_embedding = nn.Embedding(vocab_size, args.emb_size)
        self.position_embedding = nn.Embedding(self.max_seq_length, args.emb_size)
        if not self.remove_embedding_layernorm:
            self.layer_norm = LayerNorm(args.emb_size)
Example No. 13
    def __init__(self, args, vocab_size):
        super(MlmTarget, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = args.hidden_size
        self.emb_size = args.emb_size
        self.factorized_embedding_parameterization = args.factorized_embedding_parameterization

        if self.factorized_embedding_parameterization:
            self.mlm_linear_1 = nn.Linear(args.hidden_size, args.emb_size)
            self.layer_norm = LayerNorm(args.emb_size)
            self.mlm_linear_2 = nn.Linear(args.emb_size, self.vocab_size)
        else:
            self.mlm_linear_1 = nn.Linear(args.hidden_size, args.hidden_size)
            self.layer_norm = LayerNorm(args.hidden_size)
            self.mlm_linear_2 = nn.Linear(args.hidden_size, self.vocab_size)

        self.softmax = nn.LogSoftmax(dim=-1)

        self.criterion = nn.NLLLoss()
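
The LogSoftmax plus NLLLoss pair used here is equivalent to cross-entropy over the vocabulary logits. A toy, self-contained illustration of that loss computation (shapes and values are made up and unrelated to the snippet's tensors):

import torch
import torch.nn as nn

vocab_size = 10
logits = torch.randn(4, vocab_size)      # 4 masked positions, toy vocabulary
targets = torch.tensor([1, 3, 5, 7])     # toy gold token ids

log_probs = nn.LogSoftmax(dim=-1)(logits)
loss = nn.NLLLoss()(log_probs, targets)  # same value as nn.CrossEntropyLoss()(logits, targets)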
Example No. 14
    def __init__(self, args, vocab_size):
        super(WordPosSegEmbedding, self).__init__()
        self.remove_embedding_layernorm = args.remove_embedding_layernorm
        self.dropout = nn.Dropout(args.dropout)
        self.max_length = 512
        self.word_embedding = nn.Embedding(vocab_size, args.emb_size)
        self.position_embedding = nn.Embedding(self.max_length, args.emb_size)
        self.segment_embedding = nn.Embedding(3, args.emb_size)
        has_bias = bool(1 - args.remove_embedding_layernorm_bias)
        if not self.remove_embedding_layernorm:
            self.layer_norm = LayerNorm(args.emb_size, has_bias=has_bias)
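
The constructor above only declares the three embedding tables; how they are combined is not shown here. A common pattern, sketched below with plain PyTorch modules rather than the repository's own classes, is to sum the word, position, and segment embeddings and then apply layer norm and dropout. Everything in this sketch (class name, sizes, dropout rate) is hypothetical.

import torch
import torch.nn as nn

class ToyWordPosSegEmbedding(nn.Module):
    # Illustrative module: sum three embedding tables, then normalize and drop out.
    def __init__(self, vocab_size, emb_size, max_length=512):
        super(ToyWordPosSegEmbedding, self).__init__()
        self.word_embedding = nn.Embedding(vocab_size, emb_size)
        self.position_embedding = nn.Embedding(max_length, emb_size)
        self.segment_embedding = nn.Embedding(3, emb_size)
        self.layer_norm = nn.LayerNorm(emb_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, src, seg):
        positions = torch.arange(src.size(1), device=src.device).unsqueeze(0)
        emb = (self.word_embedding(src)
               + self.position_embedding(positions)
               + self.segment_embedding(seg))
        return self.dropout(self.layer_norm(emb))

emb = ToyWordPosSegEmbedding(vocab_size=100, emb_size=32)
out = emb(torch.randint(0, 100, (2, 8)), torch.ones(2, 8, dtype=torch.long))
print(out.shape)  # torch.Size([2, 8, 32])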
Example No. 15
    def __init__(self, args, vocab_size):
        super(MlmTarget, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = args.hidden_size

        self.mlm_linear_1 = nn.Linear(args.hidden_size, args.hidden_size)
        self.layer_norm = LayerNorm(args.hidden_size)
        self.mlm_linear_2 = nn.Linear(args.hidden_size, self.vocab_size)

        self.softmax = nn.LogSoftmax(dim=-1)

        self.criterion = nn.NLLLoss()
Example No. 16
    def __init__(self, args):
        super(TransformerDecoder, self).__init__()
        self.layers_num = args.layers_num
        self.layernorm_positioning = args.layernorm_positioning
        self.relative_position_embedding = args.relative_position_embedding
        self.transformer_decoder = nn.ModuleList(
            [TransformerDecoderLayer(args) for _ in range(self.layers_num)])
        if self.layernorm_positioning == "pre":
            self.layer_norm = LayerNorm(args.hidden_size)

        if self.relative_position_embedding:
            self.relative_pos_emb = RelativePositionEmbedding(
                bidirectional=False, heads_num=args.heads_num)
Example No. 17
    def __init__(self, args, vocab_size):
        super(BertTarget, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = args.hidden_size
        self.factorized_embedding_parameterization = args.factorized_embedding_parameterization
        self.act = str2act[args.hidden_act]

        # MLM.
        if self.factorized_embedding_parameterization:
            self.mlm_linear_1 = nn.Linear(args.hidden_size, args.emb_size)
            self.layer_norm = LayerNorm(args.emb_size)
            self.mlm_linear_2 = nn.Linear(args.emb_size, self.vocab_size)
        else:
            self.mlm_linear_1 = nn.Linear(args.hidden_size, args.hidden_size)
            self.layer_norm = LayerNorm(args.hidden_size)
            self.mlm_linear_2 = nn.Linear(args.hidden_size, self.vocab_size)

        # NSP.
        self.nsp_linear_1 = nn.Linear(args.hidden_size, args.hidden_size)
        self.nsp_linear_2 = nn.Linear(args.hidden_size, 2)
        self.softmax = nn.LogSoftmax(dim=-1)

        self.criterion = nn.NLLLoss()
Example No. 18
    def __init__(self, args):
        super(TransformerDecoderLayer, self).__init__()

        self.layernorm_positioning = args.layernorm_positioning

        # Multi-headed self-attention.
        self.self_attn = MultiHeadedAttention(args.hidden_size, args.heads_num,
                                              args.dropout)
        self.dropout_1 = nn.Dropout(args.dropout)
        self.layer_norm_1 = LayerNorm(args.hidden_size)

        # Multi-headed context-attention.
        self.context_attn = MultiHeadedAttention(args.hidden_size,
                                                 args.heads_num, args.dropout)
        self.dropout_2 = nn.Dropout(args.dropout)
        self.layer_norm_2 = LayerNorm(args.hidden_size)

        # Feed forward layer.
        self.feed_forward = PositionwiseFeedForward(args.hidden_size,
                                                    args.feedforward_size,
                                                    args.hidden_act)
        self.dropout_3 = nn.Dropout(args.dropout)
        self.layer_norm_3 = LayerNorm(args.hidden_size)
Example No. 19
    def __init__(self, args):
        super(TransformerEncoder, self).__init__()
        self.mask = args.mask
        self.layers_num = args.layers_num
        self.parameter_sharing = args.parameter_sharing
        self.factorized_embedding_parameterization = args.factorized_embedding_parameterization
        self.layernorm_positioning = args.layernorm_positioning

        if self.factorized_embedding_parameterization:
            self.linear = nn.Linear(args.emb_size, args.hidden_size)

        if self.parameter_sharing:
            self.transformer = TransformerLayer(args)
        else:
            self.transformer = nn.ModuleList(
                [TransformerLayer(args) for _ in range(self.layers_num)])
        if self.layernorm_positioning == "pre":
            self.layer_norm = LayerNorm(args.hidden_size)
Example No. 20
    def __init__(self, args, vocab_size):
        super(AlbertTarget, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = args.hidden_size
        self.emb_size = args.emb_size
        self.act = str2act[args.hidden_act]

        # MLM.
        self.mlm_linear_1 = nn.Linear(args.hidden_size, args.emb_size)
        self.layer_norm = LayerNorm(args.emb_size)
        self.mlm_linear_2 = nn.Linear(args.emb_size, self.vocab_size)

        # SOP.
        self.sop_linear_1 = nn.Linear(args.hidden_size, args.hidden_size)
        self.sop_linear_2 = nn.Linear(args.hidden_size, 2)
        self.softmax = nn.LogSoftmax(dim=-1)

        self.criterion = nn.NLLLoss()
Example No. 21
    def __init__(self, args):
        super(Gpt2Encoder, self).__init__()
        self.layers_num = args.layers_num
        self.block = nn.ModuleList(
            [GptBlock(args) for _ in range(self.layers_num)])
        self.layer_norm = LayerNorm(args.hidden_size)
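
All of the encoders and decoders above hold their blocks in an nn.ModuleList; applying them is then a plain Python loop in forward. The sketch below shows only that generic pattern with standard PyTorch layers; it is not Gpt2Encoder's actual forward method, and the ToyStack class and its sizes are hypothetical.

import torch
import torch.nn as nn

class ToyStack(nn.Module):
    # Generic pattern only: loop over a ModuleList, then apply a final layer norm.
    def __init__(self, layers_num, hidden_size):
        super(ToyStack, self).__init__()
        self.block = nn.ModuleList(
            [nn.Linear(hidden_size, hidden_size) for _ in range(layers_num)])
        self.layer_norm = nn.LayerNorm(hidden_size)

    def forward(self, hidden):
        for layer in self.block:
            hidden = torch.relu(layer(hidden))
        return self.layer_norm(hidden)

stack = ToyStack(layers_num=3, hidden_size=16)
print(stack(torch.randn(2, 16)).shape)  # torch.Size([2, 16])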