Example #1
    def __init__(self, args):
        super(AttnEncoder, self).__init__()
        self.layers_num = args.layers_num
        # Multi-headed self-attention, one module per layer.
        self.self_attn = nn.ModuleList([
            MultiHeadedAttention(args.hidden_size, args.heads_num,
                                 args.dropout) for _ in range(self.layers_num)
        ])
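
A minimal sketch, not part of the original examples, of how such a stack of per-layer attention modules might be applied in a forward pass. It uses torch.nn.MultiheadAttention as a stand-in for the repo's MultiHeadedAttention, whose call signature is not shown in these snippets:

import torch
import torch.nn as nn

class TinyAttnEncoder(nn.Module):
    """Illustrative stand-in: a stack of self-attention layers held in an nn.ModuleList."""

    def __init__(self, hidden_size=64, heads_num=4, layers_num=3, dropout=0.1):
        super().__init__()
        self.self_attn = nn.ModuleList([
            nn.MultiheadAttention(hidden_size, heads_num,
                                  dropout=dropout, batch_first=True)
            for _ in range(layers_num)
        ])

    def forward(self, hidden):
        # Each layer attends over the previous layer's output (self-attention).
        for attn in self.self_attn:
            hidden, _ = attn(hidden, hidden, hidden)
        return hidden

x = torch.randn(2, 10, 64)             # [batch, seq_len, hidden]
print(TinyAttnEncoder()(x).shape)      # torch.Size([2, 10, 64])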
Example #2
    def __init__(self, args):
        super(TransformerDecoderLayer, self).__init__()

        # Multi-headed self-attention.
        self.self_attn = MultiHeadedAttention(args.hidden_size, args.heads_num,
                                              args.dropout)
        # Multi-headed context-attention.
        self.context_attn = MultiHeadedAttention(args.hidden_size,
                                                 args.heads_num, args.dropout)
        # Layer Normalization for the three sub-layers.
        self.layer_norm_1 = LayerNorm(args.hidden_size)
        self.layer_norm_2 = LayerNorm(args.hidden_size)
        self.layer_norm_3 = LayerNorm(args.hidden_size)
        # Feed forward layer.
        self.feed_forward = PositionwiseFeedForward(args.hidden_size,
                                                    args.feedforward_size,
                                                    args.hidden_act)
Example #3
    def __init__(self, args):
        super(TransformerDecoderLayer, self).__init__()

        self.layernorm_positioning = args.layernorm_positioning

        if hasattr(args, "attention_head_size"):
            attention_head_size = args.attention_head_size
        else:
            attention_head_size = args.hidden_size // args.heads_num

        has_bias = bool(1 - args.remove_transformer_bias)
        with_scale = bool(1 - args.remove_attention_scale)

        # Multi-headed self-attention.
        self.self_attn = MultiHeadedAttention(args.hidden_size,
                                              args.heads_num,
                                              attention_head_size,
                                              args.dropout,
                                              has_bias=has_bias,
                                              with_scale=with_scale)
        self.dropout_1 = nn.Dropout(args.dropout)

        # Multi-headed context-attention.
        self.context_attn = MultiHeadedAttention(args.hidden_size,
                                                 args.heads_num,
                                                 attention_head_size,
                                                 args.dropout,
                                                 has_bias=has_bias,
                                                 with_scale=with_scale)
        self.dropout_2 = nn.Dropout(args.dropout)

        # Feed forward layer.
        if args.feed_forward == "gated":
            self.feed_forward = GatedFeedForward(args.hidden_size,
                                                 args.feedforward_size,
                                                 args.hidden_act, has_bias)
        else:
            self.feed_forward = PositionwiseFeedForward(
                args.hidden_size, args.feedforward_size, args.hidden_act,
                has_bias)
        self.dropout_3 = nn.Dropout(args.dropout)

        # Layer Normalization
        if args.layernorm == "t5":
            self.layer_norm_1 = T5LayerNorm(args.hidden_size)
            self.layer_norm_2 = T5LayerNorm(args.hidden_size)
            self.layer_norm_3 = T5LayerNorm(args.hidden_size)
        else:
            self.layer_norm_1 = LayerNorm(args.hidden_size)
            self.layer_norm_2 = LayerNorm(args.hidden_size)
            self.layer_norm_3 = LayerNorm(args.hidden_size)
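
T5LayerNorm is referenced but not defined in these snippets. In T5-style models, layer normalization is commonly an RMS norm: activations are rescaled by their root-mean-square with a learned gain, with no mean subtraction and no bias term. A minimal sketch of that variant, under the assumption that T5LayerNorm follows this pattern:

import torch
import torch.nn as nn

class RMSLayerNorm(nn.Module):
    """T5-style layer norm sketch: scale by RMS, no mean subtraction, no bias."""

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x):
        variance = x.pow(2).mean(-1, keepdim=True)   # mean of squared activations
        x = x * torch.rsqrt(variance + self.eps)     # divide by the RMS
        return self.weight * x

print(RMSLayerNorm(8)(torch.randn(2, 5, 8)).shape)   # torch.Size([2, 5, 8])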
Example #4
    def __init__(self, args):
        super(TransformerDecoderLayer, self).__init__()

        self.layernorm_positioning = args.layernorm_positioning

        if hasattr(args, "attention_head_size"):
            attention_head_size = args.attention_head_size
        else:
            attention_head_size = args.hidden_size // args.heads_num

        has_bias = bool(1 - args.remove_transformer_bias)

        # Multi-headed self-attention.
        self.self_attn = MultiHeadedAttention(args.hidden_size,
                                              args.heads_num,
                                              attention_head_size,
                                              args.dropout,
                                              has_bias=has_bias)
        self.dropout_1 = nn.Dropout(args.dropout)
        self.layer_norm_1 = LayerNorm(args.hidden_size, has_bias=has_bias)

        # Multi-headed context-attention.
        self.context_attn = MultiHeadedAttention(args.hidden_size,
                                                 args.heads_num,
                                                 attention_head_size,
                                                 args.dropout,
                                                 has_bias=has_bias)
        self.dropout_2 = nn.Dropout(args.dropout)
        self.layer_norm_2 = LayerNorm(args.hidden_size, has_bias=has_bias)

        # Feed forward layer.
        if args.feed_forward == "gated":
            self.feed_forward = GatedFeedForward(args.hidden_size,
                                                 args.feedforward_size,
                                                 args.hidden_act, has_bias)
        else:
            self.feed_forward = PositionwiseFeedForward(
                args.hidden_size, args.feedforward_size, args.hidden_act,
                has_bias)
        self.dropout_3 = nn.Dropout(args.dropout)
        self.layer_norm_3 = LayerNorm(args.hidden_size, has_bias=has_bias)

        self.relative_pos_emb = None
        if args.relative_position_embedding:
            self.relative_pos_emb = RelativePositionEmbedding(
                bidirectional=False)
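
RelativePositionEmbedding is likewise external to these snippets. In T5-style attention, a relative position module typically yields a per-head bias indexed by (query position, key position) that is added to the raw attention scores before the softmax. A rough illustration of that usage, with the bias tensor and shapes assumed rather than taken from the repo:

import math
import torch

batch, heads, seq_q, seq_k, head_size = 2, 4, 6, 6, 16
q = torch.randn(batch, heads, seq_q, head_size)
k = torch.randn(batch, heads, seq_k, head_size)

# Stand-in for the module's output: one bias value per (head, query pos, key pos).
position_bias = torch.randn(1, heads, seq_q, seq_k)

scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(head_size)
scores = scores + position_bias                 # inject relative position information
probs = torch.softmax(scores, dim=-1)           # [batch, heads, seq_q, seq_k]
print(probs.shape)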
Example #5
    def __init__(self, args, input_size, labels_num):
        super(Classifier, self).__init__()
        self.input_size = input_size
        self.cla_hidden_size = 128
        self.cla_heads_num = 2
        self.labels_num = labels_num
        self.pooling = args.pooling
        self.output_layer_0 = nn.Linear(input_size, self.cla_hidden_size)
        self.self_atten = MultiHeadedAttention(self.cla_hidden_size,
                                               self.cla_heads_num, args.dropout)
        self.output_layer_1 = nn.Linear(self.cla_hidden_size, self.cla_hidden_size)
        self.output_layer_2 = nn.Linear(self.cla_hidden_size, labels_num)
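
A hedged sketch of how a forward pass could combine these layers: project the encoder output, run one round of self-attention, pool over the sequence, then classify. torch.nn.MultiheadAttention stands in for MultiHeadedAttention, and the pooling options are assumptions, not taken from the repo:

import torch
import torch.nn as nn

class TinyClassifier(nn.Module):
    """Stand-in sketch of the attention-pooling classifier above."""

    def __init__(self, input_size=256, labels_num=3, pooling="mean"):
        super().__init__()
        cla_hidden_size, cla_heads_num = 128, 2
        self.pooling = pooling
        self.output_layer_0 = nn.Linear(input_size, cla_hidden_size)
        self.self_atten = nn.MultiheadAttention(cla_hidden_size, cla_heads_num,
                                                batch_first=True)
        self.output_layer_1 = nn.Linear(cla_hidden_size, cla_hidden_size)
        self.output_layer_2 = nn.Linear(cla_hidden_size, labels_num)

    def forward(self, hidden):                   # hidden: [batch, seq, input_size]
        hidden = torch.tanh(self.output_layer_0(hidden))
        hidden, _ = self.self_atten(hidden, hidden, hidden)
        if self.pooling == "mean":
            pooled = hidden.mean(dim=1)
        elif self.pooling == "max":
            pooled = hidden.max(dim=1).values
        else:                                    # "first": use the first token
            pooled = hidden[:, 0, :]
        return self.output_layer_2(torch.tanh(self.output_layer_1(pooled)))

print(TinyClassifier()(torch.randn(2, 7, 256)).shape)   # torch.Size([2, 3])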
Example #6
    def __init__(self, args):
        super(GptBlock, self).__init__()

        # Multi-headed self-attention.
        self.self_attn = MultiHeadedAttention(args.hidden_size, args.heads_num,
                                              args.dropout)
        self.layer_norm_1 = LayerNorm(args.hidden_size)
        # Feed forward layer.
        self.feed_forward = PositionwiseFeedForward(args.hidden_size,
                                                    args.feedforward_size,
                                                    args.hidden_act)
        self.layer_norm_2 = LayerNorm(args.hidden_size)
Example #7
    def __init__(self, args):
        super(TransformerDecoderLayer, self).__init__()

        self.layernorm_positioning = args.layernorm_positioning

        # Multi-headed self-attention.
        self.self_attn = MultiHeadedAttention(args.hidden_size, args.heads_num,
                                              args.dropout)
        self.dropout_1 = nn.Dropout(args.dropout)
        self.layer_norm_1 = LayerNorm(args.hidden_size)

        # Multi-headed context-attention.
        self.context_attn = MultiHeadedAttention(args.hidden_size,
                                                 args.heads_num, args.dropout)
        self.dropout_2 = nn.Dropout(args.dropout)
        self.layer_norm_2 = LayerNorm(args.hidden_size)

        # Feed forward layer.
        self.feed_forward = PositionwiseFeedForward(args.hidden_size,
                                                    args.feedforward_size,
                                                    args.hidden_act)
        self.dropout_3 = nn.Dropout(args.dropout)
        self.layer_norm_3 = LayerNorm(args.hidden_size)
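
Several constructors above store args.layernorm_positioning without showing what it controls. The usual distinction is sketched below for a generic sub-layer: "pre" normalizes the input to the sub-layer, while "post" normalizes after the residual addition. This is an illustration of that convention, not the repo's actual forward code:

import torch
import torch.nn as nn

hidden_size = 32
layer_norm = nn.LayerNorm(hidden_size)
dropout = nn.Dropout(0.1)
sublayer = nn.Linear(hidden_size, hidden_size)   # stand-in for attention / feed-forward

def apply_sublayer(hidden, layernorm_positioning="post"):
    if layernorm_positioning == "pre":
        # Pre-LN: normalize first, then sub-layer, then add the residual.
        return hidden + dropout(sublayer(layer_norm(hidden)))
    # Post-LN: sub-layer, add the residual, then normalize the sum.
    return layer_norm(hidden + dropout(sublayer(hidden)))

x = torch.randn(2, 5, hidden_size)
print(apply_sublayer(x, "pre").shape, apply_sublayer(x, "post").shape)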