def __init__(self, args):
    super(AttnEncoder, self).__init__()
    self.layers_num = args.layers_num
    # One multi-headed self-attention module per layer.
    # (The original code first assigned a single MultiHeadedAttention to
    # self.self_attn and immediately overwrote it with the ModuleList; the
    # redundant assignment is dropped here.)
    self.self_attn = nn.ModuleList([
        MultiHeadedAttention(args.hidden_size, args.heads_num, args.dropout)
        for _ in range(self.layers_num)
    ])
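# The encoder above registers one attention module per layer through
# nn.ModuleList. A minimal, self-contained sketch of the same stacking
# pattern, using torch.nn.MultiheadAttention as a stand-in for the project's
# MultiHeadedAttention (whose exact signature is not shown here):

import torch
import torch.nn as nn

class TinyAttnEncoder(nn.Module):
    def __init__(self, hidden_size=64, heads_num=4, layers_num=2, dropout=0.1):
        super().__init__()
        # One attention module per layer; nn.ModuleList makes every layer's
        # parameters visible to the parent module.
        self.layers = nn.ModuleList([
            nn.MultiheadAttention(hidden_size, heads_num, dropout=dropout, batch_first=True)
            for _ in range(layers_num)
        ])

    def forward(self, x):
        # Apply the layers sequentially; each one self-attends over x.
        for attn in self.layers:
            x, _ = attn(x, x, x)
        return x

# Usage: hidden = TinyAttnEncoder()(torch.randn(2, 10, 64))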
def __init__(self, args):
    super(TransformerDecoderLayer, self).__init__()
    # Multi-headed self-attention.
    self.self_attn = MultiHeadedAttention(args.hidden_size, args.heads_num, args.dropout)
    # Multi-headed context-attention over the encoder output.
    self.context_attn = MultiHeadedAttention(args.hidden_size, args.heads_num, args.dropout)
    # Layer normalization.
    self.layer_norm_1 = LayerNorm(args.hidden_size)
    self.layer_norm_2 = LayerNorm(args.hidden_size)
    self.layer_norm_3 = LayerNorm(args.hidden_size)
    # Feed forward layer.
    self.feed_forward = PositionwiseFeedForward(args.hidden_size, args.feedforward_size, args.hidden_act)
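# The constructor above only builds the sub-modules; the ordering of
# attention, residual connection, and normalization lives in the forward
# pass, which is not shown. Below is a self-contained sketch of a standard
# post-layer-norm decoder layer (an assumption, not the project's exact
# forward), with torch.nn.MultiheadAttention standing in for
# MultiHeadedAttention.

import torch
import torch.nn as nn

class TinyDecoderLayer(nn.Module):
    def __init__(self, hidden_size=64, heads_num=4, feedforward_size=256, dropout=0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(hidden_size, heads_num, dropout=dropout, batch_first=True)
        self.context_attn = nn.MultiheadAttention(hidden_size, heads_num, dropout=dropout, batch_first=True)
        self.layer_norm_1 = nn.LayerNorm(hidden_size)
        self.layer_norm_2 = nn.LayerNorm(hidden_size)
        self.layer_norm_3 = nn.LayerNorm(hidden_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(hidden_size, feedforward_size),
            nn.GELU(),
            nn.Linear(feedforward_size, hidden_size),
        )

    def forward(self, hidden, encoder_hidden):
        # Self-attention over decoder states, then residual + norm.
        attn_out, _ = self.self_attn(hidden, hidden, hidden)
        hidden = self.layer_norm_1(hidden + attn_out)
        # Context (cross) attention: queries from the decoder,
        # keys/values from the encoder output.
        ctx_out, _ = self.context_attn(hidden, encoder_hidden, encoder_hidden)
        hidden = self.layer_norm_2(hidden + ctx_out)
        # Position-wise feed-forward, again with residual + norm.
        return self.layer_norm_3(hidden + self.feed_forward(hidden))

# Usage: out = TinyDecoderLayer()(torch.randn(2, 8, 64), torch.randn(2, 12, 64))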
def __init__(self, args):
    super(TransformerDecoderLayer, self).__init__()

    self.layernorm_positioning = args.layernorm_positioning

    if hasattr(args, "attention_head_size"):
        attention_head_size = args.attention_head_size
    else:
        attention_head_size = args.hidden_size // args.heads_num

    has_bias = bool(1 - args.remove_transformer_bias)
    with_scale = bool(1 - args.remove_attention_scale)

    # Multi-headed self-attention.
    self.self_attn = MultiHeadedAttention(
        args.hidden_size, args.heads_num, attention_head_size, args.dropout,
        has_bias=has_bias, with_scale=with_scale
    )
    self.dropout_1 = nn.Dropout(args.dropout)

    # Multi-headed context-attention.
    self.context_attn = MultiHeadedAttention(
        args.hidden_size, args.heads_num, attention_head_size, args.dropout,
        has_bias=has_bias, with_scale=with_scale
    )
    self.dropout_2 = nn.Dropout(args.dropout)

    # Feed forward layer.
    if args.feed_forward == "gated":
        self.feed_forward = GatedFeedForward(
            args.hidden_size, args.feedforward_size, args.hidden_act, has_bias
        )
    else:
        self.feed_forward = PositionwiseFeedForward(
            args.hidden_size, args.feedforward_size, args.hidden_act, has_bias
        )
    self.dropout_3 = nn.Dropout(args.dropout)

    # Layer normalization.
    if args.layernorm == "t5":
        self.layer_norm_1 = T5LayerNorm(args.hidden_size)
        self.layer_norm_2 = T5LayerNorm(args.hidden_size)
        self.layer_norm_3 = T5LayerNorm(args.hidden_size)
    else:
        self.layer_norm_1 = LayerNorm(args.hidden_size)
        self.layer_norm_2 = LayerNorm(args.hidden_size)
        self.layer_norm_3 = LayerNorm(args.hidden_size)
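# T5LayerNorm is assumed here to be the RMS-style normalization used by T5:
# no mean subtraction and no bias, only a learned gain over the
# root-mean-square of the features. A minimal sketch under that assumption:

import torch
import torch.nn as nn

class T5StyleLayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x):
        # Scale each feature vector by its RMS, then apply the learned gain.
        variance = x.pow(2).mean(-1, keepdim=True)
        return self.weight * x * torch.rsqrt(variance + self.eps)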
def __init__(self, args):
    super(TransformerDecoderLayer, self).__init__()

    self.layernorm_positioning = args.layernorm_positioning

    if hasattr(args, "attention_head_size"):
        attention_head_size = args.attention_head_size
    else:
        attention_head_size = args.hidden_size // args.heads_num

    has_bias = bool(1 - args.remove_transformer_bias)

    # Multi-headed self-attention.
    self.self_attn = MultiHeadedAttention(
        args.hidden_size, args.heads_num, attention_head_size, args.dropout,
        has_bias=has_bias
    )
    self.dropout_1 = nn.Dropout(args.dropout)
    self.layer_norm_1 = LayerNorm(args.hidden_size, has_bias=has_bias)

    # Multi-headed context-attention.
    self.context_attn = MultiHeadedAttention(
        args.hidden_size, args.heads_num, attention_head_size, args.dropout,
        has_bias=has_bias
    )
    self.dropout_2 = nn.Dropout(args.dropout)
    self.layer_norm_2 = LayerNorm(args.hidden_size, has_bias=has_bias)

    # Feed forward layer.
    if args.feed_forward == "gated":
        self.feed_forward = GatedFeedForward(
            args.hidden_size, args.feedforward_size, args.hidden_act, has_bias
        )
    else:
        self.feed_forward = PositionwiseFeedForward(
            args.hidden_size, args.feedforward_size, args.hidden_act, has_bias
        )
    self.dropout_3 = nn.Dropout(args.dropout)
    self.layer_norm_3 = LayerNorm(args.hidden_size, has_bias=has_bias)

    # Optional relative position embedding (unidirectional for the decoder).
    self.relative_pos_emb = None
    if args.relative_position_embedding:
        self.relative_pos_emb = RelativePositionEmbedding(bidirectional=False)
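# RelativePositionEmbedding is constructed with bidirectional=False for the
# causal decoder; its interface is not shown above. The sketch below is a
# generic, hypothetical stand-in: a learned bias indexed by clipped relative
# distance and added to the attention logits before softmax.

import torch
import torch.nn as nn

class SimpleRelativePositionBias(nn.Module):
    def __init__(self, heads_num, max_distance=128, bidirectional=False):
        super().__init__()
        self.max_distance = max_distance
        self.bidirectional = bidirectional
        num_buckets = 2 * max_distance + 1 if bidirectional else max_distance + 1
        self.bias = nn.Embedding(num_buckets, heads_num)

    def forward(self, query_len, key_len):
        # Relative offset of each key with respect to each query.
        q_pos = torch.arange(query_len)[:, None]
        k_pos = torch.arange(key_len)[None, :]
        rel = k_pos - q_pos
        if self.bidirectional:
            rel = rel.clamp(-self.max_distance, self.max_distance) + self.max_distance
        else:
            # Causal decoder: keys never lie in the future, so use the
            # non-negative distance query_pos - key_pos.
            rel = (-rel).clamp(0, self.max_distance)
        bias = self.bias(rel)                      # (query_len, key_len, heads_num)
        return bias.permute(2, 0, 1).unsqueeze(0)  # (1, heads_num, query_len, key_len)

# Usage: scores = scores + SimpleRelativePositionBias(heads_num=8)(seq_len, seq_len)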
def __init__(self, args, input_size, labels_num):
    super(Classifier, self).__init__()
    self.input_size = input_size
    self.cla_hidden_size = 128
    self.cla_heads_num = 2
    self.labels_num = labels_num
    self.pooling = args.pooling
    # Project encoder output into the classification space, apply one more
    # round of self-attention, then map the pooled representation to labels.
    self.output_layer_0 = nn.Linear(input_size, self.cla_hidden_size)
    self.self_atten = MultiHeadedAttention(self.cla_hidden_size, self.cla_heads_num, args.dropout)
    self.output_layer_1 = nn.Linear(self.cla_hidden_size, self.cla_hidden_size)
    self.output_layer_2 = nn.Linear(self.cla_hidden_size, labels_num)
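# Only the layers are defined above; the forward pass (attention over the
# projected sequence followed by pooling) is not shown. A self-contained
# sketch of such a head, with torch.nn.MultiheadAttention standing in for
# MultiHeadedAttention and the pooling options ("mean", "max", "last",
# "first") assumed rather than taken from the project:

import torch
import torch.nn as nn

class TinyAttnClassifier(nn.Module):
    def __init__(self, input_size, labels_num, pooling="mean", dropout=0.1):
        super().__init__()
        self.pooling = pooling
        self.proj = nn.Linear(input_size, 128)
        self.self_attn = nn.MultiheadAttention(128, 2, dropout=dropout, batch_first=True)
        self.hidden = nn.Linear(128, 128)
        self.out = nn.Linear(128, labels_num)

    def forward(self, x):
        # Project encoder output into the classification space, then let the
        # tokens attend to each other once more.
        h = torch.tanh(self.proj(x))
        h, _ = self.self_attn(h, h, h)
        # Collapse the sequence dimension according to the pooling strategy.
        if self.pooling == "mean":
            h = h.mean(dim=1)
        elif self.pooling == "max":
            h = h.max(dim=1).values
        elif self.pooling == "last":
            h = h[:, -1, :]
        else:  # "first", e.g. a [CLS]-style token
            h = h[:, 0, :]
        return self.out(torch.tanh(self.hidden(h)))

# Usage: logits = TinyAttnClassifier(768, 3)(torch.randn(2, 16, 768))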
def __init__(self, args):
    super(GptBlock, self).__init__()

    # Multi-headed self-attention.
    self.self_attn = MultiHeadedAttention(args.hidden_size, args.heads_num, args.dropout)
    self.layer_norm_1 = LayerNorm(args.hidden_size)

    # Feed forward layer.
    self.feed_forward = PositionwiseFeedForward(args.hidden_size, args.feedforward_size, args.hidden_act)
    self.layer_norm_2 = LayerNorm(args.hidden_size)
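# A GPT-style block uses causal self-attention only: each position may attend
# to itself and to earlier positions. How the mask is built is not shown
# above; the snippet below is a standalone illustration with
# torch.nn.MultiheadAttention, where a boolean attn_mask entry of True marks
# a disallowed (future) position.

import torch
import torch.nn as nn

def causal_mask(seq_len):
    # True above the diagonal: position i may not attend to positions j > i.
    return torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)

attn = nn.MultiheadAttention(embed_dim=64, num_heads=4, batch_first=True)
x = torch.randn(2, 10, 64)
out, _ = attn(x, x, x, attn_mask=causal_mask(10))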
def __init__(self, args):
    super(TransformerDecoderLayer, self).__init__()

    self.layernorm_positioning = args.layernorm_positioning

    # Multi-headed self-attention.
    self.self_attn = MultiHeadedAttention(args.hidden_size, args.heads_num, args.dropout)
    self.dropout_1 = nn.Dropout(args.dropout)
    self.layer_norm_1 = LayerNorm(args.hidden_size)

    # Multi-headed context-attention.
    self.context_attn = MultiHeadedAttention(args.hidden_size, args.heads_num, args.dropout)
    self.dropout_2 = nn.Dropout(args.dropout)
    self.layer_norm_2 = LayerNorm(args.hidden_size)

    # Feed forward layer.
    self.feed_forward = PositionwiseFeedForward(args.hidden_size, args.feedforward_size, args.hidden_act)
    self.dropout_3 = nn.Dropout(args.dropout)
    self.layer_norm_3 = LayerNorm(args.hidden_size)
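# layernorm_positioning typically selects between the two residual orderings
# below ("post" as in the original Transformer, "pre" as in GPT-2/T5-style
# stacks); the exact forward pass is not shown above, so this is a sketch
# under that assumption.

import torch
import torch.nn as nn

def post_ln_block(x, sublayer, norm, dropout):
    # "post": apply the sublayer, add the residual, then normalize the sum.
    return norm(x + dropout(sublayer(x)))

def pre_ln_block(x, sublayer, norm, dropout):
    # "pre": normalize the input, apply the sublayer, then add the residual.
    return x + dropout(sublayer(norm(x)))

# Example with a feed-forward sublayer:
ff = nn.Sequential(nn.Linear(64, 256), nn.GELU(), nn.Linear(256, 64))
norm = nn.LayerNorm(64)
drop = nn.Dropout(0.1)
x = torch.randn(2, 10, 64)
y_post = post_ln_block(x, ff, norm, drop)
y_pre = pre_ln_block(x, ff, norm, drop)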