import torch.nn as nn

# LayerNorm, MultiHeadedAttention, PositionwiseFeedForward, GatedFeedForward, and
# RelationAwareMultiHeadedAttention are assumed to be imported from the repository's
# own layer modules.


class TransformerLayer(nn.Module):
    def __init__(self, args):
        super(TransformerLayer, self).__init__()

        self.layernorm_positioning = args.layernorm_positioning

        if hasattr(args, "attention_head_size"):
            attention_head_size = args.attention_head_size
        else:
            attention_head_size = args.hidden_size // args.heads_num

        has_bias = bool(1 - args.remove_transformer_bias)

        # Multi-headed self-attention.
        self.self_attn = MultiHeadedAttention(
            args.hidden_size, args.heads_num, attention_head_size, args.dropout, has_bias=has_bias
        )
        self.dropout_1 = nn.Dropout(args.dropout)
        self.layer_norm_1 = LayerNorm(args.hidden_size, has_bias=has_bias)

        # Feed forward layer.
        if args.feed_forward == "gated":
            self.feed_forward = GatedFeedForward(args.hidden_size, args.feedforward_size, args.hidden_act, has_bias)
        else:
            self.feed_forward = PositionwiseFeedForward(args.hidden_size, args.feedforward_size, args.hidden_act, has_bias)
        self.dropout_2 = nn.Dropout(args.dropout)
        self.layer_norm_2 = LayerNorm(args.hidden_size, has_bias=has_bias)
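# Sketch only: a minimal args namespace covering every attribute TransformerLayer's
# constructor reads. The hyperparameter values below are illustrative assumptions
# (BERT-base-like), not defaults taken from any config shipped with this code.
from types import SimpleNamespace


def _build_example_transformer_layer():
    example_args = SimpleNamespace(
        hidden_size=768,
        heads_num=12,
        dropout=0.1,
        feedforward_size=3072,
        hidden_act="gelu",
        feed_forward="dense",           # any value other than "gated" selects PositionwiseFeedForward
        layernorm_positioning="post",
        remove_transformer_bias=False,  # has_bias = bool(1 - False) -> True
        # attention_head_size is optional; without it, hidden_size // heads_num = 64 is used.
    )
    return TransformerLayer(example_args)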
class GptBlock(nn.Module):
    def __init__(self, args):
        super(GptBlock, self).__init__()

        # Multi-headed self-attention.
        self.self_attn = MultiHeadedAttention(args.hidden_size, args.heads_num, args.dropout)
        self.layer_norm_1 = LayerNorm(args.hidden_size)

        # Feed forward layer.
        self.feed_forward = PositionwiseFeedForward(args.hidden_size, args.feedforward_size, args.hidden_act)
        self.layer_norm_2 = LayerNorm(args.hidden_size)
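# Sketch, not the repository's forward(): the GPT-2-style pre-LayerNorm wiring these
# submodules are conventionally combined into (normalize, attend, add residual;
# normalize, feed forward, add residual). The self_attn call signature used below
# is an assumption, not taken from this code.
def _gpt_block_forward_sketch(block, hidden, mask):
    inter = block.layer_norm_1(hidden)                  # pre-norm before attention
    inter = block.self_attn(inter, inter, inter, mask)  # assumed (query, key, value, mask)-style call
    hidden = hidden + inter                             # first residual connection
    output = block.layer_norm_2(hidden)                 # pre-norm before the feed-forward block
    return hidden + block.feed_forward(output)          # second residual connection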
class TransformerDecoderLayer(nn.Module):
    def __init__(self, args):
        super(TransformerDecoderLayer, self).__init__()

        # Multi-headed self-attention and encoder-decoder (context) attention.
        self.self_attn = MultiHeadedAttention(args.hidden_size, args.heads_num, args.dropout)
        self.context_attn = MultiHeadedAttention(args.hidden_size, args.heads_num, args.dropout)
        self.layer_norm_1 = LayerNorm(args.hidden_size)
        self.layer_norm_2 = LayerNorm(args.hidden_size)
        self.layer_norm_3 = LayerNorm(args.hidden_size)

        # Feed forward layer.
        self.feed_forward = PositionwiseFeedForward(args.hidden_size, args.feedforward_size, args.hidden_act)
class ISynthesizer(nn.Module):
    def __init__(self, args):
        super(ISynthesizer, self).__init__()

        # Attention module placeholder; expected to be defined by concrete subclasses.
        self.att = None
        self.dropout_1 = nn.Dropout(args.dropout)
        self.layer_norm_1 = LayerNorm(args.hidden_size)

        # Feed forward layer.
        self.feed_forward = PositionwiseFeedForward(args.hidden_size, args.feedforward_size)
        self.dropout_2 = nn.Dropout(args.dropout)
        self.layer_norm_2 = LayerNorm(args.hidden_size)

        # ISynthesizer is an abstract base and must not be constructed directly.
        if self.__class__.__name__ == "ISynthesizer":
            raise Exception("ISynthesizer cannot be instantiated.")
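# Usage note (sketch): the class-name check above means ISynthesizer itself always
# raises, while any subclass passes the check and is expected to supply a concrete
# self.att. DenseSynthesizerSketch is a hypothetical illustration, not a class from
# this code, and the nn.Linear stand-in for the attention module is an assumption.
class DenseSynthesizerSketch(ISynthesizer):
    def __init__(self, args):
        super(DenseSynthesizerSketch, self).__init__(args)
        # Hypothetical replacement for the undefined self.att placeholder.
        self.att = nn.Linear(args.hidden_size, args.hidden_size)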
class RelationAwareTransformerLayer(nn.Module):
    def __init__(self, args):
        super(RelationAwareTransformerLayer, self).__init__()

        # Multi-headed self-attention.
        self.self_attn = RelationAwareMultiHeadedAttention(args.hidden_size, args.heads_num, args.dropout)
        self.dropout_1 = nn.Dropout(args.dropout)
        self.layer_norm_1 = LayerNorm(args.hidden_size)

        # Feed forward layer.
        self.feed_forward = PositionwiseFeedForward(args.hidden_size, args.feedforward_size)
        self.dropout_2 = nn.Dropout(args.dropout)
        self.layer_norm_2 = LayerNorm(args.hidden_size)
class TransformerDecoderLayer(nn.Module):
    def __init__(self, args):
        super(TransformerDecoderLayer, self).__init__()

        self.layernorm_positioning = args.layernorm_positioning

        # Multi-headed self-attention.
        self.self_attn = MultiHeadedAttention(args.hidden_size, args.heads_num, args.dropout)
        self.dropout_1 = nn.Dropout(args.dropout)
        self.layer_norm_1 = LayerNorm(args.hidden_size)

        # Multi-headed context-attention.
        self.context_attn = MultiHeadedAttention(args.hidden_size, args.heads_num, args.dropout)
        self.dropout_2 = nn.Dropout(args.dropout)
        self.layer_norm_2 = LayerNorm(args.hidden_size)

        # Feed forward layer.
        self.feed_forward = PositionwiseFeedForward(args.hidden_size, args.feedforward_size, args.hidden_act)
        self.dropout_3 = nn.Dropout(args.dropout)
        self.layer_norm_3 = LayerNorm(args.hidden_size)
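# Sketch, not the repository's forward(): the conventional post-LayerNorm ordering of
# the three sublayers built above -- masked self-attention, encoder-decoder (context)
# attention, then the feed-forward network. The attention call signatures are
# assumptions; the stored layernorm_positioning flag would switch this to a pre-norm
# variant, which is omitted here.
def _decoder_layer_forward_sketch(layer, hidden, encoder_hidden, mask_decoder, mask_encoder):
    # 1) Masked self-attention over decoder states, then residual connection + LayerNorm.
    query = layer.layer_norm_1(layer.dropout_1(layer.self_attn(hidden, hidden, hidden, mask_decoder)) + hidden)
    # 2) Context attention: decoder queries attend to encoder outputs, residual + LayerNorm.
    mid = layer.layer_norm_2(layer.dropout_2(layer.context_attn(encoder_hidden, encoder_hidden, query, mask_encoder)) + query)
    # 3) Position-wise feed-forward network, residual + LayerNorm.
    return layer.layer_norm_3(layer.dropout_3(layer.feed_forward(mid)) + mid)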