  def __init__(self, params, train):
    super(DecoderStack, self).__init__()
    self.layers = []
    for _ in range(params["num_hidden_layers"]):
      self_attention_layer = attention_layer.SelfAttention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"], train)
      feed_forward_network = ffn_layer.FeedFowardNetwork(  # NOTYPO
          params["hidden_size"], params["filter_size"],
          params["relu_dropout"], train, params["allow_ffn_pad"])
      # TODO: added an extra MLP (dense) layer that projects the concatenated
      # [encoder activations, hidden state] back down to the hidden size.
      proj_layer = tf.layers.Dense(
          params["hidden_size"], use_bias=True, name="proj_layer")

      self.layers.append([
          transformer.PrePostProcessingWrapper(self_attention_layer, params, train),
          transformer.PrePostProcessingWrapper(feed_forward_network, params, train),
          proj_layer])

    self.output_normalization = transformer.LayerNormalization(
        params["hidden_size"])
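  # A minimal sketch of how the extra proj_layer could be applied in the
  # decoder's call(): concatenate the encoder activations with the decoder
  # hidden state along the feature axis, then project the doubled feature
  # dimension back to hidden_size before the feed-forward sublayer. The
  # argument names (decoder_inputs, encoder_outputs,
  # decoder_self_attention_bias), the sublayer order, and the assumption that
  # encoder and decoder sequences have equal length (so the concat is
  # well-formed) are illustrative assumptions, not the confirmed
  # implementation.
  def call(self, decoder_inputs, encoder_outputs, decoder_self_attention_bias):
    for n, layer in enumerate(self.layers):
      self_attention_layer, feed_forward_network, proj_layer = layer
      with tf.variable_scope("layer_%d" % n):
        with tf.variable_scope("self_attention"):
          decoder_inputs = self_attention_layer(
              decoder_inputs, decoder_self_attention_bias)
        with tf.variable_scope("merge_encoder"):
          # Concatenate encoder activations with the current hidden state,
          # then project from 2 * hidden_size back to hidden_size.
          merged = tf.concat([decoder_inputs, encoder_outputs], axis=-1)
          decoder_inputs = proj_layer(merged)
        with tf.variable_scope("ffn"):
          decoder_inputs = feed_forward_network(decoder_inputs)
    return self.output_normalization(decoder_inputs)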
  def __init__(self, params, train):
    super(EncoderStack, self).__init__()
    self.layers = []
    for _ in range(params["num_hidden_layers"]):
      # Create sublayers for each layer.
      self_attention_layer = attention_layer.SelfAttention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"], train)
      feed_forward_network = ffn_layer.FeedFowardNetwork(
          params["hidden_size"], params["filter_size"],
          params["relu_dropout"], train, params["allow_ffn_pad"])

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, params, train),
          PrePostProcessingWrapper(feed_forward_network, params, train)])

    # Create final layer normalization layer.
    self.output_normalization = LayerNormalization(params["hidden_size"])
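  # For reference, a sketch of how this stack is typically driven, modeled on
  # the tensorflow/models reference Transformer: each layer applies the
  # pre/post-processed self-attention and feed-forward sublayers in turn, and
  # the final output is layer-normalized. The exact signature (attention_bias,
  # inputs_padding) follows that reference code and may differ in this variant.
  def call(self, encoder_inputs, attention_bias, inputs_padding):
    for n, layer in enumerate(self.layers):
      self_attention_layer, feed_forward_network = layer
      with tf.variable_scope("layer_%d" % n):
        with tf.variable_scope("self_attention"):
          encoder_inputs = self_attention_layer(encoder_inputs, attention_bias)
        with tf.variable_scope("ffn"):
          encoder_inputs = feed_forward_network(encoder_inputs, inputs_padding)
    return self.output_normalization(encoder_inputs)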