Example #1
    def __init__(self, params, train, **kwargs):
        super(DecoderStack, self).__init__(**kwargs)
        self.param = params
        with self.name_scope():
            self.layer = nn.Sequential()
            with self.layer.name_scope():
                for i in range(params.num_hidden_layers):
                    self_attention_layer = attention_layer.SelfAttention(
                        params.hidden_size, params.num_heads,
                        params.attention_dropout, train)
                    enc_dec_attention_layer = attention_layer.Attention(
                        params.hidden_size, params.num_heads,
                        params.attention_dropout, train)
                    feed_forward_network = fnn_layer.FeedForwardNetwork(
                        params.hidden_size, params.filter_size,
                        params.relu_dropout, train)

                    self.layer.add(
                        PrePostProcessingWrapper(self_attention_layer, params,
                                                 train),
                        PrePostProcessingWrapper(enc_dec_attention_layer,
                                                 params, train),
                        PrePostProcessingWrapper(feed_forward_network, params,
                                                 train))
            self.output_normalization = nn.LayerNorm(axis=-1, epsilon=1e-6)
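Example #1 is an MXNet Gluon port, but the PrePostProcessingWrapper it relies on is not shown. The sketch below illustrates what such a wrapper usually does in the Transformer (layer norm before the sublayer, dropout and a residual add after); it is an assumption for illustration, not the repo's actual code, and params.layer_postprocess_dropout is an assumed field.

# A minimal sketch, assuming MXNet Gluon and the usual Transformer
# pre-norm -> sublayer -> dropout -> residual post-processing.
from mxnet.gluon import nn


class PrePostProcessingWrapper(nn.HybridBlock):
    def __init__(self, layer, params, train, **kwargs):
        super(PrePostProcessingWrapper, self).__init__(**kwargs)
        self.train = train
        with self.name_scope():
            self.layer = layer
            self.layer_norm = nn.LayerNorm(axis=-1, epsilon=1e-6)
            # layer_postprocess_dropout is assumed to exist on params.
            self.dropout = nn.Dropout(params.layer_postprocess_dropout)

    def hybrid_forward(self, F, x, *args, **kwargs):
        y = self.layer_norm(x)              # pre-processing: layer norm
        y = self.layer(y, *args, **kwargs)  # wrapped sublayer
        if self.train:
            y = self.dropout(y)             # post-processing: dropout
        return x + y                        # residual connection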
Example #2
    def __init__(self, params, train):
        super(DecoderStack, self).__init__()
        self.layers = []
        self.batch_size = params.batch_size
        self.beam_size = params.beam_size
        mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
                                     value=params.num_hidden_layers)
        for _ in range(params.num_hidden_layers):
            self_attention_layer = attention_layer.SelfAttention(
                params.hidden_size, params.num_heads, params.attention_dropout,
                train)
            enc_dec_attention_layer = attention_layer.Attention(
                params.hidden_size, params.num_heads, params.attention_dropout,
                train)
            feed_forward_network = ffn_layer.FeedFowardNetwork(
                params.hidden_size, params.filter_size, params.relu_dropout,
                train)

            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params, train),
                PrePostProcessingWrapper(enc_dec_attention_layer, params,
                                         train),
                PrePostProcessingWrapper(feed_forward_network, params, train)
            ])

        self.output_normalization = LayerNormalization(params.hidden_size)
        self.encdec_cache = {}
        self.enc_out_cache = {}
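The __init__ above only builds and stores the layer triples. For context, here is a simplified sketch of the companion call method as it looks in the TF 1.x reference Transformer this example follows; the decode-time cache handling is omitted, so treat it as an outline rather than the exact code.

# Simplified sketch of DecoderStack.call (TF 1.x style); assumes
# `import tensorflow as tf` and omits the incremental-decoding cache.
def call(self, decoder_inputs, encoder_outputs,
         decoder_self_attention_bias, attention_bias):
    for n, layer in enumerate(self.layers):
        self_attention_layer = layer[0]
        enc_dec_attention_layer = layer[1]
        feed_forward_network = layer[2]

        with tf.variable_scope("layer_%d" % n):
            with tf.variable_scope("self_attention"):
                # Masked self-attention over the decoder inputs.
                decoder_inputs = self_attention_layer(
                    decoder_inputs, decoder_self_attention_bias)
            with tf.variable_scope("encdec_attention"):
                # Attention over the encoder outputs.
                decoder_inputs = enc_dec_attention_layer(
                    decoder_inputs, encoder_outputs, attention_bias)
            with tf.variable_scope("ffn"):
                decoder_inputs = feed_forward_network(decoder_inputs)

    return self.output_normalization(decoder_inputs)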
Example #3
 def __init__(self, params, is_train, mode):
     super(DecoderStack, self).__init__()
     self.mode = mode
     self.predict_one = ModeKeys.is_predict_one(self.mode)
     self.layers = []
     for _ in range(params.num_hidden_layers):
         self_attention_layer = attention_layer.SelfAttention(
             params.hidden_size, params.num_heads, params.attention_dropout,
             is_train, self.predict_one)
         if self.mode == ModeKeys.PREDICT_ONE_DECODER:
             enc_dec_attention_layer = attention_layer.EncDecPredictOneAttention(
                 params.hidden_size, params.num_heads,
                 params.attention_dropout, is_train, self.predict_one)
         else:
             enc_dec_attention_layer = attention_layer.Attention(
                 params.hidden_size, params.num_heads,
                 params.attention_dropout, is_train, self.predict_one)
         feed_forward_network = ffn_layer.FeedFowardNetwork(
             params.hidden_size, params.filter_size, params.relu_dropout,
             is_train, self.predict_one)
         # The decoder has three sub-modules: self-attention, enc-dec
         # attention, and the feed-forward network. Each is wrapped with
         # layer normalization and dropout.
         self.layers.append([
             PrePostProcessingWrapper(self_attention_layer, params,
                                      is_train),
             PrePostProcessingWrapper(enc_dec_attention_layer, params,
                                      is_train),
             PrePostProcessingWrapper(feed_forward_network, params,
                                      is_train)
         ])
     self.output_normalization = LayerNormalization(params.hidden_size)
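Example #3 switches the enc-dec attention implementation based on a ModeKeys helper that is not part of the snippet. A hypothetical sketch of such a helper is shown below; the constant names and values are assumptions for illustration only.

# Hypothetical sketch of the ModeKeys helper used above; the real constants
# and values in the repo may differ.
class ModeKeys(object):
    TRAIN = "train"
    EVAL = "eval"
    PREDICT = "predict"
    PREDICT_ONE_ENCODER = "predict_one_encoder"
    PREDICT_ONE_DECODER = "predict_one_decoder"

    @staticmethod
    def is_predict_one(mode):
        # True only for the single-sequence (batch size 1) inference modes.
        return mode in (ModeKeys.PREDICT_ONE_ENCODER,
                        ModeKeys.PREDICT_ONE_DECODER)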
Example #4
  def __init__(self, params, train):
    super(EncoderStack, self).__init__()
    self.layers = []
    for _ in range(params.num_hidden_layers):
      # Create sublayers for each layer.
      self_attention_layer = attention_layer.SelfAttention(
          params.hidden_size, params.num_heads, params.attention_dropout, train)
      feed_forward_network = ffn_layer.FeedFowardNetwork(
          params.hidden_size, params.filter_size, params.relu_dropout, train)

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, params, train),
          PrePostProcessingWrapper(feed_forward_network, params, train)])

    # Create final layer normalization layer.
    self.output_normalization = LayerNormalization(params.hidden_size)
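Most examples finish with a custom LayerNormalization(params.hidden_size) rather than a Keras layer. The sketch below shows roughly how that class is implemented in the TF 1.x reference Transformer (learned scale and bias over the last dimension); the actual class in each repo may differ slightly.

# Sketch of the custom LayerNormalization used above, roughly following the
# TF 1.x reference Transformer.
import tensorflow as tf


class LayerNormalization(tf.layers.Layer):
    def __init__(self, hidden_size):
        super(LayerNormalization, self).__init__()
        self.hidden_size = hidden_size

    def build(self, _):
        self.scale = tf.get_variable(
            "layer_norm_scale", [self.hidden_size],
            initializer=tf.ones_initializer())
        self.bias = tf.get_variable(
            "layer_norm_bias", [self.hidden_size],
            initializer=tf.zeros_initializer())
        self.built = True

    def call(self, x, epsilon=1e-6):
        mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
        variance = tf.reduce_mean(tf.square(x - mean), axis=[-1],
                                  keepdims=True)
        norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
        return norm_x * self.scale + self.bias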
Example #5
  def __init__(self, params, train):
    super(EncoderStack, self).__init__()
    self.layers = []
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
        value=params.num_hidden_layers)
    for _ in range(params.num_hidden_layers):
      # Create sublayers for each layer.
      self_attention_layer = attention_layer.SelfAttention(
          params.hidden_size, params.num_heads, params.attention_dropout, train)
      feed_forward_network = ffn_layer.FeedFowardNetwork(
          params.hidden_size, params.filter_size, params.relu_dropout, train)

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, params, train),
          PrePostProcessingWrapper(feed_forward_network, params, train)])

    # Create final layer normalization layer.
    #self.output_normalization = LayerNormalization(params.hidden_size)
    self.output_normalization = tf.keras.layers.LayerNormalization(
        epsilon=1e-6, dtype=policy)
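Example #5 passes dtype=policy, where policy is defined elsewhere in the module. One plausible way such a policy is created (assuming TF 2.x mixed precision; this is not taken from the snippet) is:

# Minimal sketch, assuming TF 2.x; `policy` is defined outside the snippet,
# and this is only one plausible way it could be set up.
import tensorflow as tf

policy = tf.keras.mixed_precision.Policy("mixed_float16")

# With a mixed-precision policy, Keras keeps the layer-norm variables in
# float32 while computing in float16.
output_normalization = tf.keras.layers.LayerNormalization(
    epsilon=1e-6, dtype=policy)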
Example #6
    def __init__(self, params, train):
        super(DecoderStack, self).__init__()
        self.layers = []
        for _ in range(params["num_hidden_layers"]):
            self_attention_layer = attention_layer.SelfAttention(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train)
            enc_dec_attention_layer = attention_layer.Attention(
                params["hidden_size"], params["num_heads"],
                params["attention_dropout"], train)
            feed_forward_network = ffn_layer.FeedFowardNetwork(
                params["hidden_size"], params["filter_size"],
                params["relu_dropout"], train, params["allow_ffn_pad"])

            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params, train),
                PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
                PrePostProcessingWrapper(feed_forward_network, params, train)])

        self.output_normalization = LayerNormalization(params["hidden_size"])
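Unlike the attribute-style params objects in the other examples, Example #6 indexes params as a dictionary. A minimal illustrative dictionary covering the keys read above might look like the following; the values approximate Transformer-base defaults and are not taken from the snippet.

# Illustrative params dict for Example #6 (values are assumptions).
params = {
    "num_hidden_layers": 6,
    "hidden_size": 512,
    "num_heads": 8,
    "attention_dropout": 0.1,
    "filter_size": 2048,
    "relu_dropout": 0.1,
    "allow_ffn_pad": True,
    # PrePostProcessingWrapper typically also reads a post-processing dropout,
    # e.g. "layer_postprocess_dropout": 0.1.
}

decoder_stack = DecoderStack(params, train=True)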
Example #7
  def __init__(self, params, train):
    super(DecoderStack, self).__init__()
    self.layers = []
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
        value=params.num_hidden_layers)
    for _ in range(params.num_hidden_layers):
      # SSY 3.1  transformer/model/attention_layer.py Dense and matmul
      self_attention_layer = attention_layer.SelfAttention(
          params.hidden_size, params.num_heads, params.attention_dropout, train)
      # SSY 3.2  transformer/model/attention_layer.py Dense and matmul
      enc_dec_attention_layer = attention_layer.Attention(
          params.hidden_size, params.num_heads, params.attention_dropout, train)
      # SSY 3.3 transformer/model/ffn_layer.py only Dense
      feed_forward_network = ffn_layer.FeedFowardNetwork(
          params.hidden_size, params.filter_size, params.relu_dropout, train)

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, params, train),
          PrePostProcessingWrapper(enc_dec_attention_layer, params, train),
          PrePostProcessingWrapper(feed_forward_network, params, train)])

    self.output_normalization = LayerNormalization(params.hidden_size)
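Examples #2, #5, #7, and #8 log the layer count through mlperf_log.transformer_print. The hypothetical sketch below shows the general shape of such a helper; the real MLPerf compliance module also records caller location and supports deferred logging, so this is illustration only.

# Hypothetical sketch of the mlperf_log helper; not the actual module.
import json
import time

MODEL_HP_NUM_HIDDEN_LAYERS = "model_hp_num_hidden_layers"


def transformer_print(key, value=None):
    # Emit one compliance-style log line for the transformer benchmark.
    print(":::MLP transformer %.3f (%s) %s"
          % (time.time(), key, json.dumps(value)))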
Example #8
  def __init__(self, params, train):
    super(EncoderStack, self).__init__()
    self.layers = []
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_NUM_HIDDEN_LAYERS,
        value=params.num_hidden_layers)
    # SSY num_hidden_layers is 6 transformer/model/model_params.py
    for _ in range(params.num_hidden_layers):
      # Create sublayers for each layer.
      # only SelfAttention and ffn
      # SSY 2.1  transformer/model/attention_layer.py Dense and matmul
      self_attention_layer = attention_layer.SelfAttention(
          params.hidden_size, params.num_heads, params.attention_dropout, train)
      # SSY 2.2 transformer/model/ffn_layer.py only Dense
      feed_forward_network = ffn_layer.FeedFowardNetwork(
          params.hidden_size, params.filter_size, params.relu_dropout, train)

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, params, train),
          PrePostProcessingWrapper(feed_forward_network, params, train)])

    # Create final layer normalization layer.
    self.output_normalization = LayerNormalization(params.hidden_size)
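Every TensorFlow example above wraps its sublayers in PrePostProcessingWrapper, which none of the snippets show. The sketch below roughly follows the TF 1.x reference Transformer (layer norm before the sublayer, dropout during training and a residual connection after); params.layer_postprocess_dropout and the LayerNormalization class are assumed to come from the same repo.

# Sketch of PrePostProcessingWrapper, roughly following the TF 1.x reference
# Transformer; assumes `import tensorflow as tf` and the LayerNormalization
# class sketched earlier.
class PrePostProcessingWrapper(object):
    def __init__(self, layer, params, train):
        self.layer = layer
        self.postprocess_dropout = params.layer_postprocess_dropout
        self.train = train
        # Normalization applied to the sublayer input (pre-norm).
        self.layer_norm = LayerNormalization(params.hidden_size)

    def __call__(self, x, *args, **kwargs):
        y = self.layer_norm(x)
        y = self.layer(y, *args, **kwargs)
        if self.train:
            # TF 1.x dropout takes keep_prob; in TF 2.x use rate= instead.
            y = tf.nn.dropout(y, 1.0 - self.postprocess_dropout)
        return x + y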