Example No. 1
  def testBuildFutureMaskWithMaxLen(self):
    num_heads = 4
    length = [2, 4, 3]
    maximum_length = 5
    expected = [
        [[1.0, 0.0, 0.0, 0.0, 0.0],
         [1.0, 1.0, 0.0, 0.0, 0.0],
         [1.0, 1.0, 0.0, 0.0, 0.0],
         [1.0, 1.0, 0.0, 0.0, 0.0],
         [1.0, 1.0, 0.0, 0.0, 0.0]],
        [[1.0, 0.0, 0.0, 0.0, 0.0],
         [1.0, 1.0, 0.0, 0.0, 0.0],
         [1.0, 1.0, 1.0, 0.0, 0.0],
         [1.0, 1.0, 1.0, 1.0, 0.0],
         [1.0, 1.0, 1.0, 1.0, 0.0]],
        [[1.0, 0.0, 0.0, 0.0, 0.0],
         [1.0, 1.0, 0.0, 0.0, 0.0],
         [1.0, 1.0, 1.0, 0.0, 0.0],
         [1.0, 1.0, 1.0, 0.0, 0.0],
         [1.0, 1.0, 1.0, 0.0, 0.0]]]

    mask = transformer.build_future_mask(
        tf.constant(length), num_heads=num_heads, maximum_length=maximum_length)

    with self.test_session() as sess:
      mask = sess.run(mask)
      # mask has shape [batch, num_heads, max_len, max_len]; move heads first so
      # each slice holds the masks of the whole batch.
      mask = np.transpose(mask, (1, 0, 2, 3))
      for h in range(num_heads):
        self.assertAllEqual(expected, mask[h])
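The expected tensors above combine a causal (lower-triangular) pattern with per-sequence length masking: query position t may attend to key positions up to min(t, length - 1). As a rough orientation only, here is a minimal sketch of how such a mask can be assembled from standard TensorFlow ops; it is not the library's transformer.build_future_mask, and the examples below show that the handling of num_heads (explicitly tiled here versus a broadcastable singleton dimension) differs between versions.

import tensorflow as tf

def build_future_mask_sketch(sequence_length, num_heads=None,
                             maximum_length=None, dtype=tf.float32):
  """Hypothetical re-implementation, for illustration only."""
  if maximum_length is None:
    maximum_length = tf.reduce_max(sequence_length)
  # [batch, max_len]: 1 for valid key positions, 0 beyond the sequence length.
  keys_mask = tf.sequence_mask(sequence_length, maxlen=maximum_length, dtype=dtype)
  # [max_len, max_len]: lower-triangular causal pattern.
  causal = tf.linalg.band_part(
      tf.ones([maximum_length, maximum_length], dtype=dtype), -1, 0)
  # [batch, max_len, max_len]: query t attends to keys up to min(t, length - 1).
  mask = tf.minimum(tf.expand_dims(keys_mask, 1), tf.expand_dims(causal, 0))
  mask = tf.expand_dims(mask, 1)  # [batch, 1, max_len, max_len]
  if num_heads is not None:
    mask = tf.tile(mask, [1, num_heads, 1, 1])  # tile over heads, as this test expects
  return mask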
Example No. 2
    def testMaskedScaledDotAttention(self):
        batch_size = 3
        num_heads = 8
        queries_length = [8, 6, 10]
        depth = 20

        queries = tf.placeholder_with_default(
            np.random.randn(batch_size, num_heads, max(queries_length),
                            depth).astype(np.float32),
            shape=(None, num_heads, None, depth))

        mask = transformer.build_future_mask(queries_length,
                                             num_heads=num_heads)
        context, attn = transformer.dot_product_attention(
            queries,
            queries,
            queries,
            tf.estimator.ModeKeys.PREDICT,
            mask=mask)

        with self.test_session() as sess:
            context, attn = sess.run([context, attn])
            illegal_connections = np.triu_indices(max(queries_length), 1)
            for i in range(batch_size):
                for h in range(num_heads):
                    self.assertEqual(0.0,
                                     np.sum(attn[i, h][illegal_connections]))
Example No. 3
    def _run(self,
             inputs,
             sequence_length=None,
             cache=None,
             memory=None,
             memory_sequence_length=None,
             step=None,
             training=None):
        # Process inputs.
        inputs *= self.num_units**0.5
        if self.position_encoder is not None:
            inputs = self.position_encoder(
                inputs, position=step + 1 if step is not None else None)
        inputs = common.dropout(inputs, self.dropout, training=training)

        # Prepare query mask.
        mask = None
        if sequence_length is not None:
            mask = transformer.build_future_mask(
                sequence_length, maximum_length=tf.shape(inputs)[1])

        # Prepare memory mask.
        memory_mask = None
        if memory is not None:
            if not isinstance(memory, (list, tuple)):
                memory = (memory, )
        if memory_sequence_length is not None:
            if not isinstance(memory_sequence_length, (list, tuple)):
                memory_sequence_length = (memory_sequence_length, )
            memory_mask = []
            for mem, mem_length in zip(memory, memory_sequence_length):
                mem_mask = tf.sequence_mask(mem_length,
                                            maxlen=tf.shape(mem)[1],
                                            dtype=tf.float32)
                mem_mask = tf.expand_dims(mem_mask, 1)
                memory_mask.append(mem_mask)

        # Run each layer.
        new_cache = []
        for i, layer in enumerate(self.layers):
            inputs, layer_cache, attention = layer(
                inputs,
                mask=mask,
                memory=memory,
                memory_mask=memory_mask,
                cache=cache[i] if cache is not None else None,
                training=training)
            new_cache.append(layer_cache)
        outputs = self.layer_norm(inputs)
        return outputs, new_cache, attention
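In this variant the memory mask is built with tf.sequence_mask and kept as [batch, 1, memory_len]; the attention layers are then expected to broadcast it against per-head attention scores. A small standalone illustration of that broadcasting, with assumed shapes rather than library code (the extra singleton axis for the heads dimension is added inline here):

import numpy as np
import tensorflow as tf

batch, heads, queries, keys = 2, 8, 5, 7
scores = tf.constant(np.random.randn(batch, heads, queries, keys).astype(np.float32))

# [batch, keys] -> [batch, 1, 1, keys]: broadcasts over heads and query positions.
memory_mask = tf.sequence_mask([7, 4], maxlen=keys, dtype=tf.float32)
memory_mask = tf.reshape(memory_mask, [batch, 1, 1, keys])

# Keys beyond each memory length get a large negative bias and vanish in the softmax.
masked_scores = scores * memory_mask - 1e9 * (1.0 - memory_mask)
weights = tf.nn.softmax(masked_scores)  # weights[1, :, :, 4:] is all zeros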
Example No. 4
    def testBuildFutureMask(self):
        num_heads = 4
        length = [2, 4, 3]
        expected = [[[1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0],
                     [1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0]],
                    [[1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0],
                     [1.0, 1.0, 1.0, 0.0], [1.0, 1.0, 1.0, 1.0]],
                    [[1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0],
                     [1.0, 1.0, 1.0, 0.0], [1.0, 1.0, 1.0, 0.0]]]

        mask = transformer.build_future_mask(tf.constant(length),
                                             num_heads=num_heads)
        mask = self.evaluate(mask)
        self.assertTupleEqual(mask.shape,
                              (len(length), 1, max(length), max(length)))
        self.assertAllEqual(np.squeeze(mask), expected)
Example No. 5
    def testBuildFutureMaskWithMaxLen(self):
        num_heads = 4
        length = [2, 4, 3]
        maximum_length = 5
        expected = [[[1.0, 0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0, 0.0],
                     [1.0, 1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0, 0.0],
                     [1.0, 1.0, 0.0, 0.0, 0.0]],
                    [[1.0, 0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0, 0.0],
                     [1.0, 1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0, 0.0],
                     [1.0, 1.0, 1.0, 1.0, 0.0]],
                    [[1.0, 0.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0, 0.0],
                     [1.0, 1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.0, 0.0],
                     [1.0, 1.0, 1.0, 0.0, 0.0]]]

        mask = transformer.build_future_mask(tf.constant(length),
                                             num_heads=num_heads,
                                             maximum_length=maximum_length)

        with self.test_session() as sess:
            mask = sess.run(mask)
            self.assertTupleEqual(
                mask.shape, (len(length), 1, maximum_length, maximum_length))
            self.assertAllEqual(np.squeeze(mask), expected)
Example No. 6
  def testMaskedScaledDotAttention(self):
    batch_size = 3
    num_heads = 8
    queries_length = [8, 6, 10]
    depth = 20

    queries = tf.placeholder_with_default(
        np.random.randn(batch_size, num_heads, max(queries_length), depth).astype(np.float32),
        shape=(None, num_heads, None, depth))

    mask = transformer.build_future_mask(queries_length, num_heads=num_heads)
    context, attn = transformer.dot_product_attention(
        queries,
        queries,
        queries,
        tf.estimator.ModeKeys.PREDICT,
        mask=mask)

    with self.test_session() as sess:
      context, attn = sess.run([context, attn])
      illegal_connections = np.triu_indices(max(queries_length), 1)
      for i in range(batch_size):
        for h in range(num_heads):
          self.assertEqual(0.0, np.sum(attn[i, h][illegal_connections]))
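The assertion above only passes if the attention weights at future positions are exactly zero after the softmax. The following is a minimal sketch of the standard way a 0/1 mask is applied inside scaled dot-product attention; it is an assumed recipe, not the actual code of transformer.dot_product_attention.

import tensorflow as tf

def masked_dot_product_attention_sketch(queries, keys, values, mask=None):
  """queries/keys/values: [batch, heads, time, depth]; mask: broadcastable 0/1 tensor."""
  depth = tf.cast(tf.shape(queries)[-1], queries.dtype)
  scores = tf.matmul(queries, keys, transpose_b=True) / tf.sqrt(depth)
  if mask is not None:
    # A large negative bias where mask == 0 drives those probabilities to exactly zero.
    scores -= 1e9 * (1.0 - mask)
  weights = tf.nn.softmax(scores)
  return tf.matmul(weights, values), weights

With a future mask built as in the test, every strictly upper-triangular entry of weights[i, h] is zero, which is exactly what np.triu_indices(max(queries_length), 1) selects.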
Example No. 7
    def _self_attention_stack(self,
                              inputs,
                              sequence_length=None,
                              mode=tf.estimator.ModeKeys.TRAIN,
                              cache=None,
                              memory=None,
                              memory_sequence_length=None):
        inputs = tf.layers.dropout(
            inputs,
            rate=self.dropout,
            training=mode == tf.estimator.ModeKeys.TRAIN)

        decoder_mask = None
        memory_mask = None

        if sequence_length is not None:
            decoder_mask = transformer.build_future_mask(
                sequence_length,
                num_heads=self.num_heads,
                maximum_length=tf.shape(inputs)[1],
                dtype=inputs.dtype)
        if memory is not None:
            if cache is not None:
                memory_mask = cache["memory_mask"]
            elif memory_sequence_length is not None:
                memory_mask = self._build_memory_mask(
                    memory, memory_sequence_length=memory_sequence_length)

        for l in range(self.num_layers):
            layer_name = "layer_{}".format(l)
            layer_cache = cache[layer_name] if cache is not None else None
            with tf.variable_scope(layer_name):
                with tf.variable_scope("masked_multi_head"):
                    encoded = transformer.multi_head_attention(
                        self.num_heads,
                        transformer.norm(inputs),
                        None,
                        mode,
                        num_units=self.num_units,
                        mask=decoder_mask,
                        cache=layer_cache,
                        dropout=self.attention_dropout)
                    encoded = transformer.drop_and_add(inputs,
                                                       encoded,
                                                       mode,
                                                       dropout=self.dropout)

                if memory is not None:
                    with tf.variable_scope("multi_head"):
                        context = transformer.multi_head_attention(
                            self.num_heads,
                            transformer.norm(encoded),
                            memory,
                            mode,
                            mask=memory_mask,
                            cache=layer_cache,
                            dropout=self.attention_dropout)
                        context = transformer.drop_and_add(
                            encoded, context, mode, dropout=self.dropout)
                else:
                    # No memory: pass the self-attention output to the feed-forward block.
                    context = encoded

                with tf.variable_scope("ffn"):
                    transformed = transformer.feed_forward(
                        transformer.norm(context),
                        self.ffn_inner_dim,
                        mode,
                        dropout=self.relu_dropout)
                    transformed = transformer.drop_and_add(
                        context, transformed, mode, dropout=self.dropout)

                inputs = transformed

        outputs = transformer.norm(inputs)
        return outputs
Example No. 8
  def _self_attention_stack(self,
                            inputs,
                            sequence_length=None,
                            mode=tf.estimator.ModeKeys.TRAIN,
                            cache=None,
                            memory=None,
                            memory_sequence_length=None,
                            step=None):
    inputs *= self.num_units**0.5
    if self.position_encoder is not None:
      if step is None:
        inputs = self.position_encoder(inputs, sequence_length=sequence_length)
      else:
        inputs = self.position_encoder.apply_one(inputs, step + 1)

    inputs = tf.layers.dropout(
        inputs,
        rate=self.dropout,
        training=mode == tf.estimator.ModeKeys.TRAIN)

    decoder_mask = None
    memory_mask = None
    last_attention = None

    if self.self_attention_type == "scaled_dot":
      if sequence_length is not None:
        decoder_mask = transformer.build_future_mask(
            sequence_length,
            num_heads=self.num_heads,
            maximum_length=tf.shape(inputs)[1])
    elif self.self_attention_type == "average":
      if cache is None:
        if sequence_length is None:
          sequence_length = tf.fill([tf.shape(inputs)[0]], tf.shape(inputs)[1])
        decoder_mask = transformer.cumulative_average_mask(
            sequence_length, maximum_length=tf.shape(inputs)[1], dtype=inputs.dtype)

    if memory is not None and memory_sequence_length is not None:
      memory_mask = transformer.build_sequence_mask(
          memory_sequence_length,
          num_heads=self.num_heads,
          maximum_length=tf.shape(memory)[1])

    for l in range(self.num_layers):
      layer_name = "layer_{}".format(l)
      layer_cache = cache[layer_name] if cache is not None else None
      with tf.variable_scope(layer_name):
        if self.self_attention_type == "scaled_dot":
          with tf.variable_scope("masked_multi_head"):
            encoded = transformer.multi_head_attention(
                self.num_heads,
                transformer.norm(inputs),
                None,
                mode,
                num_units=self.num_units,
                mask=decoder_mask,
                cache=layer_cache,
                dropout=self.attention_dropout)
            encoded = transformer.drop_and_add(
                inputs,
                encoded,
                mode,
                dropout=self.dropout)
        elif self.self_attention_type == "average":
          with tf.variable_scope("average_attention"):
            # Cumulative average.
            x = transformer.norm(inputs)
            y = transformer.cumulative_average(
                x, decoder_mask if cache is None else step, cache=layer_cache)
            # FFN.
            y = transformer.feed_forward(
                y, self.ffn_inner_dim, mode, dropout=self.relu_dropout)
            # Gating layer.
            z = tf.layers.dense(tf.concat([x, y], -1), self.num_units * 2)
            i, f = tf.split(z, 2, axis=-1)
            y = tf.sigmoid(i) * x + tf.sigmoid(f) * y
            encoded = transformer.drop_and_add(
                inputs, y, mode, dropout=self.dropout)

        if memory is not None:
          with tf.variable_scope("multi_head"):
            context, last_attention = transformer.multi_head_attention(
                self.num_heads,
                transformer.norm(encoded),
                memory,
                mode,
                mask=memory_mask,
                cache=layer_cache,
                dropout=self.attention_dropout,
                return_attention=True)
            context = transformer.drop_and_add(
                encoded,
                context,
                mode,
                dropout=self.dropout)
        else:
          context = encoded

        with tf.variable_scope("ffn"):
          transformed = transformer.feed_forward(
              transformer.norm(context),
              self.ffn_inner_dim,
              mode,
              dropout=self.relu_dropout)
          transformed = transformer.drop_and_add(
              context,
              transformed,
              mode,
              dropout=self.dropout)

        inputs = transformed

    if last_attention is not None:
      # The first head of the last layer is returned.
      first_head_attention = last_attention[:, 0]
    else:
      first_head_attention = None

    outputs = transformer.norm(inputs)
    return outputs, first_head_attention
Example No. 9
  def _self_attention_stack(self,
                            inputs,
                            sequence_length=None,
                            mode=tf.estimator.ModeKeys.TRAIN,
                            cache=None,
                            memory=None,
                            memory_sequence_length=None):
    inputs = tf.layers.dropout(
        inputs,
        rate=self.dropout,
        training=mode == tf.estimator.ModeKeys.TRAIN)

    decoder_mask = None
    memory_mask = None

    if sequence_length is not None:
      decoder_mask = transformer.build_future_mask(
          sequence_length,
          num_heads=self.num_heads,
          maximum_length=tf.shape(inputs)[1],
          dtype=inputs.dtype)
    if memory is not None:
      if cache is not None:
        memory_mask = cache["memory_mask"]
      elif memory_sequence_length is not None:
        memory_mask = self._build_memory_mask(
            memory, memory_sequence_length=memory_sequence_length)

    for l in range(self.num_layers):
      layer_name = "layer_{}".format(l)
      layer_cache = cache[layer_name] if cache is not None else None
      with tf.variable_scope(layer_name):
        with tf.variable_scope("masked_multi_head"):
          encoded = transformer.multi_head_attention(
              self.num_heads,
              transformer.norm(inputs),
              None,
              mode,
              num_units=self.num_units,
              mask=decoder_mask,
              cache=layer_cache,
              dropout=self.attention_dropout)
          encoded = transformer.drop_and_add(
              inputs,
              encoded,
              mode,
              dropout=self.dropout)

        if memory is not None:
          with tf.variable_scope("multi_head"):
            context = transformer.multi_head_attention(
                self.num_heads,
                transformer.norm(encoded),
                memory,
                mode,
                mask=memory_mask,
                cache=layer_cache,
                dropout=self.attention_dropout)
            context = transformer.drop_and_add(
                encoded,
                context,
                mode,
                dropout=self.dropout)
        else:
          # No memory: pass the self-attention output to the feed-forward block.
          context = encoded

        with tf.variable_scope("ffn"):
          transformed = transformer.feed_forward(
              transformer.norm(context),
              self.ffn_inner_dim,
              mode,
              dropout=self.relu_dropout)
          transformed = transformer.drop_and_add(
              context,
              transformed,
              mode,
              dropout=self.dropout)

        inputs = transformed

    outputs = transformer.norm(inputs)
    return outputs
Example No. 10
    def _self_attention_stack(self,
                              inputs,  # batch, max_dec_len, emb_dim
                              sequence_length=None,  # [batch]
                              mode=tf.estimator.ModeKeys.TRAIN,
                              cache=None,
                              memory=None,  # [batch, enc_len, num_units]
                              memory_sequence_length=None,  # [batch]
                              step=None):
        inputs *= self.num_units ** 0.5
        if self.position_encoder is not None:
            inputs = self.position_encoder(inputs, position=step + 1 if step is not None else None)
            # inputs [batch, max_dec_len, emb_dim]
        inputs = tf.layers.dropout(  # batch, max_dec_len, emb_dim
            inputs,
            rate=self.dropout,
            training=mode == tf.estimator.ModeKeys.TRAIN)

        decoder_mask = None
        memory_mask = None
        last_attention = None

        if self.self_attention_type == "scaled_dot":
            if sequence_length is not None:  # None during step-wise decoding, set at training time
                decoder_mask = transformer.build_future_mask(  # [batch, 1, max_dec_len, max_dec_len]
                    sequence_length,
                    num_heads=self.num_heads,
                    maximum_length=tf.shape(inputs)[1])
        elif self.self_attention_type == "average":
            if cache is None:
                if sequence_length is None:
                    sequence_length = tf.fill([tf.shape(inputs)[0]], tf.shape(inputs)[1])
                decoder_mask = transformer.cumulative_average_mask(
                    sequence_length, maximum_length=tf.shape(inputs)[1], dtype=inputs.dtype)

        if memory is not None and not tf.contrib.framework.nest.is_sequence(memory):
            memory = (memory,)
        if memory_sequence_length is not None:
            if not tf.contrib.framework.nest.is_sequence(memory_sequence_length):
                memory_sequence_length = (memory_sequence_length,)
            memory_mask = [  # [batch, 1, 1, enc_len]
                transformer.build_sequence_mask(
                    length, num_heads=self.num_heads, maximum_length=tf.shape(m)[1])
                for m, length in zip(memory, memory_sequence_length)]

        for l in range(self.num_layers):
            layer_name = "layer_{}".format(l)
            layer_cache = cache[layer_name] if cache is not None else None  # no cache at training time; the cache is provided during decoding
            with tf.variable_scope(layer_name):
                # Self-attention encodes the decoder input (training) or the last step's output (decoding).
                if self.self_attention_type == "scaled_dot":
                    with tf.variable_scope("masked_multi_head"):
                        encoded = transformer.multi_head_attention(  # [batch, decode_len, hidden]
                            self.num_heads,
                            transformer.norm(inputs),
                            None,
                            mode,
                            num_units=self.num_units,
                            mask=decoder_mask,  # [batch, 1, len, len]
                            cache=layer_cache,
                            dropout=self.attention_dropout)
                        last_context = transformer.drop_and_add(  # [batch, decode_len, hidden]
                            inputs,
                            encoded,
                            mode,
                            dropout=self.dropout)
                elif self.self_attention_type == "average":
                    with tf.variable_scope("average_attention"):
                        # Cumulative average.
                        x = transformer.norm(inputs)
                        y = transformer.cumulative_average(
                            x, decoder_mask if cache is None else step, cache=layer_cache)
                        # FFN.
                        y = transformer.feed_forward(
                            y, self.ffn_inner_dim, mode, dropout=self.relu_dropout)
                        # Gating layer.
                        z = tf.layers.dense(tf.concat([x, y], -1), self.num_units * 2)
                        i, f = tf.split(z, 2, axis=-1)
                        y = tf.sigmoid(i) * x + tf.sigmoid(f) * y
                        last_context = transformer.drop_and_add(
                            inputs, y, mode, dropout=self.dropout)

                # Attend to the encoder memory using the decoder context.
                if memory is not None:
                    for i, (mem, mask) in enumerate(zip(memory, memory_mask)):
                        # No cache at training time; the cache is provided during decoding.
                        memory_cache = (layer_cache["memory"][i]
                                        if layer_cache is not None else None)
                        with tf.variable_scope("multi_head" if i == 0 else "multi_head_%d" % i):
                            context, last_attention = transformer.multi_head_attention(
                                self.num_heads,
                                transformer.norm(last_context),
                                mem,  # [batch, enc_len, dim]
                                mode,
                                mask=mask,  # [batch, 1, 1, len]
                                cache=memory_cache,
                                dropout=self.attention_dropout,
                                return_attention=True)
                            # context: [batch, decode_len, num_units]; last_attention (training): [batch, heads, dec_len, enc_len]
                            last_context = transformer.drop_and_add(
                                last_context,  # [batch, decode_len, num_units]
                                context,
                                mode,
                                dropout=self.dropout)
                            if i > 0:  # Do not return attention in case of multi source.
                                last_attention = None

                with tf.variable_scope("ffn"):
                    transformed = transformer.feed_forward(  # [batch, decode_len, num_units]
                        transformer.norm(last_context),
                        self.ffn_inner_dim,
                        mode,
                        dropout=self.relu_dropout)
                    transformed = transformer.drop_and_add(  # [batch, decode_len, num_units]
                        last_context,
                        transformed,
                        mode,
                        dropout=self.dropout)

                inputs = transformed

        if last_attention is not None:
            # The first head of the last layer is returned.
            first_head_attention = last_attention[:, 0]
        else:
            first_head_attention = None

        outputs = transformer.norm(inputs)  # [batch, decode_len, num_units]
        return outputs, first_head_attention