def ffn_layer(x, hparams):
    """ffn layer transformer."""
    with tf.variable_scope("ffn"):
        if hparams.ffn_layer == "none":
            return x
        if hparams.ffn_layer == "conv_hidden_relu":
            y = common_layers.dense_relu_dense(x,
                                               hparams.filter_size,
                                               hparams.hidden_size,
                                               dropout=hparams.relu_dropout)
        elif hparams.ffn_layer == "normed_conv_hidden_relu":
            y = common_layers.normed_conv_hidden_relu(
                x,
                hparams.norm_type,
                hparams.layer_norm_epsilon,
                hparams.filter_size,
                hparams.hidden_size,
                dropout=hparams.relu_dropout,
                norm_name="convnorm")
        elif hparams.ffn_layer == "self_attention_ffn":
            x_shape = tf.shape(x)
            x = tf.reshape(x, [x_shape[0], -1, hparams.hidden_size])
            y = common_attention.ffn_self_attention_layer(
                x, hparams.filter_size, hparams.hidden_size, hparams.num_parts,
                hparams.attention_dropout, hparams.share_kv)
            y = tf.reshape(y, x_shape)
        else:
            assert hparams.ffn_layer == "glu_ffn"
            y = common_layers.gated_linear_unit_layer(x)
        return y
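
The "conv_hidden_relu" branch above is the standard position-wise feed-forward network, relu(x W1 + b1) W2 + b2, applied independently at every position. A minimal NumPy sketch of that computation follows; it is illustrative only (random stand-in weights, no dropout), not the tensor2tensor common_layers.dense_relu_dense implementation.

import numpy as np

def dense_relu_dense_ref(x, filter_size, hidden_size):
    """Position-wise FFN: relu(x W1 + b1) W2 + b2 over the last axis."""
    d_in = x.shape[-1]
    w1 = np.random.randn(d_in, filter_size) * 0.02
    b1 = np.zeros(filter_size)
    w2 = np.random.randn(filter_size, hidden_size) * 0.02
    b2 = np.zeros(hidden_size)
    h = np.maximum(x @ w1 + b1, 0.0)  # hidden layer of width filter_size, ReLU
    return h @ w2 + b2                # project back down to hidden_size

x = np.random.randn(2, 5, 64)         # [batch, length, hidden_size]
y = dense_relu_dense_ref(x, filter_size=256, hidden_size=64)
print(y.shape)                        # (2, 5, 64)
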
Example #2
    def body(self, features):
        assert self._hparams.block_size > 0
        assert not common_layers.is_xla_compiled()
        assert "targets_segmentation" not in features

        decoder_output = super(TransformerBlockParallel, self).body(features)
        assert not isinstance(decoder_output, tuple)
        assert len(decoder_output.shape) == 4

        relu_dropout_broadcast_dims = (
            common_layers.comma_separated_string_to_integer_list(
                getattr(self._hparams, "relu_dropout_broadcast_dims", "")))

        with tf.variable_scope("block_size_%d" % self._hparams.block_size):
            block_output = common_layers.dense_relu_dense(
                decoder_output,
                self._hparams.block_size * self._hparams.filter_size,
                self._hparams.block_size * self._hparams.hidden_size,
                dropout=self._hparams.relu_dropout,
                dropout_broadcast_dims=relu_dropout_broadcast_dims)

        batch_size, length = common_layers.shape_list(decoder_output)[:2]
        block_output = tf.reshape(block_output, [
            batch_size, length, self._hparams.block_size,
            self._hparams.hidden_size
        ])

        block_output = common_layers.layer_postprocess(decoder_output,
                                                       block_output,
                                                       self._hparams)

        return block_output
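
The wide FFN above produces block_size * hidden_size features per position, and the reshape splits them into block_size separate hidden vectors, one per block position to be predicted. A NumPy sketch of just that step, simplified to three axes (the actual decoder_output is 4-D):

import numpy as np

batch_size, length, block_size, hidden_size = 2, 7, 4, 16

# Output of the widened FFN: one long vector per position.
block_output = np.random.randn(batch_size, length, block_size * hidden_size)

# Split the last axis so each of the block_size predicted positions gets its own vector.
block_output = block_output.reshape(batch_size, length, block_size, hidden_size)
print(block_output.shape)  # (2, 7, 4, 16)
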
def ffn_layer(x, hparams):
  """ffn layer transformer."""
  with tf.variable_scope("ffn"):
    if hparams.ffn_layer == "none":
      return x
    if hparams.ffn_layer == "conv_hidden_relu":
      y = common_layers.dense_relu_dense(
          x,
          hparams.filter_size,
          hparams.hidden_size,
          dropout=hparams.relu_dropout)
    elif hparams.ffn_layer == "normed_conv_hidden_relu":
      y = common_layers.normed_conv_hidden_relu(
          x,
          hparams.norm_type,
          hparams.layer_norm_epsilon,
          hparams.filter_size,
          hparams.hidden_size,
          dropout=hparams.relu_dropout,
          norm_name="convnorm")
    elif hparams.ffn_layer == "self_attention_ffn":
      x_shape = tf.shape(x)
      x = tf.reshape(x, [x_shape[0], -1, hparams.hidden_size])
      y = common_attention.ffn_self_attention_layer(
          x, hparams.filter_size, hparams.hidden_size, hparams.num_parts,
          hparams.attention_dropout, hparams.share_kv)
      y = tf.reshape(y, x_shape)
    else:
      assert hparams.ffn_layer == "glu_ffn"
      y = common_layers.gated_linear_unit_layer(x)
    return y
    def body(self, features):
        assert self._hparams.block_size > 0
        assert not common_layers.is_xla_compiled()

        hparams = copy.copy(self._hparams)
        targets = features["targets"]
        inputs = features["inputs"]
        if not (tf.get_variable_scope().reuse
                or hparams.mode == tf.estimator.ModeKeys.PREDICT):
            tf.summary.image("inputs", inputs, max_outputs=1)
            tf.summary.image("targets", targets, max_outputs=1)

        encoder_input = cia.prepare_encoder(inputs, hparams)
        encoder_output = cia.transformer_encoder_layers(
            encoder_input,
            hparams.num_encoder_layers,
            hparams,
            attention_type=hparams.enc_attention_type,
            name="encoder")
        decoder_input, rows, cols = cia.prepare_decoder(targets, hparams)
        decoder_output = cia.transformer_decoder_layers(
            decoder_input,
            encoder_output,
            hparams.num_decoder_layers,
            hparams,
            attention_type=hparams.dec_attention_type,
            name="decoder")

        assert not isinstance(decoder_output, tuple)
        assert len(decoder_output.shape) == 4

        relu_dropout_broadcast_dims = (
            common_layers.comma_separated_string_to_integer_list(
                getattr(self._hparams, "relu_dropout_broadcast_dims", "")))

        with tf.variable_scope("block_size_%d" % self._hparams.block_size):
            tf.logging.info("Using block_size %d", self._hparams.block_size)
            block_output = common_layers.dense_relu_dense(
                decoder_output,
                self._hparams.block_size * self._hparams.filter_size,
                self._hparams.block_size * self._hparams.hidden_size,
                dropout=self._hparams.relu_dropout,
                dropout_broadcast_dims=relu_dropout_broadcast_dims)

        batch_size, rows, cols = common_layers.shape_list(decoder_output)[:3]
        decoder_output = tf.reshape(
            decoder_output,
            [batch_size, rows, cols, 1, self._hparams.hidden_size])
        block_output = tf.reshape(block_output, [
            batch_size, rows, cols, self._hparams.block_size,
            self._hparams.hidden_size
        ])

        block_output = common_layers.layer_postprocess(decoder_output,
                                                       block_output,
                                                       self._hparams)

        return block_output
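
Here decoder_output gains a size-1 block axis so that layer_postprocess can add the same decoder state as a residual to every one of the block_size block outputs by broadcasting. A NumPy sketch of that broadcast with toy sizes (residual add only; the real layer_postprocess also applies dropout or normalization according to hparams):

import numpy as np

batch, rows, cols, block_size, hidden = 2, 3, 3, 4, 8

decoder_output = np.random.randn(batch, rows, cols, 1, hidden)        # size-1 block axis
block_output = np.random.randn(batch, rows, cols, block_size, hidden)

# The residual add broadcasts one decoder state across all block positions.
combined = decoder_output + block_output
print(combined.shape)  # (2, 3, 3, 4, 8)
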
Example #5
def image_encoder(image_feat,
                  hparams,
                  name="image_encoder",
                  save_weights_to=None,
                  make_image_summary=True):
    """A stack of self attention layers."""

    x = image_feat
    with tf.variable_scope(name):
        for layer in range(hparams.num_encoder_layers
                           or hparams.num_hidden_layers):
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    y = vqa_layers.multihead_attention(
                        common_layers.layer_preprocess(x, hparams),
                        None,
                        None,
                        hparams.attention_key_channels
                        or hparams.image_hidden_size,
                        hparams.attention_value_channels
                        or hparams.image_hidden_size,
                        hparams.image_hidden_size,
                        hparams.num_heads,
                        hparams.attention_dropout,
                        attention_type=hparams.self_attention_type,
                        save_weights_to=save_weights_to,
                        max_relative_position=None,
                        make_image_summary=make_image_summary,
                        dropout_broadcast_dims=None,
                        max_length=None,
                        vars_3d=False,
                        scale_dotproduct=hparams.scale_dotproduct)
                    utils.collect_named_outputs("norms",
                                                "image_feat_self_attention",
                                                tf.norm(y, axis=-1))
                    x = common_layers.layer_postprocess(x, y, hparams)
                    utils.collect_named_outputs(
                        "norms", "image_feat_self_attention_zero_add",
                        tf.norm(x, axis=-1))
                with tf.variable_scope("ffn"):
                    y = common_layers.dense_relu_dense(
                        common_layers.layer_preprocess(x, hparams),
                        hparams.image_filter_size,
                        hparams.image_hidden_size,
                        dropout=hparams.relu_dropout,
                        dropout_broadcast_dims=None)
                    utils.collect_named_outputs("norms", "image_feat_ffn",
                                                tf.norm(y, axis=-1))
                    x = common_layers.layer_postprocess(x, y, hparams)
                    utils.collect_named_outputs("norms",
                                                "image_feat_ffn_zero_add",
                                                tf.norm(x, axis=-1))
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        return common_layers.layer_preprocess(x, hparams)
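
Every sub-layer in this stack follows the same wiring: layer_preprocess normalizes the input, the sub-layer runs, and layer_postprocess adds the residual, with one final layer_preprocess on the stack output as the comment above notes. A NumPy sketch of that pattern, assuming the common defaults of layer-norm preprocessing and dropout-plus-add postprocessing (dropout omitted):

import numpy as np

def layer_norm(x, eps=1e-6):
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return (x - mean) / np.sqrt(var + eps)

def sublayer_with_residual(x, sublayer_fn):
    # layer_preprocess: normalize; layer_postprocess: residual add
    # (the real "da" postprocess also applies dropout before the add).
    return x + sublayer_fn(layer_norm(x))

x = np.random.randn(2, 5, 16)
x = sublayer_with_residual(x, lambda h: h * 0.5)             # stand-in for self-attention
x = sublayer_with_residual(x, lambda h: np.maximum(h, 0.0))  # stand-in for the FFN
x = layer_norm(x)  # final layer_preprocess on the stack output
print(x.shape)     # (2, 5, 16)
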
Example #6
def question_encoder(question,
                     question_self_attention_bias,
                     hparams,
                     name="question_encoder",
                     save_weights_to=None,
                     make_image_summary=True):
    """A stack of self attention layers."""
    x = question
    with tf.variable_scope(name):
        for layer in range(hparams.num_encoder_layers
                           or hparams.num_hidden_layers):
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    y = vqa_layers.multihead_attention(
                        common_layers.layer_preprocess(x, hparams),
                        None,
                        question_self_attention_bias,
                        hparams.attention_key_channels or hparams.hidden_size,
                        hparams.attention_value_channels
                        or hparams.hidden_size,
                        hparams.hidden_size,
                        hparams.num_heads,
                        hparams.attention_dropout,
                        attention_type=hparams.question_self_attention_type,
                        block_length=hparams.block_length,
                        save_weights_to=save_weights_to,
                        make_image_summary=make_image_summary,
                        scale_dotproduct=hparams.scale_dotproduct,
                    )
                    utils.collect_named_outputs(
                        "norms", "query_self_attention_%d" % (layer),
                        tf.norm(y, axis=-1))
                    x = common_layers.layer_postprocess(x, y, hparams)
                    utils.collect_named_outputs(
                        "norms",
                        "query_self_attention_postprocess_%d" % (layer),
                        tf.norm(x, axis=-1))
                with tf.variable_scope("ffn"):
                    y = common_layers.dense_relu_dense(
                        common_layers.layer_preprocess(x, hparams),
                        hparams.filter_size,
                        hparams.hidden_size,
                        dropout=hparams.relu_dropout,
                    )
                    utils.collect_named_outputs("norms",
                                                "query_ffn_%d" % (layer),
                                                tf.norm(y, axis=-1))
                    x = common_layers.layer_postprocess(x, y, hparams)
                    utils.collect_named_outputs(
                        "norms", "query_ffn_postprocess_%d" % (layer),
                        tf.norm(x, axis=-1))
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        return common_layers.layer_preprocess(x, hparams)
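
question_self_attention_bias is an additive attention bias: approximately zero at real tokens and a large negative value at padding, so the softmax gives padded keys near-zero weight. A NumPy sketch of building and applying such a bias (illustrative; in the library it comes from helpers such as common_attention.attention_bias_ignore_padding):

import numpy as np

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

padding = np.array([[0., 0., 0., 1., 1.]])   # 1.0 marks padded positions, [batch, length]
bias = padding * -1e9                        # additive bias over the memory axis

logits = np.random.randn(1, 5, 5)            # [batch, query_length, memory_length]
weights = softmax(logits + bias[:, None, :]) # padded keys receive ~0 attention weight
print(weights[0].sum(axis=-1))               # each row still sums to 1
print(weights[0, :, 3:])                     # ~0 for the padded positions
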
Example #7
def image_encoder(image_feat,
                  hparams,
                  name="image_encoder",
                  save_weights_to=None,
                  make_image_summary=True):
  """A stack of self attention layers."""

  x = image_feat
  image_hidden_size = hparams.image_hidden_size or hparams.hidden_size
  image_filter_size = hparams.image_filter_size or hparams.filter_size
  with tf.variable_scope(name):
    for layer in range(hparams.num_encoder_layers or hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = vqa_layers.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              None,
              hparams.attention_key_channels or image_hidden_size,
              hparams.attention_value_channels or image_hidden_size,
              image_hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.image_self_attention_type,
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary,
              scale_dotproduct=hparams.scale_dotproduct,
          )
          utils.collect_named_outputs(
              "norms", "image_feat_self_attention_%d"%(layer),
              tf.norm(y, axis=-1))
          x = common_layers.layer_postprocess(x, y, hparams)
          utils.collect_named_outputs(
              "norms", "image_feat_self_attention_postprocess_%d"%(layer),
              tf.norm(x, axis=-1))
        with tf.variable_scope("ffn"):
          y = common_layers.dense_relu_dense(
              common_layers.layer_preprocess(x, hparams),
              image_filter_size,
              image_hidden_size,
              dropout=hparams.relu_dropout,
          )
          utils.collect_named_outputs(
              "norms", "image_feat_ffn_%d"%(layer), tf.norm(y, axis=-1))
          x = common_layers.layer_postprocess(x, y, hparams)
          utils.collect_named_outputs(
              "norms", "image_feat_ffn_postprocess_%d"%(layer),
              tf.norm(x, axis=-1))
    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    return common_layers.layer_preprocess(x, hparams)
def ffn_layer(x, hparams, losses=None):
    """ffn layer transformer."""
    with tf.variable_scope("ffn"):
        if hparams.ffn_layer == "none":
            return x
        if hparams.ffn_layer == "conv_hidden_relu":
            y = common_layers.dense_relu_dense(x,
                                               hparams.filter_size,
                                               hparams.hidden_size,
                                               dropout=hparams.relu_dropout)
        elif hparams.ffn_layer == "normed_conv_hidden_relu":
            y = common_layers.normed_conv_hidden_relu(
                x,
                hparams.norm_type,
                hparams.layer_norm_epsilon,
                hparams.filter_size,
                hparams.hidden_size,
                dropout=hparams.relu_dropout,
                norm_name="convnorm")
        elif hparams.ffn_layer == "self_attention_ffn":
            x_shape = tf.shape(x)
            x = tf.reshape(x, [x_shape[0], -1, hparams.hidden_size])
            y = common_attention.ffn_self_attention_layer(
                x, hparams.filter_size, hparams.hidden_size, hparams.num_parts,
                hparams.attention_dropout, hparams.share_kv)
            y = tf.reshape(y, x_shape)
        elif hparams.ffn_layer == "local_moe_tpu":
            overhead = (hparams.moe_overhead_train
                        if hparams.mode == tf.estimator.ModeKeys.TRAIN else
                        hparams.moe_overhead_eval)
            x, x_shape, is_4d = maybe_reshape_4d_to_3d(x)
            y, loss = expert_utils.local_moe_tpu(
                x,
                hparams.filter_size // 2,
                hparams.hidden_size,
                hparams.moe_num_experts,
                overhead=overhead,
                loss_coef=hparams.moe_loss_coef)
            if is_4d:
                y = tf.reshape(y, x_shape)
            if losses is None:
                raise ValueError(
                    "transformer_ffn_layer with type local_moe_tpu must pass in "
                    "a losses list")
            losses.append(loss)
        else:
            assert hparams.ffn_layer == "glu_ffn"
            y = common_layers.gated_linear_unit_layer(x)
        return y
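
Both the self_attention_ffn and local_moe_tpu branches above flatten a 4-D activation of shape [batch, height, width, hidden] into the 3-D [batch, length, hidden] layout those layers expect, and restore the original shape afterwards (that is what the is_4d flag from maybe_reshape_4d_to_3d is for). A NumPy sketch of the round trip with toy shapes:

import numpy as np

x = np.random.randn(2, 4, 6, 32)              # [batch, height, width, hidden]
x_shape = x.shape

x3d = x.reshape(x_shape[0], -1, x_shape[-1])  # [batch, height * width, hidden]
y3d = np.maximum(x3d, 0.0)                    # stand-in for the FFN / MoE layer
y = y3d.reshape(x_shape)                      # restore the original 4-D shape
print(x3d.shape, y.shape)                     # (2, 24, 32) (2, 4, 6, 32)
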
def ffn_layer(x, hparams, losses=None):
  """ffn layer transformer."""
  with tf.variable_scope("ffn"):
    if hparams.ffn_layer == "none":
      return x
    if hparams.ffn_layer == "conv_hidden_relu":
      y = common_layers.dense_relu_dense(
          x,
          hparams.filter_size,
          hparams.hidden_size,
          dropout=hparams.relu_dropout)
    elif hparams.ffn_layer == "normed_conv_hidden_relu":
      y = common_layers.normed_conv_hidden_relu(
          x,
          hparams.norm_type,
          hparams.layer_norm_epsilon,
          hparams.filter_size,
          hparams.hidden_size,
          dropout=hparams.relu_dropout,
          norm_name="convnorm")
    elif hparams.ffn_layer == "self_attention_ffn":
      x_shape = tf.shape(x)
      x = tf.reshape(x, [x_shape[0], -1, hparams.hidden_size])
      y = common_attention.ffn_self_attention_layer(
          x, hparams.filter_size, hparams.hidden_size, hparams.num_parts,
          hparams.attention_dropout, hparams.share_kv)
      y = tf.reshape(y, x_shape)
    elif hparams.ffn_layer == "local_moe_tpu":
      overhead = (hparams.moe_overhead_train
                  if hparams.mode == tf.estimator.ModeKeys.TRAIN
                  else hparams.moe_overhead_eval)
      x, x_shape, is_4d = maybe_reshape_4d_to_3d(x)
      y, loss = expert_utils.local_moe_tpu(
          x, hparams.filter_size // 2,
          hparams.hidden_size,
          hparams.moe_num_experts, overhead=overhead,
          loss_coef=hparams.moe_loss_coef)
      if is_4d:
        y = tf.reshape(y, x_shape)
      if losses is None:
        raise ValueError(
            "transformer_ffn_layer with type local_moe_tpu must pass in "
            "a losses list")
      losses.append(loss)
    else:
      assert hparams.ffn_layer == "glu_ffn"
      y = common_layers.gated_linear_unit_layer(x)
    return y
Example #10
def transformer_ffn_layer(x,
                          hparams,
                          pad_remover=None,
                          conv_padding="LEFT",
                          nonpadding_mask=None,
                          losses=None,
                          cache=None,
                          decode_loop_step=None,
                          readout_filter_size=0):
  """Feed-forward layer in the transformer.

  Args:
    x: a Tensor of shape [batch_size, length, hparams.hidden_size]
    hparams: hyperparameters for model
    pad_remover: an expert_utils.PadRemover object tracking the padding
      positions. If provided, when using convolutional settings, the padding
      is removed before applying the convolution, and restored afterward. This
      can give a significant speedup.
    conv_padding: a string - either "LEFT" or "SAME".
    nonpadding_mask: an optional Tensor with shape [batch_size, length].
      needed for convolutional layers with "SAME" padding.
      Contains 1.0 in positions corresponding to nonpadding.
    losses: optional list onto which to append extra training losses
    cache: dict, containing tensors which are the results of previous
        attentions, used for fast decoding.
    decode_loop_step: An integer, step number of the decoding loop.
        Only used for inference on TPU.
    readout_filter_size: if it's greater than 0, then it will be used instead of
      filter_size


  Returns:
    a Tensor of shape [batch_size, length, hparams.hidden_size]

  Raises:
    ValueError: If losses arg is None, but layer generates extra losses.
  """
  ffn_layer = hparams.ffn_layer
  relu_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "relu_dropout_broadcast_dims", "")))
  if ffn_layer == "conv_hidden_relu":
    # Backwards compatibility
    ffn_layer = "dense_relu_dense"
  if ffn_layer == "dense_relu_dense":
    # In simple convolution mode, use `pad_remover` to speed up processing.
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_FFN_FILTER_DENSE,
        value={
            "filter_size": hparams.filter_size,
            "use_bias": "True",
            "activation": mlperf_log.RELU
        })
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_FFN_OUTPUT_DENSE,
        value={
            "hidden_size": hparams.hidden_size,
            "use_bias": "True",
        })
    mlperf_log.transformer_print(
        key=mlperf_log.MODEL_HP_RELU_DROPOUT, value=hparams.relu_dropout)
    if pad_remover:
      original_shape = common_layers.shape_list(x)
      # Collapse `x` across examples, and remove padding positions.
      x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0))
      x = tf.expand_dims(pad_remover.remove(x), axis=0)
    conv_output = common_layers.dense_relu_dense(
        x,
        hparams.filter_size,
        hparams.hidden_size,
        dropout=hparams.relu_dropout,
        dropout_broadcast_dims=relu_dropout_broadcast_dims)
    if pad_remover:
      # Restore `conv_output` to the original shape of `x`, including padding.
      conv_output = tf.reshape(
          pad_remover.restore(tf.squeeze(conv_output, axis=0)), original_shape)
    return conv_output
  elif ffn_layer == "conv_relu_conv":
    return common_layers.conv_relu_conv(
        x,
        readout_filter_size or hparams.filter_size,
        hparams.hidden_size,
        first_kernel_size=hparams.conv_first_kernel,
        second_kernel_size=1,
        padding=conv_padding,
        nonpadding_mask=nonpadding_mask,
        dropout=hparams.relu_dropout,
        cache=cache,
        decode_loop_step=decode_loop_step)
  elif ffn_layer == "parameter_attention":
    return common_attention.parameter_attention(
        x, hparams.parameter_attention_key_channels or hparams.hidden_size,
        hparams.parameter_attention_value_channels or hparams.hidden_size,
        hparams.hidden_size, readout_filter_size or hparams.filter_size,
        hparams.num_heads,
        hparams.attention_dropout)
  elif ffn_layer == "conv_hidden_relu_with_sepconv":
    return common_layers.conv_hidden_relu(
        x,
        readout_filter_size or hparams.filter_size,
        hparams.hidden_size,
        kernel_size=(3, 1),
        second_kernel_size=(31, 1),
        padding="LEFT",
        dropout=hparams.relu_dropout)
  elif ffn_layer == "sru":
    return common_layers.sru(x)
  elif ffn_layer == "local_moe_tpu":
    overhead = (
        hparams.moe_overhead_train
        if hparams.mode == tf.estimator.ModeKeys.TRAIN else
        hparams.moe_overhead_eval)
    ret, loss = expert_utils.local_moe_tpu(
        x,
        hparams.filter_size // 2,
        hparams.hidden_size,
        hparams.moe_num_experts,
        overhead=overhead,
        loss_coef=hparams.moe_loss_coef)
    # The MoE layer produces an auxiliary loss that the caller must collect.
    if losses is None:
      raise ValueError(
          "transformer_ffn_layer with type local_moe_tpu must pass in "
          "a losses list")
    losses.append(loss)
    return ret
  elif ffn_layer == "local_moe":
    overhead = (
        hparams.moe_overhead_train
        if hparams.mode == tf.estimator.ModeKeys.TRAIN else
        hparams.moe_overhead_eval)
    ret, loss = expert_utils.local_moe(
        x,
        True,
        expert_utils.ffn_expert_fn(hparams.hidden_size, [hparams.filter_size],
                                   hparams.hidden_size),
        hparams.moe_num_experts,
        k=hparams.moe_k,
        hparams=hparams)
    losses.append(loss)
    return ret
  else:
    assert ffn_layer == "none"
    return x
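
pad_remover speeds up the dense_relu_dense branch by gathering only the non-padding positions, running the feed-forward layer on that shorter sequence, and scattering the results back into place. A NumPy sketch of the same idea with a boolean mask (illustrative; expert_utils.PadRemover operates on the flattened batch*length positions, exactly as the reshape above suggests):

import numpy as np

batch, length, hidden = 2, 6, 8
x = np.random.randn(batch, length, hidden)
nonpadding = np.array([[1, 1, 1, 0, 0, 0],
                       [1, 1, 1, 1, 1, 0]], dtype=bool)

flat_x = x.reshape(-1, hidden)        # collapse batch and length into one axis
flat_mask = nonpadding.reshape(-1)

compact = flat_x[flat_mask]           # "remove": keep only the real tokens
compact = np.maximum(compact, 0.0)    # stand-in for the feed-forward layer

restored = np.zeros_like(flat_x)      # "restore": scatter back, padding stays zero
restored[flat_mask] = compact
restored = restored.reshape(batch, length, hidden)
print(compact.shape, restored.shape)  # (8, 8) (2, 6, 8)
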
Example #12
def decoder(decoder_input,
            encoder_output,
            decoder_self_attention_bias,
            encoder_decoder_attention_bias,
            hparams,
            name="decoder",
            save_weights_to=None,
            make_image_summary=True,):
  """A stack of transformer layers.

  Args:
    decoder_input: a Tensor
    encoder_output: a Tensor
    decoder_self_attention_bias: bias Tensor for self-attention
      (see common_attention.attention_bias())
    encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.

  Returns:
    y: a Tensor
  """
  x = decoder_input
  with tf.variable_scope(name):
    for layer in range(hparams.num_decoder_layers or hparams.num_hidden_layers):
      layer_name = "layer_%d" % layer
      with tf.variable_scope(layer_name):
        with tf.variable_scope("self_attention"):
          y = common_attention.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              decoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary,
              )
          utils.collect_named_outputs("norms",
                                      "decoder_self_attention_%d"%(layer),
                                      tf.norm(y, axis=-1))
          x = common_layers.layer_postprocess(x, y, hparams)
          utils.collect_named_outputs("norms",
                                      "decoder_self_attention_post_%d"%(layer),
                                      tf.norm(x, axis=-1))
        if encoder_output is not None:
          with tf.variable_scope("encdec_attention"):
            y = common_attention.multihead_attention(
                common_layers.layer_preprocess(x, hparams),
                encoder_output,
                encoder_decoder_attention_bias,
                hparams.attention_key_channels or hparams.hidden_size,
                hparams.attention_value_channels or hparams.hidden_size,
                hparams.hidden_size,
                hparams.num_heads,
                hparams.attention_dropout,
                save_weights_to=save_weights_to,
                make_image_summary=make_image_summary,
                )
            utils.collect_named_outputs(
                "norms",
                "decoder_encoder_attention_%d"%(layer),
                tf.norm(y, axis=-1))
            x = common_layers.layer_postprocess(x, y, hparams)
            utils.collect_named_outputs(
                "norms",
                "decoder_encoder_attention_post_%d"%(layer),
                tf.norm(x, axis=-1))
        with tf.variable_scope("ffn"):
          y = common_layers.dense_relu_dense(
              common_layers.layer_preprocess(x, hparams),
              hparams.filter_size,
              hparams.hidden_size,
              dropout=hparams.relu_dropout,
          )
          utils.collect_named_outputs("norms", "decoder_ffn_%d"%(layer),
                                      tf.norm(y, axis=-1))
          x = common_layers.layer_postprocess(x, y, hparams)
          utils.collect_named_outputs("norms", "decoder_ffn_post_%d"%(layer),
                                      tf.norm(x, axis=-1))
    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    return common_layers.layer_preprocess(x, hparams)
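
decoder_self_attention_bias is the causal bias referenced in the docstring: position i may attend only to positions j <= i, implemented as an additive mask that is zero where attention is allowed and a large negative value where it is not. A NumPy sketch of that mask (illustrative only):

import numpy as np

length = 5
# 0 on and below the diagonal (allowed), a large negative value strictly above it (masked).
causal_bias = np.triu(np.full((length, length), -1e9), k=1)

logits = np.random.randn(1, length, length)   # [batch, query_length, memory_length]
masked = logits + causal_bias                 # broadcasts over the batch axis
# After softmax, each position attends only to itself and earlier positions.
print(causal_bias)
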
Example #13
def _ffn_layer_multi_inputs(inputs_list,
                            hparams,
                            ffn_layer_type="dense",
                            name="ffn",
                            kernel_initializer=None,
                            bias_initializer=None,
                            activation=None,
                            pad_remover=None,
                            preprocess=False,
                            postprocess=False):
    """Implements a Feed-forward layer with multiple inputs, pad-removing, etc.
  Args:
    inputs_list: list of input tensors
    hparams: hyper-parameters
    ffn_layer_type: dense / dense_dropconnect / dense_relu_dense
    name: name
    kernel_initializer: kernel initializer
    bias_initializer: bias initializer
    activation: activation function
    pad_remover: pad remover
    preprocess: whether to preprocess the input
    postprocess: whether to postprocess the output
  Returns:
    a tensor
  Raises:
    ValueError: Unknown ffn_layer type.
  """

    # need at least one input
    num_inputs = len(inputs_list)
    assert num_inputs > 0

    if preprocess and num_inputs == 1:
        inputs_list[0] = common_layers.layer_preprocess(
            inputs_list[0], hparams)

    if postprocess:
        original_inputs = inputs_list[0]

    # the output size is the hidden size of the main input
    main_input = inputs_list[0]
    original_shape = common_layers.shape_list(main_input)
    assert hparams.hidden_size == common_layers.shape_list(main_input)[-1]

    # all the inputs must have the same shape as the main input
    for inputs in inputs_list:
        main_input.get_shape().assert_is_compatible_with(inputs.get_shape())

    def remove_pads(x):
        original_shape = common_layers.shape_list(x)
        # Collapse `x` across examples, and remove padding positions.
        x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0))
        x = tf.expand_dims(pad_remover.remove(x), axis=0)
        return x

    if pad_remover:
        for i, inputs in enumerate(inputs_list):
            inputs_list[i] = remove_pads(inputs)

    ffn_inputs = (inputs_list[0] if len(inputs_list) == 1 else tf.concat(
        inputs_list, axis=-1))

    if ffn_layer_type == "dense":
        output = common_layers.dense(ffn_inputs,
                                     hparams.hidden_size,
                                     name=name,
                                     activation=activation,
                                     use_bias=True,
                                     kernel_initializer=kernel_initializer,
                                     bias_initializer=bias_initializer)

    elif ffn_layer_type == "dense_dropconnect":
        output = common_layers.dense_dropconnect(
            ffn_inputs,
            hparams.hidden_size,
            name=name,
            dropconnect_dropout=hparams.dropconnect_dropout,
            output_activation=activation)
        postprocess = False  # no dropout on the output unit

    elif ffn_layer_type == "dense_relu_dense":
        output = common_layers.dense_relu_dense(
            ffn_inputs,
            hparams.filter_size,
            hparams.hidden_size,
            name=name,
            dropout=hparams.relu_dropout,
            output_activation=activation,
        )

    else:
        raise ValueError("Unknown ffn_layer type: %s" % ffn_layer_type)

    if pad_remover:
        # Restore `output` to the original shape of `x`, including padding.
        output = tf.reshape(pad_remover.restore(tf.squeeze(output, axis=0)),
                            original_shape)

    if postprocess:
        if num_inputs == 1:
            output = common_layers.layer_postprocess(original_inputs, output,
                                                     hparams)
        else:  # only dropout (no residual)
            hp = copy.copy(hparams)
            hp.layer_postprocess_sequence = hp.layer_postprocess_sequence.replace(
                "a", "")
            output = common_layers.layer_postprocess(original_inputs, output,
                                                     hp)

    return output
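
When several inputs are supplied, _ffn_layer_multi_inputs concatenates them along the feature axis and projects the result back to hidden_size. A NumPy sketch of that combination for the "dense" case (toy shapes; dropout, pad removal, and postprocessing omitted, weights are random stand-ins):

import numpy as np

batch, length, hidden = 2, 5, 16
inputs_list = [np.random.randn(batch, length, hidden) for _ in range(3)]

ffn_inputs = np.concatenate(inputs_list, axis=-1)   # [batch, length, 3 * hidden]

w = np.random.randn(ffn_inputs.shape[-1], hidden) * 0.02
b = np.zeros(hidden)
output = ffn_inputs @ w + b                         # back to [batch, length, hidden]
print(output.shape)                                 # (2, 5, 16)
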
Example #14
def decoder(
    decoder_input,
    encoder_output,
    decoder_self_attention_bias,
    encoder_decoder_attention_bias,
    hparams,
    name="decoder",
    save_weights_to=None,
    make_image_summary=True,
):
    """A stack of transformer layers.

  Args:
    decoder_input: a Tensor
    encoder_output: a Tensor
    decoder_self_attention_bias: bias Tensor for self-attention
      (see common_attention.attention_bias())
    encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    name: a string
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.

  Returns:
    y: a Tensor
  """
    x = decoder_input
    with tf.variable_scope(name):
        for layer in range(hparams.num_decoder_layers
                           or hparams.num_hidden_layers):
            layer_name = "layer_%d" % layer
            with tf.variable_scope(layer_name):
                with tf.variable_scope("self_attention"):
                    y = common_attention.multihead_attention(
                        common_layers.layer_preprocess(x, hparams),
                        None,
                        decoder_self_attention_bias,
                        hparams.attention_key_channels or hparams.hidden_size,
                        hparams.attention_value_channels
                        or hparams.hidden_size,
                        hparams.hidden_size,
                        hparams.num_heads,
                        hparams.attention_dropout,
                        attention_type=hparams.self_attention_type,
                        save_weights_to=save_weights_to,
                        make_image_summary=make_image_summary,
                    )
                    utils.collect_named_outputs(
                        "norms", "decoder_self_attention_%d" % (layer),
                        tf.norm(y, axis=-1))
                    x = common_layers.layer_postprocess(x, y, hparams)
                    utils.collect_named_outputs(
                        "norms", "decoder_self_attention_post_%d" % (layer),
                        tf.norm(x, axis=-1))
                if encoder_output is not None:
                    with tf.variable_scope("encdec_attention"):
                        y = common_attention.multihead_attention(
                            common_layers.layer_preprocess(x, hparams),
                            encoder_output,
                            encoder_decoder_attention_bias,
                            hparams.attention_key_channels
                            or hparams.hidden_size,
                            hparams.attention_value_channels
                            or hparams.hidden_size,
                            hparams.hidden_size,
                            hparams.num_heads,
                            hparams.attention_dropout,
                            save_weights_to=save_weights_to,
                            make_image_summary=make_image_summary,
                        )
                        utils.collect_named_outputs(
                            "norms", "decoder_encoder_attention_%d" % (layer),
                            tf.norm(y, axis=-1))
                        x = common_layers.layer_postprocess(x, y, hparams)
                        utils.collect_named_outputs(
                            "norms",
                            "decoder_encoder_attention_post_%d" % (layer),
                            tf.norm(x, axis=-1))
                with tf.variable_scope("ffn"):
                    y = common_layers.dense_relu_dense(
                        common_layers.layer_preprocess(x, hparams),
                        hparams.filter_size,
                        hparams.hidden_size,
                        dropout=hparams.relu_dropout,
                    )
                    utils.collect_named_outputs("norms",
                                                "decoder_ffn_%d" % (layer),
                                                tf.norm(y, axis=-1))
                    x = common_layers.layer_postprocess(x, y, hparams)
                    utils.collect_named_outputs(
                        "norms", "decoder_ffn_post_%d" % (layer),
                        tf.norm(x, axis=-1))
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        return common_layers.layer_preprocess(x, hparams)
def hierarchical_attention_network_encoder(
        encoder_input,
        encoder_self_attention_bias,
        contexts,
        context_self_attention_biases,
        features,
        hparams,
        name="hierarchical_attention_network_encoder",
        save_weights_to=None,
        make_image_summary=True,
        losses=None):
    input_x = encoder_input
    context_xs = {}
    for context_name in contexts:
        context_xs[context_name] = contexts[context_name]
    context_paddings = {}
    context_nonpaddings = {}
    context_pad_removers = {}

    attention_dropout_broadcast_dims = (
        common_layers.comma_separated_string_to_integer_list(
            getattr(hparams, "attention_dropout_broadcast_dims", "")))

    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        input_padding = common_attention.attention_bias_to_padding(
            encoder_self_attention_bias)
        input_nonpadding = 1.0 - input_padding
        for context_name in context_self_attention_biases:
            context_paddings[
                context_name] = common_attention.attention_bias_to_padding(
                    context_self_attention_biases[context_name])
            context_nonpaddings[
                context_name] = 1.0 - context_paddings[context_name]

        input_pad_remover = None
        for context_name in context_paddings:
            context_pad_removers[context_name] = None
        if hparams.use_pad_remover and not common_layers.is_xla_compiled():
            input_pad_remover = expert_utils.PadRemover(input_padding)
            for context_name in context_paddings:
                context_pad_removers[context_name] = expert_utils.PadRemover(
                    context_paddings[context_name])

        # copy hparams except num_hidden_layers -> num_hidden_layers - 1
        temp_hparam = tf.contrib.training.HParams()
        for key, val in hparams.values().items():
            temp_hparam.add_hparam(key, val)
        temp_hparam.set_hparam("num_hidden_layers",
                               hparams.num_hidden_layers - 1)
        encoder_output = transformer_with_contexts_layers.transformer_encoder(
            input_x,
            encoder_self_attention_bias,
            temp_hparam,
            nonpadding=features_to_nonpadding(features, "inputs"),
            save_weights_to=save_weights_to,
            make_image_summary=make_image_summary)

        context_encoded_outputs = {}
        for context_name in context_xs:
            context_encoded_outputs[
                context_name] = transformer_with_contexts_layers.transformer_encoder(
                    context_xs[context_name],
                    context_self_attention_biases[context_name],
                    hparams,
                    nonpadding=features_to_nonpadding(features, context_name),
                    save_weights_to=save_weights_to,
                    make_image_summary=make_image_summary)

        with tf.variable_scope('word_abstraction', reuse=tf.AUTO_REUSE):
            encoder_word_level_query = common_layers.dense(
                encoder_output, hparams.hidden_size)  # q_w = f_w(h_t)
            encoder_word_level_abstraction = {}
            for context_name in context_encoded_outputs:
                encoder_word_level_abstraction[
                    context_name] = transformer_with_contexts_layers.multihead_attention(
                        common_layers.layer_preprocess(
                            encoder_word_level_query, hparams),
                        context_encoded_outputs[context_name],
                        context_self_attention_biases[context_name],
                        hparams.attention_key_channels or hparams.hidden_size,
                        hparams.attention_value_channels
                        or hparams.hidden_size,
                        hparams.hidden_size,
                        hparams.num_heads,
                        hparams.attention_dropout,
                        attention_type=hparams.self_attention_type,
                        save_weights_to=save_weights_to,
                        make_image_summary=make_image_summary,
                        max_relative_position=hparams.max_relative_position,
                        dropout_broadcast_dims=attention_dropout_broadcast_dims,
                        max_length=hparams.get("max_length"),
                        vars_3d=hparams.get("attention_variables_3d"))  # s^j,

            sentence_information = tf.concat([
                encoder_word_level_abstraction[context_name]
                for context_name in encoder_word_level_abstraction
            ],
                                             axis=1)

        with tf.variable_scope('sentence_abstraction', reuse=tf.AUTO_REUSE):
            encoder_sentence_level_query = common_layers.dense(
                encoder_output, hparams.hidden_size)  # q_s = f_s(h_t)
            context_padding = common_attention.embedding_to_padding(
                sentence_information)
            ignore_padding = common_attention.attention_bias_ignore_padding(
                context_padding)
            contextual_information = transformer_with_contexts_layers.multihead_attention(
                common_layers.layer_preprocess(encoder_sentence_level_query,
                                               hparams),
                sentence_information,
                ignore_padding,
                hparams.attention_key_channels or hparams.hidden_size,
                hparams.attention_value_channels or hparams.hidden_size,
                hparams.hidden_size,
                hparams.num_heads,
                hparams.attention_dropout,
                attention_type=hparams.self_attention_type,
                save_weights_to=save_weights_to,
                make_image_summary=make_image_summary,
                max_relative_position=hparams.max_relative_position,
                dropout_broadcast_dims=attention_dropout_broadcast_dims,
                max_length=hparams.get("max_length"),
                vars_3d=hparams.get("attention_variables_3d")
            )  # MultiHead(q_s, s^j), [batch, encoder_length, hidden_dim]

            contextual_information = common_layers.dense_relu_dense(
                contextual_information, hparams.filter_size,
                hparams.hidden_size)

        with tf.variable_scope('context_gating', reuse=tf.AUTO_REUSE):
            gate_lambda = tf.nn.sigmoid(
                common_layers.dense(contextual_information,
                                    hparams.hidden_size) +
                common_layers.dense(encoder_output, hparams.hidden_size))
            encoder_output = gate_lambda * encoder_output + (
                1 - gate_lambda) * contextual_information

    return common_layers.layer_preprocess(encoder_output, hparams)
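
The context_gating block mixes the token encoding h_t with the contextual information c_t through a learned sigmoid gate: lambda_t = sigmoid(W_c c_t + W_h h_t) and output_t = lambda_t * h_t + (1 - lambda_t) * c_t. A NumPy sketch of that gate with random stand-in weights (the code above uses common_layers.dense, which also carries biases):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

batch, length, hidden = 2, 5, 16
encoder_output = np.random.randn(batch, length, hidden)          # h_t
contextual_information = np.random.randn(batch, length, hidden)  # c_t

w_c = np.random.randn(hidden, hidden) * 0.02
w_h = np.random.randn(hidden, hidden) * 0.02

gate_lambda = sigmoid(contextual_information @ w_c + encoder_output @ w_h)
gated = gate_lambda * encoder_output + (1 - gate_lambda) * contextual_information
print(gated.shape)  # (2, 5, 16)
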
Example #16
def transformer_ffn_layer(x,
                          hparams,
                          pad_remover=None,
                          conv_padding="LEFT",
                          nonpadding_mask=None):
    """Feed-forward layer in the transformer.

  Args:
    x: a Tensor of shape [batch_size, length, hparams.hidden_size]
    hparams: hyperparameters for model
    pad_remover: an expert_utils.PadRemover object tracking the padding
      positions. If provided, when using convolutional settings, the padding
      is removed before applying the convolution, and restored afterward. This
      can give a significant speedup.
    conv_padding: a string - either "LEFT" or "SAME".
    nonpadding_mask: an optional Tensor with shape [batch_size, length].
      needed for convolutional layers with "SAME" padding.
      Contains 1.0 in positions corresponding to nonpadding.

  Returns:
    a Tensor of shape [batch_size, length, hparams.hidden_size]
  """
    ffn_layer = hparams.ffn_layer
    if ffn_layer == "conv_hidden_relu":
        # Backwards compatibility
        ffn_layer = "dense_relu_dense"
    if ffn_layer == "dense_relu_dense":
        # In simple convolution mode, use `pad_remover` to speed up processing.
        if pad_remover:
            original_shape = common_layers.shape_list(x)
            # Collapse `x` across examples, and remove padding positions.
            x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0))
            x = tf.expand_dims(pad_remover.remove(x), axis=0)
        conv_output = common_layers.dense_relu_dense(
            x,
            hparams.filter_size,
            hparams.hidden_size,
            dropout=hparams.relu_dropout)
        if pad_remover:
            # Restore `conv_output` to the original shape of `x`, including padding.
            conv_output = tf.reshape(
                pad_remover.restore(tf.squeeze(conv_output, axis=0)),
                original_shape)
        return conv_output
    elif ffn_layer == "conv_relu_conv":
        return common_layers.conv_relu_conv(x,
                                            hparams.filter_size,
                                            hparams.hidden_size,
                                            first_kernel_size=3,
                                            second_kernel_size=1,
                                            padding=conv_padding,
                                            nonpadding_mask=nonpadding_mask,
                                            dropout=hparams.relu_dropout)
    elif ffn_layer == "parameter_attention":
        return common_attention.parameter_attention(
            x, hparams.parameter_attention_key_channels or hparams.hidden_size,
            hparams.parameter_attention_value_channels or hparams.hidden_size,
            hparams.hidden_size, hparams.filter_size, hparams.num_heads,
            hparams.attention_dropout)
    elif ffn_layer == "conv_hidden_relu_with_sepconv":
        return common_layers.conv_hidden_relu(x,
                                              hparams.filter_size,
                                              hparams.hidden_size,
                                              kernel_size=(3, 1),
                                              second_kernel_size=(31, 1),
                                              padding="LEFT",
                                              dropout=hparams.relu_dropout)
    else:
        assert ffn_layer == "none"
        return x
Example #17
def transformer_ffn_layer(x,
                          hparams,
                          customized_ffn=None,
                          pad_remover=None,
                          conv_padding="LEFT",
                          nonpadding_mask=None,
                          losses=None,
                          cache=None):
  """Feed-forward layer in the transformer.

  Args:
    x: a Tensor of shape [batch_size, length, hparams.hidden_size]
    hparams: hyperparameters for model
    customized_ffn: customized the ffn_layer string
    pad_remover: an expert_utils.PadRemover object tracking the padding
      positions. If provided, when using convolutional settings, the padding
      is removed before applying the convolution, and restored afterward. This
      can give a significant speedup.
    conv_padding: a string - either "LEFT" or "SAME".
    nonpadding_mask: an optional Tensor with shape [batch_size, length].
      needed for convolutional layers with "SAME" padding.
      Contains 1.0 in positions corresponding to nonpadding.
    losses: optional list onto which to append extra training losses
    cache: dict, containing tensors which are the results of previous
        attentions, used for fast decoding.

  Returns:
    a Tensor of shape [batch_size, length, hparams.hidden_size]

  Raises:
    ValueError: If losses arg is None, but layer generates extra losses.
  """
  ffn_layer = customized_ffn or hparams.ffn_layer
  relu_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "relu_dropout_broadcast_dims", "")))
  if ffn_layer == "conv_hidden_relu":
    # Backwards compatibility
    ffn_layer = "dense_relu_dense"
  if ffn_layer == "dense_relu_dense":
    # In simple convolution mode, use `pad_remover` to speed up processing.
    if pad_remover:
      original_shape = common_layers.shape_list(x)
      # Collapse `x` across examples, and remove padding positions.
      x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0))
      x = tf.expand_dims(pad_remover.remove(x), axis=0)
    conv_output = common_layers.dense_relu_dense(
        x,
        hparams.filter_size,
        hparams.hidden_size,
        dropout=hparams.relu_dropout,
        dropout_broadcast_dims=relu_dropout_broadcast_dims)
    if pad_remover:
      # Restore `conv_output` to the original shape of `x`, including padding.
      conv_output = tf.reshape(
          pad_remover.restore(tf.squeeze(conv_output, axis=0)), original_shape)
    return conv_output
  elif ffn_layer == "conv_relu_conv":
    return common_layers.conv_relu_conv(
        x,
        hparams.filter_size,
        hparams.hidden_size,
        first_kernel_size=hparams.conv_first_kernel,
        second_kernel_size=1,
        padding=conv_padding,
        nonpadding_mask=nonpadding_mask,
        dropout=hparams.relu_dropout,
        cache=cache)
  elif ffn_layer == "parameter_attention":
    return common_attention.parameter_attention(
        x, hparams.parameter_attention_key_channels or hparams.hidden_size,
        hparams.parameter_attention_value_channels or hparams.hidden_size,
        hparams.hidden_size, hparams.filter_size, hparams.num_heads,
        hparams.attention_dropout)
  elif ffn_layer == "conv_hidden_relu_with_sepconv":
    return common_layers.conv_hidden_relu(
        x,
        hparams.filter_size,
        hparams.hidden_size,
        kernel_size=(3, 1),
        second_kernel_size=(31, 1),
        padding="LEFT",
        dropout=hparams.relu_dropout)
  elif ffn_layer == "sru":
    return common_layers.sru(x)
  elif ffn_layer == "local_moe_tpu":
    overhead = (hparams.moe_overhead_train
                if hparams.mode == tf.estimator.ModeKeys.TRAIN
                else hparams.moe_overhead_eval)
    ret, loss = expert_utils.local_moe_tpu(
        x, hparams.filter_size // 2,
        hparams.hidden_size,
        hparams.moe_num_experts, overhead=overhead,
        loss_coef=hparams.moe_loss_coef)
    if losses is None:
      raise ValueError(
          "transformer_ffn_layer with type local_moe_tpu must pass in "
          "a losses list")
    losses.append(loss)
    return ret
  else:
    assert ffn_layer == "none"
    return x
Example #18
def transformer_ffn_layer(x,
                          hparams,
                          pad_remover=None,
                          conv_padding="LEFT",
                          nonpadding_mask=None):
  """Feed-forward layer in the transformer.

  Args:
    x: a Tensor of shape [batch_size, length, hparams.hidden_size]
    hparams: hyperparameters for model
    pad_remover: an expert_utils.PadRemover object tracking the padding
      positions. If provided, when using convolutional settings, the padding
      is removed before applying the convolution, and restored afterward. This
      can give a significant speedup.
    conv_padding: a string - either "LEFT" or "SAME".
    nonpadding_mask: an optional Tensor with shape [batch_size, length].
      needed for convolutional layers with "SAME" padding.
      Contains 1.0 in positions corresponding to nonpadding.

  Returns:
    a Tensor of shape [batch_size, length, hparams.hidden_size]
  """
  ffn_layer = hparams.ffn_layer
  relu_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "relu_dropout_broadcast_dims", "")))
  if ffn_layer == "conv_hidden_relu":
    # Backwards compatibility
    ffn_layer = "dense_relu_dense"
  if ffn_layer == "dense_relu_dense":
    # In simple convolution mode, use `pad_remover` to speed up processing.
    if pad_remover:
      original_shape = common_layers.shape_list(x)
      # Collapse `x` across examples, and remove padding positions.
      x = tf.reshape(x, tf.concat([[-1], original_shape[2:]], axis=0))
      x = tf.expand_dims(pad_remover.remove(x), axis=0)
    conv_output = common_layers.dense_relu_dense(
        x,
        hparams.filter_size,
        hparams.hidden_size,
        dropout=hparams.relu_dropout,
        dropout_broadcast_dims=relu_dropout_broadcast_dims)
    if pad_remover:
      # Restore `conv_output` to the original shape of `x`, including padding.
      conv_output = tf.reshape(
          pad_remover.restore(tf.squeeze(conv_output, axis=0)), original_shape)
    return conv_output
  elif ffn_layer == "conv_relu_conv":
    return common_layers.conv_relu_conv(
        x,
        hparams.filter_size,
        hparams.hidden_size,
        first_kernel_size=3,
        second_kernel_size=1,
        padding=conv_padding,
        nonpadding_mask=nonpadding_mask,
        dropout=hparams.relu_dropout)
  elif ffn_layer == "parameter_attention":
    return common_attention.parameter_attention(
        x, hparams.parameter_attention_key_channels or hparams.hidden_size,
        hparams.parameter_attention_value_channels or hparams.hidden_size,
        hparams.hidden_size, hparams.filter_size, hparams.num_heads,
        hparams.attention_dropout)
  elif ffn_layer == "conv_hidden_relu_with_sepconv":
    return common_layers.conv_hidden_relu(
        x,
        hparams.filter_size,
        hparams.hidden_size,
        kernel_size=(3, 1),
        second_kernel_size=(31, 1),
        padding="LEFT",
        dropout=hparams.relu_dropout)
  else:
    assert ffn_layer == "none"
    return x
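
The conv_relu_conv and conv_hidden_relu_with_sepconv branches take conv_padding of "LEFT" or "SAME". "LEFT" pads only on the left, so a width-k kernel sees the current and earlier positions (causal, safe for decoding); "SAME" pads both sides, so a width-3 kernel also sees one future position. A NumPy sketch of the difference for a single channel and a toy width-3 kernel (illustrative only):

import numpy as np

x = np.arange(1.0, 7.0)               # one channel, length 6
kernel = np.array([0.25, 0.5, 0.25])
k = len(kernel)

# "LEFT" padding: pad k - 1 zeros on the left only; output[i] depends on x[i-2..i].
left_padded = np.concatenate([np.zeros(k - 1), x])
left_out = np.array([left_padded[i:i + k] @ kernel for i in range(len(x))])

# "SAME" padding: pad both sides; output[i] depends on x[i-1..i+1].
same_padded = np.concatenate([np.zeros(1), x, np.zeros(1)])
same_out = np.array([same_padded[i:i + k] @ kernel for i in range(len(x))])

print(left_out)   # causal: no future positions leak in
print(same_out)
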