Example #1
# Assumed imports (this snippet appears to come from tensor2tensor's VQA
# self-attention model, TF 1.x); the exact module paths may differ by version.
import tensorflow as tf

from tensor2tensor.layers import common_layers
from tensor2tensor.layers import vqa_layers
from tensorflow.contrib.layers.python.layers import utils


def image_encoder(image_feat,
                  hparams,
                  name="image_encoder",
                  save_weights_to=None,
                  make_image_summary=True):
    """A stack of self attention layers."""

    x = image_feat
    with tf.variable_scope(name):
        for layer in range(hparams.num_encoder_layers
                           or hparams.num_hidden_layers):
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    y = vqa_layers.multihead_attention(
                        common_layers.layer_preprocess(x, hparams),
                        None,
                        None,
                        hparams.attention_key_channels
                        or hparams.image_hidden_size,
                        hparams.attention_value_channels
                        or hparams.image_hidden_size,
                        hparams.image_hidden_size,
                        hparams.num_heads,
                        hparams.attention_dropout,
                        attention_type=hparams.self_attention_type,
                        save_weights_to=save_weights_to,
                        max_relative_position=None,
                        make_image_summary=make_image_summary,
                        dropout_broadcast_dims=None,
                        max_length=None,
                        vars_3d=False,
                        scale_dotproduct=hparams.scale_dotproduct)
                    utils.collect_named_outputs("norms",
                                                "image_feat_self_attention",
                                                tf.norm(y, axis=-1))
                    x = common_layers.layer_postprocess(x, y, hparams)
                    utils.collect_named_outputs(
                        "norms", "image_feat_self_attention_zero_add",
                        tf.norm(x, axis=-1))
                with tf.variable_scope("ffn"):
                    y = common_layers.dense_relu_dense(
                        common_layers.layer_preprocess(x, hparams),
                        hparams.image_filter_size,
                        hparams.image_hidden_size,
                        dropout=hparams.relu_dropout,
                        dropout_broadcast_dims=None)
                    utils.collect_named_outputs("norms", "image_feat_ffn",
                                                tf.norm(y, axis=-1))
                    x = common_layers.layer_postprocess(x, y, hparams)
                    utils.collect_named_outputs("norms",
                                                "image_feat_ffn_zero_add",
                                                tf.norm(x, axis=-1))
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        return common_layers.layer_preprocess(x, hparams)
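
These snippets look like variants of the image/question encoders in tensor2tensor's VQA self-attention model, so the following is only a hedged sketch of how image_encoder above might be driven. Everything in it is an assumption rather than part of the original example: TF 1.x graph mode, a tf.contrib.training.HParams object carrying the fields the function reads, and purely illustrative shapes and values; the exact field set consumed by layer_preprocess/layer_postprocess and vqa_layers.multihead_attention can vary between tensor2tensor versions.

# A minimal usage sketch for image_encoder (hypothetical values, not from the
# original source).
import tensorflow as tf

hparams = tf.contrib.training.HParams(
    # Fields read directly by image_encoder:
    num_encoder_layers=2,
    num_hidden_layers=2,
    attention_key_channels=0,        # 0 (falsy) -> fall back to image_hidden_size
    attention_value_channels=0,
    image_hidden_size=512,
    image_filter_size=1024,
    num_heads=8,
    attention_dropout=0.1,
    relu_dropout=0.1,
    self_attention_type="dot_product",
    scale_dotproduct=True,
    # Fields assumed to be read by layer_preprocess / layer_postprocess
    # (pre-norm "n", then dropout + residual add "da"):
    hidden_size=512,
    layer_preprocess_sequence="n",
    layer_postprocess_sequence="da",
    layer_prepostprocess_dropout=0.1,
    norm_type="layer",
    norm_epsilon=1e-6,
)

# Illustrative input: [batch, num_regions, image_hidden_size] image features,
# e.g. pooled detector features for 36 regions per image.
image_feat = tf.zeros([8, 36, hparams.image_hidden_size])
encoded = image_encoder(image_feat, hparams)  # same shape as the input

The trailing layer_preprocess on the return value matches the comment in the code: with pre-norm layers, the residual stream is a sum of unnormalized sublayer outputs, so it is normalized once more at the top of the stack.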
Example #2
def question_encoder(question,
                     question_self_attention_bias,
                     hparams,
                     name="question_encoder",
                     save_weights_to=None,
                     make_image_summary=True):
    """A stack of self attention layers."""
    x = question
    with tf.variable_scope(name):
        for layer in range(hparams.num_encoder_layers
                           or hparams.num_hidden_layers):
            with tf.variable_scope("layer_%d" % layer):
                with tf.variable_scope("self_attention"):
                    y = vqa_layers.multihead_attention(
                        common_layers.layer_preprocess(x, hparams),
                        None,
                        question_self_attention_bias,
                        hparams.attention_key_channels or hparams.hidden_size,
                        hparams.attention_value_channels
                        or hparams.hidden_size,
                        hparams.hidden_size,
                        hparams.num_heads,
                        hparams.attention_dropout,
                        attention_type=hparams.question_self_attention_type,
                        block_length=hparams.block_length,
                        save_weights_to=save_weights_to,
                        make_image_summary=make_image_summary,
                        scale_dotproduct=hparams.scale_dotproduct,
                    )
                    utils.collect_named_outputs(
                        "norms", "query_self_attention_%d" % (layer),
                        tf.norm(y, axis=-1))
                    x = common_layers.layer_postprocess(x, y, hparams)
                    utils.collect_named_outputs(
                        "norms",
                        "query_self_attention_postprocess_%d" % (layer),
                        tf.norm(x, axis=-1))
                with tf.variable_scope("ffn"):
                    y = common_layers.dense_relu_dense(
                        common_layers.layer_preprocess(x, hparams),
                        hparams.filter_size,
                        hparams.hidden_size,
                        dropout=hparams.relu_dropout,
                    )
                    utils.collect_named_outputs("norms",
                                                "query_ffn_%d" % (layer),
                                                tf.norm(y, axis=-1))
                    x = common_layers.layer_postprocess(x, y, hparams)
                    utils.collect_named_outputs(
                        "norms", "query_ffn_postprocess_%d" % (layer),
                        tf.norm(x, axis=-1))
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        return common_layers.layer_preprocess(x, hparams)
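
question_encoder additionally takes a self-attention bias that masks padded question positions. The sketch below continues the hypothetical hparams from the sketch after Example #1 and assumes the padding-bias helpers from tensor2tensor.layers.common_attention are used to build that bias, as in tensor2tensor's transformer-style encoders; the added hparams fields and all values are illustrative.

# A sketch of driving question_encoder (assumptions noted in the comments).
import tensorflow as tf
from tensor2tensor.layers import common_attention

# Extra fields this encoder reads, added to the earlier hypothetical hparams:
hparams.add_hparam("filter_size", 2048)
hparams.add_hparam("question_self_attention_type", "dot_product")
hparams.add_hparam("block_length", 128)

# [batch, question_length, hidden_size] embedded question tokens.
question = tf.zeros([8, 14, hparams.hidden_size])

# Bias that is 0 for real tokens and a large negative value for padding,
# so padded positions get ~0 attention weight.
question_padding = common_attention.embedding_to_padding(question)
question_self_attention_bias = (
    common_attention.attention_bias_ignore_padding(question_padding))

encoded_question = question_encoder(
    question, question_self_attention_bias, hparams)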
Example #3
def image_encoder(image_feat,
                  hparams,
                  name="image_encoder",
                  save_weights_to=None,
                  make_image_summary=True):
  """A stack of self attention layers."""

  x = image_feat
  image_hidden_size = hparams.image_hidden_size or hparams.hidden_size
  image_filter_size = hparams.image_filter_size or hparams.filter_size
  with tf.variable_scope(name):
    for layer in range(hparams.num_encoder_layers or hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        with tf.variable_scope("self_attention"):
          y = vqa_layers.multihead_attention(
              common_layers.layer_preprocess(x, hparams),
              None,
              None,
              hparams.attention_key_channels or image_hidden_size,
              hparams.attention_value_channels or image_hidden_size,
              image_hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.image_self_attention_type,
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary,
              scale_dotproduct=hparams.scale_dotproduct,
          )
          utils.collect_named_outputs(
              "norms", "image_feat_self_attention_%d"%(layer),
              tf.norm(y, axis=-1))
          x = common_layers.layer_postprocess(x, y, hparams)
          utils.collect_named_outputs(
              "norms", "image_feat_self_attention_postprocess_%d"%(layer),
              tf.norm(x, axis=-1))
        with tf.variable_scope("ffn"):
          y = common_layers.dense_relu_dense(
              common_layers.layer_preprocess(x, hparams),
              image_filter_size,
              image_hidden_size,
              dropout=hparams.relu_dropout,
          )
          utils.collect_named_outputs(
              "norms", "image_feat_ffn_%d"%(layer), tf.norm(y, axis=-1))
          x = common_layers.layer_postprocess(x, y, hparams)
          utils.collect_named_outputs(
              "norms", "image_feat_ffn_postprocess_%d"%(layer),
              tf.norm(x, axis=-1))
    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    return common_layers.layer_preprocess(x, hparams)
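
Example #3 differs from Example #1 mainly in its `or` fallbacks: when the image-specific sizes are unset (falsy), the encoder reuses the text-side hidden_size and filter_size, and it reads hparams.image_self_attention_type rather than hparams.self_attention_type. Continuing the hypothetical hparams from the earlier sketches, that fallback could be exercised like this (illustrative only):

# Continuation of the earlier hypothetical hparams (assumptions, not from
# the original source).
hparams.set_hparam("image_hidden_size", 0)   # falsy -> use hparams.hidden_size
hparams.set_hparam("image_filter_size", 0)   # falsy -> use hparams.filter_size
hparams.add_hparam("image_self_attention_type", "dot_product")

encoded = image_encoder(image_feat, hparams)  # still a 512-wide encoder stack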