def image_encoder(image_feat, hparams, name="image_encoder", save_weights_to=None, make_image_summary=True): """A stack of self attention layers.""" x = image_feat with tf.variable_scope(name): for layer in range(hparams.num_encoder_layers or hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = vqa_layers.multihead_attention( common_layers.layer_preprocess(x, hparams), None, None, hparams.attention_key_channels or hparams.image_hidden_size, hparams.attention_value_channels or hparams.image_hidden_size, hparams.image_hidden_size, hparams.num_heads, hparams.attention_dropout, attention_type=hparams.self_attention_type, save_weights_to=save_weights_to, max_relative_position=None, make_image_summary=make_image_summary, dropout_broadcast_dims=None, max_length=None, vars_3d=False, scale_otproduct=hparams.scale_dotproduct) utils.collect_named_outputs("norms", "image_feat_self_attention", tf.norm(y, axis=-1)) x = common_layers.layer_postprocess(x, y, hparams) utils.collect_named_outputs( "norms", "image_feat_self_attention_zero_add", tf.norm(x, axis=-1)) with tf.variable_scope("ffn"): y = common_layers.dense_relu_dense( common_layers.layer_preprocess(x, hparams), hparams.image_filter_size, hparams.image_hidden_size, dropout=hparams.relu_dropout, dropout_broadcast_dims=None) utils.collect_named_outputs("norms", "image_feat_ffn", tf.norm(y, axis=-1)) x = common_layers.layer_postprocess(x, y, hparams) utils.collect_named_outputs("norms", "image_feat_ffn_zero_add", tf.norm(x, axis=-1)) # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. return common_layers.layer_preprocess(x, hparams)
def question_encoder(question, question_self_attention_bias, hparams, name="question_encoder", save_weights_to=None, make_image_summary=True): """A stack of self attention layers.""" x = question with tf.variable_scope(name): for layer in range(hparams.num_encoder_layers or hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = vqa_layers.multihead_attention( common_layers.layer_preprocess(x, hparams), None, question_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, hparams.attention_value_channels or hparams.hidden_size, hparams.hidden_size, hparams.num_heads, hparams.attention_dropout, attention_type=hparams.question_self_attention_type, block_length=hparams.block_length, save_weights_to=save_weights_to, make_image_summary=make_image_summary, scale_dotproduct=hparams.scale_dotproduct, ) utils.collect_named_outputs( "norms", "query_self_attention_%d" % (layer), tf.norm(y, axis=-1)) x = common_layers.layer_postprocess(x, y, hparams) utils.collect_named_outputs( "norms", "query_self_attention_postprocess_%d" % (layer), tf.norm(x, axis=-1)) with tf.variable_scope("ffn"): y = common_layers.dense_relu_dense( common_layers.layer_preprocess(x, hparams), hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout, ) utils.collect_named_outputs("norms", "query_ffn_%d" % (layer), tf.norm(y, axis=-1)) x = common_layers.layer_postprocess(x, y, hparams) utils.collect_named_outputs( "norms", "query_ffn_postprocess_%d" % (layer), tf.norm(x, axis=-1)) # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. return common_layers.layer_preprocess(x, hparams)
def image_encoder(image_feat, hparams, name="image_encoder", save_weights_to=None, make_image_summary=True): """A stack of self attention layers.""" x = image_feat image_hidden_size = hparams.image_hidden_size or hparams.hidden_size image_filter_size = hparams.image_filter_size or hparams.filter_size with tf.variable_scope(name): for layer in range(hparams.num_encoder_layers or hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("self_attention"): y = vqa_layers.multihead_attention( common_layers.layer_preprocess(x, hparams), None, None, hparams.attention_key_channels or image_hidden_size, hparams.attention_value_channels or image_hidden_size, image_hidden_size, hparams.num_heads, hparams.attention_dropout, attention_type=hparams.image_self_attention_type, save_weights_to=save_weights_to, make_image_summary=make_image_summary, scale_dotproduct=hparams.scale_dotproduct, ) utils.collect_named_outputs( "norms", "image_feat_self_attention_%d"%(layer), tf.norm(y, axis=-1)) x = common_layers.layer_postprocess(x, y, hparams) utils.collect_named_outputs( "norms", "image_feat_self_attention_postprocess_%d"%(layer), tf.norm(x, axis=-1)) with tf.variable_scope("ffn"): y = common_layers.dense_relu_dense( common_layers.layer_preprocess(x, hparams), image_filter_size, image_hidden_size, dropout=hparams.relu_dropout, ) utils.collect_named_outputs( "norms", "image_feat_ffn_%d"%(layer), tf.norm(y, axis=-1)) x = common_layers.layer_postprocess(x, y, hparams) utils.collect_named_outputs( "norms", "image_feat_ffn_postprocess_%d"%(layer), tf.norm(x, axis=-1)) # if normalization is done in layer_preprocess, then it should also be done # on the output, since the output can grow very large, being the sum of # a whole stack of unnormalized layer outputs. return common_layers.layer_preprocess(x, hparams)