def visgeno_baseline_net(input_batch, bbox_batch, spatial_batch, expr_obj, num_vocab, embed_dim, lstm_dim, vgg_dropout, lstm_dropout): # bbox_batch has shape [N_box, 5] and # spatial_batch has shape [N_box, D_spatial] and # expr_obj has shape [T, N_batch] N_batch = tf.shape(expr_obj)[1] N_box = tf.shape(spatial_batch)[0] # Extract visual features vis_feat = fastrcnn_vgg_net.vgg_roi_fc7(input_batch, bbox_batch, "vgg_local", apply_dropout=vgg_dropout) D_vis = vis_feat.get_shape().as_list()[-1] # Apply the same LSTM network on all expressions to extract their language # features. lang_obj = lstm_net.lstm_encoder(expr_obj, "lstm", num_vocab=num_vocab, embed_dim=embed_dim, lstm_dim=lstm_dim, apply_dropout=lstm_dropout) scores_obj = modules.localization_module_grid_score( vis_feat, spatial_batch, lang_obj) return scores_obj
def refgoog_attbilstm_net(input_batch, bbox_batch, spatial_batch, expr_obj, num_vocab, embed_dim, lstm_dim, vgg_dropout, lstm_dropout): # bbox_batch has shape [N_box, 5] and # spatial_batch has shape [N_box, D_spatial] and # expr_obj has shape [T, N_batch] N_batch = tf.shape(expr_obj)[1] N_box = tf.shape(spatial_batch)[0] # Extract visual features vis_feat = fastrcnn_vgg_net.vgg_roi_fc7(input_batch, bbox_batch, "vgg_local", apply_dropout=vgg_dropout) D_vis = vis_feat.get_shape().as_list()[-1] # Extract representation using attention lang_obj1, lang_obj2, lang_relation = lstm_net.attbilstm( expr_obj, "lstm", num_vocab=num_vocab, embed_dim=embed_dim, lstm_dim=lstm_dim, apply_dropout=lstm_dropout) # Score for each bounding box matching the first object # scores_obj1 has shape [N_batch, N_box, 1] scores_obj1 = modules.localization_module_grid_score( vis_feat, spatial_batch, lang_obj1) # Score for each bounding box matching the second object # scores_obj2 has shape [N_batch, N_box, 1] scores_obj2 = modules.localization_module_grid_score(vis_feat, spatial_batch, lang_obj2, reuse=True) # Scores for each pair of bounding box matching the relationship # Tile the scores by broadcasting add # scores_rel has shape [N_batch, N_box, N_box, 1] scores_rel = modules.relationship_module_spatial_only_grid_score( spatial_batch, scores_obj1, spatial_batch, scores_obj2, lang_relation, rescale_scores=True) tf.add_to_collection("s_pair", scores_rel) # marginal_scores has shape [N_batch, N_box, 1] marginal_scores = tf.reduce_max(scores_rel, reduction_indices=2) final_scores = tf.reshape(marginal_scores, to_T([N_batch, -1])) return final_scores
def visual7w_baseline_net(input_batch, bbox_batch, spatial_batch, expr_obj, num_vocab, embed_dim, lstm_dim, vgg_dropout, lstm_dropout): # Extract visual features vis_feat = fastrcnn_vgg_net.vgg_roi_fc7(input_batch, bbox_batch, "vgg_local", apply_dropout=vgg_dropout) # Apply LSTM network to extract language features. lang_obj = lstm_net.lstm_encoder(expr_obj, "lstm", num_vocab=num_vocab, embed_dim=embed_dim, lstm_dim=lstm_dim, apply_dropout=lstm_dropout) # Score for each bounding box matching the object scores_obj = modules.localization_module(vis_feat, spatial_batch, lang_obj) return scores_obj
def visual7w_attbilstm_net(input_batch, bbox_batch1, spatial_batch1, bbox_batch2, spatial_batch2, expr_obj, num_vocab, embed_dim, lstm_dim, vgg_dropout, lstm_dropout): # a sentence is parsed into [expr_obj1, expr_relation, expr_obj2] # bbox_batch1 has shape [N_batch*N1, 5] and # spatial_batch1 has shape [N_batch, N1, D_spatial] and # bbox_batch2 has shape [N2, 5] and # spatial_batch2 has shape [1, N2, D_spatial] and # expr_obj has shape [T, N_batch] # where N1 is the number of choices (= 4 in Visual 7W) and # N2 is the number of proposals (~ 300 for RPN in Faster RCNN) N_batch = tf.shape(spatial_batch1)[0] N1 = tf.shape(spatial_batch1)[1] N2 = tf.shape(spatial_batch2)[1] # Extract visual features vis_feat1 = fastrcnn_vgg_net.vgg_roi_fc7(input_batch, tf.reshape(bbox_batch1, [-1, 5]), "vgg_local", apply_dropout=vgg_dropout) D_vis = vis_feat1.get_shape().as_list()[-1] vis_feat1 = tf.reshape(vis_feat1, to_T([N_batch, N1, D_vis])) vis_feat1.set_shape([None, None, D_vis]) # Reshape and tile vis_feat2 and spatial_batch2 vis_feat2 = fastrcnn_vgg_net.vgg_roi_fc7(input_batch, tf.reshape(bbox_batch2, [-1, 5]), "vgg_local", apply_dropout=vgg_dropout, reuse=True) vis_feat2 = tf.reshape(vis_feat2, to_T([1, N2, D_vis])) vis_feat2 = tf.tile(vis_feat2, to_T([N_batch, 1, 1])) vis_feat2.set_shape([None, None, D_vis]) spatial_batch2 = tf.tile(spatial_batch2, to_T([N_batch, 1, 1])) # Extract representation using attention lang_obj1, lang_obj2, lang_relation, probs_obj1, probs_obj2, probs_rel = lstm_net.attbilstm( expr_obj, "lstm", num_vocab=num_vocab, embed_dim=embed_dim, lstm_dim=lstm_dim, apply_dropout=lstm_dropout) # Score for each bounding box matching the first object # scores_obj1 has shape [N_batch, N1, 1] scores_obj1 = modules.localization_module_batch_score( vis_feat1, spatial_batch1, lang_obj1) # Score for each bounding box matching the second object # scores_obj2 has shape [N_batch, N2, 1] scores_obj2 = modules.localization_module_batch_score(vis_feat2, spatial_batch2, lang_obj2, reuse=True) # Scores for each pair of bounding box matching the relationship # Tile the scores by broadcasting add # scores_rel has shape [N_batch, N1, N2, 1] scores_rel = modules.relationship_module_spatial_only_batch_score( spatial_batch1, scores_obj1, spatial_batch2, scores_obj2, lang_relation, rescale_scores=True) # marginal_scores has shape [N_batch, N1, 1] tf.add_to_collection("s_pair", scores_rel) marginal_scores = tf.reduce_max(scores_rel, reduction_indices=2) final_scores = tf.reshape(marginal_scores, to_T([N_batch, -1])) return final_scores