def build_caption_attention(self):
    """ caption_attention """
    num_V_ft = self.batch['num_boxes']
    v_linear_v = self.mid_result['v_linear_v']

    w_embed = tf.nn.embedding_lookup(self.v_word_map,
                                     self.batch['cap_att/word_tokens'])
    w_L_ft = modules.fc_layer(  # [bs, #proposal, len, L_DIM]
        w_embed, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='v_word_fc')
    w_len = self.batch['cap_att/word_tokens_len']
    mask = tf.sequence_mask(  # [bs, #proposal, len]
        w_len, maxlen=tf.shape(w_L_ft)[-2], dtype=tf.float32)
    pooled_w_L_ft = tf.reduce_sum(w_L_ft * tf.expand_dims(mask, axis=-1),
                                  axis=-2)
    pooled_w_L_ft = pooled_w_L_ft / \
        tf.expand_dims(tf.to_float(w_len), axis=-1)

    l_linear_v = modules.fc_layer(
        pooled_w_L_ft, V_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='q_linear_v')

    tile_v_linear_v = tf.tile(tf.expand_dims(v_linear_v, axis=1),
                              [1, self.data_cfg.n_cap_att, 1, 1])
    flat_tile_v_linear_v = tf.reshape(
        tile_v_linear_v, [-1, self.data_cfg.max_box_num, V_DIM])
    tile_num_V_ft = tf.tile(tf.expand_dims(num_V_ft, axis=1),
                            [1, self.data_cfg.n_cap_att])
    flat_tile_num_V_ft = tf.reshape(tile_num_V_ft, [-1])
    flat_l_linear_v = tf.reshape(l_linear_v, [-1, V_DIM])

    # flat_att_logit: [bs * #obj, num_proposal]
    flat_att_logit = modules.hadamard_attention(
        flat_tile_v_linear_v, flat_tile_num_V_ft, flat_l_linear_v,
        use_ln=False, is_train=self.is_train, normalizer=None)

    n_entry = self.data_cfg.n_cap_att
    n_proposal = self.data_cfg.max_box_num
    logit = tf.reshape(flat_att_logit, [-1, n_entry, n_proposal])

    with tf.name_scope('loss/caption_attend'):
        multilabel_gt = tf.to_float(
            tf.greater(self.batch['cap_att/att_scores'], 0.5))
        num_valid_entry = self.batch['cap_att/num']
        valid_mask = tf.sequence_mask(
            num_valid_entry, maxlen=self.data_cfg.n_cap_att,
            dtype=tf.float32)
        loss, acc, recall, precision, top_1_prec, top_k_recall = \
            self.binary_classification_loss(logit, multilabel_gt, valid_mask,
                                            depth=self.data_cfg.max_box_num)

        self.losses['caption_att'] = loss
        self.report['caption_att_loss'] = loss
        self.report['caption_att_acc'] = acc
        self.report['caption_att_recall'] = recall
        self.report['caption_att_precision'] = precision
        self.report['caption_att_top_1_prec'] = top_1_prec
        self.report['caption_att_top_{}_recall'.format(TOP_K)] = top_k_recall
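# NOTE: modules.hadamard_attention is implemented elsewhere in this repo. The
# sketch below only illustrates the element-wise ("hadamard") attention
# pattern used above -- fuse memory and query by element-wise product, map to
# a per-proposal logit, mask padded boxes, optionally normalize. Function and
# argument names are illustrative assumptions, not the repo's API.
import tensorflow as tf


def hadamard_attention_sketch(memory, valid_num, query,
                              normalizer=tf.nn.softmax,
                              scope='hadamard_attention_sketch'):
    """memory: [N, #proposal, dim], query: [N, dim], valid_num: [N]."""
    with tf.variable_scope(scope):
        # [N, #proposal, dim]: broadcast the query over proposals
        joint = memory * tf.expand_dims(query, axis=1)
        # [N, #proposal]: one scalar score per proposal
        logit = tf.squeeze(tf.layers.dense(joint, 1), axis=-1)
        mask = tf.sequence_mask(valid_num, maxlen=tf.shape(memory)[1],
                                dtype=tf.float32)
        logit = logit + (1.0 - mask) * -1e9  # suppress padded boxes
        # normalizer=None keeps raw logits, as build_caption_attention does.
        return logit if normalizer is None else normalizer(logit)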
def build_attribute_blank_fill(self):
    """ attribute_blank_fill """
    # [#attr, #proposal] x [#proposal, feat_dim] -> [#attr, feat_dim]
    V_ft = tf.matmul(self.batch['attr_blank_fill/weights'],
                     self.batch['image_ft'])
    v_linear_l = modules.fc_layer(
        V_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='pooled_linear_l')

    blank_embed = tf.nn.embedding_lookup(  # [bs, #attr, len, W_DIM]
        self.l_word_map, self.batch['attr_blank_fill/blanks'])
    blank_len = self.batch['attr_blank_fill/blanks_len']
    blank_maxlen = tf.shape(blank_embed)[-2]
    flat_blank_ft = modules.encode_L(  # [bs * #attr, L_DIM]
        tf.reshape(blank_embed, [-1, blank_maxlen, W_DIM]),
        tf.reshape(blank_len, [-1]),
        L_DIM, scope='encode_L_blank', cell_type='GRU')
    blank_ft = tf.reshape(flat_blank_ft,
                          [-1, self.data_cfg.n_attr_bf, L_DIM])

    l_linear_l = modules.fc_layer(
        blank_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='q_linear_l')

    joint = modules.fc_layer(
        v_linear_l * l_linear_l, L_DIM * 2,
        use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='joint_fc')
    joint = tf.nn.dropout(joint, 0.5)

    logit = modules.fc_layer(
        joint, self.num_answer, use_bias=True, use_bn=False, use_ln=False,
        activation_fn=None, is_training=self.is_train, scope='classifier')
    self.mid_result['attr_blank_fill/logit'] = logit  # [bs, #attr, #answer]

    with tf.name_scope('loss/attr_blank_fill'):
        onehot_gt = tf.one_hot(self.batch['attr_blank_fill/fills'],
                               depth=self.num_answer)
        num_valid_entry = self.batch['attr_blank_fill/num']
        valid_mask = tf.sequence_mask(num_valid_entry,
                                      maxlen=self.data_cfg.n_attr_bf,
                                      dtype=tf.float32)
        loss, acc, top_k_acc = \
            self.n_way_classification_loss(logit, onehot_gt, valid_mask)
        self.losses['attr_blank_fill'] = loss
        self.report['attr_blank_fill_loss'] = loss
        self.report['attr_blank_fill_acc'] = acc
        self.report['attr_blank_fill_top_{}_acc'.format(TOP_K)] = top_k_acc
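# NOTE: modules.encode_L is the repo's sequence encoder. As a rough sketch of
# what it likely does (names and details are assumptions): run a recurrent
# cell over the word embeddings and return the final state as a fixed-size
# L_DIM vector, respecting the true sequence lengths.
import tensorflow as tf


def encode_L_sketch(seq_embed, seq_len, dim, scope='encode_L_sketch',
                    cell_type='GRU'):
    """seq_embed: [N, max_len, W_DIM], seq_len: [N] -> [N, dim]."""
    with tf.variable_scope(scope):
        if cell_type == 'GRU':
            cell = tf.contrib.rnn.GRUCell(dim)
        else:
            cell = tf.contrib.rnn.BasicLSTMCell(dim)
        _, state = tf.nn.dynamic_rnn(cell, seq_embed,
                                     sequence_length=seq_len,
                                     dtype=tf.float32)
        # LSTM states are (c, h) tuples; GRU state is already [N, dim].
        return state if cell_type == 'GRU' else state.h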
def build(self):
    """ build network architecture and loss """

    """ Visual features """
    with tf.device('/cpu:0'):
        def load_feature(image_idx):
            selected_features = np.take(self.features, image_idx, axis=0)
            return selected_features
        V_ft = tf.py_func(load_feature, inp=[self.batch['image_idx']],
                          Tout=tf.float32, name='sample_features')
        # [B, # of box, dim]
        V_ft.set_shape([None, self.max_box_num, self.vfeat_dim])
        num_V_ft = tf.gather(self.num_boxes, self.batch['image_idx'],
                             name='gather_num_V_ft', axis=0)
        self.mid_result['num_V_ft'] = num_V_ft
        normal_boxes = tf.gather(self.normal_boxes, self.batch['image_idx'],
                                 name='gather_normal_boxes', axis=0)
        self.mid_result['normal_boxes'] = normal_boxes

    log.warning('v_linear_v')
    # [B, # of box, V_DIM]
    v_linear_v = modules.fc_layer(
        V_ft, V_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='v_linear_v')

    """ Average pooling """
    # [B, # of box, V_DIM] -> [B, V_DIM]
    avg_pooled_V_ft = tf.reduce_mean(V_ft, 1, keepdims=False)
    # [B, R_DIM * 2]
    c_0_h_0 = modules.fc_layer(
        avg_pooled_V_ft, R_DIM * 2, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train, scope='c_0_h_0')
    # c_0, h_0: [B, R_DIM]
    c_0, h_0 = tf.split(c_0_h_0, [R_DIM, R_DIM], 1)
    state_in = tc.LSTMStateTuple(c_0, h_0)

    self.att_lstm = tc.BasicLSTMCell(R_DIM)
    self.lang_lstm = tc.BasicLSTMCell(R_DIM)

    lstm_out, lstm_state = tf.nn.dynamic_rnn(
        self.lang_lstm,
        lstm_in,  # [batch_size, max_time, ...]
        initial_state=state_in,  # [batch_size, cell.state_size]
        time_major=False)
    lstm_c, lstm_h = lstm_state
    # TODO(taehoon): time should be shifted

    # gate [batch_size, max_time, 1]
    gate = modules.fc_layer(
        lstm_h, 1, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.sigmoid, is_training=self.is_train, scope='gate')
    # visual sentinel [batch_size, max_time, R_DIM]
    sentinel = gate * tf.nn.tanh(lstm_c)

    sentinel_linear = modules.fc_layer(
        sentinel, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='sentinel_linear')
    h_linear = modules.fc_layer(
        lstm_h, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='h_linear')
    logit = tf.nn.tanh(tf.concat([state_t, lstm_h], axis=-1))

    """ Answer classification """
    # perform two layer feature encoding and predict output
    with tf.variable_scope('reasoning') as scope:
        log.warning(scope.name)
        # [bs, L_DIM]
        log.warning('pooled_linear_l')
        pooled_linear_l = modules.fc_layer(
            pooled_V_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
            activation_fn=tf.nn.relu, is_training=self.is_train,
            scope='pooled_linear_l')

        log.warning('q_linear_l')
        q_linear_l = modules.fc_layer(
            q_L_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
            activation_fn=tf.nn.relu, is_training=self.is_train,
            scope='q_linear_l')

        joint = modules.fc_layer(
            pooled_linear_l * q_linear_l, 2048,
            use_bias=True, use_bn=False, use_ln=True,
            activation_fn=tf.nn.relu, is_training=self.is_train,
            scope='joint_fc')
        joint = tf.nn.dropout(joint, 0.5)

        logit = modules.fc_layer(
            joint, self.num_answer,
            use_bias=True, use_bn=False, use_ln=False,
            activation_fn=None, is_training=self.is_train,
            scope='classifier')
        self.output['logit'] = logit

    """ Compute loss and accuracy """
    with tf.name_scope('loss'):
        answer_target = self.batch['answer_target']
        loss = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=answer_target, logits=logit)
        loss = tf.reduce_mean(tf.reduce_sum(loss, axis=-1))

        pred = tf.cast(tf.argmax(logit, axis=-1), dtype=tf.int32)
        one_hot_pred = tf.one_hot(pred, depth=self.num_answer,
                                  dtype=tf.float32)
        self.output['pred'] = pred

        all_score = tf.reduce_sum(one_hot_pred * answer_target, axis=-1)
        max_train_score = tf.reduce_max(
            answer_target * self.train_answer_mask, axis=-1)
        self.output['all_score'] = all_score
        self.output['max_train_score'] = max_train_score

        acc = tf.reduce_mean(
            tf.reduce_sum(one_hot_pred * answer_target, axis=-1))
        exist_acc = tf.reduce_mean(
            tf.reduce_sum(one_hot_pred * answer_target *
                          self.answer_exist_mask, axis=-1))
        test_acc = tf.reduce_mean(
            tf.reduce_sum(one_hot_pred * answer_target *
                          self.test_answer_mask, axis=-1))
        test_obj_acc = tf.reduce_mean(
            tf.reduce_sum(one_hot_pred * answer_target *
                          self.test_answer_mask * self.obj_answer_mask,
                          axis=-1))
        test_attr_acc = tf.reduce_mean(
            tf.reduce_sum(one_hot_pred * answer_target *
                          self.test_answer_mask * self.attr_answer_mask,
                          axis=-1))
        train_exist_acc = tf.reduce_mean(
            tf.reduce_sum(one_hot_pred * answer_target *
                          self.answer_exist_mask * self.train_answer_mask,
                          axis=-1))

        max_exist_answer_acc = tf.reduce_mean(
            tf.reduce_max(answer_target * self.answer_exist_mask, axis=-1))
        max_train_exist_acc = tf.reduce_mean(
            tf.reduce_max(answer_target * self.answer_exist_mask *
                          self.train_answer_mask, axis=-1))
        test_obj_max_acc = tf.reduce_mean(
            tf.reduce_max(answer_target * self.test_answer_mask *
                          self.obj_answer_mask, axis=-1))
        test_attr_max_acc = tf.reduce_mean(
            tf.reduce_max(answer_target * self.test_answer_mask *
                          self.attr_answer_mask, axis=-1))
        test_max_answer_acc = tf.reduce_mean(
            tf.reduce_max(answer_target * self.test_answer_mask, axis=-1))
        test_max_exist_answer_acc = tf.reduce_mean(
            tf.reduce_max(answer_target * self.answer_exist_mask *
                          self.test_answer_mask, axis=-1))

        normal_test_obj_acc = tf.where(
            tf.equal(test_obj_max_acc, 0), test_obj_max_acc,
            test_obj_acc / test_obj_max_acc)
        normal_test_attr_acc = tf.where(
            tf.equal(test_attr_max_acc, 0), test_attr_max_acc,
            test_attr_acc / test_attr_max_acc)
        normal_train_exist_acc = tf.where(
            tf.equal(max_train_exist_acc, 0), max_train_exist_acc,
            train_exist_acc / max_train_exist_acc)
        normal_exist_acc = tf.where(
            tf.equal(max_exist_answer_acc, 0), max_exist_answer_acc,
            exist_acc / max_exist_answer_acc)
        normal_test_acc = tf.where(
            tf.equal(test_max_answer_acc, 0), test_max_answer_acc,
            test_acc / test_max_answer_acc)

        self.mid_result['pred'] = pred

        self.losses['answer'] = loss
        self.report['answer_train_loss'] = loss
        self.report['answer_report_loss'] = loss
        self.report['answer_acc'] = acc
        self.report['exist_acc'] = exist_acc
        self.report['test_acc'] = test_acc
        self.report['normal_test_acc'] = normal_test_acc
        self.report['normal_test_object_acc'] = normal_test_obj_acc
        self.report['normal_test_attribute_acc'] = normal_test_attr_acc
        self.report['normal_exist_acc'] = normal_exist_acc
        self.report['normal_train_exist_acc'] = normal_train_exist_acc
        self.report['max_exist_acc'] = max_exist_answer_acc
        self.report['test_max_acc'] = test_max_answer_acc
        self.report['test_max_exist_acc'] = test_max_exist_answer_acc

    """ Prepare image summary """
    """
    with tf.name_scope('prepare_summary'):
        self.vis_image['image_attention_qa'] = self.visualize_vqa_result(
            self.batch['image_id'],
            self.mid_result['normal_boxes'], self.mid_result['num_V_ft'],
            self.mid_result['att_score'],
            self.batch['q_intseq'], self.batch['q_intseq_len'],
            self.batch['answer_target'], self.mid_result['pred'],
            max_batch_num=20, line_width=2)
    """

    self.loss = self.losses['answer']

    # scalar summary
    for key, val in self.report.items():
        tf.summary.scalar('train/{}'.format(key), val,
                          collections=['heavy_train', 'train'])
        tf.summary.scalar('val/{}'.format(key), val,
                          collections=['heavy_val', 'val'])
        tf.summary.scalar('testval/{}'.format(key), val,
                          collections=['heavy_testval', 'testval'])

    # image summary
    for key, val in self.vis_image.items():
        tf.summary.image('train-{}'.format(key), val, max_outputs=10,
                         collections=['heavy_train'])
        tf.summary.image('val-{}'.format(key), val, max_outputs=10,
                         collections=['heavy_val'])
        tf.summary.image('testval-{}'.format(key), val, max_outputs=10,
                         collections=['heavy_testval'])

    return self.loss
def build(self):
    """ build network architecture and loss """

    """ Visual features """
    with tf.device('/cpu:0'):
        def load_feature(image_idx):
            selected_features = np.take(self.features, image_idx, axis=0)
            return selected_features
        V_ft = tf.py_func(load_feature, inp=[self.batch['image_idx']],
                          Tout=tf.float32, name='sample_features')
        V_ft.set_shape([None, self.max_box_num, self.vfeat_dim])
        num_V_ft = tf.gather(self.num_boxes, self.batch['image_idx'],
                             name='gather_num_V_ft', axis=0)
        self.mid_result['num_V_ft'] = num_V_ft
        normal_boxes = tf.gather(self.normal_boxes, self.batch['image_idx'],
                                 name='gather_normal_boxes', axis=0)
        self.mid_result['normal_boxes'] = normal_boxes

    log.warning('v_linear_v')
    v_linear_v = modules.fc_layer(
        V_ft, V_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='v_linear_v')

    """ Encode question """
    q_embed = tf.nn.embedding_lookup(self.glove_map, self.batch['q_intseq'])
    # [bs, L_DIM]
    q_L_ft = modules.encode_L(q_embed, self.batch['q_intseq_len'], L_DIM,
                              cell_type='GRU')
    q_L_mean = modules.fc_layer(
        q_L_ft, L_DIM, use_bias=True, use_bn=False, use_ln=False,
        activation_fn=None, is_training=self.is_train, scope='q_L_mean')

    # [bs, V_DIM]
    log.warning('q_linear_v')
    q_linear_v = modules.fc_layer(
        q_L_ft, V_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='q_linear_v')
    self.mid_result['q_linear_v'] = q_linear_v

    """ Perform attention """
    att_score = modules.hadamard_attention(v_linear_v, num_V_ft, q_linear_v,
                                           use_ln=False,
                                           is_train=self.is_train)
    self.mid_result['att_score'] = att_score
    pooled_V_ft = modules.attention_pooling(V_ft, att_score)
    self.mid_result['pooled_V_ft'] = pooled_V_ft

    """ Answer classification """
    log.warning('pooled_linear_l')
    pooled_linear_l = modules.fc_layer(
        pooled_V_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='pooled_linear_l')
    self.mid_result['pooled_linear_l'] = pooled_linear_l

    log.warning('q_linear_l')
    l_linear_l = modules.fc_layer(
        q_L_mean, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='q_linear_l')
    self.mid_result['l_linear_l'] = l_linear_l

    joint = modules.fc_layer(
        pooled_linear_l * l_linear_l, L_DIM * 2,
        use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='joint_fc')
    joint = tf.nn.dropout(joint, 0.5)
    self.mid_result['joint'] = joint

    logit = modules.WordWeightAnswer(
        joint, self.answer_dict, self.word_weight_dir, use_bias=True,
        is_training=self.is_train, scope='WordWeightAnswer')
    self.mid_result['logit'] = logit

    """ Compute loss and accuracy """
    with tf.name_scope('loss'):
        answer_target = self.batch['answer_target']
        loss = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=answer_target, logits=logit)

        train_loss = tf.reduce_mean(
            tf.reduce_sum(loss * self.train_answer_mask, axis=-1))
        report_loss = tf.reduce_mean(tf.reduce_sum(loss, axis=-1))

        pred = tf.cast(tf.argmax(logit, axis=-1), dtype=tf.int32)
        one_hot_pred = tf.one_hot(pred, depth=self.num_answer,
                                  dtype=tf.float32)
        acc = tf.reduce_mean(
            tf.reduce_sum(one_hot_pred * answer_target, axis=-1))
        exist_acc = tf.reduce_mean(
            tf.reduce_sum(one_hot_pred * answer_target *
                          self.answer_exist_mask, axis=-1))
        test_acc = tf.reduce_mean(
            tf.reduce_sum(one_hot_pred * answer_target *
                          self.test_answer_mask, axis=-1))
        max_exist_answer_acc = tf.reduce_mean(
            tf.reduce_max(answer_target * self.answer_exist_mask, axis=-1))
        test_max_answer_acc = tf.reduce_mean(
            tf.reduce_max(answer_target * self.test_answer_mask, axis=-1))
        test_max_exist_answer_acc = tf.reduce_mean(
            tf.reduce_max(answer_target * self.answer_exist_mask *
                          self.test_answer_mask, axis=-1))
        normal_test_acc = tf.where(
            tf.equal(test_max_answer_acc, 0), test_max_answer_acc,
            test_acc / test_max_answer_acc)

        self.mid_result['pred'] = pred

        self.losses['answer'] = train_loss
        self.report['answer_train_loss'] = train_loss
        self.report['answer_report_loss'] = report_loss
        self.report['answer_accuracy'] = acc
        self.report['exist_answer_accuracy'] = exist_acc
        self.report['test_answer_accuracy'] = test_acc
        self.report['normal_test_answer_accuracy'] = normal_test_acc
        self.report['max_exist_answer_accuracy'] = max_exist_answer_acc
        self.report['test_max_answer_accuracy'] = test_max_answer_acc
        self.report['test_max_exist_answer_accuracy'] = \
            test_max_exist_answer_acc

    """ Prepare image summary """
    """
    with tf.name_scope('prepare_summary'):
        self.vis_image['image_attention_qa'] = self.visualize_vqa_result(
            self.batch['image_id'],
            self.mid_result['normal_boxes'], self.mid_result['num_V_ft'],
            self.mid_result['att_score'],
            self.batch['q_intseq'], self.batch['q_intseq_len'],
            self.batch['answer_target'], self.mid_result['pred'],
            max_batch_num=20, line_width=2)
    """

    self.loss = 0
    for key, loss in self.losses.items():
        self.loss = self.loss + loss

    # scalar summary
    for key, val in self.report.items():
        tf.summary.scalar('train/{}'.format(key), val,
                          collections=['heavy_train', 'train'])
        tf.summary.scalar('val/{}'.format(key), val,
                          collections=['heavy_val', 'val'])
        tf.summary.scalar('testval/{}'.format(key), val,
                          collections=['heavy_testval', 'testval'])

    # image summary
    for key, val in self.vis_image.items():
        tf.summary.image('train-{}'.format(key), val, max_outputs=10,
                         collections=['heavy_train'])
        tf.summary.image('val-{}'.format(key), val, max_outputs=10,
                         collections=['heavy_val'])
        tf.summary.image('testval-{}'.format(key), val, max_outputs=10,
                         collections=['heavy_testval'])

    return self.loss
def build_attribute_predict(self):
    """ attribute_predict """
    # [#attr, #proposal] x [#proposal, feat_dim] -> [#attr, feat_dim]
    V_ft = tf.matmul(self.batch['attr_pred/weights'],
                     self.batch['image_ft'])
    v_linear_l = modules.fc_layer(
        V_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='pooled_linear_l')

    L_ft = tf.nn.embedding_lookup(self.l_answer_word_map,
                                  self.batch['attr_pred/object_labels'])
    reg_l_ft = modules.fc_layer(
        L_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.tanh, is_training=self.is_train,
        scope='attr_pred/encode_object_labels')
    self.mid_result['attr_pred/reg_l_ft'] = reg_l_ft

    l_linear_l = modules.fc_layer(
        reg_l_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='q_linear_l')

    joint = modules.fc_layer(
        v_linear_l * l_linear_l, L_DIM * 2,
        use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='joint_fc')
    joint = tf.nn.dropout(joint, 0.5)

    logit = modules.fc_layer(
        joint, self.num_answer, use_bias=True, use_bn=False, use_ln=False,
        activation_fn=None, is_training=self.is_train, scope='classifier')
    self.mid_result['attr_pred/logit'] = logit  # [bs, #attr, #answer]

    with tf.name_scope('loss/attr_predict'):
        multilabel_gt = self.batch['attr_pred/labels']
        num_valid_entry = self.batch['attr_pred/num']
        valid_mask = tf.sequence_mask(num_valid_entry,
                                      maxlen=self.data_cfg.n_attr_pred,
                                      dtype=tf.float32)
        loss, acc, recall, precision, top_1_prec, top_k_recall = \
            self.binary_classification_loss(logit, multilabel_gt, valid_mask,
                                            depth=self.num_answer)
        self.losses['attr_pred'] = loss
        self.report['attr_pred_loss'] = loss
        self.report['attr_pred_acc'] = acc
        self.report['attr_pred_recall'] = recall
        self.report['attr_pred_precision'] = precision
        self.report['attr_pred_top_1_prec'] = top_1_prec
        self.report['attr_pred_top_{}_recall'.format(TOP_K)] = top_k_recall
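# NOTE: self.binary_classification_loss is defined in the base model class.
# The simplified sketch below shows the masked multi-label objective it most
# plausibly computes (sigmoid cross-entropy averaged over valid entries plus
# precision/recall at a 0-logit threshold); the real helper also returns
# top-1 precision and top-k recall, which are omitted here. All names are
# illustrative assumptions.
import tensorflow as tf


def binary_classification_loss_sketch(logits, labels, valid_mask):
    """logits/labels: [bs, #entry, depth], valid_mask: [bs, #entry]."""
    denom = tf.maximum(tf.reduce_sum(valid_mask), 1.0)
    xent = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels,
                                                   logits=logits)
    loss = tf.reduce_sum(tf.reduce_sum(xent, axis=-1) * valid_mask) / denom

    pred = tf.cast(tf.greater(logits, 0.0), tf.float32)  # sigmoid(x) > 0.5
    tp = tf.reduce_sum(pred * labels, axis=-1)
    precision = tf.reduce_sum(
        tp / tf.maximum(tf.reduce_sum(pred, axis=-1), 1.0) * valid_mask) / denom
    recall = tf.reduce_sum(
        tp / tf.maximum(tf.reduce_sum(labels, axis=-1), 1.0) * valid_mask) / denom
    return loss, precision, recall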
def build_attribute_wordset(self):
    """ attribute_wordset """
    V_ft = self.mid_result['attribute_V_ft']
    num_V_ft = self.mid_result['attribute_num_V_ft']

    v_linear_v = modules.fc_layer(  # [bs * #attr, #proposal, V_DIM]
        V_ft, V_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='wordset_v_linear_v')

    wordset_embed = tf.tanh(
        tf.nn.embedding_lookup(self.wordset_map,
                               self.batch['attr_blank_fill/wordsets']))
    wordset_ft = modules.fc_layer(
        wordset_embed, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.tanh, is_training=self.is_train,
        scope='wordset_ft')

    q_linear_v = modules.fc_layer(  # [bs, #attr, V_DIM]
        wordset_ft, V_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='wordset_q_linear_v')
    flat_q_linear_v = tf.reshape(q_linear_v, [-1, V_DIM])  # [bs * #attr, V_DIM]

    att_score = modules.hadamard_attention(  # [bs * #attr, len]
        v_linear_v, num_V_ft, flat_q_linear_v,
        use_ln=False, is_train=self.is_train, scope='wordset_att')
    flat_pooled_V_ft = modules.attention_pooling(
        V_ft, att_score)  # [bs * #attr, V_DIM]
    pooled_V_ft = tf.reshape(
        flat_pooled_V_ft,
        [-1, self.data_cfg.n_attr_bf, self.data_cfg.vfeat_dim])

    v_linear_l = modules.fc_layer(
        pooled_V_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='pooled_linear_l')
    l_linear_l = modules.fc_layer(
        wordset_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='q_linear_l')

    joint = modules.fc_layer(
        v_linear_l * l_linear_l, L_DIM * 2,
        use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='joint_fc')
    joint = tf.nn.dropout(joint, 0.5)

    logit = modules.fc_layer(
        joint, self.num_answer, use_bias=True, use_bn=False, use_ln=False,
        activation_fn=None, is_training=self.is_train, scope='classifier')
    self.mid_result['attr_blank_fill/logit'] = logit  # [bs, #attr, #answer]

    with tf.name_scope('loss/attr_wordset'):
        onehot_gt = tf.one_hot(self.batch['attr_blank_fill/fills'],
                               depth=self.num_answer)
        num_valid_entry = self.batch['attr_blank_fill/num']
        valid_mask = tf.sequence_mask(num_valid_entry,
                                      maxlen=self.data_cfg.n_attr_bf,
                                      dtype=tf.float32)
        loss, acc, top_k_acc = \
            self.n_way_classification_loss(logit, onehot_gt, valid_mask)
        self.losses['attr_wordset'] = loss
        self.report['attr_wordset_loss'] = loss
        self.report['attr_wordset_acc'] = acc
        self.report['attr_wordset_top_{}_acc'.format(TOP_K)] = top_k_acc
def build_object_blank_fill(self):
    """ object_blank_fill """
    V_ft = self.mid_result['object_V_ft']
    num_V_ft = self.mid_result['object_num_V_ft']

    v_linear_v = modules.fc_layer(  # [bs * #obj, #proposal, V_DIM]
        V_ft, V_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='bf_v_linear_v')

    blank_embed = tf.nn.embedding_lookup(  # [bs, #proposal, len, W_DIM]
        self.l_word_map, self.batch['obj_blank_fill/blanks'])
    blank_len = self.batch['obj_blank_fill/blanks_len']
    blank_maxlen = tf.shape(blank_embed)[-2]
    flat_blank_ft = modules.encode_L(  # [bs * #proposal, L_DIM]
        tf.reshape(blank_embed, [-1, blank_maxlen, W_DIM]),
        tf.reshape(blank_len, [-1]),
        L_DIM, scope='encode_L_blank', cell_type='GRU')
    blank_ft = tf.reshape(flat_blank_ft,
                          [-1, self.data_cfg.n_obj_bf, L_DIM])

    q_linear_v = modules.fc_layer(  # [bs, #obj, V_DIM]
        blank_ft, V_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='bf_q_linear_v')
    flat_q_linear_v = tf.reshape(q_linear_v, [-1, V_DIM])  # [bs * #obj, V_DIM]

    att_score = modules.hadamard_attention(  # [bs * #obj, len]
        v_linear_v, num_V_ft, flat_q_linear_v,
        use_ln=False, is_train=self.is_train, scope='bf_att')
    flat_pooled_V_ft = modules.attention_pooling(
        V_ft, att_score)  # [bs * #obj, vfeat_dim]
    pooled_V_ft = tf.reshape(
        flat_pooled_V_ft,
        [-1, self.data_cfg.n_obj_bf, self.data_cfg.vfeat_dim])

    v_linear_l = modules.fc_layer(
        pooled_V_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='pooled_linear_l')
    l_linear_l = modules.fc_layer(
        blank_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='q_linear_l')

    joint = modules.fc_layer(
        v_linear_l * l_linear_l, L_DIM * 2,
        use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='joint_fc')
    joint = tf.nn.dropout(joint, 0.5)

    logit = modules.fc_layer(
        joint, self.num_answer, use_bias=True, use_bn=False, use_ln=False,
        activation_fn=None, is_training=self.is_train, scope='classifier')
    self.mid_result['obj_blank_fill/logit'] = logit  # [bs, #obj, #answer]

    with tf.name_scope('loss/obj_blank_fill'):
        onehot_gt = tf.one_hot(self.batch['obj_blank_fill/fills'],
                               depth=self.num_answer)
        num_valid_entry = self.batch['obj_blank_fill/num']
        valid_mask = tf.sequence_mask(num_valid_entry,
                                      maxlen=self.data_cfg.n_obj_bf,
                                      dtype=tf.float32)
        loss, acc, top_k_acc = \
            self.n_way_classification_loss(logit, onehot_gt, valid_mask)
        self.losses['obj_blank_fill'] = loss
        self.report['obj_blank_fill_loss'] = loss
        self.report['obj_blank_fill_acc'] = acc
        self.report['obj_blank_fill_top_{}_acc'.format(TOP_K)] = top_k_acc
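# NOTE: self.n_way_classification_loss is shared by the blank-fill, wordset
# and predict heads above. A minimal sketch of the masked softmax objective
# it presumably implements (loss, accuracy and top-k accuracy over valid
# entries only); names and the value of k are assumptions.
import tensorflow as tf

TOP_K_SKETCH = 5  # assumed to mirror the module-level TOP_K


def n_way_classification_loss_sketch(logits, onehot_gt, valid_mask,
                                     top_k=TOP_K_SKETCH):
    """logits/onehot_gt: [bs, #entry, #answer], valid_mask: [bs, #entry]."""
    denom = tf.maximum(tf.reduce_sum(valid_mask), 1.0)
    xent = tf.nn.softmax_cross_entropy_with_logits_v2(labels=onehot_gt,
                                                      logits=logits)
    loss = tf.reduce_sum(xent * valid_mask) / denom

    gt_idx = tf.argmax(onehot_gt, axis=-1, output_type=tf.int32)
    pred_idx = tf.argmax(logits, axis=-1, output_type=tf.int32)
    acc = tf.reduce_sum(
        tf.cast(tf.equal(pred_idx, gt_idx), tf.float32) * valid_mask) / denom

    _, top_k_idx = tf.nn.top_k(logits, k=top_k)  # [bs, #entry, k]
    in_top_k = tf.reduce_max(
        tf.cast(tf.equal(top_k_idx, tf.expand_dims(gt_idx, -1)), tf.float32),
        axis=-1)
    top_k_acc = tf.reduce_sum(in_top_k * valid_mask) / denom
    return loss, acc, top_k_acc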
def build_object_enwiki(self):
    """ object_enwiki """
    pooled_V_ft = self.mid_result['object_pooled_V_ft']

    enwiki_embed = tf.nn.embedding_lookup(  # [bs, #proposal, len, W_DIM]
        self.enwiki_map, self.batch['obj_blank_fill/enwiki_context'])
    enwiki_len = self.batch['obj_blank_fill/enwiki_context_len']
    enwiki_maxlen = tf.shape(enwiki_embed)[-2]
    flat_enwiki_ft = modules.encode_L(  # [bs * #proposal, L_DIM]
        tf.reshape(enwiki_embed, [-1, enwiki_maxlen, W_DIM]),
        tf.reshape(enwiki_len, [-1]),
        L_DIM, scope='encode_L_enwiki', cell_type='GRU')
    enwiki_ft = tf.reshape(flat_enwiki_ft,
                           [-1, self.data_cfg.n_obj_bf, L_DIM])

    v_linear_l = modules.fc_layer(
        pooled_V_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='pooled_linear_l')
    l_linear_l = modules.fc_layer(
        enwiki_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='q_linear_l')

    v_joint = modules.fc_layer(
        v_linear_l, L_DIM * 2, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train, scope='joint_v')
    v_joint = tf.nn.dropout(v_joint, 0.5)

    l_joint = modules.fc_layer(
        l_linear_l, L_DIM * 2, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train, scope='joint_l')
    l_joint = tf.nn.dropout(l_joint, 0.5)

    v_logit = modules.fc_layer(
        v_joint, self.num_answer, use_bias=True, use_bn=False, use_ln=False,
        activation_fn=None, is_training=self.is_train, scope='classifier_v')
    l_logit = modules.fc_layer(
        l_joint, self.num_answer, use_bias=True, use_bn=False, use_ln=False,
        activation_fn=None, is_training=self.is_train, scope='classifier_l')

    with tf.name_scope('loss/obj_enwiki'):
        onehot_gt = tf.one_hot(self.batch['obj_blank_fill/fills'],
                               depth=self.num_answer)
        num_valid_entry = self.batch['obj_blank_fill/num']
        valid_mask = tf.sequence_mask(num_valid_entry,
                                      maxlen=self.data_cfg.n_obj_bf,
                                      dtype=tf.float32)
        v_loss, v_acc, v_top_k_acc = \
            self.n_way_classification_loss(v_logit, onehot_gt, valid_mask)
        l_loss, l_acc, l_top_k_acc = \
            self.n_way_classification_loss(l_logit, onehot_gt, valid_mask)
        self.losses['obj_enwiki_v'] = v_loss
        self.losses['obj_enwiki_l'] = l_loss
        self.report['obj_enwiki_v_loss'] = v_loss
        self.report['obj_enwiki_l_loss'] = l_loss
        self.report['obj_enwiki_v_acc'] = v_acc
        self.report['obj_enwiki_l_acc'] = l_acc
        self.report['obj_enwiki_v_top_{}_acc'.format(TOP_K)] = v_top_k_acc
        self.report['obj_enwiki_l_top_{}_acc'.format(TOP_K)] = l_top_k_acc
def build_attribute_wordset(self):
    """ attribute_wordset """
    pooled_V_ft = self.mid_result['attribute_pooled_V_ft']

    wordset_embed = tf.tanh(tf.nn.embedding_lookup(
        self.wordset_map, self.batch['attr_blank_fill/wordsets']))
    wordset_ft = modules.fc_layer(
        wordset_embed, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.tanh, is_training=self.is_train,
        scope='wordset_ft')

    v_linear_l = modules.fc_layer(
        pooled_V_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='pooled_linear_l')
    l_linear_l = modules.fc_layer(
        wordset_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='q_linear_l')

    v_joint = modules.fc_layer(
        v_linear_l, L_DIM * 2, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train, scope='joint_v')
    v_joint = tf.nn.dropout(v_joint, 0.5)

    l_joint = modules.fc_layer(
        l_linear_l, L_DIM * 2, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train, scope='joint_l')
    l_joint = tf.nn.dropout(l_joint, 0.5)

    v_logit = modules.fc_layer(
        v_joint, self.num_answer, use_bias=True, use_bn=False, use_ln=False,
        activation_fn=None, is_training=self.is_train, scope='classifier_v')
    l_logit = modules.fc_layer(
        l_joint, self.num_answer, use_bias=True, use_bn=False, use_ln=False,
        activation_fn=None, is_training=self.is_train, scope='classifier_l')

    with tf.name_scope('loss/attr_wordset'):
        onehot_gt = tf.one_hot(self.batch['attr_blank_fill/fills'],
                               depth=self.num_answer)
        num_valid_entry = self.batch['attr_blank_fill/num']
        valid_mask = tf.sequence_mask(num_valid_entry,
                                      maxlen=self.data_cfg.n_attr_bf,
                                      dtype=tf.float32)
        v_loss, v_acc, v_top_k_acc = \
            self.n_way_classification_loss(v_logit, onehot_gt, valid_mask)
        l_loss, l_acc, l_top_k_acc = \
            self.n_way_classification_loss(l_logit, onehot_gt, valid_mask)
        self.losses['attr_wordset_v'] = v_loss
        self.losses['attr_wordset_l'] = l_loss
        self.report['attr_wordset_v_loss'] = v_loss
        self.report['attr_wordset_l_loss'] = l_loss
        self.report['attr_wordset_v_acc'] = v_acc
        self.report['attr_wordset_l_acc'] = l_acc
        self.report['attr_wordset_v_top_{}_acc'.format(TOP_K)] = v_top_k_acc
        self.report['attr_wordset_l_top_{}_acc'.format(TOP_K)] = l_top_k_acc
def build_object_wordset(self):
    """ object_wordset """
    pooled_V_ft = self.mid_result['object_pooled_V_ft']

    wordset_embed = tf.tanh(
        tf.nn.embedding_lookup(  # [bs, #obj, W_DIM]
            self.wordset_map, self.batch['obj_blank_fill/wordsets']))
    wordset_ft = modules.fc_layer(  # [bs, #obj, L_DIM]
        wordset_embed, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.tanh, is_training=self.is_train,
        scope='wordset_ft')

    v_linear_l = modules.fc_layer(
        pooled_V_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='pooled_linear_l')
    l_linear_l = modules.fc_layer(
        wordset_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='q_linear_l')

    joint = modules.fc_layer(
        v_linear_l * l_linear_l, L_DIM * 2,
        use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='joint_fc')
    joint = tf.nn.dropout(joint, 0.5)

    logit = modules.fc_layer(
        joint, self.num_answer, use_bias=True, use_bn=False, use_ln=False,
        activation_fn=None, is_training=self.is_train, scope='classifier')
    self.mid_result['obj_blank_fill/logit'] = logit  # [bs, #obj, #answer]

    with tf.name_scope('loss/obj_wordset'):
        onehot_gt = tf.one_hot(self.batch['obj_blank_fill/fills'],
                               depth=self.num_answer)
        num_valid_entry = self.batch['obj_blank_fill/num']
        valid_mask = tf.sequence_mask(num_valid_entry,
                                      maxlen=self.data_cfg.n_obj_bf,
                                      dtype=tf.float32)
        loss, acc, top_k_acc = \
            self.n_way_classification_loss(logit, onehot_gt, valid_mask)
        self.losses['obj_wordset'] = loss
        self.report['obj_wordset_loss'] = loss
        self.report['obj_wordset_acc'] = acc
        self.report['obj_wordset_top_{}_acc'.format(TOP_K)] = top_k_acc
def build_attribute_V_ft(self):
    V_ft = self.batch['image_ft']  # [bs, #proposal, #feat_dim]
    V_ft = tf.expand_dims(V_ft, axis=1)  # [bs, 1, #proposal, #feat_dim]
    # [bs, #attr, #proposal, #feat_dim]
    V_ft = tf.tile(V_ft, [1, self.data_cfg.n_attr_bf, 1, 1])
    # [bs * #attr, #proposal, #feat_dim]
    V_ft = tf.reshape(
        V_ft, [-1, self.data_cfg.max_box_num, self.data_cfg.vfeat_dim])

    spat_ft = self.batch['spatial_ft']
    spat_ft = tf.expand_dims(spat_ft, axis=1)
    spat_ft = tf.tile(spat_ft, [1, self.data_cfg.n_attr_bf, 1, 1])
    spat_ft = tf.reshape(spat_ft, [-1, self.data_cfg.max_box_num, 6])

    num_V_ft = self.batch['num_boxes']  # [bs]
    num_V_ft = tf.expand_dims(num_V_ft, axis=1)  # [bs, 1]
    num_V_ft = tf.tile(num_V_ft, [1, self.data_cfg.n_attr_bf])  # [bs, #attr]
    num_V_ft = tf.reshape(num_V_ft, [-1])  # [bs * #attr]

    key_spat_ft = self.batch['attr_blank_fill/normal_boxes']
    key_spat_ft = tf.concat([
        key_spat_ft,
        tf.expand_dims(key_spat_ft[:, :, 2] - key_spat_ft[:, :, 0], axis=-1),
        tf.expand_dims(key_spat_ft[:, :, 3] - key_spat_ft[:, :, 1], axis=-1)
    ], axis=-1)

    v_linear_v = modules.fc_layer(  # [bs * #obj, #proposal, V_DIM]
        spat_ft, V_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='spat_v_linear_v')
    q_linear_v = modules.fc_layer(  # [bs, #obj, V_DIM]
        key_spat_ft, V_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='spat_q_linear_v')
    flat_q_linear_v = tf.reshape(q_linear_v, [-1, V_DIM])  # [bs * #obj, V_DIM]

    att_score = modules.hadamard_attention(  # [bs * #obj, len]
        v_linear_v, num_V_ft, flat_q_linear_v,
        use_ln=False, is_train=self.is_train, scope='spat_att')
    flat_pooled_V_ft = modules.attention_pooling(
        V_ft, att_score)  # [bs * #obj, vfeat_dim]
    pooled_V_ft = tf.reshape(
        flat_pooled_V_ft,
        [-1, self.data_cfg.n_attr_bf, self.data_cfg.vfeat_dim])
    self.mid_result['attribute_pooled_V_ft'] = pooled_V_ft
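# NOTE: modules.attention_pooling is assumed to be a simple score-weighted
# sum over proposals; the one-liner below is a sketch of that behavior, not
# the repo's implementation.
import tensorflow as tf


def attention_pooling_sketch(memory, att_score):
    """memory: [N, #proposal, dim], att_score: [N, #proposal] -> [N, dim]."""
    return tf.reduce_sum(memory * tf.expand_dims(att_score, axis=-1), axis=1)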
def build_object_predict(self):
    """ object_predict """
    V_ft = self.batch['image_ft']  # [bs, #proposal, #feat_dim]
    V_ft = tf.expand_dims(V_ft, axis=1)  # [bs, 1, #proposal, #feat_dim]
    # [bs, #obj, #proposal, #feat_dim]
    V_ft = tf.tile(V_ft, [1, self.data_cfg.n_obj_pred, 1, 1])
    # [bs * #obj, #proposal, #feat_dim]
    V_ft = tf.reshape(
        V_ft, [-1, self.data_cfg.max_box_num, self.data_cfg.vfeat_dim])

    spat_ft = self.batch['spatial_ft']
    spat_ft = tf.expand_dims(spat_ft, axis=1)
    spat_ft = tf.tile(spat_ft, [1, self.data_cfg.n_obj_pred, 1, 1])
    spat_ft = tf.reshape(spat_ft, [-1, self.data_cfg.max_box_num, 6])

    num_V_ft = self.batch['num_boxes']  # [bs]
    num_V_ft = tf.expand_dims(num_V_ft, axis=1)  # [bs, 1]
    num_V_ft = tf.tile(num_V_ft, [1, self.data_cfg.n_obj_pred])  # [bs, #obj]
    num_V_ft = tf.reshape(num_V_ft, [-1])  # [bs * #obj]

    key_spat_ft = self.batch['obj_pred/normal_boxes']
    key_spat_ft = tf.concat([
        key_spat_ft,
        tf.expand_dims(key_spat_ft[:, :, 2] - key_spat_ft[:, :, 0], axis=-1),
        tf.expand_dims(key_spat_ft[:, :, 3] - key_spat_ft[:, :, 1], axis=-1)
    ], axis=-1)

    v_linear_v = modules.fc_layer(  # [bs * #obj, #proposal, V_DIM]
        spat_ft, V_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='spat_v_linear_v')
    q_linear_v = modules.fc_layer(  # [bs, #obj, V_DIM]
        key_spat_ft, V_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='spat_q_linear_v')
    flat_q_linear_v = tf.reshape(q_linear_v, [-1, V_DIM])  # [bs * #obj, V_DIM]

    att_score = modules.hadamard_attention(  # [bs * #obj, len]
        v_linear_v, num_V_ft, flat_q_linear_v,
        use_ln=False, is_train=self.is_train, scope='spat_att')
    flat_pooled_V_ft = modules.attention_pooling(
        V_ft, att_score)  # [bs * #obj, vfeat_dim]
    pooled_V_ft = tf.reshape(
        flat_pooled_V_ft,
        [-1, self.data_cfg.n_obj_pred, self.data_cfg.vfeat_dim])

    wordset_embed = tf.tanh(
        tf.nn.embedding_lookup(  # [bs, #obj, W_DIM]
            self.wordset_map, self.batch['obj_pred/wordsets']))
    wordset_ft = modules.fc_layer(  # [bs, #obj, L_DIM]
        wordset_embed, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.tanh, is_training=self.is_train,
        scope='wordset_ft')

    v_linear_l = modules.fc_layer(
        pooled_V_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='pooled_linear_l')
    l_linear_l = modules.fc_layer(
        wordset_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='q_linear_l')

    joint = modules.fc_layer(
        v_linear_l * l_linear_l, L_DIM * 2,
        use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='joint_fc')
    joint = tf.nn.dropout(joint, 0.5)

    logit = modules.fc_layer(
        joint, self.num_answer, use_bias=True, use_bn=False, use_ln=False,
        activation_fn=None, is_training=self.is_train, scope='classifier')
    self.mid_result['obj_pred/logit'] = logit  # [bs, #obj, #answer]

    with tf.name_scope('loss/object_predict'):
        onehot_gt = tf.one_hot(self.batch['obj_pred/labels'],
                               depth=self.num_answer)
        num_valid_entry = self.batch['obj_pred/num']
        valid_mask = tf.sequence_mask(num_valid_entry,
                                      maxlen=self.data_cfg.n_obj_pred,
                                      dtype=tf.float32)
        loss, acc, top_k_acc = \
            self.n_way_classification_loss(logit, onehot_gt, valid_mask)
        self.losses['object_pred'] = loss
        self.report['object_pred_loss'] = loss
        self.report['object_pred_acc'] = acc
        self.report['object_pred_top_{}_acc'.format(TOP_K)] = top_k_acc
ws_dict_path = os.path.join(
    config.data_dir,
    'wordset_dict5_depth{}.pkl'.format(int(config.expand_depth)))
ws_dict = cPickle.load(open(ws_dict_path, 'rb'))
num_ws = len(ws_dict['vocab'])

wordset_map = modules.learn_embedding_map(ws_dict, scope='wordset_map')
L_DIM = 1024
wordset_embed = tf.tanh(wordset_map)
wordset_ft = modules.fc_layer(
    wordset_embed, L_DIM, use_bias=True, use_bn=False, use_ln=True,
    activation_fn=tf.tanh, is_training=False, scope='wordset_ft')

session_config = tf.ConfigProto(
    allow_soft_placement=True,
    gpu_options=tf.GPUOptions(allow_growth=True),
    device_count={'GPU': 1})
sess = tf.Session(config=session_config)

all_vars = tf.global_variables()
checkpoint_loader = tf.train.Saver(var_list=all_vars, max_to_keep=1)

log.info('Checkpoint path: {}'.format(config.checkpoint))
checkpoint_loader.restore(sess, config.checkpoint)
log.info('Loaded the checkpoint')
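# A possible follow-up to the snippet above (an assumption, not part of the
# original script): evaluate the wordset features once the checkpoint is
# restored and dump them for downstream use. The output file name is
# illustrative.
wordset_ft_value = sess.run(wordset_ft)  # [num_ws, L_DIM] numpy array
cPickle.dump({'vocab': ws_dict['vocab'], 'wordset_ft': wordset_ft_value},
             open('wordset_ft_depth{}.pkl'.format(int(config.expand_depth)),
                  'wb'))
log.info('Dumped wordset features with shape {}'.format(
    wordset_ft_value.shape))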
def build_object_blank_fill(self):
    """ object_blank_fill """
    # [#obj, #proposal] x [#proposal, feat_dim] -> [#obj, feat_dim]
    V_ft = tf.matmul(self.batch['obj_blank_fill/weights'],
                     self.batch['image_ft'])
    v_linear_l = modules.fc_layer(
        V_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='pooled_linear_l')

    blank_embed = tf.nn.embedding_lookup(  # [bs, #proposal, len, W_DIM]
        self.l_word_map, self.batch['obj_blank_fill/blanks'])
    blank_len = self.batch['obj_blank_fill/blanks_len']
    blank_maxlen = tf.shape(blank_embed)[-2]
    flat_blank_ft = modules.encode_L(  # [bs * #proposal, L_DIM]
        tf.reshape(blank_embed, [-1, blank_maxlen, W_DIM]),
        tf.reshape(blank_len, [-1]),
        L_DIM, scope='encode_L_blank', cell_type='GRU')
    blank_ft = tf.reshape(flat_blank_ft,
                          [-1, self.data_cfg.n_obj_bf, L_DIM])

    fill_embed = tf.nn.embedding_lookup(self.l_answer_word_map,
                                        self.batch['obj_blank_fill/fills'])
    fill_embed2 = modules.fc_layer(
        fill_embed, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.tanh, is_training=self.is_train,
        scope='obj_blank_fill/fill_embed2')
    blank_fill_ft = blank_ft * fill_embed2

    fill_vec = modules.fc_layer(
        blank_fill_ft, L_DIM, use_bias=True, use_bn=False, use_ln=False,
        activation_fn=None, is_training=self.is_train,
        scope='obj_blank_fill/fill_vec')
    fill_log_sigma_sq = modules.fc_layer(
        blank_fill_ft, L_DIM, use_bias=True, use_bn=False, use_ln=False,
        activation_fn=None, is_training=self.is_train,
        scope='obj_blank_fill/fill_log_sigma_sq')
    fill_sigma = tf.sqrt(tf.exp(fill_log_sigma_sq))
    noise = tf.random_normal(tf.shape(fill_vec), mean=0, stddev=1, seed=123)
    fill_vec_noise = fill_vec + noise * fill_sigma
    self.vis_hist['obj_blank_fill/fill_vec'] = fill_vec
    self.vis_hist['obj_blank_fill/fill_sigma'] = fill_sigma

    l_linear_l = modules.fc_layer(
        fill_vec_noise, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='q_linear_l')

    joint = modules.fc_layer(
        v_linear_l * l_linear_l, L_DIM * 2,
        use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='joint_fc')
    joint = tf.nn.dropout(joint, 0.5)

    logit = modules.fc_layer(
        joint, self.num_answer, use_bias=True, use_bn=False, use_ln=False,
        activation_fn=None, is_training=self.is_train, scope='classifier')
    self.mid_result['obj_blank_fill/logit'] = logit  # [bs, #obj, #answer]

    with tf.name_scope('loss/obj_blank_fill'):
        onehot_gt = tf.one_hot(self.batch['obj_blank_fill/fills'],
                               depth=self.num_answer)
        num_valid_entry = self.batch['obj_blank_fill/num']
        valid_mask = tf.sequence_mask(num_valid_entry,
                                      maxlen=self.data_cfg.n_obj_bf,
                                      dtype=tf.float32)
        loss, acc, top_k_acc = \
            self.n_way_classification_loss(logit, onehot_gt, valid_mask)
        latent_loss = self.latent_loss(fill_vec, fill_log_sigma_sq)
        self.losses['obj_blank_fill'] = loss
        self.losses['obj_blank_fill_latent'] = \
            self.latent_loss_weight * latent_loss
        self.report['obj_blank_fill_loss'] = loss
        self.report['obj_blank_fill_latent_loss'] = latent_loss
        self.report['obj_blank_fill_train_latent_loss'] = \
            self.losses['obj_blank_fill_latent']
        self.report['obj_blank_fill_acc'] = acc
        self.report['obj_blank_fill_top_{}_acc'.format(TOP_K)] = top_k_acc
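# NOTE: fill_vec / fill_log_sigma_sq above follow the usual reparameterization
# trick (sample = mu + eps * sigma). self.latent_loss is therefore most likely
# the KL divergence of N(mu, sigma^2) from the standard normal prior; the
# sketch below shows that standard form, though the repo's exact reduction
# (sum vs. mean, masking) is an assumption.
import tensorflow as tf


def latent_loss_sketch(mu, log_sigma_sq):
    """KL( N(mu, sigma^2) || N(0, I) ), averaged over all entries."""
    kl = 0.5 * tf.reduce_sum(
        tf.square(mu) + tf.exp(log_sigma_sq) - log_sigma_sq - 1.0, axis=-1)
    return tf.reduce_mean(kl)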
def build_attribute_predict(self):
    """ attribute_predict """
    # [#attr, #proposal] x [#proposal, feat_dim] -> [#attr, feat_dim]
    V_ft = tf.matmul(self.batch['attr_pred/weights'],
                     self.batch['image_ft'])
    v_linear_l = modules.fc_layer(
        V_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='pooled_linear_l')

    obj_embed = tf.nn.embedding_lookup(
        self.l_answer_word_map, self.batch['attr_pred/object_labels'])
    obj_vec = modules.fc_layer(
        obj_embed, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.tanh, is_training=self.is_train,
        scope='attr_pred/obj_vector')

    # [bs, #attr, #answer_vocab] x [#answer_vocab, W_DIM]
    # -> [bs, #attr, W_DIM]
    attr_embed = tf.nn.embedding_lookup(
        self.l_answer_word_map,
        self.batch['attr_pred/random_attribute_labels'])
    attr_embed2 = modules.fc_layer(
        attr_embed, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.tanh, is_training=self.is_train,
        scope='attr_pred/attr_embed2')
    obj_attr_embed = obj_vec * attr_embed2

    attr_vec = modules.fc_layer(
        obj_attr_embed, L_DIM, use_bias=True, use_bn=False, use_ln=False,
        activation_fn=None, is_training=self.is_train,
        scope='attr_pred/attr_vec')
    attr_log_sigma_sq = modules.fc_layer(
        obj_attr_embed, L_DIM, use_bias=True, use_bn=False, use_ln=False,
        activation_fn=None, is_training=self.is_train,
        scope='attr_pred/attr_log_sigma_sq')
    attr_sigma = tf.sqrt(tf.exp(attr_log_sigma_sq))
    noise = tf.random_normal(tf.shape(attr_vec), mean=0, stddev=1, seed=123)
    attr_vec_noise = attr_vec + noise * attr_sigma
    self.vis_hist['attr_pred/attr_vec'] = attr_vec
    self.vis_hist['attr_pred/attr_sigma'] = attr_sigma

    l_linear_l = modules.fc_layer(
        attr_vec_noise, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='q_linear_l')

    joint = modules.fc_layer(
        v_linear_l * l_linear_l, L_DIM * 2,
        use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='joint_fc')
    joint = tf.nn.dropout(joint, 0.5)

    logit = modules.fc_layer(
        joint, self.num_answer, use_bias=True, use_bn=False, use_ln=False,
        activation_fn=None, is_training=self.is_train, scope='classifier')
    self.mid_result['attr_pred/logit'] = logit  # [bs, #attr, #answer]

    with tf.name_scope('loss/attr_predict'):
        onehot_gt = tf.one_hot(
            self.batch['attr_pred/random_attribute_labels'],
            depth=self.num_answer)
        num_valid_entry = self.batch['attr_pred/num']
        valid_mask = tf.sequence_mask(num_valid_entry,
                                      maxlen=self.data_cfg.n_attr_pred,
                                      dtype=tf.float32)
        loss, acc, top_k_acc = \
            self.n_way_classification_loss(logit, onehot_gt, valid_mask)
        latent_loss = self.latent_loss(attr_vec, attr_log_sigma_sq)
        self.losses['attr_pred'] = loss
        self.losses['attr_pred_latent'] = \
            self.latent_loss_weight * latent_loss
        self.report['attr_pred_loss'] = loss
        self.report['attr_pred_latent_loss'] = latent_loss
        self.report['attr_pred_train_latent_loss'] = \
            self.losses['attr_pred_latent']
        self.report['attr_pred_acc'] = acc
        self.report['attr_pred_top_{}_acc'.format(TOP_K)] = top_k_acc
def build_object_predict(self):
    """ object_predict """
    # [#obj, #proposal] x [#proposal, feat_dim] -> [#obj, feat_dim]
    V_ft = tf.matmul(self.batch['obj_pred/weights'],
                     self.batch['image_ft'])
    v_linear_l = modules.fc_layer(
        V_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='pooled_linear_l')

    wordset_embed = tf.tanh(
        tf.nn.embedding_lookup(self.wordset_map,
                               self.batch['obj_pred/wordsets']))
    wordset_ft = modules.fc_layer(
        wordset_embed, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.tanh, is_training=self.is_train,
        scope='wordset_ft2')

    l_linear_l = modules.fc_layer(
        wordset_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='q_linear_l')

    joint = modules.fc_layer(
        v_linear_l * l_linear_l, L_DIM * 2,
        use_bias=True, use_bn=False, use_ln=True,
        activation_fn=tf.nn.relu, is_training=self.is_train,
        scope='joint_fc')
    joint = tf.nn.dropout(joint, 0.5)

    logit = modules.fc_layer(
        joint, self.num_answer, use_bias=True, use_bn=False, use_ln=False,
        activation_fn=None, is_training=self.is_train, scope='classifier')
    self.mid_result['obj_pred/logit'] = logit  # [bs, #obj, #answer]

    with tf.name_scope('loss/object_predict'):
        onehot_gt = tf.one_hot(self.batch['obj_pred/labels'],
                               depth=self.num_answer)
        num_valid_entry = self.batch['obj_pred/num']
        valid_mask = tf.sequence_mask(num_valid_entry,
                                      maxlen=self.data_cfg.n_obj_pred,
                                      dtype=tf.float32)
        loss, acc, top_k_acc = \
            self.n_way_classification_loss(logit, onehot_gt, valid_mask)
        self.losses['object_pred'] = loss
        self.report['object_pred_loss'] = loss
        self.report['object_pred_acc'] = acc
        self.report['object_pred_top_{}_acc'.format(TOP_K)] = top_k_acc