def build_caption_attention(self):
        """
        Build the caption-attention branch and register its loss/metrics.

        Word tokens of each caption entry are embedded, projected with
        ``modules.fc_layer`` and mean-pooled over the positions selected by
        a sequence mask.  The pooled feature is projected to ``V_DIM`` and
        passed, together with the tiled visual features from
        ``self.mid_result['v_linear_v']``, to ``modules.hadamard_attention``.
        The resulting logits are trained against a multi-label target made
        by thresholding ``self.batch['cap_att/att_scores']`` at 0.5.

        Side effects: writes ``self.losses['caption_att']`` and the
        ``caption_att_*`` entries of ``self.report``.  Returns ``None``.
        """
        num_V_ft = self.batch['num_boxes']
        v_linear_v = self.mid_result['v_linear_v']
        n_entry = self.data_cfg.n_cap_att
        n_proposal = self.data_cfg.max_box_num

        w_embed = tf.nn.embedding_lookup(self.v_word_map,
                                         self.batch['cap_att/word_tokens'])
        w_L_ft = modules.fc_layer(  # [bs, #proposal, len, L_DIM]
            w_embed, L_DIM, use_bias=True, use_bn=False, use_ln=True,
            activation_fn=tf.nn.relu, is_training=self.is_train,
            scope='v_word_fc')
        w_len = self.batch['cap_att/word_tokens_len']
        mask = tf.sequence_mask(  # [bs, #proposal, len]
            w_len, maxlen=tf.shape(w_L_ft)[-2],
            dtype=tf.float32)
        # Masked sum over the word axis, then divide by the length to get
        # the mean over valid words only.
        pooled_w_L_ft = tf.reduce_sum(w_L_ft * tf.expand_dims(mask, axis=-1),
                                      axis=-2)
        # Clamp the divisor to >= 1: an entry with length 0 has an all-zero
        # masked sum, and 0 / 0 would yield NaN that survives later
        # multiplicative masking of the loss.  Entries with length >= 1 are
        # unaffected.
        safe_w_len = tf.maximum(tf.to_float(w_len), 1.0)
        pooled_w_L_ft = pooled_w_L_ft / tf.expand_dims(safe_w_len, axis=-1)

        l_linear_v = modules.fc_layer(
            pooled_w_L_ft, V_DIM, use_bias=True, use_bn=False, use_ln=True,
            activation_fn=tf.nn.relu, is_training=self.is_train,
            scope='q_linear_v')

        # Repeat the per-image visual features (and box counts) once per
        # caption entry, then flatten batch and entry axes together so the
        # attention module sees one query per row.
        tile_v_linear_v = tf.tile(tf.expand_dims(v_linear_v, axis=1),
                                  [1, n_entry, 1, 1])
        flat_tile_v_linear_v = tf.reshape(tile_v_linear_v,
                                          [-1, n_proposal, V_DIM])
        tile_num_V_ft = tf.tile(tf.expand_dims(num_V_ft, axis=1),
                                [1, n_entry])
        flat_tile_num_V_ft = tf.reshape(tile_num_V_ft, [-1])

        flat_l_linear_v = tf.reshape(l_linear_v, [-1, V_DIM])

        # flat_att_logit: [bs * n_cap_att, max_box_num] (per the reshape below)
        flat_att_logit = modules.hadamard_attention(
            flat_tile_v_linear_v, flat_tile_num_V_ft, flat_l_linear_v,
            use_ln=False, is_train=self.is_train, normalizer=None)

        logit = tf.reshape(flat_att_logit, [-1, n_entry, n_proposal])

        with tf.name_scope('loss/caption_attend'):
            multilabel_gt = tf.to_float(
                tf.greater(self.batch['cap_att/att_scores'], 0.5))
            num_valid_entry = self.batch['cap_att/num']
            # 1.0 for the first `cap_att/num` entries of each example, else 0.
            valid_mask = tf.sequence_mask(
                num_valid_entry, maxlen=n_entry,
                dtype=tf.float32)
            loss, acc, recall, precision, top_1_prec, top_k_recall = \
                self.binary_classification_loss(logit, multilabel_gt, valid_mask,
                                                depth=n_proposal)
            self.losses['caption_att'] = loss
            self.report['caption_att_loss'] = loss
            self.report['caption_att_acc'] = acc
            self.report['caption_att_recall'] = recall
            self.report['caption_att_precision'] = precision
            self.report['caption_att_top_1_prec'] = top_1_prec
            self.report['caption_att_top_{}_recall'.format(TOP_K)] = top_k_recall
    def build_attribute_blank_fill(self):
        """
        Build the attribute blank-fill branch and register its loss/metrics.

        Visual features are pooled with the weights in
        ``self.batch['attr_blank_fill/weights']``; the blank sentences are
        embedded and encoded with ``modules.encode_L`` (GRU).  Both are
        projected to ``L_DIM``, multiplied element-wise, and classified over
        ``self.num_answer`` answers.

        Side effects: stores the logits in
        ``self.mid_result['attr_blank_fill/logit']``, writes
        ``self.losses['attr_blank_fill']`` and the ``attr_blank_fill_*``
        entries of ``self.report``.  Returns ``None``.
        """
        n_entry = self.data_cfg.n_attr_bf

        # [#attr, #proposal] x [#proposal x feat_dim] -> [#attr, feat_dim]
        V_ft = tf.matmul(self.batch['attr_blank_fill/weights'],
                         self.batch['image_ft'])
        v_linear_l = modules.fc_layer(V_ft,
                                      L_DIM,
                                      use_bias=True,
                                      use_bn=False,
                                      use_ln=True,
                                      activation_fn=tf.nn.relu,
                                      is_training=self.is_train,
                                      scope='pooled_linear_l')

        # assumes blanks are [bs, #attr, len] token ids -- TODO confirm
        blank_embed = tf.nn.embedding_lookup(
            self.l_word_map, self.batch['attr_blank_fill/blanks'])
        blank_len = self.batch['attr_blank_fill/blanks_len']
        blank_maxlen = tf.shape(blank_embed)[-2]
        # Flatten all leading axes so the encoder sees one sequence per row.
        flat_blank_ft = modules.encode_L(
            tf.reshape(blank_embed, [-1, blank_maxlen, W_DIM]),
            tf.reshape(blank_len, [-1]),
            L_DIM,
            scope='encode_L_blank',
            cell_type='GRU')
        # Restore the per-example entry axis.  This must use the same entry
        # count as the validity mask below (n_attr_bf); the previous
        # n_obj_bf only worked when the two config values happened to match.
        blank_ft = tf.reshape(flat_blank_ft,
                              [-1, n_entry, L_DIM])

        l_linear_l = modules.fc_layer(blank_ft,
                                      L_DIM,
                                      use_bias=True,
                                      use_bn=False,
                                      use_ln=True,
                                      activation_fn=tf.nn.relu,
                                      is_training=self.is_train,
                                      scope='q_linear_l')

        joint = modules.fc_layer(v_linear_l * l_linear_l,
                                 L_DIM * 2,
                                 use_bias=True,
                                 use_bn=False,
                                 use_ln=True,
                                 activation_fn=tf.nn.relu,
                                 is_training=self.is_train,
                                 scope='joint_fc')
        # NOTE(review): dropout is applied unconditionally here (not gated
        # on self.is_train) -- confirm this is intended outside training.
        joint = tf.nn.dropout(joint, 0.5)

        logit = modules.fc_layer(joint,
                                 self.num_answer,
                                 use_bias=True,
                                 use_bn=False,
                                 use_ln=False,
                                 activation_fn=None,
                                 is_training=self.is_train,
                                 scope='classifier')
        self.mid_result[
            'attr_blank_fill/logit'] = logit  # [bs, #attr, #answer]

        with tf.name_scope('loss/attr_blank_fill'):
            onehot_gt = tf.one_hot(self.batch['attr_blank_fill/fills'],
                                   depth=self.num_answer)
            num_valid_entry = self.batch['attr_blank_fill/num']
            # 1.0 for the first `attr_blank_fill/num` entries, else 0.
            valid_mask = tf.sequence_mask(num_valid_entry,
                                          maxlen=n_entry,
                                          dtype=tf.float32)
            loss, acc, top_k_acc = \
                self.n_way_classification_loss(logit, onehot_gt, valid_mask)
            self.losses['attr_blank_fill'] = loss
            self.report['attr_blank_fill_loss'] = loss
            self.report['attr_blank_fill_acc'] = acc
            self.report['attr_blank_fill_top_{}_acc'.format(TOP_K)] = top_k_acc
    def build(self):
        """
        build network architecture and loss

        Loads per-image box features, projects them, runs an LSTM whose
        initial state comes from the average-pooled visual features, builds
        an answer classifier, and computes the sigmoid cross-entropy loss
        plus a set of accuracy metrics.  Scalar and image summaries are
        registered for the train / val / testval collections.

        Side effects: fills ``self.mid_result``, ``self.output``,
        ``self.losses['answer']`` and ``self.report``; creates
        ``self.att_lstm`` and ``self.lang_lstm``; sets ``self.loss``.

        Returns:
            ``self.loss``, which is ``self.losses['answer']``.

        NOTE(review): this method looks unfinished.  ``lstm_in``,
        ``state_t``, ``pooled_V_ft`` and ``q_L_ft`` are read below but are
        not assigned anywhere in this method, and ``tf.concat`` is called
        without an axis.  Those lines are left as found because the
        intended values cannot be determined from this code; only the
        syntax error on the average-pooling line was fixed.
        """

        # ---- Visual features ----
        with tf.device('/cpu:0'):
            def load_feature(image_idx):
                selected_features = np.take(self.features, image_idx, axis=0)
                return selected_features
            V_ft = tf.py_func(
                load_feature, inp=[self.batch['image_idx']], Tout=tf.float32,
                name='sample_features')
            # [B, # of box, dim]
            # py_func loses static shape information, so restore it here.
            V_ft.set_shape([None, self.max_box_num, self.vfeat_dim])
            num_V_ft = tf.gather(self.num_boxes, self.batch['image_idx'],
                                 name='gather_num_V_ft', axis=0)
            self.mid_result['num_V_ft'] = num_V_ft
            normal_boxes = tf.gather(self.normal_boxes, self.batch['image_idx'],
                                     name='gather_normal_boxes', axis=0)
            self.mid_result['normal_boxes'] = normal_boxes

        log.warning('v_linear_v')
        # [B, # of box, V_DIM]
        # NOTE(review): v_linear_v is not used again in this method.
        v_linear_v = modules.fc_layer(
            V_ft, V_DIM, use_bias=True, use_bn=False, use_ln=True,
            activation_fn=tf.nn.relu, is_training=self.is_train,
            scope='v_linear_v')

        # ---- Average pooling ----
        # [B, # of box, dim] -> [B, dim]
        # (fixed: the assignment operator was missing on this line)
        avg_pooled_V_ft = tf.reduce_mean(V_ft, 1, keepdims=False)

        # [B, R_DIM * 2]
        c_0_h_0 = modules.fc_layer(
            avg_pooled_V_ft, R_DIM * 2, use_bias=True, use_bn=False, use_ln=True,
            activation_fn=tf.nn.relu, is_training=self.is_train,
            scope='c_0_h_0')

        # c_0, h_0: [B, R_DIM]
        c_0, h_0 = tf.split(c_0_h_0, [R_DIM, R_DIM], 1)
        state_in = tc.LSTMStateTuple(c_0, h_0)

        # NOTE(review): att_lstm is created but not used in this method.
        self.att_lstm = tc.BasicLSTMCell(R_DIM)
        self.lang_lstm = tc.BasicLSTMCell(R_DIM)

        # NOTE(review): `lstm_in` is not defined in this method.
        lstm_out, lstm_state = tf.nn.dynamic_rnn(
            self.lang_lstm,
            # [batch_size, max_time, ...]
            lstm_in,
            # [batch_size, cell.state_size]
            initial_state=state_in,
            time_major=False)

        lstm_c, lstm_h = lstm_state

        # TODO(taehoon): time should be shifted

        # gate [batch_size, max_time, 1]
        # NOTE(review): lstm_c / lstm_h come from the state returned by
        # dynamic_rnn, not from lstm_out -- confirm the max_time axis in
        # these shape comments.
        gate = modules.fc_layer(
            lstm_h, 1, use_bias=True, use_bn=False, use_ln=True,
            activation_fn=tf.nn.sigmoid, is_training=self.is_train,
            scope='gate')
        # visual sentinel [batch_size, max_time, R_DIM]
        sentinel = gate * tf.nn.tanh(lstm_c)

        sentinel_linear = modules.fc_layer(
            sentinel, L_DIM, use_bias=True, use_bn=False, use_ln=True,
            activation_fn=tf.nn.relu, is_training=self.is_train,
            scope='sentinel_linear')
        h_linear = modules.fc_layer(
            lstm_h, L_DIM, use_bias=True, use_bn=False, use_ln=True,
            activation_fn=tf.nn.relu, is_training=self.is_train,
            scope='h_linear')

        # NOTE(review): `state_t` is not defined in this method and no
        # concat axis is given; this `logit` is also overwritten by the
        # classifier output below.
        logit = tf.nn.tanh(tf.concat([state_t, lstm_h]))

        # ---- Answer classification ----
        # perform two layer feature encoding and predict output
        with tf.variable_scope('reasoning') as scope:
            log.warning(scope.name)
            # [bs, L_DIM]
            log.warning('pooled_linear_l')
            # NOTE(review): `pooled_V_ft` is not defined in this method.
            pooled_linear_l = modules.fc_layer(
                pooled_V_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
                activation_fn=tf.nn.relu, is_training=self.is_train,
                scope='pooled_linear_l')

            log.warning('q_linear_l')
            # NOTE(review): `q_L_ft` is not defined in this method.
            q_linear_l = modules.fc_layer(
                q_L_ft, L_DIM, use_bias=True, use_bn=False, use_ln=True,
                activation_fn=tf.nn.relu, is_training=self.is_train,
                scope='q_linear_l')

            joint = modules.fc_layer(
                pooled_linear_l * q_linear_l, 2048,
                use_bias=True, use_bn=False, use_ln=True,
                activation_fn=tf.nn.relu, is_training=self.is_train, scope='joint_fc')
            # NOTE(review): dropout is not gated on self.is_train.
            joint = tf.nn.dropout(joint, 0.5)

            logit = modules.fc_layer(
                joint, self.num_answer,
                use_bias=True, use_bn=False, use_ln=False,
                activation_fn=None, is_training=self.is_train, scope='classifier')
        self.output['logit'] = logit

        # ---- Compute loss and accuracy ----
        with tf.name_scope('loss'):
            answer_target = self.batch['answer_target']
            loss = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=answer_target, logits=logit)
            # Sum over answers, mean over the batch.
            loss = tf.reduce_mean(tf.reduce_sum(loss, axis=-1))
            pred = tf.cast(tf.argmax(logit, axis=-1), dtype=tf.int32)
            one_hot_pred = tf.one_hot(pred, depth=self.num_answer,
                                      dtype=tf.float32)
            self.output['pred'] = pred
            # Target score of the predicted answer, per example.
            all_score = tf.reduce_sum(one_hot_pred * answer_target, axis=-1)
            max_train_score = tf.reduce_max(
                answer_target * self.train_answer_mask, axis=-1)
            self.output['all_score'] = all_score
            self.output['max_train_score'] = max_train_score

            # "*_acc": mean target score of the predicted answer, optionally
            # restricted to an answer subset by multiplying with a mask.
            acc = tf.reduce_mean(
                tf.reduce_sum(one_hot_pred * answer_target, axis=-1))
            exist_acc = tf.reduce_mean(
                tf.reduce_sum(one_hot_pred * answer_target * self.answer_exist_mask,
                              axis=-1))
            test_acc = tf.reduce_mean(
                tf.reduce_sum(one_hot_pred * answer_target * self.test_answer_mask,
                              axis=-1))
            test_obj_acc = tf.reduce_mean(
                tf.reduce_sum(one_hot_pred * answer_target * self.test_answer_mask *
                              self.obj_answer_mask, axis=-1))
            test_attr_acc = tf.reduce_mean(
                tf.reduce_sum(one_hot_pred * answer_target * self.test_answer_mask *
                              self.attr_answer_mask, axis=-1))
            train_exist_acc = tf.reduce_mean(
                tf.reduce_sum(one_hot_pred * answer_target * self.answer_exist_mask *
                              self.train_answer_mask,
                              axis=-1))
            # "max_*": mean of the best target score available within the
            # masked answer subset; used below as the normalizer.
            max_exist_answer_acc = tf.reduce_mean(
                tf.reduce_max(answer_target * self.answer_exist_mask, axis=-1))
            max_train_exist_acc = tf.reduce_mean(
                tf.reduce_max(answer_target * self.answer_exist_mask *
                              self.train_answer_mask, axis=-1))
            test_obj_max_acc = tf.reduce_mean(
                tf.reduce_max(answer_target * self.test_answer_mask *
                              self.obj_answer_mask, axis=-1))
            test_attr_max_acc = tf.reduce_mean(
                tf.reduce_max(answer_target * self.test_answer_mask *
                              self.attr_answer_mask, axis=-1))
            test_max_answer_acc = tf.reduce_mean(
                tf.reduce_max(answer_target * self.test_answer_mask, axis=-1))
            test_max_exist_answer_acc = tf.reduce_mean(
                tf.reduce_max(answer_target * self.answer_exist_mask *
                              self.test_answer_mask, axis=-1))
            # "normal_*": accuracy divided by its maximum; when the maximum
            # is 0 the (zero) maximum itself is reported instead.
            normal_test_obj_acc = tf.where(
                tf.equal(test_obj_max_acc, 0),
                test_obj_max_acc,
                test_obj_acc / test_obj_max_acc)
            normal_test_attr_acc = tf.where(
                tf.equal(test_attr_max_acc, 0),
                test_attr_max_acc,
                test_attr_acc / test_attr_max_acc)
            normal_train_exist_acc = tf.where(
                tf.equal(max_train_exist_acc, 0),
                max_train_exist_acc,
                train_exist_acc / max_train_exist_acc)
            normal_exist_acc = tf.where(
                tf.equal(max_exist_answer_acc, 0),
                max_exist_answer_acc,
                exist_acc / max_exist_answer_acc)
            normal_test_acc = tf.where(
                tf.equal(test_max_answer_acc, 0),
                test_max_answer_acc,
                test_acc / test_max_answer_acc)

            self.mid_result['pred'] = pred

            self.losses['answer'] = loss
            self.report['answer_train_loss'] = loss
            self.report['answer_report_loss'] = loss
            self.report['answer_acc'] = acc
            self.report['exist_acc'] = exist_acc
            self.report['test_acc'] = test_acc
            self.report['normal_test_acc'] = normal_test_acc
            self.report['normal_test_object_acc'] = normal_test_obj_acc
            self.report['normal_test_attribute_acc'] = normal_test_attr_acc
            self.report['normal_exist_acc'] = normal_exist_acc
            self.report['normal_train_exist_acc'] = normal_train_exist_acc
            self.report['max_exist_acc'] = max_exist_answer_acc
            self.report['test_max_acc'] = test_max_answer_acc
            self.report['test_max_exist_acc'] = test_max_exist_answer_acc

        # ---- Prepare image summary (currently disabled) ----
        # with tf.name_scope('prepare_summary'):
        #     self.vis_image['image_attention_qa'] = self.visualize_vqa_result(
        #         self.batch['image_id'],
        #         self.mid_result['normal_boxes'], self.mid_result['num_V_ft'],
        #         self.mid_result['att_score'],
        #         self.batch['q_intseq'], self.batch['q_intseq_len'],
        #         self.batch['answer_target'], self.mid_result['pred'],
        #         max_batch_num=20, line_width=2)

        self.loss = self.losses['answer']

        # scalar summary
        for key, val in self.report.items():
            tf.summary.scalar('train/{}'.format(key), val,
                              collections=['heavy_train', 'train'])
            tf.summary.scalar('val/{}'.format(key), val,
                              collections=['heavy_val', 'val'])
            tf.summary.scalar('testval/{}'.format(key), val,
                              collections=['heavy_testval', 'testval'])

        # image summary
        for key, val in self.vis_image.items():
            tf.summary.image('train-{}'.format(key), val, max_outputs=10,
                             collections=['heavy_train'])
            tf.summary.image('val-{}'.format(key), val, max_outputs=10,
                             collections=['heavy_val'])
            tf.summary.image('testval-{}'.format(key), val, max_outputs=10,
                             collections=['heavy_testval'])

        return self.loss
# Example #4
# 0
    def build(self):
        """
        Assemble the network graph, its loss, metrics and summaries.

        Stages, in order: load and project box features, encode the
        question with a GRU, attend over boxes with
        ``modules.hadamard_attention``, pool, fuse with the question
        feature, and score answers with ``modules.WordWeightAnswer``.

        Side effects: fills ``self.mid_result``, ``self.losses['answer']``
        and ``self.report``; registers scalar/image summaries; sets
        ``self.loss`` to the sum of every entry of ``self.losses``.

        Returns:
            ``self.loss``.
        """
        # Keyword arguments shared by every ReLU + layer-norm projection.
        relu_ln_fc = dict(use_bias=True,
                          use_bn=False,
                          use_ln=True,
                          activation_fn=tf.nn.relu,
                          is_training=self.is_train)

        def mean_row_sum(per_answer):
            # Sum over the answer axis, then average over the batch.
            return tf.reduce_mean(tf.reduce_sum(per_answer, axis=-1))

        def mean_row_max(per_answer):
            # Max over the answer axis, then average over the batch.
            return tf.reduce_mean(tf.reduce_max(per_answer, axis=-1))

        # ---- Visual features ----
        image_idx = self.batch['image_idx']
        with tf.device('/cpu:0'):

            def load_feature(idx):
                return np.take(self.features, idx, axis=0)

            box_ft = tf.py_func(load_feature,
                                inp=[image_idx],
                                Tout=tf.float32,
                                name='sample_features')
            # py_func drops the static shape; put it back.
            box_ft.set_shape([None, self.max_box_num, self.vfeat_dim])
            box_count = tf.gather(self.num_boxes,
                                  image_idx,
                                  name='gather_num_V_ft',
                                  axis=0)
            self.mid_result['num_V_ft'] = box_count
            self.mid_result['normal_boxes'] = tf.gather(
                self.normal_boxes,
                image_idx,
                name='gather_normal_boxes',
                axis=0)

        log.warning('v_linear_v')
        box_proj_v = modules.fc_layer(box_ft, V_DIM,
                                      scope='v_linear_v', **relu_ln_fc)

        # ---- Encode question ----
        question_embed = tf.nn.embedding_lookup(self.glove_map,
                                                self.batch['q_intseq'])
        # [bs, L_DIM]
        question_ft = modules.encode_L(question_embed,
                                       self.batch['q_intseq_len'],
                                       L_DIM,
                                       cell_type='GRU')
        # Plain linear map: no layer norm, no activation.
        question_mean = modules.fc_layer(question_ft,
                                         L_DIM,
                                         use_bias=True,
                                         use_bn=False,
                                         use_ln=False,
                                         activation_fn=None,
                                         is_training=self.is_train,
                                         scope='q_L_mean')

        # [bs, V_DIM]
        log.warning('q_linear_v')
        question_proj_v = modules.fc_layer(question_ft, V_DIM,
                                           scope='q_linear_v', **relu_ln_fc)
        self.mid_result['q_linear_v'] = question_proj_v

        # ---- Perform attention ----
        attention = modules.hadamard_attention(box_proj_v,
                                               box_count,
                                               question_proj_v,
                                               use_ln=False,
                                               is_train=self.is_train)
        self.mid_result['att_score'] = attention
        attended_ft = modules.attention_pooling(box_ft, attention)
        self.mid_result['pooled_V_ft'] = attended_ft

        # ---- Answer classification ----
        log.warning('pooled_linear_l')
        visual_proj_l = modules.fc_layer(attended_ft, L_DIM,
                                         scope='pooled_linear_l',
                                         **relu_ln_fc)
        self.mid_result['pooled_linear_l'] = visual_proj_l

        log.warning('q_linear_l')
        question_proj_l = modules.fc_layer(question_mean, L_DIM,
                                           scope='q_linear_l', **relu_ln_fc)
        self.mid_result['l_linear_l'] = question_proj_l

        fused = modules.fc_layer(visual_proj_l * question_proj_l, L_DIM * 2,
                                 scope='joint_fc', **relu_ln_fc)
        fused = tf.nn.dropout(fused, 0.5)
        self.mid_result['joint'] = fused

        logit = modules.WordWeightAnswer(fused,
                                         self.answer_dict,
                                         self.word_weight_dir,
                                         use_bias=True,
                                         is_training=self.is_train,
                                         scope='WordWeightAnswer')
        self.mid_result['logit'] = logit

        # ---- Compute loss and accuracy ----
        with tf.name_scope('loss'):
            answer_target = self.batch['answer_target']
            xent = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=answer_target, logits=logit)

            # The training loss only counts answers selected by the train
            # mask; the reported loss covers every answer.
            train_loss = mean_row_sum(xent * self.train_answer_mask)
            report_loss = mean_row_sum(xent)

            pred = tf.cast(tf.argmax(logit, axis=-1), dtype=tf.int32)
            one_hot_pred = tf.one_hot(pred,
                                      depth=self.num_answer,
                                      dtype=tf.float32)

            acc = mean_row_sum(one_hot_pred * answer_target)
            exist_acc = mean_row_sum(
                one_hot_pred * answer_target * self.answer_exist_mask)
            test_acc = mean_row_sum(
                one_hot_pred * answer_target * self.test_answer_mask)

            max_exist_answer_acc = mean_row_max(
                answer_target * self.answer_exist_mask)
            test_max_answer_acc = mean_row_max(
                answer_target * self.test_answer_mask)
            test_max_exist_answer_acc = mean_row_max(
                answer_target * self.answer_exist_mask *
                self.test_answer_mask)

            # Test accuracy relative to its maximum; when the maximum is 0
            # that zero is reported as-is.
            normal_test_acc = tf.where(
                tf.equal(test_max_answer_acc, 0),
                test_max_answer_acc,
                test_acc / test_max_answer_acc)

            self.mid_result['pred'] = pred

            self.losses['answer'] = train_loss
            self.report['answer_train_loss'] = train_loss
            self.report['answer_report_loss'] = report_loss
            self.report['answer_accuracy'] = acc
            self.report['exist_answer_accuracy'] = exist_acc
            self.report['test_answer_accuracy'] = test_acc
            self.report['normal_test_answer_accuracy'] = normal_test_acc
            self.report['max_exist_answer_accuracy'] = max_exist_answer_acc
            self.report['test_max_answer_accuracy'] = test_max_answer_acc
            self.report[
                'test_max_exist_answer_accuracy'] = test_max_exist_answer_acc

        # ---- Prepare image summary (currently disabled) ----
        # with tf.name_scope('prepare_summary'):
        #     self.vis_image['image_attention_qa'] = self.visualize_vqa_result(
        #         self.batch['image_id'],
        #         self.mid_result['normal_boxes'], self.mid_result['num_V_ft'],
        #         self.mid_result['att_score'],
        #         self.batch['q_intseq'], self.batch['q_intseq_len'],
        #         self.batch['answer_target'], self.mid_result['pred'],
        #         max_batch_num=20, line_width=2)

        # Total loss: every registered loss term added together.
        self.loss = sum(self.losses.values())

        # scalar summary: one per report entry and data split
        scalar_specs = (
            ('train/{}', ('heavy_train', 'train')),
            ('val/{}', ('heavy_val', 'val')),
            ('testval/{}', ('heavy_testval', 'testval')),
        )
        for key, val in self.report.items():
            for name_fmt, collections in scalar_specs:
                tf.summary.scalar(name_fmt.format(key),
                                  val,
                                  collections=list(collections))

        # image summary: one per visualization and data split
        image_specs = (
            ('train-{}', ('heavy_train',)),
            ('val-{}', ('heavy_val',)),
            ('testval-{}', ('heavy_testval',)),
        )
        for key, val in self.vis_image.items():
            for name_fmt, collections in image_specs:
                tf.summary.image(name_fmt.format(key),
                                 val,
                                 max_outputs=10,
                                 collections=list(collections))

        return self.loss
    def build_attribute_predict(self):
        """
        Build the attribute-prediction branch and register its loss/metrics.

        Visual features pooled with ``self.batch['attr_pred/weights']`` and
        embedded object labels are each projected to ``L_DIM``, multiplied
        element-wise, and classified over ``self.num_answer`` outputs with a
        multi-label (binary) loss.

        Side effects: stores ``attr_pred/reg_l_ft`` and ``attr_pred/logit``
        in ``self.mid_result``, writes ``self.losses['attr_pred']`` and the
        ``attr_pred_*`` entries of ``self.report``.  Returns ``None``.
        """
        # Keyword arguments common to every fc_layer call in this branch
        # except for the activation and layer-norm switches.
        base_fc = dict(use_bias=True, use_bn=False,
                       is_training=self.is_train)

        # [#attr, #proposal] x [#proposal x feat_dim] -> [#attr,feat_dim]
        pooled_visual = tf.matmul(self.batch['attr_pred/weights'],
                                  self.batch['image_ft'])
        visual_proj = modules.fc_layer(pooled_visual, L_DIM,
                                       use_ln=True,
                                       activation_fn=tf.nn.relu,
                                       scope='pooled_linear_l',
                                       **base_fc)

        label_embed = tf.nn.embedding_lookup(
            self.l_answer_word_map, self.batch['attr_pred/object_labels'])
        label_ft = modules.fc_layer(label_embed, L_DIM,
                                    use_ln=True,
                                    activation_fn=tf.nn.tanh,
                                    scope='attr_pred/encode_object_labels',
                                    **base_fc)
        self.mid_result['attr_pred/reg_l_ft'] = label_ft

        label_proj = modules.fc_layer(label_ft, L_DIM,
                                      use_ln=True,
                                      activation_fn=tf.nn.relu,
                                      scope='q_linear_l',
                                      **base_fc)

        # Element-wise fusion of the visual and label projections.
        fused = modules.fc_layer(visual_proj * label_proj, L_DIM * 2,
                                 use_ln=True,
                                 activation_fn=tf.nn.relu,
                                 scope='joint_fc',
                                 **base_fc)
        fused = tf.nn.dropout(fused, 0.5)

        # Raw scores: no layer norm, no activation.
        logit = modules.fc_layer(fused, self.num_answer,
                                 use_ln=False,
                                 activation_fn=None,
                                 scope='classifier',
                                 **base_fc)
        self.mid_result['attr_pred/logit'] = logit  # [bs, #attr, #answer]

        with tf.name_scope('loss/attr_predict'):
            targets = self.batch['attr_pred/labels']
            # 1.0 for the first `attr_pred/num` entries of each example.
            entry_mask = tf.sequence_mask(self.batch['attr_pred/num'],
                                          maxlen=self.data_cfg.n_attr_pred,
                                          dtype=tf.float32)
            metrics = self.binary_classification_loss(
                logit, targets, entry_mask, depth=self.num_answer)
            loss, acc, recall, precision, top_1_prec, top_k_recall = metrics

            self.losses['attr_pred'] = loss
            self.report['attr_pred_loss'] = loss
            self.report['attr_pred_acc'] = acc
            self.report['attr_pred_recall'] = recall
            self.report['attr_pred_precision'] = precision
            self.report['attr_pred_top_1_prec'] = top_1_prec
            self.report['attr_pred_top_{}_recall'.format(TOP_K)] = top_k_recall
# Example #6
# 0
    def build_attribute_wordset(self):
        """
        attribute_wordset

        Attention-based attribute classifier: the wordset feature queries
        the proposals of 'attribute_V_ft', the attended visual feature is
        fused with the wordset feature by an element-wise product, and the
        fused feature is classified over `self.num_answer` classes.

        Reads 'attribute_V_ft' / 'attribute_num_V_ft' from `self.mid_result`
        and the 'attr_blank_fill/*' entries of `self.batch`.  Stores the
        logits in `self.mid_result['attr_blank_fill/logit']`, the loss in
        `self.losses['attr_wordset']` and the metrics in `self.report`.
        """
        # Settings shared by every fully connected layer of this head.
        fc_base = dict(use_bias=True, use_bn=False, is_training=self.is_train)
        fc_relu_ln = dict(fc_base, use_ln=True, activation_fn=tf.nn.relu)

        box_ft = self.mid_result['attribute_V_ft']
        box_count = self.mid_result['attribute_num_V_ft']

        # Attention keys.  [bs * #attr, #proposal, V_DIM]
        att_key = modules.fc_layer(
            box_ft, V_DIM, scope='wordset_v_linear_v', **fc_relu_ln)

        # Wordset embedding passed through tanh, then projected to L_DIM.
        ws_embed = tf.nn.embedding_lookup(
            self.wordset_map, self.batch['attr_blank_fill/wordsets'])
        ws_embed = tf.tanh(ws_embed)
        ws_ft = modules.fc_layer(
            ws_embed, L_DIM, use_ln=True, activation_fn=tf.tanh,
            scope='wordset_ft', **fc_base)

        # Attention queries.  [bs, #attr, V_DIM] -> [bs * #attr, V_DIM]
        att_query = modules.fc_layer(
            ws_ft, V_DIM, scope='wordset_q_linear_v', **fc_relu_ln)
        att_query = tf.reshape(att_query, [-1, V_DIM])

        att_score = modules.hadamard_attention(  # [bs * #attr, len]
            att_key, box_count, att_query,
            use_ln=False, is_train=self.is_train, scope='wordset_att')
        # Pool box_ft with the attention scores, then reshape to
        # [-1, n_attr_bf, vfeat_dim].
        attended = modules.attention_pooling(box_ft, att_score)
        attended = tf.reshape(
            attended, [-1, self.data_cfg.n_attr_bf, self.data_cfg.vfeat_dim])

        # Project both modalities to L_DIM before fusing them.
        vis_branch = modules.fc_layer(
            attended, L_DIM, scope='pooled_linear_l', **fc_relu_ln)
        lang_branch = modules.fc_layer(
            ws_ft, L_DIM, scope='q_linear_l', **fc_relu_ln)

        fused = vis_branch * lang_branch
        fused = modules.fc_layer(
            fused, L_DIM * 2, scope='joint_fc', **fc_relu_ln)
        # NOTE(review): dropout is applied regardless of self.is_train --
        # confirm this is intended.
        fused = tf.nn.dropout(fused, 0.5)

        logit = modules.fc_layer(
            fused, self.num_answer, use_ln=False, activation_fn=None,
            scope='classifier', **fc_base)
        # [bs, #attr, #answer]
        self.mid_result['attr_blank_fill/logit'] = logit

        with tf.name_scope('loss/attr_wordset'):
            target = tf.one_hot(self.batch['attr_blank_fill/fills'],
                                depth=self.num_answer)
            # 1.0 for the first 'attr_blank_fill/num' entries, 0.0 after.
            entry_mask = tf.sequence_mask(
                self.batch['attr_blank_fill/num'],
                maxlen=self.data_cfg.n_attr_bf,
                dtype=tf.float32)
            loss, acc, top_k_acc = self.n_way_classification_loss(
                logit, target, entry_mask)
            self.losses['attr_wordset'] = loss
            self.report['attr_wordset_loss'] = loss
            self.report['attr_wordset_acc'] = acc
            self.report['attr_wordset_top_{}_acc'.format(TOP_K)] = top_k_acc
# Example #7
# 0
    def build_object_blank_fill(self):
        """
        object_blank_fill

        Attention-based blank-filling classifier: each blank description is
        encoded with a GRU (`modules.encode_L`), the encoding queries the
        proposals of 'object_V_ft', and the attended visual feature is fused
        with the encoding by an element-wise product before classification
        over `self.num_answer` classes.

        Reads 'object_V_ft' / 'object_num_V_ft' from `self.mid_result` and
        the 'obj_blank_fill/*' entries of `self.batch`.  Stores the logits in
        `self.mid_result['obj_blank_fill/logit']`, the loss in
        `self.losses['obj_blank_fill']` and the metrics in `self.report`.
        """
        # Settings shared by every fully connected layer of this head.
        fc_base = dict(use_bias=True, use_bn=False, is_training=self.is_train)
        fc_relu_ln = dict(fc_base, use_ln=True, activation_fn=tf.nn.relu)

        box_ft = self.mid_result['object_V_ft']
        box_count = self.mid_result['object_num_V_ft']

        # Attention keys.  [bs * #obj, #proposal, V_DIM]
        att_key = modules.fc_layer(
            box_ft, V_DIM, scope='bf_v_linear_v', **fc_relu_ln)

        # Word embeddings of the blanks; the leading axes are flattened so
        # that every blank is encoded as its own sequence.
        blank_embed = tf.nn.embedding_lookup(
            self.l_word_map, self.batch['obj_blank_fill/blanks'])
        blank_len = self.batch['obj_blank_fill/blanks_len']
        max_len = tf.shape(blank_embed)[-2]
        blank_ft = modules.encode_L(
            tf.reshape(blank_embed, [-1, max_len, W_DIM]),
            tf.reshape(blank_len, [-1]),
            L_DIM,
            scope='encode_L_blank',
            cell_type='GRU')
        blank_ft = tf.reshape(blank_ft, [-1, self.data_cfg.n_obj_bf, L_DIM])

        # Attention queries.  [bs, #obj, V_DIM] -> [bs * #obj, V_DIM]
        att_query = modules.fc_layer(
            blank_ft, V_DIM, scope='bf_q_linear_v', **fc_relu_ln)
        att_query = tf.reshape(att_query, [-1, V_DIM])

        att_score = modules.hadamard_attention(  # [bs * #obj, len]
            att_key, box_count, att_query,
            use_ln=False, is_train=self.is_train, scope='bf_att')
        # [bs * #obj, vfeat_dim] -> [bs, #obj, vfeat_dim]
        attended = modules.attention_pooling(box_ft, att_score)
        attended = tf.reshape(
            attended, [-1, self.data_cfg.n_obj_bf, self.data_cfg.vfeat_dim])

        # Project both modalities to L_DIM before fusing them.
        vis_branch = modules.fc_layer(
            attended, L_DIM, scope='pooled_linear_l', **fc_relu_ln)
        lang_branch = modules.fc_layer(
            blank_ft, L_DIM, scope='q_linear_l', **fc_relu_ln)

        fused = vis_branch * lang_branch
        fused = modules.fc_layer(
            fused, L_DIM * 2, scope='joint_fc', **fc_relu_ln)
        # NOTE(review): dropout is applied regardless of self.is_train --
        # confirm this is intended.
        fused = tf.nn.dropout(fused, 0.5)

        logit = modules.fc_layer(
            fused, self.num_answer, use_ln=False, activation_fn=None,
            scope='classifier', **fc_base)
        self.mid_result['obj_blank_fill/logit'] = logit  # [bs, #obj, #answer]

        with tf.name_scope('loss/obj_blank_fill'):
            target = tf.one_hot(self.batch['obj_blank_fill/fills'],
                                depth=self.num_answer)
            # 1.0 for the first 'obj_blank_fill/num' entries, 0.0 after.
            entry_mask = tf.sequence_mask(
                self.batch['obj_blank_fill/num'],
                maxlen=self.data_cfg.n_obj_bf,
                dtype=tf.float32)
            loss, acc, top_k_acc = self.n_way_classification_loss(
                logit, target, entry_mask)
            self.losses['obj_blank_fill'] = loss
            self.report['obj_blank_fill_loss'] = loss
            self.report['obj_blank_fill_acc'] = acc
            self.report['obj_blank_fill_top_{}_acc'.format(TOP_K)] = top_k_acc
    def build_object_enwiki(self):
        """
        object_enwiki

        Builds two independent classifiers over `self.num_answer` classes:
        one on the pooled visual feature ('object_pooled_V_ft') and one on
        the GRU encoding of the enwiki context.  The two branches are never
        combined; each gets its own loss and metrics.

        Reads `self.mid_result['object_pooled_V_ft']` and the
        'obj_blank_fill/*' entries of `self.batch`.  Writes the
        'obj_enwiki_v' / 'obj_enwiki_l' losses and the matching
        'obj_enwiki_*' report entries.
        """
        # Settings shared by every fully connected layer of this head.
        fc_base = dict(use_bias=True, use_bn=False, is_training=self.is_train)
        fc_relu_ln = dict(fc_base, use_ln=True, activation_fn=tf.nn.relu)
        fc_linear = dict(fc_base, use_ln=False, activation_fn=None)

        vis_ft = self.mid_result['object_pooled_V_ft']

        # Word embeddings of the context; the leading axes are flattened so
        # that every context is encoded as its own sequence.
        ctx_embed = tf.nn.embedding_lookup(
            self.enwiki_map, self.batch['obj_blank_fill/enwiki_context'])
        ctx_len = self.batch['obj_blank_fill/enwiki_context_len']
        max_len = tf.shape(ctx_embed)[-2]
        ctx_ft = modules.encode_L(
            tf.reshape(ctx_embed, [-1, max_len, W_DIM]),
            tf.reshape(ctx_len, [-1]),
            L_DIM,
            scope='encode_L_enwiki',
            cell_type='GRU')
        ctx_ft = tf.reshape(ctx_ft, [-1, self.data_cfg.n_obj_bf, L_DIM])

        vis_proj = modules.fc_layer(
            vis_ft, L_DIM, scope='pooled_linear_l', **fc_relu_ln)
        lang_proj = modules.fc_layer(
            ctx_ft, L_DIM, scope='q_linear_l', **fc_relu_ln)

        # NOTE(review): both dropouts below are applied regardless of
        # self.is_train -- confirm this is intended.
        vis_hidden = modules.fc_layer(
            vis_proj, L_DIM * 2, scope='joint_v', **fc_relu_ln)
        vis_hidden = tf.nn.dropout(vis_hidden, 0.5)

        lang_hidden = modules.fc_layer(
            lang_proj, L_DIM * 2, scope='joint_l', **fc_relu_ln)
        lang_hidden = tf.nn.dropout(lang_hidden, 0.5)

        v_logit = modules.fc_layer(
            vis_hidden, self.num_answer, scope='classifier_v', **fc_linear)
        l_logit = modules.fc_layer(
            lang_hidden, self.num_answer, scope='classifier_l', **fc_linear)

        with tf.name_scope('loss/obj_enwiki'):
            target = tf.one_hot(self.batch['obj_blank_fill/fills'],
                                depth=self.num_answer)
            # 1.0 for the first 'obj_blank_fill/num' entries, 0.0 after.
            entry_mask = tf.sequence_mask(
                self.batch['obj_blank_fill/num'],
                maxlen=self.data_cfg.n_obj_bf,
                dtype=tf.float32)

            # Both branches are scored against the same targets and mask.
            v_loss, v_acc, v_top_k_acc = self.n_way_classification_loss(
                v_logit, target, entry_mask)
            l_loss, l_acc, l_top_k_acc = self.n_way_classification_loss(
                l_logit, target, entry_mask)
            self.losses['obj_enwiki_v'] = v_loss
            self.losses['obj_enwiki_l'] = l_loss
            self.report['obj_enwiki_v_loss'] = v_loss
            self.report['obj_enwiki_l_loss'] = l_loss
            self.report['obj_enwiki_v_acc'] = v_acc
            self.report['obj_enwiki_l_acc'] = l_acc
            self.report['obj_enwiki_v_top_{}_acc'.format(TOP_K)] = v_top_k_acc
            self.report['obj_enwiki_l_top_{}_acc'.format(TOP_K)] = l_top_k_acc
    def build_attribute_wordset(self):
        """
        attribute_wordset

        Builds two independent classifiers over `self.num_answer` classes:
        one on the pooled visual feature ('attribute_pooled_V_ft') and one
        on the wordset feature.  The two branches are never combined; each
        gets its own loss and metrics.

        Reads `self.mid_result['attribute_pooled_V_ft']` and the
        'attr_blank_fill/*' entries of `self.batch`.  Writes the
        'attr_wordset_v' / 'attr_wordset_l' losses and the matching
        'attr_wordset_*' report entries.
        """
        # Settings shared by every fully connected layer of this head.
        fc_base = dict(use_bias=True, use_bn=False, is_training=self.is_train)
        fc_relu_ln = dict(fc_base, use_ln=True, activation_fn=tf.nn.relu)
        fc_linear = dict(fc_base, use_ln=False, activation_fn=None)

        vis_ft = self.mid_result['attribute_pooled_V_ft']

        # Wordset embedding passed through tanh, then projected to L_DIM.
        ws_embed = tf.nn.embedding_lookup(
            self.wordset_map, self.batch['attr_blank_fill/wordsets'])
        ws_embed = tf.tanh(ws_embed)
        ws_ft = modules.fc_layer(
            ws_embed,
            L_DIM,
            use_ln=True,
            activation_fn=tf.tanh,
            scope='wordset_ft',
            **fc_base)

        vis_proj = modules.fc_layer(
            vis_ft, L_DIM, scope='pooled_linear_l', **fc_relu_ln)
        lang_proj = modules.fc_layer(
            ws_ft, L_DIM, scope='q_linear_l', **fc_relu_ln)

        # NOTE(review): both dropouts below are applied regardless of
        # self.is_train -- confirm this is intended.
        vis_hidden = modules.fc_layer(
            vis_proj, L_DIM * 2, scope='joint_v', **fc_relu_ln)
        vis_hidden = tf.nn.dropout(vis_hidden, 0.5)

        lang_hidden = modules.fc_layer(
            lang_proj, L_DIM * 2, scope='joint_l', **fc_relu_ln)
        lang_hidden = tf.nn.dropout(lang_hidden, 0.5)

        v_logit = modules.fc_layer(
            vis_hidden, self.num_answer, scope='classifier_v', **fc_linear)
        l_logit = modules.fc_layer(
            lang_hidden, self.num_answer, scope='classifier_l', **fc_linear)

        with tf.name_scope('loss/attr_wordset'):
            target = tf.one_hot(self.batch['attr_blank_fill/fills'],
                                depth=self.num_answer)
            # 1.0 for the first 'attr_blank_fill/num' entries, 0.0 after.
            entry_mask = tf.sequence_mask(
                self.batch['attr_blank_fill/num'],
                maxlen=self.data_cfg.n_attr_bf,
                dtype=tf.float32)

            # Both branches are scored against the same targets and mask.
            v_loss, v_acc, v_top_k_acc = self.n_way_classification_loss(
                v_logit, target, entry_mask)
            l_loss, l_acc, l_top_k_acc = self.n_way_classification_loss(
                l_logit, target, entry_mask)
            self.losses['attr_wordset_v'] = v_loss
            self.losses['attr_wordset_l'] = l_loss
            self.report['attr_wordset_v_loss'] = v_loss
            self.report['attr_wordset_l_loss'] = l_loss
            self.report['attr_wordset_v_acc'] = v_acc
            self.report['attr_wordset_l_acc'] = l_acc
            self.report['attr_wordset_v_top_{}_acc'.format(TOP_K)] = v_top_k_acc
            self.report['attr_wordset_l_top_{}_acc'.format(TOP_K)] = l_top_k_acc
    def build_object_wordset(self):
        """
        object_wordset

        Fuses the pooled visual feature ('object_pooled_V_ft') with the
        wordset feature by an element-wise product and classifies the fused
        feature over `self.num_answer` classes.

        Reads `self.mid_result['object_pooled_V_ft']` and the
        'obj_blank_fill/*' entries of `self.batch`.  Stores the logits in
        `self.mid_result['obj_blank_fill/logit']`, the loss in
        `self.losses['obj_wordset']` and the metrics in `self.report`.
        """
        # Settings shared by every fully connected layer of this head.
        fc_base = dict(use_bias=True, use_bn=False, is_training=self.is_train)
        fc_relu_ln = dict(fc_base, use_ln=True, activation_fn=tf.nn.relu)

        vis_ft = self.mid_result['object_pooled_V_ft']

        # Wordset embedding passed through tanh.  [bs, #obj, W_DIM]
        ws_embed = tf.nn.embedding_lookup(
            self.wordset_map, self.batch['obj_blank_fill/wordsets'])
        ws_embed = tf.tanh(ws_embed)
        # [bs, #obj, L_DIM]
        ws_ft = modules.fc_layer(
            ws_embed, L_DIM, use_ln=True, activation_fn=tf.tanh,
            scope='wordset_ft', **fc_base)

        # Project both modalities to L_DIM before fusing them.
        vis_branch = modules.fc_layer(
            vis_ft, L_DIM, scope='pooled_linear_l', **fc_relu_ln)
        lang_branch = modules.fc_layer(
            ws_ft, L_DIM, scope='q_linear_l', **fc_relu_ln)

        fused = vis_branch * lang_branch
        fused = modules.fc_layer(
            fused, L_DIM * 2, scope='joint_fc', **fc_relu_ln)
        # NOTE(review): dropout is applied regardless of self.is_train --
        # confirm this is intended.
        fused = tf.nn.dropout(fused, 0.5)

        logit = modules.fc_layer(
            fused, self.num_answer, use_ln=False, activation_fn=None,
            scope='classifier', **fc_base)
        self.mid_result['obj_blank_fill/logit'] = logit  # [bs, #obj, #answer]

        with tf.name_scope('loss/obj_wordset'):
            target = tf.one_hot(self.batch['obj_blank_fill/fills'],
                                depth=self.num_answer)
            # 1.0 for the first 'obj_blank_fill/num' entries, 0.0 after.
            entry_mask = tf.sequence_mask(
                self.batch['obj_blank_fill/num'],
                maxlen=self.data_cfg.n_obj_bf,
                dtype=tf.float32)
            loss, acc, top_k_acc = self.n_way_classification_loss(
                logit, target, entry_mask)
            self.losses['obj_wordset'] = loss
            self.report['obj_wordset_loss'] = loss
            self.report['obj_wordset_acc'] = acc
            self.report['obj_wordset_top_{}_acc'.format(TOP_K)] = top_k_acc
    def build_attribute_V_ft(self):
        """
        Pools the proposal features once per attribute entry using an
        attention computed purely from spatial features: the proposals'
        'spatial_ft' act as keys and the entry's box
        ('attr_blank_fill/normal_boxes', extended with two coordinate
        differences) acts as the query.

        Reads 'image_ft', 'spatial_ft', 'num_boxes' and
        'attr_blank_fill/normal_boxes' from `self.batch`.  Stores the result
        in `self.mid_result['attribute_pooled_V_ft']`.
        """
        cfg = self.data_cfg
        n_attr = cfg.n_attr_bf
        # Settings shared by the two fully connected layers below.
        fc_relu_ln = dict(use_bias=True, use_bn=False, use_ln=True,
                          activation_fn=tf.nn.relu,
                          is_training=self.is_train)

        # Repeat every per-sample tensor once per attribute entry and fold
        # the entry axis into the batch axis.
        # [bs, #proposal, #feat_dim] -> [bs * #attr, #proposal, #feat_dim]
        box_ft = tf.reshape(
            tf.tile(tf.expand_dims(self.batch['image_ft'], axis=1),
                    [1, n_attr, 1, 1]),
            [-1, cfg.max_box_num, cfg.vfeat_dim])
        # Same treatment for the 6-d spatial features of the proposals.
        box_spat = tf.reshape(
            tf.tile(tf.expand_dims(self.batch['spatial_ft'], axis=1),
                    [1, n_attr, 1, 1]),
            [-1, cfg.max_box_num, 6])
        # [bs] -> [bs, #attr] -> [bs * #attr]
        box_count = tf.reshape(
            tf.tile(tf.expand_dims(self.batch['num_boxes'], axis=1),
                    [1, n_attr]),
            [-1])

        # Append (coord 2 - coord 0) and (coord 3 - coord 1) to each query
        # box -- presumably width and height; confirm the box layout.
        query_box = self.batch['attr_blank_fill/normal_boxes']
        delta_20 = tf.expand_dims(query_box[:, :, 2] - query_box[:, :, 0],
                                  axis=-1)
        delta_31 = tf.expand_dims(query_box[:, :, 3] - query_box[:, :, 1],
                                  axis=-1)
        query_spat = tf.concat([query_box, delta_20, delta_31], axis=-1)

        # Attention keys.  [bs * #attr, #proposal, V_DIM]
        att_key = modules.fc_layer(
            box_spat, V_DIM, scope='spat_v_linear_v', **fc_relu_ln)

        # Attention queries.  [bs, #attr, V_DIM] -> [bs * #attr, V_DIM]
        att_query = modules.fc_layer(
            query_spat, V_DIM, scope='spat_q_linear_v', **fc_relu_ln)
        att_query = tf.reshape(att_query, [-1, V_DIM])

        att_score = modules.hadamard_attention(  # [bs * #attr, len]
            att_key, box_count, att_query,
            use_ln=False, is_train=self.is_train, scope='spat_att')
        # [bs * #attr, vfeat_dim] -> [bs, #attr, vfeat_dim]
        attended = modules.attention_pooling(box_ft, att_score)
        attended = tf.reshape(attended, [-1, n_attr, cfg.vfeat_dim])

        self.mid_result['attribute_pooled_V_ft'] = attended
    def build_object_predict(self):
        """
        object_predict

        Pools the proposal features once per object entry with an attention
        computed from spatial features (proposals' 'spatial_ft' as keys, the
        entry's box as query), fuses the pooled feature with the entry's
        wordset feature by an element-wise product and classifies the result
        over `self.num_answer` classes.

        Reads 'image_ft', 'spatial_ft', 'num_boxes' and the 'obj_pred/*'
        entries of `self.batch`.  Stores the logits in
        `self.mid_result['obj_pred/logit']`, the loss in
        `self.losses['object_pred']` and the metrics in `self.report`.
        """
        cfg = self.data_cfg
        n_obj = cfg.n_obj_pred
        # Settings shared by every fully connected layer of this head.
        fc_base = dict(use_bias=True, use_bn=False, is_training=self.is_train)
        fc_relu_ln = dict(fc_base, use_ln=True, activation_fn=tf.nn.relu)

        # Repeat every per-sample tensor once per object entry and fold the
        # entry axis into the batch axis.
        # [bs, #proposal, #feat_dim] -> [bs * #obj, #proposal, #feat_dim]
        box_ft = tf.reshape(
            tf.tile(tf.expand_dims(self.batch['image_ft'], axis=1),
                    [1, n_obj, 1, 1]),
            [-1, cfg.max_box_num, cfg.vfeat_dim])
        # Same treatment for the 6-d spatial features of the proposals.
        box_spat = tf.reshape(
            tf.tile(tf.expand_dims(self.batch['spatial_ft'], axis=1),
                    [1, n_obj, 1, 1]),
            [-1, cfg.max_box_num, 6])
        # [bs] -> [bs, #obj] -> [bs * #obj]
        box_count = tf.reshape(
            tf.tile(tf.expand_dims(self.batch['num_boxes'], axis=1),
                    [1, n_obj]),
            [-1])

        # Append (coord 2 - coord 0) and (coord 3 - coord 1) to each query
        # box -- presumably width and height; confirm the box layout.
        query_box = self.batch['obj_pred/normal_boxes']
        delta_20 = tf.expand_dims(query_box[:, :, 2] - query_box[:, :, 0],
                                  axis=-1)
        delta_31 = tf.expand_dims(query_box[:, :, 3] - query_box[:, :, 1],
                                  axis=-1)
        query_spat = tf.concat([query_box, delta_20, delta_31], axis=-1)

        # Attention keys.  [bs * #obj, #proposal, V_DIM]
        att_key = modules.fc_layer(
            box_spat, V_DIM, scope='spat_v_linear_v', **fc_relu_ln)

        # Attention queries.  [bs, #obj, V_DIM] -> [bs * #obj, V_DIM]
        att_query = modules.fc_layer(
            query_spat, V_DIM, scope='spat_q_linear_v', **fc_relu_ln)
        att_query = tf.reshape(att_query, [-1, V_DIM])

        att_score = modules.hadamard_attention(  # [bs * #obj, len]
            att_key, box_count, att_query,
            use_ln=False, is_train=self.is_train, scope='spat_att')
        # [bs * #obj, vfeat_dim] -> [bs, #obj, vfeat_dim]
        attended = modules.attention_pooling(box_ft, att_score)
        attended = tf.reshape(attended, [-1, n_obj, cfg.vfeat_dim])

        # Wordset embedding passed through tanh.  [bs, #obj, W_DIM]
        ws_embed = tf.nn.embedding_lookup(
            self.wordset_map, self.batch['obj_pred/wordsets'])
        ws_embed = tf.tanh(ws_embed)
        # [bs, #obj, L_DIM]
        ws_ft = modules.fc_layer(
            ws_embed, L_DIM, use_ln=True, activation_fn=tf.tanh,
            scope='wordset_ft', **fc_base)

        # Project both modalities to L_DIM before fusing them.
        vis_branch = modules.fc_layer(
            attended, L_DIM, scope='pooled_linear_l', **fc_relu_ln)
        lang_branch = modules.fc_layer(
            ws_ft, L_DIM, scope='q_linear_l', **fc_relu_ln)

        fused = vis_branch * lang_branch
        fused = modules.fc_layer(
            fused, L_DIM * 2, scope='joint_fc', **fc_relu_ln)
        # NOTE(review): dropout is applied regardless of self.is_train --
        # confirm this is intended.
        fused = tf.nn.dropout(fused, 0.5)

        logit = modules.fc_layer(
            fused, self.num_answer, use_ln=False, activation_fn=None,
            scope='classifier', **fc_base)
        self.mid_result['obj_pred/logit'] = logit  # [bs, #obj, #answer]

        with tf.name_scope('loss/object_predict'):
            target = tf.one_hot(self.batch['obj_pred/labels'],
                                depth=self.num_answer)
            # 1.0 for the first 'obj_pred/num' entries, 0.0 after.
            entry_mask = tf.sequence_mask(
                self.batch['obj_pred/num'],
                maxlen=n_obj,
                dtype=tf.float32)
            loss, acc, top_k_acc = self.n_way_classification_loss(
                logit, target, entry_mask)
            self.losses['object_pred'] = loss
            self.report['object_pred_loss'] = loss
            self.report['object_pred_acc'] = acc
            self.report['object_pred_top_{}_acc'.format(TOP_K)] = top_k_acc
# Example #13
# 0
# Load the wordset dictionary, rebuild the 'wordset_map' / 'wordset_ft'
# sub-graph in inference mode and restore its variables from a checkpoint.
ws_dict_path = os.path.join(
    config.data_dir,
    'wordset_dict5_depth{}.pkl'.format(int(config.expand_depth)))
# Open the pickle through a context manager so the file handle is closed as
# soon as loading finishes (the previous bare open() was never closed).
# NOTE: unpickling executes arbitrary code; only load trusted files.
with open(ws_dict_path, 'rb') as ws_dict_file:
    ws_dict = cPickle.load(ws_dict_file)
num_ws = len(ws_dict['vocab'])

wordset_map = modules.learn_embedding_map(ws_dict, scope='wordset_map')

L_DIM = 1024

# Apply tanh to the whole embedding map (no lookup) and project it to L_DIM;
# is_training=False because this graph is only used with restored weights.
wordset_embed = tf.tanh(wordset_map)
wordset_ft = modules.fc_layer(wordset_embed,
                              L_DIM,
                              use_bias=True,
                              use_bn=False,
                              use_ln=True,
                              activation_fn=tf.tanh,
                              is_training=False,
                              scope='wordset_ft')

# Let TF fall back to another device when needed, grow GPU memory on demand
# and expose at most one GPU.
session_config = tf.ConfigProto(allow_soft_placement=True,
                                gpu_options=tf.GPUOptions(allow_growth=True),
                                device_count={'GPU': 1})
sess = tf.Session(config=session_config)

# Restore every global variable created above from the checkpoint.
all_vars = tf.global_variables()
checkpoint_loader = tf.train.Saver(var_list=all_vars, max_to_keep=1)

log.info('Checkpoint path: {}'.format(config.checkpoint))
checkpoint_loader.restore(sess, config.checkpoint)
log.info('Loaded the checkpoint')
    def build_object_blank_fill(self):
        """
        object_blank_fill

        Blank-filling classifier with a noisy latent language feature.  The
        visual feature is a weighted sum of 'image_ft' given by
        'obj_blank_fill/weights'.  The language feature is the GRU encoding
        of the blank multiplied by a projection of the fill embedding; from
        it a vector and a log-variance are predicted, Gaussian noise scaled
        by the resulting sigma is added to the vector, and the noisy vector
        is fused with the visual feature by an element-wise product before
        classification over `self.num_answer` classes.

        Reads the 'obj_blank_fill/*' entries and 'image_ft' of `self.batch`.
        Stores the logits in `self.mid_result['obj_blank_fill/logit']`,
        histograms in `self.vis_hist`, the classification and weighted
        latent losses in `self.losses` and the metrics in `self.report`.
        """
        # Settings shared by every fully connected layer of this head.
        fc_base = dict(use_bias=True, use_bn=False, is_training=self.is_train)
        fc_relu_ln = dict(fc_base, use_ln=True, activation_fn=tf.nn.relu)
        fc_linear = dict(fc_base, use_ln=False, activation_fn=None)

        # [#obj, #proposal] x [#proposal x feat_dim] -> [#obj,feat_dim]
        vis_ft = tf.matmul(self.batch['obj_blank_fill/weights'],
                           self.batch['image_ft'])
        vis_branch = modules.fc_layer(
            vis_ft, L_DIM, scope='pooled_linear_l', **fc_relu_ln)

        # Word embeddings of the blanks; the leading axes are flattened so
        # that every blank is encoded as its own sequence.
        blank_embed = tf.nn.embedding_lookup(
            self.l_word_map, self.batch['obj_blank_fill/blanks'])
        blank_len = self.batch['obj_blank_fill/blanks_len']
        max_len = tf.shape(blank_embed)[-2]
        blank_ft = modules.encode_L(
            tf.reshape(blank_embed, [-1, max_len, W_DIM]),
            tf.reshape(blank_len, [-1]),
            L_DIM,
            scope='encode_L_blank',
            cell_type='GRU')
        blank_ft = tf.reshape(blank_ft, [-1, self.data_cfg.n_obj_bf, L_DIM])

        # The ground-truth fill is embedded and used as an input feature.
        fill_embed = tf.nn.embedding_lookup(
            self.l_answer_word_map, self.batch['obj_blank_fill/fills'])
        fill_proj = modules.fc_layer(
            fill_embed, L_DIM, use_ln=True, activation_fn=tf.nn.tanh,
            scope='obj_blank_fill/fill_embed2', **fc_base)

        blank_fill_ft = blank_ft * fill_proj

        # Two linear layers on the combined feature: the vector itself and
        # the log of its squared sigma.
        fill_vec = modules.fc_layer(
            blank_fill_ft, L_DIM,
            scope='obj_blank_fill/fill_vec', **fc_linear)
        fill_log_sigma_sq = modules.fc_layer(
            blank_fill_ft, L_DIM,
            scope='obj_blank_fill/fill_log_sigma_sq', **fc_linear)
        # sigma = sqrt(exp(log sigma^2))
        fill_sigma = tf.sqrt(tf.exp(fill_log_sigma_sq))
        # Standard-normal noise with a fixed op-level seed, scaled by sigma.
        # NOTE(review): the noise is added regardless of self.is_train --
        # confirm this is intended.
        unit_noise = tf.random_normal(
            tf.shape(fill_vec), mean=0, stddev=1, seed=123)
        noisy_fill_vec = fill_vec + unit_noise * fill_sigma
        self.vis_hist['obj_blank_fill/fill_vec'] = fill_vec
        self.vis_hist['obj_blank_fill/fill_sigma'] = fill_sigma

        lang_branch = modules.fc_layer(
            noisy_fill_vec, L_DIM, scope='q_linear_l', **fc_relu_ln)

        fused = vis_branch * lang_branch
        fused = modules.fc_layer(
            fused, L_DIM * 2, scope='joint_fc', **fc_relu_ln)
        # NOTE(review): dropout is applied regardless of self.is_train --
        # confirm this is intended.
        fused = tf.nn.dropout(fused, 0.5)

        logit = modules.fc_layer(
            fused, self.num_answer, scope='classifier', **fc_linear)
        self.mid_result['obj_blank_fill/logit'] = logit  # [bs, #obj, #answer]

        with tf.name_scope('loss/obj_blank_fill'):
            target = tf.one_hot(self.batch['obj_blank_fill/fills'],
                                depth=self.num_answer)
            # 1.0 for the first 'obj_blank_fill/num' entries, 0.0 after.
            entry_mask = tf.sequence_mask(
                self.batch['obj_blank_fill/num'],
                maxlen=self.data_cfg.n_obj_bf,
                dtype=tf.float32)
            loss, acc, top_k_acc = self.n_way_classification_loss(
                logit, target, entry_mask)
            latent_loss = self.latent_loss(fill_vec, fill_log_sigma_sq)
            self.losses['obj_blank_fill'] = loss
            # The latent loss is scaled by self.latent_loss_weight; the
            # unscaled and scaled values are reported separately.
            self.losses['obj_blank_fill_latent'] = (
                self.latent_loss_weight * latent_loss)
            self.report['obj_blank_fill_loss'] = loss
            self.report['obj_blank_fill_latent_loss'] = latent_loss
            self.report['obj_blank_fill_train_latent_loss'] = (
                self.losses['obj_blank_fill_latent'])
            self.report['obj_blank_fill_acc'] = acc
            self.report['obj_blank_fill_top_{}_acc'.format(TOP_K)] = top_k_acc
    def build_attribute_predict(self):
        """
        attribute_predict

        Build the attribute-prediction branch of the graph and register its
        losses and report entries.

        A pooled visual feature is fused (element-wise product) with a noisy
        latent vector computed from the object-label and attribute-label
        embeddings; the fused feature is classified over ``self.num_answer``
        classes.

        Reads from ``self.batch``:
            'attr_pred/weights', 'image_ft', 'attr_pred/object_labels',
            'attr_pred/random_attribute_labels', 'attr_pred/num'.

        Writes:
            self.vis_hist:   'attr_pred/attr_vec', 'attr_pred/attr_sigma'
            self.mid_result: 'attr_pred/logit'
            self.losses:     'attr_pred', 'attr_pred_latent'
            self.report:     'attr_pred_loss', 'attr_pred_latent_loss',
                             'attr_pred_train_latent_loss', 'attr_pred_acc',
                             'attr_pred_top_{TOP_K}_acc'

        Returns nothing; all results are stored on ``self``.
        """
        # [#attr, #proposal] x [#proposal x feat_dim] -> [#attr,feat_dim]
        V_ft = tf.matmul(self.batch['attr_pred/weights'],
                         self.batch['image_ft'])
        # NOTE(review): the scope names 'pooled_linear_l', 'q_linear_l',
        # 'joint_fc' and 'classifier' are also used by other build_* methods
        # in this file; whether that shares variables depends on how
        # modules.fc_layer handles reuse - confirm.
        v_linear_l = modules.fc_layer(V_ft,
                                      L_DIM,
                                      use_bias=True,
                                      use_bn=False,
                                      use_ln=True,
                                      activation_fn=tf.nn.relu,
                                      is_training=self.is_train,
                                      scope='pooled_linear_l')

        # Embed the object labels with the answer word map and project them
        # to L_DIM.
        obj_embed = tf.nn.embedding_lookup(
            self.l_answer_word_map, self.batch['attr_pred/object_labels'])
        obj_vec = modules.fc_layer(obj_embed,
                                   L_DIM,
                                   use_bias=True,
                                   use_bn=False,
                                   use_ln=True,
                                   activation_fn=tf.nn.tanh,
                                   is_training=self.is_train,
                                   scope='attr_pred/obj_vector')

        # Embed the attribute labels with the same word map. These are the
        # same labels that are used as the classification target below.
        # Result is presumably [bs, #attr, W_DIM] - TODO confirm.
        attr_embed = tf.nn.embedding_lookup(
            self.l_answer_word_map,
            self.batch['attr_pred/random_attribute_labels'])
        attr_embed2 = modules.fc_layer(attr_embed,
                                       L_DIM,
                                       use_bias=True,
                                       use_bn=False,
                                       use_ln=True,
                                       activation_fn=tf.nn.tanh,
                                       is_training=self.is_train,
                                       scope='attr_pred/attr_embed2')

        obj_attr_embed = obj_vec * attr_embed2

        # Two linear heads on the same input: the mean of the latent vector
        # and the log of its variance.
        attr_vec = modules.fc_layer(obj_attr_embed,
                                    L_DIM,
                                    use_bias=True,
                                    use_bn=False,
                                    use_ln=False,
                                    activation_fn=None,
                                    is_training=self.is_train,
                                    scope='attr_pred/attr_vec')
        attr_log_sigma_sq = modules.fc_layer(
            obj_attr_embed,
            L_DIM,
            use_bias=True,
            use_bn=False,
            use_ln=False,
            activation_fn=None,
            is_training=self.is_train,
            scope='attr_pred/attr_log_sigma_sq')
        # sigma = sqrt(exp(log sigma^2)) = exp(0.5 * log sigma^2).
        # The single-exp form is used on purpose: sqrt(exp(x)) overflows at
        # half the input magnitude, and when exp(x) underflows to 0 the sqrt
        # gradient is infinite, which can turn the back-propagated gradient
        # into NaN.
        attr_sigma = tf.exp(0.5 * attr_log_sigma_sq)
        # Standard-normal noise with a fixed op-level seed.
        noise = tf.random_normal(tf.shape(attr_vec),
                                 mean=0,
                                 stddev=1,
                                 seed=123)
        # Sample around the mean: mean + eps * sigma.
        attr_vec_noise = attr_vec + noise * attr_sigma
        self.vis_hist['attr_pred/attr_vec'] = attr_vec
        self.vis_hist['attr_pred/attr_sigma'] = attr_sigma

        l_linear_l = modules.fc_layer(attr_vec_noise,
                                      L_DIM,
                                      use_bias=True,
                                      use_bn=False,
                                      use_ln=True,
                                      activation_fn=tf.nn.relu,
                                      is_training=self.is_train,
                                      scope='q_linear_l')

        # Fuse the visual and language features by element-wise product.
        joint = modules.fc_layer(v_linear_l * l_linear_l,
                                 L_DIM * 2,
                                 use_bias=True,
                                 use_bn=False,
                                 use_ln=True,
                                 activation_fn=tf.nn.relu,
                                 is_training=self.is_train,
                                 scope='joint_fc')
        # NOTE(review): dropout is applied unconditionally here (it is not
        # gated on self.is_train) - confirm this is intended when the graph
        # is used for evaluation.
        joint = tf.nn.dropout(joint, 0.5)

        logit = modules.fc_layer(joint,
                                 self.num_answer,
                                 use_bias=True,
                                 use_bn=False,
                                 use_ln=False,
                                 activation_fn=None,
                                 is_training=self.is_train,
                                 scope='classifier')
        self.mid_result['attr_pred/logit'] = logit  # [bs, #attr, #answer]

        with tf.name_scope('loss/attr_predict'):
            onehot_gt = tf.one_hot(
                self.batch['attr_pred/random_attribute_labels'],
                depth=self.num_answer)
            # Mask marking the first 'attr_pred/num' of the n_attr_pred
            # entries as valid.
            num_valid_entry = self.batch['attr_pred/num']
            valid_mask = tf.sequence_mask(num_valid_entry,
                                          maxlen=self.data_cfg.n_attr_pred,
                                          dtype=tf.float32)
            loss, acc, top_k_acc = \
                self.n_way_classification_loss(logit, onehot_gt, valid_mask)
            # Regularizer on the latent mean / log-variance (presumably a KL
            # term; see self.latent_loss).
            latent_loss = self.latent_loss(attr_vec, attr_log_sigma_sq)
            self.losses['attr_pred'] = loss
            self.losses[
                'attr_pred_latent'] = self.latent_loss_weight * latent_loss
            # The report keeps both the raw latent loss and the weighted
            # value that is actually added to the training losses.
            self.report['attr_pred_loss'] = loss
            self.report['attr_pred_latent_loss'] = latent_loss
            self.report['attr_pred_train_latent_loss'] = self.losses[
                'attr_pred_latent']
            self.report['attr_pred_acc'] = acc
            self.report['attr_pred_top_{}_acc'.format(TOP_K)] = top_k_acc
# Example #16
# 0
    def build_object_predict(self):
        """
        object_predict

        Build the object-prediction branch of the graph and register its
        loss and report entries.

        A pooled visual feature is fused (element-wise product) with a
        feature derived from wordset embeddings, then classified over
        ``self.num_answer`` classes.

        Reads from ``self.batch``:
            'obj_pred/weights', 'image_ft', 'obj_pred/wordsets',
            'obj_pred/labels', 'obj_pred/num'.

        Writes:
            self.mid_result: 'obj_pred/logit'
            self.losses:     'object_pred'
            self.report:     'object_pred_loss', 'object_pred_acc',
                             'object_pred_top_{TOP_K}_acc'

        Returns nothing; all results are stored on ``self``.
        """
        batch = self.batch
        # Options that are identical for every fc_layer call in this method.
        fc_shared = dict(use_bias=True,
                         use_bn=False,
                         is_training=self.is_train)

        # [#obj, #proposal] x [#proposal x feat_dim] -> [#obj,feat_dim]
        pooled_visual = tf.matmul(batch['obj_pred/weights'],
                                  batch['image_ft'])
        visual_proj = modules.fc_layer(
            pooled_visual, L_DIM,
            use_ln=True, activation_fn=tf.nn.relu,
            scope='pooled_linear_l', **fc_shared)

        # Wordset embedding -> tanh -> two projections to L_DIM.
        wordset_lookup = tf.nn.embedding_lookup(self.wordset_map,
                                                batch['obj_pred/wordsets'])
        wordset_embed = tf.tanh(wordset_lookup)
        wordset_proj = modules.fc_layer(
            wordset_embed, L_DIM,
            use_ln=True, activation_fn=tf.tanh,
            scope='wordset_ft2', **fc_shared)
        lang_proj = modules.fc_layer(
            wordset_proj, L_DIM,
            use_ln=True, activation_fn=tf.nn.relu,
            scope='q_linear_l', **fc_shared)

        # Fuse the visual and language features by element-wise product.
        fused = visual_proj * lang_proj
        joint_hidden = modules.fc_layer(
            fused, L_DIM * 2,
            use_ln=True, activation_fn=tf.nn.relu,
            scope='joint_fc', **fc_shared)
        # NOTE(review): dropout is applied unconditionally (not gated on
        # self.is_train) - confirm this is intended for evaluation.
        joint_hidden = tf.nn.dropout(joint_hidden, 0.5)

        answer_logit = modules.fc_layer(
            joint_hidden, self.num_answer,
            use_ln=False, activation_fn=None,
            scope='classifier', **fc_shared)
        # [bs, #obj, #answer]
        self.mid_result['obj_pred/logit'] = answer_logit

        with tf.name_scope('loss/object_predict'):
            target_onehot = tf.one_hot(batch['obj_pred/labels'],
                                       depth=self.num_answer)
            # Mask marking the first 'obj_pred/num' of the n_obj_pred
            # entries as valid.
            entry_mask = tf.sequence_mask(batch['obj_pred/num'],
                                          maxlen=self.data_cfg.n_obj_pred,
                                          dtype=tf.float32)
            cls_loss, accuracy, top_k_accuracy = \
                self.n_way_classification_loss(
                    answer_logit, target_onehot, entry_mask)

            self.losses['object_pred'] = cls_loss
            self.report['object_pred_loss'] = cls_loss
            self.report['object_pred_acc'] = accuracy
            self.report['object_pred_top_{}_acc'.format(TOP_K)] = \
                top_k_accuracy