Example #1
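This snippet is the initializer of an N2MN VQA wrapper (the full class appears in Example #3). It relies on imports and module-level hyperparameters defined elsewhere in its project; a minimal sketch of that context follows, with the module paths assumed from the n2nmn repository layout and all values illustrative rather than the original configuration (TopAnswerVersionConverter is project-specific, so its import is omitted):

import os
import numpy as np          # used by the full class in Example #3
import tensorflow as tf

# project-local modules (paths assumed from the n2nmn repository)
from models_vqa.nmn3_assembler import Assembler
from models_vqa.nmn3_model import NMN3Model
from util.text_processing import VocabDict, tokenize

# module-level hyperparameters referenced below (assumed values)
T_decoder = 13
embed_dim_txt = 300
embed_dim_nmn = 300
lstm_dim = 1000
num_layers = 2
H_feat, W_feat, D_feat = 14, 14, 2048
use_qpn = True
reduce_visfeat_dim = False
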
class N2MNWrapper(object):  # enclosing class restored (see Example #3)
    def __init__(self):
        self.T_encoder = 26
        data_root = '/usr/data/fl302/code/n2nmn/exp_vqa/data'
        snapshot_file = '/usr/data/fl302/code/n2nmn/exp_vqa/tfmodel/vqa_rl_gt_layout/00040000'
        self.vocab_question_file = os.path.join(data_root,
                                                'vocabulary_vqa.txt')
        self.vocab_layout_file = os.path.join(data_root,
                                              'vocabulary_layout.txt')
        self.vocab_answer_file = os.path.join(data_root, 'answers_vqa.txt')
        self.vocab_dict = VocabDict(self.vocab_question_file)
        self.answer_dict = VocabDict(self.vocab_answer_file)
        self.answer_word_list = self.answer_dict.word_list
        self.src_ans_id2dst = TopAnswerVersionConverter()

        self.assembler = Assembler(self.vocab_layout_file)
        num_vocab_txt = self.vocab_dict.num_vocab
        num_vocab_nmn = len(self.assembler.module_names)
        num_choices = self.answer_dict.num_vocab

        # Start the session BEFORE importing tensorflow_fold
        # to avoid taking up all GPU memory
        with tf.Graph().as_default():
            self.input_seq_batch = tf.placeholder(tf.int32, [None, None])
            self.seq_length_batch = tf.placeholder(tf.int32, [None])
            self.image_feat_batch = tf.placeholder(
                tf.float32, [None, H_feat, W_feat, D_feat])
            self.expr_validity_batch = tf.placeholder(tf.bool, [None])

            # build model
            self.nmn3_model_tst = NMN3Model(
                self.image_feat_batch,
                self.input_seq_batch,
                self.seq_length_batch,
                T_decoder=T_decoder,
                num_vocab_txt=num_vocab_txt,
                embed_dim_txt=embed_dim_txt,
                num_vocab_nmn=num_vocab_nmn,
                embed_dim_nmn=embed_dim_nmn,
                lstm_dim=lstm_dim,
                num_layers=num_layers,
                assembler=self.assembler,
                encoder_dropout=False,
                decoder_dropout=False,
                decoder_sampling=False,
                num_choices=num_choices,
                use_qpn=use_qpn,
                qpn_dropout=False,
                reduce_visfeat_dim=reduce_visfeat_dim)

            self.sess = tf.Session(config=tf.ConfigProto(
                gpu_options=tf.GPUOptions(allow_growth=True),
                allow_soft_placement=False,
                log_device_placement=False))

            snapshot_saver = tf.train.Saver(
                max_to_keep=None)  # keep all snapshots
            snapshot_saver.restore(self.sess, snapshot_file)
Example #2
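This fragment comes from an evaluation script and references several names defined earlier in that script; a minimal sketch of those globals (the values are illustrative assumptions, not the original configuration):

tst_image_set = 'test-dev2015'  # VQA split to evaluate (assumed)
exp_name = 'vqa_rl_gt_layout'   # experiment name (assumed)
snapshot_name = '00040000'      # checkpoint iteration (assumed)
N = 64                          # batch size (assumed)
T_encoder = 26                  # max question length, as in Example #1
T_decoder = 13                  # max layout length (assumed)
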
vocab_question_file = './exp_vqa/data/vocabulary_vqa.txt'
vocab_layout_file = './exp_vqa/data/vocabulary_layout.txt'
vocab_answer_file = './exp_vqa/data/answers_vqa.txt'

imdb_file_tst = './exp_vqa/data/imdb/imdb_%s.npy' % tst_image_set

save_file = './exp_vqa/results/%s/%s.%s.txt' % (exp_name, snapshot_name,
                                                tst_image_set)
os.makedirs(os.path.dirname(save_file), exist_ok=True)
eval_output_name = 'vqa_OpenEnded_mscoco_%s_%s_%s_results.json' % (
    tst_image_set, exp_name, snapshot_name)
eval_output_file = './exp_vqa/eval_outputs/%s/%s' % (exp_name,
                                                     eval_output_name)
os.makedirs(os.path.dirname(eval_output_file), exist_ok=True)
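With the illustrative values above, the two output paths resolve to:

# save_file:
#   ./exp_vqa/results/vqa_rl_gt_layout/00040000.test-dev2015.txt
# eval_output_file:
#   ./exp_vqa/eval_outputs/vqa_rl_gt_layout/vqa_OpenEnded_mscoco_test-dev2015_vqa_rl_gt_layout_00040000_results.json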

assembler = Assembler(vocab_layout_file)

data_reader_tst = DataReader(imdb_file_tst,
                             shuffle=False,
                             one_pass=True,
                             batch_size=N,
                             T_encoder=T_encoder,
                             T_decoder=T_decoder,
                             assembler=assembler,
                             vocab_question_file=vocab_question_file,
                             vocab_answer_file=vocab_answer_file)

num_vocab_txt = data_reader_tst.batch_loader.vocab_dict.num_vocab
num_vocab_nmn = len(assembler.module_names)
num_choices = data_reader_tst.batch_loader.answer_dict.num_vocab
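
For context, a minimal sketch of the loop that typically consumes these objects at test time (this assumes DataReader exposes a batches() generator yielding dict-style batches with a 'seq_length_batch' key, as in the n2nmn data reader; both names are assumptions here):

num_questions = 0
for batch in data_reader_tst.batches():
    # one entry per question in the padded batch
    num_questions += len(batch['seq_length_batch'])
print('%d questions in %s' % (num_questions, tst_image_set))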
Example #3
class N2MNWrapper(object):
    def __init__(self):
        self.T_encoder = 26
        data_root = '/usr/data/fl302/code/n2nmn/exp_vqa/data'
        snapshot_file = '/usr/data/fl302/code/n2nmn/exp_vqa/tfmodel/vqa_rl_gt_layout/00040000'
        self.vocab_question_file = os.path.join(data_root,
                                                'vocabulary_vqa.txt')
        self.vocab_layout_file = os.path.join(data_root,
                                              'vocabulary_layout.txt')
        self.vocab_answer_file = os.path.join(data_root, 'answers_vqa.txt')
        self.vocab_dict = VocabDict(self.vocab_question_file)
        self.answer_dict = VocabDict(self.vocab_answer_file)
        self.answer_word_list = self.answer_dict.word_list

        self.assembler = Assembler(self.vocab_layout_file)
        num_vocab_txt = self.vocab_dict.num_vocab
        num_vocab_nmn = len(self.assembler.module_names)
        num_choices = self.answer_dict.num_vocab

        # Start the session BEFORE importing tensorflow_fold
        # to avoid taking up all GPU memory
        with tf.Graph().as_default():
            self.input_seq_batch = tf.placeholder(tf.int32, [None, None])
            self.seq_length_batch = tf.placeholder(tf.int32, [None])
            self.image_feat_batch = tf.placeholder(
                tf.float32, [None, H_feat, W_feat, D_feat])
            self.expr_validity_batch = tf.placeholder(tf.bool, [None])

            # build model
            self.nmn3_model_tst = NMN3Model(
                self.image_feat_batch,
                self.input_seq_batch,
                self.seq_length_batch,
                T_decoder=T_decoder,
                num_vocab_txt=num_vocab_txt,
                embed_dim_txt=embed_dim_txt,
                num_vocab_nmn=num_vocab_nmn,
                embed_dim_nmn=embed_dim_nmn,
                lstm_dim=lstm_dim,
                num_layers=num_layers,
                assembler=self.assembler,
                encoder_dropout=False,
                decoder_dropout=False,
                decoder_sampling=False,
                num_choices=num_choices,
                use_qpn=use_qpn,
                qpn_dropout=False,
                reduce_visfeat_dim=reduce_visfeat_dim)

            self.sess = tf.Session(config=tf.ConfigProto(
                gpu_options=tf.GPUOptions(allow_growth=True),
                allow_soft_placement=False,
                log_device_placement=False))

            snapshot_saver = tf.train.Saver(
                max_to_keep=None)  # keep all snapshots
            snapshot_saver.restore(self.sess, snapshot_file)

    def _prepare_question(self, questions):
        # encode each question into a time-major (T_encoder x batch)
        # array of vocabulary indices, zero-padded on the right
        actual_batch_size = len(questions)
        input_seq_batch = np.zeros((self.T_encoder, actual_batch_size),
                                   np.int32)
        seq_length_batch = np.zeros(actual_batch_size, np.int32)

        for n, question in enumerate(questions):
            question_tokens = tokenize(question)
            question_inds = [
                self.vocab_dict.word2idx(w) for w in question_tokens
            ]
            # guard against questions longer than the encoder window,
            # which would otherwise break the assignment below
            question_inds = question_inds[:self.T_encoder]
            seq_length = len(question_inds)
            input_seq_batch[:seq_length, n] = question_inds
            seq_length_batch[n] = seq_length
        return input_seq_batch, seq_length_batch

    def _prepare_images(self, image_id, questions):
        # load the precomputed ResNet-152 res5c feature map for this
        # (val2014) image and tile it so that every question in the
        # batch is paired with the same image features
        num_tiles = len(questions)
        FEAT_ROOT = '/usr/data/fl302/data/VQA/ResNet152/resnet_res5c'
        filename = '%s2014/COCO_%s2014_%012d.jpg' % ('val', 'val', image_id)
        f = np.load(os.path.join(FEAT_ROOT, filename + '.npz'))['x']
        f = f.transpose((1, 2, 0))[np.newaxis, ...]  # CHW -> 1 x H x W x C
        return np.tile(f, [num_tiles, 1, 1, 1])

    def inference(self, image_id, questions):
        nmn3_model_tst = self.nmn3_model_tst
        # image batch
        image_batch = self._prepare_images(image_id, questions)
        # question batch
        seq, seq_length = self._prepare_question(questions)
        # set up input and output tensors
        h = self.sess.partial_run_setup([
            nmn3_model_tst.predicted_tokens, nmn3_model_tst.scores
        ], [
            self.input_seq_batch, self.seq_length_batch, self.image_feat_batch,
            nmn3_model_tst.compiler.loom_input_tensor, self.expr_validity_batch
        ])

        # Part 0 & 1: Run Convnet and generate module layout
        tokens = self.sess.partial_run(h,
                                       nmn3_model_tst.predicted_tokens,
                                       feed_dict={
                                           self.input_seq_batch: seq,
                                           self.seq_length_batch: seq_length,
                                           self.image_feat_batch: image_batch
                                       })

        # Assemble the layout tokens into network structure
        expr_list, expr_validity_array = self.assembler.assemble(tokens)
        # Build TensorFlow Fold input for NMN
        expr_feed = nmn3_model_tst.compiler.build_feed_dict(expr_list)
        expr_feed[self.expr_validity_batch] = expr_validity_array

        # Part 2: run the NMN on the assembled layout (no learning at test time)
        scores_val = self.sess.partial_run(h,
                                           nmn3_model_tst.scores,
                                           feed_dict=expr_feed)
        scores_val[:, 0] = -1e10  # remove <unk> answer

        # compute accuracy
        predictions = np.argmax(scores_val, axis=1)
        scores = np.max(scores_val, axis=1)
        pred_answers = [self.answer_word_list[p] for p in predictions]
        return pred_answers, scores

    def get_score(self, image_id, question):
        pred_answers, scores = self.inference(image_id, [question])
        sc = scores[0]
        pred_ans = pred_answers[0]
        return pred_ans, sc

    def query_score(self, image_id, question, answer):
        # undo tokenized possessives (" 's " -> "'s ") before encoding
        question_new = question.replace(" 's ", "'s ")
        if question_new != question:
            print('Rephrase')
        question = question_new
        nmn3_model_tst = self.nmn3_model_tst
        # image batch
        questions = [question]
        image_batch = self._prepare_images(image_id, questions)
        # question batch
        seq, seq_length = self._prepare_question(questions)
        # set up input and output tensors
        h = self.sess.partial_run_setup([
            nmn3_model_tst.predicted_tokens, nmn3_model_tst.scores
        ], [
            self.input_seq_batch, self.seq_length_batch, self.image_feat_batch,
            nmn3_model_tst.compiler.loom_input_tensor, self.expr_validity_batch
        ])

        # Part 0 & 1: Run Convnet and generate module layout
        tokens = self.sess.partial_run(h,
                                       nmn3_model_tst.predicted_tokens,
                                       feed_dict={
                                           self.input_seq_batch: seq,
                                           self.seq_length_batch: seq_length,
                                           self.image_feat_batch: image_batch
                                       })

        # Assemble the layout tokens into network structure
        expr_list, expr_validity_array = self.assembler.assemble(tokens)
        # Build TensorFlow Fold input for NMN
        expr_feed = nmn3_model_tst.compiler.build_feed_dict(expr_list)
        expr_feed[self.expr_validity_batch] = expr_validity_array

        # Part 2: run the NMN on the assembled layout (no learning at test time)
        scores_val = self.sess.partial_run(h,
                                           nmn3_model_tst.scores,
                                           feed_dict=expr_feed)
        scores = scores_val.flatten()
        idx = self.answer_dict.word2idx(answer)
        return float(scores[idx])
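
A minimal usage sketch of the wrapper above (the image id, question, and answer are hypothetical, and the feature and snapshot paths hard-coded in the class must exist on disk):

wrapper = N2MNWrapper()
# open-ended prediction: best answer and its score for one question
answer, score = wrapper.get_score(262148, 'what is on the table?')
print(answer, score)
# score of one specific candidate answer for the same question
print(wrapper.query_score(262148, 'what is on the table?', 'cake'))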