def __init__(self):
    """Load the vocabularies, build the test-time NMN3 graph and restore it.

    All paths are hard-coded to a specific machine layout.  The model
    hyper-parameters (``T_decoder``, ``embed_dim_txt``, ``lstm_dim``, ...)
    and feature dimensions (``H_feat``, ``W_feat``, ``D_feat``) are read
    from names defined outside this method.
    """
    # Maximum number of question tokens fed to the encoder.
    self.T_encoder = 26

    data_dir = '/usr/data/fl302/code/n2nmn/exp_vqa/data'
    checkpoint_path = '/usr/data/fl302/code/n2nmn/exp_vqa/tfmodel/vqa_rl_gt_layout/00040000'

    # Vocabulary files (question words, layout/module tokens, answers).
    self.vocab_question_file = os.path.join(data_dir, 'vocabulary_vqa.txt')
    self.vocab_layout_file = os.path.join(data_dir, 'vocabulary_layout.txt')
    self.vocab_answer_file = os.path.join(data_dir, 'answers_vqa.txt')

    self.vocab_dict = VocabDict(self.vocab_question_file)
    self.answer_dict = VocabDict(self.vocab_answer_file)
    self.answer_word_list = self.answer_dict.word_list
    self.src_ans_id2dst = TopAnswerVersionConverter()
    self.assembler = Assembler(self.vocab_layout_file)

    # Everything that is not a positional tensor input of the model.
    model_options = dict(
        T_decoder=T_decoder,
        num_vocab_txt=self.vocab_dict.num_vocab,
        embed_dim_txt=embed_dim_txt,
        num_vocab_nmn=len(self.assembler.module_names),
        embed_dim_nmn=embed_dim_nmn,
        lstm_dim=lstm_dim,
        num_layers=num_layers,
        assembler=self.assembler,
        encoder_dropout=False,
        decoder_dropout=False,
        decoder_sampling=False,
        num_choices=self.answer_dict.num_vocab,
        use_qpn=use_qpn,
        qpn_dropout=False,
        reduce_visfeat_dim=reduce_visfeat_dim,
    )

    # Session settings: grow GPU memory on demand instead of grabbing it all.
    gpu_options = tf.GPUOptions(allow_growth=True)
    session_config = tf.ConfigProto(
        gpu_options=gpu_options,
        allow_soft_placement=False,
        log_device_placement=False,
    )

    # The model lives in its own graph; the session and the saver must be
    # created while that graph is the default one.
    with tf.Graph().as_default():
        self.input_seq_batch = tf.placeholder(tf.int32, [None, None])
        self.seq_length_batch = tf.placeholder(tf.int32, [None])
        self.image_feat_batch = tf.placeholder(
            tf.float32, [None, H_feat, W_feat, D_feat])
        self.expr_validity_batch = tf.placeholder(tf.bool, [None])

        # Build the model in inference mode (no dropout, no sampling).
        self.nmn3_model_tst = NMN3Model(
            self.image_feat_batch,
            self.input_seq_batch,
            self.seq_length_batch,
            **model_options)

        self.sess = tf.Session(config=session_config)

        # Restore the trained weights from the snapshot.
        restorer = tf.train.Saver(max_to_keep=None)
        restorer.restore(self.sess, checkpoint_path)
# ---- Inputs: vocabularies and the imdb of the split under test ----
vocab_question_file = './exp_vqa/data/vocabulary_vqa.txt'
vocab_layout_file = './exp_vqa/data/vocabulary_layout.txt'
vocab_answer_file = './exp_vqa/data/answers_vqa.txt'
imdb_file_tst = './exp_vqa/data/imdb/imdb_%s.npy' % tst_image_set

# ---- Outputs: plain-text results and the JSON file for the evaluator ----
save_file = './exp_vqa/results/%s/%s.%s.txt' % (
    exp_name, snapshot_name, tst_image_set)
eval_output_name = 'vqa_OpenEnded_mscoco_%s_%s_%s_results.json' % (
    tst_image_set, exp_name, snapshot_name)
eval_output_file = './exp_vqa/eval_outputs/%s/%s' % (
    exp_name, eval_output_name)

# Make sure both output directories exist before anything is written.
for _output_path in (save_file, eval_output_file):
    os.makedirs(os.path.dirname(_output_path), exist_ok=True)

# ---- Layout assembler and the test-set reader ----
assembler = Assembler(vocab_layout_file)

_reader_options = dict(
    shuffle=False,
    one_pass=True,
    batch_size=N,
    T_encoder=T_encoder,
    T_decoder=T_decoder,
    assembler=assembler,
    vocab_question_file=vocab_question_file,
    vocab_answer_file=vocab_answer_file,
)
data_reader_tst = DataReader(imdb_file_tst, **_reader_options)

# ---- Vocabulary sizes taken from the loaded data ----
_batch_loader = data_reader_tst.batch_loader
num_vocab_txt = _batch_loader.vocab_dict.num_vocab
num_vocab_nmn = len(assembler.module_names)
num_choices = _batch_loader.answer_dict.num_vocab
class N2MNWrapper(object):
    """Inference-only wrapper around a pre-trained NMN3 VQA model.

    The constructor loads the vocabularies, builds the test-time graph and
    restores a snapshot.  Afterwards questions about a single COCO image can
    be answered with :meth:`inference`, :meth:`get_score` or
    :meth:`query_score`.

    Model hyper-parameters (``T_decoder``, ``embed_dim_txt``, ``lstm_dim``,
    ...) and feature dimensions (``H_feat``, ``W_feat``, ``D_feat``) are read
    from names defined outside this class.
    """

    def __init__(self):
        """Load vocabularies, build the graph and restore the snapshot."""
        # Maximum number of question tokens fed to the encoder.
        self.T_encoder = 26
        data_root = '/usr/data/fl302/code/n2nmn/exp_vqa/data'
        snapshot_file = '/usr/data/fl302/code/n2nmn/exp_vqa/tfmodel/vqa_rl_gt_layout/00040000'
        self.vocab_question_file = os.path.join(data_root,
                                                'vocabulary_vqa.txt')
        self.vocab_layout_file = os.path.join(data_root,
                                              'vocabulary_layout.txt')
        self.vocab_answer_file = os.path.join(data_root, 'answers_vqa.txt')

        self.vocab_dict = VocabDict(self.vocab_question_file)
        self.answer_dict = VocabDict(self.vocab_answer_file)
        self.answer_word_list = self.answer_dict.word_list
        self.assembler = Assembler(self.vocab_layout_file)

        num_vocab_txt = self.vocab_dict.num_vocab
        num_vocab_nmn = len(self.assembler.module_names)
        num_choices = self.answer_dict.num_vocab

        # Start the session BEFORE importing tensorflow_fold
        # to avoid taking up all GPU memory.
        # The session and the saver are created while the model graph is
        # the default graph so that they bind to it.
        with tf.Graph().as_default():
            self.input_seq_batch = tf.placeholder(tf.int32, [None, None])
            self.seq_length_batch = tf.placeholder(tf.int32, [None])
            self.image_feat_batch = tf.placeholder(
                tf.float32, [None, H_feat, W_feat, D_feat])
            self.expr_validity_batch = tf.placeholder(tf.bool, [None])

            # Build the model in inference mode (no dropout, no sampling).
            self.nmn3_model_tst = NMN3Model(
                self.image_feat_batch, self.input_seq_batch,
                self.seq_length_batch, T_decoder=T_decoder,
                num_vocab_txt=num_vocab_txt, embed_dim_txt=embed_dim_txt,
                num_vocab_nmn=num_vocab_nmn, embed_dim_nmn=embed_dim_nmn,
                lstm_dim=lstm_dim, num_layers=num_layers,
                assembler=self.assembler,
                encoder_dropout=False,
                decoder_dropout=False,
                decoder_sampling=False,
                num_choices=num_choices,
                use_qpn=use_qpn, qpn_dropout=False,
                reduce_visfeat_dim=reduce_visfeat_dim)

            self.sess = tf.Session(
                config=tf.ConfigProto(
                    gpu_options=tf.GPUOptions(allow_growth=True),
                    allow_soft_placement=False,
                    log_device_placement=False))

            snapshot_saver = tf.train.Saver(max_to_keep=None)
            snapshot_saver.restore(self.sess, snapshot_file)

    def _prepare_question(self, questions):
        """Encode question strings as a zero-padded, time-major index batch.

        Args:
            questions: sequence of question strings.

        Returns:
            ``(input_seq_batch, seq_length_batch)``: an int32 array of shape
            ``(T_encoder, len(questions))`` holding vocabulary indices
            (zero-padded), and an int32 array with the number of tokens
            actually stored for each question.

        Questions with more than ``T_encoder`` tokens are truncated to the
        first ``T_encoder`` tokens; previously they raised a NumPy
        broadcasting ``ValueError``.
        """
        actual_batch_size = len(questions)
        input_seq_batch = np.zeros((self.T_encoder, actual_batch_size),
                                   np.int32)
        seq_length_batch = np.zeros(actual_batch_size, np.int32)
        for n, question in enumerate(questions):
            question_inds = [
                self.vocab_dict.word2idx(w) for w in tokenize(question)
            ]
            # The encoder buffer has exactly T_encoder rows; drop the rest.
            question_inds = question_inds[:self.T_encoder]
            seq_length = len(question_inds)
            input_seq_batch[:seq_length, n] = question_inds
            seq_length_batch[n] = seq_length
        return input_seq_batch, seq_length_batch

    def _prepare_images(self, image_id, questions):
        """Load the cached features of one image, tiled once per question.

        Args:
            image_id: integer COCO image id (the ``val2014`` split is
                hard-coded in the file name).
            questions: sequence of questions; only its length is used.

        Returns:
            Array with a leading axis of size ``len(questions)`` followed by
            the transposed feature map.
        """
        num_tiles = len(questions)
        FEAT_ROOT = '/usr/data/fl302/data/VQA/ResNet152/resnet_res5c'
        filename = '%s2014/COCO_%s2014_%012d.jpg' % ('val', 'val', image_id)
        # Use the archive as a context manager so its file handle is closed.
        with np.load(os.path.join(FEAT_ROOT, filename + '.npz')) as archive:
            f = archive['x']
        # Move axis 0 last and add a leading batch axis.
        # NOTE(review): presumably (D, H, W) on disk -> (1, H, W, D); confirm.
        f = f.transpose((1, 2, 0))[np.newaxis, ...]
        return np.tile(f, [num_tiles, 1, 1, 1])

    def _run_model(self, image_batch, seq, seq_length):
        """Run layout prediction followed by the assembled module network.

        Args:
            image_batch: image features fed to ``image_feat_batch``.
            seq: question indices fed to ``input_seq_batch``.
            seq_length: question lengths fed to ``seq_length_batch``.

        Returns:
            The raw value of the model's ``scores`` tensor for the batch.
        """
        nmn3_model_tst = self.nmn3_model_tst

        # Declare all fetches and feeds of the two-stage partial run.
        h = self.sess.partial_run_setup(
            [nmn3_model_tst.predicted_tokens, nmn3_model_tst.scores],
            [self.input_seq_batch, self.seq_length_batch,
             self.image_feat_batch,
             nmn3_model_tst.compiler.loom_input_tensor,
             self.expr_validity_batch])

        # Part 0 & 1: Run Convnet and generate module layout
        tokens = self.sess.partial_run(
            h, nmn3_model_tst.predicted_tokens,
            feed_dict={
                self.input_seq_batch: seq,
                self.seq_length_batch: seq_length,
                self.image_feat_batch: image_batch,
            })

        # Assemble the layout tokens into network structure
        expr_list, expr_validity_array = self.assembler.assemble(tokens)

        # Build TensorFlow Fold input for NMN
        expr_feed = nmn3_model_tst.compiler.build_feed_dict(expr_list)
        expr_feed[self.expr_validity_batch] = expr_validity_array

        # Part 2: Run NMN on the assembled layouts
        return self.sess.partial_run(h, nmn3_model_tst.scores,
                                     feed_dict=expr_feed)

    def inference(self, image_id, questions):
        """Answer several questions about the same image.

        Args:
            image_id: integer COCO image id.
            questions: sequence of question strings.

        Returns:
            ``(pred_answers, scores)``: the predicted answer word for each
            question and the corresponding maximum score.
        """
        image_batch = self._prepare_images(image_id, questions)
        seq, seq_length = self._prepare_question(questions)
        scores_val = self._run_model(image_batch, seq, seq_length)

        scores_val[:, 0] = -1e10  # remove <unk> answer

        predictions = np.argmax(scores_val, axis=1)
        scores = np.max(scores_val, axis=1)
        pred_answers = [self.answer_word_list[p] for p in predictions]
        return pred_answers, scores

    def get_score(self, image_id, question):
        """Answer a single question; return ``(answer, score)``."""
        pred_answers, scores = self.inference(image_id, [question])
        sc = scores[0]
        pred_ans = pred_answers[0]
        return pred_ans, sc

    def query_score(self, image_id, question, answer):
        """Return the model score of a given answer for one question.

        Args:
            image_id: integer COCO image id.
            question: question string; occurrences of ``" 's "`` are
                rewritten to ``"'s "`` before encoding.
            answer: answer word, looked up in the answer vocabulary.

        Returns:
            The score of ``answer`` as a Python float.  Unlike
            :meth:`inference`, the ``<unk>`` score is not masked here.
        """
        question_new = question.replace(" 's ", "'s ")
        if question_new != question:
            print('Rephrase')
            question = question_new

        questions = [question]
        image_batch = self._prepare_images(image_id, questions)
        seq, seq_length = self._prepare_question(questions)
        scores_val = self._run_model(image_batch, seq, seq_length)

        # Batch size is one, so flattening yields the per-answer scores.
        scores = scores_val.flatten()
        idx = self.answer_dict.word2idx(answer)
        return float(scores[idx])