def __init__(self, imdb, data_params):
    self.imdb = imdb
    self.data_params = data_params
    self.vocab_dict = text_processing.VocabDict(
        data_params['vocab_question_file'])
    self.T_encoder = data_params['T_encoder']
    # peek one example to see whether answer and gt_layout are in the data
    self.load_answer = ('answer' in self.imdb[0]) and \
        (self.imdb[0]['answer'] is not None)
    self.load_gt_layout = ('gt_layout_tokens' in self.imdb[0]) and \
        (self.imdb[0]['gt_layout_tokens'] is not None)
    if 'load_gt_layout' in data_params:
        self.load_gt_layout = data_params['load_gt_layout']
    # the answer dict is always loaded, regardless of self.load_answer
    self.answer_dict = text_processing.VocabDict(
        data_params['vocab_answer_file'])
    if not self.load_answer:
        print('imdb does not contain answers')
    if self.load_gt_layout:
        self.T_decoder = data_params['T_decoder']
        self.assembler = data_params['assembler']
        self.prune_filter_module = (data_params['prune_filter_module']
                                    if 'prune_filter_module' in data_params
                                    else False)
    else:
        print('imdb does not contain ground-truth layout')
    # load one feature map to peek its size
    feats = np.load(self.imdb[0]['feature_path'])
    self.feat_H, self.feat_W, self.feat_D = feats.shape[1:]
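# Hedged usage sketch (not from the source): the reader above consumes only
# the data_params keys referenced in __init__, so a minimal configuration
# might look like the following. The file names, the sequence lengths, and
# the 'DataReader' class name are illustrative assumptions.
#
#   data_params = {
#       'vocab_question_file': './vocabulary_vqa.txt',  # assumed file name
#       'vocab_answer_file': './answers_vqa.txt',       # assumed file name
#       'T_encoder': 26,          # max question length (assumed value)
#       'T_decoder': 13,          # max layout length (assumed value)
#       'assembler': assembler,   # module-layout assembler instance
#       'prune_filter_module': False,
#   }
#   reader = DataReader(imdb, data_params)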
def __init__(self, scene_graph_file, vocab_name_file, vocab_attr_file,
             max_num):
    print('Loading scene graph from %s' % scene_graph_file)
    with open(scene_graph_file) as f:
        self.SGs = json.load(f)
    print('Done')
    self.name_dict = text_processing.VocabDict(vocab_name_file)
    self.attr_dict = text_processing.VocabDict(vocab_attr_file)
    self.num_name = self.name_dict.num_vocab
    self.num_attr = self.attr_dict.num_vocab
    self.max_num = max_num
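# Hedged usage sketch (file names are assumptions): elsewhere in this code
# the loader is constructed from data_params, along the lines of
#
#   loader = SceneGraphFeatureLoader(
#       'train_sceneGraphs.json',        # assumed scene-graph JSON dump
#       'vocabulary_name.txt',           # assumed object-name vocab file
#       'vocabulary_attr.txt',           # assumed attribute vocab file
#       max_num=100)
#
# self.SGs is keyed by image id; name_dict/attr_dict map object names and
# attributes to indices, and max_num caps the objects kept per image.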
def __init__(self, imdb, data_params):
    self.imdb = imdb
    self.data_params = data_params
    self.vocab_dict = text_processing.VocabDict(
        data_params['vocab_question_file'])
    self.T_encoder = data_params['T_encoder']
    # peek one example to see whether answer and gt_layout are in the data
    self.load_answer = ('answer' in self.imdb[0] and
                        self.imdb[0]['answer'] is not None)
    self.load_bbox = ('bbox' in self.imdb[0] and
                      self.imdb[0]['bbox'] is not None)
    self.load_gt_layout = (
        ('load_gt_layout' in data_params and data_params['load_gt_layout'])
        and ('gt_layout_tokens' in self.imdb[0] and
             self.imdb[0]['gt_layout_tokens'] is not None))
    # Jiangnan: image understanding
    self.load_question = 'load_question' in self.imdb[0]
    # Jiangnan: question understanding
    self.load_sent_percent = 'sent_percent' in self.imdb[0]
    # the answer dict is always loaded, regardless of self.load_answer
    self.answer_dict = text_processing.VocabDict(
        data_params['vocab_answer_file'])
    self.layout_dict = text_processing.VocabDict(
        data_params['vocab_layout_file'])
    if not self.load_answer:
        print('imdb does not contain answers')
    if not self.load_bbox:
        print('imdb does not contain bounding boxes')
    if self.load_gt_layout:
        self.T_decoder = data_params['T_decoder']
        # Prune multiple filter modules by default
        self.prune_filter_module = (data_params['prune_filter_module']
                                    if 'prune_filter_module' in data_params
                                    else True)
    else:
        print('imdb does not contain ground-truth layout')
    # load one feature map to peek its size
    feats = np.load(self.imdb[0]['feature_path'])
    self.feat_H, self.feat_W, self.feat_D = feats.shape[1:]
    if self.load_bbox:
        self.img_H = data_params['img_H']
        self.img_W = data_params['img_W']
        self.stride_H = self.img_H * 1. / self.feat_H
        self.stride_W = self.img_W * 1. / self.feat_W
def main(FLAGS):
    """Main function.

    1. Extracts vocabularies from questions and answers.
    2. Creates and saves image dialog databases for train | valid | test
       splits.

    Args:
        FLAGS: Command-line options.
    """
    # Read the dataset.
    with open(FLAGS.json_path) as file_id:
        data = json.load(file_id)

    # Extract vocabulary and answer list.
    save_vocabularies(data['trainExamples'], FLAGS)

    # Extract mean and std of train images.
    save_mean_std_image(FLAGS)

    # Read the vocabulary files (questions | answers) and create objects.
    vocab = text_processing.VocabDict(FLAGS.vocab_save_path)
    with open(FLAGS.answers_save_path, 'r') as file_id:
        ans_list = [ii.strip('\n') for ii in file_id.readlines()]

    # Data splits.
    for split in ['train', 'valid', 'test']:
        imdb_split = build_imdb(data, split, vocab, ans_list, FLAGS)
        save_path = os.path.join(FLAGS.imdb_save_path, 'imdb_%s.npy' % split)
        print('Saving imdb build: %s' % save_path)
        np.save(save_path, np.array(imdb_split))
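# Hedged driver sketch (not from the source): main() reads the FLAGS
# attributes used above, so a minimal argparse entry point could look like
# this. Default paths are illustrative assumptions, and the helper
# functions (save_vocabularies, save_mean_std_image, build_imdb) may
# require additional flags not shown here.
#
#   if __name__ == '__main__':
#       import argparse
#       parser = argparse.ArgumentParser()
#       parser.add_argument('--json_path', default='data/dataset.json')
#       parser.add_argument('--vocab_save_path', default='data/vocab.txt')
#       parser.add_argument('--answers_save_path', default='data/answers.txt')
#       parser.add_argument('--imdb_save_path', default='data/imdb')
#       main(parser.parse_args())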
def __init__(self, imdb, data_params):
    self.imdb = imdb
    self.data_params = data_params
    self.vocab_dict = text_processing.VocabDict(
        data_params['vocab_question_file'])
    self.T_encoder = data_params['T_encoder']
    # peek one example to see whether answer is in the data
    self.load_answer = ('answer' in self.imdb[0])
    # peek one example to see whether bbox is in the data
    self.load_bbox = ('bbox' in self.imdb[0])
    # the answer dict is always loaded, regardless of self.load_answer
    self.answer_dict = text_processing.VocabDict(
        data_params['vocab_answer_file'])
    if not (self.load_answer or self.load_bbox):
        print('imdb has no answer labels or bbox. Using dummy labels.\n\n'
              '**The final accuracy will be zero (no labels provided)**\n')

    # positional encoding
    self.add_pos_enc = data_params.get('add_pos_enc', False)
    self.pos_enc_dim = data_params.get('pos_enc_dim', 0)
    assert self.pos_enc_dim % 4 == 0, \
        'positional encoding dim must be a multiple of 4'
    self.pos_enc_scale = data_params.get('pos_enc_scale', 1.)

    self.load_spatial_feature = data_params['load_spatial_feature']
    if self.load_spatial_feature:
        spatial_feature_dir = data_params['spatial_feature_dir']
        self.spatial_loader = SpatialFeatureLoader(spatial_feature_dir)
        # load one feature map to peek its size
        x = self.spatial_loader.load_feature(self.imdb[0]['imageId'])
        self.spatial_D, self.spatial_H, self.spatial_W = x.shape
        # positional encoding
        self.pos_enc = self.pos_enc_scale * get_positional_encoding(
            self.spatial_H, self.spatial_W, self.pos_enc_dim)

    if self.load_bbox:
        self.img_H = data_params['img_H']
        self.img_W = data_params['img_W']
        self.stride_H = self.img_H * 1. / self.spatial_H
        self.stride_W = self.img_W * 1. / self.spatial_W
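# Hedged sketch (an assumption, not the repo's get_positional_encoding):
# one common construction of the (H, W, pos_enc_dim) map consumed above is
# a 2-D sinusoidal code with dim/4 frequencies each for sin/cos over the
# y- and x-axes -- which is why pos_enc_dim must be a multiple of 4.
# Relies on numpy being imported as np, as elsewhere in this file.
def get_positional_encoding_sketch(H, W, dim):
    assert dim % 4 == 0, 'dim must be a multiple of 4'
    num_freqs = dim // 4
    # geometric frequency ladder, as in transformer positional encodings
    freqs = 1. / (10000. ** (np.arange(num_freqs) / num_freqs))
    y = np.tile(np.arange(H)[:, None, None] * freqs, (1, W, 1))  # (H, W, dim/4)
    x = np.tile(np.arange(W)[None, :, None] * freqs, (H, 1, 1))  # (H, W, dim/4)
    # concatenate sin/cos over both axes -> (H, W, dim)
    return np.concatenate(
        [np.sin(x), np.cos(x), np.sin(y), np.cos(y)], axis=-1)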
def __init__(self, imdb, data_params):
    self.imdb = imdb
    self.data_params = data_params
    self.vocab_dict = text_processing.VocabDict(
        data_params['vocab_question_file'])
    self.T_encoder = data_params['T_encoder']
    # peek one example to see whether answer and gt_layout are in the data
    self.load_answer = ('valid_answers' in self.imdb[0] and
                        self.imdb[0]['valid_answers'])
    self.load_gt_layout = (
        ('load_gt_layout' in data_params and data_params['load_gt_layout'])
        and ('gt_layout_tokens' in self.imdb[0] and
             self.imdb[0]['gt_layout_tokens'] is not None))
    # the answer dict is always loaded, regardless of self.load_answer
    self.answer_dict = text_processing.VocabDict(
        data_params['vocab_answer_file'])
    if not self.load_answer:
        print('imdb does not contain answers')
    self.T_decoder = data_params['T_decoder']
    self.layout_dict = text_processing.VocabDict(
        data_params['vocab_layout_file'])
    if self.load_gt_layout:
        # Prune multiple filter modules by default
        self.prune_filter_module = (data_params['prune_filter_module']
                                    if 'prune_filter_module' in data_params
                                    else True)
    else:
        print('imdb does not contain ground-truth layout')
    # Whether to load soft scores (targets for sigmoid regression)
    self.load_soft_score = ('load_soft_score' in data_params and
                            data_params['load_soft_score'])

    # load one feature map to peek its size
    feats = np.load(self.imdb[0]['feature_path'])
    self.feat_H, self.feat_W, self.feat_D = feats.shape[1:]
def __init__(self, num_vocab, num_choices):
    super().__init__()
    if cfg.INIT_WRD_EMB_FROM_FILE:
        embeddingsInit = np.load(cfg.WRD_EMB_INIT_FILE)  # 2956 * 300
        assert embeddingsInit.shape == (num_vocab - 1, cfg.WRD_EMB_DIM)
    else:
        embeddingsInit = np.random.randn(num_vocab - 1, cfg.WRD_EMB_DIM)
    self.num_vocab = num_vocab  # 2957
    self.num_choices = num_choices  # 1845
    self.tokenizer = BertTokenizer.from_pretrained(
        '/home/xdjf/bert_config/bert-base-uncased')
    self.model = BertModel.from_pretrained(
        '/home/xdjf/bert_config/bert-base-uncased')
    self.name_dict = text_processing.VocabDict(cfg.VOCAB_NAME_FILE)
    name_embedding = self.reset_name_embedding()
    self.encoder = Encoder(embeddingsInit, name_embedding)
    self.lcgn = LCGN()
    # self.sema_lcgn = SemanLCGN()
    self.single_hop = SingleHop()
    self.classifier = Classifier(num_choices)
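# Hedged usage note (sizes taken from the inline comments above; the
# wrapper-class name is an assumption): the model is built with the
# dataset's vocabulary and answer-choice sizes, e.g.
#
#   model = LCGNnet(num_vocab=2957, num_choices=1845)
#
# Note the word-embedding table has num_vocab - 1 rows, presumably because
# one special token's embedding is initialized separately.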
import json
import os
import sys; sys.path.append('../../')  # NOQA
from collections import Counter

from util import text_processing

vocab_answer_file = './answers_vqa.txt'
annotation_file = '../vqa_dataset/Annotations/mscoco_%s_annotations.json'
question_file = '../vqa_dataset/Questions/OpenEnded_mscoco_%s_questions.json'
gt_layout_file = './gt_layout_%s_new_parse.npy'

image_dir = '../coco_dataset/images/%s/'
feature_dir = './resnet152_c5_7x7/%s/'

answer_dict = text_processing.VocabDict(vocab_answer_file)
valid_answer_set = set(answer_dict.word_list)


def extract_answers(q_answers):
    all_answers = [answer["answer"] for answer in q_answers]
    valid_answers = [a for a in all_answers if a in valid_answer_set]
    # build soft scores
    soft_score_inds = []
    soft_score_target = []
    valid_answer_counter = Counter(valid_answers)
    for k, v in valid_answer_counter.items():
        soft_score_inds.append(answer_dict.word2idx(k))
        soft_score_target.append(min(1., v / 3.))
    return all_answers, valid_answers, soft_score_inds, soft_score_target
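# Hedged worked example (not part of the original script): the soft score
# for each answer is min(1, count / 3), so an answer given by three or more
# annotators gets a full target of 1.0. Assuming 'yes', 'no', and 'maybe'
# are all in valid_answer_set:
#
#   demo = [{'answer': 'yes'}] * 7 + [{'answer': 'no'}] * 2 + \
#          [{'answer': 'maybe'}]
#   _, valid, inds, target = extract_answers(demo)
#   # 'yes'   occurs 7x -> min(1., 7/3.) = 1.0
#   # 'no'    occurs 2x -> min(1., 2/3.) ~= 0.67
#   # 'maybe' occurs 1x -> min(1., 1/3.) ~= 0.33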
def __init__(self, imdb, data_params):
    self.imdb = imdb
    self.data_params = data_params
    self.vocab_dict = text_processing.VocabDict(
        data_params['vocab_question_file'])
    self.T_encoder = data_params['T_encoder']
    # peek one example to see whether answer and gt_layout are in the data
    self.load_answer = ('valid_answers' in self.imdb[0]) and \
        (self.imdb[0]['valid_answers'] is not None)
    self.load_gt_layout = ('gt_layout_tokens' in self.imdb[0]) and \
        (self.imdb[0]['gt_layout_tokens'] is not None)
    if 'load_gt_layout' in data_params:
        self.load_gt_layout = data_params['load_gt_layout']
    # decide whether or not to load gt textatt
    self.load_gt_txtatt = ('gt_txtatt' in self.imdb[0]) and \
        (self.imdb[0]['gt_txtatt'] is not None)
    if 'load_gt_txtatt' in data_params:
        self.load_gt_txtatt = data_params['load_gt_txtatt']
    # the answer dict is always loaded, regardless of self.load_answer
    self.answer_dict = text_processing.VocabDict(
        data_params['vocab_answer_file'])
    self.num_choices = self.answer_dict.num_vocab
    if not self.load_answer:
        print('imdb does not contain answers')
    else:
        self.load_binary_labels = ('load_binary_labels' in data_params) \
            and data_params['load_binary_labels']
        if self.load_binary_labels:
            print('loading softmax and binary classification labels.')
        else:
            print('loading softmax labels (but not binary labels).')

    # if 'overriding_layout' is set in data_params, force self.load_gt_layout
    # to True and override the ground-truth layout
    self.overriding_layout = None
    if 'overriding_layout' in data_params:
        print('"overriding_layout" key is set in data_params')
        print('overriding all layout with:',
              data_params['overriding_layout'])
        self.load_gt_layout = True
        self.load_gt_txtatt = False
        self.overriding_layout = data_params['overriding_layout']

    if self.load_gt_layout:
        self.T_decoder = data_params['T_decoder']
        self.assembler = data_params['assembler']
        # self.prune_filter_module = (data_params['prune_filter_module']
        #                             if 'prune_filter_module' in data_params
        #                             else False)
    else:
        print('imdb does not contain ground-truth layout, and '
              '"overriding_layout" key is not set')

    if 'use_count_module' in data_params and data_params['use_count_module']:
        print('Use Count module: all "how many" questions will use Count '
              'for answer')
        self.use_count_module = True
    else:
        print('Not using Count module')
        self.use_count_module = False

    # load one feature map to peek its size
    feats = np.load(self.imdb[0]['feature_path'])
    self.feat_H, self.feat_W, self.feat_D = feats.shape[1:]
def __init__(self, imdb, params):
    """Initialize by reading the data and pre-processing it."""
    self.imdb = imdb
    self.params = params
    self.num_inst = len(self.imdb['data'])
    self.num_rounds = len(self.imdb['data'][0]['question_ind'])

    # load vocabulary
    vocab_path = params['text_vocab_path']
    self.vocab_dict = text_processing.VocabDict(vocab_path)
    self.T_encoder = params['max_enc_len']

    # record special token ids
    self.start_token_id = self.vocab_dict.word2idx('<start>')
    self.end_token_id = self.vocab_dict.word2idx('<end>')
    self.pad_token_id = self.vocab_dict.word2idx('<pad>')

    # Load answers
    with open(params['args']['answer_list_path'], 'r') as file_id:
        choices = [ii.strip('\n') for ii in file_id.readlines()]
    self.num_choices = len(choices)
    self.choices2ind = {ii: index for index, ii in enumerate(choices)}
    self.ind2choices = {index: ii for index, ii in enumerate(choices)}

    # peek one example to see whether answer and gt_layout are in the data
    test_data = self.imdb['data'][0]
    self.load_gt_layout = test_data.get('gt_layout_tokens', False)
    if 'load_gt_layout' in params:
        self.load_gt_layout = params['load_gt_layout']

    if self.load_gt_layout:
        self.T_decoder = params['max_dec_len']
        self.assembler = params['assembler']

    # load the mean of the images
    load_path = params['path'].split('/')[:-1] + ['train_image_mean.npy']
    load_path = '/'.join(load_path)
    print('Loading training image stats from: ' + load_path)
    # the .npy file holds a pickled dict, so allow_pickle is required
    img_stats = np.load(load_path, allow_pickle=True)[()]
    mean_img = img_stats['mean_img'].reshape([1, 1, -1])
    std_img = img_stats['std_img'].reshape([1, 1, -1])

    # read all the images
    images = {}
    print('Reading images..')
    # TODO: Change this back! (currently reading only every third datum)
    for datum in progressbar(self.imdb['data'][::3]):
        img_path = datum['image_path']
        if img_path not in images:
            cur_img = support.load_image(img_path)
            cur_img = (cur_img - mean_img) / std_img
            images[img_path] = cur_img
    self.images = images

    # get the shape from an arbitrary image
    for _, sample in self.images.items():
        self.img_size = sample.shape
        break

    # convert to tokens
    self.digitizer = lambda x: [self.vocab_dict.word2idx(w) for w in x]

    # use history if needed by the program generator
    self.use_history = self.params['generator'] == 'mem'
    if self.use_history:
        self._construct_history()

    # if fact is to be used
    if self.params['use_fact']:
        self._construct_fact()
def __init__(self, imdb, data_params):
    self.imdb = imdb
    self.data_params = data_params
    self.vocab_dict = text_processing.VocabDict(
        data_params['vocab_question_file'])
    self.T_encoder = data_params['T_encoder']
    self.N_encoder = data_params['N_encoder']
    self.O_encoder = data_params['O_encoder']
    # peek one example to see whether answer is in the data
    self.load_answer = ('answer' in self.imdb[0])
    # the answer dict is always loaded, regardless of self.load_answer
    self.answer_dict = text_processing.VocabDict(
        data_params['vocab_answer_file'])
    if not self.load_answer:
        print('imdb has no answer labels. Using dummy labels.\n\n'
              '**The final accuracy will be zero (no labels provided)**\n')
    # self.nlp = spacy.load('en_core_web_lg')

    # positional encoding
    self.add_pos_enc = data_params.get('add_pos_enc', False)
    self.pos_enc_dim = data_params.get('pos_enc_dim', 0)
    assert self.pos_enc_dim % 4 == 0, \
        'positional encoding dim must be a multiple of 4'
    self.pos_enc_scale = data_params.get('pos_enc_scale', 1.)

    self.load_spatial_feature = False
    self.load_objects_feature = False
    self.load_scene_graph_feature = True
    feature_type = data_params['feature_type']
    if feature_type == 'spatial':
        self.load_spatial_feature = True
    elif feature_type == 'objects':
        self.load_objects_feature = True
    elif feature_type == 'scene_graph':
        self.load_scene_graph_feature = True
    else:
        raise ValueError('Unknown feature type: %s' % feature_type)

    if self.load_spatial_feature:
        spatial_feature_dir = data_params['spatial_feature_dir']
        self.spatial_loader = SpatialFeatureLoader(spatial_feature_dir)
        # load one feature map to peek its size
        x = self.spatial_loader.load_feature(self.imdb[0]['imageId'])
        self.spatial_D, self.spatial_H, self.spatial_W = x.shape
        # positional encoding
        self.pos_enc = self.pos_enc_scale * get_positional_encoding(
            self.spatial_H, self.spatial_W, self.pos_enc_dim)

    if self.load_objects_feature:
        objects_feature_dir = data_params['objects_feature_dir']
        self.objects_loader = ObjectsFeatureLoader(objects_feature_dir)
        # load one feature map to peek its size
        self.objects_M = data_params.get('objects_max_num', 100)
        x, _ = self.objects_loader.load_feature(self.imdb[0]['imageId'])
        _, self.objects_D = x.shape

    if self.load_scene_graph_feature:
        scene_graph_file = data_params['scene_graph_file']
        vocab_name_file = data_params['vocab_name_file']
        vocab_attr_file = data_params['vocab_attr_file']
        self.objects_M = data_params.get('objects_max_num', 100)
        self.scene_graph_loader = SceneGraphFeatureLoader(
            scene_graph_file, vocab_name_file, vocab_attr_file,
            max_num=self.objects_M)
        if feature_type == 'scene_graph':
            # load one feature map to peek its size
            x, _, _ = self.scene_graph_loader.load_feature_normalized_bbox(
                self.imdb[0]['imageId'])
            _, self.objects_D = x.shape
        else:
            self.load_scene_graph_feature = False

    self.se_max_len = -1
    self.se_zero_len = 0
    self.se_count = Counter()
    self.stop_words = ['of', 'the', 'to', 'on', 'in', 'at', 'a', 'and']
def __init__(self, imdb, params):
    """Initialize by reading the data and pre-processing it."""
    self.imdb = imdb
    self.params = params
    self.fetch_options = self.params.get('fetch_options', False)
    self.preload_features = params['preload_features']
    self.num_inst = len(self.imdb['data'])
    self.num_rounds = len(self.imdb['data'][0]['question_ind'])

    # check if vgg features are to be used
    self.use_vgg = 'vgg' in self.params['feature_path']

    # load vocabulary
    vocab_path = params['text_vocab_path']
    self.vocab_dict = text_processing.VocabDict(vocab_path)
    self.T_encoder = params['max_enc_len']

    # record special token ids
    self.start_token_id = self.vocab_dict.word2idx('<start>')
    self.end_token_id = self.vocab_dict.word2idx('<end>')
    self.pad_token_id = self.vocab_dict.word2idx('<pad>')

    # peek one example to see whether answer and gt_layout are in the data
    test_data = self.imdb['data'][0]
    self.load_gt_layout = test_data.get('gt_layout_tokens', False)
    if 'load_gt_layout' in params:
        self.load_gt_layout = params['load_gt_layout']

    # decide whether or not to load gt textatt
    self.supervise_attention = params['supervise_attention']
    self.T_decoder = params['max_dec_len']
    self.assembler = params['assembler']

    # load one feature map to peek its size
    feats = np.load(self._adjust_image_dir(test_data['feature_path']))
    self.feat_H, self.feat_W, self.feat_D = feats.shape[1:]

    # convert to tokens
    self.digitizer = lambda x: [self.vocab_dict.word2idx(w) for w in x]

    if 'prog' in self.params['model']:
        # preload features
        if self.preload_features:
            img_paths = set([ii['feature_path'] for ii in self.imdb['data']])
            self.img_feats = {ii: np.load(ii)
                              for ii in progressbar(img_paths)}

        # if VGG is to be used
        if self.use_vgg:
            # inform the dataloader to use self.img_feats
            self.preload_features = True
            img_paths = set([ii['feature_path'] for ii in self.imdb['data']])

            # first read the index file
            index_file = os.path.join(self.params['input_img'], 'img_id.json')
            with open(index_file, 'r') as file_id:
                index_data = json.load(file_id)

            # get the split -- either train / val
            # (grab an arbitrary path from the set to read its split)
            for ii in img_paths:
                break
            split = ii.split('/')[-2][:-4]

            # read the features for that particular split
            self.img_index = {img_id: index
                              for index, img_id in
                              enumerate(index_data[split])}
            feature_file = os.path.join(self.params['input_img'],
                                        'data_img_%s.h5' % split)
            key = 'images_test' if split == 'val' else 'images_train'
            self.img_feats = h5py.File(feature_file, 'r')[key]

            # check if all the images in img_paths are in img_index
            count = 0
            for ii in img_paths:
                img_id = '/'.join(ii.split('/')[-2:])
                if img_id.replace('npy', 'jpg') not in self.img_index:
                    count += 1
            print('Missing: %d image features' % count)

            # adjust the feature sizes
            self.feat_H, self.feat_W, self.feat_D = self.img_feats.shape[1:]
            self.zero_feature = np.zeros((1,) + self.img_feats.shape[1:])

    # use history if needed by the program generator
    self.use_history = self.params['generator'] == 'mem'
    if self.use_history:
        self._construct_history()

    # if fact is to be used
    if self.params['use_fact']:
        self._construct_fact()
def __init__(self, imdb, data_params):
    self.imdb = imdb
    self.data_params = data_params
    self.vocab_dict = text_processing.VocabDict(
        data_params['vocab_question_file'])
    self.T_encoder = data_params['T_encoder']
    # peek one example to see whether answer is in the data
    self.load_answer = ('answer' in self.imdb[0])
    # the answer dict is always loaded, regardless of self.load_answer
    self.answer_dict = text_processing.VocabDict(
        data_params['vocab_answer_file'])
    if not self.load_answer:
        print('imdb does not contain answers')

    self.load_spatial_feature = False
    self.load_objects_feature = False
    self.load_scene_graph_feature = False
    feature_type = data_params['feature_type']
    if feature_type == 'spatial':
        self.load_spatial_feature = True
    elif feature_type == 'objects':
        self.load_objects_feature = True
    elif feature_type == 'scene_graph':
        self.load_scene_graph_feature = True
    else:
        raise ValueError('Unknown feature type: %s' % feature_type)

    if self.load_spatial_feature:
        spatial_feature_dir = data_params['spatial_feature_dir']
        self.spatial_loader = SpatialFeatureLoader(spatial_feature_dir)
        # load one feature map to peek its size
        x = self.spatial_loader.load_feature(self.imdb[0]['imageId'])
        self.spatial_D, self.spatial_H, self.spatial_W = x.shape
        # positional encoding
        self.spatial_pos_enc_dim = data_params['spatial_pos_enc_dim']
        self.pos_enc = get_positional_encoding(
            self.spatial_H, self.spatial_W, self.spatial_pos_enc_dim)

    if self.load_objects_feature:
        objects_feature_dir = data_params['objects_feature_dir']
        self.objects_M = data_params['objects_max_num']
        self.objects_loader = ObjectsFeatureLoader(objects_feature_dir)
        # load one feature map to peek its size
        x, _ = self.objects_loader.load_feature(self.imdb[0]['imageId'])
        _, self.objects_D = x.shape
        self.bbox_tile_num = data_params['bbox_tile_num']

    if self.load_scene_graph_feature:
        scene_graph_file = data_params['scene_graph_file']
        vocab_name_file = data_params['vocab_name_file']
        vocab_attr_file = data_params['vocab_attr_file']
        self.objects_M = data_params['objects_max_num']
        self.scene_graph_loader = SceneGraphFeatureLoader(
            scene_graph_file, vocab_name_file, vocab_attr_file,
            max_num=self.objects_M)
        # load one feature map to peek its size
        x, _, _ = self.scene_graph_loader.load_feature_normalized_bbox(
            self.imdb[0]['imageId'])
        _, self.objects_D = x.shape
        self.bbox_tile_num = data_params['bbox_tile_num']
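# Hedged config sketch (values and file names are illustrative): exactly one
# loader above is selected by data_params['feature_type'], e.g.
#
#   data_params = {
#       'vocab_question_file': 'vocabulary_gqa.txt',   # assumed file name
#       'vocab_answer_file': 'answers_gqa.txt',        # assumed file name
#       'T_encoder': 30,                               # assumed max length
#       'feature_type': 'objects',  # 'spatial' | 'objects' | 'scene_graph'
#       'objects_feature_dir': './objects_features/',  # assumed path
#       'objects_max_num': 100,
#       'bbox_tile_num': 16,                           # assumed value
#   }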
def build_imdb(FLAGS):
    """Method to construct and save the image-database for the dataset."""
    print('Building imdb for visdial split: %s' % FLAGS.visdial_file)
    # the .npy files hold pickled dicts, so allow_pickle is required
    qid2layout_dict = np.load(FLAGS.ques_prog_file, allow_pickle=True)[()]
    ques_att_file = FLAGS.ques_prog_file.replace('.layout', '.attention')
    ques_prog_att = np.load(ques_att_file, allow_pickle=True)[()]
    cap_progs = np.load(FLAGS.cap_prog_file, allow_pickle=True)[()]
    cap_att_file = FLAGS.cap_prog_file.replace('.layout', '.attention')
    cap_prog_att = np.load(cap_att_file, allow_pickle=True)[()]
    vocab = text_processing.VocabDict(FLAGS.vocab_file)

    # load the data
    with open(FLAGS.visdial_file, 'r') as file_id:
        vd_data = json.load(file_id)

    # load the reference data
    with open(FLAGS.coreference_file, 'r') as file_id:
        references = json.load(file_id)
    references = references['data']['dialogs']

    # coco_name = img_split + '2014'
    # img_root = os.path.abspath(image_dir % coco_name)
    # feat_root = os.path.abspath(feature_dir % coco_name)
    # img_name_format = 'COCO_' + coco_name + '_%012d'

    # process and tokenize all questions and answers
    tokenizer = lambda x, suff: [
        vocab.word2idx(ii)
        for ii in word_tokenize(clean.clean_non_ascii(x + suff))]

    print('Tokenizing captions')
    caption_list = [ii['caption'] for ii in vd_data['data']['dialogs']]
    clean_cap = [tokenizer(cap, '') for cap in progressbar(caption_list)]
    max_cap_len = max([len(ii) for ii in clean_cap])
    cap_tokens = np.zeros((len(clean_cap), max_cap_len)).astype('int32')
    cap_tokens.fill(vocab.word2idx('<pad>'))
    cap_lens = np.zeros(len(clean_cap)).astype('int32')
    for q_id, tokens in progressbar(enumerate(clean_cap)):
        cap_lens[q_id] = len(tokens)
        cap_tokens[q_id, :cap_lens[q_id]] = np.array(tokens)

    print('Tokenizing questions')
    question_list = vd_data['data']['questions']
    clean_ques = [tokenizer(ques, '?') for ques in progressbar(question_list)]
    max_ques_len = max([len(ii) for ii in clean_ques])
    ques_tokens = np.zeros((len(clean_ques), max_ques_len)).astype('int32')
    ques_tokens.fill(vocab.word2idx('<pad>'))
    ques_lens = np.zeros(len(clean_ques)).astype('int32')
    for q_id, tokens in progressbar(enumerate(clean_ques)):
        ques_lens[q_id] = len(tokens)
        ques_tokens[q_id, :ques_lens[q_id]] = np.array(tokens)

    print('Tokenizing answers')
    answer_list = vd_data['data']['answers']
    clean_ans = [tokenizer(ans, '') for ans in progressbar(answer_list)]
    max_ans_len = max([len(ii) for ii in clean_ans])
    ans_tokens = np.zeros((len(clean_ans), max_ans_len)).astype('int32')
    ans_tokens.fill(vocab.word2idx('<pad>'))
    ans_lens = np.zeros(len(clean_ans)).astype('int32')

    ans_in = np.zeros((len(clean_ans), max_ans_len + 1)).astype('int32')
    ans_out = np.zeros((len(clean_ans), max_ans_len + 1)).astype('int32')
    ans_in.fill(vocab.word2idx('<pad>'))
    ans_out.fill(vocab.word2idx('<pad>'))
    start_token_id = vocab.word2idx('<start>')
    end_token_id = vocab.word2idx('<end>')
    ans_in[:, 0] = start_token_id
    for a_id, tokens in progressbar(enumerate(clean_ans)):
        ans_lens[a_id] = len(tokens)
        answer = np.array(tokens)
        ans_tokens[a_id, :ans_lens[a_id]] = answer
        ans_in[a_id, 1:ans_lens[a_id] + 1] = answer
        ans_out[a_id, :ans_lens[a_id]] = answer
        ans_out[a_id, ans_lens[a_id]] = end_token_id
    ans_lens += 1

    imdb = {}
    # number of entries in the database
    num_dialogs = len(vd_data['data']['dialogs'])
    imdb['data'] = [None] * num_dialogs
    imdb['ans'], imdb['ans_len'] = ans_tokens, ans_lens
    imdb['ans_in'], imdb['ans_out'] = ans_in, ans_out
    imdb['ques'], imdb['ques_len'] = ques_tokens, ques_lens
    imdb['cap'], imdb['cap_len'] = cap_tokens, cap_lens
    imdb['cap_prog'], imdb['cap_prog_att'] = cap_progs, np.array(cap_prog_att)

    for dialog_id, datum in progressbar(enumerate(vd_data['data']['dialogs'])):
        img_id = datum['image_id']
        img_path = FLAGS.image_path_format % img_id
        feat_path = FLAGS.feature_path % img_id

        # compact bundle with all the information
        bundle = {'image_name': img_id,
                  'image_path': img_path,
                  'feature_path': feat_path,
                  'caption_ind': dialog_id,
                  'question_id': [],
                  'question_ind': [],
                  'answer_ind': [],
                  'option_ind': [],
                  'gt_ind': [],
                  'gt_layout_tokens': [],
                  'gt_layout_att': []}

        # reference datum
        refer_datum = references[dialog_id]
        assert refer_datum['image_id'] == img_id

        # for each cluster, get the first mention
        clusters = {}
        caption_clusters = (refer_datum['caption_reference_clusters'] +
                            refer_datum['caption_coref_clusters'])
        for ii in caption_clusters:
            c_id = ii['cluster_id']
            clusters[c_id] = clusters.get(c_id, 'c')

        # each round
        for r_id in range(10):  # assuming 10 rounds for now
            referrer = refer_datum['dialog'][r_id]
            for ii in referrer['question_reference_clusters']:
                c_id = ii['cluster_id']
                clusters[c_id] = clusters.get(c_id, 'q%d' % r_id)
            for ii in referrer['answer_reference_clusters']:
                c_id = ii['cluster_id']
                # to distinguish answer
                clusters[c_id] = clusters.get(c_id, 'a%d' % r_id)

        # bundle questions in a conversation together
        num_refers = 0
        for r_id, round_data in enumerate(datum['dialog']):
            q_id = img_id * 10 + r_id
            bundle['question_id'].append(q_id)
            bundle['question_ind'].append(round_data['question'])
            bundle['answer_ind'].append(round_data['answer'])
            bundle['option_ind'].append(round_data['answer_options'])
            bundle['gt_ind'].append(round_data['gt_index'])

            # gt attention for parsed layout
            attention = np.array(ques_prog_att[round_data['question']])

            # check if references is non-empty and replace with _Refer
            layout = copy.deepcopy(list(qid2layout_dict[q_id]))
            referrer = refer_datum['dialog'][r_id]['question_referrer_clusters']
            if len(referrer) > 0:
                refer = referrer[0]
                # pick the _Find module with max attention overlap
                max_overlap = (0, 0)
                for pos, token in enumerate(layout):
                    if token == '_Find':
                        start = max(attention[pos][0], refer['start_word'])
                        end = min(attention[pos][1], refer['end_word'])
                        # interval overlap is non-negative; max (not min)
                        # keeps disjoint spans at 0 instead of negative
                        overlap = max(0, end - start)
                        if max_overlap[1] < overlap:
                            max_overlap = (pos, overlap)

                # reset it to _Refer
                pos, _ = max_overlap
                layout[pos] = '_Refer'
                attention[pos] = [refer['start_word'], refer['end_word']]
                # get that cluster id, and corresponding history attention
                num_refers += 1
            bundle['gt_layout_tokens'].append(layout)

            # zero out attention spans that cover only a single stop word
            # ('the', 'a', 'of', 'you', ...)
            ques_tokens = imdb['ques'][round_data['question']]
            ques_words = [vocab.idx2word(ii) for ii in ques_tokens]
            for index, pos in enumerate(attention):
                try:
                    if (pos[1] - pos[0]) == 1 and \
                            ques_words[pos[0]] in stop_words:
                        attention[index] = [0, 0]
                except:
                    pdb.set_trace()
            bundle['gt_layout_att'].append(attention)

        # record
        imdb['data'][dialog_id] = bundle
    return imdb
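# Hedged helper sketch (not in the original): the _Refer selection above
# scores each _Find module by the size of the intersection between its
# attention span and the referring span. The same computation, standalone:
def _span_overlap(span_a, span_b):
    """Return the length of the intersection of two [start, end) spans."""
    start = max(span_a[0], span_b[0])
    end = min(span_a[1], span_b[1])
    return max(0, end - start)  # disjoint spans overlap by 0, never negative

# e.g. _span_overlap([2, 6], [4, 9]) == 2 and _span_overlap([0, 2], [5, 7]) == 0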