def __init__(self, device, NUM_FRAMES_PER_STEP=5, DETECTIONS_PER_FRAME=20):
    super(ModelFC, self).__init__()
    self.device = device
    self.NUM_FRAMES_PER_STEP = NUM_FRAMES_PER_STEP
    self.DETECTIONS_PER_FRAME = DETECTIONS_PER_FRAME
    self.DETECTIONS_PER_STEP = self.NUM_FRAMES_PER_STEP * self.DETECTIONS_PER_FRAME

    self.lxmert_tokenizer = LxmertTokenizer.from_pretrained(
        "unc-nlp/lxmert-base-uncased")
    # self.NULL, self.ENTITY and self.ACTION are special-token strings,
    # presumably defined as class-level constants elsewhere.
    self.lxmert_tokenizer.add_special_tokens({
        "additional_special_tokens": [self.NULL, self.ENTITY, self.ACTION]
    })
    self.lxmert_tokenizer.encode([self.NULL, self.ENTITY, self.ACTION])  # return value unused
    self.NULL_TOKEN = self.lxmert_tokenizer.convert_tokens_to_ids(self.NULL)
    self.ENTITY_TOKEN = self.lxmert_tokenizer.convert_tokens_to_ids(self.ENTITY)
    self.ACTION_TOKEN = self.lxmert_tokenizer.convert_tokens_to_ids(self.ACTION)

    self.lxmert = LxmertModel.from_pretrained("unc-nlp/lxmert-base-uncased")
    self.lxmert.to(device)

    self.VG = LxmertVGHead(self.lxmert.config, self.DETECTIONS_PER_STEP)
    self.VG.to(device)
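# A self-contained sketch of the standard HuggingFace pattern the __init__
# above relies on: after add_special_tokens(), the new ids fall outside the
# model's original vocabulary, so the word-embedding matrix is typically
# resized. The token strings below are placeholders for self.NULL /
# self.ENTITY / self.ACTION, which are not shown in the snippet.
tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
tokenizer.add_special_tokens({
    "additional_special_tokens": ["[NULL]", "[ENTITY]", "[ACTION]"]
})
model = LxmertModel.from_pretrained("unc-nlp/lxmert-base-uncased")
model.resize_token_embeddings(len(tokenizer))  # grow embeddings for the 3 new ids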
def __init__(self, COCO_VAL_PATH):
    self.COCO_VAL_PATH = COCO_VAL_PATH
    self.vqa_answers = utils.get_data(VQA_URL)

    # load models and model components
    self.frcnn_cfg = utils.Config.from_pretrained(
        "unc-nlp/frcnn-vg-finetuned")
    self.frcnn_cfg.MODEL.DEVICE = "cuda"
    self.frcnn = GeneralizedRCNN.from_pretrained(
        "unc-nlp/frcnn-vg-finetuned", config=self.frcnn_cfg)
    self.image_preprocess = Preprocess(self.frcnn_cfg)

    self.lxmert_tokenizer = LxmertTokenizer.from_pretrained(
        "unc-nlp/lxmert-base-uncased")
    self.lxmert_vqa = LxmertForQuestionAnsweringLRP.from_pretrained(
        "unc-nlp/lxmert-vqa-uncased").to("cuda")
    self.lxmert_vqa_no_lrp = LxmertForQuestionAnswering.from_pretrained(
        "unc-nlp/lxmert-vqa-uncased").to("cuda")

    self.lxmert_vqa.eval()
    self.lxmert_vqa_no_lrp.eval()
    self.model = self.lxmert_vqa

    self.vqa_dataset = vqa_data.VQADataset(splits="valid")
def tokenize(self, max_length=15, candi_ans_num=5):
    tokenizer = LxmertTokenizer.from_pretrained('unc-nlp/lxmert-base-uncased')
    pad_id = tokenizer('[PAD]')['input_ids'][1:-1][0]  # id of the [PAD] token

    for entry in self.entries:
        question_text = entry['question']
        question_type_text = entry['question_type']
        ans_text_list = entry['candi_ans']['top20_text']

        for ind, ans_text in enumerate(ans_text_list):
            # Build a "dense caption" by substituting the candidate answer
            # into the question: replace the question-type prefix if present
            # (dropping the trailing '?'), otherwise prepend the answer.
            lower_question_text = question_text.lower()
            if question_type_text in lower_question_text:
                dense_caption = lower_question_text.replace(
                    question_type_text, ans_text)[:-1]
            else:
                dense_caption = ans_text + " " + lower_question_text

            # Tokenize, then truncate or pad to max_length.
            qa_tokens = tokenizer(dense_caption)['input_ids']
            if len(qa_tokens) > max_length:
                qa_tokens = qa_tokens[:max_length]
            else:
                qa_tokens = qa_tokens + [pad_id] * (max_length - len(qa_tokens))
            assert len(qa_tokens) == max_length

            q_a_tokens_tensor = torch.from_numpy(np.array([qa_tokens]))
            if ind == 0:
                q_a_tokens_top_20 = q_a_tokens_tensor
            else:
                q_a_tokens_top_20 = torch.cat(
                    [q_a_tokens_top_20, q_a_tokens_tensor])

        # (len(ans_text_list), max_length) tensor of tokenized QA pairs
        entry['candi_ans']["20_qa_text"] = q_a_tokens_top_20
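# A tiny worked example of the dense-caption construction in tokenize()
# above; the strings are illustrative, not taken from the dataset.
question = "what color is the car?"
question_type = "what color"
answer = "red"
if question_type in question.lower():
    # replace the question-type prefix and drop the trailing '?'
    print(question.lower().replace(question_type, answer)[:-1])  # red is the car
else:
    print(answer + " " + question.lower())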
def __init__(self, COCO_val_path, use_lrp=False):
    self.COCO_VAL_PATH = COCO_val_path
    self.vqa_answers = utils.get_data(VQA_URL)

    # load models and model components
    self.frcnn_cfg = utils.Config.from_pretrained(
        "unc-nlp/frcnn-vg-finetuned")
    self.frcnn_cfg.MODEL.DEVICE = "cuda"
    self.frcnn = GeneralizedRCNN.from_pretrained(
        "unc-nlp/frcnn-vg-finetuned", config=self.frcnn_cfg)
    self.image_preprocess = Preprocess(self.frcnn_cfg)

    self.lxmert_tokenizer = LxmertTokenizer.from_pretrained(
        "unc-nlp/lxmert-base-uncased")
    if use_lrp:
        self.lxmert_vqa = LxmertForQuestionAnsweringLRP.from_pretrained(
            "unc-nlp/lxmert-vqa-uncased").to("cuda")
    else:
        self.lxmert_vqa = LxmertForQuestionAnswering.from_pretrained(
            "unc-nlp/lxmert-vqa-uncased").to("cuda")

    self.lxmert_vqa.eval()
    self.model = self.lxmert_vqa

    self.vqa_dataset = vqa_data.VQADataset(splits="valid")

    self.pert_steps = [0, 0.25, 0.5, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
    self.pert_acc = [0] * len(self.pert_steps)
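# A minimal sketch of how pert_steps / pert_acc are typically consumed in a
# perturbation test: at each step only the top (1 - step) fraction of inputs,
# ranked by a relevance score, is kept, the model is re-run, and accuracy is
# accumulated per step. The names `relevance`, `num_tokens`, and
# `run_model_on_subset` are hypothetical, not taken from the original class.
for step_idx, step in enumerate(self.pert_steps):
    num_keep = int((1.0 - step) * num_tokens)
    keep_idx = relevance.topk(num_keep).indices   # most relevant inputs
    acc = self.run_model_on_subset(keep_idx)      # hypothetical helper
    self.pert_acc[step_idx] += acc / len(self.vqa_dataset)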
def __init__(self, bert_model_name: str, dataset: LXMERTBiasDataset, img_data=None):
    super().__init__()
    self.tokenizer = LxmertTokenizer.from_pretrained(bert_model_name)
    self.raw_dataset = dataset

    # Load the dataset
    # if img_data is None:
    #     if feature_filepaths is not None:
    #         img_data = []
    #         [img_data.extend(load_obj_tsv(fp)) for fp in feature_filepaths]
    #     else:
    #         img_data = []
    #         for source in self.raw_dataset.sources:
    #             img_data.extend(load_obj_tsv(Split2ImgFeatPath[source], topk=None))
    self.imgid2img = {}
    for img_datum in img_data:
        self.imgid2img[img_datum['img_id']] = img_datum

    # Filter out the dataset
    used_data = []
    for datum in self.raw_dataset.data:
        if datum["image_id"] in self.imgid2img:
            used_data.append(datum)
        elif datum["image_id"] + '.jpg' in self.imgid2img:  # TODO update img ids
            datum["image_id"] = datum["image_id"] + ".jpg"
            used_data.append(datum)
        else:
            # TODO missing images
            # FUNKY raise Exception()
            reps = [('randmother', 'grandmother'), ('hysics', 'physics'),
                    ('asa', 'nasa'), ('randfather', 'grandfather'),
                    ('ovel', 'novel'), ('oetry', 'poetry')]
            for (orig, new) in reps:
                if orig in datum["image_id"]:
                    img_id = re.sub(orig, new, datum["image_id"])
                    datum["image_id"] = img_id
                    break
            if datum["image_id"] + '.jpg' in self.imgid2img:  # TODO update img ids
                datum["image_id"] = datum["image_id"] + ".jpg"
                used_data.append(datum)
            else:
                print(f'missing {datum}')

    # Flatten the dataset (into one sent + one image entries)
    self.data = []
    for datum in used_data:
        new_datum = {
            'uid': make_uid(datum['image_id'], "bias", 0),
            'img_id': datum["image_id"],
            'sent': datum["caption"]
        }
        self.data.append(new_datum)
    print("Use %d data in torch dataset" % (len(self.data)))
def __init__(self, device='cuda:0'):
    self.device = device

    # load models and model components
    frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
    self.frcnn = GeneralizedRCNN.from_pretrained(
        "unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg).to(device)
    self.image_preprocess = Preprocess(frcnn_cfg)
    self.lxmert_tokenizer = LxmertTokenizer.from_pretrained(
        "unc-nlp/lxmert-base-uncased")
    self.lxmert_gqa = LxmertForQuestionAnswering.from_pretrained(
        "unc-nlp/lxmert-gqa-uncased").to(device)
def get_QA(QAdata_path):
    """Load questions and answers.

    Arguments:
        QAdata_path -- a list of paths to question/answer JSON files.

    Returns:
        img_ids -- a list of image ids.
        ques_ids -- a list of question ids.
        ques_inputs -- tokenized questions with language attention masks.
        targets -- soft-label matrix of shape (num_questions, num_answers).
        quesid2data -- mapping from question id to its raw data dict.
    """
    data = []
    for path in QAdata_path:
        data += json.load(open(path))
    quesid2data = {d['question_id']: d for d in data}
    data = pd.DataFrame(data)
    logger.info("successfully loaded question data.")

    img_ids = data['img_id'].values
    ques_ids = data['question_id'].values
    questions = list(data['sent'].values)
    labels = data['label'].values
    assert len(img_ids) == len(ques_ids) == len(questions) == len(labels)

    # Tokenize questions
    lxmert_tokenizer = LxmertTokenizer.from_pretrained(
        "unc-nlp/lxmert-base-uncased")
    ques_inputs = lxmert_tokenizer(questions,
                                   padding="max_length",
                                   max_length=SEQ_LENGTH,
                                   truncation=True,
                                   return_attention_mask=True,
                                   add_special_tokens=True,
                                   return_tensors="tf")

    # Provide labels (targets)
    ans2label = json.load(open(ANS2LABELS_PATH))
    num_answers = len(ans2label)
    targets = np.zeros((len(labels), num_answers))
    for i, label in enumerate(labels):
        for ans, score in label.items():
            targets[i, ans2label[ans]] = score

    logger.info("total number of img_ids is %i ." % (len(img_ids)))
    logger.info("total number of ques_ids is %i ." % (len(ques_ids)))
    logger.info("total number of ques_inputs is %s ." % (str(ques_inputs.input_ids.shape)))
    logger.info("total number of labels is %s ." % (str(targets.shape)))
    return img_ids, ques_ids, ques_inputs, targets, quesid2data
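# For reference, a single entry of the JSON files that get_QA() expects looks
# roughly like this, inferred from the keys accessed above; the concrete
# values are illustrative only:
# {
#     "question_id": 458752000,
#     "img_id": "COCO_val2014_000000458752",
#     "sent": "what is this photo taken looking through?",
#     "label": {"net": 1.0, "netting": 0.3}
# }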
def __init__(self):
    self.config = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
    self.cnn = GeneralizedRCNN.from_pretrained(
        "unc-nlp/frcnn-vg-finetuned", config=self.config
    )
    self.image_preprocess = Preprocess(self.config)
    self.tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
    self.vqa = LxmertForQuestionAnswering.from_pretrained(
        "unc-nlp/lxmert-vqa-uncased"
    )
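# A minimal inference sketch for the wrapper above, following the pattern of
# the HuggingFace LXMERT demo; `wrapper`, `img_path`, and `question` are
# placeholders, and decoding the answer index assumes a VQA label list.
images, sizes, scales_yx = wrapper.image_preprocess(img_path)
output_dict = wrapper.cnn(images, sizes, scales_yx=scales_yx,
                          padding="max_detections",
                          max_detections=wrapper.config.max_detections,
                          return_tensors="pt")
inputs = wrapper.tokenizer(question, return_tensors="pt")
out = wrapper.vqa(input_ids=inputs.input_ids,
                  attention_mask=inputs.attention_mask,
                  visual_feats=output_dict.get("roi_features"),
                  visual_pos=output_dict.get("normalized_boxes"),
                  token_type_ids=inputs.token_type_ids)
answer_idx = out.question_answering_score.argmax(-1).item()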
def __init__(self, device, NUM_FRAMES_PER_STEP=5, MAX_DETECTIONS=20):
    super(Model, self).__init__()
    self.device = device
    self.NUM_FRAMES_PER_STEP = NUM_FRAMES_PER_STEP
    self.MAX_DETECTIONS = MAX_DETECTIONS
    self.CANDIDATES = self.NUM_FRAMES_PER_STEP * self.MAX_DETECTIONS

    self.lxmert_tokenizer = LxmertTokenizer.from_pretrained(
        "unc-nlp/lxmert-base-uncased")
    self.lxmert = LxmertModel.from_pretrained(
        "unc-nlp/lxmert-base-uncased")
    self.lxmert = nn.DataParallel(self.lxmert)
    self.lxmert.to(device)
def __init__(self, config, args, num_clusters=10000):
    super().__init__(config)
    self.config = config
    self.args = args
    self.config.num_clusters = num_clusters
    self.config.clustering = num_clusters > 0

    self.bert = LxmertModel(config)
    self.obj_predict_head = LxmertVisualObjHead(config)
    self.mask_feat = nn.Parameter(torch.zeros(config.visual_feat_dim))
    self.vis_emb = None

    self.tokenizer = LxmertTokenizer.from_pretrained(
        'unc-nlp/lxmert-base-uncased')
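# A minimal sketch of how a learned `mask_feat` parameter like the one above
# is usually applied (the forward pass is not shown in the snippet): masked
# region features are swapped for the shared mask embedding before entering
# the encoder. `feats` (B, n_boxes, feat_dim) and the boolean `vis_mask`
# (B, n_boxes) are hypothetical names.
feats = torch.where(vis_mask.unsqueeze(-1),        # (B, n_boxes, 1) bool
                    self.mask_feat.view(1, 1, -1), # broadcast mask vector
                    feats)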
def __init__(self, NUM_FRAMES_PER_STEP=5, MAX_DETECTIONS=20,
             max_epochs=100, lr=1e-4, batch_size=4):
    super().__init__()
    self.NUM_FRAMES_PER_STEP = NUM_FRAMES_PER_STEP
    self.MAX_DETECTIONS = MAX_DETECTIONS
    self.CANDIDATES = self.NUM_FRAMES_PER_STEP * self.MAX_DETECTIONS

    self.lxmert_tokenizer = LxmertTokenizer.from_pretrained(
        "unc-nlp/lxmert-base-uncased")
    self.lxmert = LxmertModel.from_pretrained(
        "unc-nlp/lxmert-base-uncased")
    self.save_hyperparameters()
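# A minimal companion sketch, assuming the usual LightningModule convention:
# because __init__ calls save_hyperparameters(), lr (as well as max_epochs
# and batch_size) is recoverable from self.hparams. The optimizer choice
# here is an assumption, not taken from the original module.
def configure_optimizers(self):
    return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)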
def __init__(self, dummy_config):
    super(LXMERT, self).__init__(dummy_config)
    frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
    # self.frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg)
    self.backbone, self.roi_heads = build_image_encoder()
    self.lxmert_vqa = LxmertForPreTraining.from_pretrained("unc-nlp/lxmert-base-uncased")
    # self.lxmert_vqa = LxmertForQuestionAnswering.from_pretrained("unc-nlp/lxmert-vqa-uncased")
    self.tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
    self.image_preprocess = Preprocess(frcnn_cfg)

    hid_dim = self.lxmert_vqa.config.hidden_size
    # transform = BertPredictionHeadTransform(self.config.NETWORK.VLBERT)
    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim),
        GELU(),
        BertLayerNorm(hid_dim),
        nn.Dropout(self.config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
        nn.Linear(hid_dim, self.config.NETWORK.CLASSIFIER_CLASS),
    )
import random
from multiprocessing import Pool
import h5py
import pickle
import math
from tqdm import tqdm
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler

from transformers import LxmertTokenizer

from pretrain.qa_answer_table import AnswerTable

tokenizer = LxmertTokenizer.from_pretrained(
    "bert-base-uncased", do_lower_case=True
)


def text_process(sent, max_text_length=20, PAD_ID=0):
    tokens = tokenizer.tokenize(sent.strip())

    # Account for [CLS] and [SEP] with "- 2"
    if len(tokens) > max_text_length - 2:
        tokens = tokens[:(max_text_length - 2)]

    tokens = ['[CLS]'] + tokens + ['[SEP]']
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    n_tokens = len(input_ids)

    # # Pad up to the sequence length.
    # while len(input_ids) < max_text_length:
    #     input_ids.append(PAD_ID)
def __init__(self, args, dataset: NLVR2Dataset, split, verbose, topk=-1):
    super().__init__()
    self.raw_dataset = dataset
    self.args = args
    self.verbose = verbose
    self.datasets_dir = Path(self.args.datasets_dir)

    # Assign Clusters Ids
    data = self.raw_dataset.data
    if topk > 0:
        data = data[:topk]
        if self.verbose:
            print(f"Use only {topk} data")

    if args.grid_model:
        self.data_source_to_h5_path = {
            'train': self.datasets_dir.joinpath(
                f'nlvr2/features/{args.encoder}_train_v4_grid{args.grid_size}.h5'),
            'valid': self.datasets_dir.joinpath(
                f'nlvr2/features/{args.encoder}_valid_v4_grid{args.grid_size}.h5'),
            'test': self.datasets_dir.joinpath(
                f'nlvr2/features/{args.encoder}_test_v4_grid{args.grid_size}.h5'),
        }
    else:
        self.data_source_to_h5_path = {
            'train': self.datasets_dir.joinpath(
                'nlvr2/features/maskrcnn_train_boxes36.h5'),
            'valid': self.datasets_dir.joinpath(
                'nlvr2/features/maskrcnn_valid_boxes36.h5'),
            'test': self.datasets_dir.joinpath(
                'nlvr2/features/maskrcnn_test_boxes36.h5'),
        }

    for source, path in self.data_source_to_h5_path.items():
        assert path.is_file(), (source, path)

    self.h5_path = self.data_source_to_h5_path[split]
    self.h5_f = None

    self.data = data

    if verbose:
        print("Use %d data in torch dataset" % (len(self.data)))
        print()

    self.grid_size = args.grid_size
    self.n_grids = self.grid_size**2
    if self.args.grid_model:
        self.boxes = box_position(args.grid_size)
    else:
        self.n_boxes = self.args.n_boxes
        self.boxes = None

    self.tokenizer = LxmertTokenizer.from_pretrained("bert-base-uncased",
                                                     do_lower_case=True)
    self.max_text_length = args.max_text_length
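# A minimal sketch of the lazy-open pattern implied by `self.h5_f = None`
# above: the HDF5 handle is created on first access (e.g. in __getitem__),
# so each DataLoader worker opens its own file rather than inheriting a
# handle across fork. The dataset key layout is an assumption.
def _load_features(self, img_id):
    if self.h5_f is None:
        self.h5_f = h5py.File(self.h5_path, 'r')
    return self.h5_f[f'{img_id}/features'][()]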
# For logging predictions, labels in Trainer compute_loss() override.
from transformers.integrations import is_wandb_available
from transformers import (LxmertTokenizer, LxmertForQuestionAnswering,
                          TrainingArguments, Trainer)

# Trainer compute_metrics() override for evaluation, prediction.
from sklearn.metrics import (accuracy_score,
                             precision_recall_fscore_support as score)

# Required for Trainer prediction_step() override.
from transformers.trainer_pt_utils import (nested_concat, nested_detach)

# Instantiate tokenizer.
pretrained = f"{MODEL_PTH}{args.load}"
tokenizer = LxmertTokenizer.from_pretrained(f"{MODEL_PTH}lxmert-base-uncased")

# Instantiate VQA object: creates datasets, provides HAT methods.
vqa = VQA(tokenizer)


@dataclass
class VQATrainingArguments(TrainingArguments):
    """Required subclass to add joint loss coefficient for hyperparameter search."""
    x_lmbda: Optional[float] = field(
        default=args.x_lmbda,
        metadata={"help": "VQA-HLAT loss trade-off."})


class VQATrainer(Trainer):

    def prediction_step(self,
# load object, attribute, and answer labels
with open(OBJ_PTH) as objf:
    objids = objf.read().splitlines()
with open(ATTR_PTH) as attf:
    attrids = attf.read().splitlines()
vqa_answers = utils.get_data(VQA_PTH)
vqa_labels = utils.get_data(LABEL_PTH)

# load models and model components
frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned",
                                        config=frcnn_cfg)
image_preprocess = Preprocess(frcnn_cfg)
tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")


def get_h_att():
    """Load VQA 2.0 HLAT files: {"pre_attmap": attention maps for question
    ids (ordered)}. See https://github.com/qiaott/HAN.

    Attention maps per question-image pair:
        train_val: 658,111
        test_dev:  107,394
        test:      447,793
    """
    trainval = h5py.File(f'{HAT_DATA}trainval2014_attention_maps.h5', 'r')
    test_dev = h5py.File(f'{HAT_DATA}test-dev2015_attention_maps.h5', 'r')
    test = h5py.File(f'{HAT_DATA}test2015_attention_maps.h5', 'r')
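# A minimal read sketch for the HDF5 files opened in get_h_att(), based only
# on the docstring's "pre_attmap" key; index handling is an assumption.
att_maps = trainval['pre_attmap']   # ordered by question id
first_map = att_maps[0]             # h5py yields a numpy array per index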
def __init__(self, args, train_loader=None, val_loader=None, logger=None, train=True):
    super().__init__()

    self.args = args
    self.max_text_length = args.max_text_length

    self.train_loader = train_loader
    self.val_loader = val_loader
    self.logger = logger

    # Build model
    self.model = XLxmertForPretraining.from_pretrained(
        "bert-base-uncased", num_clusters=args.num_clusters)
    self.tokenizer = LxmertTokenizer.from_pretrained(
        'unc-nlp/lxmert-base-uncased')

    self.verbose = True
    if self.args.distributed:
        if self.args.gpu != 0:
            self.verbose = False

    if args.clustering:
        self.datasets_dir = Path(self.args.datasets_dir)
        clustering_dir = self.datasets_dir.joinpath('clustering')
        centroid_path = clustering_dir.joinpath(
            f'{args.encoder}_{args.cluster_src}_centroids{args.n_centroids}_iter{args.n_iter}_d{args.feat_dim}_grid{args.grid_size}.npy')
        centroids = np.load(centroid_path)
        self.model.set_visual_embedding(centroids)

    # Load pre-trained weights
    self.start_epoch = None
    if args.load is not None:
        path = args.load + '_LXRT.pth'
        self.load(path, verbose=self.verbose)

    # GPU Options
    print(f'Model Launching at GPU {self.args.gpu}')
    from time import time
    start = time()
    self.model = self.model.to(args.gpu)

    # Optimizer
    if train:
        self.optim, self.lr_scheduler = self.create_optimizer_and_scheduler()
        if self.args.fp16 and _use_native_amp:
            self.scaler = torch.cuda.amp.GradScaler()
        elif _use_apex:
            self.model, self.optim = amp.initialize(self.model,
                                                    self.optim,
                                                    opt_level='O1',
                                                    verbosity=self.verbose)

    if args.multiGPU:
        assert args.distributed
        self.model = DDP(self.model, device_ids=[args.gpu],
                         find_unused_parameters=True)

    if args.gpu == 0:
        print(f'It took {time() - start:.1f}s')
def __init__(self, split='mscoco_minival', topk=-1, data_out=['img'],
             verbose=True, args=None):
    self.data_out = data_out
    self.topk = topk
    self.verbose = verbose
    self.args = args
    self.datasets_dir = Path(self.args.datasets_dir)

    # Loading datasets to data
    self.sources = split.split(',')
    if self.verbose:
        print('Data sources: ', self.sources)

    self.answer_table = AnswerTable()
    # if self.verbose:
    #     print("Load an answer table of size %d." % (len(self.answer_table.ans2id_map())))

    self.img_ids_to_source = {}

    data = []
    for img_source in self.sources:
        with open(self.datasets_dir.joinpath(f'data/lxmert/{img_source}.json')) as f:
            _data = json.load(f)
            if self.verbose:
                print(f"Loaded {len(_data)} data from", img_source)
        # source_img_ids.append([d['img_id'] for d in _data])
        for datum in _data:
            self.img_ids_to_source[datum['img_id']] = img_source
            datum['img_source'] = img_source
            datum['caption_only'] = args.caption_only
            datum['clustering'] = args.clustering
            datum['max_text_length'] = args.max_text_length
            datum['qa'] = args.task_qa
        data.extend(_data)

    # Modify the answers
    if args.task_qa:
        for datum in data:
            labelf = datum['labelf']
            for _qa_source, labels in labelf.items():
                for label in labels:
                    for ans in list(label.keys()):
                        new_ans = self.answer_table.convert_ans(ans)
                        if self.answer_table.used(new_ans):
                            if ans != new_ans:
                                label[new_ans] = label.pop(ans)
                        else:
                            label.pop(ans)

    if self.topk > 0:
        data = data[:self.topk]
        if self.verbose:
            print(f"Use only {self.topk} data")

    if args.task_qa:
        self.evaluator = QAEvaluator(data)

    if args.clustering:
        clustering_dir = self.datasets_dir.joinpath('clustering')
        with open(clustering_dir.joinpath(f'{args.encoder}_{args.cluster_src}_mscoco_train_img_id_to_cluster_id_{args.n_centroids}_iter{args.n_iter}_d{args.feat_dim}_grid{args.grid_size}.pkl'), 'rb') as f:
            mscoco_train_img_id_to_cluster_id = pickle.load(f)
        with open(clustering_dir.joinpath(f'{args.encoder}_{args.cluster_src}_mscoco_valid_img_id_to_cluster_id_{args.n_centroids}_iter{args.n_iter}_d{args.feat_dim}_grid{args.grid_size}.pkl'), 'rb') as f:
            mscoco_valid_img_id_to_cluster_id = pickle.load(f)
        with open(clustering_dir.joinpath(f'{args.encoder}_{args.cluster_src}_vg_img_id_to_cluster_id_{args.n_centroids}_iter{args.n_iter}_d{args.feat_dim}_grid{args.grid_size}.pkl'), 'rb') as f:
            vg_img_id_to_cluster_id = pickle.load(f)

        self.data_source_to_cluster_data = {
            'mscoco_train': mscoco_train_img_id_to_cluster_id,
            'mscoco_minival': mscoco_valid_img_id_to_cluster_id,
            'mscoco_nominival': mscoco_valid_img_id_to_cluster_id,
            'vgnococo': vg_img_id_to_cluster_id
        }

    with Pool(8) as pool:
        if self.verbose:
            data = [datum for _data in tqdm(pool.imap(get_datum, data),
                                            total=len(data), ncols=100)
                    for datum in _data]
        else:
            data = [datum for _data in pool.imap(get_datum, data)
                    for datum in _data]

    if self.args.target_exact_feat or self.args.feed_exact_feat or self.args.target_obj_id:
        if args.grid_model:
            self.data_source_to_h5_path = {
                'mscoco_train': self.datasets_dir.joinpath(
                    f'COCO/features/{args.encoder}_train_grid{args.grid_size}.h5'),
                'mscoco_minival': self.datasets_dir.joinpath(
                    f'COCO/features/{args.encoder}_valid_grid{args.grid_size}.h5'),
                'mscoco_nominival': self.datasets_dir.joinpath(
                    f'COCO/features/{args.encoder}_valid_grid{args.grid_size}.h5'),
                'vgnococo': self.datasets_dir.joinpath(
                    f'VG/features/{args.encoder}_grid{args.grid_size}.h5'),
            }
        else:
            self.data_source_to_h5_path = {
                'mscoco_train': self.datasets_dir.joinpath(
                    'COCO/features/maskrcnn_train_boxes36.h5'),
                'mscoco_minival': self.datasets_dir.joinpath(
                    'COCO/features/maskrcnn_valid_boxes36.h5'),
                'mscoco_nominival': self.datasets_dir.joinpath(
                    'COCO/features/maskrcnn_valid_boxes36.h5'),
                'vgnococo': self.datasets_dir.joinpath(
                    'VG/features/maskrcnn_boxes36.h5'),
            }

        for source, path in self.data_source_to_h5_path.items():
            assert path.is_file(), (source, path)

    self.source_to_h5 = None
    self.data = data

    if args.vis_mask_COCO_only:
        COCO_data = []
        for datum in self.data:
            if datum['text_source'] == 'mscoco' and 'mscoco' in datum['img_source']:
                COCO_data.append(datum)
        self.COCO_data = COCO_data
        if self.verbose:
            print('# COCO captions:', len(self.COCO_data))

    if self.verbose:
        if 'sent' not in self.data_out:
            print("# all images:", len(self.data))
        else:
            print("# all sentences:", len(self.data))

    self.grid_size = args.grid_size
    self.n_grids = args.n_grids
    if self.args.grid_model:
        self.boxes = box_position(args.grid_size)
    else:
        self.n_boxes = args.n_boxes
        self.boxes = None

    self.tokenizer = LxmertTokenizer.from_pretrained(
        "bert-base-uncased", do_lower_case=True
    )
    self.max_text_length = args.max_text_length

    ###### Pretraining Objectives ######
    tasks = []
    if self.args.task_mask_lm:
        tasks.append('word_mask')
    if self.args.task_obj_predict:
        tasks.append('vis_mask')
    if self.args.task_matched:
        tasks.append('matched')
    if self.args.task_qa:
        tasks.append('qa')
    self.tasks = tasks

    if self.verbose:
        print('data_out:', self.data_out)
def __init__(self, args, dataset: VQADataset, split, verbose, topk=-1):
    super().__init__()
    self.raw_dataset = dataset
    self.args = args
    self.verbose = verbose
    self.datasets_dir = Path(self.args.datasets_dir)

    # Loading datasets to data
    self.sources = split.split(',')
    if self.verbose:
        print('Data sources: ', self.sources)

    self.img_ids_to_source = {}
    data_info_dicts = []
    for source in self.sources:
        data_info_path = self.datasets_dir.joinpath(f'data/vqa/{source}.json')
        with open(data_info_path) as f:
            _data_info_dicts = json.load(f)
            # source_img_ids.append([d['img_id'] for d in _data_info_dicts])
            for _d in _data_info_dicts:
                self.img_ids_to_source[_d['img_id']] = source
                _d['source'] = source
            data_info_dicts.extend(_data_info_dicts)
        if self.verbose:
            print(f"Loaded {len(_data_info_dicts)} data from", source)

    # data_info_dicts = self.raw_dataset.data
    if topk > 0:
        data_info_dicts = data_info_dicts[:topk]
        if self.verbose:
            print(f"Use only {topk} data")

    if args.grid_model:
        self.data_source_to_h5_path = {
            'train': self.datasets_dir.joinpath(
                f'COCO/features/{args.encoder}_train_grid{args.grid_size}.h5'),
            'minival': self.datasets_dir.joinpath(
                f'COCO/features/{args.encoder}_valid_grid{args.grid_size}.h5'),
            'nominival': self.datasets_dir.joinpath(
                f'COCO/features/{args.encoder}_valid_grid{args.grid_size}.h5'),
            'test': self.datasets_dir.joinpath(
                f'COCO/features/{args.encoder}_test_grid{args.grid_size}.h5'),
        }
    else:
        self.data_source_to_h5_path = {
            'train': self.datasets_dir.joinpath(
                'COCO/features/maskrcnn_train_boxes36.h5'),
            'minival': self.datasets_dir.joinpath(
                'COCO/features/maskrcnn_valid_boxes36.h5'),
            'nominival': self.datasets_dir.joinpath(
                'COCO/features/maskrcnn_valid_boxes36.h5'),
            'test': self.datasets_dir.joinpath(
                'COCO/features/maskrcnn_test_boxes36.h5'),
        }

    for source, path in self.data_source_to_h5_path.items():
        assert path.is_file(), (source, path)

    self.source_to_h5 = None
    self.data = data_info_dicts

    if self.verbose:
        print("# all sentences:", len(self.data))

    self.grid_size = args.grid_size
    self.n_grids = args.n_grids
    if self.args.grid_model:
        self.boxes = box_position(args.grid_size)
    else:
        self.n_boxes = args.n_boxes
        self.boxes = None

    self.tokenizer = LxmertTokenizer.from_pretrained('unc-nlp/lxmert-base-uncased')
    self.max_text_length = args.max_text_length