Example #1
    def __init__(self, device, NUM_FRAMES_PER_STEP=5, DETECTIONS_PER_FRAME=20):
        super(ModelFC, self).__init__()

        self.device = device

        self.NUM_FRAMES_PER_STEP = NUM_FRAMES_PER_STEP
        self.DETECTIONS_PER_FRAME = DETECTIONS_PER_FRAME
        self.DETECTIONS_PER_STEP = self.NUM_FRAMES_PER_STEP * self.DETECTIONS_PER_FRAME

        # Register the class-level NULL / ENTITY / ACTION marker strings as
        # additional special tokens so each maps to a single vocabulary id.
        self.lxmert_tokenizer = LxmertTokenizer.from_pretrained(
            "unc-nlp/lxmert-base-uncased")
        self.lxmert_tokenizer.add_special_tokens({
            "additional_special_tokens": [self.NULL, self.ENTITY, self.ACTION]
        })
        self.lxmert_tokenizer.encode([self.NULL, self.ENTITY, self.ACTION])

        self.NULL_TOKEN = self.lxmert_tokenizer.convert_tokens_to_ids(
            self.NULL)
        self.ENTITY_TOKEN = self.lxmert_tokenizer.convert_tokens_to_ids(
            self.ENTITY)
        self.ACTION_TOKEN = self.lxmert_tokenizer.convert_tokens_to_ids(
            self.ACTION)

        self.lxmert = LxmertModel.from_pretrained(
            "unc-nlp/lxmert-base-uncased")
        self.lxmert.to(device)

        self.VG = LxmertVGHead(self.lxmert.config, self.DETECTIONS_PER_STEP)
        self.VG.to(device)
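Example #1 grows the tokenizer vocabulary with add_special_tokens but never resizes the model's word-embedding matrix. Below is a minimal sketch of the usual follow-up step, assuming the same unc-nlp/lxmert-base-uncased checkpoint and using placeholder marker strings in place of the class constants.

from transformers import LxmertModel, LxmertTokenizer

tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
# "[NULL]", "[ENT]" and "[ACT]" stand in for self.NULL / self.ENTITY / self.ACTION.
tokenizer.add_special_tokens(
    {"additional_special_tokens": ["[NULL]", "[ENT]", "[ACT]"]})

model = LxmertModel.from_pretrained("unc-nlp/lxmert-base-uncased")
# Give the newly added ids rows in the embedding matrix; without this they
# would index past the original 30,522-entry vocabulary.
model.resize_token_embeddings(len(tokenizer))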
Example #2
    def __init__(self, COCO_VAL_PATH):
        self.COCO_VAL_PATH = COCO_VAL_PATH
        self.vqa_answers = utils.get_data(VQA_URL)

        # load models and model components
        self.frcnn_cfg = utils.Config.from_pretrained(
            "unc-nlp/frcnn-vg-finetuned")
        self.frcnn_cfg.MODEL.DEVICE = "cuda"

        self.frcnn = GeneralizedRCNN.from_pretrained(
            "unc-nlp/frcnn-vg-finetuned", config=self.frcnn_cfg)

        self.image_preprocess = Preprocess(self.frcnn_cfg)

        self.lxmert_tokenizer = LxmertTokenizer.from_pretrained(
            "unc-nlp/lxmert-base-uncased")

        self.lxmert_vqa = LxmertForQuestionAnsweringLRP.from_pretrained(
            "unc-nlp/lxmert-vqa-uncased").to("cuda")
        self.lxmert_vqa_no_lrp = LxmertForQuestionAnswering.from_pretrained(
            "unc-nlp/lxmert-vqa-uncased").to("cuda")

        self.lxmert_vqa.eval()
        self.lxmert_vqa_no_lrp.eval()
        self.model = self.lxmert_vqa

        self.vqa_dataset = vqa_data.VQADataset(splits="valid")
Example #3
    def tokenize(self, max_length=15, candi_ans_num=5):
        tokenizer = LxmertTokenizer.from_pretrained('unc-nlp/lxmert-base-uncased')
        for entry in self.entries:
            question_text = entry['question']
            question_type_text = entry['question_type']
            ans_text_list = entry['candi_ans']['top20_text']
            for ind, i in enumerate(ans_text_list):
                lower_question_text = question_text.lower()
                # Turn the question plus a candidate answer into a single
                # declarative caption.
                if question_type_text in lower_question_text:
                    dense_caption = lower_question_text.replace(question_type_text, i)[:-1]
                else:
                    dense_caption = i + " " + lower_question_text
                qa_tokens = tokenizer(dense_caption)['input_ids']
                # Truncate or pad the token ids to a fixed length.
                if len(qa_tokens) > max_length:
                    qa_tokens = qa_tokens[:max_length]
                else:
                    qa_tokens = qa_tokens + [tokenizer.pad_token_id] * (max_length - len(qa_tokens))
                assert len(qa_tokens) == max_length
                q_a_tokens_tensor = torch.from_numpy(np.array([qa_tokens]))
                if ind == 0:
                    q_a_tokens_top_20 = q_a_tokens_tensor
                else:
                    q_a_tokens_top_20 = torch.cat([q_a_tokens_top_20, q_a_tokens_tensor])
            entry['candi_ans']["20_qa_text"] = q_a_tokens_top_20
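For intuition, a toy run of the caption construction above with made-up values: when the question-type phrase occurs in the lowered question, the candidate answer is substituted in its place and the trailing question mark is dropped.

question_text = "What color is the cat?"
question_type_text = "what color is"
candidate = "black"

dense_caption = question_text.lower().replace(question_type_text, candidate)[:-1]
# "what color is the cat?" -> "black the cat?" -> "black the cat"
assert dense_caption == "black the cat"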
Example #4
    def __init__(self, COCO_val_path, use_lrp=False):
        self.COCO_VAL_PATH = COCO_val_path
        self.vqa_answers = utils.get_data(VQA_URL)

        # load models and model components
        self.frcnn_cfg = utils.Config.from_pretrained(
            "unc-nlp/frcnn-vg-finetuned")
        self.frcnn_cfg.MODEL.DEVICE = "cuda"

        self.frcnn = GeneralizedRCNN.from_pretrained(
            "unc-nlp/frcnn-vg-finetuned", config=self.frcnn_cfg)

        self.image_preprocess = Preprocess(self.frcnn_cfg)

        self.lxmert_tokenizer = LxmertTokenizer.from_pretrained(
            "unc-nlp/lxmert-base-uncased")

        if use_lrp:
            self.lxmert_vqa = LxmertForQuestionAnsweringLRP.from_pretrained(
                "unc-nlp/lxmert-vqa-uncased").to("cuda")
        else:
            self.lxmert_vqa = LxmertForQuestionAnswering.from_pretrained(
                "unc-nlp/lxmert-vqa-uncased").to("cuda")

        self.lxmert_vqa.eval()
        self.model = self.lxmert_vqa

        self.vqa_dataset = vqa_data.VQADataset(splits="valid")

        self.pert_steps = [0, 0.25, 0.5, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
        self.pert_acc = [0] * len(self.pert_steps)
Example #5
    def __init__(self,
                 bert_model_name: str,
                 dataset: LXMERTBiasDataset,
                 img_data=None):
        super().__init__()
        self.tokenizer = LxmertTokenizer.from_pretrained(bert_model_name)
        self.raw_dataset = dataset
        # Load the dataset
        #if img_data is None:
        #    if feature_filepaths is not None:
        #        img_data = []
        #        [img_data.extend(load_obj_tsv(fp)) for fp in feature_filepaths]
        #    else:
        #        img_data = []
        #        for source in self.raw_dataset.sources:
        #            img_data.extend(load_obj_tsv(Split2ImgFeatPath[source], topk=None))

        self.imgid2img = {}
        for img_datum in img_data:
            self.imgid2img[img_datum['img_id']] = img_datum

        # Filter out the dataset
        used_data = []
        for datum in self.raw_dataset.data:
            if datum["image_id"] in self.imgid2img:
                used_data.append(datum)
            elif datum["image_id"] + '.jpg' in self.imgid2img:  # TODO update img ids
                datum["image_id"] = datum["image_id"] + ".jpg"
                used_data.append(datum)
            else:  # TODO missing images
                # FUNKY: this bare raise short-circuits the typo-repair
                # fallback below, so that recovery code never runs.
                raise Exception()
                reps = [('randmother', 'grandmother'), ('hysics', 'physics'),
                        ('asa', 'nasa'), ('randfather', 'grandfather'),
                        ('ovel', 'novel'), ('oetry', 'poetry')]
                for (orig, new) in reps:
                    if orig in datum["image_id"]:
                        img_id = re.sub(orig, new, datum["image_id"])
                        datum["image_id"] = img_id
                        break

                if datum["image_id"] + '.jpg' in self.imgid2img:  # TODO update img ids
                    datum["image_id"] = datum["image_id"] + ".jpg"
                    used_data.append(datum)
                else:
                    print(f'missing {datum}')

        # Flatten the dataset (into one sent + one image entries)
        self.data = []
        for datum in used_data:
            new_datum = {
                'uid': make_uid(datum['image_id'], "bias", 0),
                'img_id': datum["image_id"],
                'sent': datum["caption"]
            }
            self.data.append(new_datum)

        print("Use %d data in torch dataset" % (len(self.data)))
Example #6
    def __init__(self, device='cuda:0'):
        self.device = device
        # load models and model components
        frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
        self.frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg).to(device)

        self.image_preprocess = Preprocess(frcnn_cfg)
        self.lxmert_tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
        self.lxmert_gqa = LxmertForQuestionAnswering.from_pretrained("unc-nlp/lxmert-gqa-uncased").to(device)
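A rough end-to-end inference sketch using the components wired up above, following the Hugging Face LXMERT demo these helpers come from; m stands for an instance of the class, and img_path / question are placeholders.

images, sizes, scales_yx = m.image_preprocess(img_path)
visual_out = m.frcnn(
    images,
    sizes,
    scales_yx=scales_yx,
    padding="max_detections",
    max_detections=36,  # frcnn_cfg.max_detections for this checkpoint
    return_tensors="pt",
)
features = visual_out.get("roi_features").to(m.device)
boxes = visual_out.get("normalized_boxes").to(m.device)

inputs = m.lxmert_tokenizer(
    question,
    padding="max_length",
    max_length=20,
    truncation=True,
    return_token_type_ids=True,
    return_tensors="pt",
)
out = m.lxmert_gqa(
    input_ids=inputs.input_ids.to(m.device),
    attention_mask=inputs.attention_mask.to(m.device),
    visual_feats=features,
    visual_pos=boxes,
    token_type_ids=inputs.token_type_ids.to(m.device),
    return_dict=True,
)
# Index into a GQA answer vocabulary (not shown here) to recover the answer string.
pred = out.question_answering_score.argmax(-1)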
Example #7
def get_QA(QAdata_path):
    """
    load question and answers.

    Arguments:
    QAdata_path -- a string that shows questions and answers file path.

    Return:
    img_ids -- a list of image ids.
    ques_inputs -- a list of question ids.
    inputs -- tokenized questions and language attention masks.
    targets -- labels

    """
    data = []
    for path in QAdata_path:
        data += json.load(open(path))

    quesid2data = {d['question_id']: d for d in data}

    data = pd.DataFrame(data)
    logger.info("successfully load questions data.")

    img_ids = data['img_id'].values
    ques_ids = data['question_id'].values
    questions = list(data['sent'].values)
    labels = data['label'].values
    assert len(img_ids) == len(ques_ids) == len(questions) == len(labels)

    # Tokenize question
    lxmert_tokenizer = LxmertTokenizer.from_pretrained(
        "unc-nlp/lxmert-base-uncased")

    ques_inputs = lxmert_tokenizer(questions,
                                   padding="max_length",
                                   max_length=SEQ_LENGTH,
                                   truncation=True,
                                   return_attention_mask=True,
                                   add_special_tokens=True,
                                   return_tensors="tf")
    # Provide label (target)
    ans2label = json.load(open(ANS2LABELS_PATH))
    num_answers = len(ans2label)

    targets = np.zeros((len(labels), num_answers))
    for i, label in enumerate(labels):
        for ans, score in label.items():
            targets[i, ans2label[ans]] = score

    logger.info("total number of img_ids is %i ." % (len(img_ids)))
    logger.info("total number of ques_ids is %i ." % (len(ques_ids)))
    logger.info("total number of ques_inputs is %s ." %
                (str(ques_inputs.input_ids.shape)))
    logger.info("total number of labels is %s ." % (str(targets.shape)))

    return img_ids, ques_ids, ques_inputs, targets, quesid2data
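Because get_QA returns TensorFlow tensors (return_tensors="tf"), they plug directly into the TF LXMERT classes. A small sketch with dummy visual inputs, assuming the module-level constants used above (SEQ_LENGTH, ANS2LABELS_PATH, the answer data) are defined, and using a placeholder JSON path.

import tensorflow as tf
from transformers import TFLxmertModel

img_ids, ques_ids, ques_inputs, targets, quesid2data = get_QA(["path/to/questions.json"])

lxmert = TFLxmertModel.from_pretrained("unc-nlp/lxmert-base-uncased")
# Dummy visual inputs for two questions: 36 regions with 2048-d features and
# 4-d normalized boxes; real features would come from an object detector.
outputs = lxmert(input_ids=ques_inputs.input_ids[:2],
                 attention_mask=ques_inputs.attention_mask[:2],
                 visual_feats=tf.random.uniform((2, 36, 2048)),
                 visual_pos=tf.random.uniform((2, 36, 4)))
print(outputs.pooled_output.shape)  # (2, 768)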
Example #8
    def __init__(self):
        self.config = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")

        self.cnn = GeneralizedRCNN.from_pretrained(
            "unc-nlp/frcnn-vg-finetuned", config=self.config
        )

        self.image_preprocess = Preprocess(self.config)

        self.tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
        self.vqa = LxmertForQuestionAnswering.from_pretrained(
            "unc-nlp/lxmert-vqa-uncased"
        )
Example #9
    def __init__(self, device, NUM_FRAMES_PER_STEP=5, MAX_DETECTIONS=20):
        super(Model, self).__init__()

        self.device = device

        self.NUM_FRAMES_PER_STEP = NUM_FRAMES_PER_STEP
        self.MAX_DETECTIONS = MAX_DETECTIONS
        self.CANDIDATES = self.NUM_FRAMES_PER_STEP * self.MAX_DETECTIONS

        self.lxmert_tokenizer = LxmertTokenizer.from_pretrained(
            "unc-nlp/lxmert-base-uncased")

        self.lxmert = LxmertModel.from_pretrained(
            "unc-nlp/lxmert-base-uncased")
        self.lxmert = nn.DataParallel(self.lxmert)
        self.lxmert.to(device)
Example #10
    def __init__(self, config, args, num_clusters=10000):
        super().__init__(config)

        self.config = config
        self.args = args
        self.config.num_clusters = num_clusters
        self.config.clustering = num_clusters > 0

        self.bert = LxmertModel(config)

        self.obj_predict_head = LxmertVisualObjHead(config)

        self.mask_feat = nn.Parameter(torch.zeros(config.visual_feat_dim))
        self.vis_emb = None

        self.tokenizer = LxmertTokenizer.from_pretrained(
            'unc-nlp/lxmert-base-uncased')
Example #11
    def __init__(self,
                 NUM_FRAMES_PER_STEP=5,
                 MAX_DETECTIONS=20,
                 max_epochs=100,
                 lr=1e-4,
                 batch_size=4):
        super().__init__()

        self.NUM_FRAMES_PER_STEP = NUM_FRAMES_PER_STEP
        self.MAX_DETECTIONS = MAX_DETECTIONS
        self.CANDIDATES = self.NUM_FRAMES_PER_STEP * self.MAX_DETECTIONS

        self.lxmert_tokenizer = LxmertTokenizer.from_pretrained(
            "unc-nlp/lxmert-base-uncased")
        self.lxmert = LxmertModel.from_pretrained(
            "unc-nlp/lxmert-base-uncased")

        self.save_hyperparameters()
Example #12
    def __init__(self, dummy_config):
        super(LXMERT, self).__init__(dummy_config)
        
        frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
        # self.frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg)
        self.backbone, self.roi_heads = build_image_encoder()
        self.lxmert_vqa = LxmertForPreTraining.from_pretrained("unc-nlp/lxmert-base-uncased")
        # self.lxmert_vqa = LxmertForQuestionAnswering.from_pretrained("unc-nlp/lxmert-vqa-uncased")
        self.tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
        self.image_preprocess = Preprocess(frcnn_cfg)
        
        hid_dim = self.lxmert_vqa.config.hidden_size
        # transform = BertPredictionHeadTransform(self.config.NETWORK.VLBERT)

        self.logit_fc = nn.Sequential(
            nn.Linear(hid_dim, hid_dim),
            GELU(),
            BertLayerNorm(hid_dim),
            nn.Dropout(self.config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
            nn.Linear(hid_dim, self.config.NETWORK.CLASSIFIER_CLASS),
        )
Example #13
import random
from multiprocessing import Pool
import h5py
import pickle
import math
from tqdm import tqdm
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler

from transformers import LxmertTokenizer
from pretrain.qa_answer_table import AnswerTable

tokenizer = LxmertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True
)

def text_process(sent, max_text_length=20, PAD_ID=0):
    tokens = tokenizer.tokenize(sent.strip())

    # Account for [CLS] and [SEP] with "- 2"
    if len(tokens) > max_text_length - 2:
        tokens = tokens[:(max_text_length - 2)]
    tokens = ['[CLS]'] + tokens + ['[SEP]']
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    n_tokens = len(input_ids)

    # # Pad up to the sequence length.
    # while len(input_ids) < max_text_length:
    #     input_ids.append(PAD_ID)
Example #14
    def __init__(self, args, dataset: NLVR2Dataset, split, verbose, topk=-1):
        super().__init__()
        self.raw_dataset = dataset
        self.args = args
        self.verbose = verbose

        self.datasets_dir = Path(self.args.datasets_dir)

        # Assign Clusters Ids
        data = self.raw_dataset.data

        if topk > 0:
            data = data[:topk]
            if self.verbose:
                print(f"Use only {topk} data")

        if args.grid_model:
            self.data_source_to_h5_path = {
                'train':
                self.datasets_dir.joinpath(
                    f'nlvr2/features/{args.encoder}_train_v4_grid{args.grid_size}.h5'
                ),
                'valid':
                self.datasets_dir.joinpath(
                    f'nlvr2/features/{args.encoder}_valid_v4_grid{args.grid_size}.h5'
                ),
                'test':
                self.datasets_dir.joinpath(
                    f'nlvr2/features/{args.encoder}_test_v4_grid{args.grid_size}.h5'
                ),
            }
        else:
            self.data_source_to_h5_path = {
                'train':
                self.datasets_dir.joinpath(
                    f'nlvr2/features/maskrcnn_train_boxes36.h5'),
                'valid':
                self.datasets_dir.joinpath(
                    f'nlvr2/features/maskrcnn_valid_boxes36.h5'),
                'test':
                self.datasets_dir.joinpath(
                    f'nlvr2/features/maskrcnn_test_boxes36.h5'),
            }

        for source, path in self.data_source_to_h5_path.items():
            assert path.is_file(), (source, path)

        self.h5_path = self.data_source_to_h5_path[split]
        self.h5_f = None

        self.data = data

        if verbose:
            print("Use %d data in torch dataset" % (len(self.data)))
            print()

        self.grid_size = args.grid_size
        self.n_grids = self.grid_size**2
        if self.args.grid_model:
            self.boxes = box_position(args.grid_size)
        else:
            self.n_boxes = self.args.n_boxes
            self.boxes = None

        self.tokenizer = LxmertTokenizer.from_pretrained("bert-base-uncased",
                                                         do_lower_case=True)
        self.max_text_length = args.max_text_length
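Example #14 stores the HDF5 path but leaves self.h5_f as None. Below is a sketch of the lazy-open pattern a __getitem__ typically pairs with this, with hypothetical field and dataset names.

    def __getitem__(self, idx):
        # h5py handles do not survive DataLoader worker forks, so the file is
        # opened on first access inside the worker rather than in __init__.
        if self.h5_f is None:
            self.h5_f = h5py.File(self.h5_path, 'r')
        datum = self.data[idx]
        # "img_id" / "features" are illustrative names; the real keys depend on
        # how the feature files were written.
        feats = self.h5_f[f"{datum['img_id']}/features"][()]
        return datum, feats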
Example #15
# For logging predictions, labels in Trainer compute_loss() override.
from transformers.integrations import is_wandb_available

from transformers import (LxmertTokenizer, LxmertForQuestionAnswering,
                          TrainingArguments, Trainer)

# Trainer compute_metrics() override for evaluation, prediction.
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support as
                             score)

# Required for Trainer prediction_step() override.
from transformers.trainer_pt_utils import (nested_concat, nested_detach)

# Instantiate tokenizer.
pretrained = f"{MODEL_PTH}{args.load}"
tokenizer = LxmertTokenizer.from_pretrained(f"{MODEL_PTH}lxmert-base-uncased")

# Instantiate VQA object: creates datasets, provides HAT methods.
vqa = VQA(tokenizer)


@dataclass
class VQATrainingArguments(TrainingArguments):
    """Required subclass to add joint loss
       coefficient for hyperparameter search."""
    x_lmbda: Optional[float] = field(
        default=args.x_lmbda, metadata={"help": "VQA-HLAT loss trade-off."})


class VQATrainer(Trainer):
    def prediction_step(self,
Example #16
# load object, attribute, and answer labels
with open(OBJ_PTH) as objf:
    objids = objf.read().splitlines()
with open(ATTR_PTH) as attf:
    attrids = attf.read().splitlines()
vqa_answers = utils.get_data(VQA_PTH)
vqa_labels = utils.get_data(LABEL_PTH)

# load models and model components
frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned",
                                        config=frcnn_cfg)
image_preprocess = Preprocess(frcnn_cfg)

tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")


def get_h_att():
    """
    Load VQA 2.0 HLAT files.
    {"pre_attmap" : attention maps for question ids (ordered)}
    https://github.com/qiaott/HAN
    for question-image pairs:
        train_val: 658,111 attention maps
        test_dev:  107,394 attention maps
        test:      447,793 attention maps
    """
    trainval = h5py.File(f'{HAT_DATA}trainval2014_attention_maps.h5', 'r')
    test_dev = h5py.File(f'{HAT_DATA}test-dev2015_attention_maps.h5', 'r')
    test = h5py.File(f'{HAT_DATA}test2015_attention_maps.h5', 'r')
Example #17
    def __init__(self,
                 args,
                 train_loader=None,
                 val_loader=None,
                 logger=None,
                 train=True):
        super().__init__()

        self.args = args
        self.max_text_length = args.max_text_length

        self.train_loader = train_loader
        self.val_loader = val_loader

        self.logger = logger

        # Build model
        self.model = XLxmertForPretraining.from_pretrained(
            "bert-base-uncased", num_clusters=args.num_clusters)

        self.tokenizer = LxmertTokenizer.from_pretrained(
            'unc-nlp/lxmert-base-uncased')

        self.verbose = True
        if self.args.distributed:
            if self.args.gpu != 0:
                self.verbose = False

        if args.clustering:
            self.datasets_dir = Path(self.args.datasets_dir)
            clustering_dir = self.datasets_dir.joinpath('clustering')
            centroid_path = clustering_dir.joinpath(
                f'{args.encoder}_{args.cluster_src}_centroids{args.n_centroids}_iter{args.n_iter}_d{args.feat_dim}_grid{args.grid_size}.npy'
            )
            centroids = np.load(centroid_path)

            self.model.set_visual_embedding(centroids)

        # Load pre-trained weights
        self.start_epoch = None
        if args.load is not None:
            path = args.load + '_LXRT.pth'
            self.load(path, verbose=self.verbose)

        # GPU Options
        print(f'Model Launching at GPU {self.args.gpu}')

        from time import time
        start = time()
        self.model = self.model.to(args.gpu)

        # Optimizer
        if train:
            self.optim, self.lr_scheduler = self.create_optimizer_and_scheduler(
            )

            if self.args.fp16 and _use_native_amp:
                self.scaler = torch.cuda.amp.GradScaler()
            elif _use_apex:
                self.model, self.optim = amp.initialize(self.model,
                                                        self.optim,
                                                        opt_level='O1',
                                                        verbosity=self.verbose)

        if args.multiGPU:
            assert args.distributed
            self.model = DDP(self.model,
                             device_ids=[args.gpu],
                             find_unused_parameters=True)
        if args.gpu == 0:
            print(f'It took {time() - start:.1f}s')
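When the native-AMP branch above is taken, the GradScaler created there drives the training loop roughly as follows, sketched as it might appear inside a train() method; compute_loss is a hypothetical placeholder for the model's loss computation.

        for batch in self.train_loader:
            self.optim.zero_grad()
            with torch.cuda.amp.autocast():
                loss = compute_loss(self.model, batch)  # hypothetical helper
            # Scale the loss for backward, unscale inside step(), then update
            # the scale factor for the next iteration.
            self.scaler.scale(loss).backward()
            self.scaler.step(self.optim)
            self.scaler.update()
            self.lr_scheduler.step()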
Example #18
    def __init__(self, split='mscoco_mininval', topk=-1, data_out=['img'], verbose=True, args=None):

        self.data_out = data_out
        self.topk = topk
        self.verbose = verbose
        self.args = args

        self.datasets_dir = Path(self.args.datasets_dir)

        # Loading datasets to data
        self.sources = split.split(',')
        if self.verbose:
            print('Data sources: ', self.sources)

        self.answer_table = AnswerTable()
        # if self.verbose:
        print("Load an answer table of size %d." % (len(self.answer_table.ans2id_map())))

        self.img_ids_to_source = {}

        data = []
        for img_source in self.sources:
            with open(self.datasets_dir.joinpath(f'data/lxmert/{img_source}.json')) as f:
                _data = json.load(f)
                if self.verbose:
                    print(f"Loaded {len(_data)} data from", img_source)
                # source_img_ids.append([d['img_id'] for d in _data])
                for datum in _data:
                    self.img_ids_to_source[datum['img_id']] = img_source
                    datum['img_source'] = img_source
                    datum['caption_only'] = args.caption_only
                    datum['clustering'] = args.clustering
                    datum['max_text_length'] = args.max_text_length
                    datum['qa'] = args.task_qa

                data.extend(_data)

        # Modify the answers
        if args.task_qa:
            for datum in data:
                labelf = datum['labelf']
                for _qa_source, labels in labelf.items():
                    for label in labels:
                        for ans in list(label.keys()):
                            new_ans = self.answer_table.convert_ans(ans)
                            if self.answer_table.used(new_ans):
                                if ans != new_ans:
                                    label[new_ans] = label.pop(ans)
                            else:
                                label.pop(ans)

        if self.topk > 0:
            data = data[:self.topk]
            if self.verbose:
                print(f"Use only {self.topk} data")

        if args.task_qa:
            self.evaluator = QAEvaluator(data)


        if args.clustering:
            clustering_dir = self.datasets_dir.joinpath('clustering')
            with open(clustering_dir.joinpath(f'{args.encoder}_{args.cluster_src}_mscoco_train_img_id_to_cluster_id_{args.n_centroids}_iter{args.n_iter}_d{args.feat_dim}_grid{args.grid_size}.pkl'), 'rb') as f:
                mscoco_train_img_id_to_cluster_id = pickle.load(f)
            with open(clustering_dir.joinpath(f'{args.encoder}_{args.cluster_src}_mscoco_valid_img_id_to_cluster_id_{args.n_centroids}_iter{args.n_iter}_d{args.feat_dim}_grid{args.grid_size}.pkl'), 'rb') as f:
                mscoco_valid_img_id_to_cluster_id = pickle.load(f)
            with open(clustering_dir.joinpath(f'{args.encoder}_{args.cluster_src}_vg_img_id_to_cluster_id_{args.n_centroids}_iter{args.n_iter}_d{args.feat_dim}_grid{args.grid_size}.pkl'), 'rb') as f:
                vg_img_id_to_cluster_id = pickle.load(f)

            self.data_source_to_cluster_data = {
                'mscoco_train': mscoco_train_img_id_to_cluster_id,
                'mscoco_minival': mscoco_valid_img_id_to_cluster_id,
                'mscoco_nominival': mscoco_valid_img_id_to_cluster_id,
                'vgnococo': vg_img_id_to_cluster_id
            }

        with Pool(8) as pool:
            if self.verbose:
                data = [datum for _data in tqdm(pool.imap(get_datum, data), total=len(data), ncols=100) for datum in _data]
            else:
                data = [datum for _data in pool.imap(get_datum, data) for datum in _data]

        if self.args.target_exact_feat or self.args.feed_exact_feat or self.args.target_obj_id:
            if args.grid_model:
                self.data_source_to_h5_path = {
                    'mscoco_train': self.datasets_dir.joinpath(f'COCO/features/{args.encoder}_train_grid{args.grid_size}.h5'),
                    'mscoco_minival': self.datasets_dir.joinpath(f'COCO/features/{args.encoder}_valid_grid{args.grid_size}.h5'),
                    'mscoco_nominival': self.datasets_dir.joinpath(f'COCO/features/{args.encoder}_valid_grid{args.grid_size}.h5'),
                    'vgnococo': self.datasets_dir.joinpath(f'VG/features/{args.encoder}_grid{args.grid_size}.h5'),
                }

            else:
                self.data_source_to_h5_path = {
                    'mscoco_train': self.datasets_dir.joinpath(f'COCO/features/maskrcnn_train_boxes36.h5'),
                    'mscoco_minival': self.datasets_dir.joinpath(f'COCO/features/maskrcnn_valid_boxes36.h5'),
                    'mscoco_nominival': self.datasets_dir.joinpath(f'COCO/features/maskrcnn_valid_boxes36.h5'),
                    'vgnococo': self.datasets_dir.joinpath(f'VG/features/maskrcnn_boxes36.h5'),
                }

            for source, path in self.data_source_to_h5_path.items():
                assert path.is_file(), (source, path)

            self.source_to_h5 = None

        self.data = data

        if args.vis_mask_COCO_only:
            COCO_data = []
            for datum in self.data:
                if datum['text_source'] == 'mscoco' and 'mscoco' in datum['img_source']:
                    COCO_data.append(datum)
            self.COCO_data = COCO_data
            if self.verbose:
                print('# COCO captions:', len(self.COCO_data))

        if self.verbose:
            if 'sent' not in self.data_out:
                print("# all images:", len(self.data))
            else:
                print("# all sentences:", len(self.data))

        self.grid_size = args.grid_size
        self.n_grids = args.n_grids
        if self.args.grid_model:
            self.boxes = box_position(args.grid_size)
        else:
            self.n_boxes = args.n_boxes
            self.boxes = None

        self.tokenizer = LxmertTokenizer.from_pretrained(
            "bert-base-uncased",
            do_lower_case=True
        )

        self.max_text_length = args.max_text_length

        ###### Pretraining Objective ######
        tasks = []
        if self.args.task_mask_lm:
            tasks.append('word_mask')
        if self.args.task_obj_predict:
            tasks.append('vis_mask')
        if self.args.task_matched:
            tasks.append('matched')
        if self.args.task_qa:
            tasks.append('qa')
        self.tasks = tasks

        if self.verbose:
            print('data_out:', self.data_out)
Example #19
    def __init__(self, args, dataset: VQADataset, split, verbose, topk=-1):
        super().__init__()
        self.raw_dataset = dataset
        self.args = args
        self.verbose = verbose

        self.datasets_dir = Path(self.args.datasets_dir)

        # Loading datasets to data
        self.sources = split.split(',')
        if self.verbose:
            print('Data sources: ', self.sources)

        self.img_ids_to_source = {}
        data_info_dicts = []

        for source in self.sources:
            data_info_path = self.datasets_dir.joinpath(f'data/vqa/{source}.json')
            with open(data_info_path) as f:
                _data_info_dicts = json.load(f)
                # source_img_ids.append([d['img_id'] for d in _data_info_dicts])
                for _d in _data_info_dicts:
                    self.img_ids_to_source[_d['img_id']] = source
                    _d['source'] = source
                data_info_dicts.extend(_data_info_dicts)
            if self.verbose:
                print(f"Loaded {len(_data_info_dicts)} data from", source)

        # data_info_dicts = self.raw_dataset.data

        if topk > 0:
            data_info_dicts = data_info_dicts[:topk]
            if self.verbose:
                print(f"Use only {topk} data")

        if args.grid_model:
            self.data_source_to_h5_path = {
                'train': self.datasets_dir.joinpath(f'COCO/features/{args.encoder}_train_grid{args.grid_size}.h5'),
                'minival': self.datasets_dir.joinpath(f'COCO/features/{args.encoder}_valid_grid{args.grid_size}.h5'),
                'nominival': self.datasets_dir.joinpath(f'COCO/features/{args.encoder}_valid_grid{args.grid_size}.h5'),
                'test': self.datasets_dir.joinpath(f'COCO/features/{args.encoder}_test_grid{args.grid_size}.h5'),
            }
        else:
            self.data_source_to_h5_path = {
                'train': self.datasets_dir.joinpath(f'COCO/features/maskrcnn_train_boxes36.h5'),
                'minival': self.datasets_dir.joinpath(f'COCO/features/maskrcnn_valid_boxes36.h5'),
                'nominival': self.datasets_dir.joinpath(f'COCO/features/maskrcnn_valid_boxes36.h5'),
                'test': self.datasets_dir.joinpath(f'COCO/features/maskrcnn_test_boxes36.h5'),
            }

        for source, path in self.data_source_to_h5_path.items():
            assert path.is_file(), (source, path)

        self.source_to_h5 = None

        self.data = data_info_dicts
        if self.verbose:
            print("# all sentences:", len(self.data))

        self.grid_size = args.grid_size
        self.n_grids = args.n_grids
        if self.args.grid_model:
            self.boxes = box_position(args.grid_size)
        else:
            self.n_boxes = args.n_boxes
            self.boxes = None

        self.tokenizer = LxmertTokenizer.from_pretrained('unc-nlp/lxmert-base-uncased')

        self.max_text_length = args.max_text_length