def __init__(self, config):
    super(ResNetVLBERTForPretrainingEncDecGenerate, self).__init__(config)

    self.image_feature_extractor = FastRCNN(config,
                                            average_pool=True,
                                            final_dim=config.NETWORK.IMAGE_FINAL_DIM,
                                            enable_cnn_reg_loss=False)
    self.object_linguistic_embeddings = nn.Embedding(1, config.NETWORK.VLBERT.hidden_size)
    if config.NETWORK.IMAGE_FEAT_PRECOMPUTED:
        self.object_mask_visual_embedding = nn.Embedding(1, 2048)
    if config.NETWORK.WITH_MVRC_LOSS:
        self.object_mask_word_embedding = nn.Embedding(1, config.NETWORK.VLBERT.hidden_size)
    self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN
    self.tokenizer = BertTokenizer.from_pretrained(config.NETWORK.BERT_MODEL_NAME)

    language_pretrained_model_path = None
    if config.NETWORK.BERT_PRETRAINED != '':
        language_pretrained_model_path = '{}-{:04d}.model'.format(
            config.NETWORK.BERT_PRETRAINED, config.NETWORK.BERT_PRETRAINED_EPOCH)
    elif os.path.isdir(config.NETWORK.BERT_MODEL_NAME):
        weight_path = os.path.join(config.NETWORK.BERT_MODEL_NAME, BERT_WEIGHTS_NAME)
        if os.path.isfile(weight_path):
            language_pretrained_model_path = weight_path
    if language_pretrained_model_path is None:
        print("Warning: no pretrained language model found, training from scratch!!!")

    self.vlbert = VisualLinguisticBertEncoder(
        config.NETWORK.VLBERT,
        language_pretrained_model_path=None
        if config.NETWORK.VLBERT.from_scratch else language_pretrained_model_path,
        with_rel_head=False,
        with_mlm_head=False,
        with_mvrc_head=False,
    )

    # FM addition: add decoder
    self.decoder = VisualLinguisticBertForPretrainingDecoder(
        config.NETWORK.VLBERT,
        language_pretrained_model_path=None
        if config.NETWORK.VLBERT.from_scratch else language_pretrained_model_path,
        with_rel_head=config.NETWORK.WITH_REL_LOSS,
        with_mlm_head=config.NETWORK.WITH_MLM_LOSS,
        with_mvrc_head=config.NETWORK.WITH_MVRC_LOSS,
    )

    # init weights
    self.init_weight()
    self.fix_params()
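# A minimal standalone sketch (not part of the class above) of the checkpoint
# naming convention used whenever BERT_PRETRAINED is set in these constructors:
# the configured prefix is joined with a four-digit zero-padded epoch number.
# The prefix value below is a hypothetical placeholder.
def resolve_pretrained_path(prefix, epoch):
    return '{}-{:04d}.model'.format(prefix, epoch)

assert resolve_pretrained_path('model/pretrained_model/bert-base-uncased', 5) \
    == 'model/pretrained_model/bert-base-uncased-0005.model'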
def __init__(self, config):
    super(ResNetVLBERT, self).__init__(config)

    self.image_feature_extractor = FastRCNN(config,
                                            average_pool=True,
                                            final_dim=config.NETWORK.IMAGE_FINAL_DIM,
                                            enable_cnn_reg_loss=False)
    self.object_linguistic_embeddings = nn.Embedding(1, config.NETWORK.VLBERT.hidden_size)
    self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN
    self.tokenizer = BertTokenizer.from_pretrained(config.NETWORK.BERT_MODEL_NAME)

    language_pretrained_model_path = None
    if config.NETWORK.BERT_PRETRAINED != '':
        language_pretrained_model_path = '{}-{:04d}.model'.format(
            config.NETWORK.BERT_PRETRAINED, config.NETWORK.BERT_PRETRAINED_EPOCH)
    elif os.path.isdir(config.NETWORK.BERT_MODEL_NAME):
        weight_path = os.path.join(config.NETWORK.BERT_MODEL_NAME, BERT_WEIGHTS_NAME)
        if os.path.isfile(weight_path):
            language_pretrained_model_path = weight_path
    self.language_pretrained_model_path = language_pretrained_model_path
    if language_pretrained_model_path is None:
        print("Warning: no pretrained language model found, training from scratch!!!")

    self.vlbert = VisualLinguisticBert(
        config.NETWORK.VLBERT,
        language_pretrained_model_path=language_pretrained_model_path)

    transform = VisualLinguisticBertMVRCHeadTransform(config.NETWORK.VLBERT)
    # self.linear = nn.Linear(config.NETWORK.VLBERT.hidden_size, 768)  # 331 1000 35 100 12003 lihui
    # self.OIM_loss = OIM_Module(331, 768)  # config.NETWORK.VLBERT.hidden_size)
    self.OIM_loss = OIM_Module(12003, 768)
    self.linear = nn.Sequential(
        # transform,
        nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
        nn.Linear(config.NETWORK.VLBERT.hidden_size, 768)  # 331 1000 35 100 12003 lihui
    )
    linear = nn.Linear(config.NETWORK.VLBERT.hidden_size, 1)
    self.final_mlp = nn.Sequential(
        transform,
        nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
        linear)

    # init weights
    self.init_weight()
    self.fix_params()
def __init__(self, config):
    super(ResNetVLBERTForPretrainingMultitaskNoVision, self).__init__(config)

    # Constructs/initialises model elements
    self.image_feature_extractor = FastRCNN(config,
                                            average_pool=True,
                                            final_dim=config.NETWORK.IMAGE_FINAL_DIM,
                                            enable_cnn_reg_loss=False)
    self.object_linguistic_embeddings = nn.Embedding(1, config.NETWORK.VLBERT.hidden_size)
    if config.NETWORK.IMAGE_FEAT_PRECOMPUTED or (not config.NETWORK.MASK_RAW_PIXELS):
        self.object_mask_visual_embedding = nn.Embedding(1, 2048)
    if config.NETWORK.WITH_MVRC_LOSS:
        self.object_mask_word_embedding = nn.Embedding(1, config.NETWORK.VLBERT.hidden_size)
    self.aux_text_visual_embedding = nn.Embedding(1, config.NETWORK.VLBERT.hidden_size)
    self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN
    self.tokenizer = BertTokenizer.from_pretrained(config.NETWORK.BERT_MODEL_NAME)

    # Can specify a pretrained model explicitly, or use the downloaded
    # pretrained model specified in the .yaml file
    language_pretrained_model_path = None
    if config.NETWORK.BERT_PRETRAINED != '':
        # language_pretrained_model_path = '{}-{:04d}.model'.format(config.NETWORK.BERT_PRETRAINED,
        #                                                           config.NETWORK.BERT_PRETRAINED_EPOCH)
        # FM edit: just use path of pretrained model
        language_pretrained_model_path = config.NETWORK.BERT_PRETRAINED
    elif os.path.isdir(config.NETWORK.BERT_MODEL_NAME):
        weight_path = os.path.join(config.NETWORK.BERT_MODEL_NAME, BERT_WEIGHTS_NAME)
        if os.path.isfile(weight_path):
            language_pretrained_model_path = weight_path
    if language_pretrained_model_path is None:
        print("Warning: no pretrained language model found, training from scratch!!!")

    self.vlbert = VisualLinguisticBertForPretraining(
        config.NETWORK.VLBERT,
        language_pretrained_model_path=None
        if config.NETWORK.VLBERT.from_scratch else language_pretrained_model_path,
        with_rel_head=config.NETWORK.WITH_REL_LOSS,
        with_mlm_head=config.NETWORK.WITH_MLM_LOSS,
        with_mvrc_head=config.NETWORK.WITH_MVRC_LOSS,
        with_MLT_head=config.NETWORK.WITH_MLT_LOSS)

    # init weights
    self.init_weight()
    self.fix_params()
def __init__(self, image_set, root_path, data_path, boxes='gt',
             proposal_source='official', transform=None, test_mode=False,
             zip_mode=False, cache_mode=False, cache_db=False,
             ignore_db_cache=True, tokenizer=None, pretrained_model_name=None,
             add_image_as_a_box=False, mask_size=(14, 14),
             aspect_grouping=False, **kwargs):
    """
    VREP Dataset

    :param image_set: image folder name
    :param root_path: root path to cache database loaded from annotation file
    :param data_path: path to dataset
    :param boxes: boxes to use, 'gt' or 'proposal'
    :param transform: transform
    :param test_mode: test mode means no labels available
    :param zip_mode: reading images and metadata in zip archive
    :param cache_mode: cache whole dataset to RAM first, then __getitem__ reads from RAM
    :param ignore_db_cache: ignore previous cached database, reload it from annotation file
    :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
    :param add_image_as_a_box: add whole image as a box
    :param mask_size: size of instance mask of each object
    :param aspect_grouping: whether to group images via their aspect ratio
    :param kwargs:
    """
    super(VRep, self).__init__()

    assert not cache_mode, 'currently not support cache mode!'

    self.data_json = 'obj_det_res.json'  # 'image_seg_test.json'
    self.ref_json = 'ref_annotations.json'
    self.boxes = boxes
    self.refer = Refer()
    self.test_mode = test_mode
    self.data_path = data_path
    self.root_path = root_path
    self.transform = transform
    self.zip_mode = zip_mode
    self.cache_mode = cache_mode
    self.cache_db = cache_db
    self.ignore_db_cache = ignore_db_cache
    self.aspect_grouping = aspect_grouping
    self.cache_dir = os.path.join(root_path, 'cache')
    self.add_image_as_a_box = add_image_as_a_box
    self.mask_size = mask_size
    if not os.path.exists(self.cache_dir):
        makedirsExist(self.cache_dir)
    self.tokenizer = tokenizer if tokenizer is not None \
        else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

    if zip_mode:
        self.zipreader = ZipReader()

    self.database = self.load_annotations()
    if self.aspect_grouping:
        self.group_ids = self.group_aspect(self.database)
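# Hedged usage sketch (illustrative, not from the repo): how a VRep instance
# might be constructed given the docstring above. The paths are hypothetical
# placeholders, so the call is left commented out.
# vrep = VRep(image_set='images',
#             root_path='./data/vrep',
#             data_path='./data/vrep',
#             boxes='gt',
#             add_image_as_a_box=True)
# print(len(vrep.database))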
def __init__(self, ann_file, image_set, root_path, data_path, transform=None,
             task='Q2A', test_mode=False, zip_mode=False, cache_mode=False,
             cache_db=False, ignore_db_cache=True, basic_tokenizer=None,
             tokenizer=None, pretrained_model_name=None,
             only_use_relevant_dets=False, add_image_as_a_box=False,
             mask_size=(14, 14), aspect_grouping=False, basic_align=False,
             qa2r_noq=False, qa2r_aug=False, seq_len=64, **kwargs):
    """
    Twitter Dataset (adapted from the Visual Commonsense Reasoning loader)

    :param ann_file: annotation jsonl file
    :param image_set: image folder name, e.g., 'vcr1images'
    :param root_path: root path to cache database loaded from annotation file
    :param data_path: path to the dataset
    :param transform: transform
    :param task: 'Q2A' means question to answer, 'QA2R' means question and answer to rationale,
                 'Q2AR' means question to answer and rationale
    :param test_mode: test mode means no labels available
    :param zip_mode: reading images and metadata in zip archive
    :param cache_mode: cache whole dataset to RAM first, then __getitem__ reads from RAM
    :param ignore_db_cache: ignore previous cached database, reload it from annotation file
    :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
    :param only_use_relevant_dets: filter out detections not used in query and response
    :param add_image_as_a_box: add whole image as a box
    :param mask_size: size of instance mask of each object
    :param aspect_grouping: whether to group images via their aspect ratio
    :param basic_align: align to tokens retokenized by basic_tokenizer
    :param qa2r_noq: in QA->R, the query contains only the correct answer, without the question
    :param qa2r_aug: in QA->R, whether to augment choices to include those with a wrong answer in the query
    :param kwargs:
    """
    super(TwitterDataset, self).__init__()

    self.cache_dir = os.path.join(root_path, 'cache')
    assert not cache_mode, 'currently not support cache mode!'
    self.data_path = data_path
    self.test_mode = test_mode
    # was `os.path.join(text_path, ann_file)`, but `text_path` is undefined in
    # this scope; assuming `data_path` was intended.
    self.ann_file = os.path.join(data_path, ann_file)
    self.image_set = image_set
    self.transform = transform
    self.cache_mode = cache_mode
    self.basic_tokenizer = basic_tokenizer if basic_tokenizer is not None \
        else BasicTokenizer(do_lower_case=True)
    if tokenizer is None:
        if pretrained_model_name is None:
            pretrained_model_name = 'bert-base-uncased'
        if 'roberta' in pretrained_model_name:
            tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name,
                                                         cache_dir=self.cache_dir)
        else:
            tokenizer = BertTokenizer.from_pretrained(pretrained_model_name,
                                                      cache_dir=self.cache_dir)
    self.tokenizer = tokenizer

    self.database = self.load_annotations(self.ann_file)
def __init__(self, config):
    super(ResNetVLBERT, self).__init__(config)

    self.config = config
    self.pre_resnet = resnet152()
    self.pre_resnet.load_state_dict(torch.load('/home/data/datasets/resnet152-b121ed2d.pth'))
    print('load resnet152 pretrained rpbert')
    self.object_visual_embeddings = nn.Linear(2048, config.NETWORK.VLBERT.hidden_size)
    self.object_linguistic_embeddings = nn.Embedding(1, config.NETWORK.VLBERT.hidden_size)
    self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN
    self.tokenizer = BertTokenizer.from_pretrained(config.NETWORK.BERT_MODEL_NAME)
    self.vlbert = VisualLinguisticBert(config.NETWORK.VLBERT)

    # init weights
    self.init_weight()
def __init__(self, ann_file, pretrained_model_name, tokenizer=None, seq_len=64,
             encoding="utf-8", on_memory=True, **kwargs):
    assert on_memory, "only support on_memory mode!"

    self.tokenizer = tokenizer if tokenizer is not None \
        else BertTokenizer.from_pretrained(pretrained_model_name)
    self.vocab = self.tokenizer.vocab
    self.seq_len = seq_len
    self.on_memory = on_memory
    self.ann_file = ann_file
    self.encoding = encoding
    self.test_mode = False

    # load samples into memory
    if on_memory:
        self.corpus = self.load_corpus()
def __init__(self, flickr_root, snlive_root, annotations_file, image_set,
             roi_set, transform=None, test_mode=False, basic_tokenizer=None,
             tokenizer=None, pretrained_model_name=None,
             add_image_as_a_box=True, **kwargs):
    """
    SNLI-VE (visually grounded entailment) Dataset

    :param image_set: image folder name, e.g., 'vcr1images'
    :param transform: transform
    :param test_mode: test mode means no labels available
    :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
    :param add_image_as_a_box: add whole image as a box
    :param kwargs:
    """
    super(SnliVEDataset, self).__init__()

    self.annotations_file = os.path.join(snlive_root, annotations_file)
    self.image_set = os.path.join(flickr_root, image_set)
    self.roi_set = os.path.join(flickr_root, roi_set)
    self.transform = transform
    self.test_mode = test_mode
    self.add_image_as_a_box = add_image_as_a_box
    self.basic_tokenizer = basic_tokenizer if basic_tokenizer is not None \
        else BasicTokenizer(do_lower_case=True)
    if tokenizer is None:
        if pretrained_model_name is None:
            pretrained_model_name = 'bert-base-uncased'
        if 'roberta' in pretrained_model_name:
            tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name)
        else:
            tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
    self.tokenizer = tokenizer

    self.database = self.load_captions()
def __init__(self, config):
    super(ResNetVLBERT, self).__init__(config)

    self.image_feature_extractor = FastRCNN(config,
                                            average_pool=True,
                                            final_dim=config.NETWORK.IMAGE_FINAL_DIM,
                                            enable_cnn_reg_loss=False)
    self.object_linguistic_embeddings = nn.Embedding(1, config.NETWORK.VLBERT.hidden_size)
    self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN
    self.tokenizer = BertTokenizer.from_pretrained(config.NETWORK.BERT_MODEL_NAME)

    language_pretrained_model_path = None
    if config.NETWORK.BERT_PRETRAINED != '':
        language_pretrained_model_path = '{}-{:04d}.model'.format(
            config.NETWORK.BERT_PRETRAINED, config.NETWORK.BERT_PRETRAINED_EPOCH)
    elif os.path.isdir(config.NETWORK.BERT_MODEL_NAME):
        weight_path = os.path.join(config.NETWORK.BERT_MODEL_NAME, BERT_WEIGHTS_NAME)
        if os.path.isfile(weight_path):
            language_pretrained_model_path = weight_path
    self.language_pretrained_model_path = language_pretrained_model_path
    if language_pretrained_model_path is None:
        print("Warning: no pretrained language model found, training from scratch!!!")

    self.vlbert = VisualLinguisticBert(
        config.NETWORK.VLBERT,
        language_pretrained_model_path=language_pretrained_model_path)

    self.task1_head = Task1Head(config.NETWORK.VLBERT)
    self.task2_head = Task2Head(config.NETWORK.VLBERT)
    self.task3_head = Task3Head(config.NETWORK.VLBERT)

    # init weights
    self.init_weight()
    self.fix_params()
def __init__(self, split, cfg, transform):
    super().__init__()
    self.split = split
    self.cfg = cfg
    self.transform = transform

    self.annotations = []
    n_img = 0
    for img in json.load(open(self.cfg.DATAPATH)):
        split = split + 'id' if split == 'val' else split  # 'val' -> 'valid'
        if img['split'] in split.split('_'):
            # if img['split'] == split:
            n_img += 1
            for annot in img['annotations']:
                if cfg.TEST.EXCL_LEFT_RIGHT and (
                        annot['predicate'] == 'to the left of'
                        or annot['predicate'] == 'to the right of'):
                    continue
                annot['url'] = img['url']
                annot['height'] = img['height']
                annot['width'] = img['width']
                annot['subject']['bbox'] = self.fix_bbox(
                    annot['subject']['bbox'], img['height'], img['width'])
                annot['object']['bbox'] = self.fix_bbox(
                    annot['object']['bbox'], img['height'], img['width'])
                self.annotations.append(annot)
    print('%d relations in %s' % (len(self.annotations), split))
    print('%d imgs in %s' % (n_img, split))

    self.cache_dir = os.path.join(cfg.DATASET.ROOT_PATH, 'cache')
    if not os.path.exists(self.cache_dir):
        makedirsExist(self.cache_dir)
    if cfg.NETWORK.BERT_MODEL_NAME:
        print('Initializing BERT tokenizer from', cfg.NETWORK.BERT_MODEL_NAME)
    self.tokenizer = BertTokenizer.from_pretrained(
        'bert-base-uncased'
        if cfg.NETWORK.BERT_MODEL_NAME is None else cfg.NETWORK.BERT_MODEL_NAME,
        cache_dir=self.cache_dir)
def __init__(self, config):
    super(ResNetVLBERT, self).__init__(config)

    self.enable_cnn_reg_loss = config.NETWORK.ENABLE_CNN_REG_LOSS
    self.cnn_loss_top = config.NETWORK.CNN_LOSS_TOP
    if not config.NETWORK.BLIND:
        self.image_feature_extractor = FastRCNN(
            config,
            average_pool=True,
            final_dim=config.NETWORK.IMAGE_FINAL_DIM,
            enable_cnn_reg_loss=(self.enable_cnn_reg_loss and not self.cnn_loss_top))
        if config.NETWORK.VLBERT.object_word_embed_mode == 1:
            self.object_linguistic_embeddings = nn.Embedding(81, config.NETWORK.VLBERT.hidden_size)
        elif config.NETWORK.VLBERT.object_word_embed_mode == 2:
            self.object_linguistic_embeddings = nn.Embedding(1, config.NETWORK.VLBERT.hidden_size)
        elif config.NETWORK.VLBERT.object_word_embed_mode == 3:
            self.object_linguistic_embeddings = None
        else:
            raise NotImplementedError
        if self.enable_cnn_reg_loss and self.cnn_loss_top:
            self.cnn_loss_reg = nn.Sequential(
                VisualLinguisticBertMVRCHeadTransform(config.NETWORK.VLBERT),
                nn.Dropout(config.NETWORK.CNN_REG_DROPOUT, inplace=False),
                nn.Linear(config.NETWORK.VLBERT.hidden_size, 81))
    self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN

    if 'roberta' in config.NETWORK.BERT_MODEL_NAME:
        self.tokenizer = RobertaTokenizer.from_pretrained(config.NETWORK.BERT_MODEL_NAME)
    else:
        self.tokenizer = BertTokenizer.from_pretrained(config.NETWORK.BERT_MODEL_NAME)

    language_pretrained_model_path = None
    if config.NETWORK.BERT_PRETRAINED != '':
        language_pretrained_model_path = '{}-{:04d}.model'.format(
            config.NETWORK.BERT_PRETRAINED, config.NETWORK.BERT_PRETRAINED_EPOCH)
    elif os.path.isdir(config.NETWORK.BERT_MODEL_NAME):
        weight_path = os.path.join(config.NETWORK.BERT_MODEL_NAME, BERT_WEIGHTS_NAME)
        if os.path.isfile(weight_path):
            language_pretrained_model_path = weight_path
    if language_pretrained_model_path is None:
        print("Warning: no pretrained language model found, training from scratch!!!")

    self.vlbert = TimeDistributed(
        VisualLinguisticBert(
            config.NETWORK.VLBERT,
            language_pretrained_model_path=language_pretrained_model_path))

    self.for_pretrain = config.NETWORK.FOR_MASK_VL_MODELING_PRETRAIN
    assert not self.for_pretrain, "Not implement pretrain mode now!"

    if not self.for_pretrain:
        dim = config.NETWORK.VLBERT.hidden_size
        if config.NETWORK.CLASSIFIER_TYPE == "2fc":
            self.final_mlp = torch.nn.Sequential(
                torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
                torch.nn.Linear(dim, config.NETWORK.CLASSIFIER_HIDDEN_SIZE),
                torch.nn.ReLU(inplace=True),
                torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
                torch.nn.Linear(config.NETWORK.CLASSIFIER_HIDDEN_SIZE, 1),
            )
        elif config.NETWORK.CLASSIFIER_TYPE == "1fc":
            self.final_mlp = torch.nn.Sequential(
                torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
                torch.nn.Linear(dim, 1))
        else:
            raise ValueError("Not support classifier type: {}!".format(
                config.NETWORK.CLASSIFIER_TYPE))

    # init weights
    self.init_weight()
    self.fix_params()
def __init__(self, ann_file, image_set, root_path, data_path, seq_len=64,
             with_precomputed_visual_feat=False, mask_raw_pixels=True,
             with_rel_task=True, with_mlm_task=False, with_mvrc_task=False,
             transform=None, test_mode=False, zip_mode=False, cache_mode=False,
             cache_db=False, ignore_db_cache=True, tokenizer=None,
             pretrained_model_name=None, add_image_as_a_box=False,
             aspect_grouping=False, languages_used='first',
             MLT_vocab='bert-base-german-cased-vocab.txt', **kwargs):
    """
    Multi30k Dataset (adapted from the Conceptual Captions loader)

    :param ann_file: annotation jsonl file
    :param image_set: image folder name, e.g., 'vcr1images'
    :param root_path: root path to cache database loaded from annotation file
    :param data_path: path to the dataset
    :param transform: transform
    :param test_mode: test mode means no labels available
    :param zip_mode: reading images and metadata in zip archive
    :param cache_mode: cache whole dataset to RAM first, then __getitem__ reads from RAM
    :param ignore_db_cache: ignore previous cached database, reload it from annotation file
    :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
    :param add_image_as_a_box: add whole image as a box
    :param aspect_grouping: whether to group images via their aspect ratio
    :param kwargs:
    """
    super(Multi30kDataset2018, self).__init__()

    assert not cache_mode, 'currently not support cache mode!'
    # TODO: need to remove this to allow testing
    # assert not test_mode

    annot = {'train': 'train_MLT_frcnn.json',
             'val': 'val_MLT_frcnn.json',
             'test2015': 'test_MLT_2018_renamed_frcnn.json'}

    self.seq_len = seq_len
    self.with_rel_task = with_rel_task
    self.with_mlm_task = with_mlm_task
    self.with_mvrc_task = with_mvrc_task
    self.data_path = data_path
    self.root_path = root_path
    self.ann_file = os.path.join(data_path, annot[image_set])
    self.with_precomputed_visual_feat = with_precomputed_visual_feat
    self.mask_raw_pixels = mask_raw_pixels
    self.image_set = image_set
    self.transform = transform
    self.test_mode = test_mode
    self.zip_mode = zip_mode
    self.cache_mode = cache_mode
    self.cache_db = cache_db
    self.ignore_db_cache = ignore_db_cache
    self.aspect_grouping = aspect_grouping
    # FM edit: added option for how many captions
    self.languages_used = languages_used
    self.cache_dir = os.path.join(root_path, 'cache')
    self.add_image_as_a_box = add_image_as_a_box
    if not os.path.exists(self.cache_dir):
        makedirsExist(self.cache_dir)
    self.tokenizer = tokenizer if tokenizer is not None \
        else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

    self.zipreader = ZipReader()

    # FM: Customise for multi30k dataset
    self.database = list(jsonlines.open(self.ann_file))
    if not self.zip_mode:
        # strip the zip archive marker and shard suffixes from the paths
        for i, idb in enumerate(self.database):
            self.database[i]['frcnn'] = idb['frcnn'].replace('.zip@', '') \
                .replace('.0', '').replace('.1', '').replace('.2', '').replace('.3', '')
            self.database[i]['image'] = idb['image'].replace('.zip@', '')

    if self.aspect_grouping:
        assert False, "not support aspect grouping currently!"
        self.group_ids = self.group_aspect(self.database)

    print('mask_raw_pixels: ', self.mask_raw_pixels)

    # FM: initialise vocabulary for output
    self.MLT_vocab_path = os.path.join(root_path, 'model/pretrained_model', MLT_vocab)
    self.MLT_vocab = []
    with open(self.MLT_vocab_path) as fp:
        for cnt, line in enumerate(fp):
            self.MLT_vocab.append(line.strip())
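# Small self-contained illustration (for exposition only, not repo code) of the
# path cleanup applied above when zip_mode is off: the '.zip@' archive marker
# and the '.0'-'.3' shard suffixes are removed from precomputed-feature paths.
def _strip_zip_markers(path):
    for marker in ('.zip@', '.0', '.1', '.2', '.3'):
        path = path.replace(marker, '')
    return path

assert _strip_zip_markers('frcnn/train.zip@/train_frcnn.json.2') == 'frcnn/train/train_frcnn.json'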
def __init__(self, image_set, root_path, data_path, answer_vocab_file,
             use_imdb=True, with_precomputed_visual_feat=False, boxes="36",
             transform=None, test_mode=False, zip_mode=False, cache_mode=False,
             cache_db=True, ignore_db_cache=True, tokenizer=None,
             pretrained_model_name=None, add_image_as_a_box=False,
             mask_size=(14, 14), aspect_grouping=False, toy_dataset=False,
             toy_samples=128, **kwargs):
    """
    Visual Question Answering Dataset

    :param image_set: image folder name
    :param root_path: root path to cache database loaded from annotation file
    :param data_path: path to the dataset
    :param transform: transform
    :param test_mode: test mode means no labels available
    :param zip_mode: reading images and metadata in zip archive
    :param cache_mode: cache whole dataset to RAM first, then __getitem__ reads from RAM
    :param ignore_db_cache: ignore previous cached database, reload it from annotation file
    :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
    :param add_image_as_a_box: add whole image as a box
    :param mask_size: size of instance mask of each object
    :param aspect_grouping: whether to group images via their aspect ratio
    :param kwargs:
    """
    super(VQA_CP, self).__init__()

    assert not cache_mode, 'currently not support cache mode!'

    categories = ['__background__', 'person', 'bicycle', 'car', 'motorcycle',
                  'airplane', 'bus', 'train', 'truck', 'boat', 'trafficlight',
                  'firehydrant', 'stopsign', 'parkingmeter', 'bench', 'bird',
                  'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear',
                  'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
                  'suitcase', 'frisbee', 'skis', 'snowboard', 'sportsball',
                  'kite', 'baseballbat', 'baseballglove', 'skateboard',
                  'surfboard', 'tennisracket', 'bottle', 'wineglass', 'cup',
                  'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
                  'sandwich', 'orange', 'broccoli', 'carrot', 'hotdog', 'pizza',
                  'donut', 'cake', 'chair', 'couch', 'pottedplant', 'bed',
                  'diningtable', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
                  'keyboard', 'cellphone', 'microwave', 'oven', 'toaster',
                  'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
                  'teddybear', 'hairdrier', 'toothbrush']
    vqa_question = {
        "train": "vqa/vqacp_v2_train_questions.json",
        "val": "vqa/vqacp_v2_test_questions.json",
    }
    vqa_annot = {
        "train": "vqa/vqacp_v2_train_annotations.json",
        "val": "vqa/vqacp_v2_test_annotations.json",
    }
    if boxes == "36":
        precomputed_boxes = {
            'train': ("vgbua_res101_precomputed", "{}_resnet101_faster_rcnn_genome_36"),
            'val': ("vgbua_res101_precomputed", "{}_resnet101_faster_rcnn_genome_36"),
        }
    elif boxes == "10-100ada":
        precomputed_boxes = {
            'train': ("vgbua_res101_precomputed", "{}_resnet101_faster_rcnn_genome"),
            'val': ("vgbua_res101_precomputed", "{}_resnet101_faster_rcnn_genome"),
        }
    else:
        raise ValueError("Not support boxes: {}!".format(boxes))
    self.coco_dataset = {
        "train2014": os.path.join(data_path, "annotations", "instances_train2014.json"),
        "val2014": os.path.join(data_path, "annotations", "instances_val2014.json"),
        "test-dev2015": os.path.join(data_path, "annotations", "image_info_test-dev2015.json"),
        "test2015": os.path.join(data_path, "annotations", "image_info_test2015.json"),
    }

    # punctuation-handling regexes and list (as in the official VQA evaluation code)
    self.periodStrip = re.compile("(?!<=\d)(\.)(?!\d)")
    self.commaStrip = re.compile("(\d)(\,)(\d)")
    self.punct = [';', r"/", '[', ']', '"', '{', '}', '(', ')', '=', '+',
                  '\\', '_', '-', '>', '<', '@', '`', ',', '?', '!']

    self.boxes = boxes
    self.test_mode = test_mode
    self.with_precomputed_visual_feat = with_precomputed_visual_feat
    self.category_to_idx = {c: i for i, c in enumerate(categories)}
    self.data_path = data_path
    self.root_path = root_path

    # load the answer vocab file: same as the vqav2 dataset
    with open(answer_vocab_file, 'r', encoding='utf8') as f:
        self.answer_vocab = [w.lower().strip().strip('\r').strip('\n').strip('\r')
                             for w in f.readlines()]
        self.answer_vocab = list(filter(lambda x: x != '', self.answer_vocab))
        self.answer_vocab = [self.processPunctuation(w) for w in self.answer_vocab]

    # config.DATA.TRAIN_IMAGE_SET and config.DATA.VAL_IMAGE_SET have a slightly
    # different use here: they indicate the mode, 'train' or 'val'
    self.image_sets = [iset.strip() for iset in image_set.split('+')]
    self.ann_files = [os.path.join(data_path, vqa_annot[iset]) for iset in self.image_sets] \
        if not self.test_mode else [None for iset in self.image_sets]
    self.q_files = [os.path.join(data_path, vqa_question[iset]) for iset in self.image_sets]
    self.precomputed_box_files = [
        os.path.join(data_path, precomputed_boxes[iset][0], precomputed_boxes[iset][1])
        for iset in self.image_sets]
    self.box_bank = {}
    self.coco_datasets = [os.path.join(data_path, '{}', 'COCO_{}_{{:012d}}.jpg')
                          for iset in self.image_sets]
    self.transform = transform
    self.zip_mode = zip_mode
    self.cache_mode = cache_mode
    self.cache_db = cache_db
    self.ignore_db_cache = ignore_db_cache
    self.aspect_grouping = aspect_grouping
    self.cache_dir = os.path.join(root_path, 'cache')
    self.add_image_as_a_box = add_image_as_a_box
    self.mask_size = mask_size
    if not os.path.exists(self.cache_dir):
        makedirsExist(self.cache_dir)
    self.tokenizer = tokenizer if tokenizer is not None \
        else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

    if zip_mode:
        self.zipreader = ZipReader()

    self.database = self.load_annotations()
    if self.aspect_grouping:
        self.group_ids = self.group_aspect(self.database)

    # toy dataset
    if toy_dataset:
        print(f"Using the toy dataset!! Total samples = {toy_samples}")
        self.database = self.database[:toy_samples]
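# Minimal runnable sketch of the answer-vocab normalisation above, using a toy
# in-memory list instead of answer_vocab_file (hypothetical values; the final
# processPunctuation pass is omitted since it is defined elsewhere).
def _demo_answer_vocab_normalisation():
    lines = ['Yes\r\n', 'No\n', '\n', '  Traffic Light  \n']
    vocab = [w.lower().strip().strip('\r').strip('\n').strip('\r') for w in lines]
    return list(filter(lambda x: x != '', vocab))

assert _demo_answer_vocab_normalisation() == ['yes', 'no', 'traffic light']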
def __init__(self, ann_file, image_set, root_path, data_path, seq_len=64,
             with_precomputed_visual_feat=False, mask_raw_pixels=True,
             with_rel_task=True, with_mlm_task=True, with_mvrc_task=True,
             transform=None, test_mode=False, zip_mode=False, cache_mode=False,
             cache_db=False, ignore_db_cache=True, tokenizer=None,
             pretrained_model_name=None, add_image_as_a_box=False,
             aspect_grouping=False, **kwargs):
    """
    Parallel Text Dataset (adapted from the Conceptual Captions loader)

    :param ann_file: annotation jsonl file
    :param image_set: image folder name, e.g., 'vcr1images'
    :param root_path: root path to cache database loaded from annotation file
    :param data_path: path to the dataset
    :param transform: transform
    :param test_mode: test mode means no labels available
    :param zip_mode: reading images and metadata in zip archive
    :param cache_mode: cache whole dataset to RAM first, then __getitem__ reads from RAM
    :param ignore_db_cache: ignore previous cached database, reload it from annotation file
    :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
    :param add_image_as_a_box: add whole image as a box
    :param aspect_grouping: whether to group images via their aspect ratio
    :param kwargs:
    """
    super(ParallelTextDataset, self).__init__()

    assert not cache_mode, 'currently not support cache mode!'
    assert not test_mode

    annot = {
        'train': 'train.json',
        'val': 'test.json',
        'test': 'test.json'
    }

    self.seq_len = seq_len
    self.with_rel_task = with_rel_task
    self.with_mlm_task = with_mlm_task
    self.with_mvrc_task = with_mvrc_task
    self.data_path = data_path
    self.root_path = root_path
    self.ann_file = os.path.join(data_path, annot[image_set])
    self.with_precomputed_visual_feat = with_precomputed_visual_feat
    self.mask_raw_pixels = mask_raw_pixels
    self.image_set = image_set
    self.transform = transform
    self.test_mode = test_mode
    self.zip_mode = zip_mode
    self.cache_mode = cache_mode
    self.cache_db = cache_db
    self.ignore_db_cache = ignore_db_cache
    self.aspect_grouping = aspect_grouping
    self.cache_dir = os.path.join(root_path, 'cache')
    self.add_image_as_a_box = add_image_as_a_box
    if not os.path.exists(self.cache_dir):
        makedirsExist(self.cache_dir)
    self.tokenizer = tokenizer if tokenizer is not None \
        else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

    self.zipreader = ZipReader()

    # FM: Customise for multi30k dataset
    self.database = list(jsonlines.open(self.ann_file))
    if self.aspect_grouping:
        assert False, "not support aspect grouping currently!"
        self.group_ids = self.group_aspect(self.database)

    print('mask_raw_pixels: ', self.mask_raw_pixels)
def __init__(self, split, cfg, transform):
    super().__init__()
    self.split = split
    self.cfg = cfg
    self.transform = transform
    self.all_proposals_test = False
    if cfg.DATASET.ALL_PROPOSALS_TEST:
        self.all_proposals_test = True
    self.annotations = []

    # Load images
    self.path = self.cfg.TEST_PATH if split == 'test' else self.cfg.TRAIN_VAL_PATH
    imgs = json.load(open(self.path))
    skipped_count = 0
    for img in imgs:
        if img['path'].endswith('.png'):
            img['path'] = '.'.join([img['path'].split('.')[0], 'jpg'])
        rels_cand = None
        if self.all_proposals_test and split != 'train':
            rels_cand = []
            nb_of_objs = len(img['objects'])
            if nb_of_objs > cfg.DATASET.MAX_NB_OF_OBJ:
                nb_of_objs = min(cfg.DATASET.MAX_NB_OF_OBJ, nb_of_objs)
                skipped_count += 1
            for sub_id in range(0, nb_of_objs):
                for obj_id in range(0, nb_of_objs):
                    if sub_id == obj_id:
                        continue
                    rels_cand.append((sub_id, obj_id))
        annot = {
            'img_path': img['path'],
            'annot': img['relationships'],
            'objects': img['objects'],
            'rels_cand': rels_cand,
        }
        self.annotations.append(annot)
    print(f'number of imgs with skipped objs (skipped_count): {skipped_count}')
    print('%d imgs in %s' % (len(self.annotations), split))

    # categories
    self.num_object_classes = len(self.cfg.OBJECT_CATEGORIES)
    self._object_class_to_ind = dict(
        zip(self.cfg.OBJECT_CATEGORIES, range(self.num_object_classes)))
    self.num_predicate_classes = len(self.cfg.PREDICATE_CATEGORIES)
    self._predicate_class_to_ind = dict(
        zip(self.cfg.PREDICATE_CATEGORIES, range(self.num_predicate_classes)))

    self.cache_dir = os.path.join(cfg.DATASET.ROOT_PATH, 'cache')
    if not os.path.exists(self.cache_dir):
        makedirsExist(self.cache_dir)
    self.tokenizer = BertTokenizer.from_pretrained(
        'bert-base-uncased'
        if cfg.NETWORK.BERT_MODEL_NAME is None else cfg.NETWORK.BERT_MODEL_NAME,
        cache_dir=self.cache_dir)

    self.sample_rels = cfg.TRAIN.SAMPLE_RELS
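# The nested sub_id/obj_id loops above enumerate every ordered pair of distinct
# object indices; a standalone equivalent using itertools (illustrative only,
# and visiting pairs in the same order as the loops):
from itertools import permutations

assert list(permutations(range(3), 2)) == \
    [(0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]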
def __init__(self, root_path, data_path, boxes='gt', proposal_source='official',
             transform=None, test_mode=False, zip_mode=False, cache_mode=False,
             cache_db=False, ignore_db_cache=True, tokenizer=None,
             pretrained_model_name=None, add_image_as_a_box=False,
             mask_size=(14, 14), aspect_grouping=False, **kwargs):
    """
    Foil Dataset

    :param root_path: root path to cache database loaded from annotation file
    :param data_path: path to dataset
    :param boxes: boxes to use, 'gt' or 'proposal'
    :param transform: transform
    :param test_mode: test mode means no labels available
    :param zip_mode: reading images and metadata in zip archive
    :param cache_mode: cache whole dataset to RAM first, then __getitem__ reads from RAM
    :param ignore_db_cache: ignore previous cached database, reload it from annotation file
    :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
    :param add_image_as_a_box: add whole image as a box
    :param mask_size: size of instance mask of each object
    :param aspect_grouping: whether to group images via their aspect ratio
    :param kwargs:
    """
    super(Foil, self).__init__()

    assert not cache_mode, 'currently not support cache mode!'

    coco_annot_files = {
        "train2014": "annotations/instances_train2014.json",
        "val2014": "annotations/instances_val2014.json",
        "test2015": "annotations/image_info_test2015.json",
    }
    foil_annot_files = {
        "train": "foil/foilv1.0_train_2017.json",
        "test": "foil/foilv1.0_test_2017.json"
    }
    foil_vocab_file = "foil/vocab.txt"
    self.vg_proposal = ("vgbua_res101_precomputed",
                        "trainval2014_resnet101_faster_rcnn_genome")
    self.test_mode = test_mode
    self.data_path = data_path
    self.root_path = root_path
    self.transform = transform

    vocab_file = open(os.path.join(data_path, foil_vocab_file), 'r')
    vocab_lines = vocab_file.readlines()
    vocab_lines = [v.strip() for v in vocab_lines]
    self.itos = vocab_lines
    self.stoi = dict(list(zip(self.itos, range(len(vocab_lines)))))

    if self.test_mode:
        self.image_set = "val2014"
        coco_annot_file = coco_annot_files["val2014"]
    else:
        self.image_set = "train2014"
        coco_annot_file = coco_annot_files["train2014"]

    self.coco = COCO(annotation_file=os.path.join(data_path, coco_annot_file))
    self.foil = FOIL(data_path, 'train' if not test_mode else 'test')
    self.foil_ids = list(self.foil.Foils.keys())
    self.foils = self.foil.loadFoils(foil_ids=self.foil_ids)
    if 'proposal' in boxes:
        # NOTE: `proposal_dets` is assumed to be defined elsewhere (e.g. at
        # module level); it is not set in this constructor.
        with open(os.path.join(data_path, proposal_dets), 'r') as f:
            proposal_list = json.load(f)
        self.proposals = {}
        for proposal in proposal_list:
            image_id = proposal['image_id']
            if image_id in self.proposals:
                self.proposals[image_id].append(proposal['box'])
            else:
                self.proposals[image_id] = [proposal['box']]
    self.boxes = boxes
    self.zip_mode = zip_mode
    self.cache_mode = cache_mode
    self.cache_db = cache_db
    self.ignore_db_cache = ignore_db_cache
    self.aspect_grouping = aspect_grouping
    self.cache_dir = os.path.join(root_path, 'cache')
    self.add_image_as_a_box = add_image_as_a_box
    self.mask_size = mask_size
    if not os.path.exists(self.cache_dir):
        makedirsExist(self.cache_dir)
    self.tokenizer = tokenizer if tokenizer is not None \
        else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

    if zip_mode:
        self.zipreader = ZipReader()

    self.database = self.load_annotations()
    if self.aspect_grouping:
        self.group_ids = self.group_aspect(self.database)
def __init__(self, config):
    super(ResNetVLBERT, self).__init__(config)

    self.predict_on_cls = config.NETWORK.VLBERT.predict_on_cls  # make prediction on [CLS]?
    self.enable_cnn_reg_loss = config.NETWORK.ENABLE_CNN_REG_LOSS
    if not config.NETWORK.BLIND:
        self.image_feature_extractor = FastRCNN(
            config,
            average_pool=True,
            final_dim=config.NETWORK.IMAGE_FINAL_DIM,
            enable_cnn_reg_loss=self.enable_cnn_reg_loss)
        if config.NETWORK.VLBERT.object_word_embed_mode == 1:
            self.object_linguistic_embeddings = nn.Embedding(81, config.NETWORK.VLBERT.hidden_size)
        elif config.NETWORK.VLBERT.object_word_embed_mode == 2:
            # default: class-agnostic
            self.object_linguistic_embeddings = nn.Embedding(1, config.NETWORK.VLBERT.hidden_size)
        elif config.NETWORK.VLBERT.object_word_embed_mode == 3:
            self.object_linguistic_embeddings = None
        else:
            raise NotImplementedError
    self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN
    self.tokenizer = BertTokenizer.from_pretrained(config.NETWORK.BERT_MODEL_NAME)

    language_pretrained_model_path = None
    if config.NETWORK.BERT_PRETRAINED != '':
        language_pretrained_model_path = '{}-{:04d}.model'.format(
            config.NETWORK.BERT_PRETRAINED, config.NETWORK.BERT_PRETRAINED_EPOCH)
    elif os.path.isdir(config.NETWORK.BERT_MODEL_NAME):
        weight_path = os.path.join(config.NETWORK.BERT_MODEL_NAME, BERT_WEIGHTS_NAME)
        if os.path.isfile(weight_path):
            language_pretrained_model_path = weight_path
    self.language_pretrained_model_path = language_pretrained_model_path
    if language_pretrained_model_path is None:
        print("Warning: no pretrained language model found, training from scratch!!!")

    self.vlbert = VisualLinguisticBert(
        config.NETWORK.VLBERT,
        language_pretrained_model_path=language_pretrained_model_path)

    dim = config.NETWORK.VLBERT.hidden_size
    if config.NETWORK.CLASSIFIER_TYPE == "2fc":
        self.final_mlp = torch.nn.Sequential(
            torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
            torch.nn.Linear(dim, config.NETWORK.CLASSIFIER_HIDDEN_SIZE),
            torch.nn.ReLU(inplace=True),
            torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
            torch.nn.Linear(config.NETWORK.CLASSIFIER_HIDDEN_SIZE,
                            config.DATASET.ANSWER_VOCAB_SIZE),
        )
    elif config.NETWORK.CLASSIFIER_TYPE == "1fc":
        self.final_mlp = torch.nn.Sequential(
            torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
            torch.nn.Linear(dim, config.DATASET.ANSWER_VOCAB_SIZE))
    elif config.NETWORK.CLASSIFIER_TYPE == 'mlm':
        transform = BertPredictionHeadTransform(config.NETWORK.VLBERT)
        linear = nn.Linear(config.NETWORK.VLBERT.hidden_size, config.DATASET.ANSWER_VOCAB_SIZE)
        self.final_mlp = nn.Sequential(
            transform,
            nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
            linear)
    else:
        raise ValueError("Not support classifier type: {}!".format(
            config.NETWORK.CLASSIFIER_TYPE))

    self.use_spatial_model = False
    if config.NETWORK.USE_SPATIAL_MODEL:
        self.use_spatial_model = True
        # self.simple_spatial_model = SimpleSpatialModel(4, config.NETWORK.VLBERT.hidden_size, 9, config)
        self.use_coord_vector = False
        if config.NETWORK.USE_COORD_VECTOR:
            self.use_coord_vector = True
            self.loc_fcs = nn.Sequential(
                nn.Linear(2 * 5 + 9, config.NETWORK.VLBERT.hidden_size),
                nn.ReLU(True),
                nn.Linear(config.NETWORK.VLBERT.hidden_size,
                          config.NETWORK.VLBERT.hidden_size))
        else:
            self.simple_spatial_model = SimpleSpatialModel(
                4, config.NETWORK.VLBERT.hidden_size, 9)
        self.spa_add = bool(config.NETWORK.SPA_ADD)
        self.spa_concat = bool(config.NETWORK.SPA_CONCAT)
        if self.spa_add:
            self.spa_feat_weight = 0.5
            if config.NETWORK.USE_SPA_WEIGHT:
                self.spa_feat_weight = config.NETWORK.SPA_FEAT_WEIGHT
            self.spa_fusion_linear = nn.Linear(config.NETWORK.VLBERT.hidden_size,
                                               config.NETWORK.VLBERT.hidden_size)
        elif self.spa_concat:
            if self.use_coord_vector:
                self.spa_fusion_linear = nn.Linear(
                    config.NETWORK.VLBERT.hidden_size + config.NETWORK.VLBERT.hidden_size,
                    config.NETWORK.VLBERT.hidden_size)
            else:
                self.spa_fusion_linear = nn.Linear(
                    config.NETWORK.VLBERT.hidden_size * 2,
                    config.NETWORK.VLBERT.hidden_size)
        self.spa_linear = nn.Linear(config.NETWORK.VLBERT.hidden_size,
                                    config.NETWORK.VLBERT.hidden_size)
        self.dropout = nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT)
        self.spa_one_more_layer = config.NETWORK.SPA_ONE_MORE_LAYER
        if self.spa_one_more_layer:
            self.spa_linear_hidden = nn.Linear(config.NETWORK.VLBERT.hidden_size,
                                               config.NETWORK.VLBERT.hidden_size)

    self.enhanced_img_feature = False
    if config.NETWORK.VLBERT.ENHANCED_IMG_FEATURE:
        self.enhanced_img_feature = True
    self.mask_weight = config.NETWORK.VLBERT.mask_weight
    self.mask_loss_sum = config.NETWORK.VLBERT.mask_loss_sum
    self.mask_loss_mse = config.NETWORK.VLBERT.mask_loss_mse
    self.no_predicate = config.NETWORK.VLBERT.NO_PREDICATE

    self.all_proposals_test = False
    if config.DATASET.ALL_PROPOSALS_TEST:
        self.all_proposals_test = True

    self.use_uvtranse = False
    if config.NETWORK.USE_UVTRANSE:
        self.use_uvtranse = True
        self.union_vec_fc = nn.Linear(config.NETWORK.VLBERT.hidden_size,
                                      config.NETWORK.VLBERT.hidden_size)
        self.uvt_add = bool(config.NETWORK.UVT_ADD)
        self.uvt_concat = bool(config.NETWORK.UVT_CONCAT)
        # exactly one of UVT_ADD / UVT_CONCAT must be enabled
        assert self.uvt_add ^ self.uvt_concat
        if self.uvt_add:
            self.uvt_feat_weight = config.NETWORK.UVT_FEAT_WEIGHT
            self.uvt_fusion_linear = nn.Linear(config.NETWORK.VLBERT.hidden_size,
                                               config.NETWORK.VLBERT.hidden_size)
        elif self.uvt_concat:
            self.uvt_fusion_linear = nn.Linear(config.NETWORK.VLBERT.hidden_size * 2,
                                               config.NETWORK.VLBERT.hidden_size)
        self.uvt_linear = nn.Linear(config.NETWORK.VLBERT.hidden_size,
                                    config.NETWORK.VLBERT.hidden_size)
        self.dropout_uvt = nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT)

    # init weights
    self.init_weight()
def __init__(self, config):
    super(ResNetVLBERT, self).__init__(config)

    self.enable_cnn_reg_loss = config.NETWORK.ENABLE_CNN_REG_LOSS
    self.cnn_loss_top = config.NETWORK.CNN_LOSS_TOP
    self.align_caption_img = config.DATASET.ALIGN_CAPTION_IMG
    self.use_phrasal_paraphrases = config.DATASET.PHRASE_CLS
    self.supervise_attention = config.NETWORK.SUPERVISE_ATTENTION
    self.normalization = config.NETWORK.ATTENTION_NORM_METHOD
    self.ewc_reg = config.NETWORK.EWC_REG
    self.importance_hparam = 0.
    if config.NETWORK.EWC_REG:
        self.fisher = pickle.load(open(config.NETWORK.FISHER_PATH, "rb"))
        self.pretrain_param = torch.load(config.NETWORK.PARAM_PRETRAIN)
        self.importance_hparam = config.NETWORK.EWC_IMPORTANCE

    if not config.NETWORK.BLIND:
        self.image_feature_extractor = FastRCNN(
            config,
            average_pool=True,
            final_dim=config.NETWORK.IMAGE_FINAL_DIM,
            enable_cnn_reg_loss=(self.enable_cnn_reg_loss and not self.cnn_loss_top))
        if config.NETWORK.VLBERT.object_word_embed_mode == 1:
            self.object_linguistic_embeddings = nn.Embedding(81, config.NETWORK.VLBERT.hidden_size)
        elif config.NETWORK.VLBERT.object_word_embed_mode == 2:
            self.object_linguistic_embeddings = nn.Embedding(1, config.NETWORK.VLBERT.hidden_size)
        elif config.NETWORK.VLBERT.object_word_embed_mode == 3:
            self.object_linguistic_embeddings = None
        else:
            raise NotImplementedError
    self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN

    if 'roberta' in config.NETWORK.BERT_MODEL_NAME:
        self.tokenizer = RobertaTokenizer.from_pretrained(config.NETWORK.BERT_MODEL_NAME)
    else:
        self.tokenizer = BertTokenizer.from_pretrained(config.NETWORK.BERT_MODEL_NAME)

    language_pretrained_model_path = None
    if config.NETWORK.BERT_PRETRAINED != '':
        language_pretrained_model_path = '{}-{:04d}.model'.format(
            config.NETWORK.BERT_PRETRAINED, config.NETWORK.BERT_PRETRAINED_EPOCH)
    elif os.path.isdir(config.NETWORK.BERT_MODEL_NAME):
        weight_path = os.path.join(config.NETWORK.BERT_MODEL_NAME, BERT_WEIGHTS_NAME)
        if os.path.isfile(weight_path):
            language_pretrained_model_path = weight_path
    if language_pretrained_model_path is None:
        print("Warning: no pretrained language model found, training from scratch!!!")

    self.vlbert = VisualLinguisticBert(
        config.NETWORK.VLBERT,
        language_pretrained_model_path=language_pretrained_model_path)

    self.for_pretrain = False
    dim = config.NETWORK.VLBERT.hidden_size
    if self.align_caption_img:
        sentence_logits_shape = 3
    else:
        sentence_logits_shape = 1
    if config.NETWORK.SENTENCE.CLASSIFIER_TYPE == "2fc":
        self.sentence_cls = torch.nn.Sequential(
            torch.nn.Dropout(config.NETWORK.SENTENCE.CLASSIFIER_DROPOUT, inplace=False),
            torch.nn.Linear(dim, config.NETWORK.SENTENCE.CLASSIFIER_HIDDEN_SIZE),
            torch.nn.ReLU(inplace=True),
            torch.nn.Dropout(config.NETWORK.SENTENCE.CLASSIFIER_DROPOUT, inplace=False),
            torch.nn.Linear(config.NETWORK.SENTENCE.CLASSIFIER_HIDDEN_SIZE,
                            sentence_logits_shape),
        )
    elif config.NETWORK.SENTENCE.CLASSIFIER_TYPE == "1fc":
        self.sentence_cls = torch.nn.Sequential(
            torch.nn.Dropout(config.NETWORK.SENTENCE.CLASSIFIER_DROPOUT, inplace=False),
            torch.nn.Linear(dim, sentence_logits_shape))
    else:
        raise ValueError("Classifier type: {} not supported!".format(
            config.NETWORK.SENTENCE.CLASSIFIER_TYPE))

    if self.use_phrasal_paraphrases:
        if config.NETWORK.PHRASE.CLASSIFIER_TYPE == "2fc":
            self.phrasal_cls = torch.nn.Sequential(
                torch.nn.Dropout(config.NETWORK.PHRASE.CLASSIFIER_DROPOUT, inplace=False),
                torch.nn.Linear(4 * dim, config.NETWORK.PHRASE.CLASSIFIER_HIDDEN_SIZE),
                torch.nn.ReLU(inplace=True),
                torch.nn.Dropout(config.NETWORK.PHRASE.CLASSIFIER_DROPOUT, inplace=False),
                torch.nn.Linear(config.NETWORK.PHRASE.CLASSIFIER_HIDDEN_SIZE, 5),
            )
        elif config.NETWORK.PHRASE.CLASSIFIER_TYPE == "1fc":
            self.phrasal_cls = torch.nn.Sequential(
                torch.nn.Dropout(config.NETWORK.PHRASE.CLASSIFIER_DROPOUT, inplace=False),
                torch.nn.Linear(4 * dim, 5))
        else:
            raise ValueError("Classifier type: {} not supported!".format(
                config.NETWORK.PHRASE.CLASSIFIER_TYPE))

    if self.supervise_attention == "indirect":
        if config.NETWORK.VG.CLASSIFIER_TYPE == "2fc":
            self.vg_cls = torch.nn.Sequential(
                torch.nn.Dropout(config.NETWORK.VG.CLASSIFIER_DROPOUT, inplace=False),
                torch.nn.Linear(2 * dim, config.NETWORK.VG.CLASSIFIER_HIDDEN_SIZE),
                torch.nn.ReLU(inplace=True),
                torch.nn.Dropout(config.NETWORK.VG.CLASSIFIER_DROPOUT, inplace=False),
                torch.nn.Linear(config.NETWORK.VG.CLASSIFIER_HIDDEN_SIZE, 1),
            )
        elif config.NETWORK.VG.CLASSIFIER_TYPE == "1fc":
            self.vg_cls = torch.nn.Sequential(
                torch.nn.Dropout(config.NETWORK.VG.CLASSIFIER_DROPOUT, inplace=False),
                torch.nn.Linear(2 * dim, 1))
        else:
            raise ValueError("Classifier type: {} not supported!".format(
                config.NETWORK.PHRASE.CLASSIFIER_TYPE))

    # init weights
    self.init_weight()
    self.fix_params()
def __init__(self, ann_file, image_set, root_path, data_path, seq_len=64,
             with_precomputed_visual_feat=False, mask_raw_pixels=True,
             with_rel_task=True, with_mlm_task=False, with_mvrc_task=False,
             transform=None, test_mode=False, zip_mode=False, cache_mode=False,
             cache_db=False, ignore_db_cache=True, tokenizer=None,
             pretrained_model_name=None, add_image_as_a_box=False,
             aspect_grouping=False, languages_used='first', **kwargs):
    """
    Multi30k Dataset with 5x mixed captions (adapted from the Conceptual Captions loader)

    :param ann_file: annotation jsonl file
    :param image_set: image folder name, e.g., 'vcr1images'
    :param root_path: root path to cache database loaded from annotation file
    :param data_path: path to the dataset
    :param transform: transform
    :param test_mode: test mode means no labels available
    :param zip_mode: reading images and metadata in zip archive
    :param cache_mode: cache whole dataset to RAM first, then __getitem__ reads from RAM
    :param ignore_db_cache: ignore previous cached database, reload it from annotation file
    :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
    :param add_image_as_a_box: add whole image as a box
    :param aspect_grouping: whether to group images via their aspect ratio
    :param kwargs:
    """
    super(Multi30kDataset_5x_Mixed, self).__init__()

    assert not cache_mode, 'currently not support cache mode!'
    # TODO: need to remove this to allow testing
    # assert not test_mode

    annot = {'train': 'train_frcnn_5captions_both.json',
             'val': 'val_frcnn.json',
             'test2015': 'test_frcnn.json'}

    self.seq_len = seq_len
    self.with_rel_task = with_rel_task
    self.with_mlm_task = with_mlm_task
    self.with_mvrc_task = with_mvrc_task
    self.data_path = data_path
    self.root_path = root_path
    self.ann_file = os.path.join(data_path, annot[image_set])
    self.with_precomputed_visual_feat = with_precomputed_visual_feat
    self.mask_raw_pixels = mask_raw_pixels
    self.image_set = image_set
    self.transform = transform
    self.test_mode = test_mode
    self.zip_mode = zip_mode
    self.cache_mode = cache_mode
    self.cache_db = cache_db
    self.ignore_db_cache = ignore_db_cache
    self.aspect_grouping = aspect_grouping
    # FM edit: added option for how many captions
    self.languages_used = languages_used
    self.cache_dir = os.path.join(root_path, 'cache')
    self.add_image_as_a_box = add_image_as_a_box
    if not os.path.exists(self.cache_dir):
        makedirsExist(self.cache_dir)
    self.tokenizer = tokenizer if tokenizer is not None \
        else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

    self.zipreader = ZipReader()

    # FM: Customise for multi30k dataset
    if not self.test_mode:
        self.database = list(jsonlines.open(self.ann_file))
        db_size = len(self.database)
        print('**************')
        print('Size before: ', db_size)
        if not self.zip_mode:
            for i, idb in enumerate(self.database):
                self.database[i]['frcnn'] = idb['frcnn'].replace('.zip@', '') \
                    .replace('.0', '').replace('.1', '').replace('.2', '').replace('.3', '')
                self.database[i]['image'] = idb['image'].replace('.zip@', '')

        # double the database - one half is used for English, one for German
        database_2 = copy.deepcopy(self.database)
        self.database = self.database + database_2
        print('**************')
        print('Size after: ', len(self.database))
        for i, idb in enumerate(self.database):
            if i < db_size:
                self.database[i]['lang'] = 'first'
            else:
                self.database[i]['lang'] = 'second'
    # FM edit: create dataset for test mode
    else:
        self.simple_database = list(jsonlines.open(self.ann_file))
        if not self.zip_mode:
            for i, idb in enumerate(self.simple_database):
                self.simple_database[i]['frcnn'] = idb['frcnn'].replace('.zip@', '') \
                    .replace('.0', '').replace('.1', '').replace('.2', '').replace('.3', '')
                self.simple_database[i]['image'] = idb['image'].replace('.zip@', '')

        # create database cross-coupling each caption with all images
        self.database = []
        db_index = 0
        for x, idb_x in enumerate(self.simple_database):
            for y, idb_y in enumerate(self.simple_database):
                self.database.append({})
                self.database[db_index]['label'] = 1.0 if x == y else 0.0
                self.database[db_index]['caption_en'] = self.simple_database[x]['caption_en']
                self.database[db_index]['caption_de'] = self.simple_database[x]['caption_de']
                self.database[db_index]['image'] = self.simple_database[y]['image']
                self.database[db_index]['frcnn'] = self.simple_database[y]['frcnn']
                self.database[db_index]['caption_index'] = x
                self.database[db_index]['image_index'] = y
                db_index += 1

    if self.aspect_grouping:
        assert False, "not support aspect grouping currently!"
        self.group_ids = self.group_aspect(self.database)

    print('mask_raw_pixels: ', self.mask_raw_pixels)
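# Toy standalone version (illustrative only) of the test-mode cross-coupling
# above: every caption index is paired with every image index, and only the
# matching pairs are labelled positive.
def _cross_couple(n):
    return [{'caption_index': x, 'image_index': y, 'label': 1.0 if x == y else 0.0}
            for x in range(n) for y in range(n)]

pairs = _cross_couple(3)
assert len(pairs) == 9 and sum(p['label'] for p in pairs) == 3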
def __init__(self, config):
    super(ResNetVLBERTv5, self).__init__(config)

    self.enable_cnn_reg_loss = config.NETWORK.ENABLE_CNN_REG_LOSS
    if not config.NETWORK.BLIND:
        self.image_feature_extractor = FastRCNN(
            config,
            average_pool=True,
            final_dim=config.NETWORK.IMAGE_FINAL_DIM,
            enable_cnn_reg_loss=self.enable_cnn_reg_loss)
        if config.NETWORK.VLBERT.object_word_embed_mode == 1:
            self.object_linguistic_embeddings = nn.Embedding(601, config.NETWORK.VLBERT.hidden_size)
        elif config.NETWORK.VLBERT.object_word_embed_mode == 2:
            self.object_linguistic_embeddings = nn.Embedding(1, config.NETWORK.VLBERT.hidden_size)
        elif config.NETWORK.VLBERT.object_word_embed_mode == 3:
            self.object_linguistic_embeddings = None
        else:
            raise NotImplementedError
    self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN
    self.tokenizer = BertTokenizer.from_pretrained(config.NETWORK.BERT_MODEL_NAME)

    language_pretrained_model_path = None
    if config.NETWORK.BERT_PRETRAINED != '':
        language_pretrained_model_path = '{}-{:04d}.model'.format(
            config.NETWORK.BERT_PRETRAINED, config.NETWORK.BERT_PRETRAINED_EPOCH)
    elif os.path.isdir(config.NETWORK.BERT_MODEL_NAME):
        weight_path = os.path.join(config.NETWORK.BERT_MODEL_NAME, BERT_WEIGHTS_NAME)
        if os.path.isfile(weight_path):
            language_pretrained_model_path = weight_path
    self.language_pretrained_model_path = language_pretrained_model_path
    if language_pretrained_model_path is None:
        print("Warning: no pretrained language model found, training from scratch!!!")

    self.vlbert = VisualLinguisticBert(
        config.NETWORK.VLBERT,
        language_pretrained_model_path=language_pretrained_model_path)

    # self.hm_out = nn.Linear(config.NETWORK.VLBERT.hidden_size, config.NETWORK.VLBERT.hidden_size)
    # self.hi_out = nn.Linear(config.NETWORK.VLBERT.hidden_size, config.NETWORK.VLBERT.hidden_size)
    self.hidden_dropout = nn.Dropout(0.2)

    if config.NETWORK.VLBERT.num_hidden_layers == 24:
        self.gating = nn.Parameter(torch.tensor([
            0.0067, 0.0070, 0.0075, 0.0075, 0.0075, 0.0074, 0.0076, 0.0075,
            0.0076, 0.0080, 0.0079, 0.0086, 0.0096, 0.0101, 0.0104, 0.0105,
            0.0111, 0.0120, 0.0126, 0.0115, 0.0108, 0.0105, 0.0104, 0.0117
        ]), requires_grad=True)
    else:
        self.gating = nn.Parameter(
            torch.ones(config.NETWORK.VLBERT.num_hidden_layers, ) * 1e-2,
            requires_grad=True)
    self.train_steps = 0

    dim = config.NETWORK.VLBERT.hidden_size
    if config.NETWORK.CLASSIFIER_TYPE == "2fc":
        self.final_mlp = torch.nn.Sequential(
            torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
            torch.nn.Linear(dim, config.NETWORK.CLASSIFIER_HIDDEN_SIZE),
            torch.nn.ReLU(inplace=True),
            torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
            torch.nn.Linear(config.NETWORK.CLASSIFIER_HIDDEN_SIZE,
                            config.NETWORK.CLASSIFIER_CLASS),
        )
    elif config.NETWORK.CLASSIFIER_TYPE == "1fc":
        self.final_mlp = torch.nn.Sequential(
            torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
            torch.nn.Linear(dim, config.NETWORK.CLASSIFIER_CLASS))
    elif config.NETWORK.CLASSIFIER_TYPE == 'mlm':
        transform = BertPredictionHeadTransform(config.NETWORK.VLBERT)
        linear = nn.Linear(config.NETWORK.VLBERT.hidden_size, config.NETWORK.CLASSIFIER_CLASS)
        self.final_mlp = nn.Sequential(
            transform,
            nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
            linear)
    else:
        raise ValueError("Not support classifier type: {}!".format(
            config.NETWORK.CLASSIFIER_TYPE))

    # init weights
    self.init_weight()
    self.fix_params()
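# Hedged sketch (not the class's actual forward pass, which is defined
# elsewhere) of how a per-layer gating parameter like self.gating above can
# combine stacked per-layer hidden states into a single representation; the
# tensor shapes here are hypothetical.
import torch

num_layers, batch, dim = 24, 2, 768
layer_outputs = torch.randn(num_layers, batch, dim)   # one hidden state per layer
gating = torch.ones(num_layers) * 1e-2                # learned scalar weight per layer
fused = (gating.view(-1, 1, 1) * layer_outputs).sum(dim=0)
assert fused.shape == (batch, dim)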
def __init__(self, image_set, root_path, data_path, boxes='gt',
             proposal_source='official', transform=None, test_mode=False,
             zip_mode=False, cache_mode=False, cache_db=False,
             ignore_db_cache=True, tokenizer=None, pretrained_model_name=None,
             add_image_as_a_box=False, mask_size=(14, 14),
             aspect_grouping=False, **kwargs):
    """
    PA-100K Dataset (adapted from the Market-1501 loader)

    :param image_set: image folder name
    :param root_path: root path to cache database loaded from annotation file
    :param data_path: path to dataset
    :param boxes: boxes to use, 'gt' or 'proposal'
    :param transform: transform
    :param test_mode: test mode means no labels available
    :param zip_mode: reading images and metadata in zip archive
    :param cache_mode: cache whole dataset to RAM first, then __getitem__ reads from RAM
    :param ignore_db_cache: ignore previous cached database, reload it from annotation file
    :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
    :param add_image_as_a_box: add whole image as a box
    :param mask_size: size of instance mask of each object
    :param aspect_grouping: whether to group images via their aspect ratio
    :param kwargs:
    """
    super(PA100K, self).__init__()

    assert not cache_mode, 'currently not support cache mode!'

    self.vg_proposal = ("vgbua_res101_precomputed",
                        "trainval2014_resnet101_faster_rcnn_genome")
    self.proposal_source = proposal_source
    self.boxes = boxes
    self.test_mode = test_mode
    self.data_path = data_path
    self.root_path = root_path
    self.transform = transform
    self.image_sets = [iset.strip() for iset in image_set.split('+')]
    self.zip_mode = zip_mode
    self.cache_mode = cache_mode
    self.cache_db = cache_db
    self.ignore_db_cache = ignore_db_cache
    self.aspect_grouping = aspect_grouping
    self.cache_dir = os.path.join(root_path, 'cache')
    self.add_image_as_a_box = add_image_as_a_box
    self.mask_size = mask_size
    if not os.path.exists(self.cache_dir):
        makedirsExist(self.cache_dir)
    self.tokenizer = tokenizer if tokenizer is not None \
        else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

    self.trainval_id_to_cls = {}
    self.image_nums = 0
    # self.imgid2entry = {}
    self.ps_map = {}
    self.imgid2psid = {}
    self.trainval_index_to_id = {}
    self.image_entries = []
    self.pa100k_attribute = self.generate_data_description()
    self.database = self.load_annotations(self.pa100k_attribute)
    # if self.aspect_grouping:
    #     self.group_ids = self.group_aspect(self.database)
    self.part = 7
    self.max_boxes = 7
    self.max_word = 26
    self.val_images = []
    self.val_boxes = []
    self.val_im_info = []
    self.val_ids = []
    self.val_feat = []
    self.diff = 2
def __init__(self, ann_file, image_set, root_path, data_path, transform=None,
             task='Q2A', test_mode=False, zip_mode=False, cache_mode=False,
             cache_db=False, ignore_db_cache=True, basic_tokenizer=None,
             tokenizer=None, pretrained_model_name=None,
             only_use_relevant_dets=False, add_image_as_a_box=False,
             mask_size=(14, 14), aspect_grouping=False, basic_align=False,
             qa2r_noq=False, qa2r_aug=False, seq_len=64, **kwargs):
    """
    Visual Commonsense Reasoning Dataset
    :param ann_file: annotation jsonl file
    :param image_set: image folder name, e.g., 'vcr1images'
    :param root_path: root path to cache database loaded from annotation file
    :param data_path: path to vcr dataset
    :param transform: transform
    :param task: 'Q2A' means question to answer, 'QA2R' means question and answer to rationale,
                 'Q2AR' means question to answer and rationale
    :param test_mode: test mode means no labels available
    :param zip_mode: reading images and metadata in zip archive
    :param cache_mode: cache whole dataset to RAM first, then __getitem__ reads them from RAM
    :param ignore_db_cache: ignore previous cached database, reload it from annotation file
    :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
    :param only_use_relevant_dets: filter out detections not used in query and response
    :param add_image_as_a_box: add whole image as a box
    :param mask_size: size of instance mask of each object
    :param aspect_grouping: whether to group images via their aspect ratio
    :param basic_align: align to tokens retokenized by basic_tokenizer
    :param qa2r_noq: in QA->R, the query contains only the correct answer, without the question
    :param qa2r_aug: in QA->R, whether to augment choices to include those with wrong answers in the query
    :param kwargs:
    """
    super(VCRDataset, self).__init__()
    assert not cache_mode, 'currently not support cache mode!'
    assert task in ['Q2A', 'QA2R', 'Q2AR'], 'not support task {}'.format(task)
    assert not qa2r_aug, "Not implemented!"

    self.qa2r_noq = qa2r_noq
    self.qa2r_aug = qa2r_aug
    self.seq_len = seq_len

    categories = ['__background__', 'person', 'bicycle', 'car', 'motorcycle',
                  'airplane', 'bus', 'train', 'truck', 'boat', 'trafficlight',
                  'firehydrant', 'stopsign', 'parkingmeter', 'bench', 'bird',
                  'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear',
                  'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
                  'suitcase', 'frisbee', 'skis', 'snowboard', 'sportsball',
                  'kite', 'baseballbat', 'baseballglove', 'skateboard',
                  'surfboard', 'tennisracket', 'bottle', 'wineglass', 'cup',
                  'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
                  'sandwich', 'orange', 'broccoli', 'carrot', 'hotdog',
                  'pizza', 'donut', 'cake', 'chair', 'couch', 'pottedplant',
                  'bed', 'diningtable', 'toilet', 'tv', 'laptop', 'mouse',
                  'remote', 'keyboard', 'cellphone', 'microwave', 'oven',
                  'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
                  'scissors', 'teddybear', 'hairdrier', 'toothbrush']
    self.category_to_idx = {c: i for i, c in enumerate(categories)}
    self.data_path = data_path
    self.root_path = root_path
    self.ann_file = os.path.join(data_path, ann_file)
    self.image_set = image_set
    self.transform = transform
    self.task = task
    self.test_mode = test_mode
    self.zip_mode = zip_mode
    self.cache_mode = cache_mode
    self.cache_db = cache_db
    self.ignore_db_cache = ignore_db_cache
    self.aspect_grouping = aspect_grouping
    self.basic_align = basic_align
    print('Dataset Basic Align: {}'.format(self.basic_align))
    self.cache_dir = os.path.join(root_path, 'cache')
    self.only_use_relevant_dets = only_use_relevant_dets
    self.add_image_as_a_box = add_image_as_a_box
    self.mask_size = mask_size
    if not os.path.exists(self.cache_dir):
        makedirsExist(self.cache_dir)
    self.basic_tokenizer = basic_tokenizer if basic_tokenizer is not None \
        else BasicTokenizer(do_lower_case=True)
    if tokenizer is None:
        if pretrained_model_name is None:
            pretrained_model_name = 'bert-base-uncased'
        if 'roberta' in pretrained_model_name:
            tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name,
                                                         cache_dir=self.cache_dir)
        else:
            tokenizer = BertTokenizer.from_pretrained(pretrained_model_name,
                                                      cache_dir=self.cache_dir)
    self.tokenizer = tokenizer

    if zip_mode:
        self.zipreader = ZipReader()

    self.database = self.load_annotations(self.ann_file)
    if self.aspect_grouping:
        assert False, "Not support aspect grouping now!"
        self.group_ids = self.group_aspect(self.database)

    self.person_name_id = 0
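# Usage sketch (illustrative, not part of the original file): the VCR Q->A
# task with the default BERT tokenizer. Folder names follow the docstring
# above; the exact paths and jsonl name are assumptions.
vcr = VCRDataset(
    ann_file='train.jsonl',      # annotation jsonl, per the docstring
    image_set='vcr1images',
    root_path='./output',
    data_path='./data/vcr',      # assumed dataset location
    task='Q2A',
    seq_len=64,
)
print('VCR entries:', len(vcr.database))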
def __init__(self, image_set, root_path, data_path, boxes='gt',
             proposal_source='official', transform=None, test_mode=False,
             zip_mode=False, cache_mode=False, cache_db=False,
             ignore_db_cache=True, tokenizer=None, pretrained_model_name=None,
             add_image_as_a_box=False, mask_size=(14, 14),
             aspect_grouping=False, parts=1, number_sep=1, part_methods='VS',
             **kwargs):
    """
    Pedes Dataset
    :param image_set: image folder name
    :param root_path: root path to cache database loaded from annotation file
    :param data_path: path to dataset
    :param boxes: boxes to use, 'gt' or 'proposal'
    :param transform: transform
    :param test_mode: test mode means no labels available
    :param zip_mode: reading images and metadata in zip archive
    :param cache_mode: cache whole dataset to RAM first, then __getitem__ reads them from RAM
    :param ignore_db_cache: ignore previous cached database, reload it from annotation file
    :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
    :param add_image_as_a_box: add whole image as a box
    :param mask_size: size of instance mask of each object
    :param aspect_grouping: whether to group images via their aspect ratio
    :param parts: number of parts per image
    :param number_sep: separation factor; together with parts it determines the
                       number of part proposals (number_sep * parts - number_sep + 1)
    :param part_methods: 'KS' loads pre-detected JPP part boxes from result.json
    :param kwargs:
    """
    super(Pedes, self).__init__()
    assert not cache_mode, 'currently not support cache mode!'
    self.pedes_annot_files = {
        "trainval": "trainval.json",
    }
    self.vg_proposal = ("vgbua_res101_precomputed",
                        "trainval2014_resnet101_faster_rcnn_genome")
    self.proposal_source = proposal_source
    self.boxes = boxes
    self.test_mode = test_mode
    self.data_path = data_path
    self.root_path = root_path
    self.transform = transform
    self.image_sets = [iset.strip() for iset in image_set.split('+')]
    # self.coco = COCO(annotation_file=os.path.join(data_path, coco_annot_files['train2014']))
    # self.refer = REFER(data_path, dataset='refcoco+', splitBy='unc')
    # self.refer_ids = []
    # for iset in self.image_sets:
    #     self.refer_ids.extend(self.refer.getRefIds(split=iset))
    # self.refs = self.refer.loadRefs(ref_ids=self.refer_ids)
    self.zip_mode = zip_mode
    self.cache_mode = cache_mode
    self.cache_db = cache_db
    self.ignore_db_cache = ignore_db_cache
    self.aspect_grouping = aspect_grouping
    self.cache_dir = os.path.join(root_path, 'cache')
    self.add_image_as_a_box = add_image_as_a_box
    self.mask_size = mask_size
    if not os.path.exists(self.cache_dir):
        makedirsExist(self.cache_dir)
    self.tokenizer = tokenizer if tokenizer is not None \
        else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)
    self.trainval_id_to_cls = {}
    self.image_nums = 0
    self.imgid2entry = {}
    self.ps_map = {}
    self.imgid2psid = {}
    self.trainval_index_to_id = {}
    with open(os.path.join(self.data_path,
                           self.pedes_annot_files['trainval'])) as f:
        self.setting = json.load(f)
    self.database = self.load_annotations()
    # if self.aspect_grouping:
    #     self.group_ids = self.group_aspect(self.database)
    self.part = parts
    self.max_word = 50
    self.val_images = []
    self.val_boxes = []
    self.val_im_info = []
    self.val_ids = []
    self.val_feat = []
    self.diff = 7
    self.use_JPP = False
    if part_methods == 'KS':
        self.use_JPP = True
    self.number_sep = number_sep
    self.number_parts = self.number_sep * self.part - self.number_sep + 1
    if self.use_JPP:
        with open(os.path.join(self.data_path, 'result.json')) as f_box:  # box_frcnn.json
            self.JPP_boxes = json.load(f_box)
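# Usage sketch (illustrative): with part_methods='KS' the constructor also
# loads pre-detected JPP part boxes from <data_path>/result.json. All paths
# are assumptions for illustration.
pedes = Pedes(
    image_set='trainval',
    root_path='./output',
    data_path='./data/pedes',    # must contain trainval.json
    parts=7,
    number_sep=2,
    part_methods='KS',
)
# number_parts = number_sep * parts - number_sep + 1 = 2 * 7 - 2 + 1 = 13
print('part proposals per image:', pedes.number_parts)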
def __init__(self, image_set, root_path, data_path, boxes='gt',
             proposal_source='official', transform=None, test_mode=False,
             zip_mode=False, cache_mode=False, cache_db=False,
             ignore_db_cache=True, tokenizer=None, pretrained_model_name=None,
             add_image_as_a_box=False, mask_size=(14, 14),
             aspect_grouping=False, **kwargs):
    """
    RefCOCO+ Dataset
    :param image_set: image folder name
    :param root_path: root path to cache database loaded from annotation file
    :param data_path: path to dataset
    :param boxes: boxes to use, 'gt' or 'proposal'
    :param transform: transform
    :param test_mode: test mode means no labels available
    :param zip_mode: reading images and metadata in zip archive
    :param cache_mode: cache whole dataset to RAM first, then __getitem__ reads them from RAM
    :param ignore_db_cache: ignore previous cached database, reload it from annotation file
    :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
    :param add_image_as_a_box: add whole image as a box
    :param mask_size: size of instance mask of each object
    :param aspect_grouping: whether to group images via their aspect ratio
    :param kwargs:
    """
    super(RefCOCO, self).__init__()
    assert not cache_mode, 'currently not support cache mode!'
    categories = ['__background__', 'person', 'bicycle', 'car', 'motorcycle',
                  'airplane', 'bus', 'train', 'truck', 'boat', 'trafficlight',
                  'firehydrant', 'stopsign', 'parkingmeter', 'bench', 'bird',
                  'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear',
                  'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
                  'suitcase', 'frisbee', 'skis', 'snowboard', 'sportsball',
                  'kite', 'baseballbat', 'baseballglove', 'skateboard',
                  'surfboard', 'tennisracket', 'bottle', 'wineglass', 'cup',
                  'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
                  'sandwich', 'orange', 'broccoli', 'carrot', 'hotdog',
                  'pizza', 'donut', 'cake', 'chair', 'couch', 'pottedplant',
                  'bed', 'diningtable', 'toilet', 'tv', 'laptop', 'mouse',
                  'remote', 'keyboard', 'cellphone', 'microwave', 'oven',
                  'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
                  'scissors', 'teddybear', 'hairdrier', 'toothbrush']
    coco_annot_files = {
        "train2014": "annotations/instances_train2014.json",
        "val2014": "annotations/instances_val2014.json",
        "test2015": "annotations/image_info_test2015.json",
    }
    proposal_dets = 'refcoco+/proposal/res101_coco_minus_refer_notime_dets.json'
    proposal_masks = 'refcoco+/proposal/res101_coco_minus_refer_notime_masks.json'
    self.vg_proposal = ("vgbua_res101_precomputed",
                        "trainval2014_resnet101_faster_rcnn_genome")
    self.proposal_source = proposal_source
    self.boxes = boxes
    self.test_mode = test_mode
    self.category_to_idx = {c: i for i, c in enumerate(categories)}
    self.data_path = data_path
    self.root_path = root_path
    self.transform = transform
    self.image_sets = [iset.strip() for iset in image_set.split('+')]
    self.coco = COCO(annotation_file=os.path.join(
        data_path, coco_annot_files['train2014']))
    self.refer = REFER(data_path, dataset='refcoco+', splitBy='unc')
    self.refer_ids = []
    for iset in self.image_sets:
        self.refer_ids.extend(self.refer.getRefIds(split=iset))
    self.refs = self.refer.loadRefs(ref_ids=self.refer_ids)
    if 'proposal' in boxes:
        with open(os.path.join(data_path, proposal_dets), 'r') as f:
            proposal_list = json.load(f)
        self.proposals = {}
        for proposal in proposal_list:
            image_id = proposal['image_id']
            if image_id in self.proposals:
                self.proposals[image_id].append(proposal['box'])
            else:
                self.proposals[image_id] = [proposal['box']]
    self.zip_mode = zip_mode
    self.cache_mode = cache_mode
    self.cache_db = cache_db
    self.ignore_db_cache = ignore_db_cache
    self.aspect_grouping = aspect_grouping
    self.cache_dir = os.path.join(root_path, 'cache')
    self.add_image_as_a_box = add_image_as_a_box
    self.mask_size = mask_size
    if not os.path.exists(self.cache_dir):
        makedirsExist(self.cache_dir)
    self.tokenizer = tokenizer if tokenizer is not None \
        else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)
    if zip_mode:
        self.zipreader = ZipReader()
    self.database = self.load_annotations()
    if self.aspect_grouping:
        self.group_ids = self.group_aspect(self.database)
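# Usage sketch (illustrative): boxes='proposal' makes the constructor load
# the res101 proposal detections and bucket them by image_id, as above.
# data_path is assumed to contain the COCO annotations and refcoco+ files.
refcoco = RefCOCO(
    image_set='train',
    root_path='./output',
    data_path='./data/coco',     # assumed dataset location
    boxes='proposal',
)
print('referring expressions:', len(refcoco.refs))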
def __init__(self, root_path=None, image_set='train', transform=None,
             test_mode=False, zip_mode=False, cache_mode=False, cache_db=True,
             tokenizer=None, pretrained_model_name=None,
             add_image_as_a_box=False, mask_size=(14, 14),
             aspect_grouping=False, **kwargs):
    """
    Visual Question Answering Dataset
    :param root_path: root path to cache database loaded from annotation file
    :param image_set: data split to load, e.g., 'train'
    :param transform: transform
    :param test_mode: test mode means no labels available
    :param zip_mode: reading images and metadata in zip archive
    :param cache_mode: cache whole dataset to RAM first, then __getitem__ reads them from RAM
    :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
    :param add_image_as_a_box: add whole image as a box
    :param mask_size: size of instance mask of each object
    :param aspect_grouping: whether to group images via their aspect ratio
    :param kwargs:
    """
    super(CLS3, self).__init__()
    assert not cache_mode, 'currently not support cache mode!'
    categories = ['__background__', 'person', 'bicycle', 'car', 'motorcycle',
                  'airplane', 'bus', 'train', 'truck', 'boat', 'trafficlight',
                  'firehydrant', 'stopsign', 'parkingmeter', 'bench', 'bird',
                  'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear',
                  'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
                  'suitcase', 'frisbee', 'skis', 'snowboard', 'sportsball',
                  'kite', 'baseballbat', 'baseballglove', 'skateboard',
                  'surfboard', 'tennisracket', 'bottle', 'wineglass', 'cup',
                  'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
                  'sandwich', 'orange', 'broccoli', 'carrot', 'hotdog',
                  'pizza', 'donut', 'cake', 'chair', 'couch', 'pottedplant',
                  'bed', 'diningtable', 'toilet', 'tv', 'laptop', 'mouse',
                  'remote', 'keyboard', 'cellphone', 'microwave', 'oven',
                  'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
                  'scissors', 'teddybear', 'hairdrier', 'toothbrush']
    self.category_to_idx = {c: i for i, c in enumerate(categories)}
    self.data_split = image_set  # HACK: reuse old parameter
    # NOTE: the original pattern read "(?!<=\d)(\.)(?!\d)"; the intent is a
    # negative lookbehind so decimal points inside numbers are kept.
    self.periodStrip = re.compile(r"(?<!\d)(\.)(?!\d)")
    self.commaStrip = re.compile(r"(\d)(\,)(\d)")
    self.punct = [';', r"/", '[', ']', '"', '{', '}', '(', ')', '=', '+',
                  '\\', '_', '-', '>', '<', '@', '`', ',', '?', '!']
    self.test_mode = test_mode
    self.root_path = root_path
    self.box_bank = {}
    self.transform = transform
    self.zip_mode = zip_mode
    self.aspect_grouping = aspect_grouping
    self.add_image_as_a_box = add_image_as_a_box
    self.cache_dir = os.path.join(root_path, 'cache')
    model_name = 'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name
    # fast tokenizer with offset mapping (was hard-coded to 'bert-base-uncased',
    # ignoring pretrained_model_name)
    self.fast_tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        cache_dir=self.cache_dir,
        use_fast=True,
        return_offsets_mapping=True)
    self.tokenizer = tokenizer if tokenizer is not None \
        else BertTokenizer.from_pretrained(model_name, cache_dir=self.cache_dir)
    self.max_txt_token = 128
    if zip_mode:
        self.zipreader = ZipReader()
    self.anno_aug = 'anno_aug' in kwargs
    self.database = self.load_annotations()
    self.use_img_box = True
    self.random_drop_tags = False
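# Usage sketch (illustrative): image_set doubles as the data split
# (self.data_split), so 'train'/'val'-style strings are expected. The
# root_path below is an assumption.
cls3 = CLS3(
    root_path='./output',        # cache dir becomes ./output/cache
    image_set='train',
    test_mode=False,
)
print('max text tokens:', cls3.max_txt_token)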
def __init__(self, captions_set, ann_file, roi_set, image_set, root_path,
             data_path, small_version=False, negative_sampling='hard',
             phrase_cls=True, transform=None, test_mode=False, zip_mode=False,
             cache_mode=False, cache_db=False, ignore_db_cache=True,
             basic_tokenizer=None, tokenizer=None, pretrained_model_name=None,
             add_image_as_a_box=True, on_memory=False, **kwargs):
    """
    Visual Grounded Paraphrase Dataset
    :param captions_set: folder containing the caption files
    :param ann_file: annotation csv file
    :param roi_set: folder containing the region-of-interest files
    :param image_set: image folder name
    :param root_path: root path to cache database loaded from annotation file
    :param data_path: path to dataset
    :param transform: transform
    :param test_mode: test mode means no labels available
    :param cache_mode: cache whole dataset to RAM first, then __getitem__ reads them from RAM
    :param ignore_db_cache: ignore previous cached database, reload it from annotation file
    :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
    :param add_image_as_a_box: add whole image as a box
    :param kwargs:
    """
    super(VGPDataset, self).__init__()
    # temporarily enable cache mode and see if it works
    # assert not cache_mode, 'currently not support cache mode!'
    self.data_path = data_path
    self.root_path = root_path
    self.captions_set = os.path.join(data_path, captions_set)
    self.ann_file = os.path.join(data_path, ann_file)
    self.roi_set = os.path.join(data_path, roi_set)
    self.image_set = os.path.join(self.data_path, image_set)
    self.small = small_version
    self.neg_sampling = negative_sampling
    self.phrase_cls = phrase_cls
    self.transform = transform
    self.test_mode = test_mode
    self.zip_mode = zip_mode
    self.cache_mode = cache_mode
    self.cache_db = cache_db
    self.ignore_db_cache = ignore_db_cache
    self.cache_dir = os.path.join(root_path, 'cache')
    self.add_image_as_a_box = add_image_as_a_box
    self.on_memory = False  # forced off: on-memory mode doesn't work yet
    if not os.path.exists(self.cache_dir):
        makedirsExist(self.cache_dir)
    self.basic_tokenizer = basic_tokenizer if basic_tokenizer is not None \
        else BasicTokenizer(do_lower_case=True)
    if tokenizer is None:
        if pretrained_model_name is None:
            pretrained_model_name = 'bert-base-uncased'
        if 'roberta' in pretrained_model_name:
            tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name)
        else:
            tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
    self.tokenizer = tokenizer
    if zip_mode:
        self.zipreader = ZipReader()
    self.database = self.load_captions(self.captions_set)
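# Usage sketch (illustrative): all *_set arguments are folder/file names that
# the constructor joins onto data_path; every name below is an assumption.
vgp = VGPDataset(
    captions_set='captions',
    ann_file='annotations.csv',
    roi_set='rois',
    image_set='flickr30k_images',
    root_path='./output',
    data_path='./data/vgp',
    negative_sampling='hard',
    phrase_cls=True,
)
print('caption entries:', len(vgp.database))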
def __init__(self, config):
    super(ResNetVLBERT, self).__init__(config)
    self.enable_cnn_reg_loss = config.NETWORK.ENABLE_CNN_REG_LOSS
    if not config.NETWORK.BLIND:
        self.image_feature_extractor = FastRCNN(
            config,
            average_pool=True,
            final_dim=config.NETWORK.IMAGE_FINAL_DIM,
            enable_cnn_reg_loss=self.enable_cnn_reg_loss)
        if config.NETWORK.VLBERT.object_word_embed_mode == 1:
            self.object_linguistic_embeddings = nn.Embedding(81, config.NETWORK.VLBERT.hidden_size)
        elif config.NETWORK.VLBERT.object_word_embed_mode == 2:
            self.object_linguistic_embeddings = nn.Embedding(1, config.NETWORK.VLBERT.hidden_size)
        elif config.NETWORK.VLBERT.object_word_embed_mode == 3:
            self.object_linguistic_embeddings = None
        else:
            raise NotImplementedError
    self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN
    self.tokenizer = BertTokenizer.from_pretrained(config.NETWORK.BERT_MODEL_NAME)

    language_pretrained_model_path = None
    if config.NETWORK.BERT_PRETRAINED != '':
        language_pretrained_model_path = '{}-{:04d}.model'.format(
            config.NETWORK.BERT_PRETRAINED, config.NETWORK.BERT_PRETRAINED_EPOCH)
    elif os.path.isdir(config.NETWORK.BERT_MODEL_NAME):
        weight_path = os.path.join(config.NETWORK.BERT_MODEL_NAME, BERT_WEIGHTS_NAME)
        if os.path.isfile(weight_path):
            language_pretrained_model_path = weight_path
    self.language_pretrained_model_path = language_pretrained_model_path
    if language_pretrained_model_path is None:
        print("Warning: no pretrained language model found, training from scratch!!!")

    # Also pass the finetuning strategy
    self.vlbert = VisualLinguisticBert(
        config.NETWORK.VLBERT,
        language_pretrained_model_path=language_pretrained_model_path,
        finetune_strategy=config.FINETUNE_STRATEGY)

    # self.hm_out = nn.Linear(config.NETWORK.VLBERT.hidden_size, config.NETWORK.VLBERT.hidden_size)
    # self.hi_out = nn.Linear(config.NETWORK.VLBERT.hidden_size, config.NETWORK.VLBERT.hidden_size)

    dim = config.NETWORK.VLBERT.hidden_size
    if config.NETWORK.CLASSIFIER_TYPE == "2fc":
        self.final_mlp = torch.nn.Sequential(
            torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
            torch.nn.Linear(dim, config.NETWORK.CLASSIFIER_HIDDEN_SIZE),
            torch.nn.ReLU(inplace=True),
            torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
            torch.nn.Linear(config.NETWORK.CLASSIFIER_HIDDEN_SIZE, config.DATASET.ANSWER_VOCAB_SIZE),
        )
    elif config.NETWORK.CLASSIFIER_TYPE == "1fc":
        self.final_mlp = torch.nn.Sequential(
            torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
            torch.nn.Linear(dim, config.DATASET.ANSWER_VOCAB_SIZE)
        )
    elif config.NETWORK.CLASSIFIER_TYPE == 'mlm':
        transform = BertPredictionHeadTransform(config.NETWORK.VLBERT)
        linear = nn.Linear(config.NETWORK.VLBERT.hidden_size, config.DATASET.ANSWER_VOCAB_SIZE)
        self.final_mlp = nn.Sequential(
            transform,
            nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
            linear
        )
    else:
        raise ValueError("Not support classifier type: {}!".format(config.NETWORK.CLASSIFIER_TYPE))

    # init weights
    self.init_weight()
    self.fix_params()
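# Config sketch (illustrative): the fields this constructor reads. Values
# are placeholders, not the repo's defaults; in the repo they come from the
# experiment yaml, and a real VLBERT sub-config needs many more fields.
from easydict import EasyDict as edict

cfg = edict()
cfg.FINETUNE_STRATEGY = 'standard'   # forwarded to VisualLinguisticBert
cfg.NETWORK = edict(BLIND=False, ENABLE_CNN_REG_LOSS=False,
                    IMAGE_FINAL_DIM=768, IMAGE_FROZEN_BN=True,
                    BERT_MODEL_NAME='bert-base-uncased',
                    BERT_PRETRAINED='', BERT_PRETRAINED_EPOCH=0,
                    CLASSIFIER_TYPE='1fc', CLASSIFIER_DROPOUT=0.1,
                    CLASSIFIER_HIDDEN_SIZE=1024)
cfg.NETWORK.VLBERT = edict(hidden_size=768, object_word_embed_mode=2)
cfg.DATASET = edict(ANSWER_VOCAB_SIZE=3129)  # placeholder vocab size
# model = ResNetVLBERT(cfg)  # would also require the full VLBERT sub-config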
def __init__(self, ann_file, image_set, root_path, data_path, seq_len=64,
             with_precomputed_visual_feat=False, mask_raw_pixels=True,
             with_rel_task=True, with_mlm_task=True, with_mvrc_task=True,
             transform=None, test_mode=False, zip_mode=False, cache_mode=False,
             cache_db=False, ignore_db_cache=True, tokenizer=None,
             pretrained_model_name=None, add_image_as_a_box=False,
             aspect_grouping=False, **kwargs):
    """
    COCO Captions Dataset
    :param ann_file: annotation file (unused; the 2017 caption annotations are selected via image_set)
    :param image_set: image set to load, 'train' or 'val'
    :param root_path: root path to cache database loaded from annotation file
    :param data_path: path to COCO dataset
    :param transform: transform
    :param test_mode: test mode means no labels available
    :param zip_mode: reading images and metadata in zip archive
    :param cache_mode: cache whole dataset to RAM first, then __getitem__ reads them from RAM
    :param ignore_db_cache: ignore previous cached database, reload it from annotation file
    :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
    :param add_image_as_a_box: add whole image as a box
    :param aspect_grouping: whether to group images via their aspect ratio
    :param kwargs:
    """
    super(COCOCaptionsDataset, self).__init__()
    assert not cache_mode, 'currently not support cache mode!'
    assert not test_mode

    annot = {
        'train': 'annotations/captions_train2017.json',
        'val': 'annotations/captions_val2017.json'
    }
    annot_inst = {
        'train': 'annotations/instances_train2017.json',
        'val': 'annotations/instances_val2017.json'
    }

    if zip_mode:
        self.root = os.path.join(data_path, '{0}2017.zip@/{0}2017'.format(image_set))
    else:
        self.root = os.path.join(data_path, '{}2017'.format(image_set))

    self.seq_len = seq_len
    self.with_rel_task = with_rel_task
    self.with_mlm_task = with_mlm_task
    self.with_mvrc_task = with_mvrc_task
    self.data_path = data_path
    self.root_path = root_path
    # NOTE: the ann_file argument is ignored in favor of the fixed 2017 files above
    self.ann_file = os.path.join(data_path, annot[image_set])
    self.ann_file_inst = os.path.join(data_path, annot_inst[image_set])
    self.with_precomputed_visual_feat = with_precomputed_visual_feat
    self.mask_raw_pixels = mask_raw_pixels
    self.image_set = image_set
    self.transform = transform
    self.test_mode = test_mode
    self.zip_mode = zip_mode
    self.cache_mode = cache_mode
    self.cache_db = cache_db
    self.ignore_db_cache = ignore_db_cache
    self.aspect_grouping = aspect_grouping
    self.cache_dir = os.path.join(root_path, 'cache')
    self.add_image_as_a_box = add_image_as_a_box
    if not os.path.exists(self.cache_dir):
        makedirsExist(self.cache_dir)
    self.tokenizer = tokenizer if tokenizer is not None \
        else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

    if self.zip_mode:
        self.zipreader = ZipReader()

    self.coco = COCO(self.ann_file)
    self.coco_inst = COCO(self.ann_file_inst)
    self.ids = list(sorted(self.coco.imgs.keys()))
    # filter images without detection annotations
    self.ids = [
        img_id for img_id in self.ids
        if len(self.coco_inst.getAnnIds(imgIds=img_id, iscrowd=None)) > 0
    ]
    self.json_category_id_to_contiguous_id = {
        v: i + 1 for i, v in enumerate(self.coco_inst.getCatIds())
    }
    self.contiguous_category_id_to_json_id = {
        v: k for k, v in self.json_category_id_to_contiguous_id.items()
    }
    self.id_to_img_map = {k: v for k, v in enumerate(self.ids)}
    if self.aspect_grouping:
        assert False, "not support aspect grouping currently!"
        # self.group_ids = self.group_aspect(self.database)

    print('mask_raw_pixels: ', self.mask_raw_pixels)
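# The two dicts built above implement the standard COCO id compression: raw
# category ids are sparse (1..90 with gaps), so they are remapped to
# contiguous labels starting at 1. A self-contained toy illustration:
sparse_cat_ids = [1, 2, 3, 4, 6, 8, 13, 90]   # gaps, as in real COCO
json_to_contiguous = {v: i + 1 for i, v in enumerate(sparse_cat_ids)}
contiguous_to_json = {v: k for k, v in json_to_contiguous.items()}
assert json_to_contiguous[90] == 8            # 8th category overall
assert contiguous_to_json[8] == 90            # and back again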