def __init__(self, config):

        super(ResNetVLBERTForPretrainingEncDecGenerate, self).__init__(config)

        self.image_feature_extractor = FastRCNN(
            config,
            average_pool=True,
            final_dim=config.NETWORK.IMAGE_FINAL_DIM,
            enable_cnn_reg_loss=False)
        self.object_linguistic_embeddings = nn.Embedding(
            1, config.NETWORK.VLBERT.hidden_size)
        if config.NETWORK.IMAGE_FEAT_PRECOMPUTED:
            self.object_mask_visual_embedding = nn.Embedding(1, 2048)
        if config.NETWORK.WITH_MVRC_LOSS:
            self.object_mask_word_embedding = nn.Embedding(
                1, config.NETWORK.VLBERT.hidden_size)
        self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN
        self.tokenizer = BertTokenizer.from_pretrained(
            config.NETWORK.BERT_MODEL_NAME)
        language_pretrained_model_path = None
        if config.NETWORK.BERT_PRETRAINED != '':
            language_pretrained_model_path = '{}-{:04d}.model'.format(
                config.NETWORK.BERT_PRETRAINED,
                config.NETWORK.BERT_PRETRAINED_EPOCH)
        elif os.path.isdir(config.NETWORK.BERT_MODEL_NAME):
            weight_path = os.path.join(config.NETWORK.BERT_MODEL_NAME,
                                       BERT_WEIGHTS_NAME)
            if os.path.isfile(weight_path):
                language_pretrained_model_path = weight_path

        if language_pretrained_model_path is None:
            print(
                "Warning: no pretrained language model found, training from scratch!!!"
            )

        self.vlbert = VisualLinguisticBertEncoder(
            config.NETWORK.VLBERT,
            language_pretrained_model_path=None
            if config.NETWORK.VLBERT.from_scratch else
            language_pretrained_model_path,
            with_rel_head=False,
            with_mlm_head=False,
            with_mvrc_head=False,
        )

        # FM addition: add decoder
        self.decoder = VisualLinguisticBertForPretrainingDecoder(
            config.NETWORK.VLBERT,
            language_pretrained_model_path=None
            if config.NETWORK.VLBERT.from_scratch else
            language_pretrained_model_path,
            with_rel_head=config.NETWORK.WITH_REL_LOSS,
            with_mlm_head=config.NETWORK.WITH_MLM_LOSS,
            with_mvrc_head=config.NETWORK.WITH_MVRC_LOSS,
        )

        # init weights
        self.init_weight()

        self.fix_params()
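This checkpoint-resolution pattern recurs in nearly every constructor in this section. A minimal standalone sketch of the logic, assuming BERT_WEIGHTS_NAME is the usual 'pytorch_model.bin':

import os

BERT_WEIGHTS_NAME = 'pytorch_model.bin'  # assumed weights file name

def resolve_language_pretrained_path(bert_pretrained, bert_pretrained_epoch,
                                     bert_model_name):
    # Prefer an explicit epoch-stamped checkpoint; otherwise fall back to the
    # weights file inside a local model directory; return None if nothing is found.
    if bert_pretrained != '':
        return '{}-{:04d}.model'.format(bert_pretrained, bert_pretrained_epoch)
    if os.path.isdir(bert_model_name):
        weight_path = os.path.join(bert_model_name, BERT_WEIGHTS_NAME)
        if os.path.isfile(weight_path):
            return weight_path
    return None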
Example #2
    def __init__(self, config):

        super(ResNetVLBERT, self).__init__(config)

        self.image_feature_extractor = FastRCNN(
            config,
            average_pool=True,
            final_dim=config.NETWORK.IMAGE_FINAL_DIM,
            enable_cnn_reg_loss=False)
        self.object_linguistic_embeddings = nn.Embedding(
            1, config.NETWORK.VLBERT.hidden_size)
        self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN
        self.tokenizer = BertTokenizer.from_pretrained(
            config.NETWORK.BERT_MODEL_NAME)

        language_pretrained_model_path = None
        if config.NETWORK.BERT_PRETRAINED != '':
            language_pretrained_model_path = '{}-{:04d}.model'.format(
                config.NETWORK.BERT_PRETRAINED,
                config.NETWORK.BERT_PRETRAINED_EPOCH)
        elif os.path.isdir(config.NETWORK.BERT_MODEL_NAME):
            weight_path = os.path.join(config.NETWORK.BERT_MODEL_NAME,
                                       BERT_WEIGHTS_NAME)
            if os.path.isfile(weight_path):
                language_pretrained_model_path = weight_path
        self.language_pretrained_model_path = language_pretrained_model_path
        if language_pretrained_model_path is None:
            print(
                "Warning: no pretrained language model found, training from scratch!!!"
            )

        self.vlbert = VisualLinguisticBert(
            config.NETWORK.VLBERT,
            language_pretrained_model_path=language_pretrained_model_path)

        transform = VisualLinguisticBertMVRCHeadTransform(
            config.NETWORK.VLBERT)
        # Earlier variants used a plain linear projection and smaller OIM
        # tables (sizes tried during development: 331, 1000, 35, 100, 12003).
        self.OIM_loss = OIM_Module(12003, 768)
        self.linear = nn.Sequential(
            nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
            nn.Linear(config.NETWORK.VLBERT.hidden_size, 768))

        linear = nn.Linear(config.NETWORK.VLBERT.hidden_size, 1)
        self.final_mlp = nn.Sequential(
            transform,
            nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
            linear)

        # init weights
        self.init_weight()

        self.fix_params()
Example #3
    def __init__(self, config):

        super(ResNetVLBERTForPretrainingMultitaskNoVision,
              self).__init__(config)

        # Constructs/initialises model elements
        self.image_feature_extractor = FastRCNN(
            config,
            average_pool=True,
            final_dim=config.NETWORK.IMAGE_FINAL_DIM,
            enable_cnn_reg_loss=False)
        self.object_linguistic_embeddings = nn.Embedding(
            1, config.NETWORK.VLBERT.hidden_size)
        if config.NETWORK.IMAGE_FEAT_PRECOMPUTED or (
                not config.NETWORK.MASK_RAW_PIXELS):
            self.object_mask_visual_embedding = nn.Embedding(1, 2048)
        if config.NETWORK.WITH_MVRC_LOSS:
            self.object_mask_word_embedding = nn.Embedding(
                1, config.NETWORK.VLBERT.hidden_size)
        self.aux_text_visual_embedding = nn.Embedding(
            1, config.NETWORK.VLBERT.hidden_size)
        self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN
        self.tokenizer = BertTokenizer.from_pretrained(
            config.NETWORK.BERT_MODEL_NAME)

        # A pretrained model can be specified explicitly; otherwise the
        # downloaded pretrained model specified in the .yaml file is used
        language_pretrained_model_path = None
        if config.NETWORK.BERT_PRETRAINED != '':
            # FM edit: use the pretrained model path directly instead of
            # appending an epoch suffix to it
            language_pretrained_model_path = config.NETWORK.BERT_PRETRAINED
        elif os.path.isdir(config.NETWORK.BERT_MODEL_NAME):
            weight_path = os.path.join(config.NETWORK.BERT_MODEL_NAME,
                                       BERT_WEIGHTS_NAME)
            if os.path.isfile(weight_path):
                language_pretrained_model_path = weight_path

        if language_pretrained_model_path is None:
            print(
                "Warning: no pretrained language model found, training from scratch!!!"
            )

        self.vlbert = VisualLinguisticBertForPretraining(
            config.NETWORK.VLBERT,
            language_pretrained_model_path=None
            if config.NETWORK.VLBERT.from_scratch else
            language_pretrained_model_path,
            with_rel_head=config.NETWORK.WITH_REL_LOSS,
            with_mlm_head=config.NETWORK.WITH_MLM_LOSS,
            with_mvrc_head=config.NETWORK.WITH_MVRC_LOSS,
            with_MLT_head=config.NETWORK.WITH_MLT_LOSS)

        # init weights
        self.init_weight()

        self.fix_params()
Example #4
    def __init__(self, image_set, root_path, data_path, boxes='gt', proposal_source='official',
                 transform=None, test_mode=False,
                 zip_mode=False, cache_mode=False, cache_db=False, ignore_db_cache=True,
                 tokenizer=None, pretrained_model_name=None,
                 add_image_as_a_box=False, mask_size=(14, 14),
                 aspect_grouping=False, **kwargs):
        """
        VREP Dataset

        :param image_set: image folder name
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to dataset
        :param boxes: boxes to use, 'gt' or 'proposal'
        :param transform: transform
        :param test_mode: test mode means no labels are available
        :param zip_mode: read images and metadata from a zip archive
        :param cache_mode: cache the whole dataset to RAM first, then __getitem__ reads from RAM
        :param ignore_db_cache: ignore the previously cached database and reload it from the annotation file
        :param tokenizer: defaults to BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add the whole image as a box
        :param mask_size: size of the instance mask of each object
        :param aspect_grouping: whether to group images by aspect ratio
        :param kwargs:
        """
        super(VRep, self).__init__()

        assert not cache_mode, 'cache mode is currently not supported!'
        self.data_json = 'obj_det_res.json'  # alternative: 'image_seg_test.json'
        self.ref_json = 'ref_annotations.json'
        self.boxes = boxes
        self.refer = Refer()
        self.test_mode = test_mode
        self.data_path = data_path
        self.root_path = root_path
        self.transform = transform
        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        self.cache_dir = os.path.join(root_path, 'cache')
        self.add_image_as_a_box = add_image_as_a_box
        self.mask_size = mask_size
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

        if zip_mode:
            self.zipreader = ZipReader()

        self.database = self.load_annotations()
        if self.aspect_grouping:
            self.group_ids = self.group_aspect(self.database)
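The tokenizer fallback above ('bert-base-uncased' unless pretrained_model_name is given) is shared by most dataset loaders in this section. A short usage sketch with pytorch_pretrained_bert:

from pytorch_pretrained_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', cache_dir='./cache')
tokens = tokenizer.tokenize('a man riding a horse')   # WordPiece tokens
token_ids = tokenizer.convert_tokens_to_ids(tokens)   # vocabulary indices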
Example #5
    def __init__(self, ann_file, image_set, root_path, data_path, transform=None, task='Q2A', test_mode=False,
                 zip_mode=False, cache_mode=False, cache_db=False, ignore_db_cache=True,
                 basic_tokenizer=None, tokenizer=None, pretrained_model_name=None,
                 only_use_relevant_dets=False, add_image_as_a_box=False, mask_size=(14, 14),
                 aspect_grouping=False, basic_align=False, qa2r_noq=False, qa2r_aug=False,
                 seq_len=64,
                 **kwargs):
        """
        Twitter Dataset (loader adapted from Visual Commonsense Reasoning)

        :param ann_file: annotation jsonl file
        :param image_set: image folder name, e.g., 'vcr1images'
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to vcr dataset
        :param transform: transform
        :param task: 'Q2A' means question to answer, 'QA2R' means question and answer to rationale,
                     'Q2AR' means question to answer and rationale
        :param test_mode: test mode means no labels are available
        :param zip_mode: read images and metadata from a zip archive
        :param cache_mode: cache the whole dataset to RAM first, then __getitem__ reads from RAM
        :param ignore_db_cache: ignore the previously cached database and reload it from the annotation file
        :param tokenizer: defaults to BertTokenizer from pytorch_pretrained_bert
        :param only_use_relevant_dets: filter out detections not used in the query and response
        :param add_image_as_a_box: add the whole image as a box
        :param mask_size: size of the instance mask of each object
        :param aspect_grouping: whether to group images by aspect ratio
        :param basic_align: align to tokens retokenized by basic_tokenizer
        :param qa2r_noq: in QA->R, the query contains only the correct answer, without the question
        :param qa2r_aug: in QA->R, whether to augment the choices with ones that pair a wrong answer with the query
        :param kwargs:
        """
        super(TwitterDataset, self).__init__()
        self.cache_dir = os.path.join(root_path, 'cache')
        assert not cache_mode, 'cache mode is currently not supported!'
        
        self.data_path = data_path
        self.test_mode = test_mode
        self.ann_file = os.path.join(data_path, ann_file)
        self.image_set = image_set
        self.transform = transform
        self.cache_mode = cache_mode
        self.basic_tokenizer = basic_tokenizer if basic_tokenizer is not None \
            else BasicTokenizer(do_lower_case=True)
        if tokenizer is None:
            if pretrained_model_name is None:
                pretrained_model_name = 'bert-base-uncased'
            if 'roberta' in pretrained_model_name:
                tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name, cache_dir=self.cache_dir)
            else:
                tokenizer = BertTokenizer.from_pretrained(pretrained_model_name, cache_dir=self.cache_dir)
        self.tokenizer = tokenizer
        self.database = self.load_annotations(self.ann_file)
Example #6
    def __init__(self, config):

        super(ResNetVLBERT, self).__init__(config)
        self.config = config
        self.pre_resnet = resnet152()
        self.pre_resnet.load_state_dict(torch.load('/home/data/datasets/resnet152-b121ed2d.pth'))
        print('Loaded pretrained ResNet-152 weights for RpBERT')
        self.object_visual_embeddings = nn.Linear(2048, config.NETWORK.VLBERT.hidden_size)
        self.object_linguistic_embeddings = nn.Embedding(1, config.NETWORK.VLBERT.hidden_size)
        self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN
        self.tokenizer = BertTokenizer.from_pretrained(config.NETWORK.BERT_MODEL_NAME)
        self.vlbert = VisualLinguisticBert(config.NETWORK.VLBERT)

        # init weights
        self.init_weight()
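The hard-coded checkpoint path above is fragile. A sketch of the same ResNet-152 initialisation with a configurable path ('resnet152-b121ed2d.pth' is the standard torchvision checkpoint file):

import torch
from torchvision.models import resnet152

def load_pretrained_resnet152(weights_path):
    # Build the backbone and load torchvision-format weights from disk.
    model = resnet152()
    state_dict = torch.load(weights_path, map_location='cpu')
    model.load_state_dict(state_dict)
    return model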
Example #7
    def __init__(self, ann_file, pretrained_model_name, tokenizer=None, seq_len=64,
                 encoding="utf-8", on_memory=True,
                 **kwargs):
        assert on_memory, "only on_memory mode is supported!"

        self.tokenizer = tokenizer if tokenizer is not None else BertTokenizer.from_pretrained(pretrained_model_name)
        self.vocab = self.tokenizer.vocab
        self.seq_len = seq_len
        self.on_memory = on_memory
        self.ann_file = ann_file
        self.encoding = encoding
        self.test_mode = False

        # load samples into memory
        if on_memory:
            self.corpus = self.load_corpus()
Example #8
    def __init__(self,
                 flickr_root,
                 snlive_root,
                 annotations_file,
                 image_set,
                 roi_set,
                 transform=None,
                 test_mode=False,
                 basic_tokenizer=None,
                 tokenizer=None,
                 pretrained_model_name=None,
                 add_image_as_a_box=True,
                 **kwargs):
        """
        SNLI-VE (Visual Entailment) Dataset

        :param image_set: image folder name, e.g., 'vcr1images'
        :param transform: transform
        :param test_mode: test mode means no labels are available
        :param tokenizer: defaults to BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add the whole image as a box
        :param kwargs:
        """
        super(SnliVEDataset, self).__init__()

        self.annotations_file = os.path.join(snlive_root, annotations_file)
        self.image_set = os.path.join(flickr_root, image_set)
        self.roi_set = os.path.join(flickr_root, roi_set)
        self.transform = transform
        self.test_mode = test_mode
        self.add_image_as_a_box = add_image_as_a_box
        self.basic_tokenizer = basic_tokenizer if basic_tokenizer is not None \
            else BasicTokenizer(do_lower_case=True)
        if tokenizer is None:
            if pretrained_model_name is None:
                pretrained_model_name = 'bert-base-uncased'
            if 'roberta' in pretrained_model_name:
                tokenizer = RobertaTokenizer.from_pretrained(
                    pretrained_model_name)
            else:
                tokenizer = BertTokenizer.from_pretrained(
                    pretrained_model_name)
        self.tokenizer = tokenizer

        self.database = self.load_captions()
Example #9
    def __init__(self, config):

        super(ResNetVLBERT, self).__init__(config)

        self.image_feature_extractor = FastRCNN(
            config,
            average_pool=True,
            final_dim=config.NETWORK.IMAGE_FINAL_DIM,
            enable_cnn_reg_loss=False)
        self.object_linguistic_embeddings = nn.Embedding(
            1, config.NETWORK.VLBERT.hidden_size)
        self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN
        self.tokenizer = BertTokenizer.from_pretrained(
            config.NETWORK.BERT_MODEL_NAME)

        language_pretrained_model_path = None
        if config.NETWORK.BERT_PRETRAINED != '':
            language_pretrained_model_path = '{}-{:04d}.model'.format(
                config.NETWORK.BERT_PRETRAINED,
                config.NETWORK.BERT_PRETRAINED_EPOCH)
        elif os.path.isdir(config.NETWORK.BERT_MODEL_NAME):
            weight_path = os.path.join(config.NETWORK.BERT_MODEL_NAME,
                                       BERT_WEIGHTS_NAME)
            if os.path.isfile(weight_path):
                language_pretrained_model_path = weight_path
        self.language_pretrained_model_path = language_pretrained_model_path
        if language_pretrained_model_path is None:
            print(
                "Warning: no pretrained language model found, training from scratch!!!"
            )

        self.vlbert = VisualLinguisticBert(
            config.NETWORK.VLBERT,
            language_pretrained_model_path=language_pretrained_model_path)

        self.task1_head = Task1Head(config.NETWORK.VLBERT)
        self.task2_head = Task2Head(config.NETWORK.VLBERT)
        self.task3_head = Task3Head(config.NETWORK.VLBERT)

        # init weights
        self.init_weight()

        self.fix_params()
Example #10
    def __init__(self, split, cfg, transform):
        super().__init__()
        self.split = split
        self.cfg = cfg
        self.transform = transform

        self.annotations = []
        n_img = 0
        split = split + 'id' if split == 'val' else split  # 'val' -> 'valid'
        for img in json.load(open(self.cfg.DATAPATH)):
            if img['split'] in split.split('_'):
                n_img += 1
                for annot in img['annotations']:
                    if cfg.TEST.EXCL_LEFT_RIGHT and (
                            annot['predicate'] == 'to the left of'
                            or annot['predicate'] == 'to the right of'):
                        continue

                    annot['url'] = img['url']
                    annot['height'] = img['height']
                    annot['width'] = img['width']
                    annot['subject']['bbox'] = self.fix_bbox(
                        annot['subject']['bbox'], img['height'], img['width'])
                    annot['object']['bbox'] = self.fix_bbox(
                        annot['object']['bbox'], img['height'], img['width'])
                    self.annotations.append(annot)

        print('%d relations in %s' % (len(self.annotations), split))
        print('%d imgs in %s' % (n_img, split))

        self.cache_dir = os.path.join(cfg.DATASET.ROOT_PATH, 'cache')
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        if cfg.NETWORK.BERT_MODEL_NAME:
            print('Initializing BERT tokenizer from',
                  cfg.NETWORK.BERT_MODEL_NAME)
        self.tokenizer = BertTokenizer.from_pretrained(
            'bert-base-uncased' if cfg.NETWORK.BERT_MODEL_NAME is None else
            cfg.NETWORK.BERT_MODEL_NAME,
            cache_dir=self.cache_dir)
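fix_bbox is called above but not shown. A hypothetical sketch of what such a helper typically does, assuming [x1, y1, x2, y2] boxes:

def fix_bbox(bbox, height, width):
    # Hypothetical helper (the real one is not shown): clamp the box corners
    # to the image bounds while keeping x1 <= x2 and y1 <= y2.
    x1, y1, x2, y2 = bbox
    x1 = max(0, min(x1, width - 1))
    y1 = max(0, min(y1, height - 1))
    x2 = max(x1, min(x2, width - 1))
    y2 = max(y1, min(y2, height - 1))
    return [x1, y1, x2, y2]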
Example #11
    def __init__(self, config):

        super(ResNetVLBERT, self).__init__(config)

        self.enable_cnn_reg_loss = config.NETWORK.ENABLE_CNN_REG_LOSS
        self.cnn_loss_top = config.NETWORK.CNN_LOSS_TOP
        if not config.NETWORK.BLIND:
            self.image_feature_extractor = FastRCNN(
                config,
                average_pool=True,
                final_dim=config.NETWORK.IMAGE_FINAL_DIM,
                enable_cnn_reg_loss=(self.enable_cnn_reg_loss
                                     and not self.cnn_loss_top))
            if config.NETWORK.VLBERT.object_word_embed_mode == 1:
                self.object_linguistic_embeddings = nn.Embedding(
                    81, config.NETWORK.VLBERT.hidden_size)
            elif config.NETWORK.VLBERT.object_word_embed_mode == 2:
                self.object_linguistic_embeddings = nn.Embedding(
                    1, config.NETWORK.VLBERT.hidden_size)
            elif config.NETWORK.VLBERT.object_word_embed_mode == 3:
                self.object_linguistic_embeddings = None
            else:
                raise NotImplementedError
            if self.enable_cnn_reg_loss and self.cnn_loss_top:
                self.cnn_loss_reg = nn.Sequential(
                    VisualLinguisticBertMVRCHeadTransform(
                        config.NETWORK.VLBERT),
                    nn.Dropout(config.NETWORK.CNN_REG_DROPOUT, inplace=False),
                    nn.Linear(config.NETWORK.VLBERT.hidden_size, 81))
        self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN

        if 'roberta' in config.NETWORK.BERT_MODEL_NAME:
            self.tokenizer = RobertaTokenizer.from_pretrained(
                config.NETWORK.BERT_MODEL_NAME)
        else:
            self.tokenizer = BertTokenizer.from_pretrained(
                config.NETWORK.BERT_MODEL_NAME)

        language_pretrained_model_path = None
        if config.NETWORK.BERT_PRETRAINED != '':
            language_pretrained_model_path = '{}-{:04d}.model'.format(
                config.NETWORK.BERT_PRETRAINED,
                config.NETWORK.BERT_PRETRAINED_EPOCH)
        elif os.path.isdir(config.NETWORK.BERT_MODEL_NAME):
            weight_path = os.path.join(config.NETWORK.BERT_MODEL_NAME,
                                       BERT_WEIGHTS_NAME)
            if os.path.isfile(weight_path):
                language_pretrained_model_path = weight_path

        if language_pretrained_model_path is None:
            print(
                "Warning: no pretrained language model found, training from scratch!!!"
            )

        self.vlbert = TimeDistributed(
            VisualLinguisticBert(
                config.NETWORK.VLBERT,
                language_pretrained_model_path=language_pretrained_model_path))

        self.for_pretrain = config.NETWORK.FOR_MASK_VL_MODELING_PRETRAIN
        assert not self.for_pretrain, "Pretraining mode is not implemented yet!"

        if not self.for_pretrain:
            dim = config.NETWORK.VLBERT.hidden_size
            if config.NETWORK.CLASSIFIER_TYPE == "2fc":
                self.final_mlp = torch.nn.Sequential(
                    torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT,
                                     inplace=False),
                    torch.nn.Linear(dim,
                                    config.NETWORK.CLASSIFIER_HIDDEN_SIZE),
                    torch.nn.ReLU(inplace=True),
                    torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT,
                                     inplace=False),
                    torch.nn.Linear(config.NETWORK.CLASSIFIER_HIDDEN_SIZE, 1),
                )
            elif config.NETWORK.CLASSIFIER_TYPE == "1fc":
                self.final_mlp = torch.nn.Sequential(
                    torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT,
                                     inplace=False), torch.nn.Linear(dim, 1))
            else:
                raise ValueError("Not support classifier type: {}!".format(
                    config.NETWORK.CLASSIFIER_TYPE))

        # init weights
        self.init_weight()

        self.fix_params()
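The "2fc"/"1fc" head construction above is repeated almost verbatim across several models in this section. A small factory sketch capturing the pattern:

import torch

def build_classifier(classifier_type, dim, hidden_size, dropout, out_dim=1):
    # "2fc": dropout -> linear -> ReLU -> dropout -> linear; "1fc": dropout -> linear.
    if classifier_type == '2fc':
        return torch.nn.Sequential(
            torch.nn.Dropout(dropout, inplace=False),
            torch.nn.Linear(dim, hidden_size),
            torch.nn.ReLU(inplace=True),
            torch.nn.Dropout(dropout, inplace=False),
            torch.nn.Linear(hidden_size, out_dim))
    if classifier_type == '1fc':
        return torch.nn.Sequential(
            torch.nn.Dropout(dropout, inplace=False),
            torch.nn.Linear(dim, out_dim))
    raise ValueError("Unsupported classifier type: {}!".format(classifier_type))

Here self.final_mlp would correspond to build_classifier(config.NETWORK.CLASSIFIER_TYPE, dim, config.NETWORK.CLASSIFIER_HIDDEN_SIZE, config.NETWORK.CLASSIFIER_DROPOUT).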
Example #12
    def __init__(self, ann_file, image_set, root_path, data_path, seq_len=64,
                 with_precomputed_visual_feat=False, mask_raw_pixels=True,
                 with_rel_task=True, with_mlm_task=False, with_mvrc_task=False,
                 transform=None, test_mode=False,
                 zip_mode=False, cache_mode=False, cache_db=False, ignore_db_cache=True,
                 tokenizer=None, pretrained_model_name=None,
                 add_image_as_a_box=False,
                 aspect_grouping=False, languages_used='first', MLT_vocab='bert-base-german-cased-vocab.txt', **kwargs):
        """
        Multi30k Dataset (2018 MLT task; loader adapted from Conceptual Captions)

        :param ann_file: annotation jsonl file
        :param image_set: image folder name, e.g., 'vcr1images'
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to the dataset
        :param transform: transform
        :param test_mode: test mode means no labels are available
        :param zip_mode: read images and metadata from a zip archive
        :param cache_mode: cache the whole dataset to RAM first, then __getitem__ reads from RAM
        :param ignore_db_cache: ignore the previously cached database and reload it from the annotation file
        :param tokenizer: defaults to BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add the whole image as a box
        :param aspect_grouping: whether to group images by aspect ratio
        :param kwargs:
        """
        super(Multi30kDataset2018, self).__init__()

        assert not cache_mode, 'cache mode is currently not supported!'
        # TODO: need to remove this to allows testing
        # assert not test_mode

        annot = {'train': 'train_MLT_frcnn.json',
                 'val': 'val_MLT_frcnn.json',
                 'test2015': 'test_MLT_2018_renamed_frcnn.json'}

        self.seq_len = seq_len
        self.with_rel_task = with_rel_task
        self.with_mlm_task = with_mlm_task
        self.with_mvrc_task = with_mvrc_task
        self.data_path = data_path
        self.root_path = root_path
        self.ann_file = os.path.join(data_path, annot[image_set])
        self.with_precomputed_visual_feat = with_precomputed_visual_feat
        self.mask_raw_pixels = mask_raw_pixels
        self.image_set = image_set
        self.transform = transform
        self.test_mode = test_mode
        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        # FM edit: added option for how many captions/languages are used
        self.languages_used = languages_used
        self.cache_dir = os.path.join(root_path, 'cache')
        self.add_image_as_a_box = add_image_as_a_box
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

        self.zipreader = ZipReader()

        # FM: customise for the multi30k dataset
        self.database = list(jsonlines.open(self.ann_file))
        if not self.zip_mode:
            # strip zip-archive markers (and numeric shard suffixes) from the stored paths
            for i, idb in enumerate(self.database):
                self.database[i]['frcnn'] = idb['frcnn'].replace('.zip@', '')\
                    .replace('.0', '').replace('.1', '').replace('.2', '').replace('.3', '')
                self.database[i]['image'] = idb['image'].replace('.zip@', '')


        if self.aspect_grouping:
            raise NotImplementedError('aspect grouping is currently not supported!')

        print('mask_raw_pixels: ', self.mask_raw_pixels)

        #FM: initialise vocabulary for output
        self.MLT_vocab_path = os.path.join(root_path, 'model/pretrained_model', MLT_vocab)
        self.MLT_vocab = []
        with open(self.MLT_vocab_path) as fp:
            for cnt, line in enumerate(fp):
                self.MLT_vocab.append(line.strip())
Example #13
    def __init__(self, image_set, root_path, data_path, answer_vocab_file, use_imdb=True,
                 with_precomputed_visual_feat=False, boxes="36",
                 transform=None, test_mode=False,
                 zip_mode=False, cache_mode=False, cache_db=True, ignore_db_cache=True,
                 tokenizer=None, pretrained_model_name=None,
                 add_image_as_a_box=False, mask_size=(14, 14),
                 aspect_grouping=False, toy_dataset=False, toy_samples=128, **kwargs):
        """
        Visual Question Answering Dataset (VQA-CP v2)

        :param image_set: image folder name
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to the VQA dataset
        :param transform: transform
        :param test_mode: test mode means no labels are available
        :param zip_mode: read images and metadata from a zip archive
        :param cache_mode: cache the whole dataset to RAM first, then __getitem__ reads from RAM
        :param ignore_db_cache: ignore the previously cached database and reload it from the annotation file
        :param tokenizer: defaults to BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add the whole image as a box
        :param mask_size: size of the instance mask of each object
        :param aspect_grouping: whether to group images by aspect ratio
        :param kwargs:
        """
        super(VQA_CP, self).__init__()

        assert not cache_mode, 'cache mode is currently not supported!'

        categories = ['__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
                      'boat',
                      'trafficlight', 'firehydrant', 'stopsign', 'parkingmeter', 'bench', 'bird', 'cat', 'dog', 'horse',
                      'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
                      'suitcase', 'frisbee', 'skis', 'snowboard', 'sportsball', 'kite', 'baseballbat', 'baseballglove',
                      'skateboard', 'surfboard', 'tennisracket', 'bottle', 'wineglass', 'cup', 'fork', 'knife', 'spoon',
                      'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hotdog', 'pizza', 'donut',
                      'cake', 'chair', 'couch', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tv', 'laptop', 'mouse',
                      'remote', 'keyboard', 'cellphone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book',
                      'clock', 'vase', 'scissors', 'teddybear', 'hairdrier', 'toothbrush']
        vqa_question = {
            "train": "vqa/vqacp_v2_train_questions.json",
            "val": "vqa/vqacp_v2_test_questions.json",
        }
        vqa_annot = {
            "train": "vqa/vqacp_v2_train_annotations.json",
            "val": "vqa/vqacp_v2_test_annotations.json",
        }
        
        if boxes == "36":
            precomputed_boxes = {
                'train': ("vgbua_res101_precomputed", "{}_resnet101_faster_rcnn_genome_36"),
                'val': ("vgbua_res101_precomputed", "{}_resnet101_faster_rcnn_genome_36"),
            }
        elif boxes == "10-100ada":
            precomputed_boxes = {
                'train': ("vgbua_res101_precomputed", "{}_resnet101_faster_rcnn_genome"),
                'val': ("vgbua_res101_precomputed", "{}_resnet101_faster_rcnn_genome"),
            }
        else:
            raise ValueError("Not support boxes: {}!".format(boxes))

        self.coco_dataset = {
            "train2014": os.path.join(data_path, "annotations", "instances_train2014.json"),
            "val2014": os.path.join(data_path, "annotations", "instances_val2014.json"),
            "test-dev2015": os.path.join(data_path, "annotations", "image_info_test-dev2015.json"),
            "test2015": os.path.join(data_path, "annotations", "image_info_test2015.json"),
        }

        # strip periods that are not decimal points; detect commas inside numbers
        self.periodStrip = re.compile(r"(\.)(?!\d)")
        self.commaStrip = re.compile(r"(\d)(\,)(\d)")
        self.punct = [';', r"/", '[', ']', '"', '{', '}',
                      '(', ')', '=', '+', '\\', '_', '-',
                      '>', '<', '@', '`', ',', '?', '!']

        self.boxes = boxes
        self.test_mode = test_mode
        self.with_precomputed_visual_feat = with_precomputed_visual_feat
        self.category_to_idx = {c: i for i, c in enumerate(categories)}
        self.data_path = data_path
        self.root_path = root_path

        # load the answer vocab file: same as vqav2 dataset
        with open(answer_vocab_file, 'r', encoding='utf8') as f:
            self.answer_vocab = [w.lower().strip() for w in f.readlines()]
            self.answer_vocab = list(filter(lambda x: x != '', self.answer_vocab))
            self.answer_vocab = [self.processPunctuation(w) for w in self.answer_vocab]

        # The config.DATA.TRAIN_IMAGE_SET and config.DATA.VAL_IMAGE_SET have
        # a little different use here, it indicates the mode 'train' or 'val'
        self.image_sets = [iset.strip() for iset in image_set.split('+')]
        self.ann_files = [os.path.join(data_path, vqa_annot[iset]) for iset in self.image_sets] \
            if not self.test_mode else [None for iset in self.image_sets]
        self.q_files = [os.path.join(data_path, vqa_question[iset]) for iset in self.image_sets]

        self.precomputed_box_files = [
            os.path.join(data_path, precomputed_boxes[iset][0], precomputed_boxes[iset][1]) for iset in self.image_sets]

        self.box_bank = {}
        self.coco_datasets = [os.path.join(data_path, '{}', 'COCO_{}_{{:012d}}.jpg') for iset in self.image_sets]

        self.transform = transform
        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        self.cache_dir = os.path.join(root_path, 'cache')
        self.add_image_as_a_box = add_image_as_a_box
        self.mask_size = mask_size

        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

        if zip_mode:
            self.zipreader = ZipReader()

        self.database = self.load_annotations()
        if self.aspect_grouping:
            self.group_ids = self.group_aspect(self.database)

        # toy dataset
        if toy_dataset:
            print(f"Using the toy dataset!! Total samples = {toy_samples}")
            self.database = self.database[:toy_samples]
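processPunctuation, used above to normalise the answer vocabulary, is not shown. It presumably follows the standard VQA-evaluation normalisation, sketched here under that assumption (relying on the punct, commaStrip, and periodStrip members defined in the constructor):

import re

def processPunctuation(self, inText):
    # Drop punctuation attached to words; keep commas inside numbers and
    # periods inside decimals (assumed standard VQA-eval behaviour).
    outText = inText
    for p in self.punct:
        if (p + ' ' in inText or ' ' + p in inText) or \
                re.search(self.commaStrip, inText) is not None:
            outText = outText.replace(p, '')
        else:
            outText = outText.replace(p, ' ')
    outText = self.periodStrip.sub('', outText)
    return outText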
Example #14
    def __init__(self,
                 ann_file,
                 image_set,
                 root_path,
                 data_path,
                 seq_len=64,
                 with_precomputed_visual_feat=False,
                 mask_raw_pixels=True,
                 with_rel_task=True,
                 with_mlm_task=True,
                 with_mvrc_task=True,
                 transform=None,
                 test_mode=False,
                 zip_mode=False,
                 cache_mode=False,
                 cache_db=False,
                 ignore_db_cache=True,
                 tokenizer=None,
                 pretrained_model_name=None,
                 add_image_as_a_box=False,
                 aspect_grouping=False,
                 **kwargs):
        """
        Parallel Text Dataset (loader adapted from Conceptual Captions)

        :param ann_file: annotation jsonl file
        :param image_set: image folder name, e.g., 'vcr1images'
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to the dataset
        :param transform: transform
        :param test_mode: test mode means no labels are available
        :param zip_mode: read images and metadata from a zip archive
        :param cache_mode: cache the whole dataset to RAM first, then __getitem__ reads from RAM
        :param ignore_db_cache: ignore the previously cached database and reload it from the annotation file
        :param tokenizer: defaults to BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add the whole image as a box
        :param aspect_grouping: whether to group images by aspect ratio
        :param kwargs:
        """
        super(ParallelTextDataset, self).__init__()

        assert not cache_mode, 'cache mode is currently not supported!'
        assert not test_mode

        annot = {
            'train': 'train.json',
            'val': 'test.json',
            'test': 'test.json'
        }

        self.seq_len = seq_len
        self.with_rel_task = with_rel_task
        self.with_mlm_task = with_mlm_task
        self.with_mvrc_task = with_mvrc_task
        self.data_path = data_path
        self.root_path = root_path
        self.ann_file = os.path.join(data_path, annot[image_set])
        self.with_precomputed_visual_feat = with_precomputed_visual_feat
        self.mask_raw_pixels = mask_raw_pixels
        self.image_set = image_set
        self.transform = transform
        self.test_mode = test_mode
        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        self.cache_dir = os.path.join(root_path, 'cache')
        self.add_image_as_a_box = add_image_as_a_box
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

        self.zipreader = ZipReader()

        # FM: Customise for multi30k dataset
        self.database = list(jsonlines.open(self.ann_file))

        if self.aspect_grouping:
            raise NotImplementedError('aspect grouping is currently not supported!')

        print('mask_raw_pixels: ', self.mask_raw_pixels)
Example #15
    def __init__(self, split, cfg, transform):
        super().__init__()
        self.split = split
        self.cfg = cfg
        self.transform = transform

        self.all_proposals_test = False
        if cfg.DATASET.ALL_PROPOSALS_TEST:
            self.all_proposals_test = True

        self.annotations = []

        # Load images
        self.path = self.cfg.TEST_PATH if split == 'test' else self.cfg.TRAIN_VAL_PATH
        imgs = json.load(open(self.path))

        skipped_count = 0
        for img in imgs:
            if img['path'].endswith('.png'):
                img['path'] = '.'.join([img['path'].split('.')[0], 'jpg'])

            rels_cand = None
            if self.all_proposals_test and split != 'train':
                rels_cand = []
                nb_of_objs = len(img['objects'])
                if nb_of_objs > cfg.DATASET.MAX_NB_OF_OBJ:
                    nb_of_objs = cfg.DATASET.MAX_NB_OF_OBJ
                    skipped_count += 1
                for sub_id in range(0, nb_of_objs):
                    for obj_id in range(0, nb_of_objs):
                        if sub_id == obj_id: continue
                        rels_cand.append((sub_id, obj_id))

            annot = {
                'img_path': img['path'],
                'annot': img['relationships'],
                'objects': img['objects'],
                'rels_cand': rels_cand,
            }

            self.annotations.append(annot)

        print(
            f'number of imgs with skipped objs (skipped_count): {skipped_count}'
        )
        print('%d imgs in %s' % (len(self.annotations), split))

        # categories
        self.num_object_classes = len(self.cfg.OBJECT_CATEGORIES)
        self._object_class_to_ind = dict(
            zip(self.cfg.OBJECT_CATEGORIES, range(self.num_object_classes)))
        self.num_predicate_classes = len(self.cfg.PREDICATE_CATEGORIES)
        self._predicate_class_to_ind = dict(
            zip(self.cfg.PREDICATE_CATEGORIES,
                range(self.num_predicate_classes)))

        self.cache_dir = os.path.join(cfg.DATASET.ROOT_PATH, 'cache')
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.tokenizer = BertTokenizer.from_pretrained(
            'bert-base-uncased' if cfg.NETWORK.BERT_MODEL_NAME is None else
            cfg.NETWORK.BERT_MODEL_NAME,
            cache_dir=self.cache_dir)

        self.sample_rels = cfg.TRAIN.SAMPLE_RELS
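The nested subject/object loops above enumerate every ordered pair of distinct object indices. An equivalent formulation with itertools:

import itertools

nb_of_objs = 5  # example value
# all ordered (subject, object) pairs with sub_id != obj_id
rels_cand = list(itertools.permutations(range(nb_of_objs), 2))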
Example #16
    def __init__(self,
                 root_path,
                 data_path,
                 boxes='gt',
                 proposal_source='official',
                 transform=None,
                 test_mode=False,
                 zip_mode=False,
                 cache_mode=False,
                 cache_db=False,
                 ignore_db_cache=True,
                 tokenizer=None,
                 pretrained_model_name=None,
                 add_image_as_a_box=False,
                 mask_size=(14, 14),
                 aspect_grouping=False,
                 **kwargs):
        """
        Foil Dataset

        :param image_set: image folder name
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to dataset
        :param boxes: boxes to use, 'gt' or 'proposal'
        :param transform: transform
        :param test_mode: test mode means no labels are available
        :param zip_mode: read images and metadata from a zip archive
        :param cache_mode: cache the whole dataset to RAM first, then __getitem__ reads from RAM
        :param ignore_db_cache: ignore the previously cached database and reload it from the annotation file
        :param tokenizer: defaults to BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add the whole image as a box
        :param mask_size: size of the instance mask of each object
        :param aspect_grouping: whether to group images by aspect ratio
        :param kwargs:
        """
        super(Foil, self).__init__()

        assert not cache_mode, 'cache mode is currently not supported!'

        coco_annot_files = {
            "train2014": "annotations/instances_train2014.json",
            "val2014": "annotations/instances_val2014.json",
            "test2015": "annotations/image_info_test2015.json",
        }

        foil_annot_files = {
            "train": "foil/foilv1.0_train_2017.json",
            "test": "foil/foilv1.0_test_2017.json"
        }

        foil_vocab_file = "foil/vocab.txt"

        self.vg_proposal = ("vgbua_res101_precomputed",
                            "trainval2014_resnet101_faster_rcnn_genome")

        self.test_mode = test_mode
        self.data_path = data_path
        self.root_path = root_path
        self.transform = transform

        with open(os.path.join(data_path, foil_vocab_file), 'r') as vocab_file:
            vocab_lines = [v.strip() for v in vocab_file.readlines()]
        self.itos = vocab_lines
        self.stoi = dict(zip(self.itos, range(len(vocab_lines))))

        if self.test_mode:
            self.image_set = "val2014"
            coco_annot_file = coco_annot_files["val2014"]
        else:
            self.image_set = "train2014"
            coco_annot_file = coco_annot_files["train2014"]

        self.coco = COCO(
            annotation_file=os.path.join(data_path, coco_annot_file))
        self.foil = FOIL(data_path, 'train' if not test_mode else 'test')
        self.foil_ids = list(self.foil.Foils.keys())
        self.foils = self.foil.loadFoils(foil_ids=self.foil_ids)
        if 'proposal' in boxes:
            # NOTE: `proposal_dets` (the proposal detections file name) is not
            # defined in this snippet; it must be supplied by the caller.
            with open(os.path.join(data_path, proposal_dets), 'r') as f:
                proposal_list = json.load(f)
            self.proposals = {}
            for proposal in proposal_list:
                image_id = proposal['image_id']
                if image_id in self.proposals:
                    self.proposals[image_id].append(proposal['box'])
                else:
                    self.proposals[image_id] = [proposal['box']]
        self.boxes = boxes
        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        self.cache_dir = os.path.join(root_path, 'cache')
        self.add_image_as_a_box = add_image_as_a_box
        self.mask_size = mask_size
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

        if zip_mode:
            self.zipreader = ZipReader()

        self.database = self.load_annotations()
        if self.aspect_grouping:
            self.group_ids = self.group_aspect(self.database)
Example #17
    def __init__(self, config):

        super(ResNetVLBERT, self).__init__(config)

        self.predict_on_cls = config.NETWORK.VLBERT.predict_on_cls  # make prediction on [CLS]?

        self.enable_cnn_reg_loss = config.NETWORK.ENABLE_CNN_REG_LOSS
        if not config.NETWORK.BLIND:
            self.image_feature_extractor = FastRCNN(
                config,
                average_pool=True,
                final_dim=config.NETWORK.IMAGE_FINAL_DIM,
                enable_cnn_reg_loss=self.enable_cnn_reg_loss)
            if config.NETWORK.VLBERT.object_word_embed_mode == 1:
                self.object_linguistic_embeddings = nn.Embedding(
                    81, config.NETWORK.VLBERT.hidden_size)
            elif config.NETWORK.VLBERT.object_word_embed_mode == 2:  # default: class-agnostic
                self.object_linguistic_embeddings = nn.Embedding(
                    1, config.NETWORK.VLBERT.hidden_size)
            elif config.NETWORK.VLBERT.object_word_embed_mode == 3:
                self.object_linguistic_embeddings = None
            else:
                raise NotImplementedError
        self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN

        self.tokenizer = BertTokenizer.from_pretrained(
            config.NETWORK.BERT_MODEL_NAME)

        language_pretrained_model_path = None
        if config.NETWORK.BERT_PRETRAINED != '':
            language_pretrained_model_path = '{}-{:04d}.model'.format(
                config.NETWORK.BERT_PRETRAINED,
                config.NETWORK.BERT_PRETRAINED_EPOCH)
        elif os.path.isdir(config.NETWORK.BERT_MODEL_NAME):
            weight_path = os.path.join(config.NETWORK.BERT_MODEL_NAME,
                                       BERT_WEIGHTS_NAME)
            if os.path.isfile(weight_path):
                language_pretrained_model_path = weight_path
        self.language_pretrained_model_path = language_pretrained_model_path
        if language_pretrained_model_path is None:
            print(
                "Warning: no pretrained language model found, training from scratch!!!"
            )

        self.vlbert = VisualLinguisticBert(
            config.NETWORK.VLBERT,
            language_pretrained_model_path=language_pretrained_model_path)

        dim = config.NETWORK.VLBERT.hidden_size
        if config.NETWORK.CLASSIFIER_TYPE == "2fc":
            self.final_mlp = torch.nn.Sequential(
                torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT,
                                 inplace=False),
                torch.nn.Linear(dim, config.NETWORK.CLASSIFIER_HIDDEN_SIZE),
                torch.nn.ReLU(inplace=True),
                torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT,
                                 inplace=False),
                torch.nn.Linear(config.NETWORK.CLASSIFIER_HIDDEN_SIZE,
                                config.DATASET.ANSWER_VOCAB_SIZE),
            )
        elif config.NETWORK.CLASSIFIER_TYPE == "1fc":
            self.final_mlp = torch.nn.Sequential(
                torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT,
                                 inplace=False),
                torch.nn.Linear(dim, config.DATASET.ANSWER_VOCAB_SIZE))
        elif config.NETWORK.CLASSIFIER_TYPE == 'mlm':
            transform = BertPredictionHeadTransform(config.NETWORK.VLBERT)
            linear = nn.Linear(config.NETWORK.VLBERT.hidden_size,
                               config.DATASET.ANSWER_VOCAB_SIZE)
            self.final_mlp = nn.Sequential(
                transform,
                nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
                linear)
        else:
            raise ValueError("Not support classifier type: {}!".format(
                config.NETWORK.CLASSIFIER_TYPE))

        self.use_spatial_model = False
        if config.NETWORK.USE_SPATIAL_MODEL:
            self.use_spatial_model = True
            # self.simple_spatial_model = SimpleSpatialModel(4, config.NETWORK.VLBERT.hidden_size, 9, config)

            self.use_coord_vector = False
            if config.NETWORK.USE_COORD_VECTOR:
                self.use_coord_vector = True
                self.loc_fcs = nn.Sequential(
                    nn.Linear(2 * 5 + 9, config.NETWORK.VLBERT.hidden_size),
                    nn.ReLU(True),
                    nn.Linear(config.NETWORK.VLBERT.hidden_size,
                              config.NETWORK.VLBERT.hidden_size))
            else:
                self.simple_spatial_model = SimpleSpatialModel(
                    4, config.NETWORK.VLBERT.hidden_size, 9)

            self.spa_add = bool(config.NETWORK.SPA_ADD)
            self.spa_concat = bool(config.NETWORK.SPA_CONCAT)

            if self.spa_add:
                self.spa_feat_weight = 0.5
                if config.NETWORK.USE_SPA_WEIGHT:
                    self.spa_feat_weight = config.NETWORK.SPA_FEAT_WEIGHT
                self.spa_fusion_linear = nn.Linear(
                    config.NETWORK.VLBERT.hidden_size,
                    config.NETWORK.VLBERT.hidden_size)
            elif self.spa_concat:
                if self.use_coord_vector:
                    self.spa_fusion_linear = nn.Linear(
                        config.NETWORK.VLBERT.hidden_size +
                        config.NETWORK.VLBERT.hidden_size,
                        config.NETWORK.VLBERT.hidden_size)
                else:
                    self.spa_fusion_linear = nn.Linear(
                        config.NETWORK.VLBERT.hidden_size * 2,
                        config.NETWORK.VLBERT.hidden_size)
            self.spa_linear = nn.Linear(config.NETWORK.VLBERT.hidden_size,
                                        config.NETWORK.VLBERT.hidden_size)
            self.dropout = nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT)

            self.spa_one_more_layer = config.NETWORK.SPA_ONE_MORE_LAYER
            if self.spa_one_more_layer:
                self.spa_linear_hidden = nn.Linear(
                    config.NETWORK.VLBERT.hidden_size,
                    config.NETWORK.VLBERT.hidden_size)

        self.enhanced_img_feature = False
        if config.NETWORK.VLBERT.ENHANCED_IMG_FEATURE:
            self.enhanced_img_feature = True
            self.mask_weight = config.NETWORK.VLBERT.mask_weight
            self.mask_loss_sum = config.NETWORK.VLBERT.mask_loss_sum
            self.mask_loss_mse = config.NETWORK.VLBERT.mask_loss_mse
            self.no_predicate = config.NETWORK.VLBERT.NO_PREDICATE

        self.all_proposals_test = False
        if config.DATASET.ALL_PROPOSALS_TEST:
            self.all_proposals_test = True

        self.use_uvtranse = False
        if config.NETWORK.USE_UVTRANSE:
            self.use_uvtranse = True
            self.union_vec_fc = nn.Linear(config.NETWORK.VLBERT.hidden_size,
                                          config.NETWORK.VLBERT.hidden_size)
            self.uvt_add = bool(config.NETWORK.UVT_ADD)
            self.uvt_concat = bool(config.NETWORK.UVT_CONCAT)
            assert self.uvt_add ^ self.uvt_concat, \
                "exactly one of UVT_ADD and UVT_CONCAT must be set"
            if self.uvt_add:
                self.uvt_feat_weight = config.NETWORK.UVT_FEAT_WEIGHT
                self.uvt_fusion_linear = nn.Linear(
                    config.NETWORK.VLBERT.hidden_size,
                    config.NETWORK.VLBERT.hidden_size)
            elif self.uvt_concat:
                self.uvt_fusion_linear = nn.Linear(
                    config.NETWORK.VLBERT.hidden_size * 2,
                    config.NETWORK.VLBERT.hidden_size)
            self.uvt_linear = nn.Linear(config.NETWORK.VLBERT.hidden_size,
                                        config.NETWORK.VLBERT.hidden_size)
            self.dropout_uvt = nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT)

        # init weights
        self.init_weight()
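SPA_ADD and SPA_CONCAT above select how the spatial-model features are fused with the VL-BERT representation; the forward pass is not shown, so this is only a hedged sketch of the two modes, assuming feature tensors of shape [batch, hidden_size]:

import torch

def fuse_spatial(hidden, spa_feat, fusion_linear, mode='add', spa_feat_weight=0.5):
    # Assumed semantics: 'add' mixes the two features with a scalar weight,
    # 'concat' stacks them; both are then projected by the fusion linear layer.
    if mode == 'add':
        fused = (1.0 - spa_feat_weight) * hidden + spa_feat_weight * spa_feat
    elif mode == 'concat':
        fused = torch.cat([hidden, spa_feat], dim=-1)
    else:
        raise ValueError(mode)
    return fusion_linear(fused)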
Example #18
    def __init__(self, config):

        super(ResNetVLBERT, self).__init__(config)
        self.enable_cnn_reg_loss = config.NETWORK.ENABLE_CNN_REG_LOSS
        self.cnn_loss_top = config.NETWORK.CNN_LOSS_TOP
        self.align_caption_img = config.DATASET.ALIGN_CAPTION_IMG
        self.use_phrasal_paraphrases = config.DATASET.PHRASE_CLS
        self.supervise_attention = config.NETWORK.SUPERVISE_ATTENTION
        self.normalization = config.NETWORK.ATTENTION_NORM_METHOD
        self.ewc_reg = config.NETWORK.EWC_REG
        self.importance_hparam = 0.
        if config.NETWORK.EWC_REG:
            self.fisher = pickle.load(open(config.NETWORK.FISHER_PATH, "rb"))
            self.pretrain_param = torch.load(config.NETWORK.PARAM_PRETRAIN)
            self.importance_hparam = config.NETWORK.EWC_IMPORTANCE
        if not config.NETWORK.BLIND:
            self.image_feature_extractor = FastRCNN(
                config,
                average_pool=True,
                final_dim=config.NETWORK.IMAGE_FINAL_DIM,
                enable_cnn_reg_loss=(self.enable_cnn_reg_loss
                                     and not self.cnn_loss_top))
            if config.NETWORK.VLBERT.object_word_embed_mode == 1:
                self.object_linguistic_embeddings = nn.Embedding(
                    81, config.NETWORK.VLBERT.hidden_size)
            elif config.NETWORK.VLBERT.object_word_embed_mode == 2:
                self.object_linguistic_embeddings = nn.Embedding(
                    1, config.NETWORK.VLBERT.hidden_size)
            elif config.NETWORK.VLBERT.object_word_embed_mode == 3:
                self.object_linguistic_embeddings = None
            else:
                raise NotImplementedError

        self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN

        if 'roberta' in config.NETWORK.BERT_MODEL_NAME:
            self.tokenizer = RobertaTokenizer.from_pretrained(
                config.NETWORK.BERT_MODEL_NAME)
        else:
            self.tokenizer = BertTokenizer.from_pretrained(
                config.NETWORK.BERT_MODEL_NAME)

        language_pretrained_model_path = None
        if config.NETWORK.BERT_PRETRAINED != '':
            language_pretrained_model_path = '{}-{:04d}.model'.format(
                config.NETWORK.BERT_PRETRAINED,
                config.NETWORK.BERT_PRETRAINED_EPOCH)
        elif os.path.isdir(config.NETWORK.BERT_MODEL_NAME):
            weight_path = os.path.join(config.NETWORK.BERT_MODEL_NAME,
                                       BERT_WEIGHTS_NAME)
            if os.path.isfile(weight_path):
                language_pretrained_model_path = weight_path

        if language_pretrained_model_path is None:
            print(
                "Warning: no pretrained language model found, training from scratch!!!"
            )

        self.vlbert = VisualLinguisticBert(
            config.NETWORK.VLBERT,
            language_pretrained_model_path=language_pretrained_model_path)

        self.for_pretrain = False
        dim = config.NETWORK.VLBERT.hidden_size
        if self.align_caption_img:
            sentence_logits_shape = 3
        else:
            sentence_logits_shape = 1
        if config.NETWORK.SENTENCE.CLASSIFIER_TYPE == "2fc":
            self.sentence_cls = torch.nn.Sequential(
                torch.nn.Dropout(config.NETWORK.SENTENCE.CLASSIFIER_DROPOUT,
                                 inplace=False),
                torch.nn.Linear(
                    dim, config.NETWORK.SENTENCE.CLASSIFIER_HIDDEN_SIZE),
                torch.nn.ReLU(inplace=True),
                torch.nn.Dropout(config.NETWORK.SENTENCE.CLASSIFIER_DROPOUT,
                                 inplace=False),
                torch.nn.Linear(config.NETWORK.SENTENCE.CLASSIFIER_HIDDEN_SIZE,
                                sentence_logits_shape),
            )
        elif config.NETWORK.SENTENCE.CLASSIFIER_TYPE == "1fc":
            self.sentence_cls = torch.nn.Sequential(
                torch.nn.Dropout(config.NETWORK.SENTENCE.CLASSIFIER_DROPOUT,
                                 inplace=False),
                torch.nn.Linear(dim, sentence_logits_shape))
        else:
            raise ValueError("Classifier type: {} not supported!".format(
                config.NETWORK.SENTENCE.CLASSIFIER_TYPE))

        if self.use_phrasal_paraphrases:
            if config.NETWORK.PHRASE.CLASSIFIER_TYPE == "2fc":
                self.phrasal_cls = torch.nn.Sequential(
                    torch.nn.Dropout(config.NETWORK.PHRASE.CLASSIFIER_DROPOUT,
                                     inplace=False),
                    torch.nn.Linear(
                        4 * dim, config.NETWORK.PHRASE.CLASSIFIER_HIDDEN_SIZE),
                    torch.nn.ReLU(inplace=True),
                    torch.nn.Dropout(config.NETWORK.PHRASE.CLASSIFIER_DROPOUT,
                                     inplace=False),
                    torch.nn.Linear(
                        config.NETWORK.PHRASE.CLASSIFIER_HIDDEN_SIZE, 5),
                )
            elif config.NETWORK.PHRASE.CLASSIFIER_TYPE == "1fc":
                self.phrasal_cls = torch.nn.Sequential(
                    torch.nn.Dropout(config.NETWORK.PHRASE.CLASSIFIER_DROPOUT,
                                     inplace=False),
                    torch.nn.Linear(4 * dim, 5))
            else:
                raise ValueError("Classifier type: {} not supported!".format(
                    config.NETWORK.PHRASE.CLASSIFIER_TYPE))

        if self.supervise_attention == "indirect":
            if config.NETWORK.VG.CLASSIFIER_TYPE == "2fc":
                self.vg_cls = torch.nn.Sequential(
                    torch.nn.Dropout(config.NETWORK.VG.CLASSIFIER_DROPOUT,
                                     inplace=False),
                    torch.nn.Linear(2 * dim,
                                    config.NETWORK.VG.CLASSIFIER_HIDDEN_SIZE),
                    torch.nn.ReLU(inplace=True),
                    torch.nn.Dropout(config.NETWORK.VG.CLASSIFIER_DROPOUT,
                                     inplace=False),
                    torch.nn.Linear(config.NETWORK.VG.CLASSIFIER_HIDDEN_SIZE,
                                    1),
                )
            elif config.NETWORK.VG.CLASSIFIER_TYPE == "1fc":
                self.vg_cls = torch.nn.Sequential(
                    torch.nn.Dropout(config.NETWORK.VG.CLASSIFIER_DROPOUT,
                                     inplace=False),
                    torch.nn.Linear(2 * dim, 1))
            else:
                raise ValueError("Classifier type: {} not supported!".format(
                    config.NETWORK.VG.CLASSIFIER_TYPE))

        # init weights
        self.init_weight()

        self.fix_params()
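
# Hedged sketch of how the Fisher information and pretrained parameters loaded
# above are typically combined into an EWC penalty (the actual loss wiring
# lives in the training code, which is not shown here):
def ewc_penalty(model, fisher, pretrain_param, importance):
    """Quadratic penalty pulling each parameter toward its pretrained value,
    weighted by the (diagonal) Fisher information for that parameter."""
    penalty = 0.
    for name, param in model.named_parameters():
        if name in fisher:
            penalty = penalty + (fisher[name] *
                                 (param - pretrain_param[name]) ** 2).sum()
    return importance / 2. * penalty

# hypothetical usage inside a training step:
#   loss = task_loss + ewc_penalty(self, self.fisher, self.pretrain_param,
#                                  self.importance_hparam)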
Example #19
    def __init__(self, ann_file, image_set, root_path, data_path, seq_len=64,
                 with_precomputed_visual_feat=False, mask_raw_pixels=True,
                 with_rel_task=True, with_mlm_task=False, with_mvrc_task=False,
                 transform=None, test_mode=False,
                 zip_mode=False, cache_mode=False, cache_db=False, ignore_db_cache=True,
                 tokenizer=None, pretrained_model_name=None,
                 add_image_as_a_box=False,
                 aspect_grouping=False, languages_used='first', **kwargs):
        """
        Multi30k Dataset (5x captions, mixed languages)

        :param ann_file: annotation jsonl file
        :param image_set: image split name, e.g., 'train'
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to multi30k dataset
        :param transform: transform
        :param test_mode: test mode means no labels available
        :param zip_mode: reading images and metadata in zip archive
        :param cache_mode: cache whole dataset to RAM first, then __getitem__ read them from RAM
        :param ignore_db_cache: ignore previous cached database, reload it from annotation file
        :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add whole image as a box
        :param aspect_grouping: whether to group images via their aspect
        :param kwargs:
        """
        super(Multi30kDataset_5x_Mixed, self).__init__()

        assert not cache_mode, 'cache mode is currently not supported!'
        # TODO: need to remove this to allow testing
        # assert not test_mode

        annot = {'train': 'train_frcnn_5captions_both.json',
                 'val': 'val_frcnn.json',
                 'test2015': 'test_frcnn.json'}

        self.seq_len = seq_len
        self.with_rel_task = with_rel_task
        self.with_mlm_task = with_mlm_task
        self.with_mvrc_task = with_mvrc_task
        self.data_path = data_path
        self.root_path = root_path
        self.ann_file = os.path.join(data_path, annot[image_set])
        self.with_precomputed_visual_feat = with_precomputed_visual_feat
        self.mask_raw_pixels = mask_raw_pixels
        self.image_set = image_set
        self.transform = transform
        self.test_mode = test_mode
        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        # FM edit: added option selecting which caption language(s) to use
        self.languages_used = languages_used
        self.cache_dir = os.path.join(root_path, 'cache')
        self.add_image_as_a_box = add_image_as_a_box
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

        self.zipreader = ZipReader()

        # FM: Customise for multi30k dataset
        if not self.test_mode:
            self.database = list(jsonlines.open(self.ann_file))
            db_size = len(self.database)
            print('**************')
            print('Size before: ', db_size)
            if not self.zip_mode:
                for i, idb in enumerate(self.database):
                    self.database[i]['frcnn'] = idb['frcnn'].replace('.zip@', '')\
                        .replace('.0', '').replace('.1', '').replace('.2', '').replace('.3', '')
                    self.database[i]['image'] = idb['image'].replace('.zip@', '')

            # double the database - the first copy is used for English, the second for German
            database_2 = copy.deepcopy(self.database)
            self.database = self.database + database_2
            print('**************')
            print('Size after: ', len(self.database))
            for i, idb in enumerate(self.database):
                if i < db_size:
                    self.database[i]['lang'] = 'first'
                else:
                    self.database[i]['lang'] = 'second'
        # FM edit: create dataset for test mode 
        else:
            self.simple_database = list(jsonlines.open(self.ann_file))
            if not self.zip_mode:
                for i, idb in enumerate(self.simple_database):
                    self.simple_database[i]['frcnn'] = idb['frcnn'].replace('.zip@', '')\
                        .replace('.0', '').replace('.1', '').replace('.2', '').replace('.3', '')
                    self.simple_database[i]['image'] = idb['image'].replace('.zip@', '')
            # create database cross-coupling each caption with all images
            self.database = []
            db_index = 0
            for x, idb_x in enumerate(self.simple_database):
                for y, idb_y in enumerate(self.simple_database):
                    self.database.append({})
                    self.database[db_index]['label'] = 1.0 if x == y else 0.0
                    self.database[db_index]['caption_en'] = self.simple_database[x]['caption_en']
                    self.database[db_index]['caption_de'] = self.simple_database[x]['caption_de']
                    self.database[db_index]['image'] = self.simple_database[y]['image']
                    self.database[db_index]['frcnn'] = self.simple_database[y]['frcnn']
                    self.database[db_index]['caption_index'] = x
                    self.database[db_index]['image_index'] = y
                    db_index += 1

        if self.aspect_grouping:
            assert False, "aspect grouping is currently not supported!"
            self.group_ids = self.group_aspect(self.database)

        print('mask_raw_pixels: ', self.mask_raw_pixels)
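
# Toy illustration of the test-mode cross-coupling above: every caption is
# paired with every image, with label 1.0 only on the diagonal, so N items
# yield N*N image-caption retrieval pairs. Field names mirror the code above;
# the data is made up.
simple_database = [
    {'caption_en': 'a dog', 'caption_de': 'ein Hund',
     'image': 'img0.jpg', 'frcnn': 'img0.json'},
    {'caption_en': 'a cat', 'caption_de': 'eine Katze',
     'image': 'img1.jpg', 'frcnn': 'img1.json'},
]
database = []
for x, idb_x in enumerate(simple_database):
    for y, idb_y in enumerate(simple_database):
        database.append({'label': 1.0 if x == y else 0.0,
                         'caption_en': idb_x['caption_en'],
                         'caption_de': idb_x['caption_de'],
                         'image': idb_y['image'],
                         'frcnn': idb_y['frcnn'],
                         'caption_index': x,
                         'image_index': y})
assert len(database) == len(simple_database) ** 2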
Example #20
    def __init__(self, config):

        super(ResNetVLBERTv5, self).__init__(config)

        self.enable_cnn_reg_loss = config.NETWORK.ENABLE_CNN_REG_LOSS
        if not config.NETWORK.BLIND:
            self.image_feature_extractor = FastRCNN(
                config,
                average_pool=True,
                final_dim=config.NETWORK.IMAGE_FINAL_DIM,
                enable_cnn_reg_loss=self.enable_cnn_reg_loss)
            if config.NETWORK.VLBERT.object_word_embed_mode == 1:
                self.object_linguistic_embeddings = nn.Embedding(
                    601, config.NETWORK.VLBERT.hidden_size)
            elif config.NETWORK.VLBERT.object_word_embed_mode == 2:
                self.object_linguistic_embeddings = nn.Embedding(
                    1, config.NETWORK.VLBERT.hidden_size)
            elif config.NETWORK.VLBERT.object_word_embed_mode == 3:
                self.object_linguistic_embeddings = None
            else:
                raise NotImplementedError
        self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN

        self.tokenizer = BertTokenizer.from_pretrained(
            config.NETWORK.BERT_MODEL_NAME)

        language_pretrained_model_path = None
        if config.NETWORK.BERT_PRETRAINED != '':
            language_pretrained_model_path = '{}-{:04d}.model'.format(
                config.NETWORK.BERT_PRETRAINED,
                config.NETWORK.BERT_PRETRAINED_EPOCH)
        elif os.path.isdir(config.NETWORK.BERT_MODEL_NAME):
            weight_path = os.path.join(config.NETWORK.BERT_MODEL_NAME,
                                       BERT_WEIGHTS_NAME)
            if os.path.isfile(weight_path):
                language_pretrained_model_path = weight_path
        self.language_pretrained_model_path = language_pretrained_model_path
        if language_pretrained_model_path is None:
            print(
                "Warning: no pretrained language model found, training from scratch!!!"
            )

        self.vlbert = VisualLinguisticBert(
            config.NETWORK.VLBERT,
            language_pretrained_model_path=language_pretrained_model_path)

        # self.hm_out = nn.Linear(config.NETWORK.VLBERT.hidden_size, config.NETWORK.VLBERT.hidden_size)
        # self.hi_out = nn.Linear(config.NETWORK.VLBERT.hidden_size, config.NETWORK.VLBERT.hidden_size)

        self.hidden_dropout = nn.Dropout(0.2)
        if config.NETWORK.VLBERT.num_hidden_layers == 24:
            self.gating = nn.Parameter(torch.tensor([
                0.0067, 0.0070, 0.0075, 0.0075, 0.0075, 0.0074, 0.0076, 0.0075,
                0.0076, 0.0080, 0.0079, 0.0086, 0.0096, 0.0101, 0.0104, 0.0105,
                0.0111, 0.0120, 0.0126, 0.0115, 0.0108, 0.0105, 0.0104, 0.0117
            ]), requires_grad=True)
        else:
            self.gating = nn.Parameter(
                torch.ones(config.NETWORK.VLBERT.num_hidden_layers) * 1e-2,
                requires_grad=True)
        self.train_steps = 0

        dim = config.NETWORK.VLBERT.hidden_size
        if config.NETWORK.CLASSIFIER_TYPE == "2fc":
            self.final_mlp = torch.nn.Sequential(
                torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT,
                                 inplace=False),
                torch.nn.Linear(dim, config.NETWORK.CLASSIFIER_HIDDEN_SIZE),
                torch.nn.ReLU(inplace=True),
                torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT,
                                 inplace=False),
                torch.nn.Linear(config.NETWORK.CLASSIFIER_HIDDEN_SIZE,
                                config.NETWORK.CLASSIFIER_CLASS),
            )
        elif config.NETWORK.CLASSIFIER_TYPE == "1fc":
            self.final_mlp = torch.nn.Sequential(
                torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT,
                                 inplace=False),
                torch.nn.Linear(dim, config.NETWORK.CLASSIFIER_CLASS))
        elif config.NETWORK.CLASSIFIER_TYPE == 'mlm':
            transform = BertPredictionHeadTransform(config.NETWORK.VLBERT)
            linear = nn.Linear(config.NETWORK.VLBERT.hidden_size,
                               config.NETWORK.CLASSIFIER_CLASS)
            self.final_mlp = nn.Sequential(
                transform,
                nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
                linear)
        else:
            raise ValueError("Classifier type: {} not supported!".format(
                config.NETWORK.CLASSIFIER_TYPE))

        # init weights
        self.init_weight()

        self.fix_params()
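
# Hedged sketch of one way a per-layer gating vector like `self.gating` above
# can be used: as weights for aggregating the hidden states of every encoder
# layer into one representation. The softmax normalization and shapes are
# assumptions; the repo's forward pass is not shown here.
import torch

num_layers, batch, hidden = 24, 2, 768
hidden_states = torch.randn(num_layers, batch, hidden)  # one slice per layer
gating = torch.ones(num_layers) * 1e-2                  # learned in the model
weights = torch.softmax(gating, dim=0)
pooled = (weights.view(-1, 1, 1) * hidden_states).sum(dim=0)
print(pooled.shape)  # torch.Size([2, 768])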
Example #21
    def __init__(self, image_set, root_path, data_path, boxes='gt', proposal_source='official',
                 transform=None, test_mode=False,
                 zip_mode=False, cache_mode=False, cache_db=False, ignore_db_cache=True,
                 tokenizer=None, pretrained_model_name=None,
                 add_image_as_a_box=False, mask_size=(14, 14),
                 aspect_grouping=False, **kwargs):
        """
        PA100K Dataset

        :param image_set: image folder name
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to dataset
        :param boxes: boxes to use, 'gt' or 'proposal'
        :param transform: transform
        :param test_mode: test mode means no labels available
        :param zip_mode: reading images and metadata in zip archive
        :param cache_mode: cache whole dataset to RAM first, then __getitem__ read them from RAM
        :param ignore_db_cache: ignore previous cached database, reload it from annotation file
        :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add whole image as a box
        :param mask_size: size of instance mask of each object
        :param aspect_grouping: whether to group images via their aspect
        :param kwargs:
        """
        super(PA100K, self).__init__()

        assert not cache_mode, 'cache mode is currently not supported!'
        self.vg_proposal = ("vgbua_res101_precomputed", "trainval2014_resnet101_faster_rcnn_genome")
        self.proposal_source = proposal_source
        self.boxes = boxes
        self.test_mode = test_mode

        self.data_path = data_path
        self.root_path = root_path
        self.transform = transform
        self.image_sets = [iset.strip() for iset in image_set.split('+')]

        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        self.cache_dir = os.path.join(root_path, 'cache')
        self.add_image_as_a_box = add_image_as_a_box
        self.mask_size = mask_size
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

        self.trainval_id_to_cls = {}
        self.image_nums = 0
        # self.imgid2entry = {}
        self.ps_map = {}
        self.imgid2psid = {}
        self.trainval_index_to_id = {}

        self.image_entries = []
        self.pa100k_attribute = self.generate_data_description()
        self.database = self.load_annotations(self.pa100k_attribute)
        # if self.aspect_grouping:
        #     self.group_ids = self.group_aspect(self.database)
        self.part = 7
        self.max_boxes = 7

        self.max_word = 26

        self.val_images = []
        self.val_boxes = []
        self.val_im_info = []
        self.val_ids = []
        self.val_feat = []

        self.diff = 2
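
# One common reading of `self.part = 7` in pedestrian attribute models is to
# split the person image into equal horizontal stripes used as region boxes.
# This helper is an illustrative assumption, not code from the repo.
def horizontal_parts(x1, y1, x2, y2, n_parts=7):
    """Split a box into n_parts equal-height horizontal stripes."""
    h = (y2 - y1) / n_parts
    return [(x1, y1 + i * h, x2, y1 + (i + 1) * h) for i in range(n_parts)]

print(horizontal_parts(0, 0, 64, 140, n_parts=7))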
Example #22
    def __init__(self, ann_file, image_set, root_path, data_path, transform=None, task='Q2A', test_mode=False,
                 zip_mode=False, cache_mode=False, cache_db=False, ignore_db_cache=True,
                 basic_tokenizer=None, tokenizer=None, pretrained_model_name=None,
                 only_use_relevant_dets=False, add_image_as_a_box=False, mask_size=(14, 14),
                 aspect_grouping=False, basic_align=False, qa2r_noq=False, qa2r_aug=False,
                 seq_len=64,
                 **kwargs):
        """
        Visual Commonsense Reasoning Dataset

        :param ann_file: annotation jsonl file
        :param image_set: image folder name, e.g., 'vcr1images'
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to vcr dataset
        :param transform: transform
        :param task: 'Q2A' means question to answer, 'QA2R' means question and answer to rationale,
                     'Q2AR' means question to answer and rationale
        :param test_mode: test mode means no labels available
        :param zip_mode: reading images and metadata in zip archive
        :param cache_mode: cache whole dataset to RAM first, then __getitem__ read them from RAM
        :param ignore_db_cache: ignore previous cached database, reload it from annotation file
        :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
        :param only_use_relevant_dets: filter out detections not used in query and response
        :param add_image_as_a_box: add whole image as a box
        :param mask_size: size of instance mask of each object
        :param aspect_grouping: whether to group images via their aspect
        :param basic_align: align to tokens retokenized by basic_tokenizer
        :param qa2r_noq: in QA->R, the query contains only the correct answer, without question
        :param qa2r_aug: in QA->R, whether to augment choices to include those with wrong answer in query
        :param kwargs:
        """
        super(VCRDataset, self).__init__()

        assert not cache_mode, 'cache mode is currently not supported!'
        assert task in ['Q2A', 'QA2R', 'Q2AR'], 'task {} is not supported!'.format(task)
        assert not qa2r_aug, "Not implemented!"

        self.qa2r_noq = qa2r_noq
        self.qa2r_aug = qa2r_aug

        self.seq_len = seq_len

        categories = ['__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
                      'trafficlight', 'firehydrant', 'stopsign', 'parkingmeter', 'bench', 'bird', 'cat', 'dog', 'horse',
                      'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
                      'suitcase', 'frisbee', 'skis', 'snowboard', 'sportsball', 'kite', 'baseballbat', 'baseballglove',
                      'skateboard', 'surfboard', 'tennisracket', 'bottle', 'wineglass', 'cup', 'fork', 'knife', 'spoon',
                      'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hotdog', 'pizza', 'donut',
                      'cake', 'chair', 'couch', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tv', 'laptop', 'mouse',
                      'remote', 'keyboard', 'cellphone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book',
                      'clock', 'vase', 'scissors', 'teddybear', 'hairdrier', 'toothbrush']
        self.category_to_idx = {c: i for i, c in enumerate(categories)}
        self.data_path = data_path
        self.root_path = root_path
        self.ann_file = os.path.join(data_path, ann_file)
        self.image_set = image_set
        self.transform = transform
        self.task = task
        self.test_mode = test_mode
        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        self.basic_align = basic_align
        print('Dataset Basic Align: {}'.format(self.basic_align))
        self.cache_dir = os.path.join(root_path, 'cache')
        self.only_use_relevant_dets = only_use_relevant_dets
        self.add_image_as_a_box = add_image_as_a_box
        self.mask_size = mask_size
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.basic_tokenizer = basic_tokenizer if basic_tokenizer is not None \
            else BasicTokenizer(do_lower_case=True)
        if tokenizer is None:
            if pretrained_model_name is None:
                pretrained_model_name = 'bert-base-uncased'
            if 'roberta' in pretrained_model_name:
                tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name, cache_dir=self.cache_dir)
            else:
                tokenizer = BertTokenizer.from_pretrained(pretrained_model_name, cache_dir=self.cache_dir)
        self.tokenizer = tokenizer

        if zip_mode:
            self.zipreader = ZipReader()

        self.database = self.load_annotations(self.ann_file)
        if self.aspect_grouping:
            assert False, "aspect grouping is currently not supported!"
            self.group_ids = self.group_aspect(self.database)

        self.person_name_id = 0
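
# Hedged sketch of the `basic_align` idea documented above: map each wordpiece
# back to the basic-tokenizer token it came from, so word-level annotations can
# be propagated to subword pieces. Toy inputs; assumes the '##' continuation
# prefix convention of BERT wordpieces.
def align_wordpieces(basic_tokens, wordpieces):
    """Return, for each wordpiece, the index of its originating basic token."""
    alignment, tok_idx = [], -1
    for wp in wordpieces:
        if not wp.startswith('##'):
            tok_idx += 1
        alignment.append(tok_idx)
    assert tok_idx == len(basic_tokens) - 1, 'tokenizations do not line up'
    return alignment

print(align_wordpieces(['playing', 'chess'], ['play', '##ing', 'chess']))
# -> [0, 0, 1]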
Example #23
    def __init__(self,
                 image_set,
                 root_path,
                 data_path,
                 boxes='gt',
                 proposal_source='official',
                 transform=None,
                 test_mode=False,
                 zip_mode=False,
                 cache_mode=False,
                 cache_db=False,
                 ignore_db_cache=True,
                 tokenizer=None,
                 pretrained_model_name=None,
                 add_image_as_a_box=False,
                 mask_size=(14, 14),
                 aspect_grouping=False,
                 parts=1,
                 number_sep=1,
                 part_methods='VS',
                 **kwargs):
        """
        Pedes Dataset

        :param image_set: image folder name
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to dataset
        :param boxes: boxes to use, 'gt' or 'proposal'
        :param transform: transform
        :param test_mode: test mode means no labels available
        :param zip_mode: reading images and metadata in zip archive
        :param cache_mode: cache whole dataset to RAM first, then __getitem__ read them from RAM
        :param ignore_db_cache: ignore previous cached database, reload it from annotation file
        :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add whole image as a box
        :param mask_size: size of instance mask of each object
        :param aspect_grouping: whether to group images via their aspect
        :param kwargs:
        """
        super(Pedes, self).__init__()

        assert not cache_mode, 'cache mode is currently not supported!'

        self.pedes_annot_files = {
            "trainval": "trainval.json",
        }

        self.vg_proposal = ("vgbua_res101_precomputed",
                            "trainval2014_resnet101_faster_rcnn_genome")
        self.proposal_source = proposal_source
        self.boxes = boxes
        self.test_mode = test_mode

        self.data_path = data_path
        self.root_path = root_path
        self.transform = transform
        self.image_sets = [iset.strip() for iset in image_set.split('+')]
        # self.coco = COCO(annotation_file=os.path.join(data_path, coco_annot_files['train2014']))
        # self.refer = REFER(data_path, dataset='refcoco+', splitBy='unc')
        # self.refer_ids = []
        # for iset in self.image_sets:
        #     self.refer_ids.extend(self.refer.getRefIds(split=iset))
        # self.refs = self.refer.loadRefs(ref_ids=self.refer_ids)

        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        self.cache_dir = os.path.join(root_path, 'cache')
        self.add_image_as_a_box = add_image_as_a_box
        self.mask_size = mask_size
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

        self.trainval_id_to_cls = {}
        self.image_nums = 0
        self.imgid2entry = {}
        self.ps_map = {}
        self.imgid2psid = {}
        self.trainval_index_to_id = {}
        f = open(
            os.path.join(self.data_path, self.pedes_annot_files['trainval']))
        self.setting = json.load(f)
        self.database = self.load_annotations()
        # if self.aspect_grouping:
        #     self.group_ids = self.group_aspect(self.database)
        self.part = parts
        self.max_word = 50

        self.val_images = []
        self.val_boxes = []
        self.val_im_info = []
        self.val_ids = []
        self.val_feat = []
        self.diff = 7

        self.use_JPP = False
        if part_methods == 'KS':
            self.use_JPP = True

        self.number_sep = number_sep
        self.number_parts = self.number_sep * self.part - self.number_sep + 1

        if self.use_JPP:
            f_box = open(os.path.join(self.data_path,
                                      'result.json'))  #box_frcnn.json
            self.JPP_boxes = json.load(f_box)
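
# The `number_parts` formula above can be read as counting overlapping
# stripes: number_sep * part - number_sep + 1 == number_sep * (part - 1) + 1,
# i.e. a window one base stripe tall slid in steps of 1/number_sep of a stripe
# over `part` stripes. That reading is an assumption; the arithmetic itself is
# straight from the code.
for part, number_sep in [(7, 1), (7, 2), (7, 3)]:
    number_parts = number_sep * part - number_sep + 1
    print(part, number_sep, '->', number_parts)  # -> 7, 13, 19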
Example #24
    def __init__(self,
                 image_set,
                 root_path,
                 data_path,
                 boxes='gt',
                 proposal_source='official',
                 transform=None,
                 test_mode=False,
                 zip_mode=False,
                 cache_mode=False,
                 cache_db=False,
                 ignore_db_cache=True,
                 tokenizer=None,
                 pretrained_model_name=None,
                 add_image_as_a_box=False,
                 mask_size=(14, 14),
                 aspect_grouping=False,
                 **kwargs):
        """
        RefCOCO+ Dataset

        :param image_set: image folder name
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to dataset
        :param boxes: boxes to use, 'gt' or 'proposal'
        :param transform: transform
        :param test_mode: test mode means no labels available
        :param zip_mode: reading images and metadata in zip archive
        :param cache_mode: cache whole dataset to RAM first, then __getitem__ read them from RAM
        :param ignore_db_cache: ignore previous cached database, reload it from annotation file
        :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add whole image as a box
        :param mask_size: size of instance mask of each object
        :param aspect_grouping: whether to group images via their aspect
        :param kwargs:
        """
        super(RefCOCO, self).__init__()

        assert not cache_mode, 'cache mode is currently not supported!'

        categories = [
            '__background__', 'person', 'bicycle', 'car', 'motorcycle',
            'airplane', 'bus', 'train', 'truck', 'boat', 'trafficlight',
            'firehydrant', 'stopsign', 'parkingmeter', 'bench', 'bird', 'cat',
            'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
            'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase',
            'frisbee', 'skis', 'snowboard', 'sportsball', 'kite',
            'baseballbat', 'baseballglove', 'skateboard', 'surfboard',
            'tennisracket', 'bottle', 'wineglass', 'cup', 'fork', 'knife',
            'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
            'broccoli', 'carrot', 'hotdog', 'pizza', 'donut', 'cake', 'chair',
            'couch', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tv',
            'laptop', 'mouse', 'remote', 'keyboard', 'cellphone', 'microwave',
            'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
            'scissors', 'teddybear', 'hairdrier', 'toothbrush'
        ]

        coco_annot_files = {
            "train2014": "annotations/instances_train2014.json",
            "val2014": "annotations/instances_val2014.json",
            "test2015": "annotations/image_info_test2015.json",
        }
        proposal_dets = 'refcoco+/proposal/res101_coco_minus_refer_notime_dets.json'
        proposal_masks = 'refcoco+/proposal/res101_coco_minus_refer_notime_masks.json'
        self.vg_proposal = ("vgbua_res101_precomputed",
                            "trainval2014_resnet101_faster_rcnn_genome")
        self.proposal_source = proposal_source
        self.boxes = boxes
        self.test_mode = test_mode
        self.category_to_idx = {c: i for i, c in enumerate(categories)}
        self.data_path = data_path
        self.root_path = root_path
        self.transform = transform
        self.image_sets = [iset.strip() for iset in image_set.split('+')]
        self.coco = COCO(annotation_file=os.path.join(
            data_path, coco_annot_files['train2014']))
        self.refer = REFER(data_path, dataset='refcoco+', splitBy='unc')
        self.refer_ids = []
        for iset in self.image_sets:
            self.refer_ids.extend(self.refer.getRefIds(split=iset))
        self.refs = self.refer.loadRefs(ref_ids=self.refer_ids)
        if 'proposal' in boxes:
            with open(os.path.join(data_path, proposal_dets), 'r') as f:
                proposal_list = json.load(f)
            self.proposals = {}
            for proposal in proposal_list:
                image_id = proposal['image_id']
                if image_id in self.proposals:
                    self.proposals[image_id].append(proposal['box'])
                else:
                    self.proposals[image_id] = [proposal['box']]
        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        self.cache_dir = os.path.join(root_path, 'cache')
        self.add_image_as_a_box = add_image_as_a_box
        self.mask_size = mask_size
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

        if zip_mode:
            self.zipreader = ZipReader()

        self.database = self.load_annotations()
        if self.aspect_grouping:
            self.group_ids = self.group_aspect(self.database)
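
# The proposal-grouping loop above (building self.proposals) is equivalent to
# the more idiomatic defaultdict form below; a behavior-preserving alternative,
# not the repo's code.
from collections import defaultdict

def group_proposals(proposal_list):
    proposals = defaultdict(list)
    for proposal in proposal_list:
        proposals[proposal['image_id']].append(proposal['box'])
    return dict(proposals)

print(group_proposals([{'image_id': 1, 'box': [0, 0, 5, 5]},
                       {'image_id': 1, 'box': [1, 1, 4, 4]},
                       {'image_id': 2, 'box': [2, 2, 9, 9]}]))
# -> {1: [[0, 0, 5, 5], [1, 1, 4, 4]], 2: [[2, 2, 9, 9]]}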
Example #25
    def __init__(self,
                 root_path=None,
                 image_set='train',
                 transform=None,
                 test_mode=False,
                 zip_mode=False,
                 cache_mode=False,
                 cache_db=True,
                 tokenizer=None,
                 pretrained_model_name=None,
                 add_image_as_a_box=False,
                 mask_size=(14, 14),
                 aspect_grouping=False,
                 **kwargs):
        """
        Visual Question Answering Dataset

        :param root_path: root path to cache database loaded from annotation file
        :param transform: transform
        :param test_mode: test mode means no labels available
        :param zip_mode: reading images and metadata in zip archive
        :param cache_mode: cache whole dataset to RAM first, then __getitem__ read them from RAM
        :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add whole image as a box
        :param mask_size: size of instance mask of each object
        :param aspect_grouping: whether to group images via their aspect
        :param kwargs:
        """
        super(CLS3, self).__init__()
        cache_dir = False
        assert not cache_mode, 'cache mode is currently not supported!'

        categories = [
            '__background__', 'person', 'bicycle', 'car', 'motorcycle',
            'airplane', 'bus', 'train', 'truck', 'boat', 'trafficlight',
            'firehydrant', 'stopsign', 'parkingmeter', 'bench', 'bird', 'cat',
            'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
            'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase',
            'frisbee', 'skis', 'snowboard', 'sportsball', 'kite',
            'baseballbat', 'baseballglove', 'skateboard', 'surfboard',
            'tennisracket', 'bottle', 'wineglass', 'cup', 'fork', 'knife',
            'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
            'broccoli', 'carrot', 'hotdog', 'pizza', 'donut', 'cake', 'chair',
            'couch', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tv',
            'laptop', 'mouse', 'remote', 'keyboard', 'cellphone', 'microwave',
            'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
            'scissors', 'teddybear', 'hairdrier', 'toothbrush'
        ]
        self.category_to_idx = {c: i for i, c in enumerate(categories)}
        self.data_split = image_set  # HACK: reuse old parameter

        self.periodStrip = re.compile(r"(?!<=\d)(\.)(?!\d)")
        self.commaStrip = re.compile(r"(\d)(\,)(\d)")
        self.punct = [
            ';', r"/", '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_',
            '-', '>', '<', '@', '`', ',', '?', '!'
        ]

        self.test_mode = test_mode

        self.root_path = root_path

        self.box_bank = {}

        self.transform = transform
        self.zip_mode = zip_mode

        self.aspect_grouping = aspect_grouping
        self.add_image_as_a_box = add_image_as_a_box

        self.cache_dir = os.path.join(root_path, 'cache')
        # return_offsets_mapping
        model_name = 'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name
        self.fast_tokenizer = AutoTokenizer.from_pretrained(
            'bert-base-uncased',
            cache_dir=self.cache_dir,
            use_fast=True,
            return_offsets_mapping=True)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            model_name,
            cache_dir=self.cache_dir)
        self.max_txt_token = 128

        if zip_mode:
            self.zipreader = ZipReader()

        self.anno_aug = 'anno_aug' in kwargs
        self.database = self.load_annotations()
        self.use_img_box = True
        self.random_drop_tags = False
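
# Sketch of how patterns like `periodStrip`/`commaStrip` above are applied in
# the standard VQA answer-normalization recipe. The repo's own processing
# function is not shown; this mirrors the usual VQA eval code and is an
# assumption here.
import re

periodStrip = re.compile(r"(?!<=\d)(\.)(?!\d)")
commaStrip = re.compile(r"(\d)(\,)(\d)")
punct = [';', r"/", '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_',
         '-', '>', '<', '@', '`', ',', '?', '!']

def process_punctuation(text):
    out = text
    for p in punct:
        if (p + ' ' in text or ' ' + p in text) \
                or re.search(commaStrip, text) is not None:
            out = out.replace(p, '')
        else:
            out = out.replace(p, ' ')
    return periodStrip.sub('', out)

print(process_punctuation('1,000 dogs!'))  # -> '1000 dogs'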
Example #26
    def __init__(self,
                 captions_set,
                 ann_file,
                 roi_set,
                 image_set,
                 root_path,
                 data_path,
                 small_version=False,
                 negative_sampling='hard',
                 phrase_cls=True,
                 transform=None,
                 test_mode=False,
                 zip_mode=False,
                 cache_mode=False,
                 cache_db=False,
                 ignore_db_cache=True,
                 basic_tokenizer=None,
                 tokenizer=None,
                 pretrained_model_name=None,
                 add_image_as_a_box=True,
                 on_memory=False,
                 **kwargs):
        """
        Visual Grounded Paraphrase Dataset

        :param ann_file: annotation csv file
        :param image_set: image folder name
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to dataset
        :param transform: transform
        :param test_mode: test mode means no labels available
        :param cache_mode: cache whole dataset to RAM first, then __getitem__ read them from RAM
        :param ignore_db_cache: ignore previous cached database, reload it from annotation file
        :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add whole image as a box
        :param kwargs:
        """
        super(VGPDataset, self).__init__()

        # temporarily enable cache mode and see if it works
        # assert not cache_mode, 'currently not support cache mode!'

        self.data_path = data_path
        self.root_path = root_path
        self.captions_set = os.path.join(data_path, captions_set)
        self.ann_file = os.path.join(data_path, ann_file)
        self.roi_set = os.path.join(data_path, roi_set)
        self.image_set = os.path.join(self.data_path, image_set)
        self.small = small_version
        self.neg_sampling = negative_sampling
        self.phrase_cls = phrase_cls
        self.transform = transform
        self.test_mode = test_mode
        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.cache_dir = os.path.join(root_path, 'cache')
        self.add_image_as_a_box = add_image_as_a_box
        self.on_memory = False  # forced off: on_memory=True doesn't work
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.basic_tokenizer = basic_tokenizer if basic_tokenizer is not None \
            else BasicTokenizer(do_lower_case=True)
        if tokenizer is None:
            if pretrained_model_name is None:
                pretrained_model_name = 'bert-base-uncased'
            if 'roberta' in pretrained_model_name:
                tokenizer = RobertaTokenizer.from_pretrained(
                    pretrained_model_name)
            else:
                tokenizer = BertTokenizer.from_pretrained(
                    pretrained_model_name)
        self.tokenizer = tokenizer

        if zip_mode:
            self.zipreader = ZipReader()

        self.database = self.load_captions(self.captions_set)
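
# Hedged sketch of what a `negative_sampling='hard'` policy like the one above
# can mean: instead of a uniformly random non-matching caption, pick the
# highest-scoring (most confusable) one. Purely illustrative; the repo's
# criterion may differ.
import random

def sample_negative(anchor_idx, scores, hard=True):
    """scores[i]: similarity of candidate i to the anchor; higher = harder."""
    candidates = [i for i in range(len(scores)) if i != anchor_idx]
    if hard:
        return max(candidates, key=lambda i: scores[i])
    return random.choice(candidates)

print(sample_negative(0, [1.0, 0.9, 0.2]))  # -> 1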
Example #27
    def __init__(self, config):

        super(ResNetVLBERT, self).__init__(config)

        self.enable_cnn_reg_loss = config.NETWORK.ENABLE_CNN_REG_LOSS
        if not config.NETWORK.BLIND:
            self.image_feature_extractor = FastRCNN(config,
                                                    average_pool=True,
                                                    final_dim=config.NETWORK.IMAGE_FINAL_DIM,
                                                    enable_cnn_reg_loss=self.enable_cnn_reg_loss)
            if config.NETWORK.VLBERT.object_word_embed_mode == 1:
                self.object_linguistic_embeddings = nn.Embedding(81, config.NETWORK.VLBERT.hidden_size)
            elif config.NETWORK.VLBERT.object_word_embed_mode == 2:
                self.object_linguistic_embeddings = nn.Embedding(1, config.NETWORK.VLBERT.hidden_size)
            elif config.NETWORK.VLBERT.object_word_embed_mode == 3:
                self.object_linguistic_embeddings = None
            else:
                raise NotImplementedError
        self.image_feature_bn_eval = config.NETWORK.IMAGE_FROZEN_BN

        self.tokenizer = BertTokenizer.from_pretrained(config.NETWORK.BERT_MODEL_NAME)

        language_pretrained_model_path = None
        if config.NETWORK.BERT_PRETRAINED != '':
            language_pretrained_model_path = '{}-{:04d}.model'.format(config.NETWORK.BERT_PRETRAINED,
                                                                      config.NETWORK.BERT_PRETRAINED_EPOCH)
        elif os.path.isdir(config.NETWORK.BERT_MODEL_NAME):
            weight_path = os.path.join(config.NETWORK.BERT_MODEL_NAME, BERT_WEIGHTS_NAME)
            if os.path.isfile(weight_path):
                language_pretrained_model_path = weight_path
        self.language_pretrained_model_path = language_pretrained_model_path
        if language_pretrained_model_path is None:
            print("Warning: no pretrained language model found, training from scratch!!!")

        # Also pass the finetuning strategy
        self.vlbert = VisualLinguisticBert(config.NETWORK.VLBERT,
                                         language_pretrained_model_path=language_pretrained_model_path, finetune_strategy=config.FINETUNE_STRATEGY)

        # self.hm_out = nn.Linear(config.NETWORK.VLBERT.hidden_size, config.NETWORK.VLBERT.hidden_size)
        # self.hi_out = nn.Linear(config.NETWORK.VLBERT.hidden_size, config.NETWORK.VLBERT.hidden_size)

        dim = config.NETWORK.VLBERT.hidden_size
        if config.NETWORK.CLASSIFIER_TYPE == "2fc":
            self.final_mlp = torch.nn.Sequential(
                torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
                torch.nn.Linear(dim, config.NETWORK.CLASSIFIER_HIDDEN_SIZE),
                torch.nn.ReLU(inplace=True),
                torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
                torch.nn.Linear(config.NETWORK.CLASSIFIER_HIDDEN_SIZE, config.DATASET.ANSWER_VOCAB_SIZE),
            )
        elif config.NETWORK.CLASSIFIER_TYPE == "1fc":
            self.final_mlp = torch.nn.Sequential(
                torch.nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
                torch.nn.Linear(dim, config.DATASET.ANSWER_VOCAB_SIZE)
            )
        elif config.NETWORK.CLASSIFIER_TYPE == 'mlm':
            transform = BertPredictionHeadTransform(config.NETWORK.VLBERT)
            linear = nn.Linear(config.NETWORK.VLBERT.hidden_size, config.DATASET.ANSWER_VOCAB_SIZE)
            self.final_mlp = nn.Sequential(
                transform,
                nn.Dropout(config.NETWORK.CLASSIFIER_DROPOUT, inplace=False),
                linear
            )
        else:
            raise ValueError("Classifier type: {} not supported!".format(config.NETWORK.CLASSIFIER_TYPE))

        # init weights
        self.init_weight()

        self.fix_params()
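
# Hedged sketch of what a `finetune_strategy` knob like the one passed above
# could control: freezing parameter subsets by name. The strategy names and
# matching rules are hypothetical, not taken from the repo.
import torch.nn as nn

def apply_finetune_strategy(model, strategy):
    for name, param in model.named_parameters():
        if strategy == 'frozen_backbone':
            param.requires_grad = not name.startswith('encoder.')
        elif strategy == 'full':
            param.requires_grad = True

model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))
apply_finetune_strategy(model, 'full')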
Example #28
    def __init__(self,
                 ann_file,
                 image_set,
                 root_path,
                 data_path,
                 seq_len=64,
                 with_precomputed_visual_feat=False,
                 mask_raw_pixels=True,
                 with_rel_task=True,
                 with_mlm_task=True,
                 with_mvrc_task=True,
                 transform=None,
                 test_mode=False,
                 zip_mode=False,
                 cache_mode=False,
                 cache_db=False,
                 ignore_db_cache=True,
                 tokenizer=None,
                 pretrained_model_name=None,
                 add_image_as_a_box=False,
                 aspect_grouping=False,
                 **kwargs):
        """
        COCO Captions Dataset

        :param ann_file: annotation file
        :param image_set: image split name, e.g., 'train'
        :param root_path: root path to cache database loaded from annotation file
        :param data_path: path to coco dataset
        :param transform: transform
        :param test_mode: test mode means no labels available
        :param zip_mode: reading images and metadata in zip archive
        :param cache_mode: cache whole dataset to RAM first, then __getitem__ read them from RAM
        :param ignore_db_cache: ignore previous cached database, reload it from annotation file
        :param tokenizer: default is BertTokenizer from pytorch_pretrained_bert
        :param add_image_as_a_box: add whole image as a box
        :param aspect_grouping: whether to group images via their aspect
        :param kwargs:
        """
        super(COCOCaptionsDataset, self).__init__()

        assert not cache_mode, 'cache mode is currently not supported!'
        assert not test_mode

        annot = {
            'train': 'annotations/captions_train2017.json',
            'val': 'annotations/captions_val2017.json'
        }
        annot_inst = {
            'train': 'annotations/instances_train2017.json',
            'val': 'annotations/instances_val2017.json'
        }
        if zip_mode:
            self.root = os.path.join(data_path,
                                     '{0}2017.zip@/{0}2017'.format(image_set))
        else:
            self.root = os.path.join(data_path, '{}2017'.format(image_set))

        self.seq_len = seq_len
        self.with_rel_task = with_rel_task
        self.with_mlm_task = with_mlm_task
        self.with_mvrc_task = with_mvrc_task
        self.data_path = data_path
        self.root_path = root_path
        self.ann_file = os.path.join(data_path, annot[image_set])
        self.ann_file_inst = os.path.join(data_path, annot_inst[image_set])
        self.with_precomputed_visual_feat = with_precomputed_visual_feat
        self.mask_raw_pixels = mask_raw_pixels
        self.image_set = image_set
        self.transform = transform
        self.test_mode = test_mode
        self.zip_mode = zip_mode
        self.cache_mode = cache_mode
        self.cache_db = cache_db
        self.ignore_db_cache = ignore_db_cache
        self.aspect_grouping = aspect_grouping
        self.cache_dir = os.path.join(root_path, 'cache')
        self.add_image_as_a_box = add_image_as_a_box
        if not os.path.exists(self.cache_dir):
            makedirsExist(self.cache_dir)
        self.tokenizer = tokenizer if tokenizer is not None \
            else BertTokenizer.from_pretrained(
            'bert-base-uncased' if pretrained_model_name is None else pretrained_model_name,
            cache_dir=self.cache_dir)

        if self.zip_mode:
            self.zipreader = ZipReader()

        self.coco = COCO(self.ann_file)
        self.coco_inst = COCO(self.ann_file_inst)
        self.ids = list(sorted(self.coco.imgs.keys()))
        # filter images without detection annotations
        self.ids = [
            img_id for img_id in self.ids
            if len(self.coco_inst.getAnnIds(imgIds=img_id, iscrowd=None)) > 0
        ]

        self.json_category_id_to_contiguous_id = {
            v: i + 1
            for i, v in enumerate(self.coco_inst.getCatIds())
        }
        self.contiguous_category_id_to_json_id = {
            v: k
            for k, v in self.json_category_id_to_contiguous_id.items()
        }
        self.id_to_img_map = {k: v for k, v in enumerate(self.ids)}

        if self.aspect_grouping:
            assert False, "aspect grouping is currently not supported!"
            # self.group_ids = self.group_aspect(self.database)

        print('mask_raw_pixels: ', self.mask_raw_pixels)
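
# The contiguous-id remapping above, shown on toy data: COCO category ids are
# sparse, so they are mapped to 1..K for classification heads, and the inverse
# map is kept for writing predictions back in COCO format.
cat_ids = [1, 2, 4, 7]  # toy stand-in for self.coco_inst.getCatIds()
json_category_id_to_contiguous_id = {v: i + 1 for i, v in enumerate(cat_ids)}
contiguous_category_id_to_json_id = {
    v: k for k, v in json_category_id_to_contiguous_id.items()}
print(json_category_id_to_contiguous_id)  # {1: 1, 2: 2, 4: 3, 7: 4}
print(contiguous_category_id_to_json_id)  # {1: 1, 2: 2, 3: 4, 4: 7}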