def __getitem__(self, index):
    if self.complete_shuffle:
        # Pick the variant *before* collapsing the index, otherwise every
        # variant of an item maps to variant 0.
        if self.pretraining_include_qa_and_qar:
            which = index % 8
            index = index // 8
        else:
            which = index % 4
            index = index // 4
    else:
        which = None

    item = deepcopy(self.items[index])

    ###################################################################
    # Load questions and answers
    answer_choices = item['{}_choices'.format(self.mode)]

    only_use_answer = self.complete_shuffle and which < 4
    only_use_qar = self.complete_shuffle and which >= 4

    dets2use, old_det_to_new_ind = self._get_dets_to_use(
        item, only_use_answer=only_use_answer, only_use_qar=only_use_qar)
    # NOTE: the only_use_qar flag is ambiguous...

    instance_dict = {}
    if self.split != 'test':
        instance_dict['label'] = LabelField(
            item['{}_label'.format(self.mode)], skip_indexing=True)
    instance_dict['metadata'] = MetadataField({
        'annot_id': item['annot_id'],
        'ind': index,
        'movie': item['movie'],
        'img_fn': item['img_fn'],
        'question_number': item['question_number']
    })

    ###################################################################
    # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
    image = load_image(os.path.join(self.vcr_image_dir, item['img_fn']))
    # image = self.imagedatas(item['img_fn'])
    image, window, img_scale, padding = resize_image(
        image, random_pad=self.is_train)
    image = to_tensor_and_normalize(image)
    c, h, w = image.shape

    ###################################################################
    # Load boxes.
    with open(os.path.join(self.vcr_image_dir, item['metadata_fn']), 'r') as f:
        metadata = json.load(f)

    # [nobj, 14, 14]
    segms = np.stack([
        make_mask(mask_size=14,
                  box=metadata['boxes'][i],
                  polygons_list=metadata['segms'][i]) for i in dets2use
    ])

    # Chop off the final dimension, that's the confidence
    boxes = np.array(metadata['boxes'])[dets2use, :-1]
    # Possibly rescale them if necessary
    boxes *= img_scale
    boxes[:, :2] += np.array(padding[:2])[None]
    boxes[:, 2:] += np.array(padding[:2])[None]

    obj_labels = [
        self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()
    ]
    if self.add_image_as_a_box:
        boxes = np.row_stack((window, boxes))
        segms = np.concatenate(
            (np.ones((1, 14, 14), dtype=np.float32), segms), 0)
        obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

    examples = data_iter_item(
        item,
        tokenizer=self.tokenizer,
        max_seq_length=self.max_seq_length,
        endingonly=False,
        include_qar=self.pretraining_include_qa_and_qar,
        only_qar=self.only_qar)
    self.getitem_bert_part(examples, item, instance_dict, which)

    if self.use_alignment:
        # Alignment between objects and text
        ######################
        examples_alignment_pack = []
        for i in range(len(examples)):
            if self.pretraining_include_qa_and_qar:
                if i < 4:
                    raw_text_a = item["question"]
                    raw_text_b = item['answer_choices'][i]
                else:
                    raw_text_a = item["question"] + item['answer_choices'][
                        item['answer_label']]
                    raw_text_b = item['rationale_choices'][i - 4]
            elif self.only_qar:
                # This is the correct alignment right now.
                raw_text_a = item["question"] + item['answer_choices'][
                    item['answer_label']]
                raw_text_b = item['rationale_choices'][i]
            else:
                raw_text_a = item["question"]
                raw_text_b = item['answer_choices'][i]

            true_text_a = examples[i][0].text_a
            true_text_b = examples[i][0].text_b
            text_alignment_a = examples[i][1]
            text_alignment_b = examples[i][2]
            examples_alignment_pack.append(
                (raw_text_a, raw_text_b, true_text_a, true_text_b,
                 text_alignment_a, text_alignment_b))

        image_box_position = []
        if which is not None:
            (raw_text_a, raw_text_b, true_text_a, true_text_b,
             text_alignment_a, text_alignment_b) = examples_alignment_pack[which]
            box_record = defaultdict(list)
            self.get_alignment_original(raw_text_a,
                                        text_alignment_a,
                                        old_det_to_new_ind,
                                        box_record,
                                        offset=1)
            self.get_alignment_original(raw_text_b,
                                        text_alignment_b,
                                        old_det_to_new_ind,
                                        box_record,
                                        offset=1 + len(text_alignment_a) + 1)
            image_text_alignment = ListField([
                IntArrayField(np.array(box_record[i]), padding_value=-1)
                for i in range(len(boxes))
            ])
        else:
            for (raw_text_a, raw_text_b, true_text_a, true_text_b,
                 text_alignment_a, text_alignment_b) in examples_alignment_pack:
                box_record = defaultdict(list)
                self.get_alignment_original(raw_text_a,
                                            text_alignment_a,
                                            old_det_to_new_ind,
                                            box_record,
                                            offset=1)
                self.get_alignment_original(
                    raw_text_b,
                    text_alignment_b,
                    old_det_to_new_ind,
                    box_record,
                    offset=1 + len(text_alignment_a) + 1)
                image_box_position.append(
                    ListField([
                        IntArrayField(np.array(box_record[i]), padding_value=-1)
                        for i in range(len(boxes))
                    ]))
            image_text_alignment = ListField(image_box_position)
        ######################
        instance_dict["image_text_alignment"] = image_text_alignment

    instance_dict['segms'] = ArrayField(segms, padding_value=0)
    instance_dict['objects'] = ListField(
        [LabelField(x, skip_indexing=True) for x in obj_labels])

    if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
        import ipdb
        ipdb.set_trace()
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    assert np.all(boxes[:, 2] <= w)
    assert np.all(boxes[:, 3] <= h)
    instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

    instance = Instance(instance_dict)
    instance.index_fields(self.vocab)
    return image, instance
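# ---------------------------------------------------------------------------
# A minimal, self-contained sketch (not part of any class above) of the box
# rescaling arithmetic used in the loaders: scale the [x1, y1, x2, y2] boxes
# by the uniform `img_scale` returned by `resize_image`, then shift both
# corners by the (left, top) padding offsets. The helper name and the demo
# numbers below are made up for illustration.
import numpy as np


def rescale_boxes_sketch(boxes, img_scale, padding):
    """Map boxes from original-image coordinates onto the resized, padded canvas."""
    boxes = boxes * img_scale
    boxes[:, :2] += np.array(padding[:2])[None]  # shift the top-left corners
    boxes[:, 2:] += np.array(padding[:2])[None]  # shift the bottom-right corners
    return boxes

# Example (hypothetical numbers): a box [10, 20, 110, 220] with img_scale=0.5
# and padding=(7, 3) lands at [12, 13, 62, 113].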
def __getitem__(self, index):
    event_inference_example = torch.tensor(self.examples[index])
    labels = torch.tensor(self.labels[index])
    record = self.records[index]

    if not self.include_image:
        return event_inference_example, labels

    #######
    # Compute Image Features. Adapted from https://github.com/rowanz/r2c/blob/master/dataloaders/vcg.py
    #######

    ###################################################################
    # Load boxes and their features.
    with open(os.path.join(VCR_IMAGES_DIR, record['metadata_fn']), 'r') as f:
        metadata = json.load(f)
    w = metadata['width']   # needed below even when the image box is not added
    h = metadata['height']

    dets2use, old_det_to_new_ind, subjects = self.get_dets_to_use(record)

    # [nobj, 14, 14]
    segms = np.stack([
        make_mask(mask_size=14,
                  box=metadata['boxes'][i],
                  polygons_list=metadata['segms'][i]) for i in dets2use
    ])

    img_fn = record['img_fn']
    img_id = img_fn[img_fn.rfind('/') + 1:img_fn.rfind('.')]
    with open(os.path.join(VCR_FEATURES_DIR, img_id) + '.pkl', 'rb') as p:
        features_dict = pickle.load(p)
    features = features_dict['object_features'][dets2use]

    # Chop off the final dimension, that's the confidence
    boxes = np.array(metadata['boxes'])[dets2use, :-1]

    # create id labels to help ground person in the image
    objects = metadata['names']
    obj_labels = [self.coco_obj_to_ind[objects[i]] for i in dets2use.tolist()]
    person_ids = [0] * len(obj_labels)
    for i in range(len(person_ids)):
        if obj_labels[i] == 1:
            # add 1 for person ids because it starts with 1
            p_id = int(dets2use[i]) + 1
            person_ids[i] = self.tokenizer.convert_tokens_to_ids(
                ['<|det%d|>' % p_id])[0]
    subject_ids = [int(dets2use[i] in subjects) for i in range(len(obj_labels))]

    # add the image in the first visual sequence
    if self.add_image_as_a_box:
        features = np.row_stack((features_dict['image_features'], features))
        boxes = np.row_stack((np.array([0, 0, w, h]), boxes))
        segms = np.concatenate(
            (np.ones((1, 14, 14), dtype=np.float32), segms), 0)
        obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels
        person_ids = [self.tokenizer.convert_tokens_to_ids(['<|det0|>'])[0]
                      ] + person_ids
        subject_ids = [0] + subject_ids

    if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
        import ipdb
        ipdb.set_trace()
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    if not np.all(boxes[:, 2] <= w):
        boxes[:, 2] = np.clip(boxes[:, 2], None, w)
    if not np.all(boxes[:, 3] <= h):
        boxes[:, 3] = np.clip(boxes[:, 3], None, h)

    padded_features, padded_boxes, padded_obj_labels, padded_segments, box_masks = \
        _to_boxes_and_masks(features, boxes, obj_labels, segms,
                            self.num_max_boxes)
    person_ids = _pad_ids(person_ids, self.num_max_boxes)
    subject_ids = _pad_ids(subject_ids, self.num_max_boxes)

    features = torch.Tensor(padded_features)
    boxes = torch.Tensor(padded_boxes)
    boxes_mask = torch.LongTensor(box_masks)
    objects = torch.LongTensor(padded_obj_labels)
    segments = torch.Tensor(padded_segments)
    person_ids = torch.LongTensor(person_ids)
    subject_ids = torch.LongTensor(subject_ids)

    return (event_inference_example, labels, features, boxes, boxes_mask,
            objects, segments, person_ids, subject_ids)
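# ---------------------------------------------------------------------------
# `_pad_ids` is not shown in this file; below is a plausible minimal sketch of
# the fixed-length padding it performs (the name suffix and truncate-then-pad
# behavior are assumptions, not the repository's actual helper).
def _pad_ids_sketch(ids, max_len, pad_value=0):
    """Truncate `ids` to `max_len`, then right-pad with `pad_value`."""
    ids = list(ids[:max_len])
    return ids + [pad_value] * (max_len - len(ids))

# Example: _pad_ids_sketch([5, 9], 4) -> [5, 9, 0, 0]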
def __getitem_detector__(self, index):
    item = self.items[index]
    sample = {}
    if self.expanded and index >= self.train_size:
        image_file_name = "COCO_val2014_{:0>12d}.jpg".format(item['image_id'])
    else:
        image_file_name = "COCO_{}2014_{:0>12d}.jpg".format(
            self.split_name, item['image_id'])

    if "train" in image_file_name:
        image_file_path = os.path.join(self.data_root, "train2014",
                                       image_file_name)
    elif "val" in image_file_name:
        image_file_path = os.path.join(self.data_root, "val2014",
                                       image_file_name)

    ###################################################################
    # Most of this is adapted from VCR.
    # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
    image = load_image(image_file_path)
    image, window, img_scale, padding = resize_image(
        image, random_pad=self.is_train)
    image = to_tensor_and_normalize(image)
    c, h, w = image.shape

    ###################################################################
    metadata = self.masks[image_file_name]  # Get the metadata
    # Load boxes. We will use all detections.
    dets2use = np.arange(len(metadata['boxes']))

    # [nobj, 14, 14]
    segms = np.stack([
        make_mask(mask_size=14,
                  box=metadata['boxes'][i],
                  polygons_list=metadata['segms'][i]) for i in dets2use
    ])

    # Chop off the final dimension, that's the confidence
    boxes = np.array(metadata['boxes'])[dets2use, :-1]
    # Possibly rescale them if necessary
    boxes *= img_scale
    boxes[:, :2] += np.array(padding[:2])[None]
    boxes[:, 2:] += np.array(padding[:2])[None]

    try:
        # Strip wrappers from exported names if present; leave them as-is otherwise.
        metadata['names'] = [
            i.split(" ")[1][1:-1] for i in metadata["names"]
        ]
    except IndexError:
        pass
    obj_labels = [
        self.coco_obj_to_ind[metadata['names'][i]] for i in dets2use.tolist()
    ]

    boxes = np.row_stack((window, boxes))
    segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms), 0)
    obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

    sample['segms'] = ArrayField(segms, padding_value=0)
    sample['objects'] = ListField(
        [LabelField(x, skip_indexing=True) for x in obj_labels])

    if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
        import ipdb
        ipdb.set_trace()
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    assert np.all(boxes[:, 2] <= w)
    assert np.all(boxes[:, 3] <= h)
    sample['boxes'] = ArrayField(boxes, padding_value=-1)

    caption_a = item["caption"]
    imageID = item["image_id"]
    # A dummy field; it only exists so downstream code can infer the batch size.
    sample["label"] = sample['objects']

    if self.expanded and index >= self.train_size:
        coco = self.coco_val
    else:
        coco = self.coco
    rest_anns = coco.loadAnns(
        [i for i in coco.getAnnIds(imgIds=imageID) if i != item['id']])

    if self.args.get("two_sentence", True):
        if random.random() > 0.5:
            item_b = self.items[random.randint(0, len(self.items) - 1)]
            flag = False
        else:
            item_b = rest_anns[random.randint(0, len(rest_anns) - 1)]
            flag = True  # is next sentence

        caption_b = item_b["caption"]
        subword_tokens_a = self.tokenizer.tokenize(caption_a)
        subword_tokens_b = self.tokenizer.tokenize(caption_b)
        bert_example = InputExample(unique_id=index,
                                    text_a=subword_tokens_a,
                                    text_b=subword_tokens_b,
                                    is_correct=flag,
                                    max_seq_length=self.max_seq_length)
    elif not self.args.get("no_next_sentence", False):
        if random.random() < self.args.false_caption_ratio:
            item_b = self.items[random.randint(0, len(self.items) - 1)]
            while item_b["image_id"] == imageID:
                item_b = self.items[random.randint(0, len(self.items) - 1)]
            flag = False
        else:
            item_b = item
            flag = True  # is next sentence

        caption_b = item_b["caption"]
        subword_tokens_b = self.tokenizer.tokenize(caption_b)
        bert_example = InputExample(unique_id=index,
                                    text_a=subword_tokens_b,
                                    text_b=None,
                                    is_correct=flag,
                                    max_seq_length=self.max_seq_length)
    else:
        subword_tokens_a = self.tokenizer.tokenize(caption_a)
        bert_example = InputExample(unique_id=index,
                                    text_a=subword_tokens_a,
                                    text_b=None,
                                    is_correct=None,
                                    max_seq_length=self.max_seq_length)

    bert_feature = InputFeatures.convert_one_example_to_features_pretraining(
        example=bert_example,
        tokenizer=self.tokenizer,
        probability=self.masked_lm_prob)
    bert_feature.insert_field_into_dict(sample)

    return image, Instance(sample)
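# ---------------------------------------------------------------------------
# The two-sentence branch above implements BERT-style next-sentence sampling:
# with probability 0.5 the caption is paired with a random caption from the
# whole dataset (is_correct=False), otherwise with another annotation of the
# same image (is_correct=True). A standalone sketch of that decision; the
# helper below is hypothetical, not part of the class:
import random


def sample_sentence_pair(same_image_captions, all_captions):
    """Return (caption_b, is_next) under the 50/50 scheme used above."""
    if random.random() > 0.5:
        return random.choice(all_captions), False    # negative pair
    return random.choice(same_image_captions), True  # positive pair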
def __getitem__(self, index):
    # if self.split == 'test':
    #     raise ValueError("blind test mode not supported quite yet")
    item = deepcopy(self.items[index])

    ###################################################################
    # Load questions and answers
    if self.mode == 'rationale':
        conditioned_label = item['answer_label'] \
            if self.split != 'test' else self.conditioned_answer_choice
        item['question'] += item['answer_choices'][conditioned_label]

    answer_choices = item['{}_choices'.format(self.mode)]
    dets2use, old_det_to_new_ind = self._get_dets_to_use(item)

    ###################################################################
    # Load in BERT. We'll get contextual representations of the context and the answer choices
    # grp_items = {k: np.array(v, dtype=np.float16) for k, v in self.get_h5_group(index).items()}
    with h5py.File(self.h5fn, 'r') as h5:
        grp_items = {
            k: np.array(v, dtype=np.float16)
            for k, v in h5[str(index)].items()
        }

    # Essentially we need to condition on the right answer choice here, if we're doing QA->R.
    # We will always condition on the `conditioned_answer_choice`.
    condition_key = self.conditioned_answer_choice \
        if self.split == "test" and self.mode == "rationale" else ""

    instance_dict = {}
    if 'endingonly' not in self.embs_to_load:
        questions_tokenized, question_tags = zip(*[
            _fix_tokenization(
                item['question'],
                grp_items[f'ctx_{self.mode}{condition_key}{i}'],
                old_det_to_new_ind,
                item['objects'],
                token_indexers=self.token_indexers,
                pad_ind=0 if self.add_image_as_a_box else -1)
            for i in range(4)
        ])
        instance_dict['question'] = ListField(questions_tokenized)
        instance_dict['question_tags'] = ListField(question_tags)

    answers_tokenized, answer_tags = zip(*[
        _fix_tokenization(
            answer,
            grp_items[f'answer_{self.mode}{condition_key}{i}'],
            old_det_to_new_ind,
            item['objects'],
            token_indexers=self.token_indexers,
            pad_ind=0 if self.add_image_as_a_box else -1)
        for i, answer in enumerate(answer_choices)
    ])
    instance_dict['answers'] = ListField(answers_tokenized)
    instance_dict['answer_tags'] = ListField(answer_tags)

    if self.split != 'test':
        instance_dict['label'] = LabelField(
            item['{}_label'.format(self.mode)], skip_indexing=True)
    instance_dict['metadata'] = MetadataField({
        'annot_id': item['annot_id'],
        'ind': index,
        'movie': item['movie'],
        'img_fn': item['img_fn'],
        'question_number': item['question_number'],
        'img_id': item['img_id']
    })

    ###################################################################
    # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
    image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
    image, window, img_scale, padding = resize_image(
        image, random_pad=self.is_train)
    image = to_tensor_and_normalize(image)
    c, h, w = image.shape

    ###################################################################
    # Load boxes.
    with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
        metadata = json.load(f)

    # [nobj, 14, 14]
    segms = np.stack([
        make_mask(mask_size=14,
                  box=metadata['boxes'][i],
                  polygons_list=metadata['segms'][i]) for i in dets2use
    ])

    # Chop off the final dimension, that's the confidence
    boxes = np.array(metadata['boxes'])[dets2use, :-1]
    # Possibly rescale them if necessary
    boxes *= img_scale
    boxes[:, :2] += np.array(padding[:2])[None]
    boxes[:, 2:] += np.array(padding[:2])[None]

    obj_labels = [
        self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()
    ]
    if self.add_image_as_a_box:
        boxes = np.row_stack((window, boxes))
        segms = np.concatenate(
            (np.ones((1, 14, 14), dtype=np.float32), segms), 0)
        obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

    instance_dict['segms'] = ArrayField(segms, padding_value=0)
    instance_dict['objects'] = ListField(
        [LabelField(x, skip_indexing=True) for x in obj_labels])

    if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
        import ipdb
        ipdb.set_trace()
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    assert np.all(boxes[:, 2] <= w)
    assert np.all(boxes[:, 3] <= h)
    instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

    instance = Instance(instance_dict)
    instance.index_fields(self.vocab)
    return image, instance
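# ---------------------------------------------------------------------------
# The precomputed BERT embeddings above are stored one HDF5 group per example,
# with per-choice datasets keyed `ctx_{mode}{condition_key}{i}` and
# `answer_{mode}{condition_key}{i}`. A sketch of producing a compatible file;
# the sequence length and hidden size below are assumptions for illustration.
import h5py
import numpy as np


def write_demo_h5(path, mode='answer', n_choices=4, seq_len=12, dim=768):
    with h5py.File(path, 'w') as h5:
        grp = h5.create_group('0')  # group name is the example index as a string
        for i in range(n_choices):
            grp.create_dataset(f'ctx_{mode}{i}',
                               data=np.zeros((seq_len, dim), dtype=np.float16))
            grp.create_dataset(f'answer_{mode}{i}',
                               data=np.zeros((seq_len, dim), dtype=np.float16))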
def __getitem__(self, index):
    item = deepcopy(self.items[index])

    ###################################################################
    # Load questions and answers
    if self.mode == 'rationale':
        item['question'] += item['answer_choices'][item['answer_label']]
    elif self.mode == 'joint':
        item['joint_choices'] = [a + r for a in item['answer_choices']
                                 for r in item['rationale_choices']]
        if self.split != 'test':
            item['joint_label'] = item['answer_label'] * 4 + \
                item['rationale_label']

    answer_choices = item['{}_choices'.format(self.mode)]
    dets2use, old_det_to_new_ind = self._get_dets_to_use(item)

    ###################################################################
    # Load in BERT. We'll get contextual representations of the context and the answer choices
    with h5py.File(self.h5fn, 'r') as h5:
        grp_items = {
            k: np.array(v, dtype=np.float16)
            for k, v in h5[str(index)].items()
        }

    omcs_items = None
    if self.h5fn_omcs is not None:
        with h5py.File(self.h5fn_omcs, 'r') as h5_omcs:
            omcs_items = {
                k: np.array(v, dtype=np.float16)
                for k, v in h5_omcs[str(index)].items()
            }

    if self.all_answers_for_rationale:
        # Keys in the h5 file have the format [ctx|answer]_rationale[i][j].
        # Pick i based on the answer_label set.
        assert self.mode == 'rationale'
        answer_label = item['answer_label']
        key = f'{self.mode}{answer_label}'
    else:
        # Keys have the format [ctx|answer]_mode[j]
        key = f'{self.mode}'

    instance_dict = {}
    if 'endingonly' not in self.embs_to_load:
        if omcs_items is None:
            ctx_embs = [
                grp_items[f'ctx_{key}{j}']
                for j in range(len(answer_choices))
            ]
        else:
            ctx_embs = [
                np.hstack([
                    grp_items[f'ctx_{key}{j}'], omcs_items[f'ctx_{key}{j}']
                ]) for j in range(len(answer_choices))
            ]
        questions_tokenized, question_tags = zip(*[
            _fix_tokenization(item['question'],
                              ctx_embs[j],
                              old_det_to_new_ind,
                              item['objects'],
                              token_indexers=self.token_indexers,
                              pad_ind=0 if self.add_image_as_a_box else -1)
            for j in range(len(answer_choices))
        ])
        instance_dict['question'] = ListField(questions_tokenized)
        instance_dict['question_tags'] = ListField(question_tags)

    if omcs_items is None:
        answer_embs = [
            grp_items[f'answer_{key}{j}'] for j in range(len(answer_choices))
        ]
    else:
        answer_embs = [
            np.hstack([
                grp_items[f'answer_{key}{j}'], omcs_items[f'answer_{key}{j}']
            ]) for j in range(len(answer_choices))
        ]
    answers_tokenized, answer_tags = zip(*[
        _fix_tokenization(answer,
                          answer_embs[j],
                          old_det_to_new_ind,
                          item['objects'],
                          token_indexers=self.token_indexers,
                          pad_ind=0 if self.add_image_as_a_box else -1)
        for j, answer in enumerate(answer_choices)
    ])
    instance_dict['answers'] = ListField(answers_tokenized)
    instance_dict['answer_tags'] = ListField(answer_tags)

    if self.split != 'test':
        instance_dict['label'] = LabelField(
            item['{}_label'.format(self.mode)], skip_indexing=True)
    instance_dict['metadata'] = MetadataField({
        'annot_id': item['annot_id'],
        'ind': index,
        'movie': item['movie'],
        'img_fn': item['img_fn'],
        'question_number': item['question_number']
    })

    ###################################################################
    # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
    image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
    image, window, img_scale, padding = resize_image(
        image, random_pad=self.is_train)
    image = to_tensor_and_normalize(image)
    c, h, w = image.shape

    ###################################################################
    # Load boxes.
    with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
        metadata = json.load(f)

    # [nobj, 14, 14]
    segms = np.stack([
        make_mask(mask_size=14,
                  box=metadata['boxes'][i],
                  polygons_list=metadata['segms'][i]) for i in dets2use
    ])

    # Chop off the final dimension, that's the confidence
    boxes = np.array(metadata['boxes'])[dets2use, :-1]
    # Possibly rescale them if necessary
    boxes *= img_scale
    boxes[:, :2] += np.array(padding[:2])[None]
    boxes[:, 2:] += np.array(padding[:2])[None]

    obj_labels = [
        self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()
    ]
    if self.add_image_as_a_box:
        boxes = np.row_stack((window, boxes))
        segms = np.concatenate(
            (np.ones((1, 14, 14), dtype=np.float32), segms), 0)
        obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

    instance_dict['segms'] = ArrayField(segms, padding_value=0)
    instance_dict['objects'] = ListField(
        [LabelField(x, skip_indexing=True) for x in obj_labels])

    if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
        import ipdb
        ipdb.set_trace()
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    assert np.all(boxes[:, 2] <= w)
    assert np.all(boxes[:, 3] <= h)
    instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

    instance = Instance(instance_dict)
    # instance.index_fields(self.vocab)
    return image, instance
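# ---------------------------------------------------------------------------
# When `h5fn_omcs` is set, the BERT embedding and the OMCS (commonsense)
# embedding for the same key are fused by concatenation along the feature
# axis, which is all the `np.hstack` above does. A one-line illustration;
# the OMCS dimension below is a made-up placeholder:
import numpy as np

bert_emb = np.zeros((12, 768), dtype=np.float16)  # [seq_len, bert_dim]
omcs_emb = np.zeros((12, 300), dtype=np.float16)  # [seq_len, omcs_dim] (assumed)
fused = np.hstack([bert_emb, omcs_emb])           # -> shape (12, 1068)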
def __getitem__(self, index):
    # if self.split == 'test':
    #     raise ValueError("blind test mode not supported quite yet")
    item = deepcopy(self.items[index])

    ###################################################################
    # Load questions and answers
    if self.mode == 'rationale':
        conditioned_label = item['answer_label'] \
            if self.split != 'test' else self.conditioned_answer_choice
        item['question'] += item['answer_choices'][conditioned_label]

    answer_choices = item['{}_choices'.format(self.mode)]
    dets2use, old_det_to_new_ind = self._get_dets_to_use(item)

    ###################################################################
    # Load in BERT. We'll get contextual representations of the context and the answer choices
    # grp_items = {k: np.array(v, dtype=np.float16) for k, v in self.get_h5_group(index).items()}
    with h5py.File(self.h5fn, 'r') as h5:
        grp_items = {
            k: np.array(v, dtype=np.float16)
            for k, v in h5[str(index)].items()
        }

    # Essentially we need to condition on the right answer choice here, if we're doing QA->R.
    # We will always condition on the `conditioned_answer_choice`.
    condition_key = self.conditioned_answer_choice \
        if self.split == "test" and self.mode == "rationale" else ""

    instance_dict = {}
    if 'endingonly' not in self.embs_to_load:
        questions_tokenized, question_tags = zip(*[
            _fix_tokenization(
                item['question'],
                grp_items[f'ctx_{self.mode}{condition_key}{i}'],
                old_det_to_new_ind,
                item['objects'],
                token_indexers=self.token_indexers,
                pad_ind=0 if self.add_image_as_a_box else -1)
            for i in range(4)
        ])
        instance_dict['question'] = ListField(questions_tokenized)
        instance_dict['question_tags'] = ListField(question_tags)

    answers_tokenized, answer_tags = zip(*[
        _fix_tokenization(
            answer,
            grp_items[f'answer_{self.mode}{condition_key}{i}'],
            old_det_to_new_ind,
            item['objects'],
            token_indexers=self.token_indexers,
            pad_ind=0 if self.add_image_as_a_box else -1)
        for i, answer in enumerate(answer_choices)
    ])
    instance_dict['answers'] = ListField(answers_tokenized)
    instance_dict['answer_tags'] = ListField(answer_tags)

    if self.split != 'test':
        instance_dict['label'] = LabelField(
            item['{}_label'.format(self.mode)], skip_indexing=True)
    instance_dict['metadata'] = MetadataField({
        'annot_id': item['annot_id'],
        'ind': index,
        'movie': item['movie'],
        'img_fn': item['img_fn'],
        'question_number': item['question_number']
    })

    ###################################################################
    # Load the precomputed region features instead of the raw image.
    image_id = self.path2id[item['img_fn']]
    image_id_gt = self.path2id_gt[item['img_fn']]

    features, num_boxes, boxes, _ = self._image_features_reader[image_id]
    boxes = boxes[:num_boxes]
    features = features[:num_boxes]

    gt_features, gt_num_boxes, gt_boxes, _ = \
        self._gt_image_features_reader[image_id_gt]

    # Merge the two whole-image features by a box-count-weighted mean.
    features[0] = (features[0] * num_boxes + gt_features[0] * gt_num_boxes) \
        / (num_boxes + gt_num_boxes)

    # Merge the two box sets, and assign the labels.
    # Drop the global box from the ground-truth set; it was merged in above.
    gt_boxes = gt_boxes[1:gt_num_boxes]
    gt_features = gt_features[1:gt_num_boxes]
    gt_num_boxes = gt_num_boxes - 1

    gt_box_preserve = min(self._max_region_num - 1, gt_num_boxes)
    gt_boxes = gt_boxes[:gt_box_preserve]
    gt_features = gt_features[:gt_box_preserve]
    gt_num_boxes = gt_box_preserve

    num_box_preserve = min(self._max_region_num - int(gt_num_boxes),
                           int(num_boxes))
    boxes = boxes[:num_box_preserve]
    features = features[:num_box_preserve]

    # concatenate the boxes
    mix_boxes = np.concatenate((boxes, gt_boxes), axis=0)
    mix_features = np.concatenate((features, gt_features), axis=0)
    mix_num_boxes = num_box_preserve + int(gt_num_boxes)

    image_mask = [1] * mix_num_boxes
    while len(image_mask) < self._max_region_num:
        image_mask.append(0)

    # Pad boxes and features out to the fixed region budget. (These two
    # arrays were commented out upstream, but `spatials` depends on them.)
    mix_boxes_pad = np.zeros((self._max_region_num, 5))
    mix_features_pad = np.zeros((self._max_region_num, 3072))
    mix_boxes_pad[:mix_num_boxes] = mix_boxes[:mix_num_boxes]
    mix_features_pad[:mix_num_boxes] = mix_features[:mix_num_boxes]

    # Convert the padded features and mask to tensors.
    features = torch.tensor(mix_features_pad).float()
    image_mask = torch.tensor(image_mask).long()
    spatials = mix_boxes_pad

    # Load boxes.
    with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
        metadata = json.load(f)

    # [nobj, 14, 14]
    segms = np.stack([
        make_mask(mask_size=14,
                  box=metadata['boxes'][i],
                  polygons_list=metadata['segms'][i]) for i in dets2use
    ])

    # NOTE: the raw-image box rescaling, image-as-a-box stacking, and bounds
    # assertions used by the other loaders are not needed here; the boxes
    # come pre-scaled from the feature readers.
    obj_labels = [
        self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()
    ]

    instance_dict['segms'] = ArrayField(segms, padding_value=0)
    instance_dict['objects'] = ListField(
        [LabelField(x, skip_indexing=True) for x in obj_labels])
    instance_dict['boxes'] = ArrayField(spatials, padding_value=-1)
    instance_dict['box_mask'] = ArrayField(image_mask, padding_value=-1)

    instance = Instance(instance_dict)
    instance.index_fields(self.vocab)
    return features, instance
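# ---------------------------------------------------------------------------
# The merge above gives ground-truth regions priority within the fixed region
# budget: up to `max_region_num - 1` GT boxes are kept, and detector boxes
# fill whatever budget remains. A minimal standalone sketch of that
# bookkeeping (hypothetical helper, mirroring the logic in the loader):
def region_budget(num_boxes, gt_num_boxes, max_region_num):
    """Return (n_det_kept, n_gt_kept) under the GT-first budget used above."""
    n_gt = min(max_region_num - 1, gt_num_boxes)
    n_det = min(max_region_num - n_gt, num_boxes)
    return n_det, n_gt

# Example: with 30 detector boxes, 5 GT boxes, and a budget of 20 regions,
# region_budget(30, 5, 20) -> (15, 5).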