def __getitem__(self, index):
    """Load one image together with its detected boxes.

    The image is loaded from ``self.img_id_2_image_folder[img_id]``, resized
    without random padding and normalized.  Boxes are read from the JSON file
    at ``self.img_id_2_meta_folder[img_id]`` (key ``'boxes'``), scaled by the
    resize factor and shifted by the first two padding values.

    Args:
        index: Position into ``self.img_ids``.

    Returns:
        A tuple ``(image, instance, numeric_id)`` where ``instance`` holds a
        single ``'boxes'`` field and ``numeric_id`` is the integer parsed from
        the text after the last ``'-'`` in the image id.

    Raises:
        AssertionError: If a box violates the bounds checks below.
    """
    img_id = self.img_ids[index]

    image = load_image(self.img_id_2_image_folder[img_id])
    image, window, img_scale, padding = resize_image(image, random_pad=False)
    image = to_tensor_and_normalize(image)
    _, h, w = image.shape

    ###################################################################
    # Load boxes.
    with open(self.img_id_2_meta_folder[img_id], 'r') as f:
        metadata = json.load(f)

    # Chop off the final dimension, that's the confidence
    boxes = np.array(metadata['boxes'])[:, :-1]

    # Bring the boxes into the resized/padded image frame.
    boxes *= img_scale
    offset = np.array(padding[:2])[None]
    boxes[:, :2] += offset
    boxes[:, 2:] += offset

    if self.add_image_as_a_box:
        # np.vstack is the non-deprecated spelling of np.row_stack.
        boxes = np.vstack((window, boxes))

    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    assert np.all((boxes[:, 2] <= w))
    assert np.all((boxes[:, 3] <= h))

    instance = Instance({'boxes': ArrayField(boxes, padding_value=-1)})

    # Parse the numeric suffix once; the former debug print for one
    # hard-coded id has been dropped.
    numeric_id = int(img_id.split('-')[-1])
    return image, instance, numeric_id
def __getimage__(self, image_file_path):
    """Load an image and describe it with a single whole-image box.

    Args:
        image_file_path: Path of the image.  If it contains ``'.npz'`` the
            last extension is stripped before loading.

    Returns:
        A tuple ``(image, sample)``.  ``sample['boxes']`` is a
        ``torch.Tensor`` built from the resize window alone and
        ``sample['objects']`` is an ``IntArrayField`` holding the single
        label ``0``.

    Raises:
        AssertionError: If the window box fails the bounds checks below.
    """
    ###################################################################
    # Most of things adapted from VCR
    # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
    if '.npz' in image_file_path:
        image_file_path = os.path.splitext(image_file_path)[0]

    image = load_image(image_file_path)
    image, window, img_scale, padding = resize_image(
        image, random_pad=self.is_train)
    image = to_tensor_and_normalize(image)
    _, h, w = image.shape

    ###################################################################
    # Consider the entire image as a whole detected box
    boxes = np.array([window])
    obj_labels = [0]

    sample = {
        "objects": IntArrayField(np.array(obj_labels)),
        'boxes': torch.Tensor(boxes),
    }

    # The interactive ipdb breakpoint that used to guard the first-column
    # check was removed: a debugger prompt cannot be answered from a
    # non-interactive data-loading process, and the same condition is
    # asserted here anyway.
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))
    assert np.all((boxes[:, 2] <= w))
    assert np.all((boxes[:, 3] <= h))

    return image, sample
def __getitem__(self, index):
    """Build one pretraining instance (image, boxes, masks, text fields).

    When ``self.complete_shuffle`` is set, ``index`` is a flat index over
    (item, choice) pairs with 8 choices per item if
    ``self.pretraining_include_qa_and_qar`` is set and 4 otherwise; it is
    split into the item index and ``which`` (the choice).  Otherwise
    ``which`` is ``None`` and ``index`` addresses ``self.items`` directly.

    Args:
        index: Dataset index as described above.

    Returns:
        A tuple ``(image, instance)``; ``instance`` has already been indexed
        with ``self.vocab``.

    Raises:
        AssertionError: If a box violates the bounds checks at the end.
    """
    if self.complete_shuffle:
        choices_per_item = 8 if self.pretraining_include_qa_and_qar else 4
        # Take quotient and remainder of the ORIGINAL flat index.  The
        # previous code reduced ``index`` first and then took the remainder
        # of the reduced value, so ``which`` did not enumerate the choices.
        index, which = divmod(index, choices_per_item)
    else:
        which = None

    item = deepcopy(self.items[index])

    ###################################################################
    # Load questions and answers
    only_use_answer = bool(self.complete_shuffle and which < 4)
    only_use_qar = bool(self.complete_shuffle and which >= 4)

    dets2use, old_det_to_new_ind = self._get_dets_to_use(
        item, only_use_answer=only_use_answer, only_use_qar=only_use_qar)

    # The only_use_qar is ambiguous...
    instance_dict = {}
    if self.split != 'test':
        instance_dict['label'] = LabelField(
            item['{}_label'.format(self.mode)], skip_indexing=True)
    instance_dict['metadata'] = MetadataField({
        'annot_id': item['annot_id'],
        'ind': index,
        'movie': item['movie'],
        'img_fn': item['img_fn'],
        'question_number': item['question_number']
    })

    ###################################################################
    # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
    image = load_image(os.path.join(self.vcr_image_dir, item['img_fn']))
    image, window, img_scale, padding = resize_image(
        image, random_pad=self.is_train)
    image = to_tensor_and_normalize(image)
    _, h, w = image.shape

    ###################################################################
    # Load boxes.
    with open(os.path.join(self.vcr_image_dir, item['metadata_fn']),
              'r') as f:
        metadata = json.load(f)

    # [nobj, 14, 14]
    segms = np.stack([
        make_mask(mask_size=14,
                  box=metadata['boxes'][i],
                  polygons_list=metadata['segms'][i]) for i in dets2use
    ])

    # Chop off the final dimension, that's the confidence
    boxes = np.array(metadata['boxes'])[dets2use, :-1]
    # Bring the boxes into the resized/padded image frame.
    boxes *= img_scale
    offset = np.array(padding[:2])[None]
    boxes[:, :2] += offset
    boxes[:, 2:] += offset

    obj_labels = [
        self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()
    ]
    if self.add_image_as_a_box:
        # Prepend the whole image as an extra "background" detection.
        boxes = np.vstack((window, boxes))
        segms = np.concatenate(
            (np.ones((1, 14, 14), dtype=np.float32), segms), 0)
        obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

    examples = data_iter_item(
        item,
        tokenizer=self.tokenizer,
        max_seq_length=self.max_seq_length,
        endingonly=False,
        include_qar=self.pretraining_include_qa_and_qar,
        only_qar=self.only_qar)
    self.getitem_bert_part(examples, item, instance_dict, which)

    if self.use_alignment:
        # Alignment between objects and text
        def alignment_field(raw_a, raw_b, align_a, align_b):
            # One IntArrayField per box, filled by get_alignment_original.
            box_record = defaultdict(list)
            self.get_alignment_original(raw_a, align_a,
                                        old_det_to_new_ind, box_record,
                                        offset=1)
            self.get_alignment_original(raw_b, align_b,
                                        old_det_to_new_ind, box_record,
                                        offset=1 + len(align_a) + 1)
            return ListField([
                IntArrayField(np.array(box_record[i]), padding_value=-1)
                for i in range(len(boxes))
            ])

        alignment_pack = []
        for i, example in enumerate(examples):
            if self.pretraining_include_qa_and_qar:
                if i < 4:
                    raw_text_a = item["question"]
                    raw_text_b = item['answer_choices'][i]
                else:
                    raw_text_a = item["question"] + item['answer_choices'][
                        item['answer_label']]
                    raw_text_b = item['rationale_choices'][i - 4]
            elif self.only_qar:
                # This is the correct alignment right now.
                raw_text_a = item["question"] + item['answer_choices'][
                    item['answer_label']]
                raw_text_b = item['rationale_choices'][i]
            else:
                raw_text_a = item["question"]
                raw_text_b = item['answer_choices'][i]
            alignment_pack.append(
                (raw_text_a, raw_text_b, example[1], example[2]))

        if which is not None:
            image_text_alignment = alignment_field(*alignment_pack[which])
        else:
            image_text_alignment = ListField(
                [alignment_field(*pack) for pack in alignment_pack])

        instance_dict["image_text_alignment"] = image_text_alignment

    instance_dict['segms'] = ArrayField(segms, padding_value=0)
    instance_dict['objects'] = ListField(
        [LabelField(x, skip_indexing=True) for x in obj_labels])

    # Formerly an interactive ipdb breakpoint; a debugger prompt cannot be
    # answered from a non-interactive data-loading process, so the
    # first-column check now fails the same way as the checks below.
    assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    assert np.all((boxes[:, 2] <= w))
    assert np.all((boxes[:, 3] <= h))
    instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

    instance = Instance(instance_dict)
    instance.index_fields(self.vocab)
    return image, instance
def __getitem_detector__(self, index):
    """Build one caption-pretraining sample with detector boxes and masks.

    The image file name is derived from the item's ``image_id``; items at
    ``index >= self.train_size`` of an expanded dataset use the ``val2014``
    naming and ``self.coco_val`` annotations.  Boxes, masks and object names
    come from ``self.masks[image_file_name]``; the whole image is always
    prepended as a ``'__background__'`` detection.

    Depending on ``self.args`` the text side is either a caption pair
    (``two_sentence``), a single possibly-mismatched caption, or the item's
    own caption (``no_next_sentence``).

    Args:
        index: Position into ``self.items``.

    Returns:
        A tuple ``(image, Instance(sample))``.

    Raises:
        ValueError: If the image file name matches neither the train nor the
            val split (previously surfaced as an unbound-local error).
        AssertionError: If a box violates the bounds checks.
    """
    item = self.items[index]
    sample = {}

    from_val_extension = self.expanded and index >= self.train_size
    if from_val_extension:
        image_file_name = "COCO_val2014_{:0>12d}.jpg".format(
            item['image_id'])
    else:
        image_file_name = "COCO_{}2014_{:0>12d}.jpg".format(
            self.split_name, item['image_id'])

    if "train" in image_file_name:
        image_file_path = os.path.join(self.data_root, "train2014",
                                       image_file_name)
    elif "val" in image_file_name:
        image_file_path = os.path.join(self.data_root, "val2014",
                                       image_file_name)
    else:
        raise ValueError(
            "Cannot locate image folder for {!r}".format(image_file_name))

    ###################################################################
    # Most of things adapted from VCR
    # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
    image = load_image(image_file_path)
    image, window, img_scale, padding = resize_image(
        image, random_pad=self.is_train)
    image = to_tensor_and_normalize(image)
    _, h, w = image.shape

    ###################################################################
    metadata = self.masks[image_file_name]  # Get the metadata

    # Load boxes.
    # We will use all detections
    dets2use = np.arange(len(metadata['boxes']))
    # [nobj, 14, 14]
    segms = np.stack([
        make_mask(mask_size=14,
                  box=metadata['boxes'][i],
                  polygons_list=metadata['segms'][i]) for i in dets2use
    ])

    # Chop off the final dimension, that's the confidence
    boxes = np.array(metadata['boxes'])[dets2use, :-1]
    # Bring the boxes into the resized/padded image frame.
    boxes *= img_scale
    offset = np.array(padding[:2])[None]
    boxes[:, :2] += offset
    boxes[:, 2:] += offset

    # Best-effort clean-up of the name strings, written back into the shared
    # metadata dict exactly as before.  Names that do not parse are left
    # untouched.  ``except Exception`` (instead of a bare ``except``) keeps
    # that behavior without swallowing KeyboardInterrupt/SystemExit.
    try:
        metadata['names'] = [
            i.split(" ")[1][1:-1] for i in metadata["names"]
        ]
    except Exception:
        pass
    obj_labels = [
        self.coco_obj_to_ind[metadata['names'][i]]
        for i in dets2use.tolist()
    ]

    # Prepend the whole image as an extra "background" detection.
    boxes = np.vstack((window, boxes))
    segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms),
                           0)
    obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

    sample['segms'] = ArrayField(segms, padding_value=0)
    sample['objects'] = ListField(
        [LabelField(x, skip_indexing=True) for x in obj_labels])

    # Formerly an interactive ipdb breakpoint; a debugger prompt cannot be
    # answered from a non-interactive data-loading process, so the
    # first-column check now fails the same way as the checks below.
    assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    assert np.all((boxes[:, 2] <= w))
    assert np.all((boxes[:, 3] <= h))
    sample['boxes'] = ArrayField(boxes, padding_value=-1)

    caption_a = item["caption"]
    imageID = item["image_id"]

    # This is an useless field. Just so that they know the batch size.
    sample["label"] = sample['objects']

    coco = self.coco_val if from_val_extension else self.coco
    rest_anns = coco.loadAnns(
        [i for i in coco.getAnnIds(imgIds=imageID) if i != item['id']])

    if self.args.get("two_sentence", True):
        if random.random() > 0.5:
            item_b = self.items[random.randint(0, len(self.items) - 1)]
            flag = False
        else:
            # NOTE(review): random.randint raises ValueError when rest_anns
            # is empty (image with a single caption) -- confirm the data
            # guarantees at least two captions per image.
            item_b = rest_anns[random.randint(0, len(rest_anns) - 1)]
            flag = True  # is next sentence

        caption_b = item_b["caption"]
        subword_tokens_a = self.tokenizer.tokenize(caption_a)
        subword_tokens_b = self.tokenizer.tokenize(caption_b)
        bert_example = InputExample(unique_id=index,
                                    text_a=subword_tokens_a,
                                    text_b=subword_tokens_b,
                                    is_correct=flag,
                                    max_seq_length=self.max_seq_length)
    elif not self.args.get("no_next_sentence", False):
        if random.random() < self.args.false_caption_ratio:
            # Draw a caption that belongs to a different image.
            item_b = self.items[random.randint(0, len(self.items) - 1)]
            while item_b["image_id"] == imageID:
                item_b = self.items[random.randint(0, len(self.items) - 1)]
            flag = False
        else:
            item_b = item
            flag = True  # is next sentence

        caption_b = item_b["caption"]
        subword_tokens_b = self.tokenizer.tokenize(caption_b)
        bert_example = InputExample(unique_id=index,
                                    text_a=subword_tokens_b,
                                    text_b=None,
                                    is_correct=flag,
                                    max_seq_length=self.max_seq_length)
    else:
        subword_tokens_a = self.tokenizer.tokenize(caption_a)
        bert_example = InputExample(unique_id=index,
                                    text_a=subword_tokens_a,
                                    text_b=None,
                                    is_correct=None,
                                    max_seq_length=self.max_seq_length)

    bert_feature = InputFeatures.convert_one_example_to_features_pretraining(
        example=bert_example,
        tokenizer=self.tokenizer,
        probability=self.masked_lm_prob)
    bert_feature.insert_field_into_dict(sample)

    return image, Instance(sample)
def __getitem__(self, index):
    """Build one VCR instance from precomputed BERT features plus the image.

    In ``'rationale'`` mode the question is extended with the conditioned
    answer: the gold ``answer_label`` outside the test split,
    ``self.conditioned_answer_choice`` on the test split.  Contextual
    embeddings are read from group ``str(index)`` of the HDF5 file
    ``self.h5fn``.

    Args:
        index: Position into ``self.items`` (also the HDF5 group name).

    Returns:
        A tuple ``(image, instance)``; ``instance`` has already been indexed
        with ``self.vocab``.

    Raises:
        AssertionError: If a box violates the bounds checks at the end.
    """
    item = deepcopy(self.items[index])

    ###################################################################
    # Load questions and answers
    if self.mode == 'rationale':
        conditioned_label = (item['answer_label'] if self.split != 'test'
                             else self.conditioned_answer_choice)
        item['question'] += item['answer_choices'][conditioned_label]

    answer_choices = item['{}_choices'.format(self.mode)]
    dets2use, old_det_to_new_ind = self._get_dets_to_use(item)

    ###################################################################
    # Load in BERT. We'll get contextual representations of the context and the answer choices
    with h5py.File(self.h5fn, 'r') as h5:
        grp_items = {
            k: np.array(v, dtype=np.float16)
            for k, v in h5[str(index)].items()
        }

    # Essentially we need to condition on the right answer choice here, if we're doing QA->R. We will always
    # condition on the `conditioned_answer_choice.`
    condition_key = (self.conditioned_answer_choice
                     if self.split == "test" and self.mode == "rationale"
                     else "")
    pad_ind = 0 if self.add_image_as_a_box else -1

    instance_dict = {}
    if 'endingonly' not in self.embs_to_load:
        # One question context per answer choice (was a hard-coded
        # range(4); the answers below already follow len(answer_choices)).
        questions_tokenized, question_tags = zip(*[
            _fix_tokenization(
                item['question'],
                grp_items[f'ctx_{self.mode}{condition_key}{i}'],
                old_det_to_new_ind,
                item['objects'],
                token_indexers=self.token_indexers,
                pad_ind=pad_ind) for i in range(len(answer_choices))
        ])
        instance_dict['question'] = ListField(questions_tokenized)
        instance_dict['question_tags'] = ListField(question_tags)

    answers_tokenized, answer_tags = zip(*[
        _fix_tokenization(
            answer,
            grp_items[f'answer_{self.mode}{condition_key}{i}'],
            old_det_to_new_ind,
            item['objects'],
            token_indexers=self.token_indexers,
            pad_ind=pad_ind) for i, answer in enumerate(answer_choices)
    ])
    instance_dict['answers'] = ListField(answers_tokenized)
    instance_dict['answer_tags'] = ListField(answer_tags)

    if self.split != 'test':
        instance_dict['label'] = LabelField(
            item['{}_label'.format(self.mode)], skip_indexing=True)
    instance_dict['metadata'] = MetadataField({
        'annot_id': item['annot_id'],
        'ind': index,
        'movie': item['movie'],
        'img_fn': item['img_fn'],
        'question_number': item['question_number'],
        'img_id': item['img_id']
    })

    ###################################################################
    # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
    image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
    image, window, img_scale, padding = resize_image(
        image, random_pad=self.is_train)
    image = to_tensor_and_normalize(image)
    _, h, w = image.shape

    ###################################################################
    # Load boxes.
    with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
        metadata = json.load(f)

    # [nobj, 14, 14]
    segms = np.stack([
        make_mask(mask_size=14,
                  box=metadata['boxes'][i],
                  polygons_list=metadata['segms'][i]) for i in dets2use
    ])

    # Chop off the final dimension, that's the confidence
    boxes = np.array(metadata['boxes'])[dets2use, :-1]
    # Bring the boxes into the resized/padded image frame.
    boxes *= img_scale
    offset = np.array(padding[:2])[None]
    boxes[:, :2] += offset
    boxes[:, 2:] += offset

    obj_labels = [
        self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()
    ]
    if self.add_image_as_a_box:
        # Prepend the whole image as an extra "background" detection.
        boxes = np.vstack((window, boxes))
        segms = np.concatenate(
            (np.ones((1, 14, 14), dtype=np.float32), segms), 0)
        obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

    instance_dict['segms'] = ArrayField(segms, padding_value=0)
    instance_dict['objects'] = ListField(
        [LabelField(x, skip_indexing=True) for x in obj_labels])

    # Formerly an interactive ipdb breakpoint; a debugger prompt cannot be
    # answered from a non-interactive data-loading process, so the
    # first-column check now fails the same way as the checks below.
    assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    assert np.all((boxes[:, 2] <= w))
    assert np.all((boxes[:, 3] <= h))
    instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

    instance = Instance(instance_dict)
    instance.index_fields(self.vocab)
    return image, instance
def __getitem__(self, index):
    """Build one VCR instance, optionally stacking OMCS features.

    Modes:
      * ``'rationale'``: the gold answer is appended to the question.
      * ``'joint'``: every (answer, rationale) pair becomes one choice and,
        outside the test split, the label is the pair's position in that
        answer-major list.

    Contextual embeddings are read from group ``str(index)`` of
    ``self.h5fn``; if ``self.h5fn_omcs`` is set, the matching arrays from
    that file are ``np.hstack``-ed onto them.

    Args:
        index: Position into ``self.items`` (also the HDF5 group name).

    Returns:
        A tuple ``(image, instance)``.  The instance is NOT indexed with the
        vocabulary here.

    Raises:
        AssertionError: If ``self.all_answers_for_rationale`` is set outside
            ``'rationale'`` mode, or a box violates the bounds checks.
    """
    item = deepcopy(self.items[index])

    ###################################################################
    # Load questions and answers
    if self.mode == 'rationale':
        item['question'] += item['answer_choices'][item['answer_label']]
    elif self.mode == 'joint':
        item['joint_choices'] = [a + r for a in item['answer_choices']
                                 for r in item['rationale_choices']]
        if self.split != 'test':
            # Row-major position of (answer, rationale) in joint_choices;
            # the stride is the number of rationales (was hard-coded 4).
            item['joint_label'] = (
                item['answer_label'] * len(item['rationale_choices'])
                + item['rationale_label'])

    answer_choices = item['{}_choices'.format(self.mode)]
    dets2use, old_det_to_new_ind = self._get_dets_to_use(item)

    ###################################################################
    # Load in BERT. We'll get contextual representations of the context and the answer choices
    with h5py.File(self.h5fn, 'r') as h5:
        grp_items = {
            k: np.array(v, dtype=np.float16)
            for k, v in h5[str(index)].items()
        }

    omcs_items = None
    if self.h5fn_omcs is not None:
        with h5py.File(self.h5fn_omcs, 'r') as h5_omcs:
            omcs_items = {
                k: np.array(v, dtype=np.float16)
                for k, v in h5_omcs[str(index)].items()
            }

    if self.all_answers_for_rationale:
        # Keys in h5 file are in format [ctx|answer]_rationale[i][j].
        # Pick i based on the answer_label set.
        assert self.mode == 'rationale'
        answer_label = item['answer_label']
        key = f'{self.mode}{answer_label}'
    else:
        # Keys are in format [ctx|answer]_mode[j]
        key = f'{self.mode}'

    def load_embs(prefix):
        # Per-choice embeddings, with OMCS features stacked on if present.
        names = [f'{prefix}_{key}{j}' for j in range(len(answer_choices))]
        if omcs_items is None:
            return [grp_items[name] for name in names]
        return [
            np.hstack([grp_items[name], omcs_items[name]]) for name in names
        ]

    pad_ind = 0 if self.add_image_as_a_box else -1

    instance_dict = {}
    if 'endingonly' not in self.embs_to_load:
        ctx_embs = load_embs('ctx')
        questions_tokenized, question_tags = zip(*[
            _fix_tokenization(item['question'],
                              ctx_embs[j],
                              old_det_to_new_ind,
                              item['objects'],
                              token_indexers=self.token_indexers,
                              pad_ind=pad_ind)
            for j in range(len(answer_choices))
        ])
        instance_dict['question'] = ListField(questions_tokenized)
        instance_dict['question_tags'] = ListField(question_tags)

    answer_embs = load_embs('answer')
    answers_tokenized, answer_tags = zip(*[
        _fix_tokenization(answer,
                          answer_embs[j],
                          old_det_to_new_ind,
                          item['objects'],
                          token_indexers=self.token_indexers,
                          pad_ind=pad_ind)
        for j, answer in enumerate(answer_choices)
    ])
    instance_dict['answers'] = ListField(answers_tokenized)
    instance_dict['answer_tags'] = ListField(answer_tags)

    if self.split != 'test':
        instance_dict['label'] = LabelField(
            item['{}_label'.format(self.mode)], skip_indexing=True)
    instance_dict['metadata'] = MetadataField({
        'annot_id': item['annot_id'],
        'ind': index,
        'movie': item['movie'],
        'img_fn': item['img_fn'],
        'question_number': item['question_number']
    })

    ###################################################################
    # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
    image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
    image, window, img_scale, padding = resize_image(
        image, random_pad=self.is_train)
    image = to_tensor_and_normalize(image)
    _, h, w = image.shape

    ###################################################################
    # Load boxes.
    with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
        metadata = json.load(f)

    # [nobj, 14, 14]
    segms = np.stack([
        make_mask(mask_size=14,
                  box=metadata['boxes'][i],
                  polygons_list=metadata['segms'][i]) for i in dets2use
    ])

    # Chop off the final dimension, that's the confidence
    boxes = np.array(metadata['boxes'])[dets2use, :-1]
    # Bring the boxes into the resized/padded image frame.
    boxes *= img_scale
    offset = np.array(padding[:2])[None]
    boxes[:, :2] += offset
    boxes[:, 2:] += offset

    obj_labels = [
        self.coco_obj_to_ind[item['objects'][i]] for i in dets2use.tolist()
    ]
    if self.add_image_as_a_box:
        # Prepend the whole image as an extra "background" detection.
        boxes = np.vstack((window, boxes))
        segms = np.concatenate(
            (np.ones((1, 14, 14), dtype=np.float32), segms), 0)
        obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

    instance_dict['segms'] = ArrayField(segms, padding_value=0)
    instance_dict['objects'] = ListField(
        [LabelField(x, skip_indexing=True) for x in obj_labels])

    # Formerly an interactive ipdb breakpoint; a debugger prompt cannot be
    # answered from a non-interactive data-loading process, so the
    # first-column check now fails the same way as the checks below.
    assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    assert np.all((boxes[:, 2] <= w))
    assert np.all((boxes[:, 3] <= h))
    instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

    instance = Instance(instance_dict)
    # instance.index_fields(self.vocab)
    return image, instance
def __getimage_detector__(self, image_file_path, metadata):
    """Load an image plus its precomputed detector boxes and labels.

    Args:
        image_file_path: Path of the image.  If it contains ``'.npz'`` the
            last extension is stripped before loading.
        metadata: Mapping with ``'cls_boxes'`` (box coordinates) and
            ``'objects'`` (per-box labels).  ``'objects'`` must support
            element-wise ``> 0`` and NumPy index-array indexing
            (presumably a NumPy array -- confirm against the caller).

    Returns:
        A tuple ``(image, sample)``.  ``sample['boxes']`` is a
        ``torch.Tensor`` whose first row is the resize window followed by
        the boxes with label ``> 0``; ``sample['objects']`` is an
        ``IntArrayField`` of ``0`` followed by those labels.

    Raises:
        AssertionError: If a box violates the bounds checks at the end.
    """
    sample = {}

    ###################################################################
    # Most of things adapted from VCR
    # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
    if '.npz' in image_file_path:
        image_file_path = os.path.splitext(image_file_path)[0]

    image = load_image(image_file_path)
    image, window, img_scale, padding = resize_image(
        image, random_pad=self.is_train)
    image = to_tensor_and_normalize(image)
    _, h, w = image.shape

    ###################################################################
    # We will use all detections
    boxes = np.array(metadata['cls_boxes'])

    # NOTE(review): the other loaders in this file MULTIPLY boxes by
    # img_scale; the division here is kept exactly as it was -- confirm
    # which coordinate frame 'cls_boxes' is stored in.
    boxes /= img_scale
    offset = np.array(padding[:2])[None]
    boxes[:, :2] += offset
    boxes[:, 2:] += offset

    # Keep only detections with a positive label, then prepend the whole
    # image as box 0 with label 0.
    obj_labels = metadata['objects']
    keep_boxes = np.where(obj_labels > 0)
    boxes = boxes[keep_boxes]
    obj_labels = [0] + [int(a) for a in obj_labels[keep_boxes]]
    boxes = np.vstack((window, boxes))

    sample["objects"] = IntArrayField(np.array(obj_labels))

    # If any box reaches the image border, shrink all boxes uniformly so
    # the largest coordinate lands at (size - 1).
    max_x2 = np.amax(boxes[:, 2])
    max_y2 = np.amax(boxes[:, 3])
    if max_x2 >= w or max_y2 >= h:
        boxes *= min((w - 1) / max_x2, (h - 1) / max_y2)

    # The interactive ipdb breakpoint that used to guard the first-column
    # check was removed: a debugger prompt cannot be answered from a
    # non-interactive data-loading process, and the same condition is
    # asserted here anyway.
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
    assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))
    assert np.all((boxes[:, 2] <= w))
    assert np.all((boxes[:, 3] <= h))

    sample['boxes'] = torch.Tensor(boxes)
    return image, sample