Example #1
    def __getitem__(self, index):
        # if self.split == 'test':
        #     raise ValueError("blind test mode not supported quite yet")
        img_id = self.img_ids[index]
        instance_dict = {}

        image = load_image(self.img_id_2_image_folder[img_id])
        image, window, img_scale, padding = resize_image(image,
                                                         random_pad=False)
        image = to_tensor_and_normalize(image)
        c, h, w = image.shape

        ###################################################################
        # Load boxes.
        # print (self.img_id_2_folder[img_id])
        with open(self.img_id_2_meta_folder[img_id], 'r') as f:
            metadata = json.load(f)

        # Chop off the final dimension, that's the confidence
        boxes = np.array(metadata['boxes'])[:, :-1]
        # Possibly rescale them if necessary
        boxes *= img_scale
        boxes[:, :2] += np.array(padding[:2])[None]
        boxes[:, 2:] += np.array(padding[:2])[None]

        if self.add_image_as_a_box:
            boxes = np.row_stack((window, boxes))

        # if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
        # import ipdb
        # ipdb.set_trace()
        assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        assert np.all((boxes[:, 2] <= w))
        assert np.all((boxes[:, 3] <= h))
        instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

        instance = Instance(instance_dict)
        # Debugging leftover: flag one specific image id.
        if int(img_id.split('-')[-1]) == 53716:
            print('find')
        return image, instance, int(img_id.split('-')[-1])
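
Every loader in this file repeats the same box adjustment: detector boxes arrive as rows of [x1, y1, x2, y2, confidence] in original-image pixels, so the confidence column is dropped, the coordinates are scaled by img_scale, and both corners are shifted by the padding that resize_image added. A minimal standalone sketch of that step, assuming padding[:2] holds the (left, top) offsets as the indexing above implies (the helper name is illustrative, not part of the repo):

import numpy as np

def adjust_boxes(raw_boxes, img_scale, padding):
    """Drop the confidence column, rescale the boxes, and shift them into the padded canvas."""
    boxes = np.asarray(raw_boxes, dtype=np.float64)[:, :-1]  # chop off the confidence
    boxes *= img_scale                                       # rescale to the resized image
    boxes[:, :2] += np.array(padding[:2])[None]              # shift the (x1, y1) corners
    boxes[:, 2:] += np.array(padding[:2])[None]              # shift the (x2, y2) corners
    return boxes

# A 100x50 box, image scaled by 2.0 and padded 10 px on the left, 20 px on top:
print(adjust_boxes([[0., 0., 100., 50., 0.9]], 2.0, (10, 20, 10, 20)))
# -> [[ 10.  20. 210. 120.]]
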
Example #2
    def __getimage__(self, image_file_path):
        sample = {}
        ###################################################################
        # Most of this is adapted from VCR
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        if '.npz' in image_file_path:
            image_file_path = os.path.splitext(image_file_path)[0]

        image = load_image(image_file_path)
        image, window, img_scale, padding = resize_image(
            image, random_pad=self.is_train)
        image = to_tensor_and_normalize(image)
        c, h, w = image.shape
        ###################################################################
        # Consider the entire image as a whole detected box

        boxes = np.array([window])
        obj_labels = [0]

        # Drop into the debugger if any box has x1 < 0 or a non-positive width.
        if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
            import ipdb
            ipdb.set_trace()
        """
        if np.amax(boxes[:, 2]) >= w or np.amax(boxes[:, 3]) >= h:
            scale_w = (w - 1) / np.amax(boxes[:, 2])
            scale_h = (h - 1) / np.amax(boxes[:, 3])
            scale = min(scale_w, scale_h)
            boxes *= scale
        """

        sample["objects"] = IntArrayField(np.array(obj_labels))
        sample['boxes'] = torch.Tensor(boxes)

        assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))
        assert np.all((boxes[:, 2] <= w))
        assert np.all((boxes[:, 3] <= h))

        return image, sample
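
The four assertions above reappear nearly verbatim at the end of every loader here. A small helper, sketched only for illustration (it does not exist in the repo), states the invariant explicitly: each box must have positive width and height and must lie inside the resized image.

import numpy as np

def check_boxes(boxes, width, height):
    """Assert that every [x1, y1, x2, y2] box is non-degenerate and inside the image."""
    boxes = np.asarray(boxes)
    assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])), "x1 must lie in [0, x2)"
    assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3])), "y1 must lie in [0, y2)"
    assert np.all(boxes[:, 2] <= width), "x2 must not exceed the image width"
    assert np.all(boxes[:, 3] <= height), "y2 must not exceed the image height"

check_boxes([[0., 0., 224., 224.]], width=224, height=224)  # passes silently
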
Example #3
File: vcr.py  Project: uclanlp/visualbert
    def __getitem__(self, index):

        if self.complete_shuffle:
            if self.pretraining_include_qa_and_qar:
                index = index // 8
                which = index % 8
            else:
                index = index // 4
                which = index % 4
        else:
            which = None

        item = deepcopy(self.items[index])

        ###################################################################
        # Load questions and answers

        answer_choices = item['{}_choices'.format(self.mode)]

        if self.complete_shuffle and which < 4:
            only_use_answer = True
        else:
            only_use_answer = False

        if self.complete_shuffle and which >= 4:
            only_use_qar = True
        else:
            only_use_qar = False

        dets2use, old_det_to_new_ind = self._get_dets_to_use(
            item, only_use_answer=only_use_answer, only_use_qar=only_use_qar)

        # The only_use_qar flag is ambiguous...

        instance_dict = {}
        if self.split != 'test':
            instance_dict['label'] = LabelField(
                item['{}_label'.format(self.mode)], skip_indexing=True)
        instance_dict['metadata'] = MetadataField({
            'annot_id': item['annot_id'],
            'ind': index,
            'movie': item['movie'],
            'img_fn': item['img_fn'],
            'question_number': item['question_number']
        })

        ###################################################################
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        image = load_image(os.path.join(self.vcr_image_dir, item['img_fn']))
        #image = self.imagedatas(item['img_fn'])

        image, window, img_scale, padding = resize_image(
            image, random_pad=self.is_train)
        image = to_tensor_and_normalize(image)
        c, h, w = image.shape

        ###################################################################
        # Load boxes.
        with open(os.path.join(self.vcr_image_dir, item['metadata_fn']),
                  'r') as f:
            metadata = json.load(f)

        # [nobj, 14, 14]
        segms = np.stack([
            make_mask(mask_size=14,
                      box=metadata['boxes'][i],
                      polygons_list=metadata['segms'][i]) for i in dets2use
        ])

        # Chop off the final dimension, that's the confidence
        boxes = np.array(metadata['boxes'])[dets2use, :-1]
        # Possibly rescale them if necessary
        boxes *= img_scale
        boxes[:, :2] += np.array(padding[:2])[None]
        boxes[:, 2:] += np.array(padding[:2])[None]
        obj_labels = [
            self.coco_obj_to_ind[item['objects'][i]]
            for i in dets2use.tolist()
        ]
        if self.add_image_as_a_box:
            boxes = np.row_stack((window, boxes))
            segms = np.concatenate((np.ones(
                (1, 14, 14), dtype=np.float32), segms), 0)
            obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

        examples = data_iter_item(
            item,
            tokenizer=self.tokenizer,
            max_seq_length=self.max_seq_length,
            endingonly=False,
            include_qar=self.pretraining_include_qa_and_qar,
            only_qar=self.only_qar)
        self.getitem_bert_part(examples, item, instance_dict, which)

        if self.use_alignment:  # Alignment between objects and text
            ######################
            examples_alginment_pack = []
            for i in range(len(examples)):
                if self.pretraining_include_qa_and_qar:
                    if i < 4:
                        raw_text_a = item["question"]
                        raw_text_b = item['answer_choices'][i]
                    else:
                        raw_text_a = item["question"] + item['answer_choices'][
                            item['answer_label']]
                        raw_text_b = item['rationale_choices'][i - 4]
                elif self.only_qar:
                    raw_text_a = item["question"] + item['answer_choices'][item[
                        'answer_label']]  # This is the correct alignment right now.
                    raw_text_b = item['rationale_choices'][i]
                else:
                    raw_text_a = item["question"]
                    raw_text_b = item['answer_choices'][i]

                true_text_a = examples[i][0].text_a
                true_text_b = examples[i][0].text_b
                text_alignment_a = examples[i][1]
                text_alignment_b = examples[i][2]

                examples_alginment_pack.append(
                    (raw_text_a, raw_text_b, true_text_a, true_text_b,
                     text_alignment_a, text_alignment_b))

            image_box_position = []

            if which is not None:
                raw_text_a, raw_text_b, true_text_a, true_text_b, text_alignment_a, text_alignment_b = examples_alginment_pack[
                    which]
                box_record = defaultdict(list)
                self.get_alignment_original(raw_text_a,
                                            text_alignment_a,
                                            old_det_to_new_ind,
                                            box_record,
                                            offset=1)
                self.get_alignment_original(raw_text_b,
                                            text_alignment_b,
                                            old_det_to_new_ind,
                                            box_record,
                                            offset=1 + len(text_alignment_a) +
                                            1)
                image_text_alignment = ListField([
                    IntArrayField(np.array(box_record[i]), padding_value=-1)
                    for i in range(len(boxes))
                ])
            else:
                for raw_text_a, raw_text_b, true_text_a, true_text_b, text_alignment_a, text_alignment_b in examples_alginment_pack:

                    box_record = defaultdict(list)
                    self.get_alignment_original(raw_text_a,
                                                text_alignment_a,
                                                old_det_to_new_ind,
                                                box_record,
                                                offset=1)
                    self.get_alignment_original(raw_text_b,
                                                text_alignment_b,
                                                old_det_to_new_ind,
                                                box_record,
                                                offset=1 +
                                                len(text_alignment_a) + 1)

                    image_box_position.append(
                        ListField([
                            IntArrayField(np.array(box_record[i]),
                                          padding_value=-1)
                            for i in range(len(boxes))
                        ]))

                image_text_alignment = ListField(image_box_position)
            ######################

            instance_dict["image_text_alignment"] = image_text_alignment

        instance_dict['segms'] = ArrayField(segms, padding_value=0)
        instance_dict['objects'] = ListField(
            [LabelField(x, skip_indexing=True) for x in obj_labels])

        if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
            import ipdb
            ipdb.set_trace()
        assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        assert np.all((boxes[:, 2] <= w))
        assert np.all((boxes[:, 3] <= h))
        instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

        instance = Instance(instance_dict)
        instance.index_fields(self.vocab)
        return image, instance
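
In the alignment branch, get_alignment_original is called with offset=1 for the first text and offset=1 + len(text_alignment_a) + 1 for the second. Those offsets are consistent with the usual BERT packing [CLS] A ... [SEP] B ... [SEP]: one position for [CLS] before segment A, and one more for the [SEP] between the segments. A toy sketch of that bookkeeping (map_positions is a hypothetical stand-in, not the repo's get_alignment_original):

def map_positions(alignment_a, alignment_b):
    """Return the positions of segment-A and segment-B tokens inside
    [CLS] A ... [SEP] B ... [SEP], assuming one wordpiece per alignment entry."""
    offset_a = 1                            # skip [CLS]
    offset_b = 1 + len(alignment_a) + 1     # skip [CLS], segment A, and the first [SEP]
    pos_a = [offset_a + i for i in range(len(alignment_a))]
    pos_b = [offset_b + i for i in range(len(alignment_b))]
    return pos_a, pos_b

# Two question tokens and three answer tokens:
print(map_positions([0, 1], [0, 1, 2]))  # -> ([1, 2], [4, 5, 6])
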
Example #4
    def __getitem_detector__(self, index):
        item = self.items[index]
        sample = {}
        if self.expanded and index >= self.train_size:
            image_file_name = "COCO_val2014_{:0>12d}.jpg".format(
                item['image_id'])
        else:
            image_file_name = "COCO_{}2014_{:0>12d}.jpg".format(
                self.split_name, item['image_id'])

        image_info = self.masks[image_file_name]
        if "train" in image_file_name:
            image_file_path = os.path.join(self.data_root, "train2014",
                                           image_file_name)
        elif "val" in image_file_name:
            image_file_path = os.path.join(self.data_root, "val2014",
                                           image_file_name)

        ###################################################################
        # Most of this is adapted from VCR
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        image = load_image(image_file_path)
        image, window, img_scale, padding = resize_image(
            image, random_pad=self.is_train)
        image = to_tensor_and_normalize(image)
        c, h, w = image.shape
        ###################################################################
        metadata = self.masks[image_file_name]  # Get the metadata
        # Load boxes.
        # We will use all detections
        dets2use = np.arange(len(metadata['boxes']))
        # [nobj, 14, 14]
        segms = np.stack([
            make_mask(mask_size=14,
                      box=metadata['boxes'][i],
                      polygons_list=metadata['segms'][i]) for i in dets2use
        ])

        # Chop off the final dimension, that's the confidence
        boxes = np.array(metadata['boxes'])[dets2use, :-1]
        # Possibly rescale them if necessary
        boxes *= img_scale
        boxes[:, :2] += np.array(padding[:2])[None]
        boxes[:, 2:] += np.array(padding[:2])[None]

        try:
            metadata['names'] = [
                i.split(" ")[1][1:-1] for i in metadata["names"]
            ]
        except:
            pass
        obj_labels = [
            self.coco_obj_to_ind[metadata['names'][i]]
            for i in dets2use.tolist()
        ]

        boxes = np.row_stack((window, boxes))
        segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms),
                               0)
        obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

        sample['segms'] = ArrayField(segms, padding_value=0)
        sample['objects'] = ListField(
            [LabelField(x, skip_indexing=True) for x in obj_labels])

        if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
            import ipdb
            ipdb.set_trace()
        assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        assert np.all((boxes[:, 2] <= w))
        assert np.all((boxes[:, 3] <= h))
        sample['boxes'] = ArrayField(boxes, padding_value=-1)

        caption_a = item["caption"]
        imageID = item["image_id"]

        sample["label"] = sample[
            'objects']  # This is an useless field. Just so that they know the batch size.

        if self.expanded and index >= self.train_size:
            coco = self.coco_val
        else:
            coco = self.coco

        rest_anns = coco.loadAnns(
            [i for i in coco.getAnnIds(imgIds=imageID) if i != item['id']])

        if self.args.get("two_sentence", True):
            if random.random() > 0.5:
                item_b = self.items[random.randint(0, len(self.items) - 1)]
                flag = False
            else:
                item_b = rest_anns[random.randint(0, len(rest_anns) - 1)]
                flag = True  # is next sentence

            caption_b = item_b["caption"]
            subword_tokens_a = self.tokenizer.tokenize(caption_a)
            subword_tokens_b = self.tokenizer.tokenize(caption_b)
            bert_example = InputExample(unique_id=index,
                                        text_a=subword_tokens_a,
                                        text_b=subword_tokens_b,
                                        is_correct=flag,
                                        max_seq_length=self.max_seq_length)
        elif not self.args.get("no_next_sentence", False):
            if random.random() < self.args.false_caption_ratio:
                item_b = self.items[random.randint(0, len(self.items) - 1)]
                while item_b["image_id"] == imageID:
                    item_b = self.items[random.randint(0, len(self.items) - 1)]
                flag = False
            else:
                item_b = item
                flag = True  # is next sentence

            caption_b = item_b["caption"]
            subword_tokens_b = self.tokenizer.tokenize(caption_b)
            bert_example = InputExample(unique_id=index,
                                        text_a=subword_tokens_b,
                                        text_b=None,
                                        is_correct=flag,
                                        max_seq_length=self.max_seq_length)
        else:
            subword_tokens_a = self.tokenizer.tokenize(caption_a)
            bert_example = InputExample(unique_id=index,
                                        text_a=subword_tokens_a,
                                        text_b=None,
                                        is_correct=None,
                                        max_seq_length=self.max_seq_length)

        bert_feature = InputFeatures.convert_one_example_to_features_pretraining(
            example=bert_example,
            tokenizer=self.tokenizer,
            probability=self.masked_lm_prob)
        bert_feature.insert_field_into_dict(sample)

        return image, Instance(sample)
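
With the "two_sentence" option, Example #4 builds next-sentence-prediction pairs from COCO captions: half the time caption_b comes from a random item anywhere in the dataset (a negative pair), otherwise from another annotation of the same image (a positive pair). A self-contained sketch of that sampling rule, with plain dicts standing in for the COCO annotation objects (the data layout and helper name are illustrative):

import random

def sample_caption_pair(items, index, same_image_captions):
    """Return (caption_a, caption_b, is_next) following the 50/50 rule sketched above."""
    caption_a = items[index]["caption"]
    if random.random() > 0.5:
        # Negative pair: caption taken from a random item anywhere in the dataset.
        item_b = items[random.randint(0, len(items) - 1)]
        return caption_a, item_b["caption"], False
    # Positive pair: another caption annotated on the same image.
    return caption_a, random.choice(same_image_captions), True

items = [{"caption": "a dog on a couch"}, {"caption": "two cats sleeping"}]
print(sample_caption_pair(items, 0, ["a brown dog resting on a sofa"]))
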
Example #5
    def __getitem__(self, index):
        # if self.split == 'test':
        #     raise ValueError("blind test mode not supported quite yet")
        item = deepcopy(self.items[index])

        ###################################################################
        # Load questions and answers
        if self.mode == 'rationale':
            conditioned_label = (item['answer_label'] if self.split != 'test'
                                 else self.conditioned_answer_choice)
            item['question'] += item['answer_choices'][conditioned_label]

        answer_choices = item['{}_choices'.format(self.mode)]
        dets2use, old_det_to_new_ind = self._get_dets_to_use(item)

        ###################################################################
        # Load in BERT. We'll get contextual representations of the context and the answer choices
        # grp_items = {k: np.array(v, dtype=np.float16) for k, v in self.get_h5_group(index).items()}
        with h5py.File(self.h5fn, 'r') as h5:
            grp_items = {
                k: np.array(v, dtype=np.float16)
                for k, v in h5[str(index)].items()
            }

        # Essentially we need to condition on the right answer choice here, if we're doing QA->R.
        # We will always condition on `conditioned_answer_choice`.
        condition_key = self.conditioned_answer_choice if self.split == "test" and self.mode == "rationale" else ""

        instance_dict = {}
        if 'endingonly' not in self.embs_to_load:
            questions_tokenized, question_tags = zip(*[
                _fix_tokenization(
                    item['question'],
                    grp_items[f'ctx_{self.mode}{condition_key}{i}'],
                    old_det_to_new_ind,
                    item['objects'],
                    token_indexers=self.token_indexers,
                    pad_ind=0 if self.add_image_as_a_box else -1)
                for i in range(4)
            ])
            instance_dict['question'] = ListField(questions_tokenized)
            instance_dict['question_tags'] = ListField(question_tags)

        answers_tokenized, answer_tags = zip(*[
            _fix_tokenization(
                answer,
                grp_items[f'answer_{self.mode}{condition_key}{i}'],
                old_det_to_new_ind,
                item['objects'],
                token_indexers=self.token_indexers,
                pad_ind=0 if self.add_image_as_a_box else -1)
            for i, answer in enumerate(answer_choices)
        ])

        instance_dict['answers'] = ListField(answers_tokenized)

        instance_dict['answer_tags'] = ListField(answer_tags)
        if self.split != 'test':
            instance_dict['label'] = LabelField(
                item['{}_label'.format(self.mode)], skip_indexing=True)
        instance_dict['metadata'] = MetadataField({
            'annot_id': item['annot_id'],
            'ind': index,
            'movie': item['movie'],
            'img_fn': item['img_fn'],
            'question_number': item['question_number'],
            'img_id': item['img_id']
        })

        ###################################################################
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
        image, window, img_scale, padding = resize_image(
            image, random_pad=self.is_train)
        image = to_tensor_and_normalize(image)
        c, h, w = image.shape

        ###################################################################
        # Load boxes.
        with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
            metadata = json.load(f)

        # [nobj, 14, 14]
        segms = np.stack([
            make_mask(mask_size=14,
                      box=metadata['boxes'][i],
                      polygons_list=metadata['segms'][i]) for i in dets2use
        ])

        # Chop off the final dimension, that's the confidence
        boxes = np.array(metadata['boxes'])[dets2use, :-1]

        # Possibly rescale them if necessary
        boxes *= img_scale
        boxes[:, :2] += np.array(padding[:2])[None]
        boxes[:, 2:] += np.array(padding[:2])[None]
        obj_labels = [
            self.coco_obj_to_ind[item['objects'][i]]
            for i in dets2use.tolist()
        ]
        if self.add_image_as_a_box:
            boxes = np.row_stack((window, boxes))
            segms = np.concatenate((np.ones(
                (1, 14, 14), dtype=np.float32), segms), 0)
            obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

        instance_dict['segms'] = ArrayField(segms, padding_value=0)
        instance_dict['objects'] = ListField(
            [LabelField(x, skip_indexing=True) for x in obj_labels])

        if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
            import ipdb
            ipdb.set_trace()
        assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        assert np.all((boxes[:, 2] <= w))
        assert np.all((boxes[:, 3] <= h))

        instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

        instance = Instance(instance_dict)
        instance.index_fields(self.vocab)
        return image, instance
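
Examples #5 and #6 read precomputed BERT features from an HDF5 file: each dataset index maps to a group named str(index), whose datasets are named like ctx_<mode><choice> and answer_<mode><choice>. A minimal sketch of that access pattern with h5py (the file name in the usage comment is only illustrative):

import h5py
import numpy as np

def load_bert_group(h5_path, index, mode="answer", num_choices=4):
    """Load the per-choice ctx_* and answer_* float16 arrays for one example."""
    with h5py.File(h5_path, 'r') as h5:
        grp = h5[str(index)]
        ctx = [np.array(grp[f'ctx_{mode}{i}'], dtype=np.float16) for i in range(num_choices)]
        ans = [np.array(grp[f'answer_{mode}{i}'], dtype=np.float16) for i in range(num_choices)]
    return ctx, ans

# e.g. ctx_embs, answer_embs = load_bert_group('bert_da_answer_train.h5', 0)
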
Example #6
    def __getitem__(self, index):
        item = deepcopy(self.items[index])

        ###################################################################
        # Load questions and answers
        if self.mode == 'rationale':
            item['question'] += item['answer_choices'][item['answer_label']]
        elif self.mode == 'joint':
            item['joint_choices'] = [a + r for a in item['answer_choices']
                                     for r in item['rationale_choices']]
            if self.split != 'test':
                item['joint_label'] = (item['answer_label'] * 4
                                       + item['rationale_label'])
        answer_choices = item['{}_choices'.format(self.mode)]
        dets2use, old_det_to_new_ind = self._get_dets_to_use(item)

        ###################################################################
        # Load in BERT. We'll get contextual representations of the context and the answer choices
        with h5py.File(self.h5fn, 'r') as h5:
            grp_items = {
                k: np.array(v, dtype=np.float16)
                for k, v in h5[str(index)].items()
            }

        omcs_items = None
        if self.h5fn_omcs is not None:
            with h5py.File(self.h5fn_omcs, 'r') as h5_omcs:
                omcs_items = {
                    k: np.array(v, dtype=np.float16)
                    for k, v in h5_omcs[str(index)].items()
                }

        if self.all_answers_for_rationale:
            # Keys in h5 file are in format [ctx|answer]_rationale[i][j].
            # Pick i based on the answer_label set.
            assert self.mode == 'rationale'
            answer_label = item['answer_label']
            key = f'{self.mode}{answer_label}'
        else:
            # Keys are in format [ctx|answer]_mode[j]
            key = f'{self.mode}'

        instance_dict = {}
        if 'endingonly' not in self.embs_to_load:
            if omcs_items is None:
                ctx_embs = [
                    grp_items[f'ctx_{key}{j}']
                    for j in range(len(answer_choices))
                ]
            else:
                ctx_embs = [
                    np.hstack([
                        grp_items[f'ctx_{key}{j}'], omcs_items[f'ctx_{key}{j}']
                    ]) for j in range(len(answer_choices))
                ]
            questions_tokenized, question_tags = zip(*[
                _fix_tokenization(item['question'],
                                  ctx_embs[j],
                                  old_det_to_new_ind,
                                  item['objects'],
                                  token_indexers=self.token_indexers,
                                  pad_ind=0 if self.add_image_as_a_box else -1)
                for j in range(len(answer_choices))
            ])
            instance_dict['question'] = ListField(questions_tokenized)
            instance_dict['question_tags'] = ListField(question_tags)

        if omcs_items is None:
            answer_embs = [
                grp_items[f'answer_{key}{j}']
                for j in range(len(answer_choices))
            ]
        else:
            answer_embs = [
                np.hstack([
                    grp_items[f'answer_{key}{j}'],
                    omcs_items[f'answer_{key}{j}']
                ]) for j in range(len(answer_choices))
            ]
        answers_tokenized, answer_tags = zip(*[
            _fix_tokenization(answer,
                              answer_embs[j],
                              old_det_to_new_ind,
                              item['objects'],
                              token_indexers=self.token_indexers,
                              pad_ind=0 if self.add_image_as_a_box else -1)
            for j, answer in enumerate(answer_choices)
        ])

        instance_dict['answers'] = ListField(answers_tokenized)
        instance_dict['answer_tags'] = ListField(answer_tags)
        if self.split != 'test':
            instance_dict['label'] = LabelField(
                item['{}_label'.format(self.mode)], skip_indexing=True)
        instance_dict['metadata'] = MetadataField({
            'annot_id': item['annot_id'],
            'ind': index,
            'movie': item['movie'],
            'img_fn': item['img_fn'],
            'question_number': item['question_number']
        })

        ###################################################################
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        image = load_image(os.path.join(VCR_IMAGES_DIR, item['img_fn']))
        image, window, img_scale, padding = resize_image(
            image, random_pad=self.is_train)
        image = to_tensor_and_normalize(image)
        c, h, w = image.shape

        ###################################################################
        # Load boxes.
        with open(os.path.join(VCR_IMAGES_DIR, item['metadata_fn']), 'r') as f:
            metadata = json.load(f)

        # [nobj, 14, 14]
        segms = np.stack([
            make_mask(mask_size=14,
                      box=metadata['boxes'][i],
                      polygons_list=metadata['segms'][i]) for i in dets2use
        ])

        # Chop off the final dimension, that's the confidence
        boxes = np.array(metadata['boxes'])[dets2use, :-1]
        # Possibly rescale them if necessary
        boxes *= img_scale
        boxes[:, :2] += np.array(padding[:2])[None]
        boxes[:, 2:] += np.array(padding[:2])[None]
        obj_labels = [
            self.coco_obj_to_ind[item['objects'][i]]
            for i in dets2use.tolist()
        ]
        if self.add_image_as_a_box:
            boxes = np.row_stack((window, boxes))
            segms = np.concatenate((np.ones(
                (1, 14, 14), dtype=np.float32), segms), 0)
            obj_labels = [self.coco_obj_to_ind['__background__']] + obj_labels

        instance_dict['segms'] = ArrayField(segms, padding_value=0)
        instance_dict['objects'] = ListField(
            [LabelField(x, skip_indexing=True) for x in obj_labels])

        if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
            import ipdb
            ipdb.set_trace()
        assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        assert np.all((boxes[:, 2] <= w))
        assert np.all((boxes[:, 3] <= h))
        instance_dict['boxes'] = ArrayField(boxes, padding_value=-1)

        instance = Instance(instance_dict)
        # instance.index_fields(self.vocab)
        return image, instance
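
The 'joint' mode above concatenates every answer with every rationale, giving 16 candidates, and flattens the gold pair as answer_label * 4 + rationale_label. A tiny sketch of that indexing and its inverse:

def joint_label(answer_label, rationale_label, num_rationales=4):
    """Flatten an (answer, rationale) pair into one index over the 4 x 4 cross product."""
    return answer_label * num_rationales + rationale_label

def split_joint_label(label, num_rationales=4):
    """Recover the (answer, rationale) pair from a flattened joint label."""
    return divmod(label, num_rationales)

assert joint_label(2, 3) == 11
assert split_joint_label(11) == (2, 3)
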
Example #7
    def __getimage_detector__(self, image_file_path, metadata):
        sample = {}
        ###################################################################
        # Most of this is adapted from VCR
        # Load image now and rescale it. Might have to subtract the mean and whatnot here too.
        if '.npz' in image_file_path:
            image_file_path = os.path.splitext(image_file_path)[0]

        image = load_image(image_file_path)
        image, window, img_scale, padding = resize_image(
            image, random_pad=self.is_train)
        image = to_tensor_and_normalize(image)
        c, h, w = image.shape
        ###################################################################
        # We will use all detections
        dets2use = np.arange(len(metadata['cls_boxes']))
        # [nobj, 14, 14]
        #segms = np.stack([make_mask(mask_size=14, box=metadata['cls_boxes'][i],
        #                            polygons_list=metadata['segms'][i]) for i in dets2use])

        boxes = np.array(metadata['cls_boxes'])
        # Possibly rescale them if necessary
        boxes /= img_scale

        boxes[:, :2] += np.array(padding[:2])[None]
        boxes[:, 2:] += np.array(padding[:2])[None]
        """
        try:
            metadata['names'] = [i.split(" ")[1][1:-1] for i in metadata["names"]]
        except:
            pass
        obj_labels = [self.coco_obj_to_ind[metadata['names'][i]] for i in dets2use.tolist()]
        """

        obj_labels = metadata['objects']
        keep_boxes = np.where(obj_labels > 0)

        boxes = boxes[keep_boxes]
        obj_labels = [0] + list(obj_labels[keep_boxes])
        obj_labels = [int(a) for a in obj_labels]

        boxes = np.row_stack((window, boxes))
        #segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms), 0)

        #sample['segms'] = ArrayField(segms, padding_value=0)
        #sample['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])
        sample["objects"] = IntArrayField(np.array(obj_labels))

        if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
            import ipdb
            ipdb.set_trace()

        if np.amax(boxes[:, 2]) >= w or np.amax(boxes[:, 3]) >= h:
            scale_w = (w - 1) / np.amax(boxes[:, 2])
            scale_h = (h - 1) / np.amax(boxes[:, 3])
            scale = min(scale_w, scale_h)
            boxes *= scale
            #print(np.amax(boxes[:, 2]), w)
            #print(np.amax(boxes[:, 3]), h)

        assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))
        assert np.all((boxes[:, 2] <= w))
        assert np.all((boxes[:, 3] <= h))
        sample['boxes'] = torch.Tensor(boxes)

        return image, sample
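
Unlike the earlier loaders, Example #7 does not assert right away: if any box spills past the resized image, it first rescales all boxes uniformly so that the largest x2 and y2 fit within (w - 1, h - 1). A standalone sketch of that shrink-to-fit step (the helper name is illustrative):

import numpy as np

def shrink_boxes_to_fit(boxes, width, height):
    """Uniformly scale [x1, y1, x2, y2] boxes so they fit strictly inside the image."""
    boxes = np.asarray(boxes, dtype=np.float64)
    if np.amax(boxes[:, 2]) >= width or np.amax(boxes[:, 3]) >= height:
        scale_w = (width - 1) / np.amax(boxes[:, 2])
        scale_h = (height - 1) / np.amax(boxes[:, 3])
        boxes = boxes * min(scale_w, scale_h)  # one common factor keeps all boxes consistent
    return boxes

print(shrink_boxes_to_fit([[0., 0., 400., 150.]], width=224, height=224))
# -> [[  0.      0.    223.     83.625]]
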