Example #1
    def __getitem_detector__(self, index):
        # Build a paired-image example with bounding boxes from a
        # pre-trained detector.
        item_key = self.item_keys[index]
        item = self.items[item_key]
        sample = {}

        image_file_name_0 = item['image1']
        image_file_name_1 = item['image2']

        masks_0 = self.get_image_masks_by_training_index(index, 0)
        masks_1 = self.get_image_masks_by_training_index(index, 1)

        image_0, sample_0 = self.__getimage_detector__(image_file_name_0,
                                                       masks_0)
        image_1, sample_1 = self.__getimage_detector__(image_file_name_1,
                                                       masks_1)

        image = torch.stack((image_0, image_1), dim=0)

        sample["boxes_0"] = ArrayTensorField(sample_0["boxes"])
        sample["boxes_1"] = ArrayTensorField(sample_1["boxes"])

        sample["objects_0"] = sample_0["objects"]
        sample["objects_1"] = sample_1["objects"]

        if item.get("label", None) is not None:
            sample["next_image_label"] = np.array(
                [1 if item["label"] == True else 0])
        else:
            sample["next_image_label"] = np.array([0])

        sample["next_image_label"] = IntArrayField(sample["next_image_label"])
        sample["is_random_next"] = sample["next_image_label"]

        return image, Instance(sample)
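
The method returns a raw image tensor alongside an AllenNLP-style Instance, which a default PyTorch collate function cannot merge. A minimal sketch of a custom collate for a DataLoader, assuming the Instances are batched separately downstream (the function name is ours):

import torch

def collate_image_instance_pairs(batch):
    # batch: list of (image, instance) pairs as returned above, where each
    # image is a stacked pair of shape [2, C, H, W].
    images = torch.stack([image for image, _ in batch], dim=0)  # [B, 2, C, H, W]
    instances = [instance for _, instance in batch]  # kept as a list
    return images, instances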
Example #2
    def __getimage__(self, image_file_path):
        sample = {}
        ###################################################################
        # Largely adapted from the VCR data loader.
        # Load the image and rescale it; mean subtraction and normalization
        # happen in to_tensor_and_normalize below.
        if '.npz' in image_file_path:
            image_file_path = os.path.splitext(image_file_path)[0]

        image = load_image(image_file_path)
        image, window, img_scale, padding = resize_image(
            image, random_pad=self.is_train)
        image = to_tensor_and_normalize(image)
        c, h, w = image.shape
        ###################################################################
        # Consider the entire image as a whole detected box

        boxes = np.array([window])
        obj_labels = [0]

        if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
            raise ValueError("invalid boxes: expected 0 <= x1 < x2 for every box")
        """
        if np.amax(boxes[:, 2]) >= w or np.amax(boxes[:, 3]) >= h:
            scale_w = (w - 1) / np.amax(boxes[:, 2])
            scale_h = (h - 1) / np.amax(boxes[:, 3])
            scale = min(scale_w, scale_h)
            boxes *= scale
        """

        sample["objects"] = IntArrayField(np.array(obj_labels))
        sample['boxes'] = torch.Tensor(boxes)

        assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))
        assert np.all((boxes[:, 2] <= w))
        assert np.all((boxes[:, 3] <= h))

        return image, sample
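
The asserts above encode the box-validity invariant: 0 <= x1 < x2 <= w and 0 <= y1 < y2 <= h. A standalone restatement as a helper, handy for unit-testing these loaders (the helper name is ours):

import numpy as np

def boxes_are_valid(boxes, w, h):
    # boxes: [N, 4] array of (x1, y1, x2, y2) in pixel coordinates.
    boxes = np.asarray(boxes, dtype=np.float64)
    return (np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))
            and np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
            and np.all(boxes[:, 2] <= w)
            and np.all(boxes[:, 3] <= h))

# Example: a single full-image "window" box is always valid.
assert boxes_are_valid(np.array([[0., 0., 224., 224.]]), w=224, h=224)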
Example #3
    def __getitem__(self, index):
        if self.image_feature_type == "r2c":
            return self.__getitem_detector__(index)

        item = self.items[index]
        sample = {}
        if not self.text_only:
            image_feat_variable, image_boxes, image_dim_variable = \
                self.get_image_features_by_training_index(index)
            image_feat_variable = ArrayField(image_feat_variable)
            image_dim_variable = IntArrayField(np.array(image_dim_variable))
            sample["image_feat_variable"] = image_feat_variable
            sample["image_dim_variable"] = image_dim_variable
            # Placeholder: the label simply mirrors image_dim_variable here.
            sample["label"] = image_dim_variable
        else:
            sample["label"] = IntArrayField(np.array([0]))

        caption_a = item["caption"]
        imageID = item["image_id"]

        if self.expanded and index >= self.train_size:
            coco = self.coco_val
        else:
            coco = self.coco

        rest_anns = coco.loadAnns(
            [i for i in coco.getAnnIds(imgIds=imageID) if i != item['id']])

        if self.args.get("two_sentence", True):
            if random.random() > 0.5:
                item_b = self.items[random.randint(0, len(self.items) - 1)]
                while item_b["image_id"] == imageID:
                    item_b = self.items[random.randint(0, len(self.items) - 1)]
                flag = False
            else:
                item_b = rest_anns[random.randint(0, len(rest_anns) - 1)]
                flag = True

            caption_b = item_b["caption"]
            subword_tokens_a = self.tokenizer.tokenize(caption_a)
            subword_tokens_b = self.tokenizer.tokenize(caption_b)
            bert_example = InputExample(unique_id=index,
                                        text_a=subword_tokens_a,
                                        text_b=subword_tokens_b,
                                        is_correct=flag,
                                        max_seq_length=self.max_seq_length)
        elif not self.args.get("no_next_sentence", False):
            if random.random() < self.args.false_caption_ratio:
                # Negative: caption drawn from a different image.
                item_b = self.items[random.randint(0, len(self.items) - 1)]
                while item_b["image_id"] == imageID:
                    item_b = self.items[random.randint(0, len(self.items) - 1)]
                flag = False
            else:
                # Positive: the image's own caption.
                item_b = item
                flag = True

            caption_b = item_b["caption"]
            subword_tokens_b = self.tokenizer.tokenize(caption_b)
            bert_example = InputExample(unique_id=index,
                                        text_a=subword_tokens_b,
                                        text_b=None,
                                        is_correct=flag,
                                        max_seq_length=self.max_seq_length)
        else:
            caption_b = item["caption"]
            subword_tokens_b = self.tokenizer.tokenize(caption_b)
            bert_example = InputExample(unique_id=index,
                                        text_a=subword_tokens_b,
                                        text_b=None,
                                        is_correct=None,
                                        max_seq_length=self.max_seq_length)

        bert_feature = InputFeatures.convert_one_example_to_features_pretraining(
            example=bert_example,
            tokenizer=self.tokenizer,
            probability=self.masked_lm_prob)
        bert_feature.insert_field_into_dict(sample)

        return Instance(sample)
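
With two_sentence enabled, the pairing above samples a negative caption from a different image with probability 0.5 and a positive caption from the same image otherwise, with is_correct recording which case occurred. A condensed sketch of that sampling, assuming items is a list of dicts with image_id and caption keys:

import random

def sample_caption_pair(items, index, same_image_captions):
    """Return (caption_b, is_correct) for the next-sentence objective."""
    anchor_image = items[index]["image_id"]
    if random.random() > 0.5:
        # Negative: caption from a different image.
        other = items[random.randint(0, len(items) - 1)]
        while other["image_id"] == anchor_image:
            other = items[random.randint(0, len(items) - 1)]
        return other["caption"], False
    # Positive: another caption of the same image.
    other = same_image_captions[random.randint(0, len(same_image_captions) - 1)]
    return other["caption"], True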
Example #4
    def __getitem__(self, index):
        entry = self.entries[index]
        sentence = entry['sentence']
        e_pos = entry['entity_indices']
        e_num = entry['entity_num']
        target = entry['target_indices']
        entity_ids = entry['entity_ids']
        entity_types = entry['entity_types']
        #v, b, p, e, n, a, idx, types

        if self.use_visual_genome:
            start, end = self.pos_boxes[entry['image']]
            features = self.features[start:end, :]
            spatials = self.spatials[start:end, :]
        else:
            image_id = entry["image"]
            features, cls_boxes, max_conf, image_h, image_w = \
                self.features_chunk[image_id]

        if self.add_spatial_features:
            # Note: spatials is only defined on the use_visual_genome path.
            features = np.concatenate((features, spatials), axis=1)
        else:
            spatials = None

        sample = {}

        image_feat_variable = ArrayField(features)
        image_dim_variable = IntArrayField(np.array(len(features)))
        sample["image_feat_variable"] = image_feat_variable
        sample["image_dim_variable"] = image_dim_variable

        tokenized_sentence, alignment = retokenize_with_alignment(sentence.split(" "), self.tokenizer)

        e_pos_after_subword = []
        for position in e_pos:
            for sub_idx, i in enumerate(alignment):
                if i == position:
                    if sub_idx == len(alignment) - 1 or alignment[sub_idx + 1] != i:
                        # +1 accounts for the prepended [CTX] token.
                        e_pos_after_subword.append(sub_idx + 1)

        assert len(e_pos_after_subword) == len(e_pos) == len(target), \
            "entity positions must stay aligned with targets after retokenization"
        
        # Convert the target indices into soft scores over the boxes.
        num_boxes = features.shape[0]
        new_target = []
        for matched in target:
            scores = [0.0] * num_boxes
            if len(matched) != 0:
                # Spread unit probability mass uniformly over matched boxes.
                score = 1.0 / len(matched)
                for j in matched:
                    scores[j] = score
            new_target.append(scores)

        # target: entity_num x num_boxes
        target = ArrayField(np.array(new_target, dtype="float"), padding_value=0.0)

        original_position = IntArrayField(
            np.array(e_pos_after_subword, dtype="int"), padding_value=-1)
        # The label row can be all-zero for an entity when none of the
        # provided boxes matches it.
        sample["label"] = target
        sample["flickr_position"] = original_position

        bert_example = InputExample(unique_id=-1,
                                    text_a=tokenized_sentence,
                                    text_b=None,
                                    is_correct=None,
                                    max_seq_length=self.max_seq_length)

        if self.pretraining:
            bert_feature = InputFeatures.convert_one_example_to_features_pretraining(
                example=bert_example,
                tokenizer=self.tokenizer,
                probability=self.masked_lm_prob)
        else:
            bert_feature = InputFeatures.convert_one_example_to_features(
                example=bert_example,
                tokenizer=self.tokenizer)
        bert_feature.insert_field_into_dict(sample)

        return Instance(sample)
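
The soft-target loop above distributes unit probability mass uniformly over all boxes matched to an entity, leaving unmatched entities as all-zero rows. The same conversion as a standalone function (the name is ours):

import numpy as np

def soft_scores(matched_box_indices, num_boxes):
    # matched_box_indices: for each entity, the list of box indices that
    # ground it; entities with no match get an all-zero row.
    out = np.zeros((len(matched_box_indices), num_boxes), dtype=np.float64)
    for row, matches in enumerate(matched_box_indices):
        if matches:
            out[row, matches] = 1.0 / len(matches)
    return out

# Entity 0 matches boxes 1 and 3; entity 1 has no matching box.
print(soft_scores([[1, 3], []], num_boxes=4))  # rows sum to 1.0 and 0.0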
Example #5
    def __getitem__(self, index):

        iminfo = self.items[index]
        image_feat_variable, image_boxes, image_dim_variable = self.get_image_features_by_training_index(
            index)

        sample = {}

        image_feat_variable = ArrayField(image_feat_variable)
        image_dim_variable = IntArrayField(np.array(image_dim_variable))
        sample["image_feat_variable"] = image_feat_variable
        sample["image_dim_variable"] = image_dim_variable

        answer = None
        valid_answers_idx = np.full(10, -1, dtype=np.int32)
        answer_scores = np.zeros(self.answer_dict.num_vocab, np.float32)

        if 'answer' in iminfo:
            answer = iminfo['answer']
        elif 'valid_answers' in iminfo:
            valid_answers = iminfo['valid_answers']
            answer = np.random.choice(valid_answers)
            ans_idx = [self.answer_dict.word2idx(ans) for ans in valid_answers]
            valid_answers_idx[:len(valid_answers)] = ans_idx
            answer_scores = compute_answer_scores(ans_idx,
                                                  self.answer_dict.num_vocab,
                                                  self.answer_dict.UNK_idx)

        if self.advanced_vqa:
            new_answer = self.tokenized_list[self.answer_dict.word2idx(answer)]
            subword_tokens = self.tokenizer.tokenize(" ".join(
                iminfo['question_tokens']))
            # Answer subwords are predicted at the [MASK] positions added below.
            subword_tokens = ["[CLS]"] + subword_tokens + ["?"]

            masked_lm_labels = [-1] * len(subword_tokens)

            # Append one [MASK] per answer subword; its LM label is the
            # answer token id, so the model learns to produce the answer.
            for i in new_answer:
                subword_tokens.append("[MASK]")
                masked_lm_labels.append(self.tokenizer.vocab[i])
            subword_tokens.append("[SEP]")
            masked_lm_labels.append(-1)

            input_ids = [self.tokenizer.vocab[t] for t in subword_tokens]

            bert_feature = InputFeatures(unique_id=-1,
                                         tokens=subword_tokens,
                                         input_ids=input_ids,
                                         input_mask=[1] * len(input_ids),
                                         input_type_ids=[0] * len(input_ids),
                                         is_correct=1,
                                         lm_label_ids=masked_lm_labels)
            bert_feature.insert_field_into_dict(sample)
        else:
            if self.pretraining:
                item = iminfo
                if self.no_next_sentence:
                    subword_tokens_a = self.tokenizer.tokenize(" ".join(
                        item['question_tokens'])) + ["?"]
                    subword_tokens_b = self.tokenizer.tokenize(
                        " ".join(answer))

                    bert_example = InputExample(
                        unique_id=index,
                        text_a=subword_tokens_a + subword_tokens_b,
                        text_b=None,
                        is_correct=None,
                        max_seq_length=self.max_seq_length)
                    bert_feature = InputFeatures.convert_one_example_to_features_pretraining(
                        example=bert_example,
                        tokenizer=self.tokenizer,
                        probability=0.15)
                else:
                    raise NotImplementedError("Should not use this part")
                    '''if random.random() > self.false_caption_ratio:
                        answer = answer
                        label = 1
                    else:
                        while(True):
                            wrong_answer = np.random.choice(self.answer_dict.word_list)
                            if wrong_answer not in valid_answers:
                                wrong_answer = answer
                                label = 0
                                break
                    subword_tokens_a = self.tokenizer.tokenize(" ".join(item['question_tokens'])) + ["?"] 
                    subword_tokens_b = self.tokenizer.tokenize(" ".join(answer))
                    bert_example = InputExample(unique_id = index, text_a = subword_tokens_a, text_b = subword_tokens_b, is_correct = label, max_seq_length = self.max_seq_length)
                    bert_feature = InputFeatures.convert_one_example_to_features_pretraining(
                                example = bert_example,
                                tokenizer=self.tokenizer,
                                probability = 0.15)'''
                bert_feature.insert_field_into_dict(sample)
            else:
                item = iminfo
                subword_tokens = self.tokenizer.tokenize(" ".join(
                    item['question_tokens']))
                if self.no_next_sentence:
                    # We will use the last ([MASK]) position to do prediction.
                    subword_tokens = subword_tokens + ["?", "[MASK]"]
                    subwords_b = None
                else:
                    subword_tokens = subword_tokens + ["?"]
                    subwords_b = ["[MASK]"]
                bert_example = InputExample(unique_id=-1,
                                            text_a=subword_tokens,
                                            text_b=subwords_b,
                                            max_seq_length=self.max_seq_length)
                bert_feature = InputFeatures.convert_one_example_to_features(
                    bert_example, tokenizer=self.tokenizer)
                bert_feature.insert_field_into_dict(sample)

        if answer is not None:
            sample['label'] = ArrayField(np.array(answer_scores))

        return Instance(sample)
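
In the advanced_vqa branch, the answer is generated rather than classified: one [MASK] is appended per answer subword and its LM label is set to that subword's vocabulary id, with -1 everywhere else. A self-contained sketch of the construction, using a toy vocabulary:

def build_masked_answer_inputs(question_tokens, answer_tokens, vocab):
    tokens = ["[CLS]"] + list(question_tokens) + ["?"]
    lm_labels = [-1] * len(tokens)          # -1 means "not predicted"
    for tok in answer_tokens:
        tokens.append("[MASK]")             # model must fill this position
        lm_labels.append(vocab[tok])        # ...with the answer subword id
    tokens.append("[SEP]")
    lm_labels.append(-1)
    input_ids = [vocab[t] for t in tokens]
    return tokens, input_ids, lm_labels

toy_vocab = {"[CLS]": 0, "[SEP]": 1, "[MASK]": 2, "?": 3, "what": 4, "red": 5}
print(build_masked_answer_inputs(["what"], ["red"], toy_vocab))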
Example #6
    def __getimage_detector__(self, image_file_path, metadata):
        sample = {}
        ###################################################################
        # Largely adapted from the VCR data loader.
        # Load the image and rescale it; mean subtraction and normalization
        # happen in to_tensor_and_normalize below.
        if '.npz' in image_file_path:
            image_file_path = os.path.splitext(image_file_path)[0]

        image = load_image(image_file_path)
        image, window, img_scale, padding = resize_image(
            image, random_pad=self.is_train)
        image = to_tensor_and_normalize(image)
        c, h, w = image.shape
        ###################################################################
        # We will use all detections
        dets2use = np.arange(len(metadata['cls_boxes']))
        # [nobj, 14, 14]
        #segms = np.stack([make_mask(mask_size=14, box=metadata['cls_boxes'][i],
        #                            polygons_list=metadata['segms'][i]) for i in dets2use])

        boxes = np.array(metadata['cls_boxes'])
        # Map detector boxes into the resized, padded image's coordinates.
        boxes /= img_scale

        # Shift both corners by the (left, top) padding offset.
        boxes[:, :2] += np.array(padding[:2])[None]
        boxes[:, 2:] += np.array(padding[:2])[None]
        """
        try:
            metadata['names'] = [i.split(" ")[1][1:-1] for i in metadata["names"]]
        except:
            pass
        obj_labels = [self.coco_obj_to_ind[metadata['names'][i]] for i in dets2use.tolist()]
        """

        obj_labels = metadata['objects']
        # Keep only boxes with a real (non-background) object label.
        keep_boxes = np.where(obj_labels > 0)

        boxes = boxes[keep_boxes]
        # Prepend label 0 for the whole-image window box stacked on below.
        obj_labels = [0] + [int(a) for a in obj_labels[keep_boxes]]

        boxes = np.row_stack((window, boxes))
        #segms = np.concatenate((np.ones((1, 14, 14), dtype=np.float32), segms), 0)

        #sample['segms'] = ArrayField(segms, padding_value=0)
        #sample['objects'] = ListField([LabelField(x, skip_indexing=True) for x in obj_labels])
        sample["objects"] = IntArrayField(np.array(obj_labels))

        if not np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2])):
            raise ValueError("invalid boxes: expected 0 <= x1 < x2 for every box")

        # If any box spills past the image bounds, shrink all boxes to fit.
        if np.amax(boxes[:, 2]) >= w or np.amax(boxes[:, 3]) >= h:
            scale_w = (w - 1) / np.amax(boxes[:, 2])
            scale_h = (h - 1) / np.amax(boxes[:, 3])
            boxes *= min(scale_w, scale_h)

        assert np.all((boxes[:, 1] >= 0.) & (boxes[:, 1] < boxes[:, 3]))
        assert np.all((boxes[:, 0] >= 0.) & (boxes[:, 0] < boxes[:, 2]))
        assert np.all((boxes[:, 2] <= w))
        assert np.all((boxes[:, 3] <= h))
        sample['boxes'] = torch.Tensor(boxes)

        return image, sample
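
The detector boxes above are mapped into the resized, padded image frame by dividing by img_scale and shifting both corners by the left/top padding. The same transform as a standalone function, assuming padding is ordered (left, top, right, bottom); the function name is ours:

import numpy as np

def transform_boxes(cls_boxes, img_scale, padding):
    # cls_boxes: [N, 4] detector boxes (x1, y1, x2, y2).
    boxes = np.array(cls_boxes, dtype=np.float64) / img_scale
    offset = np.array(padding[:2], dtype=np.float64)
    boxes[:, :2] += offset  # shift (x1, y1)
    boxes[:, 2:] += offset  # shift (x2, y2)
    return boxes

# A 100x100 box, image downscaled by 2x then padded 10px on the left/top.
print(transform_boxes([[0., 0., 100., 100.]], img_scale=2.0, padding=(10, 10, 0, 0)))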