Example #1
    def add_answer_info(self, sample_info, sample):
        sample_has_caption = "caption_str" in sample_info
        if sample_has_caption:
            sample_info["answers"] = [sample_info["caption_str"]]

        sample = super().add_answer_info(sample_info, sample)

        if sample_has_caption:
            sample.caption_str = object_to_byte_tensor(sample_info["caption_str"])
            sample.ref_strs = object_to_byte_tensor(sample_info["reference_strs"])
            sample.pop("answers")

        return sample
Example #2
    def load_item(self, idx):
        sample_info = self.annotation_db[idx]
        sample_info = self.preprocess_sample_info(sample_info)
        current_sample = Sample()

        if self._dataset_type != "test":
            text_processor_argument = {"tokens": sample_info["caption_tokens"]}
            processed_caption = self.text_processor(text_processor_argument)
            current_sample.text = processed_caption["text"]
            current_sample.caption_id = torch.tensor(
                sample_info["caption_id"], dtype=torch.int
            )
            current_sample.caption_len = torch.tensor(
                len(sample_info["caption_tokens"]), dtype=torch.int
            )

        if isinstance(sample_info["image_id"], int):
            current_sample.image_id = torch.tensor(
                sample_info["image_id"], dtype=torch.int
            )
        else:
            current_sample.image_id = object_to_byte_tensor(
                sample_info["image_id"])

        if self._use_features:
            features = self.features_db[idx]
            current_sample.update(features)

        # Add reference captions to sample
        current_sample = self.add_reference_caption(sample_info,
                                                    current_sample)

        return current_sample
Example #3
    def __getitem__(self, idx):
        img, target = self.coco_dataset[idx]
        image_id = self.coco_dataset.ids[idx]
        target = {"image_id": image_id, "annotations": target}
        img, target = self._load_coco_annotations(
            img, target, load_attributes=self.config.load_attributes)
        transform_out = self.detection_image_and_target_processor({
            "img": img,
            "target": target,
            "dataset_type": self._dataset_type,
        })
        img = transform_out["img"]
        target = transform_out["target"]

        current_sample = Sample()
        current_sample.image_id = torch.tensor(image_id, dtype=torch.long)
        current_sample.image = img
        current_sample.targets_enc = object_to_byte_tensor(
            target, max_size=self.config.max_target_enc_size)
        current_sample.orig_size = target["orig_size"].clone().detach()

        return current_sample
Example #4
    def __getitem__(self, idx):
        if len(self.video_clips) == 0:
            self.load_df()
        video, audio, info = self.video_clips.get_clip(idx)
        text = self.text_list[idx]
        actual_idx = self.ids_list[idx]
        label = [
            self.class_to_idx[class_name] for class_name in self.labels[idx]
        ]
        one_hot_label = torch.zeros(len(self.class_to_idx))
        one_hot_label[label] = 1

        if self.video_processor is not None:
            video = self.video_processor(video)

        if self.audio_processor is not None:
            audio = self.audio_processor(audio)

        sample = Sample()
        sample.id = object_to_byte_tensor(actual_idx)
        sample.video = video
        sample.audio = audio
        sample.update(self.text_processor({"text": text}))
        sample.targets = one_hot_label
        return sample
Example #5
    def load_item(self, idx):
        sample_info = self.annotation_db[idx]
        sample_info = self.preprocess_sample_info(sample_info)
        current_sample = Sample()

        if self._dataset_type != "test":
            text_processor_argument = {"tokens": sample_info["caption_tokens"]}
            processed_caption = self.text_processor(text_processor_argument)
            current_sample.text = processed_caption["text"]
            current_sample.caption_id = torch.tensor(
                sample_info["caption_id"], dtype=torch.int
            )
            current_sample.caption_len = torch.tensor(
                len(sample_info["caption_tokens"]), dtype=torch.int
            )

        current_sample.image_id = object_to_byte_tensor(
            sample_info["image_id"])

        if self._use_features:
            features = self.features_db[idx]
            current_sample.update(features)
        else:
            image_path = str(sample_info["image_name"]) + ".jpg"
            current_sample.image = self.image_db.from_path(
                image_path)["images"][0]

        # Add reference captions to sample
        current_sample = self.add_reference_caption(sample_info,
                                                    current_sample)

        return current_sample
Example #6
    def __getitem__(self, idx: int) -> Sample:
        sample_info = self.annotation_db[idx]
        current_sample = Sample()

        # Get the image features
        if self._use_features:
            features = self.features_db[idx]
            image_info_0 = features["image_info_0"]
            if image_info_0 and "image_id" in image_info_0.keys():
                image_info_0["feature_path"] = image_info_0["image_id"]
                image_info_0.pop("image_id")
            self.transformer_bbox_processor(features["image_info_0"])
            current_sample.update(features)
        elif self._use_images:
            image_id = sample_info["image_id"]
            dataset = sample_info["dataset_id"]
            if "mscoco" in dataset:
                image_id = image_id.rjust(12, "0")

            assert (
                len(self.image_db.from_path(image_id)["images"]) != 0
            ), f"image id: {image_id} not found"
            current_sample.image = self.image_db.from_path(image_id)["images"][0]

        processed_caption = self.caption_processor({
            "timed_caption": sample_info["timed_caption"],
            "bbox_attend_scores": image_info_0["bbox_attend_scores"],
        })
        # should be a trace enhanced processor
        current_sample.update(processed_caption)
        processed_traces = self.trace_bbox_processor(
            image_info_0,
            sample_info,
            processed_caption.get("sync_reverse", False),
            processed_caption.get("sync_shuffle_order", None),
        )
        current_sample.update(processed_traces)
        current_sample.image_id = object_to_byte_tensor(
            sample_info["image_id"])
        current_sample.feature_path = sample_info["feature_path"]

        return current_sample
Example #7
    def add_answer_info(self, sample_info, sample):
        # Load real answers from sample_info
        answers = sample_info.get("answers", [])
        answer_processor_arg = {"answers": answers}

        answer_processor_arg["tokens"] = sample.pop("ocr_tokens", [])

        processed_answers = self.answer_processor(answer_processor_arg)

        assert not self.config.fast_read, (
            "In TextVQADataset, online OCR sampling is incompatible "
            "with fast_read, so fast_read is currently not supported.")

        sample.update(processed_answers)
        sample.answers = object_to_byte_tensor(answers)

        if "answers_scores" in sample:
            sample.targets = sample.pop("answers_scores")

        return sample
Example #8
    def add_sample_details(self, sample_info, sample):
        sample.image_id = object_to_byte_tensor(sample.image_id)

        # 1. Load text (question words)
        question_str = (sample_info["question"] if "question" in sample_info
                        else sample_info["question_str"])
        text_processor_args = {"text": question_str}

        if "question_tokens" in sample_info:
            text_processor_args["tokens"] = sample_info["question_tokens"]

        processed_question = self.text_processor(text_processor_args)

        if "input_ids" in processed_question:
            sample.text = processed_question["input_ids"]
            sample.text_len = torch.tensor(len(processed_question["tokens"]),
                                           dtype=torch.long)
        else:
            # For GLoVe based processors
            sample.text = processed_question["text"]
            sample.text_len = processed_question["length"]

        # 2. Load object
        # object bounding box information
        if "obj_normalized_boxes" in sample_info and hasattr(
                self, "copy_processor"):
            sample.obj_bbox_coordinates = self.copy_processor(
                {"blob": sample_info["obj_normalized_boxes"]})["blob"]

        # 3. Load OCR
        if not self.use_ocr:
            # remove all OCRs from the sample
            # (i.e. make an empty OCR list)
            sample_info["ocr_tokens"] = []
            sample_info["ocr_info"] = []
            if "ocr_normalized_boxes" in sample_info:
                sample_info["ocr_normalized_boxes"] = np.zeros((0, 4),
                                                               np.float32)
            # clear OCR visual features
            if "image_feature_1" in sample:
                sample.image_feature_1 = torch.zeros_like(
                    sample.image_feature_1)
            return sample

        # Preprocess OCR tokens
        if hasattr(self, "ocr_token_processor"):
            ocr_tokens = [
                self.ocr_token_processor({"text": token})["text"]
                for token in sample_info["ocr_tokens"]
            ]
        else:
            ocr_tokens = sample_info["ocr_tokens"]
        # Get FastText embeddings for OCR tokens
        context = self.context_processor({"tokens": ocr_tokens})
        sample.context = context["text"]
        sample.ocr_tokens = context["tokens"]

        sample.context_tokens = object_to_byte_tensor(context["tokens"])
        sample.context_feature_0 = context["text"]
        sample.context_info_0 = Sample()
        sample.context_info_0.max_features = context["length"]

        # Get PHOC embeddings for OCR tokens
        if hasattr(self, "phoc_processor"):
            context_phoc = self.phoc_processor({"tokens": ocr_tokens})
            sample.context_feature_1 = context_phoc["text"]
            sample.context_info_1 = Sample()
            sample.context_info_1.max_features = context_phoc["length"]

        # OCR order vectors
        if self.config.get("use_order_vectors", False):
            order_vectors = np.eye(len(sample.ocr_tokens), dtype=np.float32)
            order_vectors = torch.from_numpy(order_vectors)
            order_vectors[context["length"]:] = 0
            sample.order_vectors = order_vectors

        # OCR bounding box information
        if "ocr_normalized_boxes" in sample_info and hasattr(
                self, "copy_processor"):
            # New imdb format: OCR bounding boxes are already pre-computed
            max_len = self.config.processors.answer_processor.params.max_length
            sample.ocr_bbox_coordinates = self.copy_processor(
                {"blob": sample_info["ocr_normalized_boxes"]}
            )["blob"][:max_len]
        elif self.use_ocr_info and "ocr_info" in sample_info:
            # Old imdb format: OCR bounding boxes are computed on-the-fly
            # from ocr_info
            sample.ocr_bbox_coordinates = self.bbox_processor(
                {"info": sample_info["ocr_info"]})["bbox"].coordinates

        return sample
    def test_object_byte_tensor_conversion(self):
        test_obj = [1, "2", {3: 4}, [5]]
        test_obj_bytes = distributed.object_to_byte_tensor(test_obj)
        test_obj_dec = distributed.byte_tensor_to_object(test_obj_bytes)
        self.assertEqual(test_obj_dec, test_obj)
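The test above exercises the same round trip the dataset examples rely on. As a minimal standalone sketch, assuming the helpers are importable from mmf.utils.distributed (the module the test refers to as distributed), encoding and decoding an arbitrary picklable object looks like this:

    from mmf.utils.distributed import byte_tensor_to_object, object_to_byte_tensor

    # Encode a picklable object (here an illustrative caption string) into a
    # fixed-size uint8 tensor so it can pass through batch collation and
    # distributed gathers alongside regular tensors.
    encoded = object_to_byte_tensor("a man rides a horse on the beach")

    # Decode it back on the consumer side, e.g. in a metric or evaluation hook
    # that needs the original Python object rather than a tensor.
    decoded = byte_tensor_to_object(encoded)
    assert decoded == "a man rides a horse on the beach"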