def load_item(self, idx):
    """Assemble the sample for the annotation stored at ``idx``.

    The sentence is run through ``self.text_processor``; when features are
    enabled, one nested ``Sample`` per image (``img0`` and ``img1``) is
    attached. ``targets`` is 1 when the annotation label equals the string
    ``"True"`` and 0 otherwise.

    Args:
        idx: Index into ``self.annotation_db`` / ``self.features_db``.

    Returns:
        The populated ``Sample``.
    """
    annotation = self.annotation_db[idx]
    sample = Sample()

    text_output = self.text_processor({"text": annotation["sentence"]})
    sample.text = text_output["text"]
    if "input_ids" in text_output:
        sample.update(text_output)

    if self._use_features is True:
        # Strip the trailing "-<sentence id>" part of the identifier.
        identifier = "-".join(annotation["identifier"].split("-")[:-1])

        image_slots = (
            ("img0", "{}-img0.npy"),
            ("img1", "{}-img1.npy"),
        )
        for slot, path_template in image_slots:
            # The annotation is pointed at this image's feature file before
            # the lookup; presumably features_db[idx] consults the
            # "feature_path" just written -- TODO confirm.
            annotation["feature_path"] = path_template.format(identifier)
            features = self.features_db[idx]
            if hasattr(self, "transformer_bbox_processor"):
                features["image_info_0"] = self.transformer_bbox_processor(
                    features["image_info_0"]
                )
            setattr(sample, slot, Sample())
            getattr(sample, slot).update(features)

    label_value = int(annotation["label"] == "True")
    sample.targets = torch.tensor(label_value, dtype=torch.long)

    return sample
def __getitem__(self, idx):
    """Return the sample for ``idx`` with either features or a raw image.

    With ``self._use_features`` truthy the entry of ``self.features_db`` is
    merged into the sample (optionally preceded by ``image_labels`` produced
    by ``self.masked_region_processor``); otherwise the image is read via
    ``self.image_db.from_path``. The masked question and, when
    ``self._add_answer`` is set, the answer information are added afterwards.
    """
    annotation = self.annotation_db[idx]
    sample = Sample()

    if not self._use_features:
        file_name = str(annotation["image_name"]) + ".jpg"
        loaded = self.image_db.from_path(file_name)
        sample.image = loaded["images"][0]
    else:
        features = self.features_db[idx]
        if hasattr(self, "transformer_bbox_processor"):
            bbox_info = self.transformer_bbox_processor(features["image_info_0"])
            features["image_info_0"] = bbox_info

        if self.config.get("use_image_feature_masks", False):
            region_labels = self.masked_region_processor(
                features["image_feature_0"]
            )
            sample.update({"image_labels": region_labels})

        sample.update(features)

    sample = self._add_masked_question(annotation, sample)
    if self._add_answer:
        sample = self.add_answer_info(annotation, sample)
    return sample
def __getitem__(self, idx: int) -> Sample:
    """Return the masked-caption sample for the annotation at ``idx``.

    The caption is passed to ``self.masked_token_processor`` as ``text_a``
    with an empty ``text_b``. ``image_id`` and ``feature_path`` are copied
    from the annotation. When features are enabled, an ``image_id`` entry in
    ``image_info_0`` is renamed to ``feature_path`` before the features are
    merged into the sample.
    """
    annotation = self.annotation_db[idx]
    sample = Sample()

    caption_input = {
        "text_a": annotation["caption"],
        "text_b": "",
        "is_correct": True,
    }
    sample.update(self.masked_token_processor(caption_input))
    sample.image_id = annotation["image_id"]
    sample.feature_path = annotation["feature_path"]

    if not self._use_features:
        return sample

    # Attach the image features.
    features = self.features_db[idx]
    info = features["image_info_0"]
    if info and "image_id" in info.keys():
        # Move the value from "image_id" to "feature_path".
        info["feature_path"] = info.pop("image_id")
    sample.update(features)

    return sample
def __getitem__(self, idx):
    """Return the sample for ``idx`` with pruned image-info entries.

    ``question_id`` is stored as an int tensor and an integer ``image_id``
    is converted to ``str``. After features, sample details and answer info
    are added, every key except ``"max_features"`` is removed from
    ``image_info_0`` and ``image_info_1`` when those attributes exist.
    """
    annotation = self.preprocess_sample_info(self.annotation_db[idx])
    sample = Sample()

    # Breaking change from VQA2Dataset: question_id is loaded here.
    sample.question_id = torch.tensor(annotation["question_id"], dtype=torch.int)

    raw_image_id = annotation["image_id"]
    sample.image_id = (
        str(raw_image_id) if isinstance(raw_image_id, int) else raw_image_id
    )

    if self._use_features is True:
        sample.update(self.features_db[idx])

    sample = self.add_sample_details(annotation, sample)
    sample = self.add_answer_info(annotation, sample)

    # Only "max_features" is needed downstream; dropping the remaining keys
    # keeps data loading overhead low.
    for info_name in ("image_info_0", "image_info_1"):
        if not hasattr(sample, info_name):
            continue
        info = getattr(sample, info_name)
        removable = [key for key in list(info) if key != "max_features"]
        for key in removable:
            info.pop(key)

    return sample
def __getitem__(self, idx):
    """Return the masked-question sample for the annotation at ``idx``.

    The annotation goes through ``self._preprocess_answer`` and its ``"id"``
    is copied to ``"question_id"``. Features (and optional ``image_labels``
    from ``self.masked_region_processor``) are merged in when
    ``self._use_features is True``; the masked question and, if
    ``self._add_answer`` is set, the answer information follow.
    """
    annotation = self._preprocess_answer(self.annotation_db[idx])
    annotation["question_id"] = annotation["id"]

    sample = Sample()

    if self._use_features is True:
        features = self.features_db[idx]

        if hasattr(self, "transformer_bbox_processor"):
            bbox_info = self.transformer_bbox_processor(features["image_info_0"])
            features["image_info_0"] = bbox_info

        if self.config.get("use_image_feature_masks", False):
            region_labels = self.masked_region_processor(
                features["image_feature_0"]
            )
            sample.update({"image_labels": region_labels})

        sample.update(features)

    sample = self._add_masked_question(annotation, sample)
    if self._add_answer:
        sample = self.add_answer_info(annotation, sample)
    return sample
def load_item(self, idx):
    """Build the caption sample for the annotation at ``idx``.

    Outside the ``"test"`` dataset type the caption tokens are processed
    into ``text`` and ``caption_id`` / ``caption_len`` are stored as int
    tensors. ``image_id`` is encoded with ``object_to_byte_tensor``. The
    sample then receives either the stored features or the ``.jpg`` image,
    followed by the reference captions.
    """
    annotation = self.preprocess_sample_info(self.annotation_db[idx])
    sample = Sample()

    if self._dataset_type != "test":
        caption_tokens = annotation["caption_tokens"]
        caption_output = self.text_processor({"tokens": caption_tokens})
        sample.text = caption_output["text"]
        sample.caption_id = torch.tensor(annotation["caption_id"], dtype=torch.int)
        sample.caption_len = torch.tensor(len(caption_tokens), dtype=torch.int)

    sample.image_id = object_to_byte_tensor(annotation["image_id"])

    if not self._use_features:
        file_name = str(annotation["image_name"]) + ".jpg"
        sample.image = self.image_db.from_path(file_name)["images"][0]
    else:
        sample.update(self.features_db[idx])

    # Attach the reference captions to the sample.
    sample = self.add_reference_caption(annotation, sample)

    return sample
def load_item(self, idx):
    """Build the sample for the annotation at ``idx``.

    ``sentence2`` is run through ``self.text_processor``. When features are
    enabled, the annotation's ``feature_path`` is set from ``Flikr30kID``
    before the features are fetched and merged. ``targets`` holds the
    ``gold_label`` mapped through ``LABEL_TO_INT_MAPPING`` as a long tensor.
    """
    annotation = self.annotation_db[idx]
    sample = Sample()

    text_output = self.text_processor({"text": annotation["sentence2"]})
    sample.text = text_output["text"]
    if "input_ids" in text_output:
        sample.update(text_output)

    if self._use_features is True:
        # Keep only the part of the id before the first "."
        identifier = annotation["Flikr30kID"].split(".")[0]
        # Point the annotation at the feature file for this image;
        # presumably features_db[idx] reads it back -- TODO confirm.
        annotation["feature_path"] = "{}.npy".format(identifier)

        features = self.features_db[idx]
        if hasattr(self, "transformer_bbox_processor"):
            bbox_info = self.transformer_bbox_processor(features["image_info_0"])
            features["image_info_0"] = bbox_info
        sample.update(features)

    label_index = LABEL_TO_INT_MAPPING[annotation["gold_label"]]
    sample.targets = torch.tensor(label_index, dtype=torch.long)

    return sample
def __getitem__(self, idx):
    """Return the question sample for the annotation at ``idx``.

    ``question_str`` is processed into ``text``; ``question_id`` is stored
    as an int tensor, and ``image_id`` is stored as an int tensor only when
    it is an ``int``. Features are merged in when enabled, and the answer
    information is added last.
    """
    annotation = self.annotation_db[idx]
    sample = Sample()

    question_output = self.text_processor({"text": annotation["question_str"]})
    sample.text = question_output["text"]
    if "input_ids" in question_output:
        sample.update(question_output)

    sample.question_id = torch.tensor(annotation["question_id"], dtype=torch.int)

    raw_image_id = annotation["image_id"]
    sample.image_id = (
        torch.tensor(raw_image_id, dtype=torch.int)
        if isinstance(raw_image_id, int)
        else raw_image_id
    )

    if self._use_features is True:
        features = self.features_db[idx]
        if hasattr(self, "transformer_bbox_processor"):
            bbox_info = self.transformer_bbox_processor(features["image_info_0"])
            features["image_info_0"] = bbox_info
        sample.update(features)

    # With soft copy in use this may add a dynamic answer space.
    sample = self.add_answer_info(annotation, sample)
    return sample
def __getitem__(self, idx):
    """Return the feature-based sample for the annotation at ``idx``.

    The annotation's ``text`` is processed, its ``id`` is stored as an int
    tensor, and features are fetched with ``self.features_db.get`` using the
    preprocessed annotation. ``targets`` is set only when the annotation has
    a ``"label"`` entry.
    """
    annotation = self.preprocess_sample_info(self.annotation_db[idx])
    sample = Sample()

    text_output = self.text_processor({"text": annotation["text"]})
    sample.text = text_output["text"]
    if "input_ids" in text_output:
        sample.update(text_output)

    sample.id = torch.tensor(int(annotation["id"]), dtype=torch.int)

    # Features are looked up through the annotation rather than idx because
    # feature_path has been added to it dynamically.
    features = self.features_db.get(annotation)
    if hasattr(self, "transformer_bbox_processor"):
        bbox_info = self.transformer_bbox_processor(features["image_info_0"])
        features["image_info_0"] = bbox_info
    sample.update(features)

    if "label" in annotation:
        sample.targets = torch.tensor(annotation["label"], dtype=torch.long)

    return sample
def load_item(self, idx):
    """Build the question sample (with OCR and answer info) for ``idx``.

    When the annotation carries ``question_tokens`` they are passed to the
    text processor together with ``question_str`` and ``text_len`` is
    recorded; otherwise only ``question`` is processed. The sample receives
    either stored features or the ``.jpg`` image, then OCR details and
    answer information.
    """
    annotation = self.annotation_db[idx]
    sample = Sample()

    has_tokens = "question_tokens" in annotation
    if has_tokens:
        question_input = {
            "tokens": annotation["question_tokens"],
            "text": annotation["question_str"],
        }
    else:
        question_input = {"text": annotation["question"]}

    question_output = self.text_processor(question_input)
    sample.text = question_output["text"]
    if "input_ids" in question_output:
        sample.update(question_output)

    sample.question_id = torch.tensor(annotation["question_id"], dtype=torch.int)

    raw_image_id = annotation["image_id"]
    sample.image_id = (
        torch.tensor(raw_image_id, dtype=torch.int)
        if isinstance(raw_image_id, int)
        else raw_image_id
    )

    if has_tokens:
        token_count = len(annotation["question_tokens"])
        sample.text_len = torch.tensor(token_count, dtype=torch.int)

    if not self._use_features:
        file_name = annotation["image_name"] + ".jpg"
        sample.image = self.image_db.from_path(file_name)["images"][0]
    else:
        features = self.features_db[idx]
        if hasattr(self, "transformer_bbox_processor"):
            bbox_info = self.transformer_bbox_processor(features["image_info_0"])
            features["image_info_0"] = bbox_info
        sample.update(features)

    # OCR details such as OCR bbox, vectors and tokens are added here.
    sample = self.add_ocr_details(annotation, sample)

    # With soft copy in use this may add a dynamic answer space.
    sample = self.add_answer_info(annotation, sample)

    return sample
def test_sample_working(self):
    """Check attribute and item access on ``Sample``, including ``update``."""
    sample = Sample()
    sample.x = 1
    sample["y"] = 2

    # Values set either way must be readable both as attribute and as item.
    for key, expected in (("x", 1), ("y", 2)):
        self.assertEqual(getattr(sample, key), expected)
        self.assertEqual(sample[key], expected)

    sample.update({"a": 3, "b": {"c": 4}})

    self.assertEqual(sample.a, 3)
    self.assertEqual(sample["a"], 3)
    # The nested dict passed to update is reachable through attribute access.
    self.assertEqual(sample.b.c, 4)
    self.assertEqual(sample["b"].c, 4)
def classify(self, image: ImageType, text: str):
    """Classifies a given image and text in it into Hateful/Non-Hateful.

    Image can be a url or a local path or you can directly pass a
    PIL.Image.Image object. Text needs to be a sentence containing all
    text in the image.

        >>> from VisualBERT.mmf.models.mmbt import MMBT
        >>> model = MMBT.from_pretrained("mmbt.hateful_memes.images")
        >>> model.classify("some_url", "some_text")
        {"label": 0, "confidence": 0.56}

    Args:
        image (ImageType): Image to be classified
        text (str): Text in the image

    Returns:
        dict: ``{"label": ..., "confidence": ...}`` where ``label`` is the
        index of the highest softmax score (hateful (1) or non hateful (0))
        and ``confidence`` is that score.
    """
    if isinstance(image, str):
        if image.startswith("http"):
            # The context manager closes the temporary file even when the
            # download or the image loader raises, so it cannot leak.
            with tempfile.NamedTemporaryFile() as temp_file:
                download(
                    image, *os.path.split(temp_file.name), disable_tqdm=True
                )
                image = tv_helpers.default_loader(temp_file.name)
        else:
            image = tv_helpers.default_loader(image)

    text = self.processor_dict["text_processor"]({"text": text})
    image = self.processor_dict["image_processor"](image)

    sample = Sample()
    sample.text = text["text"]
    if "input_ids" in text:
        sample.update(text)

    sample.image = image
    sample_list = SampleList([sample])
    # Run the batch on whichever device holds the model parameters.
    device = next(self.model.parameters()).device
    sample_list = sample_list.to(device)

    output = self.model(sample_list)
    scores = nn.functional.softmax(output["scores"], dim=1)
    confidence, label = torch.max(scores, dim=1)

    return {"label": label.item(), "confidence": confidence.item()}
def __getitem__(self, idx: int) -> Sample:
    """Return the image/question sample for the annotation at ``idx``.

    The processed question is merged into the sample, ``id`` holds the
    question id as an int tensor, and ``image`` is the first image returned
    by ``self.image_db.from_path`` for the path built from the annotation's
    ``image_id`` and ``coco_split``. ``targets`` is set only when the
    annotation contains ``"answers"``.

    Args:
        idx: Index into ``self.annotation_db``.

    Returns:
        A ``Sample`` instance (the annotation previously claimed
        ``Type[Sample]``, i.e. the class itself, which the code never
        returns).
    """
    sample_info = self.annotation_db[idx]
    current_sample = Sample()

    processed_question = self.text_processor({"text": sample_info["question"]})
    current_sample.update(processed_question)
    current_sample.id = torch.tensor(
        int(sample_info["question_id"]), dtype=torch.int
    )

    image_path = self.get_image_path(
        sample_info["image_id"], sample_info["coco_split"]
    )
    current_sample.image = self.image_db.from_path(image_path)["images"][0]

    if "answers" in sample_info:
        answers = self.answer_processor({"answers": sample_info["answers"]})
        current_sample.targets = answers["answers_scores"]

    return current_sample
def __call__(self, item):
    """Tokenize one or several texts and return them as a stacked dict.

    ``item["text"]`` may be a single text or a list of texts. Each text is
    tokenized separately and its ``segment_ids`` are filled in place with
    the text's position in the list. With the ``"concat"`` fusion strategy
    the stacked ``input_ids``, ``input_mask``, ``segment_ids`` and
    ``lm_label_ids`` are flattened with ``view(-1)``.
    """
    sentences = item["text"]
    if not isinstance(sentences, list):
        sentences = [sentences]

    def _encode(position, sentence):
        # One Sample per text, tagged with its position as segment id.
        encoded = Sample()
        encoded.update(self.tokenizer({"text": sentence}))
        encoded.segment_ids.fill_(position)
        return encoded

    encoded_samples = [
        _encode(position, sentence) for position, sentence in enumerate(sentences)
    ]
    # SampleList turns the list of per-text tensors into stacked tensors.
    batch = SampleList(encoded_samples)

    if self.fusion_strategy == "concat":
        for field in ("input_ids", "input_mask", "segment_ids", "lm_label_ids"):
            setattr(batch, field, getattr(batch, field).view(-1))

    return batch.to_dict()
def __getitem__(self, idx):
    """Return the plot/image sample for the annotation at ``idx``.

    When ``plot`` is a list only its first element is processed. The image
    is attached when ``self._use_images is True``. ``answers`` and
    ``targets`` come from ``self.answer_processor`` applied to ``genres``.
    """
    annotation = self.annotation_db[idx]
    sample = Sample()

    plot_text = annotation["plot"]
    if isinstance(plot_text, list):
        plot_text = plot_text[0]

    text_output = self.text_processor({"text": plot_text})
    sample.text = text_output["text"]
    if "input_ids" in text_output:
        sample.update(text_output)

    if self._use_images is True:
        image_entry = self.image_db[idx]
        sample.image = image_entry["images"][0]

    genre_output = self.answer_processor({"answers": annotation["genres"]})
    sample.answers = genre_output["answers"]
    sample.targets = genre_output["answers_scores"]

    return sample
def __getitem__(self, idx):
    """Return the text/image sample for the annotation at ``idx``.

    The annotation's ``text`` is processed, its ``id`` is stored as an int
    tensor, and ``image`` is the first image of ``self.image_db[idx]``.
    ``targets`` is set only when the annotation has a ``"label"`` entry.
    """
    annotation = self.annotation_db[idx]
    sample = Sample()

    text_output = self.text_processor({"text": annotation["text"]})
    sample.text = text_output["text"]
    if "input_ids" in text_output:
        sample.update(text_output)

    sample.id = torch.tensor(int(annotation["id"]), dtype=torch.int)

    # image_db returns a set of images; only the first one is used.
    image_entry = self.image_db[idx]
    sample.image = image_entry["images"][0]

    if "label" in annotation:
        sample.targets = torch.tensor(annotation["label"], dtype=torch.long)

    return sample
def __getitem__(self, idx):
    """Return the plot/features sample for the annotation at ``idx``.

    When ``plot`` is a list only its first element is processed. Features
    are merged in when ``self._use_features is True``. ``answers`` and
    ``targets`` come from ``self.answer_processor`` applied to ``genres``.
    """
    annotation = self.annotation_db[idx]
    sample = Sample()

    plot_text = annotation["plot"]
    if isinstance(plot_text, list):
        plot_text = plot_text[0]

    text_output = self.text_processor({"text": plot_text})
    sample.text = text_output["text"]
    if "input_ids" in text_output:
        sample.update(text_output)

    if self._use_features is True:
        features = self.features_db[idx]
        if hasattr(self, "transformer_bbox_processor"):
            bbox_info = self.transformer_bbox_processor(features["image_info_0"])
            features["image_info_0"] = bbox_info
        sample.update(features)

    genre_output = self.answer_processor({"answers": annotation["genres"]})
    sample.answers = genre_output["answers"]
    sample.targets = genre_output["answers_scores"]

    return sample
def load_item(self, idx):
    """Build the caption sample for the annotation at ``idx``.

    Only the first entry of ``captions`` is processed. ``caption_len`` is
    the length of the processed text as an int tensor, ``image_id`` is
    stored as an int tensor only when it is an ``int``, and ``answers`` is
    the processed caption stacked into a one-element batch.
    """
    annotation = self.annotation_db[idx]
    sample = Sample()

    caption_output = self.text_processor({"text": annotation["captions"][0]})
    caption_text = caption_output["text"]

    sample.text = caption_text
    sample.caption_len = torch.tensor(len(caption_text), dtype=torch.int)

    raw_image_id = annotation["image_id"]
    sample.image_id = (
        torch.tensor(raw_image_id, dtype=torch.int)
        if isinstance(raw_image_id, int)
        else raw_image_id
    )

    if self._use_features is True:
        sample.update(self.features_db[idx])

    sample.answers = torch.stack([caption_output["text"]])

    return sample