def add_sample_details(self, sample_info, sample):
    # 1. Load text (question words)
    # breaking change from VQA2Dataset:
    # load the entire question string, not tokenized questions, since we
    # switch to BERT tokenizer in M4C and do online tokenization
    question_str = (
        sample_info['question'] if 'question' in sample_info
        else sample_info['question_str']
    )
    processed_question = self.text_processor({"question": question_str})
    sample.text = processed_question['token_inds']
    sample.text_len = processed_question['token_num']

    # 2. Load object
    # object bounding box information
    sample.obj_bbox_coordinates = self.copy_processor(
        {"blob": sample_info["obj_normalized_boxes"]}
    )["blob"]

    # 3. Load OCR
    assert self.use_ocr and self.use_ocr_info, \
        'use_ocr and use_ocr_info must be both True for M4CTextVQADataset'
    # Preprocess OCR tokens
    ocr_tokens = [
        self.ocr_token_processor({"text": token})["text"]
        for token in sample_info["ocr_tokens"]
    ]
    # Get FastText embeddings for OCR tokens
    context = self.context_processor({"tokens": ocr_tokens})
    sample.context = context["text"]
    sample.context_tokens = context["tokens"]
    sample.context_tokens_enc = enc_obj2bytes(context["tokens"])
    sample.context_feature_0 = context["text"]
    sample.context_info_0 = Sample()
    sample.context_info_0.max_features = context["length"]
    # Get PHOC embeddings for OCR tokens
    context_phoc = self.phoc_processor({"tokens": ocr_tokens})
    sample.context_feature_1 = context_phoc["text"]
    sample.context_info_1 = Sample()
    sample.context_info_1.max_features = context_phoc["length"]

    # OCR order vectors
    # TODO remove order_vectors -- it is no longer needed in M4C
    order_vectors = np.eye(len(sample.context_tokens), dtype=np.float32)
    order_vectors = torch.from_numpy(order_vectors)
    order_vectors[context["length"]:] = 0
    sample.order_vectors = order_vectors

    # OCR bounding box information
    if 'ocr_normalized_boxes' in sample_info:
        # New imdb format: OCR bounding boxes are already pre-computed
        max_len = self.config.processors.answer_processor.params.max_length
        sample.ocr_bbox_coordinates = self.copy_processor(
            {"blob": sample_info['ocr_normalized_boxes']}
        )["blob"][:max_len]
    else:
        # Old imdb format: OCR bounding boxes are computed on-the-fly
        # from ocr_info
        sample.ocr_bbox_coordinates = self.bbox_processor(
            {"info": sample_info["ocr_info"]}
        )["bbox"].coordinates

    # sample.iou_info = box_iou(sample.obj_bbox_coordinates,
    #                           sample.ocr_bbox_coordinates)

    return sample
def load_item(self, idx):
    sample_info = self.imdb[idx]
    current_sample = Sample()

    if self._dataset_type != "test":
        text_processor_argument = {"tokens": sample_info["caption_tokens"]}
        processed_caption = self.text_processor(text_processor_argument)
        current_sample.text = processed_caption["text"]
        current_sample.caption_id = torch.tensor(
            sample_info["caption_id"], dtype=torch.int
        )
        current_sample.caption_len = torch.tensor(
            len(sample_info["caption_tokens"]), dtype=torch.int
        )

    if isinstance(sample_info["image_id"], int):
        current_sample.image_id = torch.tensor(
            sample_info["image_id"], dtype=torch.int
        )
    else:
        current_sample.image_id = sample_info["image_id"]

    if self._use_features is True:
        features = self.features_db[idx]
        current_sample.update(features)

    # Add reference captions to sample
    current_sample = self.add_reference_caption(sample_info, current_sample)

    return current_sample
def load_item(self, idx):
    sample_info = self.imdb[idx]
    current_sample = Sample()

    text_processor_argument = {"tokens": sample_info["question_tokens"]}
    processed_question = self.text_processor(text_processor_argument)
    current_sample.text = processed_question["text"]
    current_sample.question_id = torch.tensor(
        sample_info["question_id"], dtype=torch.int
    )

    if isinstance(sample_info["image_id"], int):
        current_sample.image_id = torch.tensor(
            sample_info["image_id"], dtype=torch.int
        )
    else:
        current_sample.image_id = sample_info["image_id"]

    current_sample.text_len = torch.tensor(
        len(sample_info["question_tokens"]), dtype=torch.int
    )

    if self._use_features is True:
        features = self.features_db[idx]
        current_sample.update(features)

    # Add details for OCR like OCR bbox, vectors, tokens here
    current_sample = self.add_ocr_details(sample_info, current_sample)
    # Depending on whether we are using soft copy this can add
    # dynamic answer space
    current_sample = self.add_answer_info(sample_info, current_sample)

    return current_sample
def add_ocr_details(self, sample_info, sample):
    if self.use_ocr:
        # Preprocess OCR tokens
        ocr_tokens = [
            self.ocr_token_processor({"text": token})["text"]
            for token in sample_info["ocr_tokens"]
        ]
        # Get embeddings for tokens
        context = self.context_processor({"tokens": ocr_tokens})
        sample.context = context["text"]
        sample.context_tokens = context["tokens"]
        sample.context_feature_0 = context["text"]
        sample.context_info_0 = Sample()
        sample.context_info_0.max_features = context["length"]

        order_vectors = torch.eye(len(sample.context_tokens))
        order_vectors[context["length"]:] = 0
        sample.order_vectors = order_vectors

    if self.use_ocr_info and "ocr_info" in sample_info:
        sample.ocr_bbox = self.bbox_processor(
            {"info": sample_info["ocr_info"]}
        )["bbox"]

    return sample
def _load_objects(self, idx):
    image_info = self._get_image_info(idx)
    image_height = image_info["height"]
    image_width = image_info["width"]
    object_map = {}
    objects = []

    for obj in image_info["objects"]:
        obj["synsets"] = self.synset_processor({"tokens": obj["synsets"]})["text"]
        obj["names"] = self.name_processor({"tokens": obj["names"]})["text"]
        obj["height"] = obj["h"] / image_height
        obj.pop("h")
        obj["width"] = obj["w"] / image_width
        obj.pop("w")
        obj["y"] /= image_height
        obj["x"] /= image_width
        obj["attributes"] = self.attribute_processor(
            {"tokens": obj["attributes"]}
        )["text"]
        obj = Sample(obj)
        object_map[obj["object_id"]] = obj
        objects.append(obj)

    objects = SampleList(objects)

    return objects, object_map
def predict(self, img_paths, qud):
    """
    We enable batch prediction here.

    :return: (batch_probs, batch_answers), one list of top-5 entries per image
    """
    with torch.no_grad():
        detectron_features = self.get_detectron_features(
            img_paths)  # a list of image features
        resnet_features = self.get_resnet_features(
            img_paths)  # [batch_size, 196, 2048]

        sample_list = []
        for i in range(len(detectron_features)):
            sample = Sample()
            processed_text = self.vqa_demo.text_processor({"text": qud})
            sample.text = processed_text["text"]
            sample.text_len = len(processed_text["tokens"])

            sample.image_feature_0 = detectron_features[i]
            sample.image_info_0 = Sample(
                {"max_features": torch.tensor(100, dtype=torch.long)})
            sample.image_feature_1 = resnet_features[i]
            sample_list.append(sample)

        sample_list = SampleList(sample_list)
        sample_list = sample_list.to("cuda")

        scores = self.vqa_demo.pythia_model(sample_list)["scores"]
        scores = torch.nn.functional.softmax(scores, dim=1)
        actual, indices = scores.topk(5, dim=1)

        batch_probs = []
        batch_answers = []

        for i in range(scores.shape[0]):
            top_indices = indices[i]
            top_scores = actual[i]

            probs = []
            answers = []
            for idx, score in enumerate(top_scores):
                probs.append(score.item())
                answers.append(
                    self.vqa_demo.answer_processor.idx2word(
                        top_indices[idx].item()))

            batch_probs.append(probs)
            batch_answers.append(answers)

    # if memory becomes an issue, we can clear the cache here
    # gc.collect()
    # torch.cuda.empty_cache()

    # each list has batch_size entries, e.g.
    # [[ans_1, ans_2], [ans_1, ans_2]]
    return batch_probs, batch_answers
def load_item(self, idx):
    sample_info = self.imdb[idx]
    sample_info = self.preprocess_sample_info(sample_info)
    current_sample = Sample()

    # breaking change from VQA2Dataset: load question_id
    current_sample.question_id = torch.tensor(
        sample_info["question_id"], dtype=torch.int
    )

    if isinstance(sample_info["image_id"], int):
        current_sample.image_id = str(sample_info["image_id"])
    else:
        current_sample.image_id = sample_info["image_id"]

    if self._use_features is True:
        features = self.features_db[idx]
        current_sample.update(features)

    current_sample = self.add_sample_details(sample_info, current_sample)
    current_sample = self.add_answer_info(sample_info, current_sample)
    current_sample = self.add_anchor_graph(sample_info, current_sample)

    # only the 'max_features' key is needed
    # pop other keys to minimize data loading overhead
    for k in list(current_sample.image_info_0):
        if k != 'max_features':
            current_sample.image_info_0.pop(k)
    for k in list(current_sample.image_info_1):
        if k != 'max_features':
            current_sample.image_info_1.pop(k)

    return current_sample
def load_item(self, idx):
    sample_info = self.imdb[idx]
    current_sample = Sample()

    # breaking change from VQA2Dataset: load question_id
    current_sample.question_id = torch.tensor(
        sample_info["question_id"], dtype=torch.int
    )

    if isinstance(sample_info["image_id"], int):
        current_sample.image_id = torch.tensor(
            sample_info["image_id"], dtype=torch.int
        )
    else:
        current_sample.image_id = sample_info["image_id"]

    if self._use_features is True:
        features = self.features_db[idx]
        current_sample.update(features)

    current_sample = self.add_sample_details(sample_info, current_sample)
    current_sample = self.add_answer_info(sample_info, current_sample)

    current_sample['obj_ocr_edge_feat'] = torch.from_numpy(
        current_sample.image_info_2['obj_ocr_edge_feat']
    ).float()  # [100, 50, 5]
    current_sample['ocr_obj_edge_feat'] = torch.from_numpy(
        current_sample.image_info_2['ocr_obj_edge_feat']
    ).float()  # [50, 100, 5]

    # fall back to a placeholder when ground-truth answers are missing
    try:
        current_sample['gt_answers'] = sample_info['valid_answers']
    except KeyError:
        current_sample['gt_answers'] = ['valid_answers']

    return current_sample
def test_forward(self):
    model_config = self.config.model_attributes.cnn_lstm

    cnn_lstm = CNNLSTM(model_config)
    cnn_lstm.build()
    cnn_lstm.init_losses_and_metrics()

    self.assertTrue(isinstance(cnn_lstm, torch.nn.Module))

    test_sample = Sample()
    test_sample.text = torch.randint(1, 79, (10,), dtype=torch.long)
    test_sample.image = torch.randn(3, 320, 480)
    test_sample.targets = torch.randn(32)

    test_sample_list = SampleList([test_sample])
    test_sample_list.dataset_type = "train"
    test_sample_list.dataset_name = "clevr"

    output = cnn_lstm(test_sample_list)
    scores = output["scores"]
    loss = output["losses"]["train/clevr/logit_bce"]
    accuracy = output["metrics"]["train/clevr/accuracy"]

    np.testing.assert_almost_equal(loss.item(), 19.2635, decimal=4)
    np.testing.assert_almost_equal(accuracy.item(), 0)
    self.assertEqual(scores.size(), torch.Size((1, 32)))

    expected_scores = [
        -0.7598285675048828, -0.07029829174280167, -0.20382611453533173,
        -0.06990239024162292, 0.7965695858001709, 0.4730074405670166,
        -0.30569902062416077, 0.4244227707386017, 0.6511023044586182,
        0.2480515092611313, -0.5087617635726929, -0.7675772905349731,
        0.4361543357372284, 0.0018743239343166351, 0.6774630546569824,
        0.30618518590927124, -0.398895800113678, -0.13120117783546448,
        -0.4433199465274811, -0.25969570875167847, 0.6798790097236633,
        -0.34090861678123474, 0.0384102463722229, 0.2484571784734726,
        0.0456063412129879, -0.428459107875824, -0.026385333389043808,
        -0.1570669412612915, -0.2377825379371643, 0.3231588304042816,
        0.21098048985004425, -0.712349534034729
    ]

    np.testing.assert_almost_equal(scores[0].tolist(), expected_scores, decimal=5)
def add_ocr_details(self, sample_info, sample):
    assert self.use_ocr and self.use_ocr_info, \
        'use_ocr and use_ocr_info must be both True for Dataset'

    # Preprocess OCR tokens
    ocr_tokens = [
        self.ocr_token_processor({"text": token})["text"]
        for token in sample_info["ocr_tokens"]
    ]
    # Get FastText embeddings for tokens
    context = self.context_processor({"tokens": ocr_tokens})
    sample.context = context["text"]  # torch.Size([50, 300])
    sample.context_tokens = context["tokens"]
    sample.context_tokens_enc = enc_obj2bytes(context["tokens"])
    sample.context_feature_0 = context["text"]
    sample.context_info_0 = Sample()
    sample.context_info_0.max_features = context["length"]

    # Get PHOC embeddings for OCR tokens
    context_phoc = self.phoc_processor({"tokens": ocr_tokens})
    sample.context_phoc = context_phoc["text"]
    sample.context_info_phoc = Sample()
    sample.context_info_phoc.max_features = context_phoc["length"]

    # if 'ocr_normalized_boxes' in sample_info:
    #     max_len = self.config.processors.answer_processor.params.max_length
    #     sample.ocr_bbox = self.copy_processor(
    #         {"blob": sample_info['ocr_normalized_boxes']}
    #     )["blob"][:max_len]

    if "ocr_info" in sample_info:
        sample.ocr_bbox = self.bbox_processor({
            "info": sample_info["ocr_info"],
            "feats": context["text"],
            "img_id": sample.image_id,
            "obj_bbox": sample.obj_bbox
        })["bbox"]

    return sample
def predict(self, url, feat_name, get_features=False):
    with torch.no_grad():
        detectron_features = get_detectron_features(
            [url], self.detection_model, False, feat_name, self.cuda_device
        )
        # returns a single-element list
        detectron_features = detectron_features[0]

        sample = Sample()
        sample.dataset_name = "coco"
        sample.dataset_type = "test"
        sample.image_feature_0 = detectron_features
        sample.answers = torch.zeros((5, 10), dtype=torch.long)

        sample_list = SampleList([sample])
        sample_list = sample_list.to(self.cuda_device)

        tokens = self.caption_model(sample_list)["captions"]

    gc.collect()
    torch.cuda.empty_cache()

    if not get_features:
        return tokens
    else:
        return tokens, detectron_features
def test_nucleus_sampling(self):
    vocab = text_utils.VocabFromText(self.VOCAB_EXAMPLE_SENTENCES)

    model_config = self.config.model_attributes.butd
    model = TestDecoderModel(model_config, vocab)
    model.build()
    model.to("cuda")
    model.eval()

    sample = Sample()
    sample.dataset_name = "coco"
    sample.dataset_type = "test"
    sample.image_feature_0 = torch.randn(100, 2048)
    sample.answers = torch.zeros((5, 10), dtype=torch.long)

    sample_list = SampleList([sample])

    tokens = model(sample_list)["captions"]

    # these are expected tokens for sum_threshold = 0.5
    expected_tokens = [
        1.0000e+00, 2.9140e+03, 5.9210e+03, 2.2040e+03, 5.0550e+03,
        9.2240e+03, 4.5120e+03, 1.8200e+02, 3.6490e+03, 6.4090e+03,
        2.0000e+00
    ]

    self.assertEqual(tokens[0].tolist(), expected_tokens)
def build_bbox_tensors(infos, max_length):
    num_bbox = min(max_length, len(infos))

    # After num_bbox, everything else should be zero
    coord_tensor = torch.zeros((max_length, 4), dtype=torch.float)
    width_tensor = torch.zeros(max_length, dtype=torch.float)
    height_tensor = torch.zeros(max_length, dtype=torch.float)
    bbox_types = ["xyxy"] * max_length

    infos = infos[:num_bbox]
    sample = Sample()

    for idx, info in enumerate(infos):
        bbox = info["bounding_box"]
        x = bbox["top_left_x"]
        y = bbox["top_left_y"]
        width = bbox["width"]
        height = bbox["height"]

        coord_tensor[idx][0] = x
        coord_tensor[idx][1] = y
        coord_tensor[idx][2] = x + width
        coord_tensor[idx][3] = y + height

        width_tensor[idx] = width
        height_tensor[idx] = height

    sample.coordinates = coord_tensor
    sample.width = width_tensor
    sample.height = height_tensor
    sample.bbox_types = bbox_types

    return sample
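# Hedged usage sketch for build_bbox_tensors above. The layout of `infos` is
# inferred from how the function reads info["bounding_box"]; the values and the
# max_length of 50 are illustrative assumptions, not part of the original code.
# Assumes `Sample` (and therefore torch) is importable as in the snippets above.
def _example_build_bbox_tensors():
    infos = [
        {"bounding_box": {"top_left_x": 10.0, "top_left_y": 20.0,
                          "width": 30.0, "height": 40.0}},
        {"bounding_box": {"top_left_x": 0.0, "top_left_y": 5.0,
                          "width": 15.0, "height": 25.0}},
    ]
    bbox_sample = build_bbox_tensors(infos, max_length=50)
    # coordinates are xyxy; rows past len(infos) remain zero-padded
    print(bbox_sample.coordinates.shape)  # torch.Size([50, 4])
    print(bbox_sample.coordinates[0])     # tensor([10., 20., 40., 60.])
    print(bbox_sample.width[:2])          # tensor([30., 15.])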
def forward(self, images, image_scales, transitions=None):
    feature_list = self.encoder(images, image_scales)
    image_features = feature_list[0]
    assert len(feature_list) == 1, 'current model only supports batch size 1'

    sample = Sample()
    sample.dataset_name = "coco"
    sample.dataset_type = "test"
    sample.image_feature_0 = image_features
    # answers only act as a placeholder here,
    # so their size does not matter
    sample.answers = torch.zeros((1, 10), dtype=torch.long)

    sample_list = SampleList([sample])
    sample_list = sample_list.to(device)

    if transitions is not None:
        sample_list.transitions = transitions

    output = self.decoder(sample_list)
    tokens = output['captions']
    caption = tokens.tolist()[0]
    caption = self.decoder.caption_processor(caption)['caption']

    return caption
def test_caption_bleu4(self):
    path = os.path.join(
        os.path.abspath(__file__),
        "../../../pythia/common/defaults/configs/datasets/captioning/coco.yml",
    )
    with open(os.path.abspath(path)) as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    config = ConfigNode(config)
    captioning_config = config.dataset_attributes.coco
    caption_processor_config = captioning_config.processors.caption_processor

    vocab_path = os.path.join(
        os.path.abspath(__file__), "..", "..", "data", "vocab.txt"
    )
    caption_processor_config.params.vocab.vocab_file = os.path.abspath(vocab_path)
    caption_processor = CaptionProcessor(caption_processor_config.params)
    registry.register("coco_caption_processor", caption_processor)

    caption_bleu4 = metrics.CaptionBleu4Metric()
    expected = Sample()
    predicted = dict()

    # Test complete match
    expected.answers = torch.empty((5, 5, 10))
    expected.answers.fill_(4)
    predicted["scores"] = torch.zeros((5, 10, 19))
    predicted["scores"][:, :, 4] = 1.0

    self.assertEqual(caption_bleu4.calculate(expected, predicted).item(), 1.0)

    # Test partial match
    expected.answers = torch.empty((5, 5, 10))
    expected.answers.fill_(4)
    predicted["scores"] = torch.zeros((5, 10, 19))
    predicted["scores"][:, 0:5, 4] = 1.0

    self.assertAlmostEqual(
        caption_bleu4.calculate(expected, predicted).item(), 0.3928, 4
    )
def test_forward(self):
    model_config = self.config.model_attributes.cnn_lstm

    cnn_lstm = CNNLSTM(model_config)
    cnn_lstm.build()
    cnn_lstm.init_losses_and_metrics()

    self.assertTrue(isinstance(cnn_lstm, torch.nn.Module))

    test_sample = Sample()
    test_sample.text = torch.randint(1, 79, (10,), dtype=torch.long)
    test_sample.image = torch.randn(3, 320, 480)
    test_sample.targets = torch.randn(32)

    test_sample_list = SampleList([test_sample])
    test_sample_list.dataset_type = "train"
    test_sample_list.dataset_name = "clevr"

    output = cnn_lstm(test_sample_list)
    scores = output["scores"]
    loss = output["losses"]["train/logit_bce"]
    accuracy = output["metrics"]["train/accuracy"]

    np.testing.assert_almost_equal(loss.item(), 23.4751, decimal=4)
    np.testing.assert_almost_equal(accuracy.item(), 0)
    self.assertEqual(scores.size(), torch.Size((1, 32)))

    expected_scores = [
        2.2298e-02, -2.4975e-01, -1.1960e-01, -5.0868e-01, -9.3013e-02,
        1.3202e-02, -1.7536e-01, -3.1180e-01, 1.5369e-01, 1.4900e-01,
        1.9006e-01, -1.9457e-01, 1.4924e-02, -1.1032e-01, 1.3777e-01,
        -3.6255e-01, -2.9327e-01, 5.6247e-04, -4.8732e-01, 4.0949e-01,
        -1.1069e-01, 2.9696e-01, 4.1903e-02, 6.7062e-02, 7.0094e-01,
        -1.9898e-01, -2.9502e-03, -3.9040e-01, 1.2218e-01, 3.7895e-02,
        2.4472e-02, 1.7213e-01
    ]

    np.testing.assert_almost_equal(scores[0].tolist(), expected_scores, decimal=5)
def get_item(self, idx):
    data = self.vqamb_data[idx]

    current_sample = Sample()

    # store question and image id
    current_sample.img_id = data['id']
    current_sample.qa_id = data['qa_id']

    # process question
    question = data["question"]
    tokens = tokenize(question, remove=["?"], keep=["'s"])

    processed = self.text_processor({"tokens": tokens})
    current_sample.text = processed["text"]

    # process answers
    processed = self.answer_processor({"answers": [data['answer']]})
    current_sample.answers = processed["answers"]
    current_sample.targets = processed["answers_scores"][1:]  # remove unknown index

    # Detectron features ----------------
    # TODO: read in detectron image instead if detectron is to be built
    detectron_path = self.detectron_folder + str(data['id'])
    if self.config.spatial:
        point = data['point']
        # current_sample.point = point
        detectron_path += ',' + str(point['x']) + ',' + str(point['y'])
    detectron_path += '.pt'

    detectron_feat = torch.load(detectron_path, map_location=torch.device('cpu'))

    # Pad features to fixed length
    if self.config.pad_detectron:
        if detectron_feat.shape[0] > 100:
            detectron_feat = detectron_feat[:100]
        elif detectron_feat.shape[0] < 100:
            pad = torch.zeros(100 - detectron_feat.shape[0], detectron_feat.shape[1])
            detectron_feat = torch.cat([detectron_feat, pad], dim=0)

    current_sample.image_feature_0 = detectron_feat
    # ---------------------------------------------

    return current_sample
def predict(self, url, question):
    with torch.no_grad():
        detectron_features = self.get_detectron_features(url)
        resnet_features = self.get_resnet_features(url)

        sample = Sample()

        processed_text = self.text_processor({"text": question})
        sample.text = processed_text["text"]
        sample.text_len = len(processed_text["tokens"])

        sample.image_feature_0 = detectron_features
        sample.image_info_0 = Sample({
            "max_features": torch.tensor(100, dtype=torch.long)
        })

        sample.image_feature_1 = resnet_features

        sample_list = SampleList([sample])
        sample_list = sample_list.to("cuda")

        scores = self.pythia_model(sample_list)["scores"]
        scores = torch.nn.functional.softmax(scores, dim=1)
        actual, indices = scores.topk(5, dim=1)

        top_indices = indices[0]
        top_scores = actual[0]

        probs = []
        answers = []

        for idx, score in enumerate(top_scores):
            probs.append(score.item())
            answers.append(
                self.answer_processor.idx2word(top_indices[idx].item())
            )

    gc.collect()
    torch.cuda.empty_cache()

    return probs, answers
def load_item(self, idx):
    sample_info = self.imdb[idx]
    sample_info = self.preprocess_sample_info(sample_info)
    current_sample = Sample()

    # breaking change from VQA2Dataset: load question_id
    current_sample.question_id = torch.tensor(
        sample_info["question_id"], dtype=torch.int
    )

    if isinstance(sample_info["image_id"], int):
        current_sample.image_id = str(sample_info["image_id"])
    else:
        current_sample.image_id = sample_info["image_id"]

    if self._use_features is True:
        features = self.features_db[idx]
        current_sample.update(features)

    current_sample = self.add_sample_details(sample_info, current_sample)
    current_sample = self.add_answer_info(sample_info, current_sample)

    # only the 'max_features' key is needed
    # pop other keys to minimize data loading overhead
    for k in list(current_sample.image_info_0):
        if k != 'max_features':
            current_sample.image_info_0.pop(k)
    for k in list(current_sample.image_info_1):
        if k != 'max_features':
            current_sample.image_info_1.pop(k)

    overlap_flag = torch.zeros(150, 150)
    obj_obj_relation = self.compute_similarity_by_cosine(
        current_sample.image_feature_0, current_sample.image_feature_0
    )
    ocr_ocr_relation = self.compute_similarity_by_cosine(
        current_sample.context_feature_0, current_sample.context_feature_0
    )
    obj_ocr_relation = self.overlap(
        current_sample.obj_bbox_coordinates, current_sample.ocr_bbox_coordinates
    )
    overlap_flag[:100, :100] = obj_obj_relation
    overlap_flag[100:, 100:] = ocr_ocr_relation
    overlap_flag[:100, 100:] = obj_ocr_relation
    overlap_flag[100:, :100] = obj_ocr_relation.transpose(1, 0)
    current_sample.overlap_flag = overlap_flag

    return current_sample
def _load_regions(self, idx, object_map, relationship_map):
    if self._return_scene_graph is None:
        return None, None

    image_info = self._get_image_info(idx)
    image_height = image_info["height"]
    image_width = image_info["width"]
    region_map = {}
    regions = []

    for region in image_info["regions"]:
        for synset in region["synsets"]:
            synset["entity_name"] = self.name_processor(
                {"tokens": [synset["entity_name"]]}
            )["text"]
            synset["synset_name"] = self.synset_processor(
                {"tokens": [synset["synset_name"]]}
            )["text"]

        region["height"] /= image_height
        region["width"] /= image_width
        region["y"] /= image_height
        region["x"] /= image_width

        relationships = []
        objects = []

        for relationship_idx in region["relationships"]:
            relationships.append(relationship_map[relationship_idx])

        for object_idx in region["objects"]:
            objects.append(object_map[object_idx])

        region["relationships"] = relationships
        region["objects"] = objects
        region["phrase"] = self.text_processor({"text": region["phrase"]})["text"]

        region = Sample(region)
        region_map[region["region_id"]] = region
        regions.append(region)

    regions = SampleList(regions)

    return regions, region_map
def _load_relationships(self, idx, object_map):
    if self._return_relationships is None and self._return_scene_graph is None:
        return None, None

    image_info = self._get_image_info(idx)
    relationship_map = {}
    relationships = []

    for relationship in image_info["relationships"]:
        relationship["synsets"] = self.synset_processor(
            {"tokens": relationship["synsets"]}
        )["text"]
        relationship["predicate"] = self.predicate_processor(
            {"tokens": relationship["predicate"]}
        )["text"]
        relationship["object"] = object_map[relationship["object_id"]]
        relationship["subject"] = object_map[relationship["subject_id"]]

        relationship = Sample(relationship)
        relationship_map[relationship["relationship_id"]] = relationship
        relationships.append(relationship)

    relationships = SampleList(relationships)

    return relationships, relationship_map
def load_item(self, idx):
    sample_info = self.imdb[idx]
    current_sample = Sample()

    # breaking change from VQA2Dataset: load question_id
    current_sample.question_id = torch.tensor(
        sample_info["question_id"], dtype=torch.int
    )

    if isinstance(sample_info["image_id"], int):
        current_sample.image_id = torch.tensor(
            sample_info["image_id"], dtype=torch.int
        )
    else:
        current_sample.image_id = sample_info["image_id"]

    if self._use_features is True:
        features = self.features_db[idx]
        current_sample.update(features)

    current_sample = self.add_sample_details(sample_info, current_sample)
    current_sample = self.add_answer_info(sample_info, current_sample)

    return current_sample
def predict(self, url):
    with torch.no_grad():
        detectron_features = self.get_detectron_features(url)

        sample = Sample()
        sample.dataset_name = "coco"
        sample.dataset_type = "test"
        sample.image_feature_0 = detectron_features
        sample.answers = torch.zeros((5, 10), dtype=torch.long)

        sample_list = SampleList([sample])
        sample_list = sample_list.to("cuda")

        tokens = self.pythia_model(sample_list)["captions"]

    gc.collect()
    torch.cuda.empty_cache()

    return tokens
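# Hedged follow-up sketch: turning the caption token ids returned by predict()
# above into text. It reuses the processor registered under
# "coco_caption_processor" in the BLEU-4 test and the decode pattern from the
# forward() snippet; `demo` and the image URL are hypothetical, and the exact
# registry key is an assumption carried over from those snippets.
from pythia.common.registry import registry  # assumed package layout

def _example_decode_caption(demo, url="http://example.com/image.jpg"):
    tokens = demo.predict(url)
    caption_processor = registry.get("coco_caption_processor")
    caption = caption_processor(tokens.tolist()[0])["caption"]
    print(caption)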
def get_item(self, idx):
    data = self.questions[idx]

    # Each call to get_item from the dataloader returns a Sample class object,
    # which is collated by our special batch collator into a SampleList, which is
    # basically an attribute-based batch in layman's terms
    current_sample = Sample()

    question = data["question"]
    tokens = tokenize(question, keep=[";", ","], remove=["?", "."])
    processed = self.text_processor({"tokens": tokens})
    current_sample.text = processed["text"]

    processed = self.answer_processor({"answers": [data["answer"]]})
    current_sample.answers = processed["answers"]
    current_sample.targets = processed["answers_scores"]

    image_path = os.path.join(self.image_path, data["image_filename"])
    image = np.true_divide(Image.open(image_path).convert("RGB"), 255)
    image = image.astype(np.float32)
    current_sample.image = torch.from_numpy(image.transpose(2, 0, 1))

    return current_sample
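# Minimal sketch of the Sample -> SampleList collation described in the comment
# inside get_item above: two hand-built Samples are collated into a single
# attribute-based batch. The field values are illustrative, and the import path
# assumes the pythia package layout referenced elsewhere in this file.
import torch
from pythia.common.sample import Sample, SampleList  # assumed location

def _example_sample_collation():
    s1 = Sample()
    s1.text = torch.tensor([1, 2, 3], dtype=torch.long)
    s1.targets = torch.tensor([0.0, 1.0])

    s2 = Sample()
    s2.text = torch.tensor([4, 5, 6], dtype=torch.long)
    s2.targets = torch.tensor([1.0, 0.0])

    batch = SampleList([s1, s2])
    # fields with the same name are stacked along a new batch dimension
    print(batch.text.shape)     # torch.Size([2, 3])
    print(batch.targets.shape)  # torch.Size([2, 2])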
def load_item(self, idx):
    sample = Sample()
    image_id = self.annotations[idx][0]
    image_folder = image_id.split('_')[0]

    caption = self.annotations[idx][1]
    tokens = tokenize(caption)
    tokens = ['<s>'] + tokens + ['</s>']

    # use text_processor to process caption
    # pad sequence, convert tokens to indices and add SOS, EOS tokens
    # text_processor already contains a pre-processor to tokenize the caption
    caption_p = self.text_processor({'tokens': tokens})
    sample.text = caption_p['text']
    sample.caption_len = torch.tensor(len(tokens), dtype=torch.int)

    # sample.target = caption_p['text']
    sample.answers = torch.stack([caption_p['text']])

    # generate image features
    image_path = os.path.join(self.image_dir, image_folder, image_id)
    image, image_scale = self._image_transform(image_path)

    with torch.no_grad():
        image_features = self.feature_extractor([image], [image_scale])
    image_features = image_features[0]
    sample.image_feature_0 = image_features.cpu()

    return sample
def build_bbox_tensors(infos, max_length, feats, img_id, obj_bbox):
    # num of ocr bbox
    num_bbox = min(max_length, len(infos))

    # ocr bbox
    coord_tensor = torch.zeros((max_length, 4), dtype=torch.float)
    infos = infos[:num_bbox]
    sample = Sample()

    for idx, info in enumerate(infos):
        bbox = info["bounding_box"]
        if "top_left_x" in bbox:
            x = bbox["top_left_x"]
            y = bbox["top_left_y"]
        else:
            # older annotations use camelCase keys
            x = bbox["topLeftX"]
            y = bbox["topLeftY"]
        width = bbox["width"]
        height = bbox["height"]

        coord_tensor[idx][0] = x
        coord_tensor[idx][1] = y
        coord_tensor[idx][2] = x + width
        coord_tensor[idx][3] = y + height

    sample.coordinates = coord_tensor
    sample.ocr_mask = num_bbox

    image_path_org = './data/open_images/textvqa_gcy/'
    # image_path_org = './data/open_images/GT_OBJ_FRCN/'
    # image_path_org = './data/open_images/visual_genome/'
    oo_edge_path = image_path_org + 'edge_oo/'
    ot_edge_path = image_path_org + 'edge_ot/'
    tt_edge_path = image_path_org + 'edge_tt/'
    to_edge_path = image_path_org + 'edge_to/'
    set_name = search_file(image_path_org, img_id)
    knn_k = 5

    try:
        # load pre-computed edge matrices and edge features if they exist
        oo_node_matrix = torch.load(oo_edge_path + img_id + '_oo.pdh')
        sample.edge_oo = oo_node_matrix
        oo_feats = torch.load(oo_edge_path + img_id + '_oofeats.pdh')
        sample.edge_oofeats = oo_feats

        ot_node_matrix = torch.load(ot_edge_path + img_id + '_ot.pdh')
        sample.edge_ot = ot_node_matrix
        ot_feats = torch.load(ot_edge_path + img_id + '_otfeats.pdh')
        sample.edge_otfeats = ot_feats

        tt_node_matrix = torch.load(tt_edge_path + img_id + '_tt.pdh')
        sample.edge_tt = tt_node_matrix
        tt_feats = torch.load(tt_edge_path + img_id + '_ttfeats.pdh')
        sample.edge_ttfeats = tt_feats

        to_node_matrix = torch.load(to_edge_path + img_id + '_to.pdh')
        sample.edge_to = to_node_matrix
        to_feats = torch.load(to_edge_path + img_id + '_tofeats.pdh')
        sample.edge_tofeats = to_feats
    except FileNotFoundError:
        # TODO: generate obj-obj relation edge
        oo_node_matrix = finde_k_nearest_node(obj_bbox, knn_k)
        sample.edge_oo = oo_node_matrix
        oo_edge_file_name = oo_edge_path + img_id + "_oo.pdh"
        torch.save(oo_node_matrix, oo_edge_file_name)
        obj_obj_feat_variable = gen_oo_edge_feature(obj_bbox, oo_node_matrix, knn_k=knn_k)
        oo_edge_file_name = oo_edge_path + img_id + "_oofeats.pdh"
        torch.save(obj_obj_feat_variable, oo_edge_file_name)
        sample.edge_oofeats = obj_obj_feat_variable

        # TODO: generate object-text relation edge
        ot_node_matrix = dc_finde_k_nearest_node(obj_bbox, coord_tensor, knn_k)
        sample.edge_ot = ot_node_matrix
        ot_edge_file_name = ot_edge_path + img_id + "_ot.pdh"
        torch.save(ot_node_matrix, ot_edge_file_name)
        obj_text_feat_variable = gen_ot_edge_feature(obj_bbox, coord_tensor, ot_node_matrix, knn_k=knn_k)
        ot_edge_file_name = ot_edge_path + img_id + "_otfeats.pdh"
        torch.save(obj_text_feat_variable, ot_edge_file_name)
        sample.edge_otfeats = obj_text_feat_variable

        # TODO: generate text-text relation edge
        tt_node_matrix = finde_k_nearest_node(coord_tensor, knn_k)
        sample.edge_tt = tt_node_matrix
        tt_edge_file_name = tt_edge_path + img_id + "_tt.pdh"
        torch.save(tt_node_matrix, tt_edge_file_name)
        text_text_edge_feature = gen_tt_edge_feature(coord_tensor, tt_node_matrix, knn_k=knn_k)
        tt_edge_file_name = tt_edge_path + img_id + "_ttfeats.pdh"
        torch.save(text_text_edge_feature, tt_edge_file_name)
        sample.edge_ttfeats = text_text_edge_feature

        # TODO: generate text-obj relation edge
        to_node_matrix = dc_finde_k_nearest_node(coord_tensor, obj_bbox, knn_k)
        sample.edge_to = to_node_matrix
        to_edge_file_name = to_edge_path + img_id + "_to.pdh"
        torch.save(to_node_matrix, to_edge_file_name)
        text_obj_feat_variable = gen_to_edge_feature(coord_tensor, obj_bbox, to_node_matrix, knn_k=knn_k)
        to_edge_file_name = to_edge_path + img_id + "_tofeats.pdh"
        torch.save(text_obj_feat_variable, to_edge_file_name)
        sample.edge_tofeats = text_obj_feat_variable

    return sample
def load_item(self, idx):
    sample_info = self.imdb[idx]
    current_sample = Sample()
    current_sample.dataset_name = self.dataset

    if self.dataset == 'train_vqa':
        text_processor_argument = {"tokens": sample_info["question_tokens"]}
        processed_question = self.text_processor(text_processor_argument)
        current_sample.text_len = torch.tensor(
            len(sample_info["question_tokens"]), dtype=torch.int
        )
        current_sample.text = processed_question["text"]
        current_sample.question_text = sample_info["question_str"]
        current_sample.text_sq = current_sample.text
        current_sample.text_oq = current_sample.text
        current_sample.reasoning_question = sample_info["question_str"]
        current_sample.reasoning_answer = sample_info["answers"][0]
        current_sample.sub_question = sample_info["question_str"]
        current_sample.other_question = sample_info["question_str"]

    elif self.dataset == 'train_introspect' or self.dataset == 'test':
        text_processor_argument = {"text": sample_info["main_question_str"]}
        processed_question = self.text_processor(text_processor_argument)
        current_sample.text = processed_question["text"]
        if "sub_question_str" in sample_info:
            text_processor_argument_sq = {"text": sample_info["sub_question_str"]}
            processed_question_sq = self.text_processor(text_processor_argument_sq)
            current_sample.text_sq = processed_question_sq["text"]
        if "other_question_str" in sample_info:
            text_processor_argument_oq = {"text": sample_info["other_question_str"]}
            processed_question_oq = self.text_processor(text_processor_argument_oq)
            current_sample.text_oq = processed_question_oq["text"]
        current_sample.question_text = sample_info["main_question_str"]
        current_sample.reasoning_question = sample_info["main_question_str"]
        current_sample.reasoning_answer = sample_info["main_answer_str"][0]
        current_sample.sub_question = sample_info["sub_question_str"]
        current_sample.other_question = sample_info["other_question_str"]
        current_sample.text_len = torch.tensor(
            len(sample_info["main_question_tokens"]), dtype=torch.int
        )

    else:
        text_processor_argument = {"text": sample_info["question_str"]}
        processed_question = self.text_processor(text_processor_argument)
        current_sample.text = processed_question["text"]
        if "sub_question_str" in sample_info:
            text_processor_argument_sq = {"text": sample_info["sub_question_str"]}
            processed_question_sq = self.text_processor(text_processor_argument_sq)
            current_sample.text_sq = processed_question_sq["text"]
        if "other_question_str" in sample_info:
            text_processor_argument_oq = {"text": sample_info["other_question_str"]}
            processed_question_oq = self.text_processor(text_processor_argument_oq)
            current_sample.text_oq = processed_question_oq["text"]
        else:
            current_sample.text_oq = current_sample.text_sq
        current_sample.question_text = sample_info["question_str"]
        current_sample.reasoning_question = sample_info["question_str"]
        current_sample.reasoning_answer = sample_info["answers"][0]
        current_sample.sub_question = sample_info["sub_question_str"]
        current_sample.other_question = sample_info["sub_question_str"]
        current_sample.text_len = torch.tensor(
            len(sample_info["question_tokens"]), dtype=torch.int
        )

    current_sample.question_id = torch.tensor(
        sample_info["question_id"], dtype=torch.int
    )

    if isinstance(sample_info["image_id"], int):
        current_sample.image_id = torch.tensor(
            sample_info["image_id"], dtype=torch.int
        )
    else:
        current_sample.image_id = sample_info["image_id"]

    if self._use_features is True:
        features = self.features_db[idx]
        current_sample.update(features)

    # Add details for OCR like OCR bbox, vectors, tokens here
    current_sample = self.add_ocr_details(sample_info, current_sample)
    # Depending on whether we are using soft copy this can add
    # dynamic answer space
    current_sample = self.add_answer_info(sample_info, current_sample)

    return current_sample
def get_item(self, idx):
    data = self.vqamb_data[idx]

    current_sample = Sample()

    # store question and image id
    current_sample.img_id = data['id']
    # current_sample.qa_id = data['qa_id']

    # store points
    current_sample.point = data['point']  # data['points']
    bbox = data['bbox']
    current_sample.gt_bbox = torch.Tensor(
        [bbox['x'], bbox['y'], bbox['x'] + bbox['w'], bbox['y'] + bbox['h']]
    )

    # process question
    question = data["pt_question"]
    tokens = tokenize(question, remove=["?"], keep=["'s"])

    processed = self.text_processor({"tokens": tokens})
    current_sample.text = processed["text"]

    # process answers
    processed = self.answer_processor({"answers": [data['ans']]})
    current_sample.answers = processed["answers"]
    current_sample.targets = processed["answers_scores"][1:]  # remove unknown index

    # Detectron features ----------------
    # TODO: read in detectron image instead if detectron is to be built
    detectron_path = self.detectron_folder + str(data['id'])
    point = data['point']  # point = data['points'][0]
    if 'pt' in self.detectron_folder:
        detectron_path += ',' + str(point['x']) + ',' + str(point['y'])
    detectron_path += '.pt'

    detectron_feat = torch.load(detectron_path, map_location=torch.device('cpu'))

    # Pad features to fixed length
    if self.config.pad_detectron:
        if detectron_feat.shape[0] > 100:
            detectron_feat = detectron_feat[:100]
        elif detectron_feat.shape[0] < 100:
            pad = torch.zeros(100 - detectron_feat.shape[0], detectron_feat.shape[1])
            detectron_feat = torch.cat([detectron_feat, pad], dim=0)

    current_sample.image_feature_0 = detectron_feat
    # ---------------------------------------------

    # read in point bounding boxes (path hardcoded for now)
    bbox_path = ''
    bbox_path += str(data['id']) + ',' + str(point['x']) + ',' + str(point['y']) + '.pt'

    bboxes = torch.load(bbox_path, map_location=torch.device('cpu'))

    if bboxes.shape[0] > 100:
        bboxes = bboxes[:100]
    elif bboxes.shape[0] < 100:
        pad = torch.zeros(100 - bboxes.shape[0], bboxes.shape[1])
        bboxes = torch.cat([bboxes, pad], dim=0)

    current_sample.pt_bbox = bboxes

    # read in image bounding boxes (path hardcoded for now)
    bbox_path = ''
    bbox_path += str(data['id']) + '.pt'  # + ',' + str(point['x']) + ',' + str(point['y']) + '.pt'

    bboxes = torch.load(bbox_path, map_location=torch.device('cpu'))

    if bboxes.shape[0] > 100:
        bboxes = bboxes[:100]
    elif bboxes.shape[0] < 100:
        pad = torch.zeros(100 - bboxes.shape[0], bboxes.shape[1])
        bboxes = torch.cat([bboxes, pad], dim=0)

    current_sample.img_bbox = bboxes

    # Context features --------------------
    if self.config.use_context:
        context_path = self.context_folder + str(data['id'])
        context_path += ',' + str(point['x']) + ',' + str(point['y'])
        context_path += '.pt'

        context_feat = torch.load(context_path, map_location=torch.device('cpu'))
        context_feat = context_feat.squeeze()
        orig_dim = context_feat.shape[0]

        if self.config.pad_context:
            if context_feat.shape[0] > 100:
                context_feat = context_feat[:100]
            elif context_feat.shape[0] < 100:
                pad = torch.zeros(100 - context_feat.shape[0], context_feat.shape[1])
                context_feat = torch.cat([context_feat, pad], dim=0)

        current_sample.context_feature_0 = context_feat
    # ---------------------------------------------

    return current_sample
def evaluate_full(self, loader, use_tqdm=False):
    meter = Meter()

    # metrics = ['vqamb_map', 'vqamb_f1']  # hardcode metrics for now
    metrics = ['accuracy']
    # metrics = ['vqamb_f1pt']

    print(len(loader))

    with torch.no_grad():
        self.model.eval()
        tot_preds = []
        tot_targets = []
        tot_ids = []
        tot_att_pt = []
        tot_att_img = []
        tot_bbox_gt = []
        tot_bbox_pt = []
        tot_bbox_img = []
        tot_part = []
        # tot_qa_ids = []

        for batch in tqdm(loader, disable=not use_tqdm):
            report = self._forward_pass(batch)
            tot_preds.append(report.scores)
            tot_targets.append(report.targets)
            # optionally also collect attention, bbox and id fields, e.g.
            # tot_ids.extend(report.qa_id)
            # tot_att_pt.append(report.att)
            # tot_att_img.append(report.att_img)
            # tot_bbox_gt.append(report.gt_bbox)
            # tot_bbox_img.append(report.img_bbox)
            # tot_bbox_pt.append(report.pt_bbox)
            # tot_part.append(report.part)

        tot_preds = torch.cat(tot_preds, dim=0)
        tot_targets = torch.cat(tot_targets, dim=0)
        # tot_att_pt = torch.cat(tot_att_pt, dim=0)
        # tot_att_img = torch.cat(tot_att_img, dim=0)
        # tot_bbox_pt = torch.cat(tot_bbox_pt, dim=0)
        # tot_bbox_gt = torch.cat(tot_bbox_gt, dim=0)
        # tot_bbox_img = torch.cat(tot_bbox_img, dim=0)

        # Find bounding box with max attention
        # max_att_pt = tot_att_pt.argmax(dim=1)
        # max_bbox_pt = tot_bbox_pt[torch.arange(tot_bbox_pt.size(0)), max_att_pt]

        # (optional) dump intermediate tensors for offline analysis, e.g.
        # torch.save(tot_att_pt, 'tot_pt_att_objpartdev.pt')
        # torch.save(tot_bbox_pt, 'tot_ptbboxes_objpartdev.pt')
        # torch.save(tot_preds, 'tot_preds_localqafinal.pt')
        # torch.save(tot_targets, 'tot_targets_localqafinal.pt')

        model_output = {"scores": tot_preds}
        sample = Sample({"targets": tot_targets})
        # additional fields such as "qa_index", "dataset_type" and
        # "dataset_name" could also be attached here
        sample_list = SampleList([sample])
        sample_list.add_field('dataset_type', report.dataset_type)
        sample_list.add_field('dataset_name', report.dataset_name)

        metric_fn = Metrics(metrics)
        full_met = metric_fn(sample_list, model_output)
        self.writer.write(full_met)

        if report.dataset_type == 'test':
            return

        meter.update(full_met)
        stop = self.early_stopping(self.current_iteration, meter)

        should_break = False
        if stop is True:
            self.writer.write("Early stopping activated")
            should_break = True

        self.model.train()

        return should_break
def getAnswers(self, image, question, meta=None):
    first = time.time()
    meta = meta or str(image)
    image = Image.open(image).convert('RGB') if isinstance(image, str) else \
        image.convert('RGB')

    print(f'Tiki : Getting Answers : {meta}, {question}')

    with torch.no_grad():
        detectron_features = self.get_detectron_features(image)
        resnet152_features = self.get_resnet152_features(image)

        start = time.time()

        sample = Sample()

        processed_text = self.text_processor({'text': question})
        sample.text = processed_text['text']
        sample.text_len = len(processed_text['tokens'])

        sample.image_feature_0 = detectron_features
        sample.image_info_0 = Sample(
            {'max_features': torch.tensor(100, dtype=torch.long)})

        sample.image_feature_1 = resnet152_features

        sample_list = SampleList([sample])
        sample_list = sample_list.to(self.device.type)

        scores = self.pythiaVQA_model(sample_list)['scores']
        scores = torch.nn.functional.softmax(scores, dim=1)
        actual, indices = scores.topk(5, dim=1)

        top_indices = indices[0]
        top_scores = actual[0]

        answers = []
        for rank, score in enumerate(top_scores):
            answers.append({
                'rank': rank,
                'answer': self.answer_processor.idx2word(top_indices[rank].item()),
                'probability': score.item()
            })

        answer = answers[0]['answer']

        end = time.time()
        print(
            f'Tiki : Getting Answers : PythiaVQA - Finished in {end - start:7.3f} Seconds'
        )
        processing['PythiaVQA'] = end - start

    gc.collect()
    torch.cuda.empty_cache()

    last = time.time()
    processing['InferTime'] = last - first

    return question, answer, answers