def forward(self, images, image_scales, transitions=None):
    """Caption a single image and return the processed caption.

    Args:
        images: Image input, forwarded unchanged to ``self.encoder``.
        image_scales: Scale input, forwarded unchanged to ``self.encoder``.
        transitions: Optional value attached to the sample list as its
            ``transitions`` attribute before decoding. Skipped when ``None``.

    Returns:
        The ``'caption'`` entry produced by ``self.decoder.caption_processor``
        for the first decoded token sequence.

    Raises:
        AssertionError: If the encoder does not return exactly one feature.
    """
    feature_list = self.encoder(images, image_scales)
    # Validate before indexing so that an empty encoder result reports the
    # batch-size problem instead of an unrelated IndexError.
    assert len(feature_list) == 1, 'current model only support batch size 1'
    image_features = feature_list[0]

    sample = Sample()
    sample.dataset_name = "coco"
    sample.dataset_type = "test"
    sample.image_feature_0 = image_features
    # The answers tensor seems to act only as a placeholder here, so its
    # exact size should not matter.
    sample.answers = torch.zeros((1, 10), dtype=torch.long)

    # NOTE: ``device`` is not defined in this method; it is resolved from
    # the enclosing (module) scope.
    sample_list = SampleList([sample]).to(device)
    if transitions is not None:
        sample_list.transitions = transitions

    output = self.decoder(sample_list)
    first_caption_tokens = output['captions'].tolist()[0]
    return self.decoder.caption_processor(first_caption_tokens)['caption']
def predict(self, url, feat_name, get_features=False):
    """Run the caption model on the image at ``url``.

    Args:
        url: Image location, passed to ``get_detectron_features`` inside a
            one-element list.
        feat_name: Forwarded to ``get_detectron_features``.
        get_features: When true, the extracted image features are returned
            together with the caption tokens.

    Returns:
        The ``"captions"`` output of ``self.caption_model``; or the tuple
        ``(captions, features)`` when ``get_features`` is true.
    """
    with torch.no_grad():
        # The helper's result is indexed at [0]; per the original note it
        # returns a single-element list.
        extracted = get_detectron_features(
            [url], self.detection_model, False, feat_name, self.cuda_device)
        image_features = extracted[0]

        entry = Sample()
        entry.dataset_name = "coco"
        entry.dataset_type = "test"
        entry.image_feature_0 = image_features
        entry.answers = torch.zeros((5, 10), dtype=torch.long)

        batch = SampleList([entry]).to(self.cuda_device)
        captions = self.caption_model(batch)["captions"]

    # Release Python garbage and cached CUDA blocks after each prediction.
    gc.collect()
    torch.cuda.empty_cache()

    if get_features:
        return captions, image_features
    return captions
def prepare_batch(self, batch):
    """Prepare a batch for the model's forward function.

    Child classes may override this. Whatever is returned here is passed
    directly to the model's forward function.

    Args:
        batch: The currently loaded batch. Anything that is not already a
            ``SampleList`` is wrapped in one.

    Returns:
        The batch as a ``SampleList`` after calling ``.to(self._device)``.
    """
    # Accept an existing SampleList as-is; otherwise try converting.
    sample_list = batch if isinstance(batch, SampleList) else SampleList(batch)
    return sample_list.to(self._device)
def predict(self, img_paths, qud):
    """Answer the same question for a batch of images.

    Args:
        img_paths: Image paths, forwarded to both feature extractors.
        qud: Question text, run through ``self.vqa_demo.text_processor`` once
            per image.

    Returns:
        A pair ``(batch_probs, batch_answers)``: two lists with one entry per
        row of the model's scores, each entry holding the top-5 softmax
        probabilities and the matching answer words respectively.
    """
    with torch.no_grad():
        # Original notes: detectron output is a list of image features,
        # resnet output is [batch_size, 196, 2048].
        detectron_features = self.get_detectron_features(img_paths)
        resnet_features = self.get_resnet_features(img_paths)

        samples = []
        for position, region_feature in enumerate(detectron_features):
            entry = Sample()
            question = self.vqa_demo.text_processor({"text": qud})
            entry.text = question["text"]
            entry.text_len = len(question["tokens"])
            entry.image_feature_0 = region_feature
            entry.image_info_0 = Sample(
                {"max_features": torch.tensor(100, dtype=torch.long)})
            entry.image_feature_1 = resnet_features[position]
            samples.append(entry)

        batch = SampleList(samples).to("cuda")
        raw_scores = self.vqa_demo.pythia_model(batch)["scores"]
        probabilities = torch.nn.functional.softmax(raw_scores, dim=1)
        top_probs, top_ids = probabilities.topk(5, dim=1)

    # One inner list per batch row, e.g. [[ans_1, ans_2], [ans_1, ans_2]].
    to_word = self.vqa_demo.answer_processor.idx2word
    batch_probs = top_probs.tolist()
    batch_answers = [
        [to_word(answer_id) for answer_id in row] for row in top_ids.tolist()
    ]
    # If memory becomes an issue, gc.collect() and torch.cuda.empty_cache()
    # could be called here.
    return batch_probs, batch_answers
def __call__(self, batch):
    """Collate ``batch`` into a ``SampleList`` and collapse identical keys.

    For every key in ``self._IDENTICAL_VALUE_KEYS`` the collated value is
    kept under ``key + "_"`` and the key itself is replaced by the first
    element of that value.

    Args:
        batch: Input accepted by the ``SampleList`` constructor.

    Returns:
        The resulting ``SampleList``.
    """
    collated = SampleList(batch)
    for field in self._IDENTICAL_VALUE_KEYS:
        per_sample_values = collated[field]
        # Keep the full collated value under the underscore-suffixed name.
        collated[field + "_"] = per_sample_values
        collated[field] = per_sample_values[0]
    return collated
def _load_objects(self, idx):
    """Build samples for every object of the image at ``idx``.

    Text fields are run through the synset, name and attribute processors;
    ``h``/``w`` are replaced by ``height``/``width`` divided by the image
    size, and ``x``/``y`` are divided by the image size as well.

    Args:
        idx: Index passed to ``self._get_image_info``.

    Returns:
        A pair ``(objects, object_map)``: a ``SampleList`` of the object
        samples and a dict mapping each ``object_id`` to its sample.
    """
    image_info = self._get_image_info(idx)
    img_h, img_w = image_info["height"], image_info["width"]

    objects_by_id = {}
    processed = []
    # NOTE: each record is modified in place before being wrapped.
    for record in image_info["objects"]:
        record["synsets"] = self.synset_processor(
            {"tokens": record["synsets"]})["text"]
        record["names"] = self.name_processor(
            {"tokens": record["names"]})["text"]

        # Swap raw pixel extents for image-relative ones.
        record["height"] = record["h"] / img_h
        del record["h"]
        record["width"] = record["w"] / img_w
        del record["w"]
        record["y"] /= img_h
        record["x"] /= img_w

        record["attributes"] = self.attribute_processor(
            {"tokens": record["attributes"]})["text"]

        entry = Sample(record)
        objects_by_id[entry["object_id"]] = entry
        processed.append(entry)

    return SampleList(processed), objects_by_id
def test_nucleus_sampling(self):
    """Check the decoder's caption tokens against a fixed expected sequence."""
    # Expected tokens for sum_threshold = 0.5.
    expected_tokens = [
        1.0, 2914.0, 5921.0, 2204.0, 5055.0, 9224.0,
        4512.0, 182.0, 3649.0, 6409.0, 2.0,
    ]

    vocab = text_utils.VocabFromText(self.VOCAB_EXAMPLE_SENTENCES)
    model = TestDecoderModel(self.config.model_attributes.butd, vocab)
    model.build()
    model.to("cuda")
    model.eval()

    entry = Sample()
    entry.dataset_name = "coco"
    entry.dataset_type = "test"
    entry.image_feature_0 = torch.randn(100, 2048)
    entry.answers = torch.zeros((5, 10), dtype=torch.long)

    captions = model(SampleList([entry]))["captions"]

    self.assertEqual(captions[0].tolist(), expected_tokens)
def test_forward(self):
    """Run CNNLSTM on one random sample and compare against fixed values.

    Checks the reported loss, accuracy, score shape and the 32 scores.
    """
    expected_scores = [
        -0.7598285675048828, -0.07029829174280167,
        -0.20382611453533173, -0.06990239024162292,
        0.7965695858001709, 0.4730074405670166,
        -0.30569902062416077, 0.4244227707386017,
        0.6511023044586182, 0.2480515092611313,
        -0.5087617635726929, -0.7675772905349731,
        0.4361543357372284, 0.0018743239343166351,
        0.6774630546569824, 0.30618518590927124,
        -0.398895800113678, -0.13120117783546448,
        -0.4433199465274811, -0.25969570875167847,
        0.6798790097236633, -0.34090861678123474,
        0.0384102463722229, 0.2484571784734726,
        0.0456063412129879, -0.428459107875824,
        -0.026385333389043808, -0.1570669412612915,
        -0.2377825379371643, 0.3231588304042816,
        0.21098048985004425, -0.712349534034729,
    ]

    model = CNNLSTM(self.config.model_attributes.cnn_lstm)
    model.build()
    model.init_losses_and_metrics()
    self.assertIsInstance(model, torch.nn.Module)

    # The random draws stay in this order: text, image, targets.
    entry = Sample()
    entry.text = torch.randint(1, 79, (10, ), dtype=torch.long)
    entry.image = torch.randn(3, 320, 480)
    entry.targets = torch.randn(32)

    batch = SampleList([entry])
    batch.dataset_type = "train"
    batch.dataset_name = "clevr"

    output = model(batch)
    scores = output["scores"]
    loss = output["losses"]["train/clevr/logit_bce"]
    accuracy = output["metrics"]["train/clevr/accuracy"]

    np.testing.assert_almost_equal(loss.item(), 19.2635, decimal=4)
    np.testing.assert_almost_equal(accuracy.item(), 0)
    self.assertEqual(scores.size(), torch.Size((1, 32)))
    np.testing.assert_almost_equal(
        scores[0].tolist(), expected_scores, decimal=5)
def predict(self, url):
    """Run the caption model on the image at ``url``.

    Args:
        url: Image location, forwarded to ``self.get_detectron_features``.

    Returns:
        The ``"captions"`` output of ``self.pythia_model``.
    """
    with torch.no_grad():
        entry = Sample()
        entry.dataset_name = "coco"
        entry.dataset_type = "test"
        entry.image_feature_0 = self.get_detectron_features(url)
        entry.answers = torch.zeros((5, 10), dtype=torch.long)

        batch = SampleList([entry]).to("cuda")
        captions = self.pythia_model(batch)["captions"]

    # Release Python garbage and cached CUDA blocks after each prediction.
    gc.collect()
    torch.cuda.empty_cache()
    return captions
def prepare_batch(self, batch):
    """Prepare a batch for the model's forward function.

    Child classes may override this. Whatever is returned here is passed
    directly to the model's forward function. Currently this only moves the
    batch to ``self._device``.

    Args:
        batch (SampleList): The currently loaded batch. Anything that is not
            already a ``SampleList`` is wrapped in one.

    Returns:
        sample_list (SampleList): The batch after ``.to(self._device)``.
    """
    if isinstance(batch, SampleList):
        sample_list = batch
    else:
        # Try converting to SampleList.
        sample_list = SampleList(batch)
    return sample_list.to(self._device)
def predict(self, url, question):
    """Answer ``question`` about the image at ``url``.

    Args:
        url: Image location, forwarded to both feature extractors.
        question: Question text, run through ``self.text_processor``.

    Returns:
        A pair ``(probs, answers)``: the top-5 softmax probabilities of the
        first batch row and the answer words for the matching indices.
    """
    with torch.no_grad():
        region_features = self.get_detectron_features(url)
        grid_features = self.get_resnet_features(url)

        entry = Sample()
        encoded = self.text_processor({"text": question})
        entry.text = encoded["text"]
        entry.text_len = len(encoded["tokens"])
        entry.image_feature_0 = region_features
        entry.image_info_0 = Sample({
            "max_features": torch.tensor(100, dtype=torch.long)
        })
        entry.image_feature_1 = grid_features

        batch = SampleList([entry]).to("cuda")
        raw_scores = self.pythia_model(batch)["scores"]
        probabilities = torch.nn.functional.softmax(raw_scores, dim=1)
        top_probs, top_ids = probabilities.topk(5, dim=1)

        # Only the first (and only) batch row is reported.
        probs = top_probs[0].tolist()
        answers = [
            self.answer_processor.idx2word(answer_id)
            for answer_id in top_ids[0].tolist()
        ]

    gc.collect()
    torch.cuda.empty_cache()
    return probs, answers
def test_forward(self):
    """Run CNNLSTM on one random sample and compare against fixed values.

    Checks the reported loss, accuracy, score shape and the 32 scores.
    """
    expected_scores = [
        2.2298e-02, -2.4975e-01, -1.1960e-01, -5.0868e-01,
        -9.3013e-02, 1.3202e-02, -1.7536e-01, -3.1180e-01,
        1.5369e-01, 1.4900e-01, 1.9006e-01, -1.9457e-01,
        1.4924e-02, -1.1032e-01, 1.3777e-01, -3.6255e-01,
        -2.9327e-01, 5.6247e-04, -4.8732e-01, 4.0949e-01,
        -1.1069e-01, 2.9696e-01, 4.1903e-02, 6.7062e-02,
        7.0094e-01, -1.9898e-01, -2.9502e-03, -3.9040e-01,
        1.2218e-01, 3.7895e-02, 2.4472e-02, 1.7213e-01,
    ]

    model = CNNLSTM(self.config.model_attributes.cnn_lstm)
    model.build()
    model.init_losses_and_metrics()
    self.assertIsInstance(model, torch.nn.Module)

    # The random draws stay in this order: text, image, targets.
    entry = Sample()
    entry.text = torch.randint(1, 79, (10, ), dtype=torch.long)
    entry.image = torch.randn(3, 320, 480)
    entry.targets = torch.randn(32)

    batch = SampleList([entry])
    batch.dataset_type = "train"
    batch.dataset_name = "clevr"

    output = model(batch)
    scores = output["scores"]
    loss = output["losses"]["train/logit_bce"]
    accuracy = output["metrics"]["train/accuracy"]

    np.testing.assert_almost_equal(loss.item(), 23.4751, decimal=4)
    np.testing.assert_almost_equal(accuracy.item(), 0)
    self.assertEqual(scores.size(), torch.Size((1, 32)))
    np.testing.assert_almost_equal(
        scores[0].tolist(), expected_scores, decimal=5)
def _load_regions(self, idx, object_map, relationship_map):
    """Build samples for every region of the image at ``idx``.

    Synset names and the region phrase are run through their processors,
    the box fields are divided by the image size, and relationship/object
    ids are replaced by the entries looked up in the given maps.

    Args:
        idx: Index passed to ``self._get_image_info``.
        object_map: Mapping from object id to the object entry.
        relationship_map: Mapping from relationship id to its entry.

    Returns:
        ``(None, None)`` when ``self._return_scene_graph`` is ``None``;
        otherwise ``(regions, region_map)``: a ``SampleList`` of region
        samples and a dict mapping each ``region_id`` to its sample.
    """
    if self._return_scene_graph is None:
        return None, None

    image_info = self._get_image_info(idx)
    img_h, img_w = image_info["height"], image_info["width"]

    regions_by_id = {}
    processed = []
    # NOTE: each record is modified in place before being wrapped.
    for record in image_info["regions"]:
        for synset in record["synsets"]:
            synset["entity_name"] = self.name_processor(
                {"tokens": [synset["entity_name"]]})["text"]
            synset["synset_name"] = self.synset_processor(
                {"tokens": [synset["synset_name"]]})["text"]

        record["height"] /= img_h
        record["width"] /= img_w
        record["y"] /= img_h
        record["x"] /= img_w

        # Resolve ids to the already-built entries.
        linked_relationships = [
            relationship_map[rel_id] for rel_id in record["relationships"]
        ]
        linked_objects = [object_map[obj_id] for obj_id in record["objects"]]
        record["relationships"] = linked_relationships
        record["objects"] = linked_objects

        record["phrase"] = self.text_processor(
            {"text": record["phrase"]})["text"]

        entry = Sample(record)
        regions_by_id[entry["region_id"]] = entry
        processed.append(entry)

    return SampleList(processed), regions_by_id
def _load_relationships(self, idx, object_map):
    """Build samples for every relationship of the image at ``idx``.

    Synsets and predicate are run through their processors, and the
    ``object``/``subject`` fields are filled from ``object_map`` using the
    record's ``object_id``/``subject_id``.

    Args:
        idx: Index passed to ``self._get_image_info``.
        object_map: Mapping from object id to the object entry.

    Returns:
        ``(None, None)`` when both ``self._return_relationships`` and
        ``self._return_scene_graph`` are ``None``; otherwise
        ``(relationships, relationship_map)``: a ``SampleList`` of the
        samples and a dict mapping each ``relationship_id`` to its sample.
    """
    nothing_requested = (
        self._return_relationships is None
        and self._return_scene_graph is None
    )
    if nothing_requested:
        return None, None

    image_info = self._get_image_info(idx)

    relationships_by_id = {}
    processed = []
    # NOTE: each record is modified in place before being wrapped.
    for record in image_info["relationships"]:
        record["synsets"] = self.synset_processor(
            {"tokens": record["synsets"]})["text"]
        record["predicate"] = self.predicate_processor(
            {"tokens": record["predicate"]})["text"]
        record["object"] = object_map[record["object_id"]]
        record["subject"] = object_map[record["subject_id"]]

        entry = Sample(record)
        relationships_by_id[entry["relationship_id"]] = entry
        processed.append(entry)

    return SampleList(processed), relationships_by_id
def getAnswers(self, image, question, meta=None):
    """Answer ``question`` about ``image`` and record timings.

    Timings are stored in ``processing`` under ``'PythiaVQA'`` and
    ``'InferTime'``; ``processing`` is not defined in this method and is
    resolved from the enclosing scope.

    Args:
        image: Either a string, which is opened with ``Image.open``, or an
            object providing ``convert('RGB')``.
        question: Question text, run through ``self.text_processor``.
        meta: Label used in the log line; falls back to ``str(image)`` when
            falsy.

    Returns:
        A tuple ``(question, answer, answers)`` where ``answers`` is a list
        of five dicts with ``'rank'``, ``'answer'`` and ``'probability'``
        keys, and ``answer`` is the ``'answer'`` of the first one.
    """
    first = time.time()
    meta = meta or str(image)
    if isinstance(image, str):
        image = Image.open(image).convert('RGB')
    else:
        image = image.convert('RGB')
    print(f'Tiki : Getting Answers : {meta}, {question}')

    with torch.no_grad():
        region_features = self.get_detectron_features(image)
        grid_features = self.get_resnet152_features(image)

        # The 'PythiaVQA' timer starts after feature extraction.
        start = time.time()
        entry = Sample()
        encoded = self.text_processor({'text': question})
        entry.text = encoded['text']
        entry.text_len = len(encoded['tokens'])
        entry.image_feature_0 = region_features
        entry.image_info_0 = Sample(
            {'max_features': torch.tensor(100, dtype=torch.long)})
        entry.image_feature_1 = grid_features

        batch = SampleList([entry]).to(self.device.type)
        raw_scores = self.pythiaVQA_model(batch)['scores']
        probabilities = torch.nn.functional.softmax(raw_scores, dim=1)
        top_probs, top_ids = probabilities.topk(5, dim=1)

        # Only the first (and only) batch row is reported.
        ranked = zip(top_ids[0].tolist(), top_probs[0].tolist())
        answers = [
            {
                'rank': rank,
                'answer': self.answer_processor.idx2word(answer_id),
                'probability': probability,
            }
            for rank, (answer_id, probability) in enumerate(ranked)
        ]
        answer = answers[0]['answer']

        end = time.time()
        print(
            f'Tiki : Getting Answers : PythiaVQA - Finished in {end-start:7.3f} Seconds'
        )
        processing['PythiaVQA'] = end - start

    gc.collect()
    torch.cuda.empty_cache()

    last = time.time()
    processing['InferTime'] = last - first
    return question, answer, answers
def evaluate_full(self, loader, use_tqdm=False):
    """Evaluate the model on all of ``loader`` and consult early stopping.

    Scores and targets from every batch are concatenated along dim 0, the
    hard-coded metric list is computed once over the whole set, and the
    result is written through ``self.writer``.

    Args:
        loader: Iterable of batches accepted by ``self._forward_pass``; must
            support ``len()``.
        use_tqdm: Show a progress bar while iterating when true.

    Returns:
        ``None`` when the last report's ``dataset_type`` is ``'test'`` (the
        model is then left in eval mode and early stopping is not
        consulted). Otherwise ``True`` if ``self.early_stopping`` returned
        ``True`` and ``False`` if not, after switching the model back to
        train mode.

    Raises:
        RuntimeError: If ``loader`` yields no batches.
    """
    meter = Meter()
    metrics = ['accuracy']  # metric list is hard-coded for now

    print(len(loader))
    with torch.no_grad():
        self.model.eval()

        batch_scores = []
        batch_targets = []
        report = None
        for batch in tqdm(loader, disable=not use_tqdm):
            report = self._forward_pass(batch)
            batch_scores.append(report.scores)
            batch_targets.append(report.targets)

        # The dataset type/name below come from the last report, so an empty
        # loader cannot be evaluated; fail with an explicit message rather
        # than an opaque error from torch.cat.
        if report is None:
            raise RuntimeError(
                "evaluate_full received an empty loader; nothing to evaluate")

        tot_preds = torch.cat(batch_scores, dim=0)
        tot_targets = torch.cat(batch_targets, dim=0)

        # NOTE(review): the original carried many disabled torch.save(...)
        # debugging dumps (including one for tot_preds); they are treated as
        # commented out and have been removed. Confirm none was meant to run.

        model_output = {"scores": tot_preds}
        sample_list = SampleList([Sample({"targets": tot_targets})])
        sample_list.add_field('dataset_type', report.dataset_type)
        sample_list.add_field('dataset_name', report.dataset_name)

        full_met = Metrics(metrics)(sample_list, model_output)
        self.writer.write(full_met)

        if report.dataset_type == 'test':
            return None

        meter.update(full_met)
        stop = self.early_stopping(self.current_iteration, meter)
        should_break = stop is True
        if should_break:
            self.writer.write("Early stopping activated")

        self.model.train()

    return should_break