def test_forward(self):
    model_config = self.config.model_config.cnn_lstm
    cnn_lstm = CNNLSTM(model_config)
    cnn_lstm.build()
    cnn_lstm.init_losses()

    self.assertTrue(isinstance(cnn_lstm, torch.nn.Module))

    test_sample = Sample()
    test_sample.text = torch.randint(1, 79, (10,), dtype=torch.long)
    test_sample.image = torch.randn(3, 320, 480)
    test_sample.targets = torch.randn(32)

    test_sample_list = SampleList([test_sample])
    test_sample_list.dataset_type = "train"
    test_sample_list.dataset_name = "clevr"
    test_sample_list = test_sample_list.to(get_current_device())
    cnn_lstm = cnn_lstm.to(get_current_device())

    output = cnn_lstm(test_sample_list)

    scores = output["scores"]
    loss = output["losses"]["train/clevr/logit_bce"]

    np.testing.assert_almost_equal(loss.item(), 19.2635, decimal=4)
    self.assertEqual(scores.size(), torch.Size((1, 32)))
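# A minimal sketch of the Sample/SampleList collation pattern used throughout
# these snippets, under the assumption (consistent with the test above, where
# a single unbatched image yields scores of size (1, 32)) that SampleList
# stacks each tensor field of its Samples along a new leading batch dimension.
import torch
from mmf.common.sample import Sample, SampleList

sample = Sample()
sample.image = torch.randn(3, 320, 480)  # one example, no batch dimension

batch = SampleList([sample, sample])     # collate two examples into a batch
assert batch.image.shape == (2, 3, 320, 480)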
def test_pretrained_model(self):
    sample_list = SampleList()

    sample_list.add_field(
        "input_ids",
        torch.randint(low=0, high=BERT_VOCAB_SIZE, size=(1, 128)).long(),
    )
    sample_list.add_field("input_mask", torch.ones((1, 128)).long())
    sample_list.add_field("segment_ids", torch.zeros(1, 128).long())
    sample_list.add_field("image_feature_0", torch.rand((1, 100, 2048)).float())
    sample_list.add_field(
        "lm_label_ids", torch.zeros((1, 128), dtype=torch.long).fill_(-1)
    )

    self.pretrain_model.eval()
    self.pretrain_model = self.pretrain_model.to(get_current_device())
    sample_list = sample_list.to(get_current_device())

    sample_list.dataset_name = "random"
    sample_list.dataset_type = "test"
    with torch.no_grad():
        model_output = self.pretrain_model(sample_list)

    self.assertTrue("losses" in model_output)
    self.assertTrue("random/test/masked_lm_loss" in model_output["losses"])
    self.assertTrue(model_output["losses"]["random/test/masked_lm_loss"] == 0)
def __call__(self, sample_list, *args, **kwargs):
    if not self._is_pl_enabled:
        # Move to proper device i.e. same as the model before passing
        sample_list = to_device(sample_list, get_current_device())

    model_output = super().__call__(sample_list, *args, **kwargs)

    # Don't do anything fancy to output if it is pretrained
    if self.is_pretrained:
        return model_output

    # Make sure that the output from the model is a Mapping
    assert isinstance(
        model_output, collections.abc.Mapping
    ), "A dict must be returned from the forward of the model."

    if "losses" in model_output:
        if not self._logged_warning["losses_present"]:
            warnings.warn(
                "'losses' already present in model output. "
                "No calculation will be done in base model."
            )
            self._logged_warning["losses_present"] = True

        assert isinstance(
            model_output["losses"], collections.abc.Mapping
        ), "'losses' must be a dict."
    elif hasattr(self, "losses"):
        model_output["losses"] = self.losses(sample_list, model_output)
    else:
        model_output["losses"] = {}

    return model_output
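# A hedged sketch of the contract the __call__ above enforces, not a model
# from the repo: a subclass's forward returns a Mapping (typically with
# "scores") and leaves "losses" unset, so the base class fills it in from the
# configured loss modules. "MinimalModel" and the "features" field are
# hypothetical names used only for illustration.
import torch
from mmf.models.base_model import BaseModel

class MinimalModel(BaseModel):
    def __init__(self, config):
        super().__init__(config)

    def build(self):
        # Construct layers here; as in the CNNLSTM test above, MMF calls
        # build() after __init__.
        self.linear = torch.nn.Linear(10, 2)

    def forward(self, sample_list):
        # Return a Mapping without a "losses" key; the __call__ above will
        # attach model_output["losses"] = self.losses(sample_list, model_output).
        return {"scores": self.linear(sample_list.features)}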
def test_pretrained_model(self):
    img_dim = 1024
    model = UNITERModelBase(img_dim=img_dim)

    device = get_current_device()
    model.eval()
    model = model.to(device)

    bs = 8
    num_feats = 100
    max_sentence_len = 25
    pos_dim = 7

    input_ids = torch.ones((bs, max_sentence_len), dtype=torch.long).to(device)
    img_feat = torch.rand((bs, num_feats, img_dim)).to(device)
    img_pos_feat = torch.rand((bs, num_feats, pos_dim)).to(device)
    position_ids = torch.arange(
        0, input_ids.size(1), dtype=torch.long, device=device
    ).unsqueeze(0)
    attention_mask = torch.ones((bs, max_sentence_len + num_feats)).to(device)

    with torch.no_grad():
        model_output = model(
            input_ids, position_ids, img_feat, img_pos_feat, attention_mask
        ).final_layer

    # Output sequence length is text + image: 25 + 100 = 125
    self.assertEqual(model_output.shape, torch.Size([8, 125, 768]))
def _get_sample_list(self):
    bs = 8
    num_feats = 100
    max_sentence_len = 25
    img_dim = 2048
    cls_dim = 3129

    input_ids = torch.ones((bs, max_sentence_len), dtype=torch.long)
    input_mask = torch.ones((bs, max_sentence_len), dtype=torch.long)
    image_feat = torch.rand((bs, num_feats, img_dim))
    position_ids = (
        torch.arange(
            0, max_sentence_len, dtype=torch.long, device=image_feat.device
        )
        .unsqueeze(0)
        .expand(bs, -1)
    )
    img_pos_feat = torch.rand((bs, num_feats, 7))
    attention_mask = torch.zeros(
        (bs, max_sentence_len + num_feats), dtype=torch.long
    )
    image_mask = torch.zeros((bs, num_feats), dtype=torch.long)
    targets = torch.rand((bs, cls_dim))

    sample_list = SampleList()
    sample_list.add_field("input_ids", input_ids)
    sample_list.add_field("input_mask", input_mask)
    sample_list.add_field("image_feat", image_feat)
    sample_list.add_field("img_pos_feat", img_pos_feat)
    sample_list.add_field("attention_mask", attention_mask)
    sample_list.add_field("image_mask", image_mask)
    sample_list.add_field("targets", targets)
    sample_list.add_field("dataset_name", "test")
    sample_list.add_field("dataset_type", "test")
    sample_list.add_field("position_ids", position_ids)
    # SampleList.to returns the moved list rather than moving in place,
    # so keep the assignment (as the sibling helper below does)
    sample_list = sample_list.to(get_current_device())
    return sample_list
def _get_sample_list(self):
    bs = 8
    num_feats = 100
    max_sentence_len = 25
    img_dim = 2048
    vqa_cls_dim = 3129

    input_ids = torch.ones((bs, max_sentence_len), dtype=torch.long)
    input_mask = torch.ones((bs, max_sentence_len), dtype=torch.long)
    img_feat = torch.rand((bs, num_feats, img_dim))
    max_features = torch.ones((bs, num_feats)) * num_feats
    bbox = torch.randint(50, 200, (bs, num_feats, 4)).float()
    image_height = torch.randint(100, 300, (bs,))
    image_width = torch.randint(100, 300, (bs,))
    image_info = {
        "max_features": max_features,
        "bbox": bbox,
        "image_height": image_height,
        "image_width": image_width,
    }
    targets = torch.rand((bs, vqa_cls_dim))
    is_correct = torch.ones((bs,), dtype=torch.long)

    sample_list = SampleList()
    sample_list.add_field("input_ids", input_ids)
    sample_list.add_field("image_feature_0", img_feat)
    sample_list.add_field("input_mask", input_mask)
    sample_list.add_field("image_info_0", image_info)
    sample_list.add_field("targets", targets)
    sample_list.add_field("is_correct", is_correct)
    sample_list = sample_list.to(get_current_device())
    return sample_list
def __init__(self, dataset_name, config, dataset_type, num_examples, *args, **kwargs):
    self.num_examples = num_examples
    self.features = [float(x) for x in range(self.num_examples)]
    self.annotations = [float(x) for x in range(self.num_examples)]
    self._device = get_current_device()
    self._dataset_name = dataset_name
def test_vinvl_for_classification(self):
    model_for_classification = build_model(self.classification_config)
    model_for_classification.eval()
    model_for_classification = model_for_classification.to(get_current_device())
    with torch.no_grad():
        model_output = model_for_classification(self.sample_list)

    self.assertTrue("losses" in model_output)
    self.assertTrue("ce" in model_output["losses"])
def test_current_device(self):
    config = {"training": {"seed": 1}, "distributed": {"init_method": None}}
    deviceMock = DeviceMock(OmegaConf.create(config))
    deviceMock.configure_seed()
    deviceMock.configure_device()
    device = get_current_device()
    if torch.cuda.is_available():
        self.assertEqual(device, "cuda:0")
    else:
        self.assertEqual(device, torch.device(type="cpu"))
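# The asymmetric assertions above suggest that get_current_device returns a
# device string such as "cuda:0" when CUDA is configured, and a torch.device
# on CPU. A minimal sketch under that assumption (the real implementation in
# mmf.utils.general may differ):
import torch

def get_current_device_sketch():
    if torch.cuda.is_available() and torch.cuda.is_initialized():
        return f"cuda:{torch.cuda.current_device()}"
    return torch.device("cpu")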
def __init__(self, dataset_name, config, dataset_type="train", *args, **kwargs):
    super().__init__()

    if config is None:
        config = {}
    self.config = config
    self._dataset_name = dataset_name
    self._dataset_type = dataset_type
    self._global_config = registry.get("config")
    self._device = get_current_device()
    self.use_cuda = "cuda" in str(self._device)
def calculate(
    self, sample_list, model_output, execute_on_master_only=True, *args, **kwargs
):
    """Calculate detection mean AP (mAP) from the prediction list and the dataset
    annotations. The function returns COCO-style mAP@IoU=0.50:0.95.

    Args:
        sample_list (SampleList): SampleList provided by DataLoader for
                            current iteration.
        model_output (Dict): Dict returned by model. This should contain
                            "prediction_report" field, which is a list of
                            detection predictions from the model.
        execute_on_master_only (bool): Whether to only run mAP evaluation on the
                            master node over the gathered detection prediction
                            (to avoid wasting computation and CPU OOM).
                            Default: True (only run mAP evaluation on master).

    Returns:
        torch.FloatTensor: COCO-style mAP@IoU=0.50:0.95.
    """
    # As the detection mAP metric is run on the entire dataset-level predictions,
    # which are *already* gathered from all nodes, the evaluation should only
    # happen on one node and be broadcast to the others (to avoid CPU OOM due
    # to concurrent mAP evaluation)
    from mmf.utils.distributed import broadcast_tensor, is_master
    from mmf.utils.general import get_current_device
    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    device = get_current_device()
    if execute_on_master_only and not is_master():
        # Dummy mAP to be overridden by the broadcast from the master node
        mAP = torch.tensor(-1, dtype=torch.float, device=device)
    else:
        predictions = model_output.prediction_report

        cocoGt = COCO(
            self.dataset_json_files[sample_list.dataset_name][
                sample_list.dataset_type
            ]
        )
        cocoDt = cocoGt.loadRes(predictions)
        cocoEval = COCOeval(cocoGt, cocoDt, "bbox")
        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()
        mAP = torch.tensor(cocoEval.stats[0], dtype=torch.float, device=device)

    if execute_on_master_only:
        mAP = broadcast_tensor(mAP, src=0)
    return mAP
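# For reference, pycocotools' COCO.loadRes accepts a list of per-detection
# dicts in the standard COCO results format, so a "prediction_report" along
# these lines would satisfy the metric above (all values are made up for
# illustration):
example_prediction_report = [
    {
        "image_id": 42,                    # image id from the ground-truth JSON
        "category_id": 1,                  # COCO category id
        "bbox": [10.0, 20.0, 50.0, 80.0],  # [x, y, width, height] in pixels
        "score": 0.87,                     # detection confidence
    },
]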
def compare_torchscript_transformer_models(model, vocab_size):
    test_sample = Sample()
    test_sample.input_ids = torch.randint(low=0, high=vocab_size, size=(128,)).long()
    test_sample.input_mask = torch.ones(128).long()
    test_sample.segment_ids = torch.zeros(128).long()
    test_sample.image_feature_0 = torch.rand((1, 100, 2048)).float()
    test_sample.image = torch.rand((3, 300, 300)).float()
    test_sample_list = SampleList([test_sample])

    model = model.to(get_current_device())
    test_sample_list = test_sample_list.to(get_current_device())

    with torch.no_grad():
        model_output = model(test_sample_list)

    script_model = torch.jit.script(model)
    with torch.no_grad():
        script_output = script_model(test_sample_list)

    return torch.equal(model_output["scores"], script_output["scores"])
def test_vinvl_for_pretraining(self):
    model_for_pretraining = build_model(self.pretraining_config)
    model_for_pretraining.eval()
    model_for_pretraining = model_for_pretraining.to(get_current_device())

    with torch.no_grad():
        model_output = model_for_pretraining(self.sample_list)

    self.assertTrue("losses" in model_output)
    self.assertTrue("masked_lm_loss" in model_output["losses"])
    self.assertTrue("three_way_contrastive_loss" in model_output["losses"])
def forward(self, image_path: str, text: dict, image_format: str = "path"):
    text_output = self.processor["text_processor"](text)
    if image_format == "path":
        img = np.array(Image.open(image_path))
    elif image_format == "url":
        img = np.array(Image.open(requests.get(image_path, stream=True).raw))
    img = torch.as_tensor(img)

    if self.model_items["config"].image_feature_encodings.type == "frcnn":
        max_detect = self.model_items[
            "config"
        ].image_feature_encodings.params.max_detections
        image_preprocessed, sizes, scales_yx = self.processor["image_processor"](img)
        image_output = self.feature_extractor(
            image_preprocessed,
            sizes=sizes,
            scales_yx=scales_yx,
            padding=None,
            max_detections=max_detect,
            return_tensors="pt",
        )
        image_output = image_output[0]
    else:
        image_preprocessed = self.processor["image_processor"](img)
        image_output = self.feature_extractor(image_preprocessed)

    sample = Sample(text_output)
    sample.image_feature_0 = image_output
    sample_list = SampleList([sample])
    sample_list = sample_list.to(get_current_device())
    self.model = self.model.to(get_current_device())

    output = self.model(sample_list)
    sample_list.id = [sample_list.input_ids[0][0]]
    report = Report(sample_list, output)
    answers = self.processor["output_processor"](report)
    answer = self.processor["answer_processor"].idx2word(answers[0]["answer"])
    return answer
def setUp(self):
    self.k = 2
    self.batch_size = 64
    self.num_tokens = 10
    self.embedding_size = 768
    self.token_len = 10
    self.device = get_current_device()
    self.encoded_layers = [
        torch.randn(self.batch_size, self.token_len, self.embedding_size).to(
            self.device
        )
        for _ in range(3)
    ]
    self.pad_mask = torch.randn(self.batch_size, self.token_len).to(self.device)
def test_uniter_for_classification(self):
    self.model_for_classification.eval()
    self.model_for_classification = self.model_for_classification.to(
        get_current_device()
    )

    sample_list = self._get_sample_list()
    sample_list.dataset_name = "vqa2"
    sample_list.dataset_type = "test"
    with torch.no_grad():
        model_output = self.model_for_classification(sample_list)

    self.assertTrue("losses" in model_output)
    self.assertTrue("test/vqa2/logit_bce" in model_output["losses"])
def test_uniter_for_pretraining(self):
    self.model_for_pretraining.eval()
    self.model_for_pretraining = self.model_for_pretraining.to(
        get_current_device()
    )

    sample_list = self._get_sample_list()
    sample_list["tasks"] = "wra"
    sample_list.dataset_name = "vqa2"
    sample_list.dataset_type = "test"
    with torch.no_grad():
        model_output = self.model_for_pretraining(sample_list)

    self.assertTrue("losses" in model_output)
    self.assertTrue("wra_loss" in model_output["losses"])
def _test_model_performance(self, model):
    model = model.to(get_current_device())
    result = model.classify(
        "https://i.imgur.com/tEcsk5q.jpg", "look how many people love you"
    )
    self.assertEqual(result["label"], 0)
    np.testing.assert_almost_equal(result["confidence"], 0.9993, decimal=3)

    result = model.classify(
        "https://i.imgur.com/tEcsk5q.jpg", "they have the privilege"
    )
    self.assertEqual(result["label"], 0)
    np.testing.assert_almost_equal(result["confidence"], 0.9777, decimal=1)

    result = model.classify("https://i.imgur.com/tEcsk5q.jpg", "hitler and jews")
    self.assertEqual(result["label"], 1)
    np.testing.assert_almost_equal(result["confidence"], 0.8342, decimal=3)
def test_pretrained_model(self):
    sample_list = SampleList()

    sample_list.add_field(
        "input_ids",
        torch.randint(low=0, high=BERT_VOCAB_SIZE, size=(1, 128)).long(),
    )
    sample_list.add_field("input_mask", torch.ones((1, 128)).long())
    sample_list.add_field("segment_ids", torch.zeros(1, 128).long())
    sample_list.add_field("image", torch.rand((1, 3, 224, 224)).float())
    sample_list.add_field("targets", torch.rand((1, 3129)).float())

    self.pretrain_model.eval()
    self.pretrain_model = self.pretrain_model.to(get_current_device())
    sample_list = sample_list.to(get_current_device())

    sample_list.dataset_name = "test"
    sample_list.dataset_type = "test"
    with torch.no_grad():
        model_output = self.pretrain_model(sample_list)

    self.assertTrue("losses" in model_output)
    self.assertTrue("test/test/logit_bce" in model_output["losses"])
def test_classification_forward(self):
    model = VinVLForClassification().to(get_current_device())
    model.eval()
    with torch.no_grad():
        model_output = model(
            input_ids=self.input_ids,
            img_feats=self.img_feats,
            attention_mask=self.attention_mask,
            token_type_ids=self.token_type_ids,
            labels=self.labels,
        )
    self.assertTrue("losses" in model_output)
    self.assertTrue("scores" in model_output)
    self.assertTrue("ce" in model_output["losses"])
def test_uniter_for_pretraining(self):
    # UNITER pretraining has 5 pretraining tasks; we have one unique head
    # for each, and in each forward pass we train on a different task.
    # In this test we run a forward pass through each head.
    heads = {
        "mlm": {"type": "mlm"},
        "itm": {"type": "itm"},
        "mrc": {"type": "mrc"},
        "mrfr": {"type": "mrfr"},
        "wra": {"type": "wra"},
    }
    tasks = "mlm,itm,mrc,mrfr,wra"
    mask_probability = 0.15
    model = UNITERForPretraining(
        head_configs=heads, tasks=tasks, mask_probability=mask_probability
    )
    model.eval()
    model = model.to(get_current_device())

    sample_list = self._get_sample_list()
    sample_list = self._enhance_sample_list_for_pretraining(sample_list)

    expected_loss_names = {
        "mlm": "masked_lm_loss",
        "itm": "itm_loss",
        "mrc": "mrc_loss",
        "mrfr": "mrfr_loss",
        "wra": "wra_loss",
    }
    for task_name, loss_name in expected_loss_names.items():
        sample_list["task"] = task_name
        with torch.no_grad():
            model_output = model(sample_list)
        self.assertTrue("losses" in model_output)
        self.assertTrue(loss_name in model_output["losses"])
def _enhance_sample_list_for_pretraining(self, sample_list):
    bs = sample_list["input_ids"].size(0)
    sentence_len = sample_list["input_ids"].size(1)

    is_correct = torch.ones((bs,), dtype=torch.long)
    lm_label_ids = torch.zeros((bs, sentence_len), dtype=torch.long)
    input_ids_masked = sample_list["input_ids"]
    num_feat = sample_list["image_feat"].size(1)
    cls_dim = 1601
    image_info = {"cls_prob": torch.rand((bs, num_feat, cls_dim))}

    sample_list.add_field("is_correct", is_correct)
    sample_list.add_field("task", "mlm")
    sample_list.add_field("lm_label_ids", lm_label_ids)
    sample_list.add_field("input_ids_masked", input_ids_masked)
    sample_list.add_field("image_info_0", image_info)

    # SampleList.to returns the moved list rather than moving in place,
    # so hand it back to the caller instead of discarding it
    return sample_list.to(get_current_device())
def change_dataloader(self):
    choice = 0

    if self.num_datasets <= 1:
        self.current_index = choice
        return

    if self._is_main:
        choice = self.iteration_strategy()

        # self._finished_iterators will always be empty in case of
        # non-proportional (equal) sampling
        while self.dataset_list[choice] in self._finished_iterators:
            choice = self.iteration_strategy()

    choice = broadcast_scalar(choice, 0, device=get_current_device())
    self.current_index = choice
def test_uniter_for_classification(self):
    heads = {"test": {"type": "mlp", "num_labels": 3129}}
    tasks = "test"
    losses = {"test": "logit_bce"}
    model = UNITERForClassification(
        head_configs=heads, loss_configs=losses, tasks=tasks
    )
    model.eval()
    model = model.to(get_current_device())

    sample_list = self._get_sample_list()
    with torch.no_grad():
        model_output = model(sample_list)

    self.assertTrue("losses" in model_output)
    self.assertTrue("test/test/logit_bce" in model_output["losses"])
def classify(self, image: ImageType, text: str):
    """Classifies a given image and text in it into Hateful/Non-Hateful.
    Image can be a url or a local path or you can directly pass a
    PIL.Image.Image object. Text needs to be a sentence containing all text
    in the image.

    >>> from mmf.models.mmbt import MMBT
    >>> model = MMBT.from_pretrained("mmbt.hateful_memes.images")
    >>> model.classify("some_url", "some_text")
    {"label": 0, "confidence": 0.56}

    Args:
        image (ImageType): Image to be classified
        text (str): Text in the image

    Returns:
        dict: "label" (1 for hateful, 0 for non-hateful) and "confidence"
            (softmax probability of the predicted label)
    """
    if isinstance(image, str):
        if image.startswith("http"):
            temp_file = tempfile.NamedTemporaryFile()
            download(image, *os.path.split(temp_file.name), disable_tqdm=True)
            image = tv_helpers.default_loader(temp_file.name)
            temp_file.close()
        else:
            image = tv_helpers.default_loader(image)

    text = self.processor_dict["text_processor"]({"text": text})
    image = self.processor_dict["image_processor"](image)

    sample = Sample()
    sample.text = text["text"]
    if "input_ids" in text:
        sample.update(text)

    sample.image = image
    sample_list = SampleList([sample])
    sample_list = sample_list.to(get_current_device())

    output = self.model(sample_list)
    scores = nn.functional.softmax(output["scores"], dim=1)
    confidence, label = torch.max(scores, dim=1)

    return {"label": label.item(), "confidence": confidence.item()}
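# Usage sketch for the classify API above, passing a local PIL image instead
# of a URL; the checkpoint name repeats the docstring's example and
# "meme.png" is a placeholder path:
from PIL import Image
from mmf.models.mmbt import MMBT

model = MMBT.from_pretrained("mmbt.hateful_memes.images")
result = model.classify(Image.open("meme.png"), "text appearing in the meme")
print(result["label"], result["confidence"])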
def test_pretraining_forward(self):
    model = VinVLForPretraining().to(get_current_device())
    model.eval()
    with torch.no_grad():
        model_output = model(
            img_feats=self.img_feats,
            attention_mask=self.attention_mask,
            token_type_ids=self.token_type_ids,
            input_ids_masked=self.input_ids,
            lm_label_ids=self.lm_label_ids,
            contrastive_labels=self.contrastive_labels,
            input_ids_corrupt=self.input_ids,
            token_type_ids_corrupt=self.token_type_ids,
            attention_mask_corrupt=self.attention_mask,
        )
    self.assertTrue("losses" in model_output)
    self.assertTrue("masked_lm_loss" in model_output["losses"])
    self.assertTrue("three_way_contrastive_loss" in model_output["losses"])
def change_dataloader(self):
    choice = 0

    if self.num_datasets <= 1:
        self.current_index = choice
        return

    if self._is_master:
        choice = np.random.choice(
            self.num_datasets, 1, p=self._dataset_probabilities
        )[0]

        # self._finished_iterators will always be empty in case of
        # non-proportional (equal) sampling
        while self.dataset_list[choice] in self._finished_iterators:
            choice = np.random.choice(
                self.num_datasets, 1, p=self._dataset_probabilities
            )[0]

    choice = broadcast_scalar(choice, 0, device=get_current_device())
    self.current_index = choice
def _get_sample_list(self):
    bs = 8
    num_feats = 70

    class MockObj:
        pass

    mock_input = MockObj()
    mock_vinvl_input_tensors(mock_input, bs=bs, num_feats=num_feats)

    input_mask = torch.ones_like(mock_input.input_ids)
    max_features = torch.ones((bs, num_feats)) * num_feats
    bbox = torch.randint(50, 200, (bs, num_feats, 4)).float()
    image_height = torch.randint(100, 300, (bs,))
    image_width = torch.randint(100, 300, (bs,))
    image_info = {
        "max_features": max_features,
        "bbox": bbox,
        "image_height": image_height,
        "image_width": image_width,
    }

    sample_list = SampleList()
    sample_list.add_field("input_ids", mock_input.input_ids)
    sample_list.add_field("input_ids_corrupt", mock_input.input_ids)
    sample_list.add_field("input_ids_masked", mock_input.input_ids)
    sample_list.add_field("image_feature_0", mock_input.img_feats)
    sample_list.add_field("image_info_0", image_info)
    sample_list.add_field("input_mask", input_mask)
    sample_list.add_field("input_mask_corrupt", input_mask)
    sample_list.add_field("segment_ids", mock_input.token_type_ids)
    sample_list.add_field("segment_ids_corrupt", mock_input.token_type_ids)
    sample_list.add_field("labels", mock_input.labels)
    sample_list.add_field("contrastive_labels", mock_input.contrastive_labels)
    sample_list.add_field("lm_label_ids", mock_input.lm_label_ids)

    sample_list = sample_list.to(get_current_device())
    sample_list.dataset_name = "test"
    sample_list.dataset_type = "test"
    return sample_list
def __init__(self, trainer):
    """
    Generates a path for saving the model, which can also be used to resume
    from a checkpoint.
    """
    self.trainer = trainer

    self.config = self.trainer.config
    self.save_dir = get_mmf_env(key="save_dir")
    self.model_name = self.config.model

    self.ckpt_foldername = self.save_dir
    self.device = get_current_device()

    self.ckpt_prefix = ""
    if hasattr(self.trainer.model, "get_ckpt_name"):
        self.ckpt_prefix = self.trainer.model.get_ckpt_name() + "_"

    self.pth_filepath = os.path.join(
        self.ckpt_foldername, self.ckpt_prefix + self.model_name + "_final.pth"
    )

    self.models_foldername = os.path.join(self.ckpt_foldername, "models")
    if not PathManager.exists(self.models_foldername):
        PathManager.mkdirs(self.models_foldername)

    self.save_config()

    self.repo_path = updir(os.path.abspath(__file__), n=3)
    self.git_repo = None
    if git and self.config.checkpoint.save_git_details:
        try:
            self.git_repo = git.Repo(self.repo_path)
        except git.exc.InvalidGitRepositoryError:
            # Not a git repo, don't do anything
            pass

    self.max_to_keep = self.config.checkpoint.max_to_keep
    self.saved_iterations = []
def test_forward(self):
    img_feature_dim = 2054
    bert_model_name = "bert-base-uncased"
    use_img_layernorm = True
    img_layer_norm_eps = 1e-12
    bert_config = BertConfig.from_pretrained(bert_model_name)
    # Augment hf BertConfig for vinvl BertImgModel config
    bert_config.img_feature_dim = img_feature_dim
    bert_config.use_img_layernorm = use_img_layernorm
    bert_config.img_layer_norm_eps = img_layer_norm_eps
    model = VinVLBase(bert_config)

    model.eval()
    model = model.to(get_current_device())

    bs = 8
    num_feats = 70
    max_sentence_len = 25
    input_ids = torch.ones((bs, max_sentence_len), dtype=torch.long)
    img_feat = torch.rand((bs, num_feats, img_feature_dim))

    with torch.no_grad():
        model_output = model(input_ids, img_feat).last_hidden_state
    # Output sequence length is text + image: 25 + 70 = 95
    self.assertEqual(model_output.shape, torch.Size([8, 95, 768]))