def _update_and_create_report(
    self,
    batch: Dict,
    batch_idx: int,
    step_output: Dict,
    pl_module: LightningModule,
    combined_report: Report = None,
    update_meter: Meter = None,
):
    report = Report(batch, step_output)
    if update_meter:
        update_meter.update_from_report(report)

    should_accumulate = not (
        batch_idx % self.trainer_config.accumulate_grad_batches == 0
    )
    final_report = report
    if should_accumulate and combined_report is not None:
        combined_report.accumulate_tensor_fields_and_loss(
            report, pl_module.metrics.required_params
        )
        combined_report.batch_size += report.batch_size
        final_report = combined_report
    return final_report
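# Worked example of the accumulation schedule above (a sketch; assumes
# trainer_config.accumulate_grad_batches == 4): batch_idx 0, 4, 8, ... make
# should_accumulate False, so a fresh report is returned, while batch_idx
# 1-3, 5-7, ... fold their tensor fields and loss into combined_report and
# grow its batch_size by each incoming report's batch_size.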
def prediction_loop(self, dataset_type: str) -> None:
    reporter = self.dataset_loader.get_test_reporter(dataset_type)
    skipped_batches = 0
    loaded_batches = 0
    with torch.no_grad():
        self.model.eval()
        logger.info(f"Starting {dataset_type} inference predictions")
        while reporter.next_dataset():
            dataloader = reporter.get_dataloader()
            if self._can_use_tqdm(dataloader):
                dataloader = tqdm.tqdm(dataloader)
            for batch in dataloader:
                # Abort if a single batch takes longer than 10 minutes.
                with CompleteInTimeOrDie(600):
                    prepared_batch = reporter.prepare_batch(batch)
                    prepared_batch = to_device(prepared_batch, self.device)
                    loaded_batches += 1
                    if not validate_batch_sizes(prepared_batch.get_batch_size()):
                        logger.info("Skipping batch due to unequal batch sizes.")
                        skipped_batches += 1
                        continue
                    with torch.cuda.amp.autocast(
                        enabled=self.training_config.fp16
                    ):
                        model_output = self.model(prepared_batch)
                    report = Report(prepared_batch, model_output)
                    reporter.add_to_report(report, self.model)
            reporter.postprocess_dataset_report()

        logger.info(
            f"Finished predicting. Loaded {loaded_batches} batches, "
            f"skipped {skipped_batches}."
        )
        self.model.train()
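# Assumed semantics of the two guards used above, inferred from their usage
# here rather than from their definitions (which are outside this section):
# - CompleteInTimeOrDie(600) acts as a watchdog context manager: if a single
#   batch takes longer than 600 seconds, the process is aborted so that a hung
#   dataloader or collective op fails loudly instead of stalling forever.
# - validate_batch_sizes(n) checks that all distributed ranks received the
#   same batch size; mismatched batches are skipped so that later gather or
#   reduce calls cannot deadlock on inconsistent shapes.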
def _build_report(self):
    tensor_a = torch.tensor([[1, 2, 3, 4], [2, 3, 4, 5]])
    sample_list = SampleList()
    sample_list.add_field("a", tensor_a)
    model_output = {"scores": torch.rand(2, 2)}
    report = Report(sample_list, model_output)
    return report
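# Minimal usage sketch for the helper above (assumes the MMF import paths
# mmf.common.report.Report and mmf.common.sample.SampleList; the asserts rely
# only on Report merging the sample fields with the model-output fields):
def _demo_report_merging():
    import torch
    from mmf.common.report import Report
    from mmf.common.sample import SampleList

    sample_list = SampleList()
    sample_list.add_field("a", torch.tensor([[1, 2, 3, 4], [2, 3, 4, 5]]))
    report = Report(sample_list, {"scores": torch.rand(2, 2)})

    assert report["a"].shape == (2, 4)       # field from the SampleList
    assert report["scores"].shape == (2, 2)  # field from the model output
    assert report.batch_size == 2            # inferred from the SampleList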
def validation_step(self, batch: SampleList, batch_idx: int, *args, **kwargs):
    """Member function of PL modules. Used only when PL is enabled.
    To be implemented by child class. Takes in a ``SampleList`` and a
    batch index, and returns back a dict.

    Args:
        batch (SampleList): SampleList returned by the DataLoader for the
            current iteration
        batch_idx (int): index of the current batch

    Returns:
        Dict
    """
    output = self._forward_lightning_step(batch, batch_idx)
    report = Report(batch, output)
    self.val_meter.update_from_report(report)
    # A Report merges the sample fields with the model output, so it can be
    # passed as both the sample_list and model_output arguments of metrics.
    report.metrics = self.metrics(report, report)
    return output
def _forward(self, batch: Tensor) -> Dict[str, Any]:
    prepared_batch = self.dataset_loader.prepare_batch(batch)
    self.profile("Batch prepare time")
    # Arguments should be a dict at this point
    model_output = self.model(prepared_batch)
    report = Report(prepared_batch, model_output)
    self.profile("Forward time")
    return report
def on_train_batch_end(
    self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx
):
    # TODO(asg): Ask Sasha to investigate what is happening here in depth
    if "losses" in outputs:
        output = outputs
    else:
        output = outputs[0][0]["extra"]
    report = Report(output["input_batch"], output)
    self.lightning_losses.append(report["losses"]["loss"].item())
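# Shape of `outputs` handled above (hedged; the exact nesting depends on the
# PyTorch Lightning version): either the training_step result itself, e.g.
# {"losses": {"loss": tensor}, "input_batch": ...}, or a nested list of the
# form [[{"extra": {...}}]] wrapping that same dict under the "extra" key.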
def evaluation_loop(
    self, dataset_type: str, use_tqdm: bool = False, single_batch: bool = False
) -> Tuple[Dict[str, Any], Type[Meter]]:
    meter = Meter()
    reporter = self.dataset_loader.get_test_reporter(dataset_type)
    with torch.no_grad():
        self.model.eval()
        disable_tqdm = not use_tqdm or not is_master()
        while reporter.next_dataset(flush_report=False):
            dataloader = reporter.get_dataloader()
            combined_report = None

            for batch in tqdm.tqdm(dataloader, disable=disable_tqdm):
                prepared_batch = reporter.prepare_batch(batch)
                prepared_batch = to_device(prepared_batch, self.device)
                model_output = self.model(prepared_batch)
                report = Report(prepared_batch, model_output)
                self.update_meter(report, meter)

                # Accumulate the params necessary for metric calculation
                if combined_report is None:
                    # Make a copy of the report since `reporter.add_to_report`
                    # will change some of its keys later
                    combined_report = Report(report)
                else:
                    combined_report.accumulate_tensor_fields_and_loss(
                        report, self.metrics.required_params
                    )
                    combined_report.batch_size += report.batch_size

                # Each node generates a separate copy of the predict JSON from
                # the report, which will be used to evaluate dataset-level
                # metrics (such as mAP in object detection or CIDEr in image
                # captioning). Since `reporter.add_to_report` changes report
                # keys (e.g. scores), do this after
                # `combined_report.accumulate_tensor_fields_and_loss`
                if "__prediction_report__" in self.metrics.required_params:
                    reporter.add_to_report(
                        report, self.model, execute_on_master_only=False
                    )

                if single_batch is True:
                    break

        reporter.postprocess_dataset_report()
        # prediction_report is used for dataset-level metrics
        combined_report.prediction_report = reporter.report

        combined_report.metrics = self.metrics(combined_report, combined_report)
        self.update_meter(combined_report, meter, eval_mode=True)

        # Enable train mode again
        self.model.train()

    return combined_report, meter
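# Hypothetical caller sketch (the variable names are illustrative, not from
# the source): evaluation_loop returns one report accumulated over the whole
# dataset plus a meter with running loss/metric averages.
#
#   combined_report, meter = trainer.evaluation_loop("val", use_tqdm=True)
#   val_loss = meter.loss.global_avg       # running average over all batches
#   metrics = combined_report.metrics      # metrics computed on the full set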
def _forward(self, batch: Tensor) -> Dict[str, Any]:
    prepared_batch = self.dataset_loader.prepare_batch(batch)
    # Move the sample list to device if it isn't already.
    prepared_batch = to_device(prepared_batch, torch.device("cuda"))
    self.profile("Batch prepare time")
    # Arguments should be a dict at this point
    model_output = self.model(prepared_batch)
    report = Report(prepared_batch, model_output)
    self.profile("Forward time")
    return report
def _forward(self, batch: Dict[str, Tensor]) -> Dict[str, Any]:
    # Move the sample list to device if it isn't already.
    prepared_batch = to_device(batch, self.device)
    self.profile("Batch prepare time")
    # Arguments should be a dict at this point
    with torch.cuda.amp.autocast(enabled=self.training_config.fp16):
        model_output = self.model(prepared_batch)
        report = Report(prepared_batch, model_output)
    self.profile("Forward time")
    return report
def test_meter_update_from_report(self):
    meter = Meter()
    prepared_batch = SampleList(
        {"targets": torch.tensor([1, 2, 3, 4]), "dataset_type": "val"}
    )
    for idx in range(5):
        model_output = {
            "scores": torch.tensor([0, 1, 2, 3]),
            "losses": {"loss": float(idx)},
        }
        report = Report(prepared_batch, model_output)
        meter.update_from_report(report)

    self.assertEqual(meter.loss.global_avg, 2.0)
    self.assertEqual(meter.loss.avg, 2.0)
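# Worked check of the assertions above: update_from_report records one scalar
# loss per call, so after losses 0.0, 1.0, 2.0, 3.0, 4.0 both the global and
# the windowed average are (0 + 1 + 2 + 3 + 4) / 5 = 2.0.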
def test_argmax_prediction_processor(self):
    processor = ArgMaxPredictionProcessor(config={})
    batch = SampleList({"id": torch.tensor([1, 2, 3, 4, 5], dtype=torch.long)})
    model_output = {"scores": torch.rand(5, 4)}
    report = Report(batch, model_output)
    predictions = processor(report)

    # NOTE: these expected argmax indices depend on the RNG seed fixed in the
    # test setup; with an unseeded torch.rand the assertion would be flaky.
    expected_answers = [1, 1, 2, 1, 3]
    expected = []
    for idx, answer in enumerate(expected_answers):
        expected.append({"id": idx + 1, "answer": answer})
    self.assertEqual(predictions, expected)
def prediction_loop(self, dataset_type: str) -> None:
    reporter = self.dataset_loader.get_test_reporter(dataset_type)
    with torch.no_grad():
        self.model.eval()
        logger.info(f"Starting {dataset_type} inference predictions")
        while reporter.next_dataset():
            dataloader = reporter.get_dataloader()
            for batch in tqdm.tqdm(dataloader):
                prepared_batch = reporter.prepare_batch(batch)
                model_output = self.model(prepared_batch)
                report = Report(prepared_batch, model_output)
                reporter.add_to_report(report, self.model)

        logger.info("Finished predicting")
        self.model.train()
def predict(self, dataset_type):
    reporter = self.dataset_loader.get_test_reporter(dataset_type)
    with torch.no_grad():
        self.model.eval()
        message = f"Starting {dataset_type} inference predictions"
        self.writer.write(message)
        while reporter.next_dataset():
            dataloader = reporter.get_dataloader()
            for batch in tqdm(dataloader):
                prepared_batch = reporter.prepare_batch(batch)
                model_output = self.model(prepared_batch)
                report = Report(prepared_batch, model_output)
                reporter.add_to_report(report, self.model)

        self.writer.write("Finished predicting")
        self.model.train()
def prediction_loop(self, dataset_type: str) -> None:
    reporter = self.dataset_loader.get_test_reporter(dataset_type)
    with torch.no_grad():
        self.model.eval()
        logger.info(f"Starting {dataset_type} inference predictions")
        while reporter.next_dataset():
            dataloader = reporter.get_dataloader()
            for batch in tqdm.tqdm(dataloader):
                prepared_batch = reporter.prepare_batch(batch)
                prepared_batch = to_device(prepared_batch, torch.device("cuda"))
                with torch.cuda.amp.autocast(enabled=self.training_config.fp16):
                    model_output = self.model(prepared_batch)
                report = Report(prepared_batch, model_output)
                reporter.add_to_report(report, self.model)

        logger.info("Finished predicting")
        self.model.train()
def forward(self, image_path: str, text: dict, image_format: str = "path"):
    text_output = self.processor["text_processor"](text)
    if image_format == "path":
        img = np.array(Image.open(image_path))
    elif image_format == "url":
        img = np.array(Image.open(requests.get(image_path, stream=True).raw))
    else:
        # Fail early instead of hitting an undefined `img` below.
        raise ValueError(f"Unsupported image_format: {image_format}")
    img = torch.as_tensor(img)

    if self.model_items["config"].image_feature_encodings.type == "frcnn":
        max_detect = self.model_items[
            "config"
        ].image_feature_encodings.params.max_detections
        image_preprocessed, sizes, scales_yx = self.processor["image_processor"](
            img
        )
        image_output = self.feature_extractor(
            image_preprocessed,
            sizes=sizes,
            scales_yx=scales_yx,
            padding=None,
            max_detections=max_detect,
            return_tensors="pt",
        )
        image_output = image_output[0]
    else:
        image_preprocessed = self.processor["image_processor"](img)
        image_output = self.feature_extractor(image_preprocessed)

    sample = Sample(text_output)
    sample.image_feature_0 = image_output
    sample_list = SampleList([sample])
    sample_list = sample_list.to(get_current_device())
    self.model = self.model.to(get_current_device())
    output = self.model(sample_list)
    sample_list.id = [sample_list.input_ids[0][0]]
    report = Report(sample_list, output)
    answers = self.processor["output_processor"](report)
    answer = self.processor["answer_processor"].idx2word(answers[0]["answer"])
    return answer
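# Hypothetical usage sketch for the inference forward above (the file name,
# URL, and question are illustrative, not from the source):
#
#   answer = inferencer.forward("cat.jpg", {"text": "what animal is this?"})
#   answer = inferencer.forward(
#       "https://example.com/cat.jpg",
#       {"text": "what animal is this?"},
#       image_format="url",
#   )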
def evaluation_loop(
    self, dataset_type: str, use_tqdm: bool = False, single_batch: bool = False
) -> Tuple[Dict[str, Any], Type[Meter]]:
    meter = Meter()
    reporter = self.dataset_loader.get_test_reporter(dataset_type)
    use_cpu = self.config.evaluation.get("use_cpu", False)
    loaded_batches = 0
    skipped_batches = 0

    with torch.no_grad():
        self.model.eval()
        disable_tqdm = not use_tqdm or not is_master()
        while reporter.next_dataset(flush_report=False):
            dataloader = reporter.get_dataloader()
            combined_report = None

            if self._can_use_tqdm(dataloader):
                dataloader = tqdm.tqdm(dataloader, disable=disable_tqdm)
            for batch in dataloader:
                # Do not time out quickly on the first batch, as workers might
                # start at very different times.
                with CompleteInTimeOrDie(600 if loaded_batches else 3600 * 24):
                    loaded_batches += 1
                    prepared_batch = reporter.prepare_batch(batch)
                    prepared_batch = to_device(prepared_batch, self.device)
                    if not validate_batch_sizes(prepared_batch.get_batch_size()):
                        logger.info("Skipping batch due to unequal batch sizes.")
                        skipped_batches += 1
                        continue
                    model_output = self.model(prepared_batch)
                    report = Report(prepared_batch, model_output)
                    report = report.detach()

                    meter.update_from_report(report)

                    moved_report = report
                    # Move to CPU for later metrics calculation if needed.
                    # Explicitly use `non_blocking=False`, since a non-blocking
                    # copy can race with the next accumulate.
                    if use_cpu:
                        moved_report = report.copy().to("cpu", non_blocking=False)

                    # Accumulate the params necessary for metric calculation
                    if combined_report is None:
                        # Make a copy of the report since
                        # `reporter.add_to_report` will change some of its
                        # keys later
                        combined_report = moved_report.copy()
                    else:
                        combined_report.accumulate_tensor_fields_and_loss(
                            moved_report, self.metrics.required_params
                        )
                        combined_report.batch_size += moved_report.batch_size

                    # Each node generates a separate copy of the predict JSON
                    # from the report, which will be used to evaluate
                    # dataset-level metrics (such as mAP in object detection
                    # or CIDEr in image captioning). Since
                    # `reporter.add_to_report` changes report keys (e.g.
                    # scores), do this after
                    # `combined_report.accumulate_tensor_fields_and_loss`
                    if "__prediction_report__" in self.metrics.required_params:
                        # Still use the original report here (on GPU/TPU),
                        # since it will be gathered
                        reporter.add_to_report(
                            report, self.model, execute_on_master_only=False
                        )

                if single_batch is True:
                    break

        logger.info(
            f"Finished evaluation. Loaded {loaded_batches} batches, "
            f"skipped {skipped_batches}."
        )

        reporter.postprocess_dataset_report()
        assert (
            combined_report is not None
        ), "Please check if your validation set is empty!"
        # prediction_report is used for dataset-level metrics
        combined_report.prediction_report = reporter.report

        combined_report.metrics = self.metrics(combined_report, combined_report)

        # Since update_from_report will reduce the metrics over GPUs, we need
        # to move them back to the device; only the metrics and losses it
        # needs are moved, to avoid OOM. Use a blocking copy to avoid issues
        # in the device-to-host or host-to-device transfer.
        if use_cpu:
            combined_report = combined_report.to(
                self.device, fields=["metrics", "losses"], non_blocking=False
            )
        meter.update_from_report(combined_report, should_update_loss=False)

        # Enable train mode again
        self.model.train()

    return combined_report, meter
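# Design note on the use_cpu path above (a sketch of the trade-off, not source
# text): accumulating every per-batch report on the GPU can OOM on large
# validation sets, so reports are copied to host memory as they arrive; at the
# end only the small "metrics" and "losses" fields are moved back to the
# device, because update_from_report reduces them across processes on-device.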
def on_train_batch_end(
    self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx
):
    output = outputs[0][0]["extra"]
    report = Report(output["input_batch"], output)
    self.lightning_losses.append(report["losses"]["loss"].item())