def _update_and_create_report(
    self,
    batch: Dict,
    batch_idx: int,
    step_output: Dict,
    pl_module: LightningModule,
    combined_report: Report = None,
    update_meter: Meter = None,
):
    """Wrap a step's output into a Report, optionally folding it into a running one.

    Args:
        batch: the input batch for this step.
        batch_idx: index of the batch within the epoch.
        step_output: model outputs for the step.
        pl_module: Lightning module providing ``metrics.required_params``.
        combined_report: running report accumulated across grad-accumulation steps.
        update_meter: when given, the fresh report is pushed into this meter.

    Returns:
        The fresh report at an accumulation boundary (or when there is nothing
        to fold into), otherwise ``combined_report`` with this step folded in.
    """
    step_report = Report(batch, step_output)
    if update_meter:
        update_meter.update_from_report(step_report)

    # A gradient-accumulation boundary is hit when batch_idx divides evenly;
    # between boundaries we keep folding step reports into the running one.
    at_boundary = batch_idx % self.trainer_config.accumulate_grad_batches == 0
    if at_boundary or combined_report is None:
        return step_report

    combined_report.accumulate_tensor_fields_and_loss(
        step_report, pl_module.metrics.required_params
    )
    combined_report.batch_size += step_report.batch_size
    return combined_report
def test_meter_update_from_report(self):
    """Meter should average per-report losses across successive updates."""
    meter = Meter()
    batch = SampleList(
        {"targets": torch.tensor([1, 2, 3, 4]), "dataset_type": "val"}
    )
    # Feed losses 0.0 .. 4.0; their mean is 2.0 both globally and over
    # the meter's moving window.
    for step in range(5):
        output = {
            "scores": torch.tensor([0, 1, 2, 3]),
            "losses": {"loss": float(step)},
        }
        meter.update_from_report(Report(batch, output))

    self.assertEqual(meter.loss.global_avg, 2.0)
    self.assertEqual(meter.loss.avg, 2.0)
def evaluation_loop(
    self, dataset_type: str, use_tqdm: bool = False, single_batch: bool = False
) -> Tuple[Dict[str, Any], Type[Meter]]:
    """Run inference over ``dataset_type`` and collect metrics.

    Iterates every dataset exposed by the test reporter, forwards each batch
    through the model under ``torch.no_grad()``, accumulates per-batch reports
    into one combined report, and computes metrics on it.

    Args:
        dataset_type: split to evaluate (e.g. "val" or "test").
        use_tqdm: show a progress bar (only on the master process).
        single_batch: stop after the first batch of each dataloader
            (useful for sanity checks).

    Returns:
        Tuple of (combined report with metrics attached, meter with losses
        and metrics reduced into it).
    """
    meter = Meter()
    reporter = self.dataset_loader.get_test_reporter(dataset_type)
    use_cpu = self.config.evaluation.get("use_cpu", False)
    loaded_batches = 0
    skipped_batches = 0

    with torch.no_grad():
        self.model.eval()
        disable_tqdm = not use_tqdm or not is_master()
        while reporter.next_dataset(flush_report=False):
            dataloader = reporter.get_dataloader()
            combined_report = None

            if self._can_use_tqdm(dataloader):
                dataloader = tqdm.tqdm(dataloader, disable=disable_tqdm)
            for batch in dataloader:
                # Do not time out quickly on the first batch, as workers
                # might start at very different times.
                with CompleteInTimeOrDie(600 if loaded_batches else 3600 * 24):
                    loaded_batches += 1
                    prepared_batch = reporter.prepare_batch(batch)
                    prepared_batch = to_device(prepared_batch, self.device)
                    if not validate_batch_sizes(prepared_batch.get_batch_size()):
                        logger.info("Skip batch due to uneven batch sizes.")
                        skipped_batches += 1
                        continue
                    model_output = self.model(prepared_batch)
                    report = Report(prepared_batch, model_output)
                    # Detach so accumulated reports do not hold graph refs.
                    report = report.detach()

                    meter.update_from_report(report)

                    moved_report = report
                    # Move to CPU for metrics calculation later if needed.
                    # Explicitly use `non_blocking=False` as non-blocking
                    # copies can cause race conditions in the next accumulate.
                    if use_cpu:
                        moved_report = report.copy().to("cpu", non_blocking=False)

                    # Accumulate the params needed for metric calculation.
                    if combined_report is None:
                        # Copy, since `reporter.add_to_report` will change
                        # some of the report keys later.
                        combined_report = moved_report.copy()
                    else:
                        combined_report.accumulate_tensor_fields_and_loss(
                            moved_report, self.metrics.required_params
                        )
                        combined_report.batch_size += moved_report.batch_size

                    # Each node generates a separate copy of predict JSON from
                    # the report, used to evaluate dataset-level metrics (such
                    # as mAP in object detection or CIDEr in image captioning).
                    # Since `reporter.add_to_report` changes report keys
                    # (e.g. scores), do this after
                    # `combined_report.accumulate_tensor_fields_and_loss`.
                    if "__prediction_report__" in self.metrics.required_params:
                        # Still use the original (device-resident) report here,
                        # since it will be gathered.
                        reporter.add_to_report(
                            report, self.model, execute_on_master_only=False
                        )

                if single_batch:
                    break

        # Bug fix: the message previously said "Finished training." even
        # though this is the evaluation loop.
        logger.info(f"Finished evaluation. Loaded {loaded_batches}")
        logger.info(f" -- skipped {skipped_batches} batches.")

        reporter.postprocess_dataset_report()
        assert (
            combined_report is not None
        ), "Please check if your validation set is empty!"
        # `prediction_report` is consumed by set-level metrics.
        combined_report.prediction_report = reporter.report

        combined_report.metrics = self.metrics(combined_report, combined_report)

        # Since update_meter will reduce the metrics over GPUs, we need to
        # move them back to GPU, but we only move the metrics and losses
        # needed by update_meter to avoid OOM. Use `non_blocking=False` to
        # avoid issues in device-to-host or host-to-device transfer.
        if use_cpu:
            combined_report = combined_report.to(
                self.device, fields=["metrics", "losses"], non_blocking=False
            )
        meter.update_from_report(combined_report, should_update_loss=False)

    # Enable train mode again.
    self.model.train()

    return combined_report, meter
def evaluation_loop(
    self, dataset_type: str, use_tqdm: bool = False, single_batch: bool = False
) -> Tuple[Dict[str, Any], Type[Meter]]:
    """Run inference over ``dataset_type`` and collect metrics.

    Iterates every dataset exposed by the test reporter, forwards each batch
    through the model under ``torch.no_grad()``, accumulates per-batch reports
    into a single combined report, then computes metrics on it.

    Args:
        dataset_type: split to evaluate (e.g. "val" or "test").
        use_tqdm: show a progress bar (only on the master process).
        single_batch: stop after the first batch of each dataloader.

    Returns:
        Tuple of (combined report with metrics attached, meter updated with
        per-batch losses and final metrics).
    """
    meter = Meter()
    reporter = self.dataset_loader.get_test_reporter(dataset_type)
    with torch.no_grad():
        self.model.eval()
        # Progress bar only shows when explicitly requested AND on master.
        disable_tqdm = not use_tqdm or not is_master()
        while reporter.next_dataset(flush_report=False):
            dataloader = reporter.get_dataloader()
            combined_report = None
            if self._can_use_tqdm(dataloader):
                dataloader = tqdm.tqdm(dataloader, disable=disable_tqdm)
            for batch in dataloader:
                prepared_batch = reporter.prepare_batch(batch)
                prepared_batch = to_device(prepared_batch, self.device)
                model_output = self.model(prepared_batch)
                report = Report(prepared_batch, model_output)
                # NOTE(review): the report is not detached here; we rely on
                # torch.no_grad() to keep it graph-free — confirm memory use
                # stays bounded across long validation runs.
                meter.update_from_report(report)

                # accumulate necessary params for metric calculation
                if combined_report is None:
                    # make a copy of report since `reporter.add_to_report` will
                    # change some of the report keys later
                    combined_report = Report(report)
                else:
                    combined_report.accumulate_tensor_fields_and_loss(
                        report, self.metrics.required_params
                    )
                    combined_report.batch_size += report.batch_size

                # Each node generates a separate copy of predict JSON from the
                # report, which will be used to evaluate dataset-level metrics
                # (such as mAP in object detection or CIDEr in image captioning)
                # Since `reporter.add_to_report` changes report keys (e.g. scores),
                # do this after `combined_report.accumulate_tensor_fields_and_loss`
                if "__prediction_report__" in self.metrics.required_params:
                    reporter.add_to_report(report, self.model, execute_on_master_only=False)

                if single_batch is True:
                    break

        reporter.postprocess_dataset_report()
        # An empty split means no batch ever produced a report.
        assert (
            combined_report is not None
        ), "Please check if your validation set is empty!"
        # add prediction_report is used for set-level metrics
        combined_report.prediction_report = reporter.report

        combined_report.metrics = self.metrics(combined_report, combined_report)
        # Metrics are attached but the loss was already averaged per batch,
        # so skip the loss when folding the combined report into the meter.
        meter.update_from_report(combined_report, should_update_loss=False)

    # enable train mode again
    self.model.train()

    return combined_report, meter