def _do_loss_eval(self):
    # Copying inference_on_dataset from evaluator.py
    total = len(self._data_loader)
    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    losses = []
    for idx, inputs in enumerate(self._data_loader):
        if idx == num_warmup:
            start_time = time.perf_counter()
            total_compute_time = 0
        start_compute_time = time.perf_counter()
        # Compute the validation loss for this batch; this is the work being timed.
        loss_batch = self._get_loss(inputs)
        losses.append(loss_batch)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        total_compute_time += time.perf_counter() - start_compute_time
        iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
        seconds_per_img = total_compute_time / iters_after_start
        if idx >= num_warmup * 2 or seconds_per_img > 5:
            total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start
            eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1)))
            log_every_n_seconds(
                logging.INFO,
                "Loss on Validation done {}/{}. {:.4f} s / img. ETA={}".format(
                    idx + 1, total, seconds_per_img, str(eta)
                ),
                n=5,
            )
    mean_loss = np.mean(losses)
    self.trainer.storage.put_scalar('validation_loss', mean_loss)
    comm.synchronize()
    return losses
def _do_loss_eval(self) -> float:
    """
    Evaluate the loss function on the validation set.

    Returns:
        mean_loss (float): Value of the loss.
    """
    # Copying inference_on_dataset from evaluator.py
    num_samples: int = len(self._data_loader)
    self._logger.info("Starting validation on %d samples", num_samples)
    num_warmup: int = min(5, num_samples - 1)
    start_time: float = time.perf_counter()
    total_compute_time: float = 0
    losses: List[float] = []
    for idx, inputs in enumerate(self._data_loader):
        if idx == num_warmup:
            start_time = time.perf_counter()
            total_compute_time = 0
        # Inference for these inputs
        start_compute_time: float = time.perf_counter()
        loss_batch: float = self._get_loss(inputs)
        losses.append(loss_batch)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        total_compute_time += time.perf_counter() - start_compute_time

        iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
        seconds_per_img = total_compute_time / iters_after_start
        if idx >= num_warmup * 2 or seconds_per_img > 5:
            # Compute average time spent on each image.
            total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start
            # Compute ETA
            eta = datetime.timedelta(seconds=int(total_seconds_per_img * (num_samples - idx - 1)))
            log_every_n_seconds(
                lvl=logging.INFO,
                msg=f"Loss on Validation done {idx + 1}/{num_samples}."
                    f" {seconds_per_img:.4f} s / img. ETA={eta}",
                n=100,
                name=__name__,
            )
    # Average the losses.
    mean_loss = np.mean(losses)
    # Print the loss value.
    self._logger.info("Validation loss: %f", mean_loss)
    # Store the loss value for it to be logged and displayed in TensorBoard.
    self.trainer.storage.put_scalar('validation_loss', mean_loss)
    comm.synchronize()
    return mean_loss
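# Hedged companion sketch for the two _do_loss_eval variants above: one common way such a
# hook can be wired into a detectron2 DefaultTrainer. The LossEvalHook name and its
# constructor signature are assumptions; DefaultTrainer, DatasetMapper and
# build_detection_test_loader are standard detectron2 APIs.
from detectron2.data import DatasetMapper, build_detection_test_loader
from detectron2.engine import DefaultTrainer


class TrainerWithValLoss(DefaultTrainer):
    def build_hooks(self):
        hooks = super().build_hooks()
        # Insert before the PeriodicWriter (the last default hook) so that the
        # 'validation_loss' scalar written by _do_loss_eval gets flushed.
        hooks.insert(
            -1,
            LossEvalHook(  # hypothetical hook class wrapping _do_loss_eval above
                eval_period=self.cfg.TEST.EVAL_PERIOD,
                model=self.model,
                data_loader=build_detection_test_loader(
                    self.cfg,
                    self.cfg.DATASETS.TEST[0],
                    DatasetMapper(self.cfg, is_train=True),  # keep GT so losses can be computed
                ),
            ),
        )
        return hooks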
def evaluate_loss(self, cfg, model):
    """Compute and log the validation loss to Comet

    Args:
        cfg (CfgNode): Detectron Config Object
        model (torch.nn.Module): Detectron Model

    Returns:
        dict: Empty Dict to satisfy Detectron Eval Hook API requirements
    """
    eval_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0], DatasetMapper(cfg, True))

    # Copying inference_on_dataset from evaluator.py
    total = len(eval_loader)
    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    losses = []

    if comm.is_main_process():
        storage = get_event_storage()

    for idx, inputs in enumerate(eval_loader):
        if idx == num_warmup:
            start_time = time.perf_counter()
            total_compute_time = 0
        start_compute_time = time.perf_counter()
        # Compute the loss for this batch; this is what the timing below measures.
        loss_batch = self._get_loss(model, inputs)
        losses.append(loss_batch)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        total_compute_time += time.perf_counter() - start_compute_time

        iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
        seconds_per_img = total_compute_time / iters_after_start
        if idx >= num_warmup * 2 or seconds_per_img > 5:
            total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start
            eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1)))
            log_every_n_seconds(
                logging.INFO,
                "Loss on Validation done {}/{}. {:.4f} s / img. ETA={}".format(
                    idx + 1, total, seconds_per_img, str(eta)),
                n=5,
            )

    mean_loss = np.mean(losses)

    # Log to Comet and to the event storage on the main process only
    # (storage is only defined there).
    if comm.is_main_process():
        self.experiment.log_metric("eval_loss", mean_loss)
        storage.put_scalar("eval_loss", mean_loss)

    comm.synchronize()
    # Returns empty dict to satisfy Detectron Eval Hook requirement
    return {}
def inference(cfg, out_dir):
    # build model
    model = build_model(cfg)
    # resume
    DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
        "./output/autoaug_post_train/model_final.pth", resume=True)
    # data_loader
    mapper = DatasetMapper(cfg, False)
    data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0], mapper)
    total = len(data_loader)  # inference data loader must have a fixed length
    num_devices = torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0

    model.eval()
    # NOTE: torch.no_grad() must be used as a context manager; a bare call has no effect.
    with torch.no_grad():
        for idx, inputs in enumerate(data_loader):
            start_compute_time = time.perf_counter()
            outputs = model(inputs)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time

            # log
            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start
                eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)),
                    n=5,
                )

            for input, output in zip(inputs, outputs):
                pred_segm = output["sem_seg"].to("cpu")
                pred = torch.max(pred_segm, dim=0)[1].data
                pred = pred.numpy()[:, :, np.newaxis]
                pred = np.dstack((pred, pred, pred))
                cv2.imwrite(
                    out_dir + input["file_name"].split("/")[-1].replace("jpg", "png"),
                    pred * 255)
def do_test(cfg, model):
    results = OrderedDict()
    for dataset_name in cfg.DATASETS.TEST:
        if cfg.MULTI_DATASET.ENABLED:
            # TODO: refactor
            try:
                model.set_eval_dataset(dataset_name)
            except AttributeError:
                try:
                    model.module.set_eval_dataset(dataset_name)
                except AttributeError:
                    print('set eval dataset failed.')
        data_loader = build_detection_test_loader(cfg, dataset_name)
        logger = logging.getLogger(__name__)
        logger.info("Start inference on {} images".format(len(data_loader)))
        total = min(len(data_loader), cfg.DUMP_NUM_IMG)
        start_time = time.perf_counter()
        model.eval()
        with torch.no_grad():
            for idx, inputs in enumerate(data_loader):
                if idx >= total:
                    break
                _ = model(inputs)
                total_seconds_per_img = (time.perf_counter() - start_time) / (idx + 1)
                eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    "Inference done {}/{}. ETA={}".format(idx + 1, total, str(eta)),
                    n=5,
                )
        if cfg.DUMP_CLS_SCORE:
            class_scores = model.roi_heads.class_scores
            class_scores = [[y.tolist() for y in x] for x in class_scores]
            with open('{}/class_scores_{}.json'.format(cfg.OUTPUT_DIR, dataset_name), 'w') as f:
                json.dump(class_scores, f)
            model.roi_heads.class_scores = []
        if cfg.DUMP_BBOX:
            boxes = model.roi_heads.dump_boxes
            boxes = [[y.tolist() for y in x] for x in boxes]
            with open('{}/boxes_{}.json'.format(cfg.OUTPUT_DIR, dataset_name), 'w') as f:
                json.dump(boxes, f)
            model.roi_heads.dump_boxes = []
    return results
def _do_eval_loss(self, data_loader): total = len(data_loader) with torch.no_grad(): for idx, inputs in enumerate(data_loader): loss_dict = self._model(inputs) # loss_dict_scaled = {k: v * self.weight_dict[k] if k in self.weight_dict else v for k, v in loss_dict.items()} device = next(iter(loss_dict.values())).device with torch.cuda.stream(torch.cuda.Stream() if device.type == "cuda" else None): metrics_dict = { 'val_' + k: v.detach().cpu().item() for k, v in loss_dict.items() } all_metrics_dict = comm.gather(metrics_dict) if comm.is_main_process(): metrics_dict = { k: np.mean([x[k] for x in all_metrics_dict]) for k in all_metrics_dict[0].keys() } total_losses_reduced = sum( metrics_dict[k] * self.weight_dict[k.split('val_')[-1]] for k in metrics_dict.keys() if k.split('val_')[-1] in self.weight_dict) if not np.isfinite(total_losses_reduced): raise FloatingPointError( f"Loss became infinite or NaN at iteration={idx}!\n" f"loss_dict = {metrics_dict}") if torch.cuda.is_available(): max_mem_mb = torch.cuda.max_memory_allocated( ) / 1024.0 / 1024.0 else: max_mem_mb = None log_every_n_seconds( logging.INFO, msg= " iter: {iter}/{total} val_loss:{val_loss} {losses} {memory}" .format(iter=idx + 1, total=total, val_loss='{:.3f}'.format(total_losses_reduced), losses=" ".join([ "{}: {:.3f}".format( k.split('val_loss_')[-1], v) for k, v in metrics_dict.items() ]), memory="max_mem: {:.0f}M".format(max_mem_mb) if max_mem_mb is not None else ""), n=5, name=self.logger) storage = get_event_storage() if len(metrics_dict) > 1: storage.put_scalars( total_val_loss=total_losses_reduced, **metrics_dict)
def _log_progress(self, percentage):
    log_every_n_seconds(
        logging.INFO,
        "({:.2f}%) Wrote {} elements to local disk cache, db size: {:.2f} MiB".format(
            percentage,
            len(self._cache.cache),
            self._cache.cache.volume() / 1024**2,
        ),
        n=10,
    )
def _do_loss_eval(self):
    # Copying inference_on_dataset from evaluator.py
    total = len(self._data_loader)
    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    losses = []
    for idx, inputs in enumerate(self._data_loader):
        if idx == num_warmup:
            start_time = time.perf_counter()
            total_compute_time = 0
        start_compute_time = time.perf_counter()
        # Compute the validation loss for this batch; this is the work being timed.
        loss_batch = self._get_loss(inputs)
        losses.append(loss_batch)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        total_compute_time += time.perf_counter() - start_compute_time
        iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
        seconds_per_img = total_compute_time / iters_after_start
        if idx >= num_warmup * 2 or seconds_per_img > 5:
            total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start
            eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1)))
            log_every_n_seconds(
                logging.INFO,
                "Loss on Validation done {}/{}. {:.4f} s / img. ETA={}".format(
                    idx + 1, total, seconds_per_img, str(eta)
                ),
                n=5,
            )
    mean_loss = np.mean(losses)

    # Early stopping bookkeeping: save a checkpoint whenever the validation loss
    # improves (within a small tolerance), otherwise bump the patience counter.
    tol = 1e-4
    if mean_loss < self.best_val_loss + tol:
        self.best_val_loss = mean_loss
        self.waiting = 0
        print("Saving best model...")
        self.trainer.checkpointer.save("best_model")
        print("Model saved")
    else:
        self.waiting += 1

    self.trainer.storage.put_scalar('validation_loss', mean_loss)
    metrics_dict = {k: v[0] for k, v in self.trainer.storage.latest().items()}
    self.train_process.log_metrics(metrics_dict, self.trainer.iter)
    if self.waiting > self.patience and self.patience >= 0:
        self.trainer.run = False
    comm.synchronize()
    return losses
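# Hedged companion sketch for the early-stopping variant above: one plausible __init__
# that sets up the fields (best_val_loss, waiting, patience, train_process) that
# _do_loss_eval relies on. The attribute names come from the snippet; the class name and
# argument list are assumptions. HookBase is the real detectron2 hook base class.
from detectron2.engine import HookBase


class EarlyStoppingLossEvalHook(HookBase):  # hypothetical name
    def __init__(self, eval_period, model, data_loader, patience=5, train_process=None):
        self._model = model                  # used by self._get_loss(...)
        self._period = eval_period           # how often (in iterations) to run _do_loss_eval
        self._data_loader = data_loader
        self.patience = patience             # a negative value disables early stopping
        self.best_val_loss = float("inf")    # best validation loss seen so far
        self.waiting = 0                     # evaluations since the last improvement
        self.train_process = train_process   # hypothetical external metrics logger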
def inference_on_dataset(model, data_loader, evaluator):
    """
    Run model on the data_loader and evaluate the metrics with evaluator.
    Also benchmark the inference speed of `model.forward` accurately.
    The model will be used in eval mode.

    Args:
        model (nn.Module): a module which accepts an object from
            `data_loader` and returns some outputs. It will be temporarily set to `eval` mode.

            If you wish to evaluate a model in `training` mode instead, you can
            wrap the given model and override its behavior of `.eval()` and `.train()`.
        data_loader: an iterable object with a length.
            The elements it generates will be the inputs to the model.
        evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only want
            to benchmark, but don't want to do any evaluation.

    Returns:
        The return value of `evaluator.evaluate()`
    """
    num_devices = get_world_size()
    logger = logging.getLogger(__name__)
    logger.info("Start inference on {} images".format(len(data_loader)))

    total = len(data_loader)  # inference data loader must have a fixed length
    if evaluator is None:
        # create a no-op evaluator
        evaluator = DatasetEvaluators([])
    evaluator.reset()

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    # inference_context temporarily puts the model in eval() mode
    with inference_context(model), torch.no_grad():
        for idx, inputs in enumerate(data_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0

            start_compute_time = time.perf_counter()
            # inputs: list[dict], one dict per image:
            #   "file_name": full path to the image
            #   "height"/"width": original image height/width
            #   "image_id": image id
            #   "image": tensor (N, H, W)
            #   "instances": Instances with gt_boxes (a Boxes wrapping a tensor of shape (num, 4)),
            #                gt_classes (list[int]) and .image_size (the size after the transform)
            # outputs: list[dict{"instances": Instances}] of length batch_size
            #   (without NMS, each attribute has shape [topk]):
            #   .pred_boxes (Boxes): tensor of shape [topk, 4], already rescaled to the original image size
            #   .scores (Tensor): shape [topk]
            #   .pred_classes (Tensor): predicted class ids
            outputs = model(inputs)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            # process inputs and outputs
            evaluator.process(inputs, outputs)

            # total_compute_time excludes the num_warmup iterations,
            # so iters_after_start must also subtract num_warmup
            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                # eta: Estimated Time of Arrival, i.e. the estimated remaining time
                total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start
                eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)),
                    n=5,
                )

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        "Total inference time: {} ({:.6f} s / img per device, on {} devices)".format(
            total_time_str, total_time / (total - num_warmup), num_devices))
    total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)".format(
            total_compute_time_str, total_compute_time / (total - num_warmup), num_devices))

    # results is typically a dict such as
    #   {"AP", "AP50", "AP75", "APs", "APm", "APl", "AP-{class_name}", ...}
    results = evaluator.evaluate()
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    return results
def inference_on_dataset(model, data_loader, evaluator): """ Run model on the data_loader and evaluate the metrics with evaluator. The model will be used in eval mode. Args: model (nn.Module): a module which accepts an object from `data_loader` and returns some outputs. It will be temporarily set to `eval` mode. If you wish to evaluate a model in `training` mode instead, you can wrap the given model and override its behavior of `.eval()` and `.train()`. data_loader: an iterable object with a length. The elements it generates will be the inputs to the model. evaluator (DatasetEvaluator): the evaluator to run. Use :class:`DatasetEvaluators([])` if you only want to benchmark, but don't want to do any evaluation. Returns: The return value of `evaluator.evaluate()` """ num_devices = torch.distributed.get_world_size( ) if torch.distributed.is_initialized() else 1 logger = logging.getLogger(__name__) logger.info("Start inference on {} images".format(len(data_loader))) total = len(data_loader) # inference data loader must have a fixed length evaluator.reset() num_warmup = min(5, total - 1) start_time = time.perf_counter() total_compute_time = 0 with inference_context(model), torch.no_grad(): for idx, inputs in enumerate(data_loader): if idx == num_warmup: start_time = time.perf_counter() total_compute_time = 0 start_compute_time = time.perf_counter() outputs = model(inputs) if torch.cuda.is_available(): torch.cuda.synchronize() total_compute_time += time.perf_counter() - start_compute_time evaluator.process(inputs, outputs) if idx >= num_warmup * 2: seconds_per_img = total_compute_time / (idx + 1 - num_warmup) eta = datetime.timedelta(seconds=int(seconds_per_img * (total - idx - 1))) log_every_n_seconds( logging.INFO, "Inference done {}/{}. {:.4f} s / img. ETA={}".format( idx + 1, total, seconds_per_img, str(eta)), n=5, ) # Measure the time only for this worker (before the synchronization barrier) total_time = time.perf_counter() - start_time total_time_str = str(datetime.timedelta(seconds=total_time)) # NOTE this format is parsed by grep logger.info( "Total inference time: {} ({:.6f} s / img per device, on {} devices)". format(total_time_str, total_time / (total - num_warmup), num_devices)) total_compute_time_str = str( datetime.timedelta(seconds=int(total_compute_time))) logger.info( "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)" .format(total_compute_time_str, total_compute_time / (total - num_warmup), num_devices)) results = evaluator.evaluate() # An evaluator may return None when not in main process. # Replace it by an empty dict instead to make it easier for downstream code to handle if results is None: results = {} return results
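# Several of the snippets above rely on inference_context. For reference, a minimal sketch
# matching its behavior in detectron2.evaluation.evaluator (temporarily switch the model to
# eval mode, then restore the previous training flag); the real implementation lives there.
from contextlib import contextmanager

import torch.nn as nn


@contextmanager
def inference_context(model: nn.Module):
    """Temporarily put `model` in eval mode; restore its original mode on exit."""
    training_mode = model.training
    model.eval()
    yield
    model.train(training_mode)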
def inference_on_dataset(model, data_loader, evaluator, overwrite=True, only_zero_rot=True): """ Run model on the data_loader and evaluate the metrics with evaluator. Also benchmark the inference speed of `model.forward` accurately. The model will be used in eval mode. Args: model (nn.Module): a module which accepts an object from `data_loader` and returns some outputs. It will be temporarily set to `eval` mode. If you wish to evaluate a model in `training` mode instead, you can wrap the given model and override its behavior of `.eval()` and `.train()`. data_loader: an iterable object with a length. The elements it generates will be the inputs to the model. evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only want to benchmark, but don't want to do any evaluation. Returns: The return value of `evaluator.evaluate()` """ num_devices = get_world_size() logger = logging.getLogger(__name__) logger.info("Start inference on {} images".format(len(data_loader))) total = len(data_loader) # inference data loader must have a fixed length if evaluator is None: # create a no-op evaluator evaluator = DatasetEvaluators([]) evaluator.reset() predictions_save_path = path.join( evaluator._output_dir, f'predictions_{evaluator._dataset_name}.pkl') if not overwrite and path.exists(predictions_save_path): # Load existing predictions if overwrite is false print("Loading existing predictions") #evaluator._predictions = load_obj(predictions_save_path) (evaluator._predictions, evaluator.focussed_comps, evaluator.related_comps, evaluator.unrelated_comps, evaluator.n_comps, evaluator.pred_bboxes_scores, evaluator.unrelated_names, evaluator.focussed_names, evaluator.related_unresolved, evaluator.unrelated_unresolved, evaluator.wide_focus, evaluator.old_related_unresolved, evaluator.old_unrelated_unresolved, evaluator.misboxed_category) = load_obj(predictions_save_path) else: num_warmup = min(5, total - 1) start_time = time.perf_counter() total_compute_time = 0 with inference_context(model), torch.no_grad(): for idx, inputs in enumerate(data_loader): # We only need to evaluate the unrotated images #if inputs[0]['file_name'].endswith('ILTJ110530.36+465055.8_radio_DR2_rotated0deg.png'): # print('input filename') # print(inputs[0]['proposals']) if only_zero_rot and not inputs[0]['file_name'].endswith( '_rotated0deg.png'): continue if idx == num_warmup: start_time = time.perf_counter() total_compute_time = 0 start_compute_time = time.perf_counter() outputs = model(inputs) #missing_box = 'ILTJ123057.73+464446.2_radio_DR2_rotated0deg.png' #if inputs[0]['file_name'].endswith(missing_box): # print('output filename',missing_box) # print('inputs:',inputs) # print('outputs:', outputs) if torch.cuda.is_available(): torch.cuda.synchronize() total_compute_time += time.perf_counter() - start_compute_time # Appends predicted instances to evaluator._predictions evaluator.process(inputs, outputs) iters_after_start = idx + 1 - num_warmup * int( idx >= num_warmup) seconds_per_img = total_compute_time / iters_after_start if idx >= num_warmup * 2 or seconds_per_img > 5: total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start eta = datetime.timedelta( seconds=int(total_seconds_per_img * (total - idx - 1))) log_every_n_seconds( logging.INFO, "Inference done {}/{}. {:.4f} s / img. 
ETA={}".format( idx + 1, total, seconds_per_img, str(eta)), n=10, ) # Save to pickle save_obj([ evaluator._predictions, evaluator.focussed_comps, evaluator.related_comps, evaluator.unrelated_comps, evaluator.n_comps, evaluator.pred_bboxes_scores, evaluator.unrelated_names, evaluator.focussed_names, evaluator.related_unresolved, evaluator.unrelated_unresolved, evaluator.wide_focus, evaluator.old_related_unresolved, evaluator.old_unrelated_unresolved, evaluator.misboxed_category ], predictions_save_path) # Measure the time only for this worker (before the synchronization barrier) total_time = time.perf_counter() - start_time total_time_str = str(datetime.timedelta(seconds=total_time)) # NOTE this format is parsed by grep logger.info( "Total inference time: {} ({:.6f} s / img per device, on {} devices)" .format(total_time_str, total_time / (total - num_warmup), num_devices)) total_compute_time_str = str( datetime.timedelta(seconds=int(total_compute_time))) logger.info( "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)" .format(total_compute_time_str, total_compute_time / (total - num_warmup), num_devices)) results = evaluator.evaluate() if not isinstance(results, pd.DataFrame): logger.info( f"LOFAR Evaluation metrics (for all values 0% is best, 100% is worst):" ) logger.info(f"1. Pred. that fail to cover a single comp. source.") logger.info(f"{results['bbox']['assoc_single_fail_fraction']:.2%}") logger.info(f"2. Pred. that fail to cover all comp. of a " \ "multi-comp, source.") logger.info(f"{results['bbox']['assoc_multi_fail_fraction']:.2%}") logger.info( f"3. Pred. that include unassociated comp. for a single comp. source." ) logger.info(f"{results['bbox']['unassoc_single_fail_fraction']:.2%}") logger.info(f"4. Pred. that include unassociated comp. for a " \ "multi-comp. source.") logger.info(f"{results['bbox']['unassoc_multi_fail_fraction']:.2%}") logger.info( f"Catalogue is {results['bbox']['correct_catalogue']} correct.") # An evaluator may return None when not in main process. # Replace it by an empty dict instead to make it easier for downstream code to handle if results is None: results = {} return results
def inference_on_dataset( model, data_loader, evaluator: Union[DatasetEvaluator, List[DatasetEvaluator], None] ): """ Run model on the data_loader and evaluate the metrics with evaluator. Also benchmark the inference speed of `model.__call__` accurately. The model will be used in eval mode. Args: model (callable): a callable which takes an object from `data_loader` and returns some outputs. If it's an nn.Module, it will be temporarily set to `eval` mode. If you wish to evaluate a model in `training` mode instead, you can wrap the given model and override its behavior of `.eval()` and `.train()`. data_loader: an iterable object with a length. The elements it generates will be the inputs to the model. evaluator: the evaluator(s) to run. Use `None` if you only want to benchmark, but don't want to do any evaluation. Returns: The return value of `evaluator.evaluate()` """ num_devices = get_world_size() logger = logging.getLogger(__name__) logger.info("Start inference on {} batches".format(len(data_loader))) total = len(data_loader) # inference data loader must have a fixed length if evaluator is None: # create a no-op evaluator evaluator = DatasetEvaluators([]) if isinstance(evaluator, abc.MutableSequence): evaluator = DatasetEvaluators(evaluator) evaluator.reset() num_warmup = min(5, total - 1) start_time = time.perf_counter() total_compute_time = 0 with ExitStack() as stack: if isinstance(model, nn.Module): stack.enter_context(inference_context(model)) stack.enter_context(torch.no_grad()) for idx, inputs in enumerate(data_loader): if idx == num_warmup: start_time = time.perf_counter() total_compute_time = 0 start_compute_time = time.perf_counter() outputs = model(inputs) if torch.cuda.is_available(): torch.cuda.synchronize() total_compute_time += time.perf_counter() - start_compute_time evaluator.process(inputs, outputs) iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup) seconds_per_iter = total_compute_time / iters_after_start if idx >= num_warmup * 2 or seconds_per_iter > 5: total_seconds_per_iter = (time.perf_counter() - start_time) / iters_after_start eta = datetime.timedelta(seconds=int(total_seconds_per_iter * (total - idx - 1))) log_every_n_seconds( logging.INFO, "Inference done {}/{}. {:.4f} s / iter. ETA={}".format( idx + 1, total, seconds_per_iter, str(eta) ), n=5, ) # Measure the time only for this worker (before the synchronization barrier) total_time = time.perf_counter() - start_time total_time_str = str(datetime.timedelta(seconds=total_time)) # NOTE this format is parsed by grep logger.info( "Total inference time: {} ({:.6f} s / iter per device, on {} devices)".format( total_time_str, total_time / (total - num_warmup), num_devices ) ) total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time))) logger.info( "Total inference pure compute time: {} ({:.6f} s / iter per device, on {} devices)".format( total_compute_time_str, total_compute_time / (total - num_warmup), num_devices ) ) results = evaluator.evaluate() # An evaluator may return None when not in main process. # Replace it by an empty dict instead to make it easier for downstream code to handle if results is None: results = {} return results
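# Hedged usage sketch for the inference_on_dataset variant above, assuming an already-built
# detectron2 cfg (CfgNode); the dataset name and weights come from that config and are
# placeholders, while the imported helpers are standard detectron2 APIs.
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.data import build_detection_test_loader
from detectron2.evaluation import COCOEvaluator
from detectron2.modeling import build_model

model = build_model(cfg)                              # cfg: a detectron2 CfgNode, assumed defined
DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)  # load trained weights
data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
evaluator = COCOEvaluator(cfg.DATASETS.TEST[0], output_dir=cfg.OUTPUT_DIR)
results = inference_on_dataset(model, data_loader, evaluator)
print(results)  # e.g. {"bbox": {"AP": ..., "AP50": ..., ...}}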
def eval_mislabel_detection(dataset_name, cfg, mismatch_thresh=0.3, augment=False): data_loader = build_test_loader(cfg, dataset_name) model = build_model(cfg) DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) model.eval() n = len(data_loader) tp = torch.zeros(n) fp = torch.zeros(n) total = torch.zeros(n) npos = torch.zeros(n) num_devices = get_world_size() logger = logging.getLogger(__name__) logger.info("Start mislabel evaluation on {} images".format(n)) num_warmup = min(5, n - 1) start_time = time.perf_counter() total_compute_time = 0 for idx, inputs in enumerate(data_loader): if idx == num_warmup: start_time = time.perf_counter() total_compute_time = 0 start_compute_time = time.perf_counter() gt_mismatch_scores, _, _, gt_mislabeled_ids = detect_mislabeled_annotations_per_image( inputs, model) pred_mislabeled_ids = gt_mismatch_scores > mismatch_thresh total[idx] = gt_mismatch_scores.shape[0] npos[idx] = torch.sum(gt_mislabeled_ids).int() tp[idx] = torch.sum( torch.logical_and(gt_mislabeled_ids, pred_mislabeled_ids)).int() fp[idx] = torch.sum( torch.logical_and(torch.logical_not(gt_mislabeled_ids), pred_mislabeled_ids)).int() total_compute_time += time.perf_counter() - start_compute_time iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup) seconds_per_img = total_compute_time / iters_after_start if idx >= num_warmup * 2 or seconds_per_img > 5: total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start eta = datetime.timedelta(seconds=int(total_seconds_per_img * (n - idx - 1))) log_every_n_seconds( logging.INFO, "Processed {}/{}. {:.4f} s / img. ETA={}".format( idx + 1, n, seconds_per_img, str(eta)), n=5, ) # Measure the time only for this worker (before the synchronization barrier) total_time = time.perf_counter() - start_time total_time_str = str(datetime.timedelta(seconds=total_time)) logger.info( "Total evaluation time: {} ({:.6f} s / img per device, on {} devices)". format(total_time_str, total_time / (n - num_warmup), num_devices)) total_compute_time_str = str( datetime.timedelta(seconds=int(total_compute_time))) logger.info( "Total evaluation pure compute time: {} ({:.6f} s / img per device, on {} devices)" .format(total_compute_time_str, total_compute_time / (n - num_warmup), num_devices)) # recall = torch.sum(tp)/ torch.sum(npos) # precision = 1 - (torch.sum(tp) + torch.sum(fp))/torch.sum(total) # return recall.item(), precision.item() return torch.sum(tp).item(), torch.sum(fp).item(), torch.sum( npos).item(), torch.sum(total).item()
def detect_mislabeled_annotations(dataset_name, cfg, mismatch_thresh=0.3):
    class_names = MetadataCatalog.get(dataset_name).thing_classes
    sa_json_dir = sa_setup_project_dir(dataset_name, class_names)
    qa = []
    completed = []
    data_loader = build_test_loader(cfg, dataset_name)
    model = build_model(cfg)
    DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
    model.eval()
    n = len(data_loader)
    num_devices = get_world_size()
    logger = logging.getLogger(__name__)
    logger.info("Start mislabel detection on {} images".format(n))
    num_warmup = min(5, n - 1)
    start_time = time.perf_counter()
    total_compute_time = 0
    for idx, inputs in enumerate(data_loader):
        if idx == num_warmup:
            start_time = time.perf_counter()
            total_compute_time = 0
        start_compute_time = time.perf_counter()
        gt_mismatch_scores, gt_classes, gt_boxes, _ = detect_mislabeled_annotations_per_image(
            inputs, model)
        mislabeled_gt_ids = gt_mismatch_scores > mismatch_thresh
        gt_class_info = [(class_id.item(), class_names[class_id]) for class_id in gt_classes]
        if torch.any(mislabeled_gt_ids):
            qa.append(inputs[0]["file_name"])
        else:
            completed.append(inputs[0]["file_name"])
        sa_format_annotations(inputs[0]['image_id'], sa_json_dir, gt_boxes, gt_class_info,
                              mislabeled_gt_ids)
        total_compute_time += time.perf_counter() - start_compute_time
        iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
        seconds_per_img = total_compute_time / iters_after_start
        if idx >= num_warmup * 2 or seconds_per_img > 5:
            total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start
            eta = datetime.timedelta(seconds=int(total_seconds_per_img * (n - idx - 1)))
            log_every_n_seconds(
                logging.INFO,
                "Processed {}/{}. {:.4f} s / img. ETA={}".format(
                    idx + 1, n, seconds_per_img, str(eta)),
                n=5,
            )
    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    logger.info(
        "Total detection time: {} ({:.6f} s / img per device, on {} devices)".format(
            total_time_str, total_time / (n - num_warmup), num_devices))
    total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total detection pure compute time: {} ({:.6f} s / img per device, on {} devices)".format(
            total_compute_time_str, total_compute_time / (n - num_warmup), num_devices))
    sa_write_status_lists(dataset_name, qa, completed)
def inference_on_dataset(model, data_loader, distributed=True, output_dir=None): num_devices = get_world_size() logger = logging.getLogger("detectron2") logger.info("Start inference on {} images".format(len(data_loader))) total = len(data_loader) # inference data loader must have a fixed length num_warmup = min(5, total - 1) start_time = time.perf_counter() total_compute_time = 0 predictions = [] with inference_context(model), torch.no_grad(): for idx, inputs in enumerate(data_loader): if idx == num_warmup: start_time = time.perf_counter() total_compute_time = 0 start_compute_time = time.perf_counter() outputs = forward_warpper(model, inputs) if torch.cuda.is_available(): torch.cuda.synchronize() total_compute_time += time.perf_counter() - start_compute_time predictions.extend(process(inputs, outputs)) iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup) seconds_per_img = total_compute_time / iters_after_start if idx >= num_warmup * 2 or seconds_per_img > 5: total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1))) log_every_n_seconds( logging.INFO, "Inference done {}/{}. {:.4f} s / img. ETA={}".format( idx + 1, total, seconds_per_img, str(eta)), n=5, name="detectron2", ) # Measure the time only for this worker (before the synchronization barrier) total_time = time.perf_counter() - start_time total_time_str = str(datetime.timedelta(seconds=total_time)) # NOTE this format is parsed by grep logger.info( "Total inference time: {} ({:.6f} s / img per device, on {} devices)" .format(total_time_str, total_time / (total - num_warmup), num_devices)) total_compute_time_str = str( datetime.timedelta(seconds=int(total_compute_time))) logger.info( "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)" .format(total_compute_time_str, total_compute_time / (total - num_warmup), num_devices)) if distributed: comm.synchronize() predictions = comm.gather(predictions, dst=0) predictions = list(itertools.chain(*predictions)) if not comm.is_main_process(): return {} if output_dir: PathManager.mkdirs(output_dir) file_path = os.path.join(output_dir, "instances_predictions.pth") logger.info("Saving results to {}".format(file_path)) with PathManager.open(file_path, "wb") as f: torch.save(predictions, f) coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) logger.info( "Start converting obj365 results to coco type annotation json file...") coco_dict = convert_obj365_res_to_coco_json(coco_results) return coco_dict
def inference(model, data_loader, evaluator, k_th, K_fold):
    total = len(data_loader)  # inference data loader must have a fixed length
    logger = logging.getLogger("detectron2.trainer")
    logger.info("Start inference on {} images".format(total))
    num_devices = torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1

    # 1. initialize evaluator counter
    evaluator.reset()

    num_warmup = min(5 * K_fold, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0

    model.eval()
    # NOTE: torch.no_grad() must be used as a context manager; a bare call has no effect.
    with torch.no_grad():
        for idx, inputs in enumerate(data_loader):
            # warm up
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0
            # select sub dataset
            if idx % K_fold != k_th:
                continue
            start_compute_time = time.perf_counter()
            # 2. evaluate
            outputs = model(inputs)
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            # 3. update evaluator counter
            evaluator.process(inputs, outputs)
            # log
            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start
                eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    "{}_th sub_datasets | Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                        k_th, idx + 1, total, seconds_per_img, str(eta)),
                    n=5,
                )

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        "Total inference time: {} ({:.6f} s / img per device, on {} devices)".format(
            total_time_str, total_time / (total - num_warmup), num_devices))
    total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)".format(
            total_compute_time_str, total_compute_time / (total - num_warmup), num_devices))

    # 4. final evaluate
    results = evaluator.evaluate()
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    return results
def inference_on_dataset(model, data_loader, tracker, evaluator): """ Run model on the data_loader and evaluate the metrics with evaluator. Also benchmark the inference speed of `model.forward` accurately. The model will be used in eval mode. Args: model (nn.Module): a module which accepts an object from `data_loader` and returns some outputs. It will be temporarily set to `eval` mode. If you wish to evaluate a model in `training` mode instead, you can wrap the given model and override its behavior of `.eval()` and `.train()`. data_loader: an iterable object with a length. The elements it generates will be the inputs to the model. evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only want to benchmark, but don't want to do any evaluation. Returns: The return value of `evaluator.evaluate()` """ num_devices = get_world_size() logger = logging.getLogger("detectron2") logger.info("Start inference on {} images".format(len(data_loader))) total = len(data_loader) # inference data loader must have a fixed length if evaluator is None: # create a no-op evaluator evaluator = DatasetEvaluators([]) evaluator.reset() num_warmup = min(5, total - 1) start_time = time.perf_counter() total_compute_time = 0 res_tracks = dict() with inference_context(model), torch.no_grad(): for idx, inputs in enumerate(data_loader): # pre process. assert len(inputs) == 1 assert isinstance(inputs[0], tuple) frame_id = inputs[0][0].get("frame_id", None) assert frame_id is not None if frame_id == 1: tracker.reset_all() # warm up for first frame. _, pre_embed = model(inputs) # add pre embed to inputs. inputs[0][0]["pre_embed"] = pre_embed if idx == num_warmup: start_time = time.perf_counter() total_compute_time = 0 # inference. start_compute_time = time.perf_counter() outputs, pre_embed = model(inputs) if torch.cuda.is_available(): torch.cuda.synchronize() total_compute_time += time.perf_counter() - start_compute_time evaluator.process([inputs[0][0]], outputs) # post process. if frame_id == 1: res_track = tracker.init_track(outputs[0]["instances"]) else: res_track = tracker.step(outputs[0]["instances"]) res_tracks[inputs[0][0]["image_id"]] = res_track iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup) seconds_per_img = total_compute_time / iters_after_start if idx >= num_warmup * 2 or seconds_per_img > 5: total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1))) log_every_n_seconds( logging.INFO, "Inference done {}/{}. {:.4f} s / img. ETA={}".format( idx + 1, total, seconds_per_img, str(eta)), n=5, name="detectron2") # Measure the time only for this worker (before the synchronization barrier) total_time = time.perf_counter() - start_time total_time_str = str(datetime.timedelta(seconds=total_time)) # NOTE this format is parsed by grep logger.info( "Total inference time: {} ({:.6f} s / img per device, on {} devices)". format(total_time_str, total_time / (total - num_warmup), num_devices)) total_compute_time_str = str( datetime.timedelta(seconds=int(total_compute_time))) logger.info( "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)" .format(total_compute_time_str, total_compute_time / (total - num_warmup), num_devices)) results = evaluator.evaluate() # An evaluator may return None when not in main process. # Replace it by an empty dict instead to make it easier for downstream code to handle if results is None: results = {} return results, res_tracks
def gdrn_inference_on_dataset(cfg, model, data_loader, evaluator, amp_test=False): """Run model on the data_loader and evaluate the metrics with evaluator. Also benchmark the inference speed of `model.forward` accurately. The model will be used in eval mode. Args: model (nn.Module): a module which accepts an object from `data_loader` and returns some outputs. It will be temporarily set to `eval` mode. If you wish to evaluate a model in `training` mode instead, you can wrap the given model and override its behavior of `.eval()` and `.train()`. data_loader: an iterable object with a length. The elements it generates will be the inputs to the model. evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only want to benchmark, but don't want to do any evaluation. Returns: The return value of `evaluator.evaluate()` """ num_devices = get_world_size() logger = logging.getLogger(__name__) logger.info("Start inference on {} images".format(len(data_loader))) total = len(data_loader) # inference data loader must have a fixed length if evaluator is None: # create a no-op evaluator evaluator = DatasetEvaluators([]) evaluator.reset() num_warmup = min(5, total - 1) start_time = time.perf_counter() total_compute_time = 0 total_process_time = 0 with inference_context(model), torch.no_grad(): for idx, inputs in enumerate(data_loader): if idx == num_warmup: start_time = time.perf_counter() total_compute_time = 0 total_process_time = 0 start_compute_time = time.perf_counter() ############################# # process input batch = batch_data(cfg, inputs, phase="test") if evaluator.train_objs is not None: roi_labels = batch["roi_cls"].cpu().numpy().tolist() obj_names = [evaluator.obj_names[_l] for _l in roi_labels] if all(_obj not in evaluator.train_objs for _obj in obj_names): continue # if cfg.DEBUG: # for i in range(len(batch["roi_cls"])): # vis_roi_im = batch["roi_img"][i].cpu().numpy().transpose(1,2,0)[:, :, ::-1] # show_ims = [vis_roi_im] # show_titles = ["roi_im"] # # vis_coor2d = batch["roi_coord_2d"][i].cpu().numpy() # show_ims.extend([vis_coor2d[0], vis_coor2d[1]]) # show_titles.extend(["coord_2d_x", "coord_2d_y"]) # grid_show(show_ims, show_titles, row=1, col=3) with autocast(enabled=amp_test): out_dict = model( batch["roi_img"], roi_classes=batch["roi_cls"], roi_cams=batch["roi_cam"], roi_whs=batch["roi_wh"], roi_centers=batch["roi_center"], resize_ratios=batch["resize_ratio"], roi_coord_2d=batch.get("roi_coord_2d", None), roi_extents=batch.get("roi_extent", None), ) if torch.cuda.is_available(): torch.cuda.synchronize() cur_compute_time = time.perf_counter() - start_compute_time total_compute_time += cur_compute_time # NOTE: added # TODO: add detection time here outputs = [{} for _ in range(len(inputs))] for _i in range(len(outputs)): outputs[_i]["time"] = cur_compute_time start_process_time = time.perf_counter() evaluator.process(inputs, outputs, out_dict) # RANSAC/PnP cur_process_time = time.perf_counter() - start_process_time total_process_time += cur_process_time iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup) seconds_per_img = total_compute_time / iters_after_start if idx >= num_warmup * 2 or seconds_per_img > 5: total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1))) log_every_n_seconds( logging.INFO, f"Inference done {idx+1}/{total}. {seconds_per_img:.4f} s / img. 
ETA={str(eta)}", n=5) # Measure the time only for this worker (before the synchronization barrier) total_time = time.perf_counter() - start_time total_time_str = str(datetime.timedelta(seconds=total_time)) # NOTE this format is parsed by grep logger.info( f"Total inference time: {total_time_str} " f"({total_time / (total - num_warmup):.6f} s / img per device, on {num_devices} devices)" ) # pure forward time total_compute_time_str = str( datetime.timedelta(seconds=int(total_compute_time))) logger.info( "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)" .format(total_compute_time_str, total_compute_time / (total - num_warmup), num_devices)) # post_process time total_process_time_str = str( datetime.timedelta(seconds=int(total_process_time))) logger.info( "Total inference post process time: {} ({:.6f} s / img per device, on {} devices)" .format(total_process_time_str, total_process_time / (total - num_warmup), num_devices)) results = evaluator.evaluate() # results is always None # An evaluator may return None when not in main process. # Replace it by an empty dict instead to make it easier for downstream code to handle if results is None: results = {} return results
def inference_on_dataset(model, data_loader, evaluator): """ Run model on the data_loader and evaluate the metrics with evaluator. Also benchmark the inference speed of `model.forward` accurately. The model will be used in eval mode. Args: model (nn.Module): a module which accepts an object from `data_loader` and returns some outputs. It will be temporarily set to `eval` mode. If you wish to evaluate a model in `training` mode instead, you can wrap the given model and override its behavior of `.eval()` and `.train()`. data_loader: an iterable object with a length. The elements it generates will be the inputs to the model. evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only want to benchmark, but don't want to do any evaluation. Returns: The return value of `evaluator.evaluate()` """ num_devices = torch.distributed.get_world_size( ) if torch.distributed.is_initialized() else 1 logger = logging.getLogger(__name__) logger.info("Start inference on {} images".format(len(data_loader))) total = len(data_loader) # inference data loader must have a fixed length if evaluator is None: # create a no-op evaluator evaluator = DatasetEvaluators([]) evaluator.reset() # with forward aggregation the model does not return the output for the current input frame inputs_buffer = deque() num_warmup = min(5, total - 1) start_time = time.perf_counter() total_compute_time = 0 with inference_context(model), torch.no_grad(): for idx, inputs in enumerate(data_loader): if idx == num_warmup: start_time = time.perf_counter() total_compute_time = 0 assert (len(inputs) == 1), "Test batch size != 1({})".format( len(inputs)) if not inputs[0]['is_padding']: # skip left frame padding (repeated frames at the beginning of each video) # skip right frame padding (repeated frames at the end of each video): # - This way, inputs_buffer will be empty at the end of each video. With # each frame in the right padding we remove one actual input from the buffer. inputs_buffer.append(inputs) start_compute_time = time.perf_counter() outputs = model(inputs) if torch.cuda.is_available(): torch.cuda.synchronize() total_compute_time += time.perf_counter() - start_compute_time # incomplete iteration: # - processing padding at the beginning/end of the current video # or # - more frames are needed to perform the forward aggregation # therefore: # - discard outputs(==None) # - do not take into account execution time for incomplete iterations if outputs is not None: # the current output is related with the first input in the inputs_buffer inputs = inputs_buffer.popleft() evaluator.process(inputs, outputs) iters_after_start = idx + 1 - num_warmup * int( idx >= num_warmup) seconds_per_img = total_compute_time / iters_after_start if idx >= num_warmup * 2 or seconds_per_img > 5: total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start eta = datetime.timedelta( seconds=int(total_seconds_per_img * (total - idx - 1))) log_every_n_seconds( logging.INFO, "Inference done {}/{}. {:.4f} s / img. ETA={}".format( idx + 1, total, seconds_per_img, str(eta)), n=5, ) else: print(idx, 'padding') # Measure the time only for this worker (before the synchronization barrier) total_time = time.perf_counter() - start_time total_time_str = str(datetime.timedelta(seconds=total_time)) # NOTE this format is parsed by grep logger.info( "Total inference time: {} ({:.6f} s / img per device, on {} devices)". 
format(total_time_str, total_time / (total - num_warmup), num_devices)) total_compute_time_str = str( datetime.timedelta(seconds=int(total_compute_time))) logger.info( "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)" .format(total_compute_time_str, total_compute_time / (total - num_warmup), num_devices)) results = evaluator.evaluate() print(type(evaluator)) # An evaluator may return None when not in main process. # Replace it by an empty dict instead to make it easier for downstream code to handle if results is None: results = {} return results
def inference_on_dataset(model, data_loader, evaluator): """ Run model on the data_loader and evaluate the metrics with evaluator. Also benchmark the inference speed of `model.forward` accurately. The model will be used in eval mode. Args: model (nn.Module): a module which accepts an object from `data_loader` and returns some outputs. It will be temporarily set to `eval` mode. If you wish to evaluate a model in `training` mode instead, you can wrap the given model and override its behavior of `.eval()` and `.train()`. data_loader: an iterable object with a length. The elements it generates will be the inputs to the model. evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only want to benchmark, but don't want to do any evaluation. Returns: The return value of `evaluator.evaluate()` """ num_devices = get_world_size() logger = logging.getLogger(__name__) logger.info("Start inference on {} images".format(len(data_loader))) total = len(data_loader) # inference data loader must have a fixed length if evaluator is None: # create a no-op evaluator evaluator = DatasetEvaluators([]) evaluator.reset() # perf profiling prof_type = os.getenv('DETECTRON2_PROF', None) def prof_func(): return torch.autograd.profiler.profile(use_cuda=prof_type == 'cuda') if prof_type is not None: prof_key = '{}_time_total'.format(prof_type if prof_type == 'cpu' else 'cuda') prof_logger = logging.getLogger( 'detectron2_prof_test_{}'.format(prof_type)) prof_logger.setLevel(logging.INFO) prof_logger.addHandler( logging.FileHandler( './detectron2_prof_test_{}.log'.format(prof_type), 'w')) num_warmup = min(5, total - 1) start_time = time.perf_counter() total_compute_time = 0 # perf profiling: add timer for pre, and post-processing timer = [0, 0] loading_start, loading_time = time.perf_counter(), 0 with inference_context(model), torch.no_grad(), prof_func() as prof: for idx, inputs in enumerate(data_loader): loading_time += time.perf_counter() - loading_start if idx == num_warmup: start_time = time.perf_counter() total_compute_time = 0 start_compute_time = time.perf_counter() outputs = model(inputs, timer=timer) if torch.cuda.is_available(): torch.cuda.synchronize() total_compute_time += time.perf_counter() - start_compute_time evaluator.process(inputs, outputs) iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup) seconds_per_img = total_compute_time / iters_after_start if idx >= num_warmup * 2 or seconds_per_img > 5: total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1))) log_every_n_seconds( logging.INFO, "Inference done {}/{}. {:.4f} s / img. ETA={}".format( idx + 1, total, seconds_per_img, str(eta)), n=5, ) loading_start = time.perf_counter() # perf profiling logging if prof_type is not None: prof_logger.info(prof.key_averages().table(sort_by=prof_key)) # Measure the time only for this worker (before the synchronization barrier) total_time = time.perf_counter() - start_time total_time_str = str(datetime.timedelta(seconds=total_time)) # NOTE this format is parsed by grep logger.info( "Total inference time: {} ({:.6f} s / img per device, on {} devices)". 
format(total_time_str, total_time / (total - num_warmup), num_devices)) total_compute_time_str = str( datetime.timedelta(seconds=int(total_compute_time))) logger.info( "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)" .format(total_compute_time_str, total_compute_time / (total - num_warmup), num_devices)) logger.info( "Pre-processing time: {:.2f} s, Post-processing time: {:.2f}".format( timer[0], timer[1])) results = evaluator.evaluate() # An evaluator may return None when not in main process. # Replace it by an empty dict instead to make it easier for downstream code to handle if results is None: results = {} results['pre_processing_time'] = timer[0] results['post_processing_time'] = timer[1] results['loading_time'] = loading_time results['compute_time'] = total_compute_time results['inference_time'] = total_time return results
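# Hedged usage note for the profiling-enabled variant above: it reads the DETECTRON2_PROF
# environment variable to pick the autograd profiler mode and returns extra timing keys in
# the results dict. The call below is an assumption about how that variant is meant to be
# driven; model, data_loader and evaluator are assumed to be built as in the other snippets.
import os

os.environ["DETECTRON2_PROF"] = "cuda"   # or "cpu"; when unset, no profiler log is written
results = inference_on_dataset(model, data_loader, evaluator)
print(results["compute_time"], results["loading_time"],
      results["pre_processing_time"], results["post_processing_time"])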
def inference_custom(model, data_loader, evaluator): num_devices = get_world_size() logger = logging.getLogger(__name__) logger.info("Start inference on {} images".format(len(data_loader))) total = len(data_loader) # inference data loader must have a fixed length if evaluator is None: # create a no-op evaluator evaluator = DatasetEvaluators([]) evaluator.reset() num_warmup = min(5, total - 1) start_time = time.perf_counter() total_compute_time = 0 with inference_context(model), torch.no_grad(): for idx, inputs in enumerate(data_loader): if idx == num_warmup: start_time = time.perf_counter() total_compute_time = 0 start_compute_time = time.perf_counter() outputs = model(inputs) if torch.cuda.is_available(): torch.cuda.synchronize() total_compute_time += time.perf_counter() - start_compute_time evaluator.process(inputs, outputs) iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup) seconds_per_img = total_compute_time / iters_after_start if idx >= num_warmup * 2 or seconds_per_img > 5: total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1))) log_every_n_seconds( logging.INFO, "Inference done {}/{}. {:.4f} s / img. ETA={}".format( idx + 1, total, seconds_per_img, str(eta)), n=5, ) # Measure the time only for this worker (before the synchronization barrier) total_time = time.perf_counter() - start_time total_time_str = str(datetime.timedelta(seconds=total_time)) # NOTE this format is parsed by grep logger.info( "Total inference time: {} ({:.6f} s / img per device, on {} devices)". format(total_time_str, total_time / (total - num_warmup), num_devices)) total_compute_time_str = str( datetime.timedelta(seconds=int(total_compute_time))) logger.info( "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)" .format(total_compute_time_str, total_compute_time / (total - num_warmup), num_devices)) results = evaluator.evaluate() # An evaluator may return None when not in main process. # Replace it by an empty dict instead to make it easier for downstream code to handle if results is None: results = {} return results
def inference_on_dataset(
    model, data_loader, evaluator, num_classes, topk, num_estimate, min_score
):
    """
    Run model on the data_loader and evaluate the metrics with evaluator.
    Also benchmark the inference speed of `model.forward` accurately.
    The model will be used in eval mode.

    Args:
        model (nn.Module): a module which accepts an object from `data_loader`
            and returns some outputs. It will be temporarily set to `eval` mode.
            If you wish to evaluate a model in `training` mode instead, you can
            wrap the given model and override its behavior of `.eval()` and `.train()`.
        data_loader: an iterable object with a length. The elements it generates
            will be the inputs to the model.
        evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only
            want to benchmark, but don't want to do any evaluation.
        num_classes (int): number of foreground categories.
        topk (int or List[int]): number of top-scoring predictions to collect per
            class; an int is broadcast to all classes.
        num_estimate (int): Number of images to estimate initial score threshold.
        min_score (float): minimum score threshold applied to every class.

    Returns:
        The return value of `evaluator.evaluate()`
    """
    num_devices = get_world_size()
    logger.info("Start inference on {} images".format(len(data_loader)))
    if isinstance(topk, int):
        logger.info(f"Collecting top-{topk} images.")
        topk = [topk] * num_classes
    else:
        logger.info(f"Collecting top-k images. Counts:\n{topk}")

    total = len(data_loader)  # inference data loader must have a fixed length
    if evaluator is None:
        # create a no-op evaluator
        evaluator = DatasetEvaluators([])
    evaluator.reset()

    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_compute_time = 0

    # We keep track of scores from _this_ process (process_scores) and scores from
    # all processes (global_scores). Every iter, each process updates process_scores
    # and its local scores with the new scores from the model.
    # Every few iterations, all processes pass their process_scores to each other and
    # update their own global scores.

    # Map category id to min-heap of top scores from this process.
    process_scores = defaultdict(list)
    # Map category id to min-heap of top scores from all processes.
    global_scores = defaultdict(list)

    init_thresholds = torch.full(
        (num_classes + 1,), fill_value=min_score, dtype=torch.float32
    ).to(model.device)
    init_threshold_path = Path(evaluator._output_dir) / "_thresholds_checkpoint.pth"
    if init_threshold_path.exists():
        logger.info("Loading thresholds from disk.")
        init_thresholds = torch.load(init_threshold_path).to(model.device)
    else:
        init_threshold_path.parent.mkdir(exist_ok=True, parents=True)

    # Trying to get exactly the top-k estimates can result in getting slightly fewer
    # than K estimates. This can happen due to subtle differences in the model's forward
    # pass in the first phase vs. the second phase. For example, in the first phase,
    # when we have low thresholds, D2 will use torchvision.ops.boxes.batched_nms for
    # batch NMS. In phase 2, D2 will use a slightly different, customized
    # implementation, which may occasionally result in fewer boxes.
    # To address this, we set thresholds to be a bit looser, targeting 10% more
    # predictions than requested.
    topk_loose = [int(ceil(k * 1.1)) for k in topk]

    def get_thresholds(scores, min_thresholds):
        thresholds = []
        for i in range(num_classes):
            if topk_loose[i] == 0:
                thresholds.append(float("inf"))
            elif len(scores[i]) < topk_loose[i]:
                thresholds.append(-1)
            else:
                thresholds.append(scores[i][0])
        # Add -1 for background
        thresholds = torch.FloatTensor(thresholds + [-1]).to(model.device)
        # Clamp at minimum thresholds
        return torch.max(thresholds, min_thresholds)

    def update_scores(scores, inputs, outputs):
        updated = set()
        for image, output in zip(inputs, outputs):
            if isinstance(output, dict):
                instances = output["instances"]
            else:
                instances = output
            curr_labels = instances.pred_classes.int().tolist()
            curr_scores = instances.scores.cpu().tolist()
            for label, score in zip(curr_labels, curr_scores):
                # label = label.int().item()
                # scores[label].append((image["image_id"], score.cpu().item()))
                if len(scores[label]) >= topk_loose[label]:
                    if score < scores[label][0]:
                        continue
                    else:
                        heapq.heappushpop(scores[label], score)
                else:
                    heapq.heappush(scores[label], score)
                updated.add(label)

    def gather_scores(process_scores):
        # List of scores per process
        scores_list = comm.all_gather(process_scores)
        gathered = defaultdict(list)
        labels = {x for scores in scores_list for x in scores.keys()}
        for label in labels:
            # Sort in descending order.
            sorted_generator = heapq.merge(
                *[sorted(x[label], reverse=True) for x in scores_list], reverse=True
            )
            top_k = itertools.islice(sorted_generator, topk_loose[label])
            top_k_ascending = list(reversed(list(top_k)))  # Return to ascending order
            heapq.heapify(top_k_ascending)
            gathered[label] = top_k_ascending
        return gathered

    with inference_context(model), torch.no_grad():
        #########
        # Phase 1: Compute initial, low score thresholds without mask branch.
        #########
        # First, get an estimate of score thresholds with the mask branch off.
        # Otherwise, in the initial few images, we will run the mask branch on a bunch
        # of useless proposals which makes everything slow.
        num_estimate = min(num_estimate, len(data_loader))
        for idx, inputs in enumerate(
            tqdm(
                data_loader,
                desc="Computing score thresholds",
                total=num_estimate,
                disable=comm.get_rank() != 0,
            )
        ):
            if idx > num_estimate:
                break
            # Gather scores from other processes periodically.
            # In early iterations, the thresholds are low, making inference slow and
            # gather relatively fast, so we gather more often. Later, the thresholds
            # are high enough that inference is fast and gathering is slow, so we
            # stop gathering.
            if (idx < 100 and idx % 10 == 0) or (idx % 500 == 0):
                global_scores = gather_scores(process_scores)

            thresholds = get_thresholds(global_scores, init_thresholds)
            if idx % 1000 == 0:  # Save thresholds for later runs
                torch.save(thresholds, init_threshold_path)
            with per_class_thresholded_inference(model, thresholds, topk):
                with _turn_off_roi_heads(model, ["mask_on", "keypoint_on"]):
                    outputs = model.inference(inputs, do_postprocess=False)
            update_scores(global_scores, inputs, outputs)
            update_scores(process_scores, inputs, outputs)

            if (idx < 100 and idx % 10 == 0) or (idx % 100 == 0):
                logger.info(
                    "Threshold range (%s, %s); # collected: (%s, %s)",
                    thresholds[:-1].min(),
                    thresholds[:-1].max(),
                    min(len(x) for x in global_scores.values()),
                    max(len(x) for x in global_scores.values()),
                )

        del global_scores
        # Necessary to avoid timeout when gathering?
        comm.synchronize()

        # Map class to scores of predictions so far.
        init_scores = gather_scores(process_scores)
        # Minimum thresholds from the estimate stage
        init_thresholds = get_thresholds(init_scores, init_thresholds)
        # Clear scores from estimates; we will start tracking them again.
        scores = defaultdict(list)

        #########
        # Phase 2: Collect top-k predictions, with mask branch enabled.
        #########
        for idx, inputs in enumerate(data_loader):
            if idx == num_warmup:
                start_time = time.perf_counter()
                total_compute_time = 0

            start_compute_time = time.perf_counter()
            thresholds = get_thresholds(scores, init_thresholds)
            with per_class_thresholded_inference(model, thresholds, topk):
                with limit_mask_branch_proposals(model, max_proposals=300):
                    outputs = model(inputs)
            update_scores(scores, inputs, outputs)

            if torch.cuda.is_available():
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time
            evaluator.process(inputs, outputs)

            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            seconds_per_img = total_compute_time / iters_after_start
            if idx >= num_warmup * 2 or seconds_per_img > 5:
                total_seconds_per_img = (
                    time.perf_counter() - start_time
                ) / iters_after_start
                eta = datetime.timedelta(
                    seconds=int(total_seconds_per_img * (total - idx - 1))
                )
                log_every_n_seconds(
                    logging.INFO,
                    "Inference done {}/{}. {:.4f} s / img. ETA={}".format(
                        idx + 1, total, seconds_per_img, str(eta)
                    ),
                    n=5,
                    name=logger.name,
                )

            # Clear unnecessary predictions every so often.
            if idx < 100 or ((idx + 1) % 10) == 0:
                by_cat = defaultdict(list)
                for pred in evaluator._predictions:
                    for ann in pred["instances"]:
                        by_cat[ann["category_id"]].append(ann)
                topk_preds = []
                for c, anns in by_cat.items():
                    topk_preds.extend(
                        sorted(anns, key=lambda a: a["score"], reverse=True)[: topk[c]]
                    )
                evaluator._predictions = [{"instances": topk_preds}]

    if evaluator._output_dir:
        PathManager.mkdirs(evaluator._output_dir)
        file_path = os.path.join(
            evaluator._output_dir, f"instances_predictions_rank{comm.get_rank()}.pth"
        )
        with PathManager.open(file_path, "wb") as f:
            torch.save(evaluator._predictions, f)

    # Necessary to avoid timeout when gathering?
    comm.synchronize()

    # Limit number of detections per category across workers.
    predictions = comm.gather(evaluator._predictions, dst=0)
    if comm.is_main_process():
        predictions = list(itertools.chain(*predictions))
        by_cat = defaultdict(list)
        for pred in predictions:
            for ann in pred["instances"]:
                by_cat[ann["category_id"]].append(ann)
        logger.info(f"Max per cat: {max([len(v) for v in by_cat.values()])}")
        logger.info(f"Min per cat: {min([len(v) for v in by_cat.values()])}")
        topk_preds = []
        for c, anns in by_cat.items():
            topk_preds.extend(
                sorted(anns, key=lambda a: a["score"], reverse=True)[: topk[c]]
            )
        evaluator._predictions = [{"instances": topk_preds}]
    else:
        evaluator._predictions = []

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        "Total inference time: {} ({:.6f} s / img per device, on {} devices)".format(
            total_time_str, total_time / (total - num_warmup), num_devices
        )
    )
    total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)".format(
            total_compute_time_str,
            total_compute_time / (total - num_warmup),
            num_devices,
        )
    )

    results = evaluator.evaluate()
    # An evaluator may return None when not in main process.
    # Replace it by an empty dict instead to make it easier for downstream code to handle
    if results is None:
        results = {}
    return results
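A hedged sketch of how the two-phase inference_on_dataset above might be invoked. The dataset name, class count, and threshold values are illustrative assumptions for an LVIS-style large-vocabulary setup, not values taken from the original code; the model is assumed to be a GeneralizedRCNN-style module exposing .device and .inference().

# Hypothetical call into the two-phase top-k inference (values are illustrative).
from detectron2.data import build_detection_test_loader
from detectron2.evaluation import LVISEvaluator

data_loader = build_detection_test_loader(cfg, "lvis_v1_val")
evaluator = LVISEvaluator("lvis_v1_val", output_dir="./topk_eval")

results = inference_on_dataset(
    model,
    data_loader,
    evaluator,
    num_classes=1203,     # LVIS v1 category count, used here only as an example
    topk=10000,           # keep at most the top-10k detections per class
    num_estimate=5000,    # images used to estimate the initial score thresholds
    min_score=1e-4,       # floor for the per-class score thresholds
)

Phase 1 runs with the mask branch disabled to cheaply estimate per-class score thresholds; phase 2 re-runs the full model with those thresholds and periodically prunes evaluator._predictions so memory stays bounded at roughly top-k detections per class.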
def main(
    cfg,
    output_dir,
    runner=None,
    is_train=True,
):
    """Benchmark the data-loading speed of the train or test data loader."""
    setup_after_launch(cfg, output_dir, runner)

    if is_train:
        data_loader = runner.build_detection_train_loader(cfg)
    else:
        assert len(cfg.DATASETS.TEST) > 0, cfg.DATASETS.TEST
        data_loader = runner.build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])

    TOTAL_BENCHMARK_TIME = (
        100 if get_launch_environment() == "local" else 600
    )  # run the benchmark for 100 s locally, otherwise for 10 min
    LOGGING_METER_WINDOW_SIZE = 20
    LOGGING_METER_TIME_INTERVAL = 5
    WARMUP_ITERS = 5

    # initialize
    time_per_iter = HistoryBuffer(max_length=10000)
    total_time = 0

    start = time.time()
    for no, batch in enumerate(data_loader):
        data_time = time.time() - start
        time_per_iter.update(data_time)
        total_time += data_time

        if no == 0:
            logger.info("Show the first batch as example:\n{}".format(batch))

        # Assume batch size is constant
        batch_size = cfg.SOLVER.IMS_PER_BATCH // comm.get_world_size()
        assert len(batch) == batch_size

        median = time_per_iter.median(window_size=LOGGING_METER_WINDOW_SIZE)
        avg = time_per_iter.avg(window_size=LOGGING_METER_WINDOW_SIZE)
        log_every_n_seconds(
            logging.INFO,
            "iter: {};"
            " recent per-iter seconds: {:.4f} (avg) {:.4f} (median);"
            " recent per-image seconds: {:.4f} (avg) {:.4f} (median).".format(
                no,
                avg,
                median,
                avg / batch_size,
                median / batch_size,
            ),
            n=LOGGING_METER_TIME_INTERVAL,
        )

        # Synchronize between processes; exit when all processes have been running for
        # long enough. This mimics loss.backward(); the logged time doesn't include the
        # time spent in this synchronization.
        finished = comm.all_gather(total_time >= TOTAL_BENCHMARK_TIME)
        if all(x for x in finished):
            logger.info("Benchmarking finished after {} seconds".format(total_time))
            break

        start = time.time()

    dataset_name = ":".join(cfg.DATASETS.TRAIN) if is_train else cfg.DATASETS.TEST[0]
    time_per_iter = [x[0] for x in time_per_iter.values()]
    # Drop the warmup iterations from the statistics.
    time_per_iter = time_per_iter[
        min(WARMUP_ITERS, max(len(time_per_iter) - WARMUP_ITERS, 0)):
    ]

    results = {
        "environment": {
            "num_workers": cfg.DATALOADER.NUM_WORKERS,
            "world_size": comm.get_world_size(),
            "processes_per_machine": get_num_processes_per_machine(),
        },
        "main_processes_stats": {
            "batch_size_per_process": batch_size,
            "per_iter_avg": np.average(time_per_iter),
            "per_iter_p1": np.percentile(time_per_iter, 1, interpolation="nearest"),
            "per_iter_p10": np.percentile(time_per_iter, 10, interpolation="nearest"),
            "per_iter_p50": np.percentile(time_per_iter, 50, interpolation="nearest"),
            "per_iter_p90": np.percentile(time_per_iter, 90, interpolation="nearest"),
            "per_iter_p99": np.percentile(time_per_iter, 99, interpolation="nearest"),
            "per_image_avg": np.average(time_per_iter) / batch_size,
            "per_image_p1": np.percentile(time_per_iter, 1, interpolation="nearest")
            / batch_size,
            "per_image_p10": np.percentile(time_per_iter, 10, interpolation="nearest")
            / batch_size,
            "per_image_p50": np.percentile(time_per_iter, 50, interpolation="nearest")
            / batch_size,
            "per_image_p90": np.percentile(time_per_iter, 90, interpolation="nearest")
            / batch_size,
            "per_image_p99": np.percentile(time_per_iter, 99, interpolation="nearest")
            / batch_size,
        },
        "data_processes_stats": {},  # TODO: add worker stats
    }

    # Metrics follow the hierarchy of: name -> dataset -> task -> metrics -> number
    metrics = {"_name_": {dataset_name: results}}
    print_metrics_table(metrics)

    return {
        "accuracy": metrics,
        "metrics": metrics,
    }
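A small sketch of how this benchmark entry point might be launched, assuming a d2go-style runner; the runner class, config file, and worker count are assumptions to be adapted to the surrounding repo rather than part of the original code.

# Hypothetical invocation of the data-loader benchmark (runner and config are placeholders).
from d2go.runner import GeneralizedRCNNRunner

runner = GeneralizedRCNNRunner()
cfg = runner.get_default_cfg()
cfg.merge_from_file("configs/faster_rcnn_fbnetv3a_C4.yaml")  # placeholder config
cfg.DATALOADER.NUM_WORKERS = 4

metrics = main(cfg, output_dir="./benchmark_output", runner=runner, is_train=True)
# metrics["metrics"] holds the per-iteration and per-image timing percentiles.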