def __init__(self, model, save_dir="", *, save_to_disk=None, **checkpointables):
    is_main_process = comm.is_main_process()
    super().__init__(
        model,
        save_dir,
        save_to_disk=is_main_process if save_to_disk is None else save_to_disk,
        **checkpointables,
    )

def do_test(cfg, model):
    for dataset_name in cfg.DATASETS.TEST:
        # data_loader = build_detection_test_loader(cfg, dataset_name)
        if 'build_detection_test_loader':
            if dataset_name == 'coco_2017_val':
                dicts_valid: List[Dict] = DatasetCatalog.get(dataset_name)
                if "filter_empty and has_instances":
                    ...
                ds_valid = DatasetFromList(dicts_valid, copy=False)
                mapper = DatasetMapper(cfg, is_train=False)
            else:  # Open-Image-Dataset
                if 'get_detection_dataset_dicts':
                    descs_get: List[Dict] = DatasetCatalog.get(dataset_name)
                    # validation dataset is too large.
                    random.seed(2020)
                    descs_valid = random.choices(descs_get, k=N_IMAGES_PER_TEST)
                # TODO: clear cache.
                ds_valid = DatasetFromList(descs_valid)
                if 'DatasetMapper':
                    mapper = make_mapper(dataset_name, is_train=False, augmentations=None)

            ds_valid = MapDataset(ds_valid, mapper)

            sampler = InferenceSampler(len(ds_valid))
            # Always use 1 image per worker during inference since this is the
            # standard when reporting inference time in papers.
            batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, 1, drop_last=False)
            data_loader = torch.utils.data.DataLoader(
                ds_valid,
                num_workers=cfg.DATALOADER.NUM_WORKERS,
                batch_sampler=batch_sampler,
                collate_fn=trivial_batch_collator,
            )

        evaluator = get_evaluator2(
            cfg, dataset_name, os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
        )
        results_i = inference_on_dataset(model, data_loader, evaluator)
        if comm.is_main_process():
            logger.info("Evaluation results for {} in csv format:".format(dataset_name))
            # print_csv_format(results_i)
            for tsk, res in results_i.items():
                res_df = pd.DataFrame(pd.Series(res, name='value'))
                res_df = res_df[res_df['value'].notna()]
                res_df.index = res_df.index.map(lambda x: '/'.join(x.split('/')[1:]))
                pd.set_option('display.max_rows', None)
                print(res_df)
                pd.reset_option('display.max_rows')

def evaluate(self):
    if self._distributed:
        comm.synchronize()
        predictions = comm.gather(self._predictions, dst=0)
        predictions = list(itertools.chain(*predictions))

        if not comm.is_main_process():
            return {}
    else:
        predictions = self._predictions

    if len(predictions) == 0:
        self._logger.warning("[COCOEvaluator] Did not receive valid predictions.")
        return {}

    coco_results = list(itertools.chain(*[x["instances"] for x in predictions]))
    # print("*************************")
    # print("coco_results:", coco_results)
    # print("*************************")
    PathManager.mkdirs(self._output_dir)

    file_path = os.path.join(self._output_dir, "text_results.json")
    self._logger.info("Saving results to {}".format(file_path))
    with PathManager.open(file_path, "w") as f:
        f.write(json.dumps(coco_results))
        f.flush()

    self._results = OrderedDict()

    # eval text
    temp_dir = "temp_det_results/"
    self.to_eval_format(file_path, temp_dir, self._text_eval_confidence)
    result_path = self.sort_detection(temp_dir)
    text_result = self.evaluate_with_official_code(result_path, self._text_eval_gt_path)
    os.remove(result_path)

    # parse
    template = r"(\S+): (\S+): (\S+), (\S+): (\S+), (\S+): (\S+), (\S+): (\S+)"
    for task in ("e2e_method", "det_only_method"):
        result = text_result[task]
        groups = re.match(template, result).groups()
        self._results[groups[0]] = {
            groups[i * 2 + 1]: float(groups[(i + 1) * 2]) for i in range(4)
        }

    return copy.deepcopy(self._results)

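# --- Added worked example (not from the original source) ---
# Self-contained illustration of how the regex `template` in evaluate() above turns one
# result line into a metric dict. The sample string below is hypothetical; the real
# keys and values come from evaluate_with_official_code().
import re

_template = r"(\S+): (\S+): (\S+), (\S+): (\S+), (\S+): (\S+), (\S+): (\S+)"
_sample = "e2e_method: precision: 0.61, recall: 0.53, hmean: 0.57, ap: 0.00"
_groups = re.match(_template, _sample).groups()
# _groups[0] is the task name; the remaining eight groups alternate key/value.
_parsed = {_groups[i * 2 + 1]: float(_groups[(i + 1) * 2]) for i in range(4)}
# _parsed == {'precision': 0.61, 'recall': 0.53, 'hmean': 0.57, 'ap': 0.0}
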
def main(args):
    cfg = setup(args)

    if args.eval_only:
        model = Trainer.build_model(cfg)
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume)
        res = Trainer.test(cfg, model)
        if comm.is_main_process():
            verify_results(cfg, res)
        return res

    trainer = Trainer(cfg)
    trainer.resume_or_load(resume=args.resume)
    return trainer.train()

def train(self):
    """
    Run training.

    Returns:
        OrderedDict of results, if evaluation is enabled. Otherwise None.
    """
    self.train_loop(self.start_iter, self.max_iter)
    if hasattr(self, "_last_eval_results") and comm.is_main_process():
        verify_results(self.cfg, self._last_eval_results)
        print("*************************")
        print("self._last_eval_results:", self._last_eval_results)
        print("*************************")
        return self._last_eval_results

def evaluate(self):
    comm.synchronize()

    self._predictions = comm.gather(self._predictions)
    self._predictions = list(itertools.chain(*self._predictions))
    if not comm.is_main_process():
        return

    # PanopticApi requires local files
    gt_json = PathManager.get_local_path(self._metadata.panoptic_json)
    gt_folder = PathManager.get_local_path(self._metadata.panoptic_root)

    with tempfile.TemporaryDirectory(prefix="panoptic_eval") as pred_dir:
        logger.info("Writing all panoptic predictions to {} ...".format(pred_dir))
        for p in self._predictions:
            with open(os.path.join(pred_dir, p["file_name"]), "wb") as f:
                f.write(p.pop("png_string"))

        with open(gt_json, "r") as f:
            json_data = json.load(f)
        json_data["annotations"] = self._predictions
        with PathManager.open(self._predictions_json, "w") as f:
            f.write(json.dumps(json_data))

        from panopticapi.evaluation import pq_compute

        with contextlib.redirect_stdout(io.StringIO()):
            pq_res = pq_compute(
                gt_json,
                PathManager.get_local_path(self._predictions_json),
                gt_folder=gt_folder,
                pred_folder=pred_dir,
            )

    res = {}
    res["PQ"] = 100 * pq_res["All"]["pq"]
    res["SQ"] = 100 * pq_res["All"]["sq"]
    res["RQ"] = 100 * pq_res["All"]["rq"]
    res["PQ_th"] = 100 * pq_res["Things"]["pq"]
    res["SQ_th"] = 100 * pq_res["Things"]["sq"]
    res["RQ_th"] = 100 * pq_res["Things"]["rq"]
    res["PQ_st"] = 100 * pq_res["Stuff"]["pq"]
    res["SQ_st"] = 100 * pq_res["Stuff"]["sq"]
    res["RQ_st"] = 100 * pq_res["Stuff"]["rq"]

    results = OrderedDict({"panoptic_seg": res})
    _print_panoptic_results(pq_res)

    return results

def test(cls, cfg, model, evaluators=None):
    """
    Args:
        cfg (CfgNode):
        model (nn.Module):
        evaluators (list[DatasetEvaluator] or None): if None, will call
            :meth:`build_evaluator`. Otherwise, must have the same length as
            `cfg.DATASETS.TEST`.

    Returns:
        dict: a dict of result metrics
    """
    logger = logging.getLogger(__name__)
    if isinstance(evaluators, DatasetEvaluator):
        evaluators = [evaluators]
    if evaluators is not None:
        assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format(
            len(cfg.DATASETS.TEST), len(evaluators))

    results = OrderedDict()
    for idx, dataset_name in enumerate(cfg.DATASETS.TEST):
        data_loader = cls.build_test_loader(cfg, dataset_name)
        # When evaluators are passed in as arguments,
        # implicitly assume that evaluators can be created before data_loader.
        if evaluators is not None:
            evaluator = evaluators[idx]
        else:
            try:
                evaluator = cls.build_evaluator(cfg, dataset_name)
            except NotImplementedError:
                logger.warn(
                    "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, "
                    "or implement its `build_evaluator` method.")
                results[dataset_name] = {}
                continue
        results_i = inference_on_dataset(model, data_loader, evaluator)
        results[dataset_name] = results_i
        if comm.is_main_process():
            assert isinstance(
                results_i, dict
            ), "Evaluator must return a dict on the main process. Got {} instead.".format(
                results_i)
            logger.info("Evaluation results for {} in csv format:".format(dataset_name))
            print_csv_format(results_i)

    if len(results) == 1:
        results = list(results.values())[0]
    return results

def train(self):
    """
    Run training.

    Returns:
        OrderedDict of results, if evaluation is enabled. Otherwise None.
    """
    super().train(self.start_iter, self.max_iter)
    if len(self.cfg.TEST.EXPECTED_RESULTS) and comm.is_main_process():
        assert hasattr(
            self, "_last_eval_results"
        ), "No evaluation results obtained during training!"
        verify_results(self.cfg, self._last_eval_results)
        return self._last_eval_results

def _write_metrics(self, loss_dict: Dict[str, torch.Tensor], data_time: float):
    """
    Args:
        loss_dict (dict): dict of scalar losses
        data_time (float): time taken by the dataloader iteration
    """
    device = next(iter(loss_dict.values())).device

    # Use a new stream so these ops don't wait for DDP or backward
    with torch.cuda.stream(torch.cuda.Stream() if device.type == "cuda" else None):
        metrics_dict = {k: v.detach().cpu().item() for k, v in loss_dict.items()}
        metrics_dict["data_time"] = data_time

    # Gather metrics among all workers for logging
    # This assumes we do DDP-style training, which is currently the only
    # supported method in detectron2.
    all_metrics_dict = comm.gather(metrics_dict)

    if comm.is_main_process():
        storage = get_event_storage()

        # data_time among workers can have high variance. The actual latency
        # caused by data_time is the maximum among workers.
        data_time = np.max([x.pop("data_time") for x in all_metrics_dict])
        storage.put_scalar("data_time", data_time)

        # average the rest metrics
        try:
            metrics_dict = {
                k: np.mean([x[k] for x in all_metrics_dict])
                for k in all_metrics_dict[0].keys()
            }
        except:
            # pdb.set_trace()
            print(all_metrics_dict[0].keys())
            print(all_metrics_dict)

        total_losses_reduced = sum(metrics_dict.values())
        if not np.isfinite(total_losses_reduced):
            raise FloatingPointError(
                f"Loss became infinite or NaN at iteration={self.iter}!\n"
                f"loss_dict = {metrics_dict}")

        storage.put_scalar("total_loss", total_losses_reduced)
        if len(metrics_dict) > 1:
            storage.put_scalars(**metrics_dict)

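# --- Added illustration (not from the original source) ---
# Minimal sketch of the reduction performed in _write_metrics() after comm.gather():
# data_time is reduced with max across workers, every other metric with mean.
# The two dicts below are mock per-worker results, used only for illustration.
import numpy as np

_per_worker = [
    {"loss_cls": 0.50, "loss_box_reg": 0.30, "data_time": 0.02},
    {"loss_cls": 0.40, "loss_box_reg": 0.20, "data_time": 0.12},
]
_data_time = np.max([x.pop("data_time") for x in _per_worker])              # 0.12
_reduced = {k: np.mean([x[k] for x in _per_worker]) for k in _per_worker[0]}
_total_loss = sum(_reduced.values())                                        # 0.45 + 0.25 = 0.70
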
def build_hooks(self):
    """
    Build a list of default hooks, including timing, evaluation,
    checkpointing, lr scheduling, precise BN, writing events.

    Returns:
        list[HookBase]:
    """
    cfg = self.cfg.clone()
    cfg.defrost()
    cfg.DATALOADER.NUM_WORKERS = 0  # save some memory and time for PreciseBN

    ret = [
        hooks.IterationTimer(),
        hooks.LRScheduler(self.optimizer, self.scheduler),
        hooks.PreciseBN(
            # Run at the same freq as (but before) evaluation.
            cfg.TEST.EVAL_PERIOD,
            self.model,
            # Build a new data loader to not affect training
            self.build_train_loader(cfg),
            cfg.TEST.PRECISE_BN.NUM_ITER,
        ) if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model) else None,
    ]

    # Do PreciseBN before checkpointer, because it updates the model and need to
    # be saved by checkpointer.
    # This is not always the best: if checkpointing has a different frequency,
    # some checkpoints may have more precise statistics than others.
    # if comm.is_main_process():
    #     ret.append(hooks.PeriodicCheckpointer(self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_to_keep=20))

    def test_and_save_results():
        self._last_eval_results = self.test(self.cfg, self.model)
        return self._last_eval_results

    # Do evaluation after checkpointer, because then if it fails,
    # we can use the saved checkpoint to debug.
    # ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))
    ret.append(
        PeriodicCheckpointerWithEval(cfg.TEST.EVAL_PERIOD,
                                     test_and_save_results,
                                     self.checkpointer,
                                     cfg.SOLVER.CHECKPOINT_PERIOD,
                                     max_to_keep=3))

    if comm.is_main_process():
        # run writers in the end, so that evaluation metrics are written
        ret.append(hooks.PeriodicWriter(self.build_writers()))
    return ret

def main(args):
    # register_coco_instances('asparagus_train', {'_background_': 0, 'clump': 1, 'stalk': 2, 'spear': 3, 'bar': 4}, "./datasets/coco/annotations/train/annotations.json", "./datasets/coco/annotations/train")
    # register_coco_instances('asparagus_val', {'_background_': 0, 'clump': 1, 'stalk': 2, 'spear': 3, 'bar': 4}, "./datasets/coco/annotations/test_458/annotations.json", "./datasets/coco/annotations/test_458")
    register_coco_instances(
        'asparagus_train', {
            '_background_': 0,
            'clump': 1,
            'stalk': 2,
            'spear': 3,
            'bar': 4,
            'straw': 5
        }, "./datasets/coco/annotations/straw/train/annotations.json",
        "./datasets/coco/annotations/straw/train")
    register_coco_instances(
        'asparagus_val', {
            '_background_': 0,
            'clump': 1,
            'stalk': 2,
            'spear': 3,
            'bar': 4,
            'straw': 5
        }, "./datasets/coco/annotations/val_straw/val/annotations.json",
        "./datasets/coco/annotations/val_straw/val")

    cfg = setup(args)

    if args.eval_only:
        model = Trainer.build_model(cfg)
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume)
        res = Trainer.test(cfg, model)
        if cfg.TEST.AUG.ENABLED:
            res.update(Trainer.test_with_TTA(cfg, model))
        if comm.is_main_process():
            verify_results(cfg, res)
        return res

    """
    If you'd like to do anything fancier than the standard training logic,
    consider writing your own training loop (see plain_train_net.py) or
    subclassing the trainer.
    """
    trainer = Trainer(cfg)
    trainer.resume_or_load(resume=args.resume)
    if cfg.TEST.AUG.ENABLED:
        trainer.register_hooks([
            hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))
        ])
    return trainer.train()

def evaluate(self):
    """
    Returns:
        dict: has a key "APs", whose value is a dict of per-contact-class APs.
    """
    all_predictions = comm.gather(self._predictions, dst=0)
    if not comm.is_main_process():
        return
    predictions = defaultdict(list)
    for predictions_per_rank in all_predictions:
        for clsid, lines in predictions_per_rank.items():
            predictions[clsid].extend(lines)
    del all_predictions

    self._logger.info(
        "Evaluating {} using {} metric. "
        "Note that results do not use the official Matlab API.".format(
            self._dataset_name, 2007 if self._is_2007 else 2012))

    with tempfile.TemporaryDirectory(prefix="pascal_voc_eval_") as dirname:
        res_file_template = os.path.join(dirname, "{}.txt")

        aps = defaultdict(list)  # iou -> ap per class
        for cls_id, cls_name in enumerate(self._class_names):
            lines = predictions.get(cls_id, [""])

            with open(res_file_template.format(cls_name), "w") as f:
                f.write("\n".join(lines))

        thresh = 50
        APs = voc_eval(
            res_file_template,
            self._anno_file_template,
            self._image_set_path,
            cls_name,
            ovthresh=thresh / 100.0,
            use_07_metric=self._is_2007,
        )

    ret = OrderedDict()
    ret["APs"] = {
        "hand": APs["hand"]["ap"],
        "No Contact": APs["no_contact"]["ap"],
        "Self Contact": APs["self_contact"]["ap"],
        "Other Person Contact": APs["other_person_contact"]["ap"],
        "Object Contact": APs["object_contact"]["ap"],
        "Mean Contact AP": APs["mAP_contact"]
    }
    return ret

def evaluate(
    self, output_filename: Optional[str] = None
) -> "OrderedDict[str, Dict[str, Any]]":
    mses: List[float]
    # if distributed, gather and sum correct answers
    if self._distributed:
        comm.synchronize()
        mses_lists: List[List[float]] = comm.gather(self._mses, dst=0)
        if not comm.is_main_process():
            return OrderedDict()
        mses = sum(mses_lists, [])  # List[List[float]] -> List[float]
    else:
        mses = self._mses

    mse_tensor: torch.Tensor = torch.as_tensor(
        mses, dtype=torch.float, device=torch.device("cpu")
    )
    del mses
    total_mse: float = mse_tensor.sum().item()

    # saving total mse + histogram
    if self._output_dir:
        if output_filename is None:
            output_filename = f"{self._task_name}_mse_evaluation.json"
        json_dict: Dict[str, Any] = {"mse": total_mse}
        if self._n_bins > 1:
            mn: float = mse_tensor.min().item()
            mx: float = mse_tensor.max().item()
            mse_hist: torch.Tensor = torch.histc(
                mse_tensor, bins=self._n_bins, min=mn, max=mx
            )
            mse_bins: torch.Tensor = torch.linspace(
                start=mn, end=mx, steps=self._n_bins
            )
            json_dict["hist_counts"] = mse_hist.tolist()
            json_dict["hist_bins"] = mse_bins.tolist()
        PathManager.mkdirs(self._output_dir)
        file_path = os.path.join(self._output_dir, output_filename)
        with PathManager.open(file_path, "w") as f:
            json.dump(json_dict, f)

    # collect and return results
    results: "OrderedDict[str, Dict[str, float]]" = OrderedDict(
        [(f"mse_{self._task_name}", {"mse": total_mse})]
    )
    self._logger.info(results)
    return results

def init_pretrained_weights(key):
    """Initializes model with pretrained weights.

    Layers that don't match with pretrained layers in name or size are kept unchanged.
    """
    import os
    import errno
    import gdown

    def _get_torch_home():
        ENV_TORCH_HOME = 'TORCH_HOME'
        ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME'
        DEFAULT_CACHE_DIR = '~/.cache'
        torch_home = os.path.expanduser(
            os.getenv(
                ENV_TORCH_HOME,
                os.path.join(
                    os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'torch'
                )
            )
        )
        return torch_home

    torch_home = _get_torch_home()
    model_dir = os.path.join(torch_home, 'checkpoints')
    try:
        os.makedirs(model_dir)
    except OSError as e:
        if e.errno == errno.EEXIST:
            # Directory already exists, ignore.
            pass
        else:
            # Unexpected OSError, re-raise.
            raise

    filename = model_urls[key].split('/')[-1]
    cached_file = os.path.join(model_dir, filename)

    if not os.path.exists(cached_file):
        if comm.is_main_process():
            gdown.download(model_urls[key], cached_file, quiet=False)

    comm.synchronize()

    logger.info(f"Loading pretrained model from {cached_file}")
    state_dict = torch.load(cached_file, map_location=torch.device('cpu'))['model_state']

    return state_dict

def evaluate(self, img_ids=None):
    if self._distributed:
        synchronize()
        predictions = gather(self._predictions)
        predictions = list(itertools.chain(*predictions))
    else:
        predictions = self._predictions

    multi_storage = storage_gather(self._storage) if self._storage is not None else None

    if not is_main_process():
        return
    return copy.deepcopy(self._eval_predictions(predictions, multi_storage, img_ids))

def evaluate(self):
    if self._distributed:
        comm.synchronize()
        predictions = comm.gather(self._predictions, dst=0)
        predictions = list(itertools.chain(*predictions))

        if not comm.is_main_process():
            return {}
    else:
        predictions = self._predictions

    self._results = OrderedDict()
    self._results['val'] = {'acc': np.mean([x for x in predictions])}
    return copy.deepcopy(self._results)

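# --- Added sketch (assumption, not part of the original evaluator) ---
# evaluate() above simply averages whatever process() appended to self._predictions, so a
# matching process() would push one correctness value (0.0 or 1.0) per image. The
# "pred_class"/"label" keys below are assumed names, purely for illustration.
def process(self, inputs, outputs):
    for inp, out in zip(inputs, outputs):
        correct = float(out["pred_class"] == inp["label"])  # assumed keys
        self._predictions.append(correct)
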
def build_hooks(self):
    cfg = self.cfg.clone()
    cfg.defrost()
    cfg.DATALOADER.NUM_WORKERS = 0  # save some memory and time for PreciseBN

    ret = [
        hooks.IterationTimer(),
        hooks.LRScheduler(self.optimizer, self.scheduler),
        hooks.PreciseBN(
            # Run at the same freq as (but before) evaluation.
            cfg.TEST.EVAL_PERIOD,
            self.model,
            # Build a new data loader to not affect training
            self.build_train_loader(cfg),
            cfg.TEST.PRECISE_BN.NUM_ITER,
        ) if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model) else None,
    ]

    if comm.is_main_process():
        ret.append(
            hooks.PeriodicCheckpointer(self.checkpointer,
                                       cfg.SOLVER.CHECKPOINT_PERIOD))

    # def test_and_save_results():
    #     self._last_eval_results = self.test(self.cfg, self.model)
    #     return self._last_eval_results

    # Do evaluation after checkpointer, because then if it fails,
    # we can use the saved checkpoint to debug.
    # ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))

    if comm.is_main_process():
        # run writers in the end, so that evaluation metrics are written
        ret.append(hooks.PeriodicWriter(self.build_writers(), period=20))
    return ret

def do_test(cfg, model):
    results = OrderedDict()
    for dataset_name in cfg.DATASETS.TEST:
        data_loader = build_detection_test_loader(cfg, dataset_name)
        evaluator = get_evaluator(
            cfg, dataset_name, os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
        )
        results_i = inference_on_dataset(model, data_loader, evaluator)
        results[dataset_name] = results_i
        if comm.is_main_process():
            logger.info("Evaluation results for {} in csv format:".format(dataset_name))
            print_csv_format(results_i)
    if len(results) == 1:
        results = list(results.values())[0]
    return results

def setup_myargs_for_multiple_processing(myargs):
    from detectron2.utils import comm
    distributed = comm.get_world_size() > 1
    if distributed and comm.is_main_process():
        # setup logging in the project
        logfile = myargs.args.logfile
        logging_utils.get_logger(filename=logfile,
                                 logger_names=['template_lib', 'tl'],
                                 stream=True)
        logger = logging.getLogger('tl')
        myargs.logger = logger
        myargs.stdout = sys.stdout
        myargs.stderr = sys.stderr
        logging_utils.redirect_print_to_logger(logger=logger)
    return myargs

def build_hooks(self):
    cfg = self.cfg.clone()
    cfg.defrost()
    cfg.DATALOADER.NUM_WORKERS = 0  # save some memory and time for PreciseBN

    ret = [
        hooks.IterationTimer(),
        hooks.LRScheduler(self.optimizer, self.scheduler),
        hooks.PreciseBN(
            # Run at the same freq as (but before) evaluation.
            cfg.TEST.EVAL_PERIOD,
            self.model,
            # Build a new data loader to not affect training
            self.build_train_loader(cfg),
            cfg.TEST.PRECISE_BN.NUM_ITER,
        ) if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model) else None,
    ]

    if comm.is_main_process():
        ret.append(hooks.PeriodicCheckpointer(self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD))

    def test_and_save_results():
        res = self._last_eval_results = self.test(self.cfg, self.model)
        eval_dir = os.path.join(self.cfg.OUTPUT_DIR, 'evals')
        os.makedirs(eval_dir, exist_ok=True)
        pd.DataFrame(res).to_csv(os.path.join(eval_dir, f'{self.round}.csv'))
        return self._last_eval_results

    ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))

    if comm.is_main_process():
        # run writers in the end, so that evaluation metrics are written
        ret.append(hooks.PeriodicWriter(self.build_writers(), period=20))
    return ret

def evaluate(self):
    """
    Returns:
        dict: has a key "bbox", whose value is a dict of "AP", "AP50", and "AP75".
    """
    all_predictions = comm.gather(self._predictions, dst=0)
    if not comm.is_main_process():
        return
    predictions = defaultdict(list)
    for predictions_per_rank in all_predictions:
        for clsid, lines in predictions_per_rank.items():
            predictions[clsid].extend(lines)
    del all_predictions

    self._logger.info(
        "Evaluating {} using {} metric. "
        "Note that results do not use the official Matlab API.".format(
            self._dataset_name, 2007 if self._is_2007 else 2012))

    with tempfile.TemporaryDirectory(prefix="pascal_voc_eval_") as dirname:
        res_file_template = os.path.join(dirname, "{}.txt")

        aps = defaultdict(list)  # iou -> ap per class
        for cls_id, cls_name in enumerate(self._class_names):
            lines = predictions.get(cls_id, [""])

            with open(res_file_template.format(cls_name), "w") as f:
                f.write("\n".join(lines))

            for thresh in range(50, 100, 5):
                rec, prec, ap = voc_eval(
                    res_file_template,
                    self._anno_file_template,
                    self._image_set_path,
                    cls_name,
                    ovthresh=thresh / 100.0,
                    use_07_metric=self._is_2007,
                )
                aps[thresh].append(ap * 100)

    ret = OrderedDict()
    mAP = {iou: np.mean(x) for iou, x in aps.items()}
    ret["bbox"] = {
        "AP": np.mean(list(mAP.values())),
        "AP50": mAP[50],
        "AP75": mAP[75]
    }
    return ret

def evaluate(self):
    """
    Evaluates Referring Segmentation IoU:
    """
    if self._distributed:
        synchronize()
        self._predictions = all_gather(self._predictions)
        if not is_main_process():
            return
        all_prediction = {}
        for p in self._predictions:
            all_prediction.update(p)
    else:
        all_prediction = self._predictions

    image_unique_ids = list(all_prediction.keys())

    all_mIoU = []
    all_inter = []
    all_union = []
    all_mIoU_bg = []
    for img_sent_id in image_unique_ids:
        result = all_prediction[img_sent_id]
        all_mIoU.append(result[0])
        all_mIoU_bg.append(result[1])
        all_inter.append(result[2])
        all_union.append(result[3])

    MIoU = np.array(all_mIoU).mean()
    MIoU_bg = np.array(all_mIoU_bg).mean()
    OverIoU = np.array(all_inter).sum() / np.array(all_union).sum()

    if self._output_dir:
        PathManager.mkdirs(self._output_dir)
        file_path = os.path.join(self._output_dir, "prediction.pkl")
        with PathManager.open(file_path, "wb") as f:
            pickle.dump(all_prediction, f)

    self._logger.info('evaluation on {} expression instances'.format(
        len(image_unique_ids)))
    results = OrderedDict({
        "MeanIoU": MIoU,
        "OverIoU": OverIoU,
        "MeanIoU_bg": MIoU_bg
    })
    return results

def after_step(self):
    data = next(self._loader)
    with torch.no_grad():
        loss_dict = self.trainer.model(data)

        losses = sum(loss_dict.values())
        assert torch.isfinite(losses).all(), loss_dict

        loss_dict_reduced = {
            "val_" + k: v.item()
            for k, v in comm.reduce_dict(loss_dict).items()
        }
        losses_reduced = sum(loss for loss in loss_dict_reduced.values())
        if comm.is_main_process():
            self.trainer.storage.put_scalars(total_val_loss=losses_reduced,
                                             **loss_dict_reduced)

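# --- Added usage sketch (assumption, not from the original source) ---
# The after_step() hook above expects self._loader to yield batches and self.trainer to be
# set when the hook is registered. Assuming it lives in a hook class named ValLossHook
# whose __init__ wraps the loader in an iterator stored as self._loader, it could be
# attached like this so the validation losses land in the same EventStorage as the
# training losses.
def add_val_loss_hook(trainer, cfg):
    # A mapper with is_train=True keeps ground-truth annotations, so the model returns losses.
    val_loader = build_detection_test_loader(
        cfg, cfg.DATASETS.TEST[0], mapper=DatasetMapper(cfg, is_train=True)
    )
    trainer.register_hooks([ValLossHook(val_loader)])
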
def do_evaluate(cfg, model):
    """
    Evaluate on the test set using the COCO evaluator.
    """
    results = OrderedDict()
    dataset_name = cfg.DATASETS.TEST[1]
    data_loader = build_detection_test_loader(cfg, dataset_name)
    evaluator = COCOEvaluator(dataset_name, cfg, False, output_dir=cfg.OUTPUT_DIR)
    results_i = inference_on_dataset(model, data_loader, evaluator)
    results[dataset_name] = results_i
    if comm.is_main_process():
        logger.info("Evaluation results for {} in csv format:".format(dataset_name))
        print_csv_format(results_i)
    if len(results) == 1:
        results = list(results.values())[0]
    return results

def main(args):
    cfg = setup(args)
    register_hair()  # this is some customized logic to register our hair datasets

    if args.eval_only:
        model = Trainer.build_model(cfg)
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume)
        res = Trainer.test(cfg, model)
        if comm.is_main_process():
            verify_results(cfg, res)
        return res

    trainer = Trainer(cfg)
    trainer.resume_or_load(resume=args.resume)
    return trainer.train()

def default_setup(cfg, args):
    """
    Perform some basic common setups at the beginning of a job, including:

    1. Set up the detectron2 logger
    2. Log basic information about environment, cmdline arguments, and config
    3. Backup the config to the output directory

    Args:
        cfg (CfgNode): the full config to be used
        args (argparse.NameSpace): the command line arguments to be logged
    """
    output_dir = cfg.OUTPUT_DIR
    if comm.is_main_process() and output_dir:
        PathManager.mkdirs(output_dir)

    rank = comm.get_rank()
    setup_logger(output_dir, distributed_rank=rank, name="fvcore")
    logger = setup_logger(output_dir, distributed_rank=rank)

    logger.info("Rank of current process: {}. World size: {}".format(
        rank, comm.get_world_size()))
    logger.info("Environment info:\n" + collect_env_info())

    logger.info("Command line arguments: " + str(args))
    if hasattr(args, "config_file") and args.config_file != "":
        logger.info("Contents of args.config_file={}:\n{}".format(
            args.config_file, PathManager.open(args.config_file, "r").read()))

    logger.info("Running with full config:\n{}".format(cfg))
    # if comm.is_main_process() and output_dir:
    #     # Note: some of our scripts may expect the existence of
    #     # config.yaml in output directory
    #     path = os.path.join(output_dir, "config.yaml")
    #     with PathManager.open(path, "w") as f:
    #         f.write(cfg.dump())
    #     logger.info("Full config saved to {}".format(os.path.abspath(path)))

    # make sure each worker has a different, yet deterministic seed if specified
    seed_all_rng(None if cfg.SEED < 0 else cfg.SEED + rank)

    # cudnn benchmark has large overhead. It shouldn't be used considering the small size of
    # typical validation set.
    if not (hasattr(args, "eval_only") and args.eval_only):
        torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK

    ## dl_lib codes
    base_config = cfg.__class__.__base__()

    return cfg, logger

def build_detection_train_loader(cfg):
    """Builds a data loader for the baseline trainer with support of training on the subset of labeled data only.

    Most of the code comes from `d2.data.build.build_detection_train_loader()`, see it for more details.
    """

    # CSD: check config is supported
    assert cfg.DATALOADER.SAMPLER_TRAIN == "TrainingSampler", "Unsupported training sampler: {}".format(
        cfg.DATALOADER.SAMPLER_TRAIN)

    # Original code
    dataset = get_detection_dataset_dicts(
        cfg.DATASETS.TRAIN,
        filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS,
        min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE
        if cfg.MODEL.KEYPOINT_ON else 0,
        proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN
        if cfg.MODEL.LOAD_PROPOSALS else None,
    )

    # CSD: subsample the dataset if needed
    dataset = check_subsample_dataset(dataset, cfg)

    if comm.is_main_process():  # Log counts
        logger = setup_logger(name=__name__)
        logger.debug("Number of images in the dataset: {}".format(len(dataset)))
        _log_api_usage("dataset." + cfg.DATASETS.TRAIN[0])

    # Original code
    mapper = DatasetMapper(cfg, True)
    sampler = TrainingSampler(len(dataset))

    dataset = DatasetFromList(dataset, copy=False)
    dataset = MapDataset(dataset, mapper)
    sampler = TrainingSampler(len(dataset))
    assert isinstance(sampler, torch.utils.data.sampler.Sampler)

    return build_batch_data_loader(
        dataset,
        sampler,
        cfg.SOLVER.IMS_PER_BATCH,
        aspect_ratio_grouping=cfg.DATALOADER.ASPECT_RATIO_GROUPING,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
    )

def main(args):
    cfg = setup(args)

    if args.eval_only:
        model = Trainer.build_model(cfg)
        AdetCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume)
        res = Trainer.test(cfg, model)  # d2 defaults.py
        if comm.is_main_process():
            verify_results(cfg, res)
        if cfg.TEST.AUG.ENABLED:
            res.update(Trainer.test_with_TTA(cfg, model))
        return res

    """
    If you'd like to do anything fancier than the standard training logic,
    consider writing your own training loop or subclassing the trainer.
    """
    trainer = Trainer(cfg)
    trainer.resume_or_load(resume=args.resume)
    return trainer.train()

def setup(args):
    """
    Create configs and perform basic setups.
    """
    cfg = get_cfg()
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)

    ##############################
    # NOTE: pop some unwanted configs in detectron2
    cfg.SOLVER.pop("STEPS", None)
    cfg.SOLVER.pop("MAX_ITER", None)
    # NOTE: get optimizer from string cfg dict
    if cfg.SOLVER.OPTIMIZER_CFG != "":
        optim_cfg = eval(cfg.SOLVER.OPTIMIZER_CFG)
        iprint("optimizer_cfg:", optim_cfg)
        cfg.SOLVER.OPTIMIZER_NAME = optim_cfg['type']
        cfg.SOLVER.BASE_LR = optim_cfg['lr']
        cfg.SOLVER.MOMENTUM = optim_cfg.get("momentum", 0.9)
        cfg.SOLVER.WEIGHT_DECAY = optim_cfg.get("weight_decay", 1e-4)

    if cfg.get("DEBUG", False):
        iprint("DEBUG")
        args.num_gpus = 1
        args.num_machines = 1
        cfg.DATALOADER.NUM_WORKERS = 0
        cfg.TRAIN.PRINT_FREQ = 1
    if cfg.TRAIN.get("VERBOSE", False):
        cfg.TRAIN.PRINT_FREQ = 1

    # register datasets
    dataset_names = cfg.DATASETS.TRAIN + cfg.DATASETS.TEST
    register_datasets(dataset_names)

    cfg.RESUME = args.resume
    ##########################################
    cfg.freeze()
    default_setup(cfg, args)
    setup_for_distributed(is_master=comm.is_main_process())

    rank = comm.get_rank()
    setup_my_logger(cfg.OUTPUT_DIR, distributed_rank=rank, name="adet")
    setup_my_logger(cfg.OUTPUT_DIR, distributed_rank=rank, name="core")
    return cfg

def _write_metrics(
    self,
    loss_dict: Dict[str, torch.Tensor],
    data_time: float,
    prefix: str = "",
):
    """
    Args:
        loss_dict (dict): dict of scalar losses
        data_time (float): time taken by the dataloader iteration
    """
    metrics_dict = {k: v.detach().cpu().item() for k, v in loss_dict.items()}
    metrics_dict["data_time"] = data_time

    # Gather metrics among all workers for logging
    # This assumes we do DDP-style training, which is currently the only
    # supported method in detectron2.
    all_metrics_dict = comm.gather(metrics_dict)

    if comm.is_main_process():
        storage = get_event_storage()

        # data_time among workers can have high variance. The actual latency
        # caused by data_time is the maximum among workers.
        data_time = np.max([x.pop("data_time") for x in all_metrics_dict])
        storage.put_scalar("data_time", data_time)

        # average the rest metrics
        metrics_dict = {
            k: np.mean([x[k] for x in all_metrics_dict])
            for k in all_metrics_dict[0].keys()
        }
        total_losses_reduced = sum(metrics_dict.values())
        if not np.isfinite(total_losses_reduced):
            raise FloatingPointError(
                f"Loss became infinite or NaN at iteration={self.iter}!\n"
                f"loss_dict = {metrics_dict}")

        storage.put_scalar("{}total_loss".format(prefix), total_losses_reduced)
        if len(metrics_dict) > 1:
            storage.put_scalars(**metrics_dict)