def main(args, config, build_model):
    config.merge_from_list(args.opts)
    cfg = default_setup(config, args)

    """
    If you'd like to do anything fancier than the standard training logic,
    consider writing your own training loop or subclassing the runner.
    """
    runner = runner_decrator(RUNNERS.get(cfg.TRAINER.NAME))(cfg, build_model)
    runner.resume_or_load(resume=args.resume)

    extra_hooks = []
    if args.clearml:
        from cvpods.engine.clearml import ClearMLHook
        if comm.is_main_process():
            extra_hooks.append(ClearMLHook())
    if cfg.TEST.AUG.ENABLED:
        extra_hooks.append(
            hooks.EvalHook(0, lambda: runner.test_with_TTA(cfg, runner.model)))
    if extra_hooks:
        runner.register_hooks(extra_hooks)

    logger.info("Running with full config:\n{}".format(cfg))
    base_config = cfg.__class__.__base__()
    logger.info("different config with base class:\n{}".format(
        cfg.diff(base_config)))

    runner.train()

    if comm.is_main_process() and cfg.MODEL.AS_PRETRAIN:
        # convert last ckpt to pretrain format
        convert_to_pretrained_model(
            input=os.path.join(cfg.OUTPUT_DIR, "model_final.pth"),
            save_path=os.path.join(
                cfg.OUTPUT_DIR, "model_final_pretrain_weight.pkl"))
def build_hooks(self):
    """
    Build a list of default hooks, including timing, evaluation,
    checkpointing, lr scheduling, precise BN, writing events.

    Returns:
        list[HookBase]:
    """
    cfg = self.cfg
    # cfg.DATALOADER.NUM_WORKERS = 0  # save some memory and time for PreciseBN

    ret = [
        hooks.OptimizationHook(
            accumulate_grad_steps=cfg.SOLVER.BATCH_SUBDIVISIONS,
            grad_clipper=None,
            mixed_precision=cfg.TRAINER.FP16.ENABLED),
        hooks.LRScheduler(self.optimizer, self.scheduler),
        hooks.IterationTimer(),
        hooks.PreciseBN(
            # Run at the same freq as (but before) evaluation.
            cfg.TEST.EVAL_PERIOD,
            self.model,
            # Build a new data loader to not affect training
            self.build_train_loader(cfg),
            cfg.TEST.PRECISE_BN.NUM_ITER,
        )
        if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model)
        else None,
    ]

    # Do PreciseBN before the checkpointer, because it updates the model and
    # needs to be saved by the checkpointer.
    # This is not always the best: if checkpointing has a different frequency,
    # some checkpoints may have more precise statistics than others.
    if comm.is_main_process():
        ret.append(
            hooks.PeriodicCheckpointer(self.checkpointer,
                                       cfg.SOLVER.CHECKPOINT_PERIOD,
                                       max_iter=self.max_iter,
                                       max_epoch=self.max_epoch))

    def test_and_save_results():
        self._last_eval_results = self.test(self.cfg, self.model)
        return self._last_eval_results

    # Do evaluation after the checkpointer, because then if it fails,
    # we can use the saved checkpoint to debug.
    ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))

    if comm.is_main_process():
        # Here the default print/log frequency of each writer is used.
        # Run writers at the end, so that evaluation metrics are written.
        ret.append(
            hooks.PeriodicWriter(self.build_writers(),
                                 period=self.cfg.GLOBAL.LOG_INTERVAL))
        # Put `PeriodicDumpLog` after the writers so that it can dump all the
        # files, including those generated by the writers.
    return ret
def stage_main(args, cfg, build):
    cfg.merge_from_list(args.opts)
    cfg, logger = default_setup(cfg, args)
    model_build_func = build

    """
    If you'd like to do anything fancier than the standard training logic,
    consider writing your own training loop or subclassing the trainer.
    """
    trainer = Trainer(cfg, model_build_func)
    trainer.resume_or_load(resume=args.resume)

    if args.eval_only:
        DefaultCheckpointer(trainer.model,
                            save_dir=cfg.OUTPUT_DIR,
                            resume=args.resume).resume_or_load(
                                cfg.MODEL.WEIGHTS, resume=args.resume)
        res = Trainer.test(cfg, trainer.model)
        if comm.is_main_process():
            verify_results(cfg, res)
        if cfg.TEST.AUG.ENABLED:
            res.update(Trainer.test_with_TTA(cfg, trainer.model))
        return res

    # Check whether the workspace has enough storage space;
    # assume that a single dumped model is about 700 MB.
    file_sys = os.statvfs(cfg.OUTPUT_DIR)
    free_space_Gb = (file_sys.f_bfree * file_sys.f_frsize) / 2**30
    eval_space_Gb = (cfg.SOLVER.LR_SCHEDULER.MAX_ITER //
                     cfg.SOLVER.CHECKPOINT_PERIOD) * 700 / 2**10
    if eval_space_Gb > free_space_Gb:
        logger.warning(f"{Fore.RED}Remaining space({free_space_Gb}GB) "
                       f"is less than ({eval_space_Gb}GB){Style.RESET_ALL}")

    if cfg.TEST.AUG.ENABLED:
        trainer.register_hooks([
            hooks.EvalHook(0,
                           lambda: trainer.test_with_TTA(cfg, trainer.model))
        ])

    trainer.train()

    if comm.is_main_process() and cfg.MODEL.AS_PRETRAIN:
        # convert last ckpt to pretrain format
        convert_to_pretrained_model(
            input=os.path.join(cfg.OUTPUT_DIR, "model_final.pth"),
            save_path=os.path.join(
                cfg.OUTPUT_DIR, "model_final_pretrain_weight.pkl"))
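# Worked example of the storage estimate above (illustrative numbers only, not from
# the original code): with MAX_ITER = 90000 and CHECKPOINT_PERIOD = 5000, the run
# writes 90000 // 5000 = 18 checkpoints, so roughly 18 * 700 MB / 1024 ≈ 12.3 GB of
# free space is required before training starts.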
def build_hooks(self):
    """
    Build a list of default hooks, including timing, evaluation,
    checkpointing, lr scheduling, precise BN, writing events.

    Returns:
        list[HookBase]:
    """
    cfg = self.cfg
    cfg.DATALOADER.NUM_WORKERS = 0  # save some memory and time for PreciseBN

    ret = [
        hooks.IterationTimer(),
        hooks.LRScheduler(self.optimizer, self.scheduler),
        hooks.PreciseBN(
            # Run at the same freq as (but before) evaluation.
            cfg.TEST.EVAL_PERIOD,
            self.model,
            # Build a new data loader to not affect training
            self.build_train_loader(cfg),
            cfg.TEST.PRECISE_BN.NUM_ITER,
        )
        if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model)
        else None,
    ]

    # Do PreciseBN before the checkpointer, because it updates the model and
    # needs to be saved by the checkpointer.
    # This is not always the best: if checkpointing has a different frequency,
    # some checkpoints may have more precise statistics than others.
    if comm.is_main_process():
        ret.append(
            hooks.PeriodicCheckpointer(self.checkpointer,
                                       cfg.SOLVER.CHECKPOINT_PERIOD))

    def test_and_save_results():
        self._last_eval_results = self.test(self.cfg, self.model)
        return self._last_eval_results

    # Do evaluation after the checkpointer, because then if it fails,
    # we can use the saved checkpoint to debug.
    ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))

    if comm.is_main_process():
        # Run writers at the end, so that evaluation metrics are written.
        ret.append(
            hooks.PeriodicWriter(self.build_writers(),
                                 period=self.cfg.GLOBAL.LOG_INTERVAL))
    return ret
def _write_metrics(self, metrics_dict: dict):
    """
    Args:
        metrics_dict (dict): dict of scalar metrics
    """
    metrics_dict = {
        k: v.detach().cpu().item() if isinstance(v, torch.Tensor) else float(v)
        for k, v in metrics_dict.items()
    }
    # gather metrics among all workers for logging
    # This assumes we do DDP-style training, which is currently the only
    # supported method in cvpods.
    all_metrics_dict = comm.gather(metrics_dict)

    if comm.is_main_process():
        if "data_time" in all_metrics_dict[0]:
            # data_time among workers can have high variance. The actual latency
            # caused by data_time is the maximum among workers.
            data_time = np.max(
                [x.pop("data_time") for x in all_metrics_dict])
            self.storage.put_scalar("data_time", data_time)

        # average the remaining metrics
        metrics_dict = {
            k: np.mean([x[k] for x in all_metrics_dict])
            for k in all_metrics_dict[0].keys()
        }

        total_losses_reduced = sum(loss for key, loss in metrics_dict.items()
                                   if "loss" in key)

        self.storage.put_scalar("total_loss", total_losses_reduced)
        if len(metrics_dict) > 1:
            self.storage.put_scalars(**metrics_dict)
def __init__(self, model, save_dir="", resume=False, *, save_to_disk=None,
             **checkpointables):
    """
    Args:
        model (nn.Module): model.
        save_dir (str): a directory to save and find checkpoints.
        resume (bool): indicate whether to resume from latest checkpoint
            or start from scratch.
        save_to_disk (bool): if True, save checkpoint to disk, otherwise
            disable saving for this checkpointer.
        checkpointables (object): any checkpointable objects, i.e., objects
            that have the `state_dict()` and `load_state_dict()` method. For
            example, it can be used like
            `Checkpointer(model, "dir", optimizer=optimizer)`.
    """
    is_main_process = comm.is_main_process()
    super().__init__(
        model,
        save_dir,
        resume,
        save_to_disk=is_main_process if save_to_disk is None else save_to_disk,
        **checkpointables,
    )
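# Hedged usage sketch (not part of the original code): assuming the __init__ above
# belongs to the DefaultCheckpointer used elsewhere in this file, extra checkpointables
# such as the optimizer and scheduler are passed by keyword and saved under those names.
# `model`, `cfg`, `optimizer`, and `scheduler` are placeholder objects here.
checkpointer = DefaultCheckpointer(
    model,
    save_dir=cfg.OUTPUT_DIR,
    optimizer=optimizer,   # any object with state_dict()/load_state_dict()
    scheduler=scheduler,
)
# save_to_disk defaults to comm.is_main_process(), so only rank 0 writes files;
# every rank can still call resume_or_load() to restore weights.
checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=False)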
def evaluate(self):
    if self._distributed:
        comm.synchronize()
        self._predictions = comm.gather(self._predictions, dst=0)
        self._predictions = list(itertools.chain(*self._predictions))
        if not comm.is_main_process():
            return {}

    if len(self._predictions) == 0:
        logger.warning(
            "[COCOEvaluator] Did not receive valid predictions.")
        return {}

    if self._output_dir:
        ensure_dir(self._output_dir)
        file_path = os.path.join(self._output_dir,
                                 "instances_predictions.pth")
        with megfile.smart_open(file_path, "wb") as f:
            torch.save(self._predictions, f)

    self._results = OrderedDict()
    if "instances" in self._predictions[0]:
        self._eval_predictions(set(self._tasks))

    if self._dump:
        _dump_to_markdown(self._dump_infos)

    # Copy so the caller can do whatever with results
    return copy.deepcopy(self._results)
def main(args):
    config.merge_from_list(args.opts)
    cfg, logger = default_setup(config, args)
    if args.debug:
        batches = int(cfg.SOLVER.IMS_PER_DEVICE * args.num_gpus)
        if cfg.SOLVER.IMS_PER_BATCH != batches:
            cfg.SOLVER.IMS_PER_BATCH = batches
            logger.warning(
                "SOLVER.IMS_PER_BATCH is changed to {}".format(batches))

    valid_files = get_valid_files(args, cfg, logger)
    # * means all if need specific format then *.csv
    for current_file in valid_files:
        cfg.MODEL.WEIGHTS = current_file
        model = build_model(cfg)

        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume)
        if cfg.TEST.AUG.ENABLED:
            res = Trainer.test_with_TTA(cfg, model)
        else:
            res = Trainer.test(cfg, model)

        if comm.is_main_process():
            verify_results(cfg, res)
def evaluate_files(self):
    """
    Evaluate dumped prediction files directly, without running inference.
    """
    if self._distributed:
        comm.synchronize()
        if not comm.is_main_process():
            return

    del self._predictions
    if self._output_dir:
        file_path = os.path.join(self._output_dir,
                                 "instances_predictions.pth")
        self._predictions = torch.load(file_path)
        logger.info("Read predictions from {}".format(file_path))
    else:
        logger.warning(
            "No stored predictions were found; run inference_on_dataset first."
        )
        raise NotImplementedError

    self._results = OrderedDict()
    if "proposals" in self._predictions[0]:
        self._eval_box_proposals()
    if "instances" in self._predictions[0]:
        self._eval_predictions(set(self._tasks))

    if self._dump:
        _dump_to_markdown(self._dump_infos)

    # Copy so the caller can do whatever with results
    return copy.deepcopy(self._results)
def evaluate(self):
    if self._distributed:
        comm.synchronize()
        self._predictions = comm.gather(self._predictions, dst=0)
        self._predictions = list(itertools.chain(*self._predictions))
        self._targets = comm.gather(self._targets, dst=0)
        self._targets = list(itertools.chain(*self._targets))

        if not comm.is_main_process():
            return {}

    if len(self._predictions) == 0:
        self._logger.warning(
            "[ClassificationEvaluator] Did not receive valid predictions.")
        return {}

    if self._output_dir:
        PathManager.mkdirs(self._output_dir)
        file_path = os.path.join(self._output_dir,
                                 "instances_predictions.pth")
        with PathManager.open(file_path, "wb") as f:
            torch.save(self._predictions, f)

    self._results = OrderedDict()
    assert len(self._predictions) == len(self._targets)
    if self._predictions[0] is not None:
        self._eval_classification_accuracy()

    if self._dump:
        _dump_to_markdown(self._dump_infos)

    # Copy so the caller can do whatever with results
    return copy.deepcopy(self._results)
def evaluate(self):
    """
    Returns:
        dict: has a key "bbox", whose value is a dict of "AP", "AP50", and "AP75".
    """
    all_predictions = comm.gather(self._predictions, dst=0)
    if not comm.is_main_process():
        return
    predictions = defaultdict(list)
    for predictions_per_rank in all_predictions:
        for clsid, lines in predictions_per_rank.items():
            predictions[clsid].extend(lines)
    del all_predictions

    self._logger.info(
        "Evaluating {} using {} metric. "
        "Note that results do not use the official Matlab API.".format(
            self._dataset_name, 2007 if self._is_2007 else 2012))

    with tempfile.TemporaryDirectory(prefix="pascal_voc_eval_") as dirname:
        res_file_template = os.path.join(dirname, "{}.txt")

        aps = defaultdict(list)  # iou -> ap per class
        for cls_id, cls_name in enumerate(self._class_names):
            lines = predictions.get(cls_id, [""])

            with open(res_file_template.format(cls_name), "w") as f:
                f.write("\n".join(lines))

            for thresh in range(50, 100, 5):
                rec, prec, ap = voc_eval(
                    res_file_template,
                    self._anno_file_template,
                    self._image_set_path,
                    cls_name,
                    ovthresh=thresh / 100.0,
                    use_07_metric=self._is_2007,
                )
                aps[thresh].append(ap * 100)

    ret = OrderedDict()
    mAP = {iou: np.mean(x) for iou, x in aps.items()}
    ret["bbox"] = {
        "AP": np.mean(list(mAP.values())),
        "AP50": mAP[50],
        "AP75": mAP[75]
    }

    small_table = create_small_table(ret["bbox"])
    self._logger.info("Evaluation results for bbox: \n" + small_table)
    if self._dump:
        dump_info_one_task = {
            "task": "bbox",
            "tables": [small_table],
        }
        _dump_to_markdown([dump_info_one_task])
    return ret
def test(cls, cfg, model, evaluators=None, output_folder=None):
    """
    Args:
        cfg (config dict):
        model (nn.Module):
        evaluators (list[DatasetEvaluator] or None): if None, will call
            :meth:`build_evaluator`. Otherwise, must have the same length as
            ``cfg.DATASETS.TEST``.

    Returns:
        dict: a dict of result metrics
    """
    if isinstance(evaluators, DatasetEvaluator):
        evaluators = [evaluators]
    if evaluators is not None:
        assert len(
            cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format(
                len(cfg.DATASETS.TEST), len(evaluators))

    results = OrderedDict()
    for idx, dataset_name in enumerate(cfg.DATASETS.TEST):
        data_loader = cls.build_test_loader(cfg)
        # When evaluators are passed in as arguments,
        # implicitly assume that evaluators can be created before data_loader.
        if evaluators is not None:
            evaluator = evaluators[idx]
        else:
            try:
                evaluator = cls.build_evaluator(
                    cfg, dataset_name, data_loader.dataset,
                    output_folder=output_folder)
            except NotImplementedError:
                logger.warning(
                    "No evaluator found. Use `DefaultRunner.test(evaluators=)`, "
                    "or implement its `build_evaluator` method.")
                results[dataset_name] = {}
                continue

        if cfg.TEST.ON_FILES:
            results_i = inference_on_files(evaluator)
        else:
            results_i = inference_on_dataset(model, data_loader, evaluator)
        results[dataset_name] = results_i
        if comm.is_main_process():
            assert isinstance(
                results_i, dict
            ), "Evaluator must return a dict on the main process. Got {} instead.".format(
                results_i)
            logger.info("Evaluation results for {} in csv format:".format(
                dataset_name))
            print_csv_format(results_i)

    if len(results) == 1:
        results = list(results.values())[0]
    return results
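# Hedged usage sketch (not original code; cfg, model, and the output path are
# placeholders): pass one evaluator per entry in cfg.DATASETS.TEST, or leave
# evaluators=None so test() falls back to cls.build_evaluator() for each dataset.
results = Trainer.test(
    cfg,
    model,
    evaluators=None,
    output_folder=os.path.join(cfg.OUTPUT_DIR, "inference"),
)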
def evaluate(self):
    comm.synchronize()

    self._predictions = comm.gather(self._predictions)
    self._predictions = list(itertools.chain(*self._predictions))
    if not comm.is_main_process():
        return

    # gt_json = PathManager.get_local_path(self._metadata.panoptic_json)
    gt_json = self._metadata.panoptic_json
    gt_folder = self._metadata.panoptic_root

    with tempfile.TemporaryDirectory(prefix="panoptic_eval") as pred_dir:
        logger.info(
            "Writing all panoptic predictions to {} ...".format(pred_dir))
        for p in self._predictions:
            with open(os.path.join(pred_dir, p["file_name"]), "wb") as f:
                f.write(p.pop("png_string"))

        with open(gt_json, "r") as f:
            json_data = json.load(f)
        json_data["annotations"] = self._predictions
        with megfile.smart_open(self._predictions_json, "w") as f:
            f.write(json.dumps(json_data))

        from panopticapi.evaluation import pq_compute
        with contextlib.redirect_stdout(io.StringIO()):
            pq_res = pq_compute(
                gt_json,
                self._predictions_json,
                gt_folder=gt_folder,
                pred_folder=pred_dir,
            )

    res = {}
    res["PQ"] = 100 * pq_res["All"]["pq"]
    res["SQ"] = 100 * pq_res["All"]["sq"]
    res["RQ"] = 100 * pq_res["All"]["rq"]
    res["PQ_th"] = 100 * pq_res["Things"]["pq"]
    res["SQ_th"] = 100 * pq_res["Things"]["sq"]
    res["RQ_th"] = 100 * pq_res["Things"]["rq"]
    res["PQ_st"] = 100 * pq_res["Stuff"]["pq"]
    res["SQ_st"] = 100 * pq_res["Stuff"]["sq"]
    res["RQ_st"] = 100 * pq_res["Stuff"]["rq"]

    results = OrderedDict({"panoptic_seg": res})
    table = _print_panoptic_results(pq_res)

    if self._dump:
        dump_info_one_task = {
            "task": "panoptic_seg",
            "tables": [table],
        }
        _dump_to_markdown([dump_info_one_task])

    return results
def main(args):
    config.merge_from_list(args.opts)
    cfg, logger = default_setup(config, args)
    if args.debug:
        batches = int(cfg.SOLVER.IMS_PER_BATCH / 8 * args.num_gpus)
        if cfg.SOLVER.IMS_PER_BATCH != batches:
            cfg.SOLVER.IMS_PER_BATCH = batches
            logger.warning(
                "SOLVER.IMS_PER_BATCH is changed to {}".format(batches))

    if "MODEL.WEIGHTS" in args.opts:
        if cfg.MODEL.WEIGHTS.endswith(".pth") and not PathManager.exists(
                cfg.MODEL.WEIGHTS):
            ckpt_name = cfg.MODEL.WEIGHTS.split("/")[-1]
            model_prefix = cfg.OUTPUT_DIR.split("cvpods_playground")[1][1:]
            remote_file_path = os.path.join(cfg.OSS.DUMP_PREFIX,
                                            model_prefix, ckpt_name)
            logger.warning(
                f"The specified ckpt file ({cfg.MODEL.WEIGHTS}) was not found locally,"
                f" trying to load the corresponding dump file on OSS ({remote_file_path})."
            )
            cfg.MODEL.WEIGHTS = remote_file_path
        valid_files = [cfg.MODEL.WEIGHTS]
    else:
        list_of_files = glob.glob(os.path.join(cfg.OUTPUT_DIR, '*.pth'))
        assert list_of_files, "No checkpoint file found in {}.".format(
            cfg.OUTPUT_DIR)
        list_of_files.sort(key=os.path.getctime)
        latest_file = list_of_files[-1]
        if not args.end_iter:
            valid_files = [latest_file]
        else:
            files = [f for f in list_of_files if str(f) <= str(latest_file)]
            valid_files = []
            for f in files:
                try:
                    model_iter = int(re.split(r'(model_|\.pth)', f)[-3])
                except Exception:
                    logger.warning("remove {}".format(f))
                    continue
                if args.start_iter <= model_iter <= args.end_iter:
                    valid_files.append(f)
            assert valid_files, "No .pth files satisfy your requirement"

    # * means all if need specific format then *.csv
    for current_file in valid_files:
        cfg.MODEL.WEIGHTS = current_file
        model = build_model(cfg)

        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume)
        res = Trainer.test(cfg, model)

        if comm.is_main_process():
            verify_results(cfg, res)

        if cfg.TEST.AUG.ENABLED:
            res.update(Trainer.test_with_TTA(cfg, model))
def default_setup(cfg, args):
    """
    Perform some basic common setups at the beginning of a job, including:

    1. Set up the cvpods logger
    2. Log basic information about environment, cmdline arguments, and config
    3. Backup the config to the output directory

    Args:
        cfg (BaseConfig): the full config to be used
        args (argparse.NameSpace): the command line arguments to be logged
    """
    output_dir = cfg.OUTPUT_DIR
    if comm.is_main_process() and output_dir:
        PathManager.mkdirs(output_dir)

    rank = comm.get_rank()
    # setup_logger(output_dir, distributed_rank=rank, name="cvpods")
    logger = setup_logger(output_dir, distributed_rank=rank)

    logger.info("Rank of current process: {}. World size: {}".format(
        rank, comm.get_world_size()))
    logger.info("Environment info:\n" + collect_env_info())
    logger.info("Command line arguments: " + str(args))

    if hasattr(args, "config_file") and args.config_file != "":
        logger.info("Contents of args.config_file={}:\n{}".format(
            args.config_file,
            PathManager.open(args.config_file, "r").read()))

    adjust_config(cfg)
    logger.info("Running with full config:\n{}".format(cfg))
    base_config = cfg.__class__.__base__()
    logger.info("different config with base class:\n{}".format(
        cfg.diff(base_config)))

    # if comm.is_main_process() and output_dir:
    #     # Note: some of our scripts may expect the existence of
    #     # config.yaml in output directory
    #     path = os.path.join(output_dir, "config.yaml")
    #     with PathManager.open(path, "w") as f:
    #         f.write(cfg.dump())
    #     logger.info("Full config saved to {}".format(os.path.abspath(path)))

    # make sure each worker has a different, yet deterministic seed if specified
    seed = seed_all_rng(None if cfg.SEED < 0 else cfg.SEED + rank)
    # save seed to config for dump
    cfg.SEED = seed

    # cudnn benchmark has large overhead. It shouldn't be used considering the
    # small size of typical validation set.
    if not (hasattr(args, "eval_only") and args.eval_only):
        torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK

    return cfg, logger
def evaluate(self):
    results = OrderedDict()
    for evaluator in self._evaluators:
        result = evaluator.evaluate()
        if comm.is_main_process() and result is not None:
            for k, v in result.items():
                assert (
                    k not in results
                ), "Different evaluators produce results with the same key {}".format(k)
                results[k] = v
    return results
def train(self):
    """
    Run training.

    Returns:
        OrderedDict of results, if evaluation is enabled. Otherwise None.
    """
    super().train(self.start_iter, self.max_iter, self.max_epoch)
    if len(self.cfg.TEST.EXPECTED_RESULTS) and comm.is_main_process():
        assert hasattr(self, "_last_eval_results"
                       ), "No evaluation results obtained during training!"
        verify_results(self.cfg, self._last_eval_results)
        return self._last_eval_results
def do_test(cfg, model):
    results = OrderedDict()
    for dataset_name in cfg.DATASETS.TEST:
        data_loader = build_test_loader(cfg, dataset_name)
        evaluator = get_evaluator(
            cfg, dataset_name,
            os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)
        )
        results_i = inference_on_dataset(model, data_loader, evaluator)
        results[dataset_name] = results_i
        if comm.is_main_process():
            logger.info("Evaluation results for {} in csv format:".format(dataset_name))
            print_csv_format(results_i)
    if len(results) == 1:
        results = list(results.values())[0]
    return results
def _write_metrics(
    self,
    loss_dict: Dict[str, torch.Tensor],
    data_time: float,
    prefix: str = "",
):
    """
    Args:
        loss_dict (dict): dict of scalar losses
        data_time (float): time taken by the dataloader iteration
    """
    device = next(iter(loss_dict.values())).device

    # Use a new stream so these ops don't wait for DDP or backward
    with torch.cuda.stream(torch.cuda.Stream() if device.type == "cuda" else None):
        metrics_dict = {
            k: v.detach().cpu().item() for k, v in loss_dict.items()
        }
        metrics_dict["data_time"] = data_time

        # Gather metrics among all workers for logging
        # This assumes we do DDP-style training, which is currently the only
        # supported method in cvpods.
        all_metrics_dict = comm.gather(metrics_dict)

        if comm.is_main_process():
            storage = get_event_storage()

            # data_time among workers can have high variance. The actual latency
            # caused by data_time is the maximum among workers.
            data_time = np.max([x.pop("data_time") for x in all_metrics_dict])
            storage.put_scalar("data_time", data_time)

            # average the remaining metrics
            metrics_dict = {
                k: np.mean([x[k] for x in all_metrics_dict])
                for k in all_metrics_dict[0].keys()
            }
            total_losses_reduced = sum(loss for key, loss in metrics_dict.items()
                                       if "loss" in key)

            storage.put_scalar("{}total_loss".format(prefix),
                               total_losses_reduced)
            if len(metrics_dict) > 1:
                storage.put_scalars(**metrics_dict)
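# Hedged sketch (not original code) of how a run_step() might drive _write_metrics();
# the attributes self._data_loader_iter, self.model, and self.optimizer are assumed to
# exist on the runner class this method belongs to.
import time

def run_step(self):
    start = time.perf_counter()
    data = next(self._data_loader_iter)
    data_time = time.perf_counter() - start   # dataloader latency for this iteration

    loss_dict = self.model(data)              # dict of scalar loss tensors
    losses = sum(loss_dict.values())

    self.optimizer.zero_grad()
    losses.backward()
    self._write_metrics(loss_dict, data_time) # log per-worker losses and data_time
    self.optimizer.step()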
def default_setup(cfg, args):
    """
    Perform some basic common setups at the beginning of a job, including:

    1. Set up the cvpods logger
    2. Log basic information about environment, cmdline arguments, and config
    3. Backup the config to the output directory

    Args:
        cfg (BaseConfig): the full config to be used
        args (argparse.NameSpace): the command line arguments to be logged
    """
    output_dir = cfg.OUTPUT_DIR
    if comm.is_main_process() and output_dir:
        ensure_dir(output_dir)

    rank = comm.get_rank()
    # setup_logger(output_dir, distributed_rank=rank, name="cvpods")
    setup_logger(output_dir, distributed_rank=rank)

    logger.info("Rank of current process: {}. World size: {}".format(
        rank, comm.get_world_size()))
    logger.info("Environment info:\n" + collect_env_info())
    logger.info("Command line arguments: " + str(args))

    if hasattr(args, "config_file") and args.config_file != "":
        logger.info("Contents of args.config_file={}:\n{}".format(
            args.config_file,
            megfile.smart_open(args.config_file, "r").read()))

    adjust_config(cfg)

    # make sure each worker has a different, yet deterministic seed if specified
    seed = seed_all_rng(None if cfg.SEED < 0 else cfg.SEED + rank)
    # save seed to config for dump
    cfg.SEED = seed

    # cudnn benchmark has large overhead. It shouldn't be used considering the
    # small size of typical validation set.
    if not (hasattr(args, "eval_only") and args.eval_only):
        torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK

    return cfg
def train(self):
    """
    Run training.

    Returns:
        OrderedDict of results, if evaluation is enabled. Otherwise None.
    """
    if self.max_epoch is None:
        logger.info("Starting training from iteration {}".format(
            self.start_iter))
    else:
        logger.info("Starting training from epoch {}".format(
            self.start_epoch))

    super().train(self.start_iter, self.start_epoch, self.max_iter)

    if len(self.cfg.TEST.EXPECTED_RESULTS) and comm.is_main_process():
        assert hasattr(self, "_last_eval_results"
                       ), "No evaluation results obtained during training!"
        verify_results(self.cfg, self._last_eval_results)
        return self._last_eval_results
def evaluate(self):
    if self._distributed:
        comm.synchronize()
        self._predictions = comm.gather(self._predictions, dst=0)
        self._predictions = list(itertools.chain(*self._predictions))
        if not comm.is_main_process():
            return {}

    if len(self._predictions) == 0:
        self._logger.warning(
            "[COCOEvaluator] Did not receive valid predictions.")
        return {}

    if self._output_dir:
        PathManager.mkdirs(self._output_dir)
        file_path = os.path.join(self._output_dir,
                                 "instances_predictions.pth")
        with PathManager.open(file_path, "wb") as f:
            torch.save(self._predictions, f)

    self._results = OrderedDict()
    if "proposals" in self._predictions[0]:
        self._eval_box_proposals()
    if "instances" in self._predictions[0]:
        self._eval_predictions(set(self._tasks))

    if self._dump:
        extra_infos = {
            "title": os.path.basename(os.getcwd()),
            "seed": self.cfg.SEED,
        }
        _dump_to_markdown(extra_infos, self._dump_infos)

    # Copy so the caller can do whatever with results
    return copy.deepcopy(self._results)
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DefaultCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg.SOLVER.MAX_ITER

    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    writers = (
        [
            CommonMetricPrinter(max_iter),
            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            TensorboardXWriter(cfg.OUTPUT_DIR),
        ]
        if comm.is_main_process()
        else []
    )

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    data_loader = build_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            if (
                cfg.TEST.EVAL_PERIOD > 0
                and iteration % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter
            ):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
def evaluate(self):
    """
    Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval):

    * Mean intersection-over-union averaged across classes (mIoU)
    * Frequency Weighted IoU (fwIoU)
    * Mean pixel accuracy averaged across classes (mACC)
    * Pixel Accuracy (pACC)
    """
    if self._distributed:
        comm.synchronize()

        conf_matrix_list = comm.all_gather(self._conf_matrix)
        self._predictions = comm.all_gather(self._predictions)
        self._predictions = list(itertools.chain(*self._predictions))
        if not comm.is_main_process():
            return

        self._conf_matrix = np.zeros_like(self._conf_matrix)
        for conf_matrix in conf_matrix_list:
            self._conf_matrix += conf_matrix

    if self._output_dir:
        PathManager.mkdirs(self._output_dir)
        file_path = os.path.join(self._output_dir,
                                 "sem_seg_predictions.json")
        with PathManager.open(file_path, "w") as f:
            f.write(json.dumps(self._predictions))

    acc = np.zeros(self._num_classes, dtype=float)
    iou = np.zeros(self._num_classes, dtype=float)
    tp = self._conf_matrix.diagonal()[:-1].astype(float)
    pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(float)
    class_weights = pos_gt / np.sum(pos_gt)
    pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(float)
    acc_valid = pos_gt > 0
    acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid]
    iou_valid = (pos_gt + pos_pred) > 0
    union = pos_gt + pos_pred - tp
    iou[acc_valid] = tp[acc_valid] / union[acc_valid]
    macc = np.sum(acc) / np.sum(acc_valid)
    miou = np.sum(iou) / np.sum(iou_valid)
    fiou = np.sum(iou * class_weights)
    pacc = np.sum(tp) / np.sum(pos_gt)

    res = {}
    res["mIoU"] = 100 * miou
    res["fwIoU"] = 100 * fiou
    res["mACC"] = 100 * macc
    res["pACC"] = 100 * pacc

    if self._output_dir:
        file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth")
        with PathManager.open(file_path, "wb") as f:
            torch.save(res, f)
    results = OrderedDict({"sem_seg": res})
    small_table = create_small_table(res)
    self._logger.info("Evaluation results for sem_seg: \n" + small_table)
    if self._dump:
        dump_info_one_task = {
            "task": "sem_seg",
            "tables": [small_table],
        }
        _dump_to_markdown([dump_info_one_task])
    return results
def preprocess_image(self, batched_inputs, training):
    """
    Normalize, pad and batch the input images.
    """
    images = [x["image"].to(self.device) for x in batched_inputs]
    bs = len(images)
    images = [self.normalizer(x) for x in images]

    images = ImageList.from_tensors(images,
                                    size_divisibility=0,
                                    pad_ref_long=True)

    # sync image size for all gpus
    comm.synchronize()
    if training and self.iter % self.change_iter == 0:
        if self.iter < self.max_iter - 20000:
            meg = torch.LongTensor(1).to(self.device)
            comm.synchronize()
            if comm.is_main_process():
                size = np.random.choice(self.multi_size)
                meg.fill_(size)

            if comm.get_world_size() > 1:
                comm.synchronize()
                dist.broadcast(meg, 0)
            self.size = meg.item()
            comm.synchronize()
        else:
            self.size = 608

    if training:
        # resize image inputs
        modes = ['bilinear', 'nearest', 'bicubic', 'area']
        mode = modes[random.randrange(4)]
        if mode == 'bilinear' or mode == 'bicubic':
            images.tensor = F.interpolate(images.tensor,
                                          size=[self.size, self.size],
                                          mode=mode,
                                          align_corners=False)
        else:
            images.tensor = F.interpolate(images.tensor,
                                          size=[self.size, self.size],
                                          mode=mode)

        if "instances" in batched_inputs[0]:
            gt_instances = [
                x["instances"].to(self.device) for x in batched_inputs
            ]
        elif "targets" in batched_inputs[0]:
            log_first_n(
                logging.WARN,
                "'targets' in the model inputs is now renamed to 'instances'!",
                n=10)
            gt_instances = [
                x["targets"].to(self.device) for x in batched_inputs
            ]
        else:
            gt_instances = None

        targets = [
            torch.cat([
                instance.gt_classes.float().unsqueeze(-1),
                instance.gt_boxes.tensor
            ], dim=-1) for instance in gt_instances
        ]
        labels = torch.zeros((bs, 100, 5))
        for i, target in enumerate(targets):
            labels[i][:target.shape[0]] = target
        labels[:, :, 1:] = labels[:, :, 1:] / 512. * self.size
    else:
        labels = None

    self.iter += 1
    return images, labels
def default_setup(cfg, args):
    """
    Perform some basic common setups at the beginning of a job, including:

    1. Set up the cvpods logger
    2. Log basic information about environment, cmdline arguments, and config
    3. Backup the config to the output directory

    Args:
        cfg (BaseConfig): the full config to be used
        args (argparse.NameSpace): the command line arguments to be logged
    """
    output_dir = cfg.OUTPUT_DIR
    if comm.is_main_process() and output_dir:
        PathManager.mkdirs(output_dir)

    rank = comm.get_rank()
    # setup_logger(output_dir, distributed_rank=rank, name="cvpods")
    logger = setup_logger(output_dir, distributed_rank=rank)

    logger.info("Rank of current process: {}. World size: {}".format(
        rank, comm.get_world_size()))
    logger.info("Environment info:\n" + collect_env_info())
    logger.info("Command line arguments: " + str(args))

    if hasattr(args, "config_file") and args.config_file != "":
        logger.info("Contents of args.config_file={}:\n{}".format(
            args.config_file,
            PathManager.open(args.config_file, "r").read()))

    logger.info("Running with full config:\n{}".format(cfg))
    base_config = cfg.__class__.__base__()
    logger.info("different config with base class:\n{}".format(
        cfg.show_diff(base_config)))

    # if comm.is_main_process() and output_dir:
    #     # Note: some of our scripts may expect the existence of
    #     # config.yaml in output directory
    #     path = os.path.join(output_dir, "config.yaml")
    #     with PathManager.open(path, "w") as f:
    #         f.write(cfg.dump())
    #     logger.info("Full config saved to {}".format(os.path.abspath(path)))

    # make sure each worker has a different, yet deterministic seed if specified
    seed_all_rng(None if cfg.SEED < 0 else cfg.SEED + rank)

    # cudnn benchmark has large overhead. It shouldn't be used considering the
    # small size of typical validation set.
    if not (hasattr(args, "eval_only") and args.eval_only):
        torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK

    # dynamically adjust batch size and schedule lengths according to the world size
    base_world_size = int(cfg.SOLVER.IMS_PER_BATCH / cfg.SOLVER.IMS_PER_DEVICE)
    world_size = comm.get_world_size()
    ratio = world_size / base_world_size

    cfg.SOLVER.IMS_PER_BATCH = int(ratio * cfg.SOLVER.IMS_PER_BATCH)
    cfg.SOLVER.LR_SCHEDULER.MAX_ITER = int(cfg.SOLVER.LR_SCHEDULER.MAX_ITER / ratio)
    # Divide by the scale ratio only when using iterations rather than epochs
    if cfg.SOLVER.LR_SCHEDULER.MAX_EPOCH is None:
        cfg.SOLVER.LR_SCHEDULER.STEPS = list(
            (int(step / ratio) for step in cfg.SOLVER.LR_SCHEDULER.STEPS))
    cfg.SOLVER.CHECKPOINT_PERIOD = int(cfg.SOLVER.CHECKPOINT_PERIOD / ratio)
    cfg.TEST.EVAL_PERIOD = int(cfg.TEST.EVAL_PERIOD / ratio)
    cfg.SOLVER.OPTIMIZER.BASE_LR = ratio * cfg.SOLVER.OPTIMIZER.BASE_LR

    assert cfg.SOLVER.IMS_PER_BATCH / cfg.SOLVER.IMS_PER_DEVICE == world_size

    return cfg, logger
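# Worked example of the linear scaling above (illustrative numbers only, not from the
# original code): a config with IMS_PER_BATCH=16 and IMS_PER_DEVICE=2 implies
# base_world_size = 8 GPUs. Launching the same config on world_size = 16 GPUs gives
# ratio = 2, so IMS_PER_BATCH becomes 32, BASE_LR is doubled, and MAX_ITER, STEPS,
# CHECKPOINT_PERIOD, and EVAL_PERIOD are all halved, keeping the total number of
# images seen during training roughly constant.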
def build_hooks(self):
    """
    Build a list of default hooks, including timing, evaluation,
    checkpointing, lr scheduling, precise BN, writing events.

    Returns:
        list[HookBase]:
    """
    cfg = self.cfg
    # cfg.DATALOADER.NUM_WORKERS = 0  # save some memory and time for PreciseBN

    ret = [
        hooks.OptimizationHook(
            accumulate_grad_steps=cfg.SOLVER.BATCH_SUBDIVISIONS,
            grad_clipper=None,
            mixed_precision=cfg.TRAINER.FP16.ENABLED),
        hooks.LRScheduler(self.optimizer, self.scheduler),
        hooks.IterationTimer(),
        hooks.PreciseBN(
            # Run at the same freq as (but before) evaluation.
            cfg.TEST.EVAL_PERIOD,
            self.model,
            # Build a new data loader to not affect training
            self.build_train_loader(cfg),
            cfg.TEST.PRECISE_BN.NUM_ITER,
        )
        if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model)
        else None,
    ]

    # Do PreciseBN before the checkpointer, because it updates the model and
    # needs to be saved by the checkpointer.
    # This is not always the best: if checkpointing has a different frequency,
    # some checkpoints may have more precise statistics than others.
    if comm.is_main_process():
        ret.append(
            hooks.PeriodicCheckpointer(self.checkpointer,
                                       cfg.SOLVER.CHECKPOINT_PERIOD,
                                       max_iter=self.max_iter,
                                       max_epoch=self.max_epoch))

    def test_and_save_results():
        self._last_eval_results = self.test(self.cfg, self.model)
        return self._last_eval_results

    def save_best_model():
        key = cfg.TEST.SORT_BY
        assert hasattr(
            self, '_last_eval_results'), "Must run after test_and_save_results()"
        max_value = 0.0 if self._max_eval_results is None else flatten_results_dict(
            self._max_eval_results)[key]
        cur_value = flatten_results_dict(self._last_eval_results)[key]
        if cur_value >= max_value:
            self._max_eval_results = self._last_eval_results
            # save the checkpoint of the best model so far
            self.checkpointer.save("model_best")
        return None

    # Do evaluation after the checkpointer, because then if it fails,
    # we can use the saved checkpoint to debug.
    ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results))

    if cfg.TEST.get('SORT_BY'):
        # keep the checkpoint with the best metric, which enables early stopping
        self._max_eval_results = None
        ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, save_best_model))

    if comm.is_main_process():
        # Here the default print/log frequency of each writer is used.
        # Run writers at the end, so that evaluation metrics are written.
        ret.append(
            hooks.PeriodicWriter(self.build_writers(),
                                 period=self.window_size))
        ret.append(
            hooks.PeriodicWriter(self.build_everystep_writers(), period=1))
    return ret