def do_train(args, cfg): """ Args: cfg: an object with the following attributes: model: instantiate to a module dataloader.{train,test}: instantiate to dataloaders dataloader.evaluator: instantiate to evaluator for test set optimizer: instantaite to an optimizer lr_multiplier: instantiate to a fvcore scheduler train: other misc config defined in `configs/common/train.py`, including: output_dir (str) init_checkpoint (str) amp.enabled (bool) max_iter (int) eval_period, log_period (int) device (str) checkpointer (dict) ddp (dict) """ model = instantiate(cfg.model) logger = logging.getLogger("detectron2") logger.info("Model:\n{}".format(model)) model.to(cfg.train.device) cfg.optimizer.params.model = model optim = instantiate(cfg.optimizer) train_loader = instantiate(cfg.dataloader.train) model = create_ddp_model(model, **cfg.train.ddp) trainer = (AMPTrainer if cfg.train.amp.enabled else SimpleTrainer)( model, train_loader, optim) checkpointer = DetectionCheckpointer( model, cfg.train.output_dir, optimizer=optim, trainer=trainer, ) trainer.register_hooks([ hooks.IterationTimer(), hooks.LRScheduler(scheduler=instantiate(cfg.lr_multiplier)), hooks.PeriodicCheckpointer(checkpointer, **cfg.train.checkpointer) if comm.is_main_process() else None, hooks.EvalHook(cfg.train.eval_period, lambda: do_test(cfg, model)), hooks.PeriodicWriter( default_writers(cfg.train.output_dir, cfg.train.max_iter), period=cfg.train.log_period, ) if comm.is_main_process() else None, ]) checkpointer.resume_or_load(cfg.train.init_checkpoint, resume=args.resume) if args.resume and checkpointer.has_checkpoint(): # The checkpoint stores the training iteration that just finished, thus we start # at the next iteration start_iter = trainer.iter + 1 else: start_iter = 0 trainer.train(start_iter, cfg.train.max_iter)
class DefaultTrainer(TrainerBase): """ A trainer with default training logic. It does the following: 1. Create a :class:`SimpleTrainer` using model, optimizer, dataloader defined by the given config. Create a LR scheduler defined by the config. 2. Load the last checkpoint or `cfg.MODEL.WEIGHTS`, if exists, when `resume_or_load` is called. 3. Register a few common hooks defined by the config. It is created to simplify the **standard model training workflow** and reduce code boilerplate for users who only need the standard training workflow, with standard features. It means this class makes *many assumptions* about your training logic that may easily become invalid in a new research. In fact, any assumptions beyond those made in the :class:`SimpleTrainer` are too much for research. The code of this class has been annotated about restrictive assumptions it makes. When they do not work for you, you're encouraged to: 1. Overwrite methods of this class, OR: 2. Use :class:`SimpleTrainer`, which only does minimal SGD training and nothing else. You can then add your own hooks if needed. OR: 3. Write your own training loop similar to `tools/plain_train_net.py`. See the :doc:`/tutorials/training` tutorials for more details. Note that the behavior of this class, like other functions/classes in this file, is not stable, since it is meant to represent the "common default behavior". It is only guaranteed to work well with the standard models and training workflow in detectron2. To obtain more stable behavior, write your own training logic with other public APIs. Examples: :: trainer = DefaultTrainer(cfg) trainer.resume_or_load() # load last checkpoint or MODEL.WEIGHTS trainer.train() Attributes: scheduler: checkpointer (DetectionCheckpointer): cfg (CfgNode): """ def __init__(self, cfg): """ Args: cfg (CfgNode): """ super().__init__() logger = logging.getLogger("detectron2") if not logger.isEnabledFor( logging.INFO): # setup_logger is not called for d2 setup_logger() cfg = DefaultTrainer.auto_scale_workers(cfg, comm.get_world_size()) # Assume these objects must be constructed in this order. model = self.build_model(cfg) optimizer = self.build_optimizer(cfg, model) data_loader = self.build_train_loader(cfg) # For training, wrap with DDP. But don't need this for inference. if comm.get_world_size() > 1: model = DistributedDataParallel(model, device_ids=[comm.get_local_rank()], broadcast_buffers=False) self._trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)(model, data_loader, optimizer) self.scheduler = self.build_lr_scheduler(cfg, optimizer) # Assume no other objects need to be checkpointed. # We can later make it checkpoint the stateful hooks self.checkpointer = DetectionCheckpointer( # Assume you want to save checkpoints together with logs/statistics model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=self.scheduler, ) self.start_iter = 0 self.max_iter = cfg.SOLVER.MAX_ITER self.cfg = cfg self.register_hooks(self.build_hooks()) def resume_or_load(self, resume=True): """ If `resume==True` and `cfg.OUTPUT_DIR` contains the last checkpoint (defined by a `last_checkpoint` file), resume from the file. Resuming means loading all available states (eg. optimizer and scheduler) and update iteration counter from the checkpoint. ``cfg.MODEL.WEIGHTS`` will not be used. Otherwise, this is considered as an independent training. The method will load model weights from the file `cfg.MODEL.WEIGHTS` (but will not load other states) and start from iteration 0. Args: resume (bool): whether to do resume or not """ checkpoint = self.checkpointer.resume_or_load(self.cfg.MODEL.WEIGHTS, resume=resume) if resume and self.checkpointer.has_checkpoint(): self.start_iter = checkpoint.get("iteration", -1) + 1 # The checkpoint stores the training iteration that just finished, thus we start # at the next iteration (or iter zero if there's no checkpoint). if isinstance(self.model, DistributedDataParallel): # broadcast loaded data/model from the first rank, because other # machines may not have access to the checkpoint file if TORCH_VERSION >= (1, 7): self.model._sync_params_and_buffers() self.start_iter = comm.all_gather(self.start_iter)[0] def build_hooks(self): """ Build a list of default hooks, including timing, evaluation, checkpointing, lr scheduling, precise BN, writing events. Returns: list[HookBase]: """ cfg = self.cfg.clone() cfg.defrost() cfg.DATALOADER.NUM_WORKERS = 0 # save some memory and time for PreciseBN ret = [ hooks.IterationTimer(), hooks.LRScheduler(), hooks.PreciseBN( # Run at the same freq as (but before) evaluation. cfg.TEST.EVAL_PERIOD, self.model, # Build a new data loader to not affect training self.build_train_loader(cfg), cfg.TEST.PRECISE_BN.NUM_ITER, ) if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model) else None, ] # Do PreciseBN before checkpointer, because it updates the model and need to # be saved by checkpointer. # This is not always the best: if checkpointing has a different frequency, # some checkpoints may have more precise statistics than others. if comm.is_main_process(): ret.append( hooks.PeriodicCheckpointer(self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD)) # Changes for COSMOS model: we don't need to save the predictions # def test_and_save_results(): # self._last_eval_results = self.test(self.cfg, self.model) # return self._last_eval_results # # # Do evaluation after checkpointer, because then if it fails, # # we can use the saved checkpoint to debug. # ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results)) if comm.is_main_process(): # Here the default print/log frequency of each writer is used. # run writers in the end, so that evaluation metrics are written ret.append(hooks.PeriodicWriter(self.build_writers(), period=20)) return ret def build_writers(self): """ Build a list of writers to be used using :func:`default_writers()`. If you'd like a different list of writers, you can overwrite it in your trainer. Returns: list[EventWriter]: a list of :class:`EventWriter` objects. """ return default_writers(self.cfg.OUTPUT_DIR, self.max_iter) def train(self): """ Run training. Returns: OrderedDict of results, if evaluation is enabled. Otherwise None. """ super().train(self.start_iter, self.max_iter) if len(self.cfg.TEST.EXPECTED_RESULTS) and comm.is_main_process(): assert hasattr(self, "_last_eval_results" ), "No evaluation results obtained during training!" verify_results(self.cfg, self._last_eval_results) return self._last_eval_results def run_step(self): self._trainer.iter = self.iter self._trainer.run_step() @classmethod def build_model(cls, cfg): """ Returns: torch.nn.Module: It now calls :func:`detectron2.modeling.build_model`. Overwrite it if you'd like a different model. """ model = build_model(cfg) logger = logging.getLogger(__name__) logger.info("Model:\n{}".format(model)) return model @classmethod def build_optimizer(cls, cfg, model): """ Returns: torch.optim.Optimizer: It now calls :func:`detectron2.solver.build_optimizer`. Overwrite it if you'd like a different optimizer. """ return build_optimizer(cfg, model) @classmethod def build_lr_scheduler(cls, cfg, optimizer): """ It now calls :func:`detectron2.solver.build_lr_scheduler`. Overwrite it if you'd like a different scheduler. """ return build_lr_scheduler(cfg, optimizer) @classmethod def build_train_loader(cls, cfg): """ Returns: iterable It now calls :func:`detectron2.data.build_detection_train_loader`. Overwrite it if you'd like a different data loader. """ return build_detection_train_loader(cfg) @classmethod def build_test_loader(cls, cfg, dataset_name): """ Returns: iterable It now calls :func:`detectron2.data.build_detection_test_loader`. Overwrite it if you'd like a different data loader. """ return build_detection_test_loader(cfg, dataset_name) @classmethod def build_evaluator(cls, cfg, dataset_name): """ Returns: DatasetEvaluator or None It is not implemented by default. """ raise NotImplementedError(""" If you want DefaultTrainer to automatically run evaluation, please implement `build_evaluator()` in subclasses (see train_net.py for example). Alternatively, you can call evaluation functions yourself (see Colab balloon tutorial for example). """) @classmethod def test(cls, cfg, model, evaluators=None): """ Args: cfg (CfgNode): model (nn.Module): evaluators (list[DatasetEvaluator] or None): if None, will call :meth:`build_evaluator`. Otherwise, must have the same length as ``cfg.DATASETS.TEST``. Returns: dict: a dict of result metrics """ logger = logging.getLogger(__name__) if isinstance(evaluators, DatasetEvaluator): evaluators = [evaluators] if evaluators is not None: assert len( cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format( len(cfg.DATASETS.TEST), len(evaluators)) results = OrderedDict() for idx, dataset_name in enumerate(cfg.DATASETS.TEST): data_loader = cls.build_test_loader(cfg, dataset_name) # When evaluators are passed in as arguments, # implicitly assume that evaluators can be created before data_loader. if evaluators is not None: evaluator = evaluators[idx] else: try: evaluator = cls.build_evaluator(cfg, dataset_name) except NotImplementedError: logger.warn( "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, " "or implement its `build_evaluator` method.") results[dataset_name] = {} continue results_i = inference_on_dataset(model, data_loader, evaluator) results[dataset_name] = results_i if comm.is_main_process(): assert isinstance( results_i, dict ), "Evaluator must return a dict on the main process. Got {} instead.".format( results_i) logger.info("Evaluation results for {} in csv format:".format( dataset_name)) print_csv_format(results_i) if len(results) == 1: results = list(results.values())[0] return results @staticmethod def auto_scale_workers(cfg, num_workers: int): """ When the config is defined for certain number of workers (according to ``cfg.SOLVER.REFERENCE_WORLD_SIZE``) that's different from the number of workers currently in use, returns a new cfg where the total batch size is scaled so that the per-GPU batch size stays the same as the original ``IMS_PER_BATCH // REFERENCE_WORLD_SIZE``. Other config options are also scaled accordingly: * training steps and warmup steps are scaled inverse proportionally. * learning rate are scaled proportionally, following :paper:`ImageNet in 1h`. For example, with the original config like the following: .. code-block:: yaml IMS_PER_BATCH: 16 BASE_LR: 0.1 REFERENCE_WORLD_SIZE: 8 MAX_ITER: 5000 STEPS: (4000,) CHECKPOINT_PERIOD: 1000 When this config is used on 16 GPUs instead of the reference number 8, calling this method will return a new config with: .. code-block:: yaml IMS_PER_BATCH: 32 BASE_LR: 0.2 REFERENCE_WORLD_SIZE: 16 MAX_ITER: 2500 STEPS: (2000,) CHECKPOINT_PERIOD: 500 Note that both the original config and this new config can be trained on 16 GPUs. It's up to user whether to enable this feature (by setting ``REFERENCE_WORLD_SIZE``). Returns: CfgNode: a new config. Same as original if ``cfg.SOLVER.REFERENCE_WORLD_SIZE==0``. """ old_world_size = cfg.SOLVER.REFERENCE_WORLD_SIZE if old_world_size == 0 or old_world_size == num_workers: return cfg cfg = cfg.clone() frozen = cfg.is_frozen() cfg.defrost() assert (cfg.SOLVER.IMS_PER_BATCH % old_world_size == 0), "Invalid REFERENCE_WORLD_SIZE in config!" scale = num_workers / old_world_size bs = cfg.SOLVER.IMS_PER_BATCH = int( round(cfg.SOLVER.IMS_PER_BATCH * scale)) lr = cfg.SOLVER.BASE_LR = cfg.SOLVER.BASE_LR * scale max_iter = cfg.SOLVER.MAX_ITER = int(round(cfg.SOLVER.MAX_ITER / scale)) warmup_iter = cfg.SOLVER.WARMUP_ITERS = int( round(cfg.SOLVER.WARMUP_ITERS / scale)) cfg.SOLVER.STEPS = tuple( int(round(s / scale)) for s in cfg.SOLVER.STEPS) cfg.TEST.EVAL_PERIOD = int(round(cfg.TEST.EVAL_PERIOD / scale)) cfg.SOLVER.CHECKPOINT_PERIOD = int( round(cfg.SOLVER.CHECKPOINT_PERIOD / scale)) cfg.SOLVER.REFERENCE_WORLD_SIZE = num_workers # maintain invariant logger = logging.getLogger(__name__) logger.info( f"Auto-scaling the config to batch_size={bs}, learning_rate={lr}, " f"max_iter={max_iter}, warmup={warmup_iter}.") if frozen: cfg.freeze() return cfg
class DefaultTrainer(SimpleTrainer): """ A trainer with default training logic. Compared to `SimpleTrainer`, it contains the following logic in addition: 1. Create model, optimizer, scheduler, dataloader from the given config. 2. Load a checkpoint or `cfg.MODEL.WEIGHTS`, if exists, when `resume_or_load` is called. 3. Register a few common hooks. It is created to simplify the **standard model training workflow** and reduce code boilerplate for users who only need the standard training workflow, with standard features. It means this class makes *many assumptions* about your training logic that may easily become invalid in a new research. In fact, any assumptions beyond those made in the :class:`SimpleTrainer` are too much for research. The code of this class has been annotated about restrictive assumptions it mades. When they do not work for you, you're encouraged to: 1. Overwrite methods of this class, OR: 2. Use :class:`SimpleTrainer`, which only does minimal SGD training and nothing else. You can then add your own hooks if needed. OR: 3. Write your own training loop similar to `tools/plain_train_net.py`. Also note that the behavior of this class, like other functions/classes in this file, is not stable, since it is meant to represent the "common default behavior". It is only guaranteed to work well with the standard models and training workflow in detectron2. To obtain more stable behavior, write your own training logic with other public APIs. Examples: :: trainer = DefaultTrainer(cfg) trainer.resume_or_load() # load last checkpoint or MODEL.WEIGHTS trainer.train() Attributes: scheduler: checkpointer (DetectionCheckpointer): cfg (CfgNode): """ def __init__(self, cfg): """ Args: cfg (CfgNode): """ logger = logging.getLogger("detectron2") if not logger.isEnabledFor( logging.INFO): # setup_logger is not called for d2 setup_logger() # Assume these objects must be constructed in this order. model = self.build_model(cfg) optimizer = self.build_optimizer(cfg, model) data_loader = self.build_train_loader(cfg) # For training, wrap with DDP. But don't need this for inference. if comm.get_world_size() > 1: model = DistributedDataParallel(model, device_ids=[comm.get_local_rank()], broadcast_buffers=False) super().__init__(model, data_loader, optimizer) self.scheduler = self.build_lr_scheduler(cfg, optimizer) # Assume no other objects need to be checkpointed. # We can later make it checkpoint the stateful hooks self.checkpointer = DetectionCheckpointer( # Assume you want to save checkpoints together with logs/statistics model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=self.scheduler, ) self.start_iter = 0 self.max_iter = cfg.SOLVER.MAX_ITER self.cfg = cfg self.register_hooks(self.build_hooks()) def resume_or_load(self, resume=True): """ If `resume==True`, and last checkpoint exists, resume from it, load all checkpointables (eg. optimizer and scheduler) and update iteration counter. Otherwise, load the model specified by the config (skip all checkpointables) and start from the first iteration. Args: resume (bool): whether to do resume or not """ checkpoint = self.checkpointer.resume_or_load(self.cfg.MODEL.WEIGHTS, resume=resume) if resume and self.checkpointer.has_checkpoint(): self.start_iter = checkpoint.get("iteration", -1) + 1 # The checkpoint stores the training iteration that just finished, thus we start # at the next iteration (or iter zero if there's no checkpoint). def build_hooks(self): """ Build a list of default hooks, including timing, evaluation, checkpointing, lr scheduling, precise BN, writing events. Returns: list[HookBase]: """ cfg = self.cfg.clone() cfg.defrost() cfg.DATALOADER.NUM_WORKERS = 0 # save some memory and time for PreciseBN ret = [ hooks.IterationTimer(), hooks.LRScheduler(self.optimizer, self.scheduler), hooks.PreciseBN( # Run at the same freq as (but before) evaluation. cfg.TEST.EVAL_PERIOD, self.model, # Build a new data loader to not affect training self.build_train_loader(cfg), cfg.TEST.PRECISE_BN.NUM_ITER, ) if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model) else None, ] # Do PreciseBN before checkpointer, because it updates the model and need to # be saved by checkpointer. # This is not always the best: if checkpointing has a different frequency, # some checkpoints may have more precise statistics than others. if comm.is_main_process(): ret.append( hooks.PeriodicCheckpointer(self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD)) def test_and_save_results(): self._last_eval_results = self.test(self.cfg, self.model) return self._last_eval_results # Do evaluation after checkpointer, because then if it fails, # we can use the saved checkpoint to debug. ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results)) if comm.is_main_process(): # run writers in the end, so that evaluation metrics are written ret.append(hooks.PeriodicWriter(self.build_writers(), period=20)) return ret def build_writers(self): """ Build a list of writers to be used. By default it contains writers that write metrics to the screen, a json file, and a tensorboard event file respectively. If you'd like a different list of writers, you can overwrite it in your trainer. Returns: list[EventWriter]: a list of :class:`EventWriter` objects. It is now implemented by: :: return [ CommonMetricPrinter(self.max_iter), JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(self.cfg.OUTPUT_DIR), ] """ # Here the default print/log frequency of each writer is used. return [ # It may not always print what you want to see, since it prints "common" metrics only. CommonMetricPrinter(self.max_iter), JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(self.cfg.OUTPUT_DIR), ] def train(self): """ Run training. Returns: OrderedDict of results, if evaluation is enabled. Otherwise None. """ super().train(self.start_iter, self.max_iter) if len(self.cfg.TEST.EXPECTED_RESULTS) and comm.is_main_process(): assert hasattr(self, "_last_eval_results" ), "No evaluation results obtained during training!" verify_results(self.cfg, self._last_eval_results) return self._last_eval_results @classmethod def build_model(cls, cfg): """ Returns: torch.nn.Module: It now calls :func:`detectron2.modeling.build_model`. Overwrite it if you'd like a different model. """ model = build_model(cfg) logger = logging.getLogger(__name__) logger.info("Model:\n{}".format(model)) return model @classmethod def build_optimizer(cls, cfg, model): """ Returns: torch.optim.Optimizer: It now calls :func:`detectron2.solver.build_optimizer`. Overwrite it if you'd like a different optimizer. """ return build_optimizer(cfg, model) @classmethod def build_lr_scheduler(cls, cfg, optimizer): """ It now calls :func:`detectron2.solver.build_lr_scheduler`. Overwrite it if you'd like a different scheduler. """ return build_lr_scheduler(cfg, optimizer) @classmethod def build_train_loader(cls, cfg): """ Returns: iterable It now calls :func:`detectron2.data.build_detection_train_loader`. Overwrite it if you'd like a different data loader. """ return build_detection_train_loader(cfg) @classmethod def build_test_loader(cls, cfg, dataset_name): """ Returns: iterable It now calls :func:`detectron2.data.build_detection_test_loader`. Overwrite it if you'd like a different data loader. """ return build_detection_test_loader(cfg, dataset_name) @classmethod def build_evaluator(cls, cfg, dataset_name): """ Returns: DatasetEvaluator or None It is not implemented by default. """ raise NotImplementedError(""" If you want DefaultTrainer to automatically run evaluation, please implement `build_evaluator()` in subclasses (see train_net.py for example). Alternatively, you can call evaluation functions yourself (see Colab balloon tutorial for example). """) @classmethod def test(cls, cfg, model, evaluators=None): """ Args: cfg (CfgNode): model (nn.Module): evaluators (list[DatasetEvaluator] or None): if None, will call :meth:`build_evaluator`. Otherwise, must have the same length as `cfg.DATASETS.TEST`. Returns: dict: a dict of result metrics """ logger = logging.getLogger(__name__) if isinstance(evaluators, DatasetEvaluator): evaluators = [evaluators] if evaluators is not None: assert len( cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format( len(cfg.DATASETS.TEST), len(evaluators)) results = OrderedDict() for idx, dataset_name in enumerate(cfg.DATASETS.TEST): data_loader = cls.build_test_loader(cfg, dataset_name) # When evaluators are passed in as arguments, # implicitly assume that evaluators can be created before data_loader. if evaluators is not None: evaluator = evaluators[idx] else: try: evaluator = cls.build_evaluator(cfg, dataset_name) except NotImplementedError: logger.warn( "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, " "or implement its `build_evaluator` method.") results[dataset_name] = {} continue results_i = inference_on_dataset(model, data_loader, evaluator) results[dataset_name] = results_i if comm.is_main_process(): assert isinstance( results_i, dict ), "Evaluator must return a dict on the main process. Got {} instead.".format( results_i) logger.info("Evaluation results for {} in csv format:".format( dataset_name)) print_csv_format(results_i) if len(results) == 1: results = list(results.values())[0] return results
class BaselineTrainer(DefaultTrainer): def __init__(self, cfg): """ Args: cfg (CfgNode): Use the custom checkpointer, which loads other backbone models with matching heuristics. """ cfg = DefaultTrainer.auto_scale_workers(cfg, comm.get_world_size()) model = self.build_model(cfg) optimizer = self.build_optimizer(cfg, model) data_loader = self.build_train_loader(cfg) if comm.get_world_size() > 1: model = DistributedDataParallel( model, device_ids=[comm.get_local_rank()], broadcast_buffers=False ) TrainerBase.__init__(self) self._trainer = (AMPTrainer if cfg.SOLVER.AMP.ENABLED else SimpleTrainer)( model, data_loader, optimizer ) self.scheduler = self.build_lr_scheduler(cfg, optimizer) self.checkpointer = DetectionCheckpointer( model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=self.scheduler, ) self.start_iter = 0 self.max_iter = cfg.SOLVER.MAX_ITER self.cfg = cfg self.register_hooks(self.build_hooks()) def resume_or_load(self, resume=True): """ If `resume==True` and `cfg.OUTPUT_DIR` contains the last checkpoint (defined by a `last_checkpoint` file), resume from the file. Resuming means loading all available states (eg. optimizer and scheduler) and update iteration counter from the checkpoint. ``cfg.MODEL.WEIGHTS`` will not be used. Otherwise, this is considered as an independent training. The method will load model weights from the file `cfg.MODEL.WEIGHTS` (but will not load other states) and start from iteration 0. Args: resume (bool): whether to do resume or not """ checkpoint = self.checkpointer.resume_or_load( self.cfg.MODEL.WEIGHTS, resume=resume ) if resume and self.checkpointer.has_checkpoint(): self.start_iter = checkpoint.get("iteration", -1) + 1 # The checkpoint stores the training iteration that just finished, thus we start # at the next iteration (or iter zero if there's no checkpoint). if isinstance(self.model, DistributedDataParallel): # broadcast loaded data/model from the first rank, because other # machines may not have access to the checkpoint file if TORCH_VERSION >= (1, 7): self.model._sync_params_and_buffers() self.start_iter = comm.all_gather(self.start_iter)[0] def train_loop(self, start_iter: int, max_iter: int): """ Args: start_iter, max_iter (int): See docs above """ logger = logging.getLogger(__name__) logger.info("Starting training from iteration {}".format(start_iter)) self.iter = self.start_iter = start_iter self.max_iter = max_iter with EventStorage(start_iter) as self.storage: try: self.before_train() for self.iter in range(start_iter, max_iter): self.before_step() self.run_step() self.after_step() except Exception: logger.exception("Exception during training:") raise finally: self.after_train() def run_step(self): self._trainer.iter = self.iter assert self.model.training, "[SimpleTrainer] model was changed to eval mode!" start = time.perf_counter() data = next(self._trainer._data_loader_iter) data_time = time.perf_counter() - start record_dict, _, _, _ = self.model(data, branch="supervised") num_gt_bbox = 0.0 for element in data: num_gt_bbox += len(element["instances"]) num_gt_bbox = num_gt_bbox / len(data) record_dict["bbox_num/gt_bboxes"] = num_gt_bbox loss_dict = {} for key in record_dict.keys(): if key[:4] == "loss" and key[-3:] != "val": loss_dict[key] = record_dict[key] losses = sum(loss_dict.values()) metrics_dict = record_dict metrics_dict["data_time"] = data_time self._write_metrics(metrics_dict) self.optimizer.zero_grad() losses.backward() self.optimizer.step() @classmethod def build_evaluator(cls, cfg, dataset_name, output_folder=None): if output_folder is None: output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") evaluator_list = [] evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type if evaluator_type == "coco": evaluator_list.append(COCOEvaluator( dataset_name, output_dir=output_folder)) elif evaluator_type == "pascal_voc": return PascalVOCDetectionEvaluator(dataset_name) if len(evaluator_list) == 0: raise NotImplementedError( "no Evaluator for the dataset {} with the type {}".format( dataset_name, evaluator_type ) ) elif len(evaluator_list) == 1: return evaluator_list[0] return DatasetEvaluators(evaluator_list) @classmethod def build_train_loader(cls, cfg): return build_detection_semisup_train_loader(cfg, mapper=None) @classmethod def build_test_loader(cls, cfg, dataset_name): """ Returns: iterable """ return build_detection_test_loader(cfg, dataset_name) def build_hooks(self): """ Build a list of default hooks, including timing, evaluation, checkpointing, lr scheduling, precise BN, writing events. Returns: list[HookBase]: """ cfg = self.cfg.clone() cfg.defrost() cfg.DATALOADER.NUM_WORKERS = 0 ret = [ hooks.IterationTimer(), hooks.LRScheduler(self.optimizer, self.scheduler), hooks.PreciseBN( cfg.TEST.EVAL_PERIOD, self.model, self.build_train_loader(cfg), cfg.TEST.PRECISE_BN.NUM_ITER, ) if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model) else None, ] if comm.is_main_process(): ret.append( hooks.PeriodicCheckpointer( self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD ) ) def test_and_save_results(): self._last_eval_results = self.test(self.cfg, self.model) return self._last_eval_results ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results)) if comm.is_main_process(): ret.append(hooks.PeriodicWriter(self.build_writers(), period=20)) return ret def _write_metrics(self, metrics_dict: dict): """ Args: metrics_dict (dict): dict of scalar metrics """ metrics_dict = { k: v.detach().cpu().item() if isinstance(v, torch.Tensor) else float(v) for k, v in metrics_dict.items() } # gather metrics among all workers for logging # This assumes we do DDP-style training, which is currently the only # supported method in detectron2. all_metrics_dict = comm.gather(metrics_dict) if comm.is_main_process(): if "data_time" in all_metrics_dict[0]: data_time = np.max([x.pop("data_time") for x in all_metrics_dict]) self.storage.put_scalar("data_time", data_time) metrics_dict = { k: np.mean([x[k] for x in all_metrics_dict]) for k in all_metrics_dict[0].keys() } loss_dict = {} for key in metrics_dict.keys(): if key[:4] == "loss": loss_dict[key] = metrics_dict[key] total_losses_reduced = sum(loss for loss in loss_dict.values()) self.storage.put_scalar("total_loss", total_losses_reduced) if len(metrics_dict) > 1: self.storage.put_scalars(**metrics_dict)
def main(args): cfg = setup(args) # eval_only and eval_during_train are mainly used for jointly # training detection and self-supervised models. if args.eval_only: model = Trainer.build_model(cfg) DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( cfg.MODEL.WEIGHTS, resume=args.resume) res = Trainer.test(cfg, model) if comm.is_main_process(): verify_results(cfg, res) if cfg.TEST.AUG.ENABLED: res.update(Trainer.test_with_TTA(cfg, model)) return res elif args.eval_during_train: model = Trainer.build_model(cfg) check_pointer = DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR) saved_checkpoint = None best_res = {} best_file = None while True: if check_pointer.has_checkpoint(): current_ckpt = check_pointer.get_checkpoint_file() if (saved_checkpoint is None or current_ckpt != saved_checkpoint): check_pointer._load_model( check_pointer._load_file(current_ckpt)) saved_checkpoint = current_ckpt print("evaluating checkpoint {}".format(current_ckpt)) iters = int( osp.splitext( osp.basename(current_ckpt))[0].split("_")[-1]) res = Trainer.test(cfg, model) if comm.is_main_process(): verify_results(cfg, res) if cfg.TEST.AUG.ENABLED: res.update(Trainer.test_with_TTA(cfg, model)) print(res) if (len(best_res) == 0) or ( len(best_res) > 0 and best_res["bbox"]["AP"] < res["bbox"]["AP"]): best_res = res best_file = current_ckpt print("best so far is from {}".format(best_file)) print(best_res) if iters + 1 >= cfg.SOLVER.MAX_ITER: return best_res time.sleep(10) """ If you'd like to do anything fancier than the standard training logic, consider writing your own training loop or subclassing the trainer. """ trainer = Trainer(cfg) trainer.resume_or_load(resume=args.resume) if cfg.TEST.AUG.ENABLED: trainer.register_hooks([ hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model)) ]) return trainer.train()