class DefaultTrainer(SimpleTrainer): """ A trainer with default training logic. Compared to `SimpleTrainer`, it contains the following logic in addition: 1. Create model, optimizer, scheduler, dataloader from the given config. 2. Load a checkpoint or `cfg.MODEL.WEIGHTS`, if exists. 3. Register a few common hooks. It is created to simplify the **standard model training workflow** and reduce code boilerplate for users who only need the standard training workflow, with standard features. It means this class makes *many assumptions* about your training logic that may easily become invalid in a new research. In fact, any assumptions beyond those made in the :class:`SimpleTrainer` are too much for research. The code of this class has been annotated about restrictive assumptions it mades. When they do not work for you, you're encouraged to: 1. Overwrite methods of this class, OR: 2. Use :class:`SimpleTrainer`, which only does minimal SGD training and nothing else. You can then add your own hooks if needed. OR: 3. Write your own training loop similar to `tools/plain_train_net.py`. Also note that the behavior of this class, like other functions/classes in this file, is not stable, since it is meant to represent the "common default behavior". It is only guaranteed to work well with the standard models and training workflow in fastreid. To obtain more stable behavior, write your own training logic with other public APIs. Attributes: scheduler: checkpointer: cfg (CfgNode): Examples: .. code-block:: python trainer = DefaultTrainer(cfg) trainer.resume_or_load() # load last checkpoint or MODEL.WEIGHTS trainer.train() """ def __init__(self, cfg): """ Args: cfg (CfgNode): """ logger = logging.getLogger("fastreid") if not logger.isEnabledFor( logging.INFO): # setup_logger is not called for fastreid setup_logger() # Assume these objects must be constructed in this order. data_loader = self.build_train_loader(cfg) cfg = self.auto_scale_hyperparams(cfg, data_loader) model = self.build_model(cfg) optimizer = self.build_optimizer(cfg, model) # For training, wrap with DDP. But don't need this for inference. if comm.get_world_size() > 1: # ref to https://github.com/pytorch/pytorch/issues/22049 to set `find_unused_parameters=True` # for part of the parameters is not updated. model = DistributedDataParallel(model, device_ids=[comm.get_local_rank()], broadcast_buffers=False) super().__init__(model, data_loader, optimizer, cfg.SOLVER.AMP_ENABLED) self.scheduler = self.build_lr_scheduler(cfg, optimizer) # Assume no other objects need to be checkpointed. # We can later make it checkpoint the stateful hooks self.checkpointer = Checkpointer( # Assume you want to save checkpoints together with logs/statistics model, cfg.OUTPUT_DIR, save_to_disk=comm.is_main_process(), optimizer=optimizer, scheduler=self.scheduler, ) self.start_iter = 0 if cfg.SOLVER.SWA.ENABLED: self.max_iter = cfg.SOLVER.MAX_ITER + cfg.SOLVER.SWA.ITER else: self.max_iter = cfg.SOLVER.MAX_ITER self.cfg = cfg self.register_hooks(self.build_hooks()) def resume_or_load(self, resume=True): """ If `resume==True`, and last checkpoint exists, resume from it. Otherwise, load a model specified by the config. Args: resume (bool): whether to do resume or not """ # The checkpoint stores the training iteration that just finished, thus we start # at the next iteration (or iter zero if there's no checkpoint). checkpoint = self.checkpointer.resume_or_load(self.cfg.MODEL.WEIGHTS, resume=resume) # Reinitialize dataloader iter because when we update dataset person identity dict # to resume training, DataLoader won't update this dictionary when using multiprocess # because of the function scope. if resume and self.checkpointer.has_checkpoint(): self.start_iter = checkpoint.get("iteration", -1) + 1 # The checkpoint stores the training iteration that just finished, thus we start # at the next iteration (or iter zero if there's no checkpoint). def build_hooks(self): """ Build a list of default hooks, including timing, evaluation, checkpointing, lr scheduling, precise BN, writing events. Returns: list[HookBase]: """ logger = logging.getLogger(__name__) cfg = self.cfg.clone() cfg.defrost() cfg.DATALOADER.NUM_WORKERS = 0 # save some memory and time for PreciseBN cfg.DATASETS.NAMES = tuple([cfg.TEST.PRECISE_BN.DATASET ]) # set dataset name for PreciseBN ret = [ hooks.IterationTimer(), hooks.LRScheduler(self.optimizer, self.scheduler), ] if cfg.SOLVER.SWA.ENABLED: ret.append( hooks.SWA( cfg.SOLVER.MAX_ITER, cfg.SOLVER.SWA.PERIOD, cfg.SOLVER.SWA.LR_FACTOR, cfg.SOLVER.SWA.ETA_MIN_LR, cfg.SOLVER.SWA.LR_SCHED, )) if cfg.TEST.PRECISE_BN.ENABLED and hooks.get_bn_modules(self.model): logger.info("Prepare precise BN dataset") ret.append( hooks.PreciseBN( # Run at the same freq as (but before) evaluation. self.model, # Build a new data loader to not affect training self.build_train_loader(cfg), cfg.TEST.PRECISE_BN.NUM_ITER, )) if cfg.MODEL.FREEZE_LAYERS != [''] and cfg.SOLVER.FREEZE_ITERS > 0: freeze_layers = ",".join(cfg.MODEL.FREEZE_LAYERS) logger.info( f'Freeze layer group "{freeze_layers}" training for {cfg.SOLVER.FREEZE_ITERS:d} iterations' ) ret.append( hooks.FreezeLayer( self.model, self.optimizer, cfg.MODEL.FREEZE_LAYERS, cfg.SOLVER.FREEZE_ITERS, )) # Do PreciseBN before checkpointer, because it updates the model and need to # be saved by checkpointer. # This is not always the best: if checkpointing has a different frequency, # some checkpoints may have more precise statistics than others. if comm.is_main_process(): ret.append( hooks.PeriodicCheckpointer(self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD)) def test_and_save_results(): self._last_eval_results = self.test(self.cfg, self.model) return self._last_eval_results # Do evaluation after checkpointer, because then if it fails, # we can use the saved checkpoint to debug. ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results)) if comm.is_main_process(): # run writers in the end, so that evaluation metrics are written ret.append(hooks.PeriodicWriter(self.build_writers(), 200)) return ret def build_writers(self): """ Build a list of writers to be used. By default it contains writers that write metrics to the screen, a json file, and a tensorboard event file respectively. If you'd like a different list of writers, you can overwrite it in your trainer. Returns: list[EventWriter]: a list of :class:`EventWriter` objects. It is now implemented by: .. code-block:: python return [ CommonMetricPrinter(self.max_iter), JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(self.cfg.OUTPUT_DIR), ] """ # Assume the default print/log frequency. return [ # It may not always print what you want to see, since it prints "common" metrics only. CommonMetricPrinter(self.max_iter), JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(self.cfg.OUTPUT_DIR), ] def train(self): """ Run training. Returns: OrderedDict of results, if evaluation is enabled. Otherwise None. """ super().train(self.start_iter, self.max_iter) if comm.is_main_process(): assert hasattr(self, "_last_eval_results" ), "No evaluation results obtained during training!" # verify_results(self.cfg, self._last_eval_results) return self._last_eval_results @classmethod def build_model(cls, cfg): """ Returns: torch.nn.Module: It now calls :func:`fastreid.modeling.build_model`. Overwrite it if you'd like a different model. """ model = build_model(cfg) logger = logging.getLogger(__name__) logger.info("Model:\n{}".format(model)) return model @classmethod def build_optimizer(cls, cfg, model): """ Returns: torch.optim.Optimizer: It now calls :func:`fastreid.solver.build_optimizer`. Overwrite it if you'd like a different optimizer. """ return build_optimizer(cfg, model) @classmethod def build_lr_scheduler(cls, cfg, optimizer): """ It now calls :func:`fastreid.solver.build_lr_scheduler`. Overwrite it if you'd like a different scheduler. """ return build_lr_scheduler(cfg, optimizer) @classmethod def build_train_loader(cls, cfg): """ Returns: iterable It now calls :func:`fastreid.data.build_detection_train_loader`. Overwrite it if you'd like a different data loader. """ logger = logging.getLogger(__name__) logger.info("Prepare training set") return build_reid_train_loader(cfg) @classmethod def build_test_loader(cls, cfg, dataset_name): """ Returns: iterable It now calls :func:`fastreid.data.build_detection_test_loader`. Overwrite it if you'd like a different data loader. """ return build_reid_test_loader(cfg, dataset_name) @classmethod def build_evaluator(cls, cfg, dataset_name, output_dir=None): data_loader, num_query = cls.build_test_loader(cfg, dataset_name) return data_loader, ReidEvaluator(cfg, num_query, output_dir) @classmethod def test(cls, cfg, model): """ Args: cfg (CfgNode): model (nn.Module): Returns: dict: a dict of result metrics """ logger = logging.getLogger(__name__) results = OrderedDict() for idx, dataset_name in enumerate(cfg.DATASETS.TESTS): logger.info("Prepare testing set") try: data_loader, evaluator = cls.build_evaluator(cfg, dataset_name) except NotImplementedError: logger.warn( "No evaluator found. implement its `build_evaluator` method." ) results[dataset_name] = {} continue results_i = inference_on_dataset(model, data_loader, evaluator) results[dataset_name] = results_i if comm.is_main_process(): assert isinstance( results, dict ), "Evaluator must return a dict on the main process. Got {} instead.".format( results) print_csv_format(results) if len(results) == 1: results = list(results.values())[0] return results @staticmethod def auto_scale_hyperparams(cfg, data_loader): r""" This is used for auto-computation actual training iterations, because some hyper-param, such as MAX_ITER, means training epochs rather than iters, so we need to convert specific hyper-param to training iterations. """ cfg = cfg.clone() frozen = cfg.is_frozen() cfg.defrost() iters_per_epoch = len(data_loader.dataset) // cfg.SOLVER.IMS_PER_BATCH cfg.MODEL.HEADS.NUM_CLASSES = data_loader.dataset.num_classes cfg.SOLVER.MAX_ITER *= iters_per_epoch cfg.SOLVER.WARMUP_ITERS *= iters_per_epoch cfg.SOLVER.FREEZE_ITERS *= iters_per_epoch cfg.SOLVER.DELAY_ITERS *= iters_per_epoch for i in range(len(cfg.SOLVER.STEPS)): cfg.SOLVER.STEPS[i] *= iters_per_epoch cfg.SOLVER.SWA.ITER *= iters_per_epoch cfg.SOLVER.SWA.PERIOD *= iters_per_epoch ckpt_multiple = cfg.SOLVER.CHECKPOINT_PERIOD / cfg.TEST.EVAL_PERIOD # Evaluation period must be divided by 200 for writing into tensorboard. eval_num_mod = (200 - cfg.TEST.EVAL_PERIOD * iters_per_epoch) % 200 cfg.TEST.EVAL_PERIOD = cfg.TEST.EVAL_PERIOD * iters_per_epoch + eval_num_mod # Change checkpoint saving period consistent with evaluation period. cfg.SOLVER.CHECKPOINT_PERIOD = int(cfg.TEST.EVAL_PERIOD * ckpt_multiple) logger = logging.getLogger(__name__) logger.info( f"Auto-scaling the config to num_classes={cfg.MODEL.HEADS.NUM_CLASSES}, " f"max_Iter={cfg.SOLVER.MAX_ITER}, wamrup_Iter={cfg.SOLVER.WARMUP_ITERS}, " f"freeze_Iter={cfg.SOLVER.FREEZE_ITERS}, delay_Iter={cfg.SOLVER.DELAY_ITERS}, " f"step_Iter={cfg.SOLVER.STEPS}, ckpt_Iter={cfg.SOLVER.CHECKPOINT_PERIOD}, " f"eval_Iter={cfg.TEST.EVAL_PERIOD}.") if frozen: cfg.freeze() return cfg
class DefaultTrainer(TrainerBase): """ A trainer with default training logic. Compared to `SimpleTrainer`, it contains the following logic in addition: 1. Create model, optimizer, scheduler, dataloader from the given config. 2. Load a checkpoint or `cfg.MODEL.WEIGHTS`, if exists. 3. Register a few common hooks. It is created to simplify the **standard model training workflow** and reduce code boilerplate for users who only need the standard training workflow, with standard features. It means this class makes *many assumptions* about your training logic that may easily become invalid in a new research. In fact, any assumptions beyond those made in the :class:`SimpleTrainer` are too much for research. The code of this class has been annotated about restrictive assumptions it mades. When they do not work for you, you're encouraged to: 1. Overwrite methods of this class, OR: 2. Use :class:`SimpleTrainer`, which only does minimal SGD training and nothing else. You can then add your own hooks if needed. OR: 3. Write your own training loop similar to `tools/plain_train_net.py`. Also note that the behavior of this class, like other functions/classes in this file, is not stable, since it is meant to represent the "common default behavior". It is only guaranteed to work well with the standard models and training workflow in fastreid. To obtain more stable behavior, write your own training logic with other public APIs. Attributes: scheduler: checkpointer: cfg (CfgNode): Examples: .. code-block:: python trainer = DefaultTrainer(cfg) trainer.resume_or_load() # load last checkpoint or MODEL.WEIGHTS trainer.train() """ def __init__(self, cfg): """ Args: cfg (CfgNode): """ super().__init__() logger = logging.getLogger("fastreid") if not logger.isEnabledFor( logging.INFO): # setup_logger is not called for fastreid setup_logger() # Assume these objects must be constructed in this order. data_loader = self.build_train_loader(cfg) cfg = self.auto_scale_hyperparams(cfg, data_loader.dataset.num_classes) model = self.build_model(cfg) optimizer = self.build_optimizer(cfg, model) optimizer_ckpt = dict(optimizer=optimizer) if cfg.SOLVER.FP16_ENABLED: model, optimizer = amp.initialize(model, optimizer, opt_level="O1") optimizer_ckpt.update(dict(amp=amp)) # For training, wrap with DDP. But don't need this for inference. if comm.get_world_size() > 1: # ref to https://github.com/pytorch/pytorch/issues/22049 to set `find_unused_parameters=True` # for part of the parameters is not updated. # model = DistributedDataParallel( # model, device_ids=[comm.get_local_rank()], broadcast_buffers=False # ) model = DistributedDataParallel(model, delay_allreduce=True) self._trainer = (AMPTrainer if cfg.SOLVER.FP16_ENABLED else SimpleTrainer)(model, data_loader, optimizer) self.iters_per_epoch = len( data_loader.dataset) // cfg.SOLVER.IMS_PER_BATCH self.scheduler = self.build_lr_scheduler(cfg, optimizer, self.iters_per_epoch) # Assume no other objects need to be checkpointed. # We can later make it checkpoint the stateful hooks self.checkpointer = Checkpointer( # Assume you want to save checkpoints together with logs/statistics model, cfg.OUTPUT_DIR, save_to_disk=comm.is_main_process(), **optimizer_ckpt, **self.scheduler, ) self.start_epoch = 0 self.max_epoch = cfg.SOLVER.MAX_EPOCH self.max_iter = self.max_epoch * self.iters_per_epoch self.warmup_iters = cfg.SOLVER.WARMUP_ITERS self.delay_epochs = cfg.SOLVER.DELAY_EPOCHS self.cfg = cfg self.register_hooks(self.build_hooks()) def resume_or_load(self, resume=True): """ If `resume==True` and `cfg.OUTPUT_DIR` contains the last checkpoint (defined by a `last_checkpoint` file), resume from the file. Resuming means loading all available states (eg. optimizer and scheduler) and update iteration counter from the checkpoint. ``cfg.MODEL.WEIGHTS`` will not be used. Otherwise, this is considered as an independent training. The method will load model weights from the file `cfg.MODEL.WEIGHTS` (but will not load other states) and start from iteration 0. Args: resume (bool): whether to do resume or not """ # The checkpoint stores the training iteration that just finished, thus we start # at the next iteration (or iter zero if there's no checkpoint). checkpoint = self.checkpointer.resume_or_load(self.cfg.MODEL.WEIGHTS, resume=resume) if resume and self.checkpointer.has_checkpoint(): self.start_epoch = checkpoint.get("epoch", -1) + 1 # The checkpoint stores the training iteration that just finished, thus we start # at the next iteration (or iter zero if there's no checkpoint). def build_hooks(self): """ Build a list of default hooks, including timing, evaluation, checkpointing, lr scheduling, precise BN, writing events. Returns: list[HookBase]: """ logger = logging.getLogger(__name__) cfg = self.cfg.clone() cfg.defrost() cfg.DATALOADER.NUM_WORKERS = 0 # save some memory and time for PreciseBN cfg.DATASETS.NAMES = tuple([cfg.TEST.PRECISE_BN.DATASET ]) # set dataset name for PreciseBN ret = [ hooks.IterationTimer(), hooks.LRScheduler(self.optimizer, self.scheduler), ] # if cfg.SOLVER.SWA.ENABLED: # ret.append( # hooks.SWA( # cfg.SOLVER.MAX_ITER, # cfg.SOLVER.SWA.PERIOD, # cfg.SOLVER.SWA.LR_FACTOR, # cfg.SOLVER.SWA.ETA_MIN_LR, # cfg.SOLVER.SWA.LR_SCHED, # ) # ) if cfg.TEST.PRECISE_BN.ENABLED and hooks.get_bn_modules(self.model): logger.info("Prepare precise BN dataset") ret.append( hooks.PreciseBN( # Run at the same freq as (but before) evaluation. self.model, # Build a new data loader to not affect training self.build_train_loader(cfg), cfg.TEST.PRECISE_BN.NUM_ITER, )) ret.append( hooks.LayerFreeze( self.model, cfg.MODEL.FREEZE_LAYERS, cfg.SOLVER.FREEZE_ITERS, cfg.SOLVER.FREEZE_FC_ITERS, )) # Do PreciseBN before checkpointer, because it updates the model and need to # be saved by checkpointer. # This is not always the best: if checkpointing has a different frequency, # some checkpoints may have more precise statistics than others. def test_and_save_results(): self._last_eval_results = self.test(self.cfg, self.model) return self._last_eval_results # Do evaluation before checkpointer, because then if it fails, # we can use the saved checkpoint to debug. ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results)) if comm.is_main_process(): ret.append( hooks.PeriodicCheckpointer(self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD)) # run writers in the end, so that evaluation metrics are written ret.append(hooks.PeriodicWriter(self.build_writers(), 200)) return ret def build_writers(self): """ Build a list of writers to be used. By default it contains writers that write metrics to the screen, a json file, and a tensorboard event file respectively. If you'd like a different list of writers, you can overwrite it in your trainer. Returns: list[EventWriter]: a list of :class:`EventWriter` objects. It is now implemented by: .. code-block:: python return [ CommonMetricPrinter(self.max_iter), JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(self.cfg.OUTPUT_DIR), ] """ # Assume the default print/log frequency. return [ # It may not always print what you want to see, since it prints "common" metrics only. CommonMetricPrinter(self.max_iter), JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(self.cfg.OUTPUT_DIR), ] def train(self): """ Run training. Returns: OrderedDict of results, if evaluation is enabled. Otherwise None. """ super().train(self.start_epoch, self.max_epoch, self.iters_per_epoch) if comm.is_main_process(): assert hasattr(self, "_last_eval_results" ), "No evaluation results obtained during training!" return self._last_eval_results def run_step(self): self._trainer.iter = self.iter self._trainer.run_step() @classmethod def build_model(cls, cfg): """ Returns: torch.nn.Module: It now calls :func:`fastreid.modeling.build_model`. Overwrite it if you'd like a different model. """ model = build_model(cfg) logger = logging.getLogger(__name__) logger.info("Model:\n{}".format(model)) return model @classmethod def build_optimizer(cls, cfg, model): """ Returns: torch.optim.Optimizer: It now calls :func:`fastreid.solver.build_optimizer`. Overwrite it if you'd like a different optimizer. """ return build_optimizer(cfg, model) @classmethod def build_lr_scheduler(cls, cfg, optimizer, iters_per_epoch): """ It now calls :func:`fastreid.solver.build_lr_scheduler`. Overwrite it if you'd like a different scheduler. """ return build_lr_scheduler(cfg, optimizer, iters_per_epoch) @classmethod def build_train_loader(cls, cfg): """ Returns: iterable It now calls :func:`fastreid.data.build_reid_train_loader`. Overwrite it if you'd like a different data loader. """ logger = logging.getLogger(__name__) logger.info("Prepare training set") return build_reid_train_loader(cfg) @classmethod def build_test_loader(cls, cfg, dataset_name): """ Returns: iterable It now calls :func:`fastreid.data.build_reid_test_loader`. Overwrite it if you'd like a different data loader. """ return build_reid_test_loader(cfg, dataset_name) @classmethod def build_evaluator(cls, cfg, dataset_name, output_dir=None): data_loader, num_query = cls.build_test_loader(cfg, dataset_name) return data_loader, ReidEvaluator(cfg, num_query, output_dir) @classmethod def test(cls, cfg, model): """ Args: cfg (CfgNode): model (nn.Module): Returns: dict: a dict of result metrics """ logger = logging.getLogger(__name__) results = OrderedDict() for idx, dataset_name in enumerate(cfg.DATASETS.TESTS): logger.info("Prepare testing set") try: data_loader, evaluator = cls.build_evaluator(cfg, dataset_name) except NotImplementedError: logger.warn( "No evaluator found. implement its `build_evaluator` method." ) results[dataset_name] = {} continue results_i = inference_on_dataset(model, data_loader, evaluator, flip_test=cfg.TEST.FLIP_ENABLED) results[dataset_name] = results_i if comm.is_main_process(): assert isinstance( results, dict ), "Evaluator must return a dict on the main process. Got {} instead.".format( results) print_csv_format(results) if len(results) == 1: results = list(results.values())[0] return results @staticmethod def auto_scale_hyperparams(cfg, num_classes): r""" This is used for auto-computation actual training iterations, because some hyper-param, such as MAX_ITER, means training epochs rather than iters, so we need to convert specific hyper-param to training iterations. """ cfg = cfg.clone() frozen = cfg.is_frozen() cfg.defrost() # If you don't hard-code the number of classes, it will compute the number automatically if cfg.MODEL.HEADS.NUM_CLASSES == 0: output_dir = cfg.OUTPUT_DIR cfg.MODEL.HEADS.NUM_CLASSES = num_classes logger = logging.getLogger(__name__) logger.info( f"Auto-scaling the num_classes={cfg.MODEL.HEADS.NUM_CLASSES}") # Update the saved config file to make the number of classes valid if comm.is_main_process() and output_dir: # Note: some of our scripts may expect the existence of # config.yaml in output directory path = os.path.join(output_dir, "config.yaml") with PathManager.open(path, "w") as f: f.write(cfg.dump()) if frozen: cfg.freeze() return cfg
class MyMixedDefaultTrainer(TrainerBase): """w/o AMP mode""" def __init__(self, cfg): """ Args: cfg (CfgNode): """ super().__init__() logger = logging.getLogger("fastreid") if not logger.isEnabledFor( logging.INFO): # setup_logger is not called for fastreid setup_logger() # Assume these objects must be constructed in this order. data_loader = self.build_train_loader(cfg) cfg = self.auto_scale_hyperparams(cfg, data_loader.dataset.num_classes) self.model = self.build_model(cfg) self.optimizer, self.param_wrapper = self.build_optimizer( cfg, self.model) # For training, wrap with DDP. But don't need this for inference. if comm.get_world_size() > 1: # ref to https://github.com/pytorch/pytorch/issues/22049 to set `find_unused_parameters=True` # for part of the parameters is not updated. self.model = DistributedDataParallel( self.model, device_ids=[comm.get_local_rank()], broadcast_buffers=False, ) self._data_loader_iter = iter(data_loader) self.iters_per_epoch = len( data_loader.dataset) // cfg.SOLVER.IMS_PER_BATCH self.scheduler = self.build_lr_scheduler(cfg, self.optimizer, self.iters_per_epoch) # Assume no other objects need to be checkpointed. # We can later make it checkpoint the stateful hooks self.checkpointer = Checkpointer( # Assume you want to save checkpoints together with logs/statistics self.model, cfg.OUTPUT_DIR, save_to_disk=comm.is_main_process(), optimizer=self.optimizer, **self.scheduler, ) self.start_epoch = 0 self.max_epoch = cfg.SOLVER.MAX_EPOCH self.max_iter = self.max_epoch * self.iters_per_epoch self.warmup_iters = cfg.SOLVER.WARMUP_ITERS self.delay_epochs = cfg.SOLVER.DELAY_EPOCHS self.cfg = cfg self.register_hooks(self.build_hooks()) if cfg.SOLVER.AMP.ENABLED: unsupported = f"[{self.__class__.__name__}] does not support single-process multi-device training!" if isinstance(self.model, DistributedDataParallel): assert not (self.model.device_ids and len(self.model.device_ids) > 1), unsupported from torch.cuda.amp.grad_scaler import GradScaler self.grad_scaler = GradScaler() else: self.grad_scaler = None def resume_or_load(self, resume=True): """ If `resume==True` and `cfg.OUTPUT_DIR` contains the last checkpoint (defined by a `last_checkpoint` file), resume from the file. Resuming means loading all available states (eg. optimizer and scheduler) and update iteration counter from the checkpoint. ``cfg.MODEL.WEIGHTS`` will not be used. Otherwise, this is considered as an independent training. The method will load model weights from the file `cfg.MODEL.WEIGHTS` (but will not load other states) and start from iteration 0. Args: resume (bool): whether to do resume or not """ # The checkpoint stores the training iteration that just finished, thus we start # at the next iteration (or iter zero if there's no checkpoint). checkpoint = self.checkpointer.resume_or_load(self.cfg.MODEL.WEIGHTS, resume=resume) if resume and self.checkpointer.has_checkpoint(): self.start_epoch = checkpoint.get("epoch", -1) + 1 # The checkpoint stores the training iteration that just finished, thus we start # at the next iteration (or iter zero if there's no checkpoint). def build_hooks(self): """ Build a list of default hooks, including timing, evaluation, checkpointing, lr scheduling, precise BN, writing events. Returns: list[HookBase]: """ logger = logging.getLogger(__name__) cfg = self.cfg.clone() cfg.defrost() cfg.DATALOADER.NUM_WORKERS = 0 # save some memory and time for PreciseBN cfg.DATASETS.NAMES = tuple([cfg.TEST.PRECISE_BN.DATASET ]) # set dataset name for PreciseBN ret = [ hooks.IterationTimer(), hooks.LRScheduler(self.optimizer, self.scheduler), ] if cfg.TEST.PRECISE_BN.ENABLED and hooks.get_bn_modules(self.model): logger.info("Prepare precise BN dataset") ret.append( hooks.PreciseBN( # Run at the same freq as (but before) evaluation. self.model, # Build a new data loader to not affect training self.build_train_loader(cfg), cfg.TEST.PRECISE_BN.NUM_ITER, )) if len(cfg.MODEL.FREEZE_LAYERS) > 0 and cfg.SOLVER.FREEZE_ITERS > 0: ret.append( hooks.LayerFreeze( self.model, cfg.MODEL.FREEZE_LAYERS, cfg.SOLVER.FREEZE_ITERS, )) # Do PreciseBN before checkpointer, because it updates the model and need to # be saved by checkpointer. # This is not always the best: if checkpointing has a different frequency, # some checkpoints may have more precise statistics than others. def test_and_save_results(): self._last_eval_results = self.test(self.cfg, self.model) return self._last_eval_results # Do evaluation before checkpointer, because then if it fails, # we can use the saved checkpoint to debug. ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results)) if comm.is_main_process(): ret.append( hooks.PeriodicCheckpointer(self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD)) # run writers in the end, so that evaluation metrics are written ret.append(hooks.PeriodicWriter(self.build_writers(), 200)) return ret def build_writers(self): """ Build a list of writers to be used. By default it contains writers that write metrics to the screen, a json file, and a tensorboard event file respectively. If you'd like a different list of writers, you can overwrite it in your trainer. Returns: list[EventWriter]: a list of :class:`EventWriter` objects. It is now implemented by: .. code-block:: python return [ CommonMetricPrinter(self.max_iter), JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(self.cfg.OUTPUT_DIR), ] """ # Assume the default print/log frequency. return [ # It may not always print what you want to see, since it prints "common" metrics only. CommonMetricPrinter(self.max_iter), JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")), TensorboardXWriter(self.cfg.OUTPUT_DIR), ] def train(self): """ Run training. Returns: OrderedDict of results, if evaluation is enabled. Otherwise None. """ super().train(self.start_epoch, self.max_epoch, self.iters_per_epoch) if comm.is_main_process(): assert hasattr(self, "_last_eval_results" ), "No evaluation results obtained during training!" return self._last_eval_results def run_step(self): assert self.model.training, f"[{self.__class__.__name__}] model was changed to eval mode!" if self.cfg.SOLVER.AMP.ENABLED: assert torch.cuda.is_available( ), f"[{self.__class__.__name__}] CUDA is required for AMP training!" from torch.cuda.amp.autocast_mode import autocast start = time.perf_counter() data = next(self._data_loader_iter) data_time = time.perf_counter() - start if self.cfg.SOLVER.AMP.ENABLED: with autocast(): loss_dict = self.model(data) losses = sum(loss_dict.values()) self.optimizer.zero_grad() self.grad_scaler.scale(losses).backward() self._write_metrics(loss_dict, data_time) self.grad_scaler.step(self.optimizer) self.grad_scaler.update() else: loss_dict = self.model(data) losses = sum(loss_dict.values()) self.optimizer.zero_grad() losses.backward() self._write_metrics(loss_dict, data_time) self.optimizer.step() if isinstance(self.param_wrapper, ContiguousParams): self.param_wrapper.assert_buffer_is_valid() def _write_metrics(self, loss_dict: Dict[str, torch.Tensor], data_time: float): """ Args: loss_dict (dict): dict of scalar losses data_time (float): time taken by the dataloader iteration """ device = next(iter(loss_dict.values())).device # Use a new stream so these ops don't wait for DDP or backward with torch.cuda.stream(torch.cuda.Stream() if device.type == "cuda" else None): metrics_dict = { k: v.detach().cpu().item() for k, v in loss_dict.items() } metrics_dict["data_time"] = data_time # Gather metrics among all workers for logging # This assumes we do DDP-style training, which is currently the only # supported method in detectron2. all_metrics_dict = comm.gather(metrics_dict) if comm.is_main_process(): storage = get_event_storage() # data_time among workers can have high variance. The actual latency # caused by data_time is the maximum among workers. data_time = np.max([x.pop("data_time") for x in all_metrics_dict]) storage.put_scalar("data_time", data_time) # average the rest metrics metrics_dict = { k: np.mean([x[k] for x in all_metrics_dict]) for k in all_metrics_dict[0].keys() } total_losses_reduced = sum(metrics_dict.values()) if not np.isfinite(total_losses_reduced): raise FloatingPointError( f"Loss became infinite or NaN at iteration={self.iter}!\n" f"loss_dict = {metrics_dict}") storage.put_scalar("total_loss", total_losses_reduced) if len(metrics_dict) > 1: storage.put_scalars(**metrics_dict) @classmethod def build_model(cls, cfg, show_model=True): """ Returns: torch.nn.Module: It now calls :func:`fastreid.modeling.build_model`. Overwrite it if you'd like a different model. """ model = build_model(cfg) if show_model: logger = logging.getLogger('fastreid') logger.info("Model:\n{}".format(model)) return model @classmethod def build_optimizer(cls, cfg, model): """ Returns: torch.optim.Optimizer: It now calls :func:`fastreid.solver.build_optimizer`. Overwrite it if you'd like a different optimizer. """ return build_optimizer(cfg, model) @classmethod def build_lr_scheduler(cls, cfg, optimizer, iters_per_epoch): """ It now calls :func:`fastreid.solver.build_lr_scheduler`. Overwrite it if you'd like a different scheduler. """ return build_lr_scheduler(cfg, optimizer, iters_per_epoch) @classmethod def build_train_loader(cls, cfg): """ Returns: iterable It now calls :func:`fastreid.data.build_reid_train_loader`. Overwrite it if you'd like a different data loader. """ logger = logging.getLogger(__name__) logger.info("Prepare training set") return build_reid_train_loader(cfg, combineall=cfg.DATASETS.COMBINEALL) @classmethod def build_test_loader(cls, cfg, dataset_name): """ Returns: iterable It now calls :func:`fastreid.data.build_reid_test_loader`. Overwrite it if you'd like a different data loader. """ return build_reid_test_loader(cfg, dataset_name=dataset_name) @classmethod def build_evaluator(cls, cfg, dataset_name, output_dir=None): data_loader, num_query = cls.build_test_loader(cfg, dataset_name) return data_loader, ReidEvaluator(cfg, num_query, output_dir) @classmethod def test(cls, cfg, model): """ Args: cfg (CfgNode): model (nn.Module): Returns: dict: a dict of result metrics """ logger = logging.getLogger(__name__) results = OrderedDict() for idx, dataset_name in enumerate(cfg.DATASETS.TESTS): logger.info("Prepare testing set") try: data_loader, evaluator = cls.build_evaluator(cfg, dataset_name) except NotImplementedError: logger.warn( "No evaluator found. implement its `build_evaluator` method." ) results[dataset_name] = {} continue results_i = inference_on_dataset(model, data_loader, evaluator, flip_test=cfg.TEST.FLIP.ENABLED) results[dataset_name] = results_i if comm.is_main_process(): assert isinstance( results, dict ), "Evaluator must return a dict on the main process. Got {} instead.".format( results) logger.info("Evaluation results for {} in csv format:".format( dataset_name)) results_i['dataset'] = dataset_name print_csv_format(results_i) if len(results) == 1: results = list(results.values())[0] return results @staticmethod def auto_scale_hyperparams(cfg, num_classes): r""" This is used for auto-computation actual training iterations, because some hyper-param, such as MAX_ITER, means training epochs rather than iters, so we need to convert specific hyper-param to training iterations. """ cfg = cfg.clone() frozen = cfg.is_frozen() cfg.defrost() # If you don't hard-code the number of classes, it will compute the number automatically if cfg.MODEL.HEADS.NUM_CLASSES == 0: output_dir = cfg.OUTPUT_DIR cfg.MODEL.HEADS.NUM_CLASSES = num_classes logger = logging.getLogger(__name__) logger.info( f"Auto-scaling the num_classes={cfg.MODEL.HEADS.NUM_CLASSES}") # Update the saved config file to make the number of classes valid if comm.is_main_process() and output_dir: # Note: some of our scripts may expect the existence of # config.yaml in output directory path = os.path.join(output_dir, "config.yaml") with PathManager.open(path, "w") as f: f.write(cfg.dump()) if frozen: cfg.freeze() return cfg