def test_checkpoint_resume(self):
    model = _SimpleModel()
    dataloader = self._data_loader("cpu")
    opt = torch.optim.SGD(model.parameters(), 0.1)
    scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)

    with tempfile.TemporaryDirectory(prefix="detectron2_test") as d:
        trainer = SimpleTrainer(model, dataloader, opt)
        checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)

        trainer.register_hooks([
            hooks.LRScheduler(scheduler=scheduler),
            # checkpoint after scheduler to properly save the state of scheduler
            hooks.PeriodicCheckpointer(checkpointer, 10),
        ])

        trainer.train(0, 12)
        self.assertAlmostEqual(opt.param_groups[0]["lr"], 1e-5)
        self.assertEqual(scheduler.last_epoch, 12)
        del trainer

        opt = torch.optim.SGD(model.parameters(), 999)  # lr will be loaded
        trainer = SimpleTrainer(model, dataloader, opt)
        scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)
        trainer.register_hooks([
            hooks.LRScheduler(scheduler=scheduler),
        ])
        checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)
        checkpointer.resume_or_load("non_exist.pth")
        self.assertEqual(trainer.iter, 11)  # last finished iter number (0-based in Trainer)
        # number of times `scheduler.step()` was called (1-based)
        self.assertEqual(scheduler.last_epoch, 12)
        self.assertAlmostEqual(opt.param_groups[0]["lr"], 1e-5)
def test_checkpoint_resume(self):
    model = _SimpleModel()
    dataloader = self._data_loader("cpu")
    opt = torch.optim.SGD(model.parameters(), 0.1)
    scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)

    with tempfile.TemporaryDirectory(prefix="detectron2_test") as d:
        trainer = SimpleTrainer(model, dataloader, opt)
        checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)

        trainer.register_hooks(
            [
                hooks.PeriodicCheckpointer(checkpointer, 10),
                hooks.LRScheduler(scheduler=scheduler),
            ]
        )

        trainer.train(0, 12)
        del trainer

        trainer = SimpleTrainer(model, dataloader, opt)
        scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)
        trainer.register_hooks(
            [
                hooks.LRScheduler(scheduler=scheduler),
            ]
        )
        checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)
        checkpointer.resume_or_load("non_exist.pth")
        self.assertEqual(trainer.iter, 11)  # last finished iter
        self.assertEqual(scheduler.last_epoch, 11)
def training_step(self, batch, batch_idx):
    data_time = time.perf_counter() - self.data_start
    # Need to manually enter/exit since trainer may launch processes
    # This ideally belongs in setup, but setup seems to run before processes are spawned
    if self.storage is None:
        self.storage = EventStorage(0)
        self.storage.__enter__()
        self.iteration_timer.trainer = weakref.proxy(self)
        self.iteration_timer.before_step()
        self.writers = (
            default_writers(self.cfg.OUTPUT_DIR, self.max_iter)
            if comm.is_main_process()
            else {}
        )

    loss_dict = self.model(batch)
    SimpleTrainer.write_metrics(loss_dict, data_time)

    opt = self.optimizers()
    self.storage.put_scalar(
        "lr", opt.param_groups[self._best_param_group_id]["lr"], smoothing_hint=False
    )
    self.iteration_timer.after_step()
    self.storage.step()
    # A little odd to put before_step here, but it's the best way to get a proper timing
    self.iteration_timer.before_step()

    if self.storage.iter % 20 == 0:
        for writer in self.writers:
            writer.write()
    return sum(loss_dict.values())
def __init__(self, cfg, optimization_level):
    logger = logging.getLogger("detectron2")
    if not logger.isEnabledFor(logging.INFO):
        setup_logger()

    model = self.build_model(cfg)
    optimizer = self.build_optimizer(cfg, model)
    model, optimizer = amp.initialize(model, optimizer, opt_level=optimization_level)
    data_loader = self.build_train_loader(cfg)

    SimpleTrainer.__init__(self, model, data_loader, optimizer)

    self.scheduler = self.build_lr_scheduler(cfg, optimizer)
    self.checkpointer = DetectionCheckpointer(
        model,
        cfg.OUTPUT_DIR,
        optimizer=optimizer,
        scheduler=self.scheduler,
    )
    self.start_iter = 0
    self.max_iter = cfg.SOLVER.MAX_ITER
    self.cfg = cfg

    self.register_hooks(self.build_hooks())
def test_simple_trainer(self, device="cpu"): device = torch.device(device) model = SimpleModel(nn.Linear(10, 10)).to(device) def data_loader(): while True: yield torch.rand(3, 3).to(device) trainer = SimpleTrainer(model, data_loader(), torch.optim.SGD(model.parameters(), 0.1)) trainer.train(0, 10)
def __init__(self, cfg, weights: Union[str, Dict[str, Any]]):
    self.start_iter = 0
    self.max_iter = cfg.SOLVER.MAX_ITER
    self.cfg = cfg

    # We do not make any super call here and instead re-implement `__init__` from
    # `DefaultTrainer`: the mixed precision model must be initialized before
    # wrapping in DDP, so it has to be done this way.
    model = self.build_model(cfg)
    optimizer = self.build_optimizer(cfg, model)
    data_loader = self.build_train_loader(cfg)
    scheduler = self.build_lr_scheduler(cfg, optimizer)

    # Load pre-trained weights before wrapping in DDP because `ApexDDP` has
    # some weird issue with `DetectionCheckpointer`.
    # fmt: off
    if isinstance(weights, str):
        # A ``str`` is a path: ImageNet init or resuming training.
        self.start_iter = (
            DetectionCheckpointer(
                model, optimizer=optimizer, scheduler=scheduler
            ).resume_or_load(weights, resume=True).get("iteration", -1) + 1
        )
    elif isinstance(weights, dict):
        # A ``dict`` is a state dict: our pretrain init.
        DetectionCheckpointer(model)._load_model(weights)
    # fmt: on

    # Enable distributed training if we have multiple GPUs. Use Apex DDP for
    # non-FPN backbones because its `delay_allreduce` functionality helps with
    # gradient checkpointing.
    if dist.get_world_size() > 1:
        if global_cfg.get("GRADIENT_CHECKPOINT", False):
            model = ApexDDP(model, delay_allreduce=True)
        else:
            model = nn.parallel.DistributedDataParallel(
                model, device_ids=[dist.get_rank()], broadcast_buffers=False
            )

    # Call `__init__` from the grandparent class: `SimpleTrainer`.
    SimpleTrainer.__init__(self, model, data_loader, optimizer)

    self.scheduler = scheduler
    self.checkpointer = DetectionCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=self.scheduler
    )
    self.register_hooks(self.build_hooks())
def benchmark_train(args):
    cfg = setup(args)
    model = build_model(cfg)
    logger.info("Model:\n{}".format(model))
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    optimizer = build_optimizer(cfg, model)
    checkpointer = DetectionCheckpointer(model, optimizer=optimizer)
    checkpointer.load(cfg.MODEL.WEIGHTS)

    cfg.defrost()
    cfg.DATALOADER.NUM_WORKERS = 0
    data_loader = build_detection_train_loader(cfg)
    dummy_data = list(itertools.islice(data_loader, 100))

    def f():
        while True:
            yield from DatasetFromList(dummy_data, copy=False)

    max_iter = 400
    trainer = SimpleTrainer(model, f(), optimizer)
    trainer.register_hooks(
        [hooks.IterationTimer(), hooks.PeriodicWriter([CommonMetricPrinter(max_iter)])]
    )
    trainer.train(1, max_iter)
def test_writer_hooks(self):
    model = _SimpleModel(sleep_sec=0.1)
    trainer = SimpleTrainer(
        model, self._data_loader("cpu"), torch.optim.SGD(model.parameters(), 0.1)
    )

    max_iter = 50

    with tempfile.TemporaryDirectory(prefix="detectron2_test") as d:
        json_file = os.path.join(d, "metrics.json")
        writers = [CommonMetricPrinter(max_iter), JSONWriter(json_file)]

        trainer.register_hooks([
            hooks.EvalHook(0, lambda: {"metric": 100}),
            hooks.PeriodicWriter(writers),
        ])
        with self.assertLogs(writers[0].logger) as logs:
            trainer.train(0, max_iter)

        with open(json_file, "r") as f:
            data = [json.loads(line.strip()) for line in f]
            self.assertEqual([x["iteration"] for x in data], [19, 39, 49, 50])
            # the eval metric is in the last line with iter 50
            self.assertIn("metric", data[-1], "Eval metric must be in last line of JSON!")

        # test logged messages from CommonMetricPrinter
        self.assertEqual(len(logs.output), 3)
        for log, iter in zip(logs.output, [19, 39, 49]):
            self.assertIn(f"iter: {iter}", log)
        self.assertIn("eta: 0:00:00", logs.output[-1], "Last ETA must be 0!")
def test_best_checkpointer(self):
    model = _SimpleModel()
    dataloader = self._data_loader("cpu")
    opt = torch.optim.SGD(model.parameters(), 0.1)
    metric_name = "metric"
    total_iter = 40
    test_period = 10
    test_cases = [
        ("max", iter([0.3, 0.4, 0.35, 0.5]), 3),
        ("min", iter([1.0, 0.8, 0.9, 0.9]), 2),
        ("min", iter([math.nan, 0.8, 0.9, 0.9]), 1),
    ]
    for mode, metrics, call_count in test_cases:
        trainer = SimpleTrainer(model, dataloader, opt)
        with tempfile.TemporaryDirectory(prefix="detectron2_test") as d:
            checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)
            trainer.register_hooks([
                hooks.EvalHook(test_period, lambda: {metric_name: next(metrics)}),
                hooks.BestCheckpointer(test_period, checkpointer, metric_name, mode=mode),
            ])
            with mock.patch.object(checkpointer, "save") as mock_save_method:
                trainer.train(0, total_iter)
                self.assertEqual(mock_save_method.call_count, call_count)
def test_build_model(self):
    cfg = self._get_default_cfg()
    cfg.INPUT.MIN_SIZE_TRAIN = (60,)
    cfg.MODEL.KMEANS_ANCHORS.KMEANS_ANCHORS_ON = True
    cfg.MODEL.KMEANS_ANCHORS.NUM_CLUSTERS = 3
    cfg.MODEL.KMEANS_ANCHORS.NUM_TRAINING_IMG = 5
    cfg.MODEL.KMEANS_ANCHORS.DATASETS = ("toy_dataset",)

    cfg.MODEL.DEVICE = "cpu"
    cfg.MODEL.ANCHOR_GENERATOR.NAME = "KMeansAnchorGenerator"

    with register_toy_coco_dataset(
        "toy_dataset",
        image_size=(80, 60),  # w, h
        num_images=cfg.MODEL.KMEANS_ANCHORS.NUM_TRAINING_IMG,
    ):
        model = self.runner.build_model(cfg)
        trainer = SimpleTrainer(model, data_loader=[], optimizer=None)
        trainer_hooks = [compute_kmeans_anchors_hook(self.runner, cfg)]
        trainer.register_hooks(trainer_hooks)
        trainer.before_train()
        anchor_generator = model.proposal_generator.anchor_generator
        cell_anchors = list(anchor_generator.cell_anchors)
        gt_anchors = np.array([
            [-20, -15, 20, 15]  # toy_dataset's bbox is half size of image
            for _ in range(cfg.MODEL.KMEANS_ANCHORS.NUM_CLUSTERS)
        ])
        np.testing.assert_allclose(cell_anchors[0], gt_anchors)
def test_build_model(self):
    cfg = self._get_default_cfg()
    cfg.INPUT.MIN_SIZE_TRAIN = (60,)
    cfg.MODEL.KMEANS_ANCHORS.KMEANS_ANCHORS_ON = True
    cfg.MODEL.KMEANS_ANCHORS.NUM_CLUSTERS = 3
    cfg.MODEL.KMEANS_ANCHORS.NUM_TRAINING_IMG = 5
    cfg.MODEL.KMEANS_ANCHORS.DATASETS = ("toy_dataset",)

    cfg.MODEL.DEVICE = "cpu"
    cfg.MODEL.ANCHOR_GENERATOR.NAME = "KMeansAnchorGenerator"

    with make_temp_directory("detectron2go_tmp_dataset") as dataset_dir:
        image_dir = os.path.join(dataset_dir, "images")
        os.makedirs(image_dir)
        image_generator = LocalImageGenerator(image_dir, width=80, height=60)

        with register_toy_dataset(
            "toy_dataset",
            image_generator,
            num_images=cfg.MODEL.KMEANS_ANCHORS.NUM_TRAINING_IMG,
        ):
            model = self.runner.build_model(cfg)
            trainer = SimpleTrainer(model, data_loader=[], optimizer=None)
            trainer_hooks = [compute_kmeans_anchors_hook(self.runner, cfg)]
            trainer.register_hooks(trainer_hooks)
            trainer.before_train()
            anchor_generator = model.proposal_generator.anchor_generator
            cell_anchors = [x for x in anchor_generator.cell_anchors]
            gt_anchors = np.array(
                [
                    [-20, -15, 20, 15]  # toy_dataset's bbox is half size of image
                    for _ in range(cfg.MODEL.KMEANS_ANCHORS.NUM_CLUSTERS)
                ]
            )
            np.testing.assert_allclose(cell_anchors[0], gt_anchors)
def test_eval_hook(self):
    model = _SimpleModel()
    dataloader = self._data_loader("cpu")
    opt = torch.optim.SGD(model.parameters(), 0.1)

    for total_iter, period, eval_count in [(30, 15, 2), (31, 15, 3), (20, 0, 1)]:
        test_func = mock.Mock(return_value={"metric": 3.0})
        trainer = SimpleTrainer(model, dataloader, opt)
        trainer.register_hooks([hooks.EvalHook(period, test_func)])
        trainer.train(0, total_iter)
        self.assertEqual(test_func.call_count, eval_count)
def test_simple_trainer(self, device="cpu"): model = _SimpleModel().to(device=device) trainer = SimpleTrainer(model, self._data_loader(device), torch.optim.SGD(model.parameters(), 0.1)) trainer.train(0, 10)