def benchmark_train(args):
    """Benchmark training speed: build the model from the config, then feed it
    a small cached set of batches in an endless loop and time the iterations
    with ``IterationTimer``/``PeriodicWriter`` hooks."""
    cfg = setup(args)

    net = build_model(cfg)
    logger.info("Model:\n{}".format(net))
    if comm.get_world_size() > 1:
        net = DistributedDataParallel(
            net, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )

    optim = build_optimizer(cfg, net)
    DetectionCheckpointer(net, optimizer=optim).load(cfg.MODEL.WEIGHTS)

    # Force a single-process loader so data loading does not skew the numbers.
    cfg.defrost()
    cfg.DATALOADER.NUM_WORKERS = 0
    loader = build_detection_train_loader(cfg)
    cached_batches = list(itertools.islice(loader, 100))

    def endless_batches():
        # Cycle forever over the 100 cached batches (no copies).
        while True:
            yield from DatasetFromList(cached_batches, copy=False)

    total_iters = 400
    bench_trainer = SimpleTrainer(net, endless_batches(), optim)
    bench_trainer.register_hooks(
        [
            hooks.IterationTimer(),
            hooks.PeriodicWriter([CommonMetricPrinter(total_iters)]),
        ]
    )
    bench_trainer.train(1, total_iters)
# ---- Example 2 (original separator: "예제 #2", score 0) ----
 def test_best_checkpointer(self):
     """BestCheckpointer should save exactly when the tracked metric improves
     in the requested direction, and skip NaN values."""
     model = _SimpleModel()
     loader = self._data_loader("cpu")
     optim = torch.optim.SGD(model.parameters(), 0.1)
     metric_key = "metric"
     n_iters, period = 40, 10

     # (mode, stream of per-eval metric values, expected number of saves)
     cases = [
         ("max", iter([0.3, 0.4, 0.35, 0.5]), 3),
         ("min", iter([1.0, 0.8, 0.9, 0.9]), 2),
         ("min", iter([math.nan, 0.8, 0.9, 0.9]), 1),
     ]
     for mode, stream, expected_saves in cases:
         trainer = SimpleTrainer(model, loader, optim)
         with tempfile.TemporaryDirectory(prefix="detectron2_test") as tmp:
             ckpt = Checkpointer(model, tmp, opt=optim, trainer=trainer)
             trainer.register_hooks([
                 hooks.EvalHook(period,
                                lambda: {metric_key: next(stream)}),
                 hooks.BestCheckpointer(period,
                                        ckpt,
                                        metric_key,
                                        mode=mode),
             ])
             # Count saves via a mock instead of inspecting the directory.
             with mock.patch.object(ckpt, "save") as mock_save:
                 trainer.train(0, n_iters)
                 self.assertEqual(mock_save.call_count, expected_saves)
# ---- Example 3 (original separator: "예제 #3", score 0) ----
    def test_writer_hooks(self):
        """Check PeriodicWriter output end to end: the JSON file contains
        entries at the expected iterations with the eval metric in the last
        line, and CommonMetricPrinter logs the expected messages.

        Fix: the log-checking loop variable was named ``iter``, shadowing the
        builtin; renamed to ``expected_iter``.
        """
        model = _SimpleModel(sleep_sec=0.1)
        trainer = SimpleTrainer(model, self._data_loader("cpu"),
                                torch.optim.SGD(model.parameters(), 0.1))

        max_iter = 50

        with tempfile.TemporaryDirectory(prefix="detectron2_test") as d:
            json_file = os.path.join(d, "metrics.json")
            writers = [CommonMetricPrinter(max_iter), JSONWriter(json_file)]

            trainer.register_hooks([
                hooks.EvalHook(0, lambda: {"metric": 100}),
                hooks.PeriodicWriter(writers)
            ])
            with self.assertLogs(writers[0].logger) as logs:
                trainer.train(0, max_iter)

            with open(json_file, "r") as f:
                data = [json.loads(line.strip()) for line in f]
                self.assertEqual([x["iteration"] for x in data],
                                 [19, 39, 49, 50])
                # the eval metric is in the last line with iter 50
                self.assertIn("metric", data[-1],
                              "Eval metric must be in last line of JSON!")

            # test logged messages from CommonMetricPrinter
            self.assertEqual(len(logs.output), 3)
            for log, expected_iter in zip(logs.output, [19, 39, 49]):
                self.assertIn(f"iter: {expected_iter}", log)

            self.assertIn("eta: 0:00:00", logs.output[-1],
                          "Last ETA must be 0!")
# ---- Example 4 (original separator: "예제 #4", score 0) ----
    def test_checkpoint_resume(self):
        """Resuming from a periodic checkpoint must restore the trainer
        iteration, the LR scheduler state, and the optimizer's learning
        rate — even when the new optimizer starts with a bogus LR."""
        net = _SimpleModel()
        loader = self._data_loader("cpu")
        optim = torch.optim.SGD(net.parameters(), 0.1)
        sched = torch.optim.lr_scheduler.StepLR(optim, 3)

        with tempfile.TemporaryDirectory(prefix="detectron2_test") as tmp:
            trainer = SimpleTrainer(net, loader, optim)
            ckpt = Checkpointer(net, tmp, opt=optim, trainer=trainer)

            # The checkpoint hook must run AFTER the scheduler hook so the
            # saved scheduler state reflects the just-finished iteration.
            trainer.register_hooks([
                hooks.LRScheduler(scheduler=sched),
                hooks.PeriodicCheckpointer(ckpt, 10),
            ])

            trainer.train(0, 12)
            self.assertAlmostEqual(optim.param_groups[0]["lr"], 1e-5)
            self.assertEqual(sched.last_epoch, 12)
            del trainer

            # Fresh optimizer with an absurd LR; resume should overwrite it.
            optim = torch.optim.SGD(net.parameters(), 999)
            trainer = SimpleTrainer(net, loader, optim)
            sched = torch.optim.lr_scheduler.StepLR(optim, 3)
            trainer.register_hooks([hooks.LRScheduler(scheduler=sched)])
            ckpt = Checkpointer(net, tmp, opt=optim, trainer=trainer)
            ckpt.resume_or_load("non_exist.pth")
            # last finished iter number (0-based in Trainer)
            self.assertEqual(trainer.iter, 11)
            # number of times `scheduler.step()` was called (1-based)
            self.assertEqual(sched.last_epoch, 12)
            self.assertAlmostEqual(optim.param_groups[0]["lr"], 1e-5)
# ---- Example 5 (original separator: "예제 #5", score 0) ----
    def test_checkpoint_resume(self):
        """After training to iter 12 with a periodic checkpoint, a fresh
        trainer resuming from disk must report the last finished iteration
        and the matching scheduler state."""
        net = _SimpleModel()
        loader = self._data_loader("cpu")
        optim = torch.optim.SGD(net.parameters(), 0.1)
        sched = torch.optim.lr_scheduler.StepLR(optim, 3)

        with tempfile.TemporaryDirectory(prefix="detectron2_test") as tmp:
            trainer = SimpleTrainer(net, loader, optim)
            ckpt = Checkpointer(net, tmp, opt=optim, trainer=trainer)

            trainer.register_hooks([
                hooks.PeriodicCheckpointer(ckpt, 10),
                hooks.LRScheduler(scheduler=sched),
            ])

            trainer.train(0, 12)
            del trainer

            # Resume with a new trainer/scheduler; state comes from disk.
            trainer = SimpleTrainer(net, loader, optim)
            sched = torch.optim.lr_scheduler.StepLR(optim, 3)
            trainer.register_hooks([hooks.LRScheduler(scheduler=sched)])
            ckpt = Checkpointer(net, tmp, opt=optim, trainer=trainer)
            ckpt.resume_or_load("non_exist.pth")
            self.assertEqual(trainer.iter, 11)  # last finished iter
            self.assertEqual(sched.last_epoch, 11)
# ---- Example 6 (original separator: "예제 #6", score 0) ----
    def test_eval_hook(self):
        """EvalHook call counts: runs every `period` iterations plus at the
        end of training; a period of 0 means final evaluation only (per the
        expected counts in the cases below)."""
        net = _SimpleModel()
        loader = self._data_loader("cpu")
        optim = torch.optim.SGD(net.parameters(), 0.1)

        cases = [(30, 15, 2), (31, 15, 3), (20, 0, 1)]
        for total_iter, period, expected_calls in cases:
            eval_fn = mock.Mock(return_value={"metric": 3.0})
            trainer = SimpleTrainer(net, loader, optim)
            trainer.register_hooks([hooks.EvalHook(period, eval_fn)])
            trainer.train(0, total_iter)
            self.assertEqual(eval_fn.call_count, expected_calls)
# ---- Example 7 (original separator: "예제 #7", score 0) ----
    def test_simple_trainer(self, device="cpu"):
        """Smoke test: SimpleTrainer completes 10 iterations on `device`."""
        device = torch.device(device)
        model = SimpleModel(nn.Linear(10, 10)).to(device)

        def endless_batches():
            # Infinite stream of random 3x3 inputs on the target device.
            while True:
                yield torch.rand(3, 3).to(device)

        optim = torch.optim.SGD(model.parameters(), 0.1)
        SimpleTrainer(model, endless_batches(), optim).train(0, 10)
# ---- Example 8 (original separator: "예제 #8", score 0) ----
 def test_simple_trainer(self, device="cpu"):
     """Smoke test: a short SimpleTrainer run finishes on `device`."""
     net = _SimpleModel().to(device=device)
     optim = torch.optim.SGD(net.parameters(), 0.1)
     SimpleTrainer(net, self._data_loader(device), optim).train(0, 10)