def stage_main(args, cfg, build):
    """Reproduce a single training step from a saved debug checkpoint."""
    logger = logging.getLogger(__name__)
    assert comm.get_world_size() == 1, "DEBUG mode only supported for 1 GPU"

    cfg.merge_from_list(args.opts)
    cfg, logger = default_setup(cfg, args)
    model = build(cfg)
    optimizer = build_optimizer(cfg, model)
    debug_ckpt = Checkpointer(model, resume=True, optimizer=optimizer)

    ckpt_file = args.ckpt_file
    if ckpt_file is None:
        # find the latest checkpoint in the log dir if ckpt_file is not given
        log_dir = "./log"
        matched_files = [
            os.path.join(log_dir, f)
            for f in os.listdir(log_dir)
            if re.match(r"debug_.*\.pth", f) is not None
        ]
        ckpt_file = sorted(matched_files, key=os.path.getatime)[-1]

    left_dict = debug_ckpt.load(ckpt_file)
    assert "inputs" in left_dict, "input data not found in checkpoint"
    data = left_dict["inputs"]

    trainer = DebugTrainer(model, data, optimizer)
    logger.info("start running model")
    trainer.run_step()
    logger.info("finish debugging")
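
# NOTE (hedged sketch): stage_main() expects a "debug_*.pth" file whose leftover
# dict contains an "inputs" key. The helper below is a hypothetical illustration
# of how a training run could dump such a checkpoint; the fvcore-style
# Checkpointer constructor (`save_dir` argument) and the file-name pattern are
# assumptions for illustration, not part of this repo's confirmed API.
def save_debug_checkpoint(model, optimizer, inputs, iteration):
    ckpt = Checkpointer(model, save_dir="./log", optimizer=optimizer)
    # extra keyword arguments are stored verbatim and come back from load()
    # as the "left over" dict that stage_main() reads "inputs" from
    ckpt.save("debug_iter{:07d}".format(iteration), inputs=inputs)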
def benchmark_train(args):
    """Benchmark training speed on a fixed set of cached samples."""
    cfg = setup(args)
    model = build_model(cfg)
    logger.info("Model:\n{}".format(model))  # `logger` is a module-level logger
    if comm.get_world_size() > 1:
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    optimizer = build_optimizer(cfg, model)
    checkpointer = DetectionCheckpointer(model, optimizer=optimizer)
    checkpointer.load(cfg.MODEL.WEIGHTS)

    cfg.defrost()
    cfg.DATALOADER.NUM_WORKERS = 0
    data_loader = build_detection_train_loader(cfg)
    # cache 100 samples and replay them forever, so data loading is not the bottleneck
    dummy_data = list(itertools.islice(data_loader, 100))

    def f():
        while True:
            yield from DatasetFromList(dummy_data, copy=False)

    max_iter = 400
    trainer = SimpleTrainer(model, f(), optimizer)
    trainer.register_hooks(
        [hooks.IterationTimer(), hooks.PeriodicWriter([CommonMetricPrinter(max_iter)])]
    )
    trainer.train(1, max_iter)
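
# Hypothetical driver for benchmark_train(), assuming the detectron2-style
# `default_argument_parser` and `launch` helpers are available in this repo.
# This is a sketch, not the repo's actual entry point.
if __name__ == "__main__":
    args = default_argument_parser().parse_args()
    launch(
        benchmark_train,
        args.num_gpus,
        num_machines=args.num_machines,
        machine_rank=args.machine_rank,
        dist_url=args.dist_url,
        args=(args,),
    )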
@classmethod
def build_optimizer(cls, cfg, model):
    """
    Returns:
        torch.optim.Optimizer:

    It now calls :func:`cvpods.solver.build_optimizer`.
    Overwrite it if you'd like a different optimizer.
    """
    return build_optimizer(cfg, model)
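
# Sketch: how a user could override build_optimizer in a Trainer subclass to
# swap in a different optimizer, as the docstring above suggests. The enclosing
# class name (`DefaultTrainer`) and the use of cfg.SOLVER.BASE_LR are
# assumptions for illustration only.
class MyTrainer(DefaultTrainer):
    @classmethod
    def build_optimizer(cls, cfg, model):
        import torch
        return torch.optim.AdamW(model.parameters(), lr=cfg.SOLVER.BASE_LR)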
@classmethod
def build_optimizer(cls, cfg, model):
    """
    Returns:
        torch.optim.Optimizer:
    """
    return build_optimizer(cfg, model)
def do_train(cfg, model, resume=False):
    model.train()
    optimizer = build_optimizer(cfg, model)
    scheduler = build_lr_scheduler(cfg, optimizer)

    checkpointer = DefaultCheckpointer(
        model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler
    )
    start_iter = (
        checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1
    )
    max_iter = cfg.SOLVER.MAX_ITER
    periodic_checkpointer = PeriodicCheckpointer(
        checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter
    )

    writers = (
        [
            CommonMetricPrinter(max_iter),
            JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")),
            TensorboardXWriter(cfg.OUTPUT_DIR),
        ]
        if comm.is_main_process()
        else []
    )

    # compared to "train_net.py", we do not support accurate timing and
    # precise BN here, because they are not trivial to implement
    data_loader = build_train_loader(cfg)
    logger.info("Starting training from iteration {}".format(start_iter))
    with EventStorage(start_iter) as storage:
        for data, iteration in zip(data_loader, range(start_iter, max_iter)):
            iteration = iteration + 1
            storage.step()

            loss_dict = model(data)
            losses = sum(loss for loss in loss_dict.values())
            assert torch.isfinite(losses).all(), loss_dict

            loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()}
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            if comm.is_main_process():
                storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False)
            scheduler.step()

            if (
                cfg.TEST.EVAL_PERIOD > 0
                and iteration % cfg.TEST.EVAL_PERIOD == 0
                and iteration != max_iter
            ):
                do_test(cfg, model)
                # Compared to "train_net.py", the test results are not dumped to EventStorage
                comm.synchronize()

            if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter):
                for writer in writers:
                    writer.write()
            periodic_checkpointer.step(iteration)
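
# Sketch of a caller for do_train(); `setup`, `build_model`, and `do_test` are
# assumed to be the same helpers used by the functions above, and the argument
# parsing / launch step is omitted. Illustration only.
def main(args):
    cfg = setup(args)
    model = build_model(cfg)
    if comm.get_world_size() > 1:
        # same DistributedDataParallel wrapping as in benchmark_train() above
        model = DistributedDataParallel(
            model, device_ids=[comm.get_local_rank()], broadcast_buffers=False
        )
    do_train(cfg, model, resume=args.resume)
    return do_test(cfg, model)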