def setup(args):
    """
    Create configs and perform basic setups.
    """
    cfg = get_cfg()
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    default_setup(
        cfg, args
    )  # if you don't like any of the default setup, write your own setup code
    return cfg
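# Illustrative sketch, not part of the original script: how a setup() like the
# one above is typically wired into an entry point with detectron2's launcher.
# default_argument_parser(), launch() and DefaultTrainer come from
# detectron2.engine; using DefaultTrainer here is an assumption for the example.
from detectron2.engine import DefaultTrainer, default_argument_parser, launch


def run_training(args):
    cfg = setup(args)
    trainer = DefaultTrainer(cfg)
    trainer.resume_or_load(resume=args.resume)
    return trainer.train()


if __name__ == "__main__":
    args = default_argument_parser().parse_args()
    launch(
        run_training,
        args.num_gpus,
        num_machines=args.num_machines,
        machine_rank=args.machine_rank,
        dist_url=args.dist_url,
        args=(args,),
    )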
def main(args):
    config.merge_from_list(args.opts)
    cfg, logger = default_setup(config, args)
    if args.debug:
        batches = int(cfg.SOLVER.IMS_PER_DEVICE * args.num_gpus)
        if cfg.SOLVER.IMS_PER_BATCH != batches:
            cfg.SOLVER.IMS_PER_BATCH = batches
            logger.warning(
                "SOLVER.IMS_PER_BATCH is changed to {}".format(batches))

    valid_files = get_valid_files(args, cfg, logger)
    # evaluate every checkpoint file that passed the filter above
    for current_file in valid_files:
        cfg.MODEL.WEIGHTS = current_file
        model = build_model(cfg)

        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume)
        if cfg.TEST.AUG.ENABLED:
            res = Trainer.test_with_TTA(cfg, model)
        else:
            res = Trainer.test(cfg, model)

        if comm.is_main_process():
            verify_results(cfg, res)
def main(args):
    config.merge_from_list(args.opts)
    cfg, logger = default_setup(config, args)
    """
    If you'd like to do anything fancier than the standard training logic,
    consider writing your own training loop or subclassing the runner.
    """
    runner = runner_decrator(RUNNERS.get(cfg.TRAINER.NAME))(cfg, build_model)
    runner.resume_or_load(resume=args.resume)

    # check whether the workspace has enough storage space;
    # assume that a single dumped model is about 700 MB
    file_sys = os.statvfs(cfg.OUTPUT_DIR)
    free_space_Gb = (file_sys.f_bfree * file_sys.f_frsize) / 2**30
    eval_space_Gb = (cfg.SOLVER.LR_SCHEDULER.MAX_ITER
                     // cfg.SOLVER.CHECKPOINT_PERIOD) * 700 / 2**10
    if eval_space_Gb > free_space_Gb:
        logger.warning(f"{Fore.RED}Remaining space({free_space_Gb}GB) "
                       f"is less than ({eval_space_Gb}GB){Style.RESET_ALL}")

    if cfg.TEST.AUG.ENABLED:
        runner.register_hooks([
            hooks.EvalHook(0, lambda: runner.test_with_TTA(cfg, runner.model))
        ])

    logger.info("Running with full config:\n{}".format(cfg))
    base_config = cfg.__class__.__base__()
    logger.info("different config with base class:\n{}".format(
        cfg.diff(base_config)))

    if args.eval_only:
        runner.test(cfg, runner.model)
        return

    runner.train()
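# Hedged sketch, not part of the original script: the disk-space check used in
# the runner above, pulled into standalone helpers so the arithmetic is explicit
# (one checkpoint every CHECKPOINT_PERIOD iterations, ~700 MB each, free space
# read via os.statvfs). The helper names are illustrative.
import os


def estimated_checkpoint_space_gb(max_iter, checkpoint_period, ckpt_mb=700):
    """GB consumed if a ~ckpt_mb MB checkpoint is dumped every checkpoint_period iters."""
    return (max_iter // checkpoint_period) * ckpt_mb / 2 ** 10


def free_space_gb(path):
    """GB currently free on the filesystem that holds `path`."""
    st = os.statvfs(path)
    return st.f_bfree * st.f_frsize / 2 ** 30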
def stage_main(args, cfg, build):
    logger = logging.getLogger(__name__)
    assert comm.get_world_size() == 1, "DEBUG mode only supported for 1 GPU"
    cfg.merge_from_list(args.opts)
    cfg, logger = default_setup(cfg, args)
    model = build(cfg)
    optimizer = build_optimizer(cfg, model)
    debug_ckpt = Checkpointer(model, resume=True, optimizer=optimizer)

    ckpt_file = args.ckpt_file
    if ckpt_file is None:
        # find the latest checkpoint in the log dir if ckpt_file is not given
        log_dir = "./log"
        matched_files = [
            os.path.join(log_dir, files) for files in os.listdir(log_dir)
            if re.match("debug_.*.pth", files) is not None
        ]
        ckpt_file = sorted(matched_files, key=os.path.getatime)[-1]

    left_dict = debug_ckpt.load(ckpt_file)
    assert "inputs" in left_dict, "input data not found in checkpoints"
    data = left_dict["inputs"]

    trainer = DebugTrainer(model, data, optimizer)
    logger.info("start running model")
    trainer.run_step()
    logger.info("finish debugging")
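# Hedged sketch, not part of the original script: one way a "debug_*.pth"
# checkpoint carrying the extra "inputs" entry expected by stage_main() could
# be produced. fvcore's Checkpointer.save() stores arbitrary extra keyword
# arguments in the checkpoint dict; the helper name and log dir are illustrative.
from fvcore.common.checkpoint import Checkpointer


def dump_debug_checkpoint(model, optimizer, batch, log_dir="./log"):
    ckpt = Checkpointer(model, save_dir=log_dir, optimizer=optimizer)
    # "inputs" is the key stage_main() asserts on when it reloads the file.
    ckpt.save("debug_step", inputs=batch)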
def main(args, config, build_model):
    config.merge_from_list(args.opts)
    cfg = default_setup(config, args)
    """
    If you'd like to do anything fancier than the standard training logic,
    consider writing your own training loop or subclassing the runner.
    """
    runner = runner_decrator(RUNNERS.get(cfg.TRAINER.NAME))(cfg, build_model)
    runner.resume_or_load(resume=args.resume)

    extra_hooks = []
    if args.clearml:
        from cvpods.engine.clearml import ClearMLHook
        if comm.is_main_process():
            extra_hooks.append(ClearMLHook())
    if cfg.TEST.AUG.ENABLED:
        extra_hooks.append(
            hooks.EvalHook(0, lambda: runner.test_with_TTA(cfg, runner.model)))
    if extra_hooks:
        runner.register_hooks(extra_hooks)

    logger.info("Running with full config:\n{}".format(cfg))
    base_config = cfg.__class__.__base__()
    logger.info("different config with base class:\n{}".format(
        cfg.diff(base_config)))

    runner.train()

    if comm.is_main_process() and cfg.MODEL.AS_PRETRAIN:
        # convert last ckpt to pretrain format
        convert_to_pretrained_model(
            input=os.path.join(cfg.OUTPUT_DIR, "model_final.pth"),
            save_path=os.path.join(cfg.OUTPUT_DIR,
                                   "model_final_pretrain_weight.pkl"))
def main(args):
    config.merge_from_list(args.opts)
    cfg, logger = default_setup(config, args)
    if args.debug:
        batches = int(cfg.SOLVER.IMS_PER_BATCH / 8 * args.num_gpus)
        if cfg.SOLVER.IMS_PER_BATCH != batches:
            cfg.SOLVER.IMS_PER_BATCH = batches
            logger.warning(
                "SOLVER.IMS_PER_BATCH is changed to {}".format(batches))

    if "MODEL.WEIGHTS" in args.opts:
        if cfg.MODEL.WEIGHTS.endswith(".pth") and not PathManager.exists(
                cfg.MODEL.WEIGHTS):
            ckpt_name = cfg.MODEL.WEIGHTS.split("/")[-1]
            model_prefix = cfg.OUTPUT_DIR.split("cvpods_playground")[1][1:]
            remote_file_path = os.path.join(cfg.OSS.DUMP_PREFIX, model_prefix,
                                            ckpt_name)
            logger.warning(
                f"The specified ckpt file ({cfg.MODEL.WEIGHTS}) was not found locally,"
                f" try to load the corresponding dump file on OSS ({remote_file_path})."
            )
            cfg.MODEL.WEIGHTS = remote_file_path
        valid_files = [cfg.MODEL.WEIGHTS]
    else:
        list_of_files = glob.glob(os.path.join(cfg.OUTPUT_DIR, '*.pth'))
        assert list_of_files, "No checkpoint file found in {}.".format(
            cfg.OUTPUT_DIR)
        list_of_files.sort(key=os.path.getctime)
        latest_file = list_of_files[-1]
        if not args.end_iter:
            valid_files = [latest_file]
        else:
            files = [f for f in list_of_files if str(f) <= str(latest_file)]
            valid_files = []
            for f in files:
                try:
                    model_iter = int(re.split(r'(model_|\.pth)', f)[-3])
                except Exception:
                    logger.warning("remove {}".format(f))
                    continue
                if args.start_iter <= model_iter <= args.end_iter:
                    valid_files.append(f)
            assert valid_files, "No .pth files satisfy your requirement"

    # evaluate every checkpoint file that passed the filter above
    for current_file in valid_files:
        cfg.MODEL.WEIGHTS = current_file
        model = build_model(cfg)

        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume)

        res = Trainer.test(cfg, model)
        if comm.is_main_process():
            verify_results(cfg, res)
        if cfg.TEST.AUG.ENABLED:
            res.update(Trainer.test_with_TTA(cfg, model))
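# Hedged sketch, not part of the original script: what the re.split() call in
# the checkpoint filter above returns for a typical "model_XXXXXXX.pth" name,
# to make the [-3] index less opaque. The filename is only an example.
import re

parts = re.split(r'(model_|\.pth)', "output/model_0049999.pth")
# parts == ['output/', 'model_', '0049999', '.pth', '']
model_iter = int(parts[-3])  # -> 49999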
def stage_main(args, cfg, build):
    cfg.merge_from_list(args.opts)
    cfg, logger = default_setup(cfg, args)
    model_build_func = build
    """
    If you'd like to do anything fancier than the standard training logic,
    consider writing your own training loop or subclassing the trainer.
    """
    trainer = Trainer(cfg, model_build_func)
    trainer.resume_or_load(resume=args.resume)

    if args.eval_only:
        DefaultCheckpointer(trainer.model,
                            save_dir=cfg.OUTPUT_DIR,
                            resume=args.resume).resume_or_load(
                                cfg.MODEL.WEIGHTS, resume=args.resume)
        res = Trainer.test(cfg, trainer.model)
        if comm.is_main_process():
            verify_results(cfg, res)
        if cfg.TEST.AUG.ENABLED:
            res.update(Trainer.test_with_TTA(cfg, trainer.model))
        return res

    # check whether the workspace has enough storage space;
    # assume that a single dumped model is about 700 MB
    file_sys = os.statvfs(cfg.OUTPUT_DIR)
    free_space_Gb = (file_sys.f_bfree * file_sys.f_frsize) / 2**30
    eval_space_Gb = (cfg.SOLVER.LR_SCHEDULER.MAX_ITER
                     // cfg.SOLVER.CHECKPOINT_PERIOD) * 700 / 2**10
    if eval_space_Gb > free_space_Gb:
        logger.warning(f"{Fore.RED}Remaining space({free_space_Gb}GB) "
                       f"is less than ({eval_space_Gb}GB){Style.RESET_ALL}")

    if cfg.TEST.AUG.ENABLED:
        trainer.register_hooks([
            hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))
        ])

    trainer.train()

    if comm.is_main_process() and cfg.MODEL.AS_PRETRAIN:
        # convert last ckpt to pretrain format
        convert_to_pretrained_model(
            input=os.path.join(cfg.OUTPUT_DIR, "model_final.pth"),
            save_path=os.path.join(cfg.OUTPUT_DIR,
                                   "model_final_pretrain_weight.pkl"))