def initialize(cfg=None):
    cfg = get_cfg(parse_args()) if cfg is None else cfg

    # launch multi-process for DDP
    #  - processes will be branched off at this point
    #  - each subprocess ignores the launching process and returns None
    if cfg.num_machines * cfg.num_gpus > 1:
        log.info(C.green("[!] Launching Multiprocessing.."))
        cfg.spawn_ctx = launch(main_func=initialize,
                               num_gpus_per_machine=cfg.num_gpus,
                               num_machines=cfg.num_machines,
                               machine_rank=cfg.machine_rank,
                               dist_url=cfg.dist_url,
                               args=(cfg, ))
    else:
        cfg.spawn_ctx = None

    # scatter save_dir to all of the non-main ranks
    cfg.save_dir = comm.scatter(cfg.save_dir)

    # finalize config
    C.set_enabled(not cfg.no_color)  # for sub-processes
    cfg.device = comm.get_local_rank()
    cfg.freeze()

    # file logging on the local ranks
    set_stream_handler('comm', cfg.log_level)  # for sub-processes
    log_rank_file = f"log_rank_{comm.get_rank()}.txt"
    set_file_handler('main', cfg.log_level, cfg.save_dir, log_rank_file)
    set_stream_handler('error', cfg.log_level)
    set_file_handler('error', cfg.log_level, cfg.save_dir, "log_error.txt")
    if comm.is_main_process():
        set_file_handler('result', cfg.log_level, "./", "log_result.txt")

    # log distributed learning
    if comm.get_world_size() > 1:
        log.info(f"[DDP] dist_url: {cfg.dist_url}")
        log.info(f"[DDP] global_world_size = {comm.get_world_size()}")
        log.info(f"[DDP] num_gpus_per_machine = {torch.cuda.device_count()}")
        log.info(f"[DDP] machine_rank {cfg.machine_rank} / "
                 f"num_machines = {cfg.num_machines}")
        comm.synchronize()
        log_comm.info(f"[DDP] rank (local: {comm.get_local_rank()}, "
                      f"global: {comm.get_rank()}) has been spawned.")
        comm.synchronize()
        log.info("[DDP] Synchronized across all the ranks.")

    if not cfg.spawn_ctx:
        # This structure (including the customized launch.py) is for compatibility
        # with our internal API. There is no functional difference from the
        # typical usage of the distributed package. Please don't mind this
        # peculiarity and focus on the main algorithm.
        for _ in train(cfg):
            pass

    return cfg
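# For reference: the customized launch() above behaves like a detectron2-style
# launcher. The following is a minimal, hypothetical sketch (not the actual
# launch.py; the worker name `_distributed_worker`, the `simple_launch` wrapper,
# and the default dist_url are illustrative assumptions) showing the plain
# torch.multiprocessing / torch.distributed mechanics such a helper wraps.
import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def _distributed_worker(local_rank, main_func, world_size, dist_url, args):
    # Every spawned process joins the same rendezvous and then runs main_func.
    dist.init_process_group(
        backend="nccl" if torch.cuda.is_available() else "gloo",
        init_method=dist_url,
        world_size=world_size,
        rank=local_rank,
    )
    if torch.cuda.is_available():
        torch.cuda.set_device(local_rank)
    main_func(*args)
    dist.destroy_process_group()


def simple_launch(main_func, num_gpus, dist_url="tcp://127.0.0.1:29500", args=()):
    # Single-machine case: one process per GPU, global rank == local GPU index.
    mp.spawn(_distributed_worker, nprocs=num_gpus,
             args=(main_func, num_gpus, dist_url, args))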
    )

    for i in range(args.epoch):
        train(i, loader, model, optimizer, scheduler, scaler, device)

        if dist.is_primary():
            torch.save(model.state_dict(), f"checkpoint/vqvae_{str(i + 1).zfill(3)}.pt")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("--n_gpu", type=int, default=1)

    port = (
        2**15
        + 2**14
        + hash(os.getuid() if sys.platform != "win32" else 1) % 2**14
    )
    parser.add_argument("--dist_url", default=f"tcp://127.0.0.1:{port}")

    parser.add_argument("--size", type=int, default=256)
    parser.add_argument("--epoch", type=int, default=560)
    parser.add_argument("--lr", type=float, default=3e-4)
    parser.add_argument("--disable-amp", action='store_true')
    parser.add_argument("--sched", type=str)
    parser.add_argument("path", type=str)

    args = parser.parse_args()

    print(args)

    dist.launch(main, args.n_gpu, 1, 0, args.dist_url, args=(args, ))
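# The train() used above is defined elsewhere in this file. A minimal,
# hypothetical sketch of a scaler-aware epoch loop is shown below (the MSE
# reconstruction term, the latent-loss term, and `latent_loss_weight` are
# illustrative assumptions, not the actual implementation). autocast and
# GradScaler degrade to no-ops when AMP is disabled via --disable-amp.
import torch
from torch.cuda.amp import autocast
from torch.nn import functional as F


def train_sketch(epoch, loader, model, optimizer, scheduler, scaler, device):
    model.train()
    latent_loss_weight = 0.25  # illustrative weight for the commitment/latent term

    for img, _ in loader:
        img = img.to(device)
        optimizer.zero_grad(set_to_none=True)

        with autocast(enabled=scaler.is_enabled()):
            out, latent_loss = model(img)
            recon_loss = F.mse_loss(out, img)
            loss = recon_loss + latent_loss_weight * latent_loss.mean()

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        if scheduler is not None:
            scheduler.step()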
parser.add_argument("--save_steps", type=int, default=1000) parser.add_argument("--eval_steps", type=int, default=100) parser.add_argument("--batch_size", type=int, default=128) parser.add_argument("--eval_path", type=str, default=None) parser.add_argument("--config_path", type=str, default=None) parser.add_argument("--output_path", type=str, default=None) parser.add_argument("--device", type=str, default="cuda") parser.add_argument("--save_total_limit", type=int, default=5) parser.add_argument("--fp16", type=bool, default=False) parser.add_argument("--fp16_opt_level", type=str, default="01") parser.add_argument("--img_keys_path", type=str, default=None) parser.add_argument("--img_root_path", type=str, default=None) parser.add_argument("--min_lr", type=float, default=1e-5) parser.add_argument("--cycle_step", type=int, default=2000) args = parser.parse_args() assert os.path.isdir(args.img_keys_path) assert os.path.isdir(args.img_root_path) os.makedirs(args.output_path, exist_ok=True) os.makedirs(args.eval_path, exist_ok=True) print(args, file=sys.stderr, flush=True) proc_num = 1 if args.device == "cuda": proc_num = torch.cuda.device_count() print("proc_num={}".format(proc_num), file=sys.stderr, flush=True) dist.launch(main, proc_num, 1, 0, args.dist_url, args=(args, ))