def setup_env():
    """Sets up environment for training or testing."""
    if dist.is_master_proc():
        # Ensure that the output dir exists
        pathmgr.mkdirs(cfg.OUT_DIR)
        # Save the config
        config.dump_cfg()
    # Setup logging
    logging.setup_logging()
    # Log torch, cuda, and cudnn versions
    version = [torch.__version__, torch.version.cuda, torch.backends.cudnn.version()]
    logger.info("PyTorch Version: torch={}, cuda={}, cudnn={}".format(*version))
    env = "".join([f"{key}: {value}\n" for key, value in sorted(os.environ.items())])
    logger.info(f"os.environ:\n{env}")
    # Log the config as both human readable and as a json
    if cfg.VERBOSE:
        logger.info("Config:\n{}".format(cfg))
    logger.info(logging.dump_log_data(cfg, "cfg", None))
    # Fix the RNG seeds (see RNG comment in core/config.py for discussion)
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)
    random.seed(cfg.RNG_SEED)
    # Configure the CUDNN backend
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
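
# The seeding above makes runs repeatable at the RNG level, but with
# cfg.CUDNN.BENCHMARK enabled, cudnn may select different kernels across runs,
# so results are not bit-wise reproducible. A minimal sketch of a fully
# deterministic variant (standard PyTorch flags; the helper name is an
# assumption, not part of the source):
import random

import numpy as np
import torch


def seed_everything(seed: int) -> None:
    """Seeds all RNGs and disables non-deterministic cudnn behavior."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True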
def setup_logging():
    """Sets up the logging."""
    # Enable logging only for the master process
    if dist.is_master_proc():
        # Log to both a file under OUT_DIR and to the console
        logging.basicConfig(
            level=logging.DEBUG,
            format=colored("[%(asctime)s]", "green") + " %(message)s",
            datefmt="%m/%d %H:%M:%S",
            handlers=[
                logging.FileHandler(os.path.join(cfg.OUT_DIR, cfg.LOG_DEST)),
                logging.StreamHandler(),
            ],
        )
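
# Note: logging.basicConfig() silently does nothing if the root logger already
# has handlers (e.g. installed by an imported library). On Python 3.8+ this can
# be guarded with force=True. A self-contained sketch (function name and
# parameters are assumptions for illustration):
import logging
import os
import sys


def setup_logging_forced(out_dir: str, log_file: str) -> None:
    """Configures root logging, replacing any pre-existing handlers."""
    logging.basicConfig(
        level=logging.DEBUG,
        format="[%(asctime)s] %(message)s",
        datefmt="%m/%d %H:%M:%S",
        force=True,  # Python 3.8+: remove handlers set up by other modules
        handlers=[
            logging.FileHandler(os.path.join(out_dir, log_file)),
            logging.StreamHandler(sys.stdout),
        ],
    )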
def save_checkpoint(model, optimizer, epoch):
    """Saves a checkpoint."""
    # Save checkpoints only from the master process
    if not dist.is_master_proc():
        return
    # Ensure that the checkpoint dir exists
    os.makedirs(get_checkpoint_dir(), exist_ok=True)
    # Omit the DDP wrapper in the multi-gpu setting
    sd = model.module.state_dict() if cfg.NUM_GPUS > 1 else model.state_dict()
    # Record the state
    if isinstance(optimizer, list):
        checkpoint = {
            "epoch": epoch,
            "model_state": sd,
            "optimizer_w_state": optimizer[0].state_dict(),
            "optimizer_a_state": optimizer[1].state_dict(),
            "cfg": cfg.dump(),
        }
    else:
        checkpoint = {
            "epoch": epoch,
            "model_state": sd,
            "optimizer_state": optimizer.state_dict(),
            "cfg": cfg.dump(),
        }
    # Write the checkpoint
    checkpoint_file = get_checkpoint(epoch + 1)
    torch.save(checkpoint, checkpoint_file)
    return checkpoint_file
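
# The list branch above stores two optimizers under separate keys, as in
# differentiable architecture search where network weights and architecture
# parameters are optimized separately. A usage sketch (the .weights() and
# .alphas() accessors are assumptions for illustration; torch, model, and
# epoch as in the surrounding code):
optimizer_w = torch.optim.SGD(model.weights(), lr=0.025, momentum=0.9)
optimizer_a = torch.optim.Adam(model.alphas(), lr=3e-4)
checkpoint_file = save_checkpoint(model, [optimizer_w, optimizer_a], epoch)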
def get_weights_file(weights_file):
    """Download weights file if stored as a URL."""
    # Only the local master downloads; the barrier below keeps other ranks
    # from reading the file before the download completes
    download = dist.is_master_proc(local=True)
    weights_file = cache_url(weights_file, cfg.DOWNLOAD_CACHE, download=download)
    if cfg.NUM_GPUS > 1:
        torch.distributed.barrier()
    return weights_file
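
# cache_url maps a URL to a local cache path, downloading at most once. A
# minimal sketch of such a helper (assumption: the real helper may also verify
# checksums; this illustrates only the download-once contract):
import os
import urllib.request


def cache_url_sketch(url_or_path: str, cache_dir: str, download: bool = True) -> str:
    """Returns a local path for url_or_path, fetching it into cache_dir if needed."""
    if not url_or_path.startswith(("http://", "https://")):
        return url_or_path  # already a local file
    local_path = os.path.join(cache_dir, os.path.basename(url_or_path))
    if download and not os.path.exists(local_path):
        os.makedirs(cache_dir, exist_ok=True)
        urllib.request.urlretrieve(url_or_path, local_path)
    return local_path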
def setup_logging():
    """Sets up the logging."""
    # Enable logging only for the master process
    if dist.is_master_proc():
        logging.basicConfig(
            level=logging.DEBUG,
            format=colored("[%(asctime)s]", "green") + " %(message)s",
            datefmt="%m/%d %H:%M:%S",
            handlers=[
                logging.FileHandler(os.path.join(cfg.OUT_DIR, cfg.LOG_DEST)),
                logging.StreamHandler(),
            ],
        )
def setup_env():
    """Sets up environment for training or testing."""
    if dist.is_master_proc():
        # Ensure that the output dir exists
        os.makedirs(cfg.OUT_DIR, exist_ok=True)
        # Save the config
        config.dump_cfg()
    # Setup logging
    logging.setup_logging()
    # Log the config
    logger.info("Config:\n{}".format(cfg))
    # Fix the RNG seeds (see RNG comment in core/config.py for discussion)
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)
    # Configure the CUDNN backend
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
def setup_logging():
    """Sets up the logging."""
    # Enable logging only for the master process
    if dist.is_master_proc():
        # Clear the root logger to prevent any existing logging config
        # (e.g. set by another module) from messing with our setup
        logging.root.handlers = []
        # Construct logging configuration
        logging_config = {"level": logging.INFO, "format": _FORMAT}
        # Log either to stdout or to a file
        if cfg.LOG_DEST == "stdout":
            logging_config["stream"] = sys.stdout
        else:
            logging_config["filename"] = os.path.join(cfg.OUT_DIR, _LOG_FILE)
        # Configure logging
        logging.basicConfig(**logging_config)
    else:
        _suppress_print()
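
# _suppress_print is referenced above but not defined here. A minimal sketch of
# such a helper (assumption): swap builtins.print for a no-op so non-master
# processes stay silent even where code calls print() directly.
import builtins


def _suppress_print():
    """Replaces the built-in print with a no-op."""
    def print_nothing(*args, **kwargs):
        pass
    builtins.print = print_nothing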
def save_checkpoint(model, optimizer, epoch):
    """Saves a checkpoint."""
    # Save checkpoints only from the master process
    if not dist.is_master_proc():
        return
    # Ensure that the checkpoint dir exists
    os.makedirs(get_checkpoint_dir(), exist_ok=True)
    # Record the state
    checkpoint = {
        "epoch": epoch,
        "model_state": unwrap_model(model).state_dict(),
        "optimizer_state": optimizer.state_dict(),
        "cfg": cfg.dump(),
    }
    # Write the checkpoint
    checkpoint_file = get_checkpoint(epoch + 1)
    torch.save(checkpoint, checkpoint_file)
    return checkpoint_file
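
# unwrap_model is referenced above but not defined here. A minimal sketch
# (assumption, mirroring the explicit DDP unwrapping in the earlier
# save_checkpoint variant):
import torch.nn as nn


def unwrap_model(model: nn.Module) -> nn.Module:
    """Returns the bare module underneath a (Distributed)DataParallel wrapper."""
    parallel_types = (nn.parallel.DistributedDataParallel, nn.DataParallel)
    return model.module if isinstance(model, parallel_types) else model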
def save_checkpoint(model, optimizer, epoch, best):
    """Saves a checkpoint."""
    # Save checkpoints only from the master process
    if not dist.is_master_proc():
        return
    # Ensure that the checkpoint dir exists
    pathmgr.mkdirs(get_checkpoint_dir())
    # Record the state
    checkpoint = {
        "epoch": epoch,
        "model_state": unwrap_model(model).state_dict(),
        "optimizer_state": optimizer.state_dict(),
        "cfg": cfg.dump(),
    }
    # Write the checkpoint
    checkpoint_file = get_checkpoint(epoch + 1)
    with pathmgr.open(checkpoint_file, "wb") as f:
        torch.save(checkpoint, f)
    # If best, copy the checkpoint to the best checkpoint
    if best:
        pathmgr.copy(checkpoint_file, get_checkpoint_best())
    return checkpoint_file
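
# Usage sketch (assumption: the caller tracks the best error so far and passes
# the flag; test_epoch and best_err are illustrative names, not part of the
# source):
err = test_epoch(model)  # hypothetical evaluation helper returning an error metric
save_checkpoint(model, optimizer, epoch, best=err < best_err)
best_err = min(best_err, err)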
def save_checkpoint(model, model_ema, optimizer, epoch, test_err, ema_err):
    """Saves a checkpoint and also the best weights so far in a best checkpoint."""
    # Save checkpoints only from the master process
    if not dist.is_master_proc():
        return
    # Ensure that the checkpoint dir exists
    pathmgr.mkdirs(get_checkpoint_dir())
    # Record the state
    checkpoint = {
        "epoch": epoch,
        "test_err": test_err,
        "ema_err": ema_err,
        "model_state": unwrap_model(model).state_dict(),
        "ema_state": unwrap_model(model_ema).state_dict(),
        "optimizer_state": optimizer.state_dict(),
        "cfg": cfg.dump(),
    }
    # Write the checkpoint
    checkpoint_file = get_checkpoint(epoch + 1)
    with pathmgr.open(checkpoint_file, "wb") as f:
        torch.save(checkpoint, f)
    # Store the best model and model_ema weights so far
    if not pathmgr.exists(get_checkpoint_best()):
        pathmgr.copy(checkpoint_file, get_checkpoint_best())
    else:
        with pathmgr.open(get_checkpoint_best(), "rb") as f:
            best = torch.load(f, map_location="cpu")
        # Select the best model weights and the best model_ema weights
        if test_err < best["test_err"] or ema_err < best["ema_err"]:
            if test_err < best["test_err"]:
                best["model_state"] = checkpoint["model_state"]
                best["test_err"] = test_err
            if ema_err < best["ema_err"]:
                best["ema_state"] = checkpoint["ema_state"]
                best["ema_err"] = ema_err
            with pathmgr.open(get_checkpoint_best(), "wb") as f:
                torch.save(best, f)
    return checkpoint_file
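
# A matching load sketch (assumption: key names mirror the checkpoint dict
# above; pathmgr, torch, and unwrap_model as in the surrounding code):
def load_checkpoint_sketch(checkpoint_file, model, model_ema=None, optimizer=None):
    """Restores model/ema/optimizer state from a checkpoint; returns the epoch."""
    with pathmgr.open(checkpoint_file, "rb") as f:
        checkpoint = torch.load(f, map_location="cpu")
    unwrap_model(model).load_state_dict(checkpoint["model_state"])
    if model_ema is not None and "ema_state" in checkpoint:
        unwrap_model(model_ema).load_state_dict(checkpoint["ema_state"])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint["optimizer_state"])
    return checkpoint["epoch"]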