def compute_time_full(model, loss_fun, train_loader, test_loader):
    """Times model and data loader."""
    logger.info("Computing model and loader timings...")
    # Compute timings
    test_fw_time = compute_time_eval(model)
    train_fw_time, train_bw_time = compute_time_train(model, loss_fun)
    train_fw_bw_time = train_fw_time + train_bw_time
    train_loader_time = compute_time_loader(train_loader)
    # Output iter timing
    iter_times = {
        "test_fw_time": test_fw_time,
        "train_fw_time": train_fw_time,
        "train_bw_time": train_bw_time,
        "train_fw_bw_time": train_fw_bw_time,
        "train_loader_time": train_loader_time,
    }
    logger.info(logging.dump_log_data(iter_times, "iter_times"))
    # Output epoch timing
    epoch_times = {
        "test_fw_time": test_fw_time * len(test_loader),
        "train_fw_time": train_fw_time * len(train_loader),
        "train_bw_time": train_bw_time * len(train_loader),
        "train_fw_bw_time": train_fw_bw_time * len(train_loader),
        "train_loader_time": train_loader_time * len(train_loader),
    }
    logger.info(logging.dump_log_data(epoch_times, "epoch_times"))
    # Compute data loader overhead (assuming DATA_LOADER.NUM_WORKERS>1)
    overhead = max(0, train_loader_time - train_fw_bw_time) / train_fw_bw_time
    logger.info("Overhead of data loader is {:.2f}%".format(overhead * 100))
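# A minimal sketch of how compute_time_full might be driven from a benchmark
# entry point, using the setup_env/setup_model helpers defined below. The
# loss and loader constructors (builders.build_loss_fun,
# data_loader.construct_train_loader, data_loader.construct_test_loader) are
# assumptions inferred from the surrounding builder-style code, not confirmed
# by these snippets.
def time_model_and_loader():
    """Times model and data loader (hypothetical entry point)."""
    setup_env()
    model = setup_model()
    loss_fun = builders.build_loss_fun().cuda()
    train_loader = data_loader.construct_train_loader()
    test_loader = data_loader.construct_test_loader()
    compute_time_full(model, loss_fun, train_loader, test_loader)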
def setup_env():
    """Sets up environment for training or testing."""
    if dist.is_main_proc():
        # Ensure that the output dir exists
        pathmgr.mkdirs(cfg.OUT_DIR)
        # Save the config
        config.dump_cfg()
    # Setup logging
    logging.setup_logging()
    # Log torch, cuda, and cudnn versions
    version = [torch.__version__, torch.version.cuda, torch.backends.cudnn.version()]
    logger.info("PyTorch Version: torch={}, cuda={}, cudnn={}".format(*version))
    env = "".join([f"{key}: {value}\n" for key, value in sorted(os.environ.items())])
    logger.info(f"os.environ:\n{env}")
    # Log the config as both human readable and as a json
    logger.info("Config:\n{}".format(cfg)) if cfg.VERBOSE else ()
    logger.info(logging.dump_log_data(cfg, "cfg", None))
    # Fix the RNG seeds (see RNG comment in core/config.py for discussion)
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)
    random.seed(cfg.RNG_SEED)
    # Configure the CUDNN backend
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
def compute_time_model(model, loss_fun):
    """Times model."""
    logger.info("Computing model timings only...")
    # Compute timings
    test_fw_time = compute_time_eval(model)
    train_fw_time, train_bw_time = compute_time_train(model, loss_fun)
    train_fw_bw_time = train_fw_time + train_bw_time
    # Output iter timing
    iter_times = {
        "test_fw_time": test_fw_time,
        "train_fw_time": train_fw_time,
        "train_bw_time": train_bw_time,
        "train_fw_bw_time": train_fw_bw_time,
    }
    logger.info(logging.dump_log_data(iter_times, "iter_times"))
def setup_env():
    """Sets up environment for training or testing."""
    if dist.is_master_proc():
        # Ensure that the output dir exists
        os.makedirs(cfg.OUT_DIR, exist_ok=True)
        # Save the config
        config.dump_cfg()
    # Setup logging
    logging.setup_logging()
    # Log the config as both human readable and as a json
    logger.info("Config:\n{}".format(cfg))
    logger.info(logging.dump_log_data(cfg, "cfg"))
    # Fix the RNG seeds (see RNG comment in core/config.py for discussion)
    np.random.seed(cfg.RNG_SEED)
    torch.manual_seed(cfg.RNG_SEED)
    # Configure the CUDNN backend
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
def setup_model():
    """Sets up a model for training or testing and logs the results."""
    # Build the model
    model = builders.build_model()
    logger.info("Model:\n{}".format(model)) if cfg.VERBOSE else ()
    # Log model complexity
    logger.info(logging.dump_log_data(net.complexity(model), "complexity"))
    # Transfer the model to the current GPU device
    err_str = "Cannot use more GPU devices than available"
    assert cfg.NUM_GPUS <= torch.cuda.device_count(), err_str
    cur_device = torch.cuda.current_device()
    model = model.cuda(device=cur_device)
    # Use multi-process data parallel model in the multi-gpu setting
    if cfg.NUM_GPUS > 1:
        # Make model replica operate on the current device
        ddp = torch.nn.parallel.DistributedDataParallel
        model = ddp(module=model, device_ids=[cur_device], output_device=cur_device)
    return model
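# The DDP wrapping above assumes each worker process has already joined a
# process group before setup_model runs. A minimal sketch of that per-process
# initialization, assuming an env:// rendezvous (MASTER_ADDR/MASTER_PORT set
# by the launcher); the rank/world_size arguments and the function name are
# illustrative, not part of the code above.
import torch


def init_distributed(rank, world_size):
    torch.distributed.init_process_group(
        backend="nccl", init_method="env://", rank=rank, world_size=world_size
    )
    # Pin this process to its GPU so torch.cuda.current_device() inside
    # setup_model resolves to the intended device
    torch.cuda.set_device(rank % torch.cuda.device_count())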
def compute_time_model(model, loss_fun):
    """Times model."""
    logger.info("Computing model timings only...")
    if cfg.TRAIN.DATASET == "abnormal":
        logger.info("Excluding abnormal dataset")
        return None
    # Compute timings
    test_fw_time = compute_time_eval(model)
    train_fw_time, train_bw_time = compute_time_train(model, loss_fun)
    train_fw_bw_time = train_fw_time + train_bw_time
    # Output iter timing
    iter_times = {
        "test_fw_time": test_fw_time,
        "train_fw_time": train_fw_time,
        "train_bw_time": train_bw_time,
        "train_fw_bw_time": train_fw_bw_time,
    }
    logger.info(logging.dump_log_data(iter_times, "iter_times"))
def setup_model():
    """Sets up a model for training or testing and logs the results."""
    # Build the model
    model = builders.build_model()
    logger.info("Model:\n{}".format(model)) if cfg.VERBOSE else ()
    # Log model complexity
    logger.info(logging.dump_log_data(net.complexity(model), "complexity"))
    # Transfer the model to the current NPU device
    err_str = "Cannot use more NPU devices than available"
    # assert cfg.NUM_GPUS <= torch.cuda.device_count(), err_str
    assert cfg.NUM_GPUS <= torch.npu.device_count(), err_str
    cur_device = torch.npu.current_device()
    model = model.to(cur_device)
    optimizer = optim.construct_optimizer(model)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=128)
    if cfg.NUM_GPUS > 1:
        # Make model replica operate on the current device
        ddp = torch.nn.parallel.DistributedDataParallel
        model = ddp(model, device_ids=[cur_device], broadcast_buffers=False)
    return model, optimizer
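# Because this variant initializes apex amp (opt_level="O2", loss_scale=128),
# the training loop must route the backward pass through amp.scale_loss so
# the loss scale is applied; calling loss.backward() directly would bypass
# it. A minimal sketch of one training step under that assumption; the
# function name and the inputs/labels placement are illustrative.
def train_step(model, loss_fun, optimizer, inputs, labels):
    model.train()
    preds = model(inputs)
    loss = loss_fun(preds, labels)
    optimizer.zero_grad()
    # Scale the loss before backward so fp16 gradients do not underflow
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()
    return loss.item()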
def log_epoch_stats(self, cur_epoch):
    stats = self.get_epoch_stats(cur_epoch)
    logger.info(logging.dump_log_data(stats, "test_epoch"))
def log_iter_stats(self, cur_epoch, cur_iter):
    if (cur_iter + 1) % cfg.LOG_PERIOD != 0:
        return
    stats = self.get_iter_stats(cur_epoch, cur_iter)
    logger.info(logging.dump_log_data(stats, "test_iter"))
def log_iter_stats(self, cur_epoch, cur_iter):
    if (cur_iter + 1) % cfg.LOG_PERIOD == 0:
        stats = self.get_iter_stats(cur_epoch, cur_iter)
        logger.info(logging.dump_log_data(stats, self.phase + "_iter"))
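# A minimal sketch of how a meter exposing log_iter_stats/log_epoch_stats
# might be driven from an evaluation loop: stats are logged every
# cfg.LOG_PERIOD iterations and summarized once per epoch. The meter's
# internal update calls are elided because their API is not shown here.
@torch.no_grad()
def test_epoch(model, test_loader, meter, cur_epoch):
    model.eval()
    for cur_iter, (inputs, labels) in enumerate(test_loader):
        preds = model(inputs.cuda(non_blocking=True))
        # ... update meter with errors/timings here (meter API assumed) ...
        meter.log_iter_stats(cur_epoch, cur_iter)
    meter.log_epoch_stats(cur_epoch)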