def train(self):
    """
    Full training logic
    """
    self.not_improved_count = 0
    self.improved_since_last_save = False
    for epoch in range(self.start_epoch, self.epochs + 1):
        print()
        self.data_loader.step(epoch)
        result = self.train_epoch(epoch)

        if self.do_validation:
            self.valid_data_loader.step(epoch)
            val_log = self.valid_epoch(epoch)
            result = {**result, **val_log}

        if self.lr_scheduler is not None:
            # note: newer PyTorch versions expose get_last_lr() instead of get_lr()
            self.logger.info(
                f"Learning rate: {self.lr_scheduler.get_lr()}")
            self.lr_scheduler.step(epoch=epoch)

        # logging and early stopping are evaluated on rank 0 only
        if get_global_rank() == 0:
            log = self._log_info(result, epoch)
            early_stop = self._check_early_stop(log, epoch)
            if early_stop:
                break
def step(self, epoch):
    super().step(epoch)
    self.enable_multithreading_if_possible()
    if not self.fixed_dataset:
        self.dataset.idx_offset = epoch * len(self.dataset)
        # derive a seed that is unique per (epoch, rank) pair
        seed = epoch
        seed = seed * get_world_size() + get_global_rank()
        if self.valid_loader:
            # mirror the seed into the top of the 32-bit range so
            # validation never reuses a training seed
            seed = 2**32 - seed
            self.dataset.set_to_lognorm()
        else:
            self.dataset.set_to_default_sim()
        self.dataset.simulator.set_seed(seed)
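To see why this seeding scheme avoids collisions, the short sketch below recomputes the derived seeds for a few (epoch, rank) pairs. The world size of 4 and the epoch range are made-up example values, not taken from the code above.

# Illustrative only: reproduces the seed derivation from step() above.
world_size = 4  # assumed example value

def train_seed(epoch, rank):
    # training seed: unique per (epoch, rank) pair
    return epoch * world_size + rank

def valid_seed(epoch, rank):
    # validation seed: mirrored into the top of the 32-bit range
    return 2**32 - train_seed(epoch, rank)

seeds = set()
for epoch in range(3):
    for rank in range(world_size):
        seeds.add(train_seed(epoch, rank))
        seeds.add(valid_seed(epoch, rank))

# every (epoch, rank, split) combination maps to a distinct simulator seed
assert len(seeds) == 2 * 3 * world_size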
def __init__(self, args, options='', timestamp=True):
    # parse default and custom cli options
    for opt in options:
        args.add_argument(*opt.flags, default=None, type=opt.type)
    args = args.parse_args()

    self.resume = None
    self.cfg_fname = None
    if args.device:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    if args.resume:
        self.resume = Path(args.resume)
        self.cfg_fname = self.resume.parent / 'config.json'
    if args.config:
        self.cfg_fname = Path(args.config)
    msg_no_cfg = ("A configuration file needs to be specified. "
                  "Add '-c config.json', for example.")
    assert self.cfg_fname is not None, msg_no_cfg

    # load config file and apply custom cli options
    config = read_json(self.cfg_fname)
    self.__config = _update_config(config, options, args)
    self.__raw = copy.deepcopy(self.__config)

    # set save_dir where the trained model and log will be saved
    save_dir = Path(
        parse_value(self.config['trainer']['extra_args']['save_dir']))
    timestamp = datetime.now().strftime(
        r'%m%d_%H%M%S') if timestamp else ''
    exper_name = self.config['name']
    self.__save_dir = save_dir / 'models' / exper_name / timestamp
    self.__log_dir = save_dir / 'log' / exper_name / timestamp
    self.save_dir.mkdir(parents=True, exist_ok=True)
    self.log_dir.mkdir(parents=True, exist_ok=True)

    # save the updated config file to the checkpoint dir
    if get_global_rank() == 0:
        write_json(self.config, self.save_dir / 'config.json')

    # configure logging module
    setup_logging(self.log_dir)
    self.log_levels = {
        0: logging.WARNING,
        1: logging.INFO,
        2: logging.DEBUG
    }
    logger = self.get_logger('config')
    logger.info(f"Experiment name: {exper_name}")
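The parser above expects each custom option to expose flags and type attributes (and, for _update_config, presumably a target path into the config). Below is a minimal usage sketch under those assumptions; the class name ConfigParser, the flag names, and the target tuples are illustrative, not taken from the snippet.

# Hypothetical usage -- names below are illustrative assumptions.
import argparse
import collections

CustomArgs = collections.namedtuple('CustomArgs', 'flags type target')

args = argparse.ArgumentParser(description='training entry point')
args.add_argument('-c', '--config', default=None, type=str,
                  help='path to the config JSON file')
args.add_argument('-r', '--resume', default=None, type=str,
                  help='path to a checkpoint to resume from')
args.add_argument('-d', '--device', default=None, type=str,
                  help='GPU indices to expose via CUDA_VISIBLE_DEVICES')

options = [
    CustomArgs(['--lr', '--learning_rate'], type=float,
               target=('optimizer', 'args', 'lr')),
    CustomArgs(['--bs', '--batch_size'], type=int,
               target=('data_loader', 'args', 'batch_size')),
]
config = ConfigParser(args, options)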
def __init__(self, n_gpu):
    self.logger = logging.getLogger(self.__class__.__name__)
    self.n_gpu = n_gpu
    self.logger.info("Initializing devices..")
    device, gpu_ids, n_gpu, n_processes = self.prepare_device()
    self.device = device
    self.gpu_ids = gpu_ids
    self.n_gpu = n_gpu
    self.n_processes = n_processes
    if get_global_rank() == 0:
        self.logger.info(
            f"Number of running processes: {self.n_processes}")
        self.logger.info(f"Number of usable GPUs: {self.n_gpu}")
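prepare_device() is not shown above. The following is a plausible sketch of what it could return, assuming it caps the requested GPU count at what CUDA reports and reuses get_world_size() for the process count; it is an illustration, not the original helper.

import torch

def prepare_device(self):
    # hypothetical implementation, not the original helper
    n_gpu_available = torch.cuda.device_count()
    n_gpu = min(self.n_gpu, n_gpu_available)
    if self.n_gpu > n_gpu_available:
        self.logger.warning(
            f"Requested {self.n_gpu} GPUs, but only "
            f"{n_gpu_available} are available.")
    gpu_ids = list(range(n_gpu))
    device = torch.device('cuda:0' if n_gpu > 0 else 'cpu')
    n_processes = max(1, get_world_size())
    return device, gpu_ids, n_gpu, n_processes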
def __init__(self, indices, shuffle=True):
    self.num_replicas = get_world_size()
    self.rank = get_global_rank()
    self.shuffle = shuffle
    self.indices = indices
    self.epoch = 0
    # each replica draws the same number of samples; the index list is
    # padded up to total_size so the split is even across replicas
    self.num_samples = int(
        math.ceil(len(self.indices) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
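The constructor only sets up the bookkeeping; num_samples and total_size only pay off once the index list is padded and split per rank. A sketch of the companion methods, assuming the sampler follows the standard torch.utils.data.distributed.DistributedSampler pattern (they are not shown above):

import torch

def __iter__(self):
    if self.shuffle:
        # deterministic shuffle that changes with the epoch
        g = torch.Generator()
        g.manual_seed(self.epoch)
        order = torch.randperm(len(self.indices), generator=g).tolist()
        indices = [self.indices[i] for i in order]
    else:
        indices = list(self.indices)

    # pad with a repeated prefix so every replica draws the same count
    indices += indices[:self.total_size - len(indices)]
    assert len(indices) == self.total_size

    # each rank keeps a strided subset of the padded index list
    indices = indices[self.rank:self.total_size:self.num_replicas]
    assert len(indices) == self.num_samples
    return iter(indices)

def __len__(self):
    return self.num_samples

def set_epoch(self, epoch):
    # called once per epoch so the shuffle order differs across epochs
    self.epoch = epoch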
def __init__(self, model, loss, metrics, optimizer, config, data_loader,
             valid_data_loader, lr_scheduler, main_device):
    self._set_defaults(model, loss, metrics, optimizer, config,
                       data_loader, valid_data_loader, lr_scheduler,
                       main_device)
    cfg_trainer = config['trainer']['extra_args']
    self.checkpoint_dir = config.save_dir

    if get_global_rank() == 0:
        # setup visualization writer instance
        enable_board = cfg_trainer['tensorboardX']
    else:
        enable_board = False
    self.writer = WriterTensorboardX(config.log_dir, self.logger,
                                     enable_board)

    if config.resume is not None:
        self._resume_checkpoint(config.resume)