import os

import torch
import torch.distributed as dist

# ConfigParser and main (the training loop) are project-local and not shown in this excerpt.


def entry_point(config: ConfigParser):
    '''
    Entry-point function for a single worker in distributed training.
    '''
    local_world_size = config['local_world_size']

    # check distributed environment cfgs
    if config['distributed']:  # distributed gpu mode
        # check gpu available
        if torch.cuda.is_available():
            if torch.cuda.device_count() < local_world_size:
                raise RuntimeError(
                    f'the number of GPUs ({torch.cuda.device_count()}) is less than '
                    f'the number of processes ({local_world_size}) running on each node')
            local_master = (config['local_rank'] == 0)
        else:
            raise RuntimeError('CUDA is not available, distributed training is not supported.')
    else:  # one gpu or cpu mode
        if config['local_world_size'] != 1:
            raise RuntimeError('local_world_size must be set to 1 if distributed is set to false.')
        config.update_config('local_rank', 0)
        local_master = True
        config.update_config('global_rank', 0)

    logger = config.get_logger('train') if local_master else None
    if config['distributed']:
        logger.info('Distributed GPU training mode start...') if local_master else None
    else:
        logger.info('One GPU or CPU training mode start...') if local_master else None

    if config['distributed']:
        # these are the parameters used to initialize the process group
        env_dict = {
            key: os.environ[key]
            for key in ('MASTER_ADDR', 'MASTER_PORT', 'RANK', 'WORLD_SIZE')
        }
        logger.info(
            f'[Process {os.getpid()}] Initializing process group with: {env_dict}'
        ) if local_master else None

        # init process group
        dist.init_process_group(backend='nccl', init_method='env://')
        config.update_config('global_rank', dist.get_rank())

        # log distributed training cfg
        logger.info(
            f'[Process {os.getpid()}] world_size = {dist.get_world_size()}, '
            f'rank = {dist.get_rank()}, backend={dist.get_backend()}'
        ) if local_master else None

    # start train
    main(config, local_master, logger if local_master else None)

    # tear down the process group (only initialized in distributed mode)
    if config['distributed']:
        dist.destroy_process_group()
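# Illustration, not part of the original script: init_method='env://' above makes
# dist.init_process_group() read its rendezvous configuration from environment
# variables, which is why they are collected into env_dict for logging. A launcher
# such as torchrun normally exports them; for an assumed single-node, two-process
# debug run they could also be set by hand before spawning the workers:
#
#     os.environ['MASTER_ADDR'] = '127.0.0.1'   # address of the rank-0 node
#     os.environ['MASTER_PORT'] = '29500'       # any free TCP port on that node
#     os.environ['WORLD_SIZE'] = '2'            # total number of worker processes
#     os.environ['RANK'] = '0'                  # global rank (would be 1 in the second worker)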
# Variant of entry_point that additionally configures deterministic vs. CUDNN benchmark mode.
def entry_point(config: ConfigParser):
    '''
    Entry-point function for a single worker in distributed training.
    A single worker holds (torch.cuda.device_count() / local_world_size) GPUs.
    '''
    local_world_size = config['local_world_size']

    # check distributed environment cfgs
    if config['distributed']:  # distributed gpu mode
        # check gpu available
        if torch.cuda.is_available():
            if torch.cuda.device_count() < local_world_size:
                raise RuntimeError(
                    f'the number of GPUs ({torch.cuda.device_count()}) is less than '
                    f'the number of processes ({local_world_size}) running on each node')
            local_master = (config['local_rank'] == 0)
        else:
            raise RuntimeError('CUDA is not available, distributed training is not supported.')
    else:  # one gpu or cpu mode
        if config['local_world_size'] != 1:
            raise RuntimeError('local_world_size must be set to 1 if distributed is set to false.')
        config.update_config('local_rank', 0)
        local_master = True
        config.update_config('global_rank', 0)

    logger = config.get_logger('train') if local_master else None
    if config['distributed']:
        logger.info('Distributed GPU training mode start...') if local_master else None
    else:
        logger.info('One GPU or CPU training mode start...') if local_master else None
    # else:
    #     sys.stdin.close()

    # cfg CUDNN: deterministic vs. benchmark mode
    if config['deterministic']:
        fix_random_seed_for_reproduce(config['seed'])
        logger.warning(
            'You have chosen deterministic training. '
            'This will fix the random seed, turn on the CUDNN deterministic setting and turn off the '
            'CUDNN benchmark, which can slow down your training considerably!'
        ) if local_master else None
    else:
        torch.backends.cudnn.benchmark = True
        logger.warning(
            'You have chosen benchmark training. '
            'This will turn on the CUDNN benchmark setting, '
            'which can speed up your training considerably! '
            'You may see unexpected behavior when restarting from checkpoints, '
            'because RandomizedMultiLinearMap needs the deterministic setting turned on.'
        ) if local_master else None

    if config['distributed']:
        # init process group
        dist.init_process_group(backend='nccl', init_method='env://')
        config.update_config('global_rank', dist.get_rank())

        # log distributed training cfg
        logger.info(
            f'[Process {os.getpid()}] world_size = {dist.get_world_size()}, '
            f'rank = {dist.get_rank()}, backend={dist.get_backend()}'
        ) if local_master else None

    # start train
    main(config, local_master, logger if local_master else None)

    # tear down the process group (only initialized in distributed mode)
    if config['distributed']:
        dist.destroy_process_group()
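# fix_random_seed_for_reproduce() above is a project-local helper that is not part
# of this excerpt. A minimal sketch of what it is assumed to do, matching the
# deterministic-training warning text (the project's actual implementation may differ):
def fix_random_seed_for_reproduce_sketch(seed: int) -> None:
    # local imports keep this illustrative sketch self-contained
    import random

    import numpy as np

    # seed Python, NumPy and PyTorch (CPU and all visible GPUs)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # trade throughput for reproducibility: deterministic CUDNN kernels,
    # no benchmark-based kernel auto-selection
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False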