        backend='nccl',
        #init_method='/gpfs/gpfs0/groups/mozafari/ruixliu/tmp/misc/sharedfile',
        rank=local_rank, world_size=local_size)
set_environment_variables_for_nccl_backend(local_size == global_size)

# Prepare Logger
job_id = rutils.get_current_time()
logger = rutils.FileLogging('%s_bert_pretrain_%d' % (job_id, local_rank))
#logger = Logger(cuda=torch.cuda.is_available())
logger.info('job id: %s' % job_id)
logger.info(rutils.parser_args_to_dict(args))

# Extract config file from blob storage
job_config = BertJobConfiguration(
    config_file_path=os.path.join(args.config_file_path, config_file))
logger.info(job_config.config)
job_name = job_config.get_name()

# Setting the distributed variables
#run = Run.get_context()
if not use_multigpu_with_single_device_per_process:
    device = torch.device("cuda")
    n_gpu = torch.cuda.device_count()
else:
    device = torch.device("cuda", local_rank)
    n_gpu = 1

# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend=args.backend)
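# NOTE (added sketch, not part of the original script): both variants rely on
# set_environment_variables_for_nccl_backend() to publish the rendezvous
# variables that torch.distributed.init_process_group() reads when no explicit
# init_method is supplied. A minimal sketch, assuming an MPI-style launcher
# (OMPI_COMM_WORLD_* variables) and an Azure-Batch-style master-node variable;
# the environment-variable names and the default port are assumptions:
import os

def set_environment_variables_for_nccl_backend(single_node=False, master_port=6105):
    os.environ['RANK'] = os.environ.get('OMPI_COMM_WORLD_RANK', '0')
    os.environ['WORLD_SIZE'] = os.environ.get('OMPI_COMM_WORLD_SIZE', '1')
    if single_node:
        # Single-node runs can rendezvous on localhost.
        os.environ['MASTER_ADDR'] = '127.0.0.1'
    else:
        # Multi-node runs take the master address from the scheduler,
        # e.g. AZ_BATCH_MASTER_NODE='<host>:<port>' (assumed variable).
        os.environ['MASTER_ADDR'] = os.environ['AZ_BATCH_MASTER_NODE'].split(':')[0]
    os.environ['MASTER_PORT'] = str(master_port)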
local_rank = get_local_rank()
global_size = get_global_size()
local_size = get_local_size()
# TODO use logger
print('local_rank = {}'.format(local_rank))
print('global_size = {}'.format(global_size))
print('local_size = {}'.format(local_size))

set_environment_variables_for_nccl_backend(local_size == global_size)

# Prepare Logger
logger = Logger(cuda=torch.cuda.is_available())

# Extract config file from blob storage
job_config = BertJobConfiguration(
    config_file_path=os.path.join(path, config_file))
# Replace the placeholder path prefix with the path corresponding to
# "ds.path('data/bert_data/').as_mount()"
job_config.replace_path_placeholders(path)
job_name = job_config.get_name()

# Setting the distributed variables
run = Run.get_context()

if not use_multigpu_with_single_device_per_process:
    device = torch.device("cuda")
    n_gpu = torch.cuda.device_count()
else:
    device = torch.device("cuda", local_rank)
    n_gpu = 1

# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
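# NOTE (added sketch, not part of the original script): the rank/size helpers
# used at the top of this variant are assumed to read the launcher's
# environment. A minimal sketch, assuming an Open MPI launcher; the
# OMPI_COMM_WORLD_* names are standard Open MPI variables, but treating them
# as the source of these values here is an assumption:
import os

def get_local_rank():
    # Rank of this process among the processes on the same node.
    return int(os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK', 0))

def get_global_size():
    # Total number of processes across all nodes.
    return int(os.environ.get('OMPI_COMM_WORLD_SIZE', 1))

def get_local_size():
    # Number of processes on this node; equals the global size
    # exactly when the job runs on a single node.
    return int(os.environ.get('OMPI_COMM_WORLD_LOCAL_SIZE', 1))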