def main():
    print('start job ...')
    start_time = time.time()

    # 1. Create infrastructure.
    supported_regions = [
        'cn-huhehaote', 'cn-shanghai', 'cn-zhangjiakou', 'cn-hangzhou',
        'cn-beijing'
    ]
    assert ncluster.get_region() in supported_regions, (
        f"required AMI {IMAGE_NAME} has only been made available in regions "
        f"{supported_regions}, but your current region is "
        f"{ncluster.get_region()} (set $ALIYUN_DEFAULT_REGION)")

    ncluster_globals.set_should_disable_nas(True)

    job = ncluster.make_job(name=args.name,
                            run_name=f"{args.name}-{args.machines}",
                            num_tasks=args.machines,
                            instance_type=INSTANCE_TYPE,
                            disable_nas=True,
                            spot=True,
                            install_script='')
    init_ncluster = time.time()
    print('init ncluster:', init_ncluster - start_time)

    # 2. Upload GTC code.
    job.run('yum install -y unzip')
    job.upload('GTC')
    job.run(
        'cd GTC && wget http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/gtc-demo/dataset.zip '
        '&& wget http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/gtc-demo/test.JPG '
        '&& wget http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/gtc-demo/resnet50-19c8e357.pth '
        '&& conda activate torch_1.3_cu10.0_py36')
    upload_data = time.time()
    print('upload_data time:', upload_data - init_ncluster)

    # 3. Prepare the dataset.
    job.run('unzip -o dataset.zip')
    unzip_time = time.time()
    print('unzip data:', unzip_time - upload_data)

    # 4. Run the training job.
    job.tasks[0].run('conda activate torch_1.3_cu10.0_py36')
    job.tasks[0].run('./run-perseus.sh 2>&1 | tee logs.log', non_blocking=False)
    train_time = time.time()
    print('training time:', train_time - unzip_time)

    # 5. Run the inference job.
    job.tasks[0].run('python inference.py 2>&1 | tee logs.inference.log',
                     non_blocking=False)
    print('inference time:', time.time() - train_time)

    elapsed_time = time.time() - start_time
    print(f'training and inference deploy time is: {elapsed_time} s.')

    # 6. Stop the instance (optional).
    job.stop()
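
# Note on blocking semantics in the launcher above: run(..., non_blocking=False)
# waits for the remote command to complete, so the printed "training time" and
# "inference time" cover the full runs. The launchers below instead submit
# their training commands with non_blocking=True and return immediately; this
# reading of the flag is inferred from its use here, not a documented contract
# of ncluster.
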
def main():
    supported_regions = ['us-west-2', 'us-east-1', 'us-east-2', 'local']
    assert ncluster.get_region() in supported_regions, (
        f"required AMI {IMAGE_NAME} has only been made available in regions "
        f"{supported_regions}, but your current region is {ncluster.get_region()}")
    assert args.machines in schedules, (
        f"{args.machines} not supported, only support {schedules.keys()}")

    job = ncluster.make_job(
        name=args.name,
        run_name=f"{args.name}-{args.machines}",
        num_tasks=args.machines,
        image_name=IMAGE_NAME,
        instance_type=INSTANCE_TYPE,
    )
    job.upload('training')
    if ncluster.get_region() == 'local':
        job.run('conda activate main')  # specific to image
    else:
        job.run('conda activate fastai')

    nccl_params = get_nccl_params(args.machines, NUM_GPUS)

    # Training script args.
    default_params = [
        # '--load', f'/ncluster/models/{args.name}.pth',
        # '--save', f'/ncluster/models/{args.name}.pth'
    ]
    # params = ['--phases', schedules[args.machines]]
    params = []
    training_params = default_params + params
    training_params = ' '.join(map(format_params, training_params))
    train_script = 'training/train.py'

    # TODO: simplify args processing, or give link to actual commands run
    for i, task in enumerate(job.tasks):
        dist_params = (
            f'--nproc_per_node={NUM_GPUS} --nnodes={args.machines} '
            f'--node_rank={i} --master_addr={job.tasks[0].ip} '
            f'--master_port={6006}')
        cmd = (f'{nccl_params} python -m torch.distributed.launch '
               f'{dist_params} {train_script} {training_params}')
        # task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
        task.run(cmd, non_blocking=True)
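
# Every launcher in this file builds its command line with
# ' '.join(map(format_params, training_params)), but format_params itself is
# defined elsewhere. A minimal sketch of what such a helper plausibly does,
# assuming it only needs to stringify values and shell-quote anything with
# embedded whitespace (hypothetical, not the actual implementation):
import shlex


def format_params(arg):
    # Stringify non-strings (ints, bools) and quote args containing spaces
    # so the assembled command survives the remote shell.
    arg = str(arg)
    return shlex.quote(arg) if ' ' in arg else arg
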
def main():
    supported_regions = ['us-west-2', 'us-east-1', 'us-east-2']
    assert ncluster.get_region() in supported_regions, (
        f"required AMI {IMAGE_NAME} has only been made available in regions "
        f"{supported_regions}, but your current region is "
        f"{ncluster.get_region()} (set $AWS_DEFAULT_REGION)")
    assert args.machines in schedules, (
        f"{args.machines} not supported, only support {schedules.keys()}")

    os.environ['NCLUSTER_AWS_FAST_ROOTDISK'] = '1'  # use io2 disk on AWS
    job = ncluster.make_job(name=args.name,
                            run_name=f"{args.name}-{args.machines}",
                            num_tasks=args.machines,
                            image_name=IMAGE_NAME,
                            instance_type=INSTANCE_TYPE)

    job.rsync('.')
    # job.upload('setup.sh')
    # job.upload('worker_requirements.txt')  # todo(y): replace with rsync
    job.run('bash setup.sh')
    # job.upload('training')
    job.run('source activate pytorch_source')

    nccl_params = get_nccl_params(args.machines, NUM_GPUS)

    # Training script args.
    default_params = [
        '~/data/imagenet',
        '--fp16',
        '--logdir', job.logdir,
        '--name', args.name,
        '--distributed',
        '--init-bn0',
        '--no-bn-wd',
    ]
    params = ['--phases', schedules[args.machines]]
    training_params = default_params + params
    training_params = ' '.join(map(format_params, training_params))

    # TODO: simplify args processing, or give link to actual commands run
    for i, task in enumerate(job.tasks):
        dist_params = (
            f'--nproc_per_node=8 --nnodes={args.machines} '
            f'--node_rank={i} --master_addr={job.tasks[0].ip} '
            f'--master_port={6006}')
        cmd = (f'{nccl_params} python -m torch.distributed.launch '
               f'{dist_params} training/train_imagenet_nv.py {training_params}')
        task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
        task.run(cmd, non_blocking=True)

    print(f"Logging to {job.logdir}")
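
# get_nccl_params is also defined outside this section. A sketch under the
# assumption that it returns a string of environment variables to prefix onto
# the launch command; the exact variables below are a guess, but multi-node
# runs typically want at least NCCL version/debug output:
def get_nccl_params(num_machines, num_gpus):
    # Hypothetical: single-machine jobs need no inter-node NCCL tuning.
    if num_machines <= 1:
        return 'NCCL_DEBUG=VERSION '
    return 'NCCL_MIN_NRINGS=4 NCCL_DEBUG=VERSION '
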
def main():
    supported_regions = ['us-west-2', 'us-east-1', 'us-east-2', 'local']
    assert ncluster.get_region() in supported_regions, (
        f"required AMI {IMAGE_NAME} has only been made available in regions "
        f"{supported_regions}, but your current region is {ncluster.get_region()}")

    job = ncluster.make_job(
        name=args.name,
        run_name=f"{args.name}-{args.machines}",
        num_tasks=args.machines,
        image_name=IMAGE_NAME,
        instance_type=INSTANCE_TYPE,
    )
    job.upload('src')
    job.upload('scripts')
    job.run('conda activate midi')
    job.run('cd scripts')

    nccl_params = get_nccl_params(args.machines, NUM_GPUS)

    # Training script args.
    default_params = [
        # '--load', f'/ncluster/models/{args.name}.pth',
        '--path', '~/data/midi/v10/midi_encode/'
    ]
    params = [
        '--save', 'large_single/lq/1_ep44',
        '--cache', 'tmp/lq',
        '--batch_size', '8',
        '--large',
        '--single_stream',
        '--epochs', '44',
        '--lr', '.008'
    ]
    training_params = default_params + params
    training_params = ' '.join(map(format_params, training_params))
    train_script = 'run_txl_npenc.py'

    # TODO: simplify args processing, or give link to actual commands run
    for i, task in enumerate(job.tasks):
        dist_params = (
            f'--nproc_per_node={NUM_GPUS} --nnodes={args.machines} '
            f'--node_rank={i} --master_addr={job.tasks[0].ip} '
            f'--master_port={6006}')
        cmd = (f'{nccl_params} python -m torch.distributed.launch '
               f'{dist_params} {train_script} {training_params}')
        # task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
        task.run(cmd, non_blocking=True)
def main():
    start_time = time.time()

    # 1. Create infrastructure.
    supported_regions = [
        'cn-huhehaote', 'cn-zhangjiakou', 'cn-shanghai', 'cn-hangzhou',
        'cn-beijing'
    ]
    assert ncluster.get_region() in supported_regions, (
        f"required AMI {IMAGE_NAME} has only been made available in regions "
        f"{supported_regions}, but your current region is "
        f"{ncluster.get_region()} (set $ALIYUN_DEFAULT_REGION)")

    ncluster_globals.set_should_disable_nas(True)

    job = ncluster.make_job(name=args.name,
                            run_name=f"{args.name}-{args.machines}",
                            num_tasks=args.machines,
                            disable_nas=True,
                            spot=True,
                            instance_type=INSTANCE_TYPE)

    # 2. Upload Perseus BERT code.
    job.run('yum install -y unzip')
    job.upload('perseus-bert')
    job.run('conda activate tensorflow_1.14_cu10.0_py36')

    # 3. Download the pretrained model and dataset.
    BERT_CHINESE_BASE_DIR = '/root/chinese_L-12_H-768_A-12'
    DATA_DIR = '/root/toutiao_data'
    job.run('wget -c -t 10 https://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/chinese_L-12_H-768_A-12.zip '
            '&& unzip chinese_L-12_H-768_A-12.zip')
    job.run('wget -c -t 10 https://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/toutiao_data.tgz '
            '&& tar xvf toutiao_data.tgz')

    # 4. Run the training job.
    job.run('cd perseus-bert')
    hosts = [task.ip + f':{NUM_GPUS}' for task in job.tasks]
    host_str = ','.join(hosts)

    mpi_cmd = [
        'mpirun --allow-run-as-root',
        f'-np {args.machines * NUM_GPUS}',
        f'--npernode {NUM_GPUS}',
        f'--host {host_str}',
        '--bind-to none',
        '-x NCCL_DEBUG=INFO',
        '-x PATH',
        '-x PYTHONPATH',
        '-x LD_LIBRARY_PATH',
        '-x XLA_FLAGS',
    ]

    bert_classifier_cmd = [
        'python run_classifier.py',
        '--task_name=news',
        '--do_train=true',
        '--do_eval=true',
        f'--data_dir={DATA_DIR}',
        f'--vocab_file={BERT_CHINESE_BASE_DIR}/vocab.txt',
        f'--bert_config_file={BERT_CHINESE_BASE_DIR}/bert_config.json',
        f'--init_checkpoint={BERT_CHINESE_BASE_DIR}/bert_model.ckpt',
        '--max_seq_length=128',
        '--train_batch_size=48',
        '--learning_rate=8e-5',
        '--num_train_epochs=3.0',
        '--warmup_proportion=0.8',
        '--output_dir=/root/output_dir',
        '--use_amp=true',
        '--use_perseus=true',
        '--use_xla=true',
    ]

    cmd = ' '.join(mpi_cmd + bert_classifier_cmd)
    job.tasks[0].run(f'echo {cmd} > {job.logdir}/task-cmd')
    job.tasks[0].run(cmd, non_blocking=True)
    print(f"Logging to {job.logdir}")

    elapsed_time = time.time() - start_time
    print(f'training deploy time is: {elapsed_time} s.')
    job.stop()
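
# For reference, with args.machines == 2 and NUM_GPUS == 8 the command
# assembled above expands to something like (IP addresses illustrative):
#
#   mpirun --allow-run-as-root -np 16 --npernode 8 \
#       --host 192.168.0.1:8,192.168.0.2:8 --bind-to none \
#       -x NCCL_DEBUG=INFO -x PATH -x PYTHONPATH -x LD_LIBRARY_PATH -x XLA_FLAGS \
#       python run_classifier.py --task_name=news --do_train=true ...
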
def main():
    if args.image_name == 'pytorch.imagenet.source.v7':
        supported_regions = ['us-west-2', 'us-east-1', 'us-east-2']
        assert ncluster.get_region() in supported_regions, (
            f"required AMI {args.image_name} has only been made available in "
            f"regions {supported_regions}, but your current region is "
            f"{ncluster.get_region()} (set $AWS_DEFAULT_REGION)")
    assert args.machines in schedules, (
        f"{args.machines} not supported, only support {schedules.keys()}")

    if args.mount_imagenet:
        datadir = '/data/imagenet'
    else:
        datadir = '~/data/imagenet'
    os.environ['NCLUSTER_AWS_FAST_ROOTDISK'] = '1'  # use io2 disk on AWS
    if args.num_tasks >= 16:
        assert args.simple_ring_setup, (
            "must use --simple_ring_setup, otherwise NCCL_RINGS env var "
            "exceeds cmd-line limit")

    job = ncluster.make_job(
        name=args.name,
        run_name=args.run_name,
        num_tasks=args.machines,
        image_name=args.image_name,
        instance_type=args.instance_type,
        disk_size=500,
        spot=args.spot,
        skip_setup=args.skip_setup,
    )
    task0 = job.tasks[0]
    _logdir = task0.logdir  # workaround for race condition in creating logdir

    config = {}
    for key in os.environ:
        if re.match(r"^NCLUSTER", key):
            config['env_' + key] = os.getenv(key)
    config.update(vars(args))

    CUDA_HOME = '/usr/local/cuda'
    EFA_HOME = '/opt/amazon/efa'
    MPI_HOME = EFA_HOME
    NPROC_PER_NODE = args.nproc_per_node
    assert NPROC_PER_NODE <= task0.num_gpus, (
        f"requested {NPROC_PER_NODE} processes, but only "
        f"{task0.num_gpus} gpus present")
    NUM_GPUS = NPROC_PER_NODE * args.num_tasks

    config['NUM_GPUS'] = NUM_GPUS
    config['internal_id'] = u.get_account_number()
    config['internal_alias'] = u.get_account_name()
    config['region'] = u.get_region()
    config['zone'] = u.get_zone()
    config['launch_user'] = os.environ.get('USER', '')
    config['cmd'] = ' '.join(sys.argv)
    config['launcher_conda'] = util.ossystem(
        'echo ${CONDA_PREFIX:-"$(dirname $(which conda))/../"}')
    config['launcher_cmd'] = 'python ' + ' '.join(sys.argv)
    config['logdir'] = job.logdir

    pickled_config = util.text_pickle(config)
    if args.log_all_workers:
        job.write(args.internal_config_fn, pickled_config)
    else:
        job.tasks[0].write(args.internal_config_fn, pickled_config)

    if args.mount_imagenet:
        assert u.get_zone(), "Must specify zone when reusing EBS volumes"
        mount_imagenet(job)

    if not args.skip_setup:
        job.run('rm -f *.py')  # remove files baked into imagenet18 release image
        job.run('conda init')  # missing .bashrc
        job.run(f'{{ source activate {args.conda_env} && bash setup.sh && '
                f'pip install -U protobuf ; }} && '
                f'{{ killall python || echo hi ; }} ')
        if args.pytorch_nightly:
            job.run('conda install -y -c pytorch pytorch-nightly && bash setup.sh')
    else:
        job.run([
            f'source ~/.bashrc && conda activate {args.conda_env}',
            'killall python || echo hi'
        ])

    job.rsync('.')

    if args.efa:
        assert 'efa' in args.image_name  # make sure we use an EFA-enabled image
        hosts_str, hosts_file_str = util.setup_mpi(
            job, skip_ssh_setup=args.skip_setup)
        if not args.skip_setup:
            task0.write(HOSTS_SLOTS_FN, hosts_file_str)

    env_params = get_nccl_params(args.machines, args.nproc_per_node)
    if args.cuda_debug:
        env_params += 'CUDA_LAUNCH_BLOCKING=1 NCCL_DEBUG=INFO '
    else:
        env_params += 'NCCL_DEBUG=INFO '
    env_params += " OMP_NUM_THREADS=1 "
    if args.pytorch_use_spawn:
        assert args.pytorch_nightly
        env_params += " PYTORCH_USE_SPAWN=1 "
    if 'WANDB_API_KEY' in os.environ:
        env_params += f" WANDB_API_KEY={os.environ.get('WANDB_API_KEY')} "

    # Training script args.
    default_params = [
        datadir,
        '--fp16',
        '--logdir', job.logdir,
        '--name', f'{args.run_name}-{util.random_id()}',
        '--distributed',
        '--init-bn0',
        '--no-bn-wd',
        '--log_all_workers', args.log_all_workers,
    ]
    params = ['--phases', util.text_pickle(schedules[args.machines])]
    training_params = default_params + params
    training_params = ' '.join(map(format_params, training_params))

    if not args.efa:
        # TODO: simplify args processing, or give link to actual commands run
        for i, task in enumerate(job.tasks):
            dist_params = (
                f'--nproc_per_node={args.nproc_per_node} --nnodes={args.machines} '
                f'--node_rank={i} --master_addr={job.tasks[0].ip} '
                f'--master_port={6006}')
            cmd = (f'{env_params} python -m torch.distributed.launch {dist_params} '
                   f'training/train_imagenet_nv.py {training_params}')
            task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
            task.run(cmd, non_blocking=True)
    else:
        FI_PROVIDER = 'efa'
        if args.pseudo_efa:
            FI_PROVIDER = 'sockets'

        local_env = util.format_env_export(
            LOCAL_RANK='$OMPI_COMM_WORLD_LOCAL_RANK',
            RANK='$OMPI_COMM_WORLD_RANK',
            WORLD_SIZE='$OMPI_COMM_WORLD_SIZE',
            MASTER_ADDR=task0.ip,
            MASTER_PORT=6016)

        mpi_env = util.format_env_x(
            FI_PROVIDER=FI_PROVIDER,  # enables running nccl-tests using the EFA provider
            FI_OFI_RXR_RX_COPY_UNEXP=1,  # disables bounce buffers for unexpected messages
            FI_OFI_RXR_RX_COPY_OOO=1,  # disables bounce buffers for out-of-order messages
            FI_EFA_MR_CACHE_ENABLE=1,  # enables memory region caching
            FI_OFI_RXR_INLINE_MR_ENABLE=1,  # enables inline memory registration of data buffers
            NCCL_TREE_THRESHOLD=10 * 4294967296,  # force tree for everything under 40GB
            LD_LIBRARY_PATH=f'{CUDA_HOME}/lib:{CUDA_HOME}/lib64:{EFA_HOME}/lib64',
            NCCL_DEBUG='INFO',
            OMP_NUM_THREADS=1,
            WANDB_API_KEY=os.environ.get('WANDB_API_KEY', ''),
            PYTORCH_USE_SPAWN=args.pytorch_use_spawn,
            NO_WANDB=args.pytorch_use_spawn,
        )

        if args.no_op:
            worker_script_fn = 'training/env_test.py'
        else:
            worker_script_fn = 'training/train_imagenet_nv.py'

        local_cmd = [
            f"{local_env} && source ~/.bashrc && conda activate {args.conda_env} && ",
            f'python {worker_script_fn} {training_params} '
            f'--local_rank=$OMPI_COMM_WORLD_LOCAL_RANK'
        ]
        local_cmd = ' '.join(local_cmd)

        cmd = [
            f"{MPI_HOME}/bin/mpirun -n {NUM_GPUS} -N {NPROC_PER_NODE} "
            f"--hostfile {HOSTS_SLOTS_FN} ",
            f'{mpi_env} ',
            '--mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 ',
            '--bind-to none ',
            f"bash -c '{local_cmd}'"
        ]
        cmd = ' '.join(cmd)
        task0.run(cmd, non_blocking=True)

    print(f"Logging to {job.logdir}")
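
# util.format_env_export and util.format_env_x are assumed helpers, not shown
# in this section. Minimal sketches consistent with their call sites above
# (hypothetical implementations, inferred from usage only): the first renders
# KEY=value pairs as chained shell exports for use inside bash -c '...', the
# second as mpirun -x flags that forward env vars to every rank.
def format_env_export(**kwargs):
    # e.g. {'RANK': 1, 'WORLD_SIZE': 8} -> "export RANK=1 && export WORLD_SIZE=8"
    return ' && '.join(f'export {k}={v}' for k, v in kwargs.items())


def format_env_x(**kwargs):
    # e.g. {'NCCL_DEBUG': 'INFO'} -> "-x NCCL_DEBUG=INFO"
    return ' '.join(f'-x {k}={v}' for k, v in kwargs.items())
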
def main():
    start_time = time.time()

    # 1. Create infrastructure.
    supported_regions = [
        'cn-huhehaote', 'cn-zhangjiakou', 'cn-shanghai', 'cn-hangzhou',
        'cn-beijing'
    ]
    assert ncluster.get_region() in supported_regions, (
        f"required AMI {IMAGE_NAME} has only been made available in regions "
        f"{supported_regions}, but your current region is "
        f"{ncluster.get_region()} (set $ALIYUN_DEFAULT_REGION)")

    ncluster_globals.set_should_disable_nas(True)

    job = ncluster.make_job(
        name=args.name,
        run_name=f"{args.name}-{args.machines}",
        # image_name='aiacc-dlimg-centos7:1.3.0.post3',
        num_tasks=args.machines,
        instance_type=INSTANCE_TYPE,
        spot=True,
        disable_nas=True,
    )

    # 2. Upload Perseus Faster R-CNN code.
    job.upload('gluon-cv')
    job.run('conda activate mxnet_1.5.1.post0_cu10.0_py36')

    # 3. Download the pretrained model and dataset.
    job.run('mkdir /root/mscoco')
    job.run('cd /root/mscoco && wget -c -t 10 http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/coco2017/annotations/annotations_trainval2017.zip')
    job.run('wget -c -t 10 http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/coco2017/zips/train2017.zip')
    job.run('wget -c -t 10 http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/coco2017/zips/test2017.zip')
    job.run('wget -c -t 10 http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/coco2017/zips/val2017.zip')
    job.run('mkdir -p /root/.mxnet/models')
    job.run('cd /root/.mxnet/models && wget -c -t 10 http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/pretrain_model/resnet50_v1b-0ecdba34.params')

    # 4. Install requirements.
    job.run('cd /root/gluon-cv/')
    job.run('pip install -r requirements.txt')
    job.run('python mscoco.py')

    # 5. Run the training job.
    hosts = [task.ip + f':{NUM_GPUS}' for task in job.tasks]
    host_str = ','.join(hosts)

    mpi_cmd = [
        'mpirun --allow-run-as-root',
        f'-np {args.machines * NUM_GPUS}',
        f'--npernode {NUM_GPUS}',
        f'--host {host_str}',
        '--bind-to none',
        '-x NCCL_DEBUG=INFO',
        '-x PATH',
        '-x LD_LIBRARY_PATH',
    ]
    insightface_cmd = './train-perseus.sh'

    cmd = ' '.join(mpi_cmd) + ' ' + insightface_cmd
    job.tasks[0].run(f'echo {cmd} > {job.logdir}/task-cmd')
    job.tasks[0].run(cmd, non_blocking=True)
    print(f"Logging to {job.logdir}")

    elapsed_time = time.time() - start_time
    print(f'training deploy time is: {elapsed_time} s.')
def main(): if args.image_name == "pytorch.imagenet.source.v7": supported_regions = ["us-west-2", "us-east-1", "us-east-2"] if ncluster.get_region() not in supported_regions: raise ValueError( f"required AMI {args.image_name} has only been made available " f"in regions {supported_regions}, but your current region " f"is {ncluster.get_region()} (set $AWS_DEFAULT_REGION)") if args.machines not in schedules: raise ValueError( f"{args.machines} not supported, only support {schedules.keys()}" ) if args.mount_imagenet: datadir = "/data/imagenet" else: datadir = "~/data/imagenet" os.environ["NCLUSTER_AWS_FAST_ROOTDISK"] = "1" # use io2 disk on AWS if args.num_tasks >= 16: if not args.simple_ring_setup: raise ValueError( "must use --simple_ring_setup, otherwise NCCL_RINGS env var exceeds cmd-line limit" ) job = ncluster.make_job( name=args.name, run_name=args.run_name, num_tasks=args.machines, image_name=args.image_name, instance_type=args.instance_type, disk_size=500, spot=args.spot, skip_setup=args.skip_setup, ) task0 = job.tasks[0] # _logdir = task0.logdir # workaround for race condition in creating logdir config = {} for key in os.environ: if re.match(r"^NCLUSTER", key): config["env_" + key] = os.getenv(key) config.update(vars(args)) CUDA_HOME = f"/usr/local/cuda" EFA_HOME = f"/opt/amazon/efa" MPI_HOME = EFA_HOME NPROC_PER_NODE = args.nproc_per_node if NPROC_PER_NODE > task0.num_gpus: raise ValueError( f"requested {NPROC_PER_NODE} processes, but only {task0.num_gpus} gpus present" ) NUM_GPUS = NPROC_PER_NODE * args.num_tasks config["NUM_GPUS"] = NUM_GPUS config["internal_id"] = u.get_account_number() config["internal_alias"] = u.get_account_name() config["region"] = u.get_region() config["zone"] = u.get_zone() config["launch_user"] = os.environ.get("USER", "") config["cmd"] = " ".join(sys.argv) config["launcher_conda"] = util.ossystem( 'echo ${CONDA_PREFIX:-"$(dirname $(which conda))/../"}') config["launcher_cmd"] = "python " + " ".join(sys.argv) config["logdir"] = job.logdir pickled_config = util.text_pickle(config) if args.log_all_workers: job.write(args.internal_config_fn, pickled_config) else: job.tasks[0].write(args.internal_config_fn, pickled_config) if args.mount_imagenet: if not u.get_zone(): raise ValueError("Must specify zone when reusing EBS volumes") mount_imagenet(job) if not args.skip_setup: job.run( "rm -f *.py") # remove files backed into imagenet18 release image job.run("conda init") # missing .bashrc job.run( f"{{ source activate {args.conda_env} && " f"bash setup.sh && pip install -U protobuf ; }} && {{ killall python || echo hi ; }} " ) if args.pytorch_nightly: job.run( "conda install -y -c pytorch pytorch-nightly && bash setup.sh") else: job.run([ f"source ~/.bashrc && conda activate {args.conda_env}", f"killall python || echo hi" ]) job.rsync(".") if args.efa: if "efa" not in args.image_name: raise ValueError("make sure we use EFA-enabled image") unused_hosts_str, hosts_file_str = util.setup_mpi( job, skip_ssh_setup=args.skip_setup) if not args.skip_setup: task0.write(HOSTS_SLOTS_FN, hosts_file_str) env_params = get_nccl_params(args.machines, args.nproc_per_node) if args.cuda_debug: env_params += "CUDA_LAUNCH_BLOCKING=1 NCCL_DEBUG=INFO " else: env_params += "NCCL_DEBUG=INFO " env_params += " OMP_NUM_THREADS=1 " if args.pytorch_use_spawn: if not args.pytorch_nightly: raise ValueError() env_params += " PYTORCH_USE_SPAWN=1 " if "WANDB_API_KEY" in os.environ: env_params += f" WANDB_API_KEY={os.environ.get('WANDB_API_KEY')} " # Training script args default_params = [ datadir, 
"--fp16", "--logdir", job.logdir, "--name", f"{args.run_name}-{util.random_id()}", "--distributed", "--init-bn0", "--no-bn-wd", "--log_all_workers", args.log_all_workers, ] params = ["--phases", util.text_pickle(schedules[args.machines])] training_params = default_params + params training_params = " ".join(map(format_params, training_params)) if not args.efa: # TODO: simplify args processing, or give link to actual commands run for i, task in enumerate(job.tasks): dist_params = ( f"--nproc_per_node={args.nproc_per_node} --nnodes={args.machines} " f"--node_rank={i} --master_addr={job.tasks[0].ip} --master_port={6006}" ) cmd = ( f"{env_params} python -m torch.distributed.launch {dist_params} " f"training/train_imagenet_nv.py {training_params}") task.run( f"echo {cmd} > {job.logdir}/task-{i}.cmd") # save command-line task.run(cmd, non_blocking=True) else: FI_PROVIDER = "efa" if args.pseudo_efa: FI_PROVIDER = "sockets" local_env = util.format_env_export( LOCAL_RANK="$OMPI_COMM_WORLD_LOCAL_RANK", RANK="$OMPI_COMM_WORLD_RANK", WORLD_SIZE="$OMPI_COMM_WORLD_SIZE", MASTER_ADDR=task0.ip, MASTER_PORT=6016, ) mpi_env = util.format_env_x( FI_PROVIDER= FI_PROVIDER, # Enables running nccl-tests using EFA provider. FI_OFI_RXR_RX_COPY_UNEXP= 1, # Disables using bounce buffers for unexpected messages. FI_OFI_RXR_RX_COPY_OOO= 1, # Disables using bounce buffers for out of order messages. FI_EFA_MR_CACHE_ENABLE=1, # Enables memory region caching. FI_OFI_RXR_INLINE_MR_ENABLE= 1, # Enables inline memory registration of data buffers. NCCL_TREE_THRESHOLD=10 * 4294967296, # force tree for everything under 40GB LD_LIBRARY_PATH= f"{CUDA_HOME}/lib:{CUDA_HOME}/lib64:{EFA_HOME}/lib64", NCCL_DEBUG="INFO", OMP_NUM_THREADS=1, WANDB_API_KEY=os.environ.get("WANDB_API_KEY", ""), PYTORCH_USE_SPAWN=args.pytorch_use_spawn, NO_WANDB=args.pytorch_use_spawn, ) if args.no_op: worker_script_fn = "training/env_test.py" else: worker_script_fn = "training/train_imagenet_nv.py" local_cmd = [ f"{local_env} && source ~/.bashrc && conda activate {args.conda_env} && ", f"python {worker_script_fn} {training_params} --local_rank=$OMPI_COMM_WORLD_LOCAL_RANK", ] local_cmd = " ".join(local_cmd) cmd = [ f"{MPI_HOME}/bin/mpirun -n {NUM_GPUS} -N {NPROC_PER_NODE} --hostfile {HOSTS_SLOTS_FN} ", f"{mpi_env} ", f"--mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 ", f"--bind-to none ", f"bash -c '{local_cmd}'", ] cmd = " ".join(cmd) task0.run(cmd, non_blocking=True) print(f"Logging to {job.logdir}")