def main():
    ec2 = u.get_ec2_resource()
    zone = u.get_zone()

    # use filtering by description since Name is not public
    # snapshots = list(ec2.snapshots.filter(Filters=[{'Name': 'description', 'Values': [args.snapshot]},
    #                                                {'Name': 'owner-id', 'Values': [args.snapshot_account]}]))
    snap = None
    if not args.delete:
        snapshots = list(
            ec2.snapshots.filter(
                Filters=[{"Name": "description", "Values": [args.snapshot]}]))
        if not snapshots:
            raise ValueError(f"no snapshot matching {args.snapshot}")
        if len(snapshots) >= 2:
            raise ValueError(f"multiple snapshots matching {args.snapshot}")
        snap = snapshots[0]
        if not args.size_gb:
            args.size_gb = snap.volume_size

    # list existing volumes
    vols = {}
    for vol in ec2.volumes.all():
        vols[u.get_name(vol)] = vol

    print(f"{'Deleting' if args.delete else 'Making'} {args.replicas} "
          f"{args.size_gb} GB replicas in {zone}")

    for i in range(args.volume_offset, args.replicas + args.volume_offset):
        vol_name = f"imagenet_{zone[-2:]}_{i:02d}"
        if args.delete:
            print(f"Deleting {vol_name}")
            if vol_name not in vols:
                print(" Not found")
                continue
            else:
                try:
                    vols[vol_name].delete()
                except Exception as e:  # boto3 reports delete failures as botocore ClientError, not ValueError
                    print(f"Deletion of {vol_name} failed with {e}")
                continue

        if vol_name in vols:
            print(f"{vol_name} exists, skipping")
        else:
            vol = ec2.create_volume(
                Size=args.size_gb,
                TagSpecifications=create_volume_tags(vol_name),
                AvailabilityZone=zone,
                SnapshotId=snap.id,
                Iops=11500,
                VolumeType="io1",
            )
            print(f"Creating {vol_name} {vol.id}")

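
# create_volume_tags() is referenced above but not defined in this section.
# A minimal sketch of what it presumably returns -- a boto3 TagSpecifications
# list that gives the new volume a Name tag -- is shown below; the exact tags
# used by the original script are an assumption.
def create_volume_tags(name):
    return [{
        "ResourceType": "volume",
        "Tags": [{"Key": "Name", "Value": name}],
    }]
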
def main():
    if args.image_name == 'pytorch.imagenet.source.v7':
        supported_regions = ['us-west-2', 'us-east-1', 'us-east-2']
        assert ncluster.get_region() in supported_regions, (
            f"required AMI {args.image_name} has only been made available in "
            f"regions {supported_regions}, but your current region is "
            f"{ncluster.get_region()} (set $AWS_DEFAULT_REGION)")

    assert args.machines in schedules, (
        f"{args.machines} not supported, only support {schedules.keys()}")

    if args.mount_imagenet:
        datadir = '/data/imagenet'
    else:
        datadir = '~/data/imagenet'
        os.environ['NCLUSTER_AWS_FAST_ROOTDISK'] = '1'  # use io2 disk on AWS

    if args.num_tasks >= 16:
        assert args.simple_ring_setup, (
            "must use --simple_ring_setup, otherwise NCCL_RINGS env var "
            "exceeds cmd-line limit")

    job = ncluster.make_job(
        name=args.name,
        run_name=args.run_name,
        num_tasks=args.machines,
        image_name=args.image_name,
        instance_type=args.instance_type,
        disk_size=500,
        spot=args.spot,
        skip_setup=args.skip_setup,
    )

    task0 = job.tasks[0]
    _logdir = task0.logdir  # workaround for race condition in creating logdir

    config = {}
    for key in os.environ:
        if re.match(r"^NCLUSTER", key):
            config['env_' + key] = os.getenv(key)
    config.update(vars(args))

    CUDA_HOME = f'/usr/local/cuda'
    EFA_HOME = f'/opt/amazon/efa'
    MPI_HOME = EFA_HOME
    NPROC_PER_NODE = args.nproc_per_node
    assert NPROC_PER_NODE <= task0.num_gpus, (
        f"requested {NPROC_PER_NODE} processes, but only "
        f"{task0.num_gpus} gpus present")
    NUM_GPUS = NPROC_PER_NODE * args.num_tasks

    config['NUM_GPUS'] = NUM_GPUS
    config['internal_id'] = u.get_account_number()
    config['internal_alias'] = u.get_account_name()
    config['region'] = u.get_region()
    config['zone'] = u.get_zone()
    config['launch_user'] = os.environ.get('USER', '')
    config['cmd'] = ' '.join(sys.argv)
    config['launcher_conda'] = util.ossystem(
        'echo ${CONDA_PREFIX:-"$(dirname $(which conda))/../"}')
    config['launcher_cmd'] = 'python ' + ' '.join(sys.argv)
    config['logdir'] = job.logdir

    pickled_config = util.text_pickle(config)
    if args.log_all_workers:
        job.write(args.internal_config_fn, pickled_config)
    else:
        job.tasks[0].write(args.internal_config_fn, pickled_config)

    if args.mount_imagenet:
        assert u.get_zone(), "Must specify zone when reusing EBS volumes"
        mount_imagenet(job)

    if not args.skip_setup:
        job.run('rm -f *.py')  # remove files baked into imagenet18 release image
        job.run('conda init')  # missing .bashrc
        job.run(
            f'{{ source activate {args.conda_env} && bash setup.sh && '
            f'pip install -U protobuf ; }} && {{ killall python || echo hi ; }} ')
        if args.pytorch_nightly:
            job.run('conda install -y -c pytorch pytorch-nightly && bash setup.sh')
    else:
        job.run([
            f'source ~/.bashrc && conda activate {args.conda_env}',
            f'killall python || echo hi'
        ])

    job.rsync('.')

    if args.efa:
        assert 'efa' in args.image_name  # make sure we use EFA-enabled image

    hosts_str, hosts_file_str = util.setup_mpi(
        job, skip_ssh_setup=args.skip_setup)
    if not args.skip_setup:
        task0.write(HOSTS_SLOTS_FN, hosts_file_str)

    env_params = get_nccl_params(args.machines, args.nproc_per_node)
    if args.cuda_debug:
        env_params += 'CUDA_LAUNCH_BLOCKING=1 NCCL_DEBUG=INFO '
    else:
        env_params += 'NCCL_DEBUG=INFO '
    env_params += " OMP_NUM_THREADS=1 "
    if args.pytorch_use_spawn:
        assert args.pytorch_nightly
        env_params += " PYTORCH_USE_SPAWN=1 "
    if 'WANDB_API_KEY' in os.environ:
        env_params += f" WANDB_API_KEY={os.environ.get('WANDB_API_KEY')} "

    # Training script args
    default_params = [
        datadir,
        '--fp16',
        '--logdir', job.logdir,
        '--name', f'{args.run_name}-{util.random_id()}',
        '--distributed',
        '--init-bn0',
        '--no-bn-wd',
        '--log_all_workers', args.log_all_workers,
    ]

    params = ['--phases', util.text_pickle(schedules[args.machines])]
    training_params = default_params + params
    training_params = ' '.join(map(format_params, training_params))

    if not args.efa:
        # TODO: simplify args processing, or give link to actual commands run
        for i, task in enumerate(job.tasks):
            dist_params = (
                f'--nproc_per_node={args.nproc_per_node} '
                f'--nnodes={args.machines} --node_rank={i} '
                f'--master_addr={job.tasks[0].ip} --master_port={6006}')
            cmd = (f'{env_params} python -m torch.distributed.launch {dist_params} '
                   f'training/train_imagenet_nv.py {training_params}')
            task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
            task.run(cmd, non_blocking=True)
    else:
        FI_PROVIDER = 'efa'
        if args.pseudo_efa:
            FI_PROVIDER = 'sockets'

        local_env = util.format_env_export(
            LOCAL_RANK='$OMPI_COMM_WORLD_LOCAL_RANK',
            RANK='$OMPI_COMM_WORLD_RANK',
            WORLD_SIZE='$OMPI_COMM_WORLD_SIZE',
            MASTER_ADDR=task0.ip,
            MASTER_PORT=6016)

        mpi_env = util.format_env_x(
            FI_PROVIDER=FI_PROVIDER,  # Enables running nccl-tests using EFA provider.
            FI_OFI_RXR_RX_COPY_UNEXP=1,  # Disables using bounce buffers for unexpected messages.
            FI_OFI_RXR_RX_COPY_OOO=1,  # Disables using bounce buffers for out of order messages.
            FI_EFA_MR_CACHE_ENABLE=1,  # Enables memory region caching.
            FI_OFI_RXR_INLINE_MR_ENABLE=1,  # Enables inline memory registration of data buffers.
            NCCL_TREE_THRESHOLD=10 * 4294967296,  # force tree for everything under 40GB
            LD_LIBRARY_PATH=f'{CUDA_HOME}/lib:{CUDA_HOME}/lib64:{EFA_HOME}/lib64',
            NCCL_DEBUG='INFO',
            OMP_NUM_THREADS=1,
            WANDB_API_KEY=os.environ.get('WANDB_API_KEY', ''),
            PYTORCH_USE_SPAWN=args.pytorch_use_spawn,
            NO_WANDB=args.pytorch_use_spawn,
        )

        if args.no_op:
            worker_script_fn = 'training/env_test.py'
        else:
            worker_script_fn = 'training/train_imagenet_nv.py'

        local_cmd = [
            f"{local_env} && source ~/.bashrc && conda activate {args.conda_env} && ",
            f'python {worker_script_fn} {training_params} '
            f'--local_rank=$OMPI_COMM_WORLD_LOCAL_RANK'
        ]
        local_cmd = ' '.join(local_cmd)

        cmd = [
            f"{MPI_HOME}/bin/mpirun -n {NUM_GPUS} -N {NPROC_PER_NODE} "
            f"--hostfile {HOSTS_SLOTS_FN} ",
            f'{mpi_env} ',
            f'--mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 ',
            f'--bind-to none ',
            f"bash -c '{local_cmd}'"
        ]
        cmd = ' '.join(cmd)

        task0.run(cmd, non_blocking=True)

    print(f"Logging to {job.logdir}")

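
# format_params() is referenced above but not defined in this section. A
# minimal illustrative sketch, assuming it only stringifies each argument and
# quotes anything containing whitespace (e.g. the pickled --phases schedule),
# is given below; the real helper may differ.
def format_params(arg):
    arg = str(arg)
    if ' ' in arg:
        return f'"{arg}"'
    return arg
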
def mount_imagenet(job: ncluster.aws_backend.Job):
    """Attaches EBS disks with imagenet data to each task of the job."""
    task0 = job.tasks[0]
    zone = u.get_zone()

    vols = {}
    ec2 = u.get_ec2_resource()
    for vol in ec2.volumes.all():
        vols[u.get_name(vol)] = vol

    attach_attempted = False
    for i, t in enumerate(job.tasks):
        vol_name = f'imagenet_{zone[-2:]}_{i+args.offset:02d}'
        assert vol_name in vols, (
            f"Volume {vol_name} not found, set your NCLUSTER_ZONE={zone} "
            f"and run replicate_imagenet.py")
        vol = vols[vol_name]
        print(f"Attaching {vol_name} to {t.name}")
        if vol.attachments:
            instance = ec2.Instance(vol.attachments[0]['InstanceId'])
            if instance.id == t.instance.id:
                print(f"{vol_name} already attached")
                continue
            else:  # attached to some other instance, detach
                print(f"detaching {vol_name} from {u.get_name(instance)}")
                vol.detach_from_instance()
                while vol.state != 'available':
                    vol.reload()
                    time.sleep(5)
                    print(f"waiting for detachment from {u.get_name(instance)}")
                vol.attach_to_instance(InstanceId=t.instance.id,
                                       Device=DEFAULT_UNIX_DEVICE)
                attach_attempted = True
        else:
            vol.attach_to_instance(InstanceId=t.instance.id,
                                   Device=DEFAULT_UNIX_DEVICE)
            attach_attempted = True

    if attach_attempted:
        time.sleep(2)  # wait for attachment to succeed
        i = 0
        vol_name = f'imagenet_{zone[-2:]}_{i+args.offset:02d}'
        vol = vols[vol_name]
        vol.reload()
        assert vol.attachments[0]['InstanceId'] == job.tasks[0].instance.id

    def strip_dev(d):
        return d[len('/dev/'):]

    # attach the volume if needed
    df_output = task0.run('df', return_output=True)
    actual_device = DEFAULT_UNIX_DEVICE
    if '/data' not in df_output:
        # hack for p3dn's ignoring device name during volume attachment
        lsblk_output = task0.run('lsblk', return_output=True)
        if strip_dev(DEFAULT_UNIX_DEVICE) not in lsblk_output:
            actual_device = '/dev/nvme3n1'
            assert strip_dev(actual_device) in lsblk_output, (
                f"Hack for p3dn failed, {actual_device} not found, "
                f"available devices '{lsblk_output}'")

        job.run(f'sudo mkdir -p /data && sudo chown `whoami` /data && '
                f'sudo mount {actual_device} /data')
        while '/data' not in task0.run('df', return_output=True):
            time.sleep(ATTACH_WAIT_INTERVAL_SEC)
            print(f"Waiting for attachment")

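
# DEFAULT_UNIX_DEVICE and ATTACH_WAIT_INTERVAL_SEC are module-level constants
# not shown in this section. Plausible definitions, inferred from how they are
# used above (an attach-time device name that p3dn instances remap to
# /dev/nvme*, and a polling interval in seconds), might be:
DEFAULT_UNIX_DEVICE = '/dev/xvdf'  # assumed device name requested at attach time
ATTACH_WAIT_INTERVAL_SEC = 5       # assumed poll interval while waiting for /data to mount
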
def main():
    if args.image_name == "pytorch.imagenet.source.v7":
        supported_regions = ["us-west-2", "us-east-1", "us-east-2"]
        if ncluster.get_region() not in supported_regions:
            raise ValueError(
                f"required AMI {args.image_name} has only been made available "
                f"in regions {supported_regions}, but your current region "
                f"is {ncluster.get_region()} (set $AWS_DEFAULT_REGION)")

    if args.machines not in schedules:
        raise ValueError(
            f"{args.machines} not supported, only support {schedules.keys()}")

    if args.mount_imagenet:
        datadir = "/data/imagenet"
    else:
        datadir = "~/data/imagenet"
        os.environ["NCLUSTER_AWS_FAST_ROOTDISK"] = "1"  # use io2 disk on AWS

    if args.num_tasks >= 16:
        if not args.simple_ring_setup:
            raise ValueError(
                "must use --simple_ring_setup, otherwise NCCL_RINGS env var "
                "exceeds cmd-line limit")

    job = ncluster.make_job(
        name=args.name,
        run_name=args.run_name,
        num_tasks=args.machines,
        image_name=args.image_name,
        instance_type=args.instance_type,
        disk_size=500,
        spot=args.spot,
        skip_setup=args.skip_setup,
    )

    task0 = job.tasks[0]
    # _logdir = task0.logdir  # workaround for race condition in creating logdir

    config = {}
    for key in os.environ:
        if re.match(r"^NCLUSTER", key):
            config["env_" + key] = os.getenv(key)
    config.update(vars(args))

    CUDA_HOME = f"/usr/local/cuda"
    EFA_HOME = f"/opt/amazon/efa"
    MPI_HOME = EFA_HOME
    NPROC_PER_NODE = args.nproc_per_node
    if NPROC_PER_NODE > task0.num_gpus:
        raise ValueError(
            f"requested {NPROC_PER_NODE} processes, but only "
            f"{task0.num_gpus} gpus present")
    NUM_GPUS = NPROC_PER_NODE * args.num_tasks

    config["NUM_GPUS"] = NUM_GPUS
    config["internal_id"] = u.get_account_number()
    config["internal_alias"] = u.get_account_name()
    config["region"] = u.get_region()
    config["zone"] = u.get_zone()
    config["launch_user"] = os.environ.get("USER", "")
    config["cmd"] = " ".join(sys.argv)
    config["launcher_conda"] = util.ossystem(
        'echo ${CONDA_PREFIX:-"$(dirname $(which conda))/../"}')
    config["launcher_cmd"] = "python " + " ".join(sys.argv)
    config["logdir"] = job.logdir

    pickled_config = util.text_pickle(config)
    if args.log_all_workers:
        job.write(args.internal_config_fn, pickled_config)
    else:
        job.tasks[0].write(args.internal_config_fn, pickled_config)

    if args.mount_imagenet:
        if not u.get_zone():
            raise ValueError("Must specify zone when reusing EBS volumes")
        mount_imagenet(job)

    if not args.skip_setup:
        job.run("rm -f *.py")  # remove files baked into imagenet18 release image
        job.run("conda init")  # missing .bashrc
        job.run(
            f"{{ source activate {args.conda_env} && "
            f"bash setup.sh && pip install -U protobuf ; }} && "
            f"{{ killall python || echo hi ; }} ")
        if args.pytorch_nightly:
            job.run("conda install -y -c pytorch pytorch-nightly && bash setup.sh")
    else:
        job.run([
            f"source ~/.bashrc && conda activate {args.conda_env}",
            f"killall python || echo hi"
        ])

    job.rsync(".")

    if args.efa:
        if "efa" not in args.image_name:
            raise ValueError("make sure we use EFA-enabled image")

    unused_hosts_str, hosts_file_str = util.setup_mpi(
        job, skip_ssh_setup=args.skip_setup)
    if not args.skip_setup:
        task0.write(HOSTS_SLOTS_FN, hosts_file_str)

    env_params = get_nccl_params(args.machines, args.nproc_per_node)
    if args.cuda_debug:
        env_params += "CUDA_LAUNCH_BLOCKING=1 NCCL_DEBUG=INFO "
    else:
        env_params += "NCCL_DEBUG=INFO "
    env_params += " OMP_NUM_THREADS=1 "
    if args.pytorch_use_spawn:
        if not args.pytorch_nightly:
            raise ValueError("--pytorch_use_spawn requires --pytorch_nightly")
        env_params += " PYTORCH_USE_SPAWN=1 "
    if "WANDB_API_KEY" in os.environ:
        env_params += f" WANDB_API_KEY={os.environ.get('WANDB_API_KEY')} "

    # Training script args
    default_params = [
        datadir,
        "--fp16",
        "--logdir", job.logdir,
        "--name", f"{args.run_name}-{util.random_id()}",
        "--distributed",
        "--init-bn0",
        "--no-bn-wd",
        "--log_all_workers", args.log_all_workers,
    ]

    params = ["--phases", util.text_pickle(schedules[args.machines])]
    training_params = default_params + params
    training_params = " ".join(map(format_params, training_params))

    if not args.efa:
        # TODO: simplify args processing, or give link to actual commands run
        for i, task in enumerate(job.tasks):
            dist_params = (
                f"--nproc_per_node={args.nproc_per_node} --nnodes={args.machines} "
                f"--node_rank={i} --master_addr={job.tasks[0].ip} --master_port={6006}")
            cmd = (f"{env_params} python -m torch.distributed.launch {dist_params} "
                   f"training/train_imagenet_nv.py {training_params}")
            task.run(f"echo {cmd} > {job.logdir}/task-{i}.cmd")  # save command-line
            task.run(cmd, non_blocking=True)
    else:
        FI_PROVIDER = "efa"
        if args.pseudo_efa:
            FI_PROVIDER = "sockets"

        local_env = util.format_env_export(
            LOCAL_RANK="$OMPI_COMM_WORLD_LOCAL_RANK",
            RANK="$OMPI_COMM_WORLD_RANK",
            WORLD_SIZE="$OMPI_COMM_WORLD_SIZE",
            MASTER_ADDR=task0.ip,
            MASTER_PORT=6016,
        )

        mpi_env = util.format_env_x(
            FI_PROVIDER=FI_PROVIDER,  # Enables running nccl-tests using EFA provider.
            FI_OFI_RXR_RX_COPY_UNEXP=1,  # Disables using bounce buffers for unexpected messages.
            FI_OFI_RXR_RX_COPY_OOO=1,  # Disables using bounce buffers for out of order messages.
            FI_EFA_MR_CACHE_ENABLE=1,  # Enables memory region caching.
            FI_OFI_RXR_INLINE_MR_ENABLE=1,  # Enables inline memory registration of data buffers.
            NCCL_TREE_THRESHOLD=10 * 4294967296,  # force tree for everything under 40GB
            LD_LIBRARY_PATH=f"{CUDA_HOME}/lib:{CUDA_HOME}/lib64:{EFA_HOME}/lib64",
            NCCL_DEBUG="INFO",
            OMP_NUM_THREADS=1,
            WANDB_API_KEY=os.environ.get("WANDB_API_KEY", ""),
            PYTORCH_USE_SPAWN=args.pytorch_use_spawn,
            NO_WANDB=args.pytorch_use_spawn,
        )

        if args.no_op:
            worker_script_fn = "training/env_test.py"
        else:
            worker_script_fn = "training/train_imagenet_nv.py"

        local_cmd = [
            f"{local_env} && source ~/.bashrc && conda activate {args.conda_env} && ",
            f"python {worker_script_fn} {training_params} "
            f"--local_rank=$OMPI_COMM_WORLD_LOCAL_RANK",
        ]
        local_cmd = " ".join(local_cmd)

        cmd = [
            f"{MPI_HOME}/bin/mpirun -n {NUM_GPUS} -N {NPROC_PER_NODE} "
            f"--hostfile {HOSTS_SLOTS_FN} ",
            f"{mpi_env} ",
            f"--mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 ",
            f"--bind-to none ",
            f"bash -c '{local_cmd}'",
        ]
        cmd = " ".join(cmd)

        task0.run(cmd, non_blocking=True)

    print(f"Logging to {job.logdir}")