def run_launcher():
  import ncluster

  if args.aws:
    ncluster.set_backend('aws')

  script = os.path.basename(__file__)
  assert script in os.listdir('.')
  job = ncluster.make_job(install_script='pip install ray',
                          image_name=args.image,
                          instance_type='c5.large',
                          num_tasks=2)
  job.upload(script)
  job.run('export RAY_USE_XRAY=1')
  job.run('ray stop')

  # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
  ps_resource = """--resources='{"ps": 1}'"""
  worker_resource = """--resources='{"worker": 1}'"""
  ps, worker = job.tasks
  ps.run(f"ray start --head {ps_resource} --redis-port=6379")
  worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")
  worker.run(f'./{script} --role=driver --ip={ps.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}')
def run_launcher():
  import ncluster

  if args.aws:
    ncluster.set_backend('aws')

  job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
  job.upload(__file__)
  job.upload('util.py')
  sender, receiver = job.tasks

  # kill stale python processes in case tmux session reuse is on
  if not ncluster.running_locally():
    sender._run_raw('killall python', ignore_errors=True)
    receiver._run_raw('killall python', ignore_errors=True)

  if ncluster.get_backend() == 'aws':
    # on AWS we are probably running in the conda DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  receiver.run(f'python {__file__} --role=receiver {ip_config}', non_blocking=True)
  sender.run(f'python {__file__} --role=sender {ip_config} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}')
  print(sender.read('out'))
def launcher():
  import ncluster

  if args.aws:
    ncluster.set_backend('aws')

  job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
  job.upload(__file__)
  job.upload('util.py')
  if args.aws:
    job.run('source activate pytorch_p36')
  else:
    job.run('source deactivate')
    job.run('source activate ncluster-test3')

  script_name = os.path.basename(__file__)
  common_args = f'--size=2 --master-addr={job.tasks[0].ip} --iters={args.iters} --size-mb={args.size_mb}'
  job.tasks[0].run(f'python {script_name} --role=worker --rank=0 ' + common_args,
                   non_blocking=True)
  job.tasks[1].run(f'python {script_name} --role=worker --rank=1 ' + common_args,
                   non_blocking=True)

  job.tasks[0].join()
  print(job.tasks[0].read('out'))
def main():
  print('start job ...')
  start_time = time.time()

  # 1. create infrastructure
  supported_regions = [
      'cn-huhehaote', 'cn-shanghai', 'cn-zhangjiakou', 'cn-hangzhou', 'cn-beijing'
  ]
  assert ncluster.get_region() in supported_regions, f"required AMI {IMAGE_NAME} has only been made available in regions {supported_regions}, but your current region is {ncluster.get_region()} (set $ALIYUN_DEFAULT_REGION)"

  ncluster_globals.set_should_disable_nas(True)

  job = ncluster.make_job(name=args.name,
                          run_name=f"{args.name}-{args.machines}",
                          num_tasks=args.machines,
                          instance_type=INSTANCE_TYPE,
                          disable_nas=True,
                          spot=True,
                          install_script='')

  init_ncluster = time.time()
  print('init ncluster:', init_ncluster - start_time)

  # 2. upload GTC code
  job.run('yum install -y unzip')
  job.upload('GTC')
  job.run('cd GTC && wget http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/gtc-demo/dataset.zip ' +
          '&& wget http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/gtc-demo/test.JPG ' +
          '&& wget http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/gtc-demo/resnet50-19c8e357.pth ' +
          '&& conda activate torch_1.3_cu10.0_py36')
  upload_data = time.time()
  print('upload_data time:', upload_data - init_ncluster)

  # 3. prepare the dataset
  job.run('unzip -o dataset.zip')
  unzip_time = time.time()
  print('unzip data:', unzip_time - upload_data)

  # 4. run the training job
  job.tasks[0].run('conda activate torch_1.3_cu10.0_py36')
  job.tasks[0].run('./run-perseus.sh 2>&1 | tee logs.log', non_blocking=False)
  train_time = time.time()
  print('training time:', train_time - unzip_time)

  # 5. run the inference job
  job.tasks[0].run('python inference.py 2>&1 | tee logs.inference.log', non_blocking=False)
  print('inference time:', time.time() - train_time)

  elapsed_time = time.time() - start_time
  print(f'training and inference deploy time is: {elapsed_time} s.')

  # 6. stop the instance (optional)
  job.stop()
def test_multiple_logdir_tasks():
  n = 10
  dummy_task = ncluster.make_task()
  logdir1 = ncluster.get_logdir_root() + '/test1'
  dummy_task.run(f'rm -Rf {logdir1}')
  job = ncluster.make_job(run_name='test1', num_tasks=n)

  obtained_logdirs = []

  import wrapt

  @wrapt.synchronized
  def query(i):
    obtained_logdirs.append(job.tasks[i].logdir)

  threads = [threading.Thread(target=query, args=(i,)) for i in range(n)]
  for thread in reversed(threads):
    thread.start()

  random.shuffle(threads)
  for thread in threads:
    thread.join()

  assert len(set(obtained_logdirs)) == 1
  assert obtained_logdirs[0] == logdir1
def run_launcher():
  import ncluster

  if args.nightly:
    install_script = ('pip install --no-cache-dir -U ray --find-links '
                      'https://s3-us-west-2.amazonaws.com/ray-wheels/latest/')
  else:
    install_script = 'pip install -U ray'

  if args.local:
    ncluster.set_backend('local')
  job = ncluster.make_job(**vars(args))
  job.run(install_script)

  ps, worker = job.tasks
  if not ncluster.running_locally():
    ps.run('killall python || echo no python found')
    worker.run('killall python || echo no python found')
  job.run('ray stop')

  job.upload(__file__)
  job.upload('util.py')

  # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
  ps_resource = """--resources='{"ps": 1}'"""
  worker_resource = """--resources='{"worker": 1}'"""

  ps.run(f"ray start --head {ps_resource} --redis-port=6379")
  worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")
  worker.run(f'python {__file__} --role=driver --ip={ps.ip}:6379 '
             f'--hidden_size={args.hidden_size} --num_layers={args.num_layers} '
             f'--iters={args.iters}')
  print(worker.read('out'))
def run_launcher():
  import ncluster

  if args.aws:
    ncluster.set_backend('aws')

  script = os.path.basename(__file__)

  if args.nightly:
    if args.macos:
      install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
    else:
      install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
  else:
    install_script = 'pip install ray'

  job = ncluster.make_job(name=args.name,
                          install_script=install_script,
                          image_name=args.image,
                          instance_type=args.instance,
                          num_tasks=args.num_workers + 1)
  job.upload(script)
  if args.xray:
    job.run('export RAY_USE_XRAY=1')
  job.run('ray stop')

  # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
  driver = job.tasks[0]
  driver.run(f"ray start --head --redis-port=6379")
  for worker_task in job.tasks[1:]:
    worker_resource = """--resources='{"worker": 1}'"""
    worker_task.run(f"ray start --redis-address={driver.ip}:6379 "
                    f"{worker_resource}")
  driver.run(f'./{script} --role=driver --ip={driver.ip}:6379')
def main():
  ncluster.set_backend('local')
  job = ncluster.make_job(num_tasks=2)

  start_time = time.time()
  job.run('sleep 1')
  print(f"waited for {time.time() - start_time} seconds")
def main():
  ncluster.set_backend('aws')

  start_time = time.time()
  job = ncluster.make_job(num_tasks=16)
  print(f"waited for startup for {time.time() - start_time} seconds")

  start_time = time.time()
  job.run('sleep 10')
  print(f"waited for exec for {time.time() - start_time} seconds")
def launcher():
  # run this test out of the root directory of ncluster to capture .git and requirements.txt
  script_fn = 'tests/integration_test.py'
  import ncluster

  job = ncluster.make_job(**vars(args))
  job.rsync('.')
  job.run('pip install -r requirements.txt')

  task0 = job.tasks[0]
  task0.run(f'python {script_fn} --role=worker --name={args.name}-{random_id()} --local_rank=0',
            stream_output=True)
def run_launcher():
  import ncluster

  job = ncluster.make_job('tf_adder', num_tasks=2)
  job.upload(__file__)

  sender, receiver = job.tasks
  if ncluster.get_backend() == 'aws':
    # on AWS we are probably running in the DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  job.tasks[1].run(f'python tf_adder.py --role=receiver {ip_config}', non_blocking=True)
  job.tasks[0].run(f'python tf_adder.py --role=sender {ip_config}')
def launcher():
  job = ncluster.make_job(**vars(args))
  job.run('sudo apt install -y iperf3 nload')

  tasks = job.tasks
  for i in range(args.num_procs):
    ip = tasks[0].ip
    port = 6006 + i
    tag = f"s{i}"
    tasks[0].switch_window(i)
    tasks[0].run(f'sudo iperf3 -s -p {port}', non_blocking=True)
    tasks[1].switch_window(i)
    tasks[1].run(f'sudo iperf3 -T {tag} -c {ip} -P {args.flows_per_proc} -i 1 -t {args.duration_sec} -V -p {port}',
                 non_blocking=True)
def main():
  supported_regions = ['us-west-2', 'us-east-1', 'us-east-2']
  assert ncluster.get_region() in supported_regions, f"required AMI {IMAGE_NAME} has only been made available in regions {supported_regions}, but your current region is {ncluster.get_region()} (set $AWS_DEFAULT_REGION)"
  assert args.machines in schedules, f"{args.machines} not supported, only support {schedules.keys()}"

  os.environ['NCLUSTER_AWS_FAST_ROOTDISK'] = '1'  # use io2 disk on AWS

  job = ncluster.make_job(name=args.name,
                          run_name=f"{args.name}-{args.machines}",
                          num_tasks=args.machines,
                          image_name=IMAGE_NAME,
                          instance_type=INSTANCE_TYPE)
  job.rsync('.')
  # job.upload('setup.sh')
  # job.upload('worker_requirements.txt')  # todo(y): replace with rsync
  job.run('bash setup.sh')
  # job.upload('training')
  job.run(f'source activate pytorch_source')

  nccl_params = get_nccl_params(args.machines, NUM_GPUS)

  # Training script args
  default_params = [
      '~/data/imagenet',
      '--fp16',
      '--logdir', job.logdir,
      '--name', args.name,
      '--distributed',
      '--init-bn0',
      '--no-bn-wd',
  ]

  params = ['--phases', schedules[args.machines]]
  training_params = default_params + params
  training_params = ' '.join(map(format_params, training_params))

  # TODO: simplify args processing, or give link to actual commands run
  for i, task in enumerate(job.tasks):
    dist_params = f'--nproc_per_node=8 --nnodes={args.machines} --node_rank={i} --master_addr={job.tasks[0].ip} --master_port={6006}'
    cmd = f'{nccl_params} python -m torch.distributed.launch {dist_params} training/train_imagenet_nv.py {training_params}'
    task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
    task.run(cmd, non_blocking=True)

  print(f"Logging to {job.logdir}")
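# Several launchers in this file build the training command line by mapping
# `format_params` over a mixed list of flags and values. Its definition is not
# included in this section; the sketch below is a hypothetical, minimal version
# of such a helper (assumed behavior: stringify every argument and quote any
# value the shell would otherwise split), not the repo's actual implementation.
def format_params(arg):
  """Illustrative sketch: render one training argument as a shell-safe string."""
  arg = str(arg)
  if ' ' in arg or '{' in arg:
    return f'"{arg}"'  # quote values containing spaces or braces
  return arg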
def main():
  assert args.machines in schedules, f"{args.machines} not supported, only support {schedules.keys()}"

  # since we are using a configurable name for the conda env, modify the install
  # script to run inside that conda env
  install_script = open(INSTALL_SCRIPT_FN).read()
  install_script = f'source activate {ENV_NAME}\n' + install_script

  os.environ['NCLUSTER_AWS_FAST_ROOTDISK'] = '1'
  job = ncluster.make_job(name=args.name,
                          run_name=args.name,
                          num_tasks=args.machines,
                          image_name=IMAGE_NAME,
                          instance_type=INSTANCE_TYPE,
                          install_script=install_script,
                          preemptible=args.preemptible)
  job.upload('training')
  job.run(f'source activate {ENV_NAME}')

  world_size = NUM_GPUS * args.machines
  nccl_args = launch_utils_lib.get_nccl_args(args.machines, NUM_GPUS)

  # Training script args
  default_params = [
      '~/data/imagenet',
      '--fp16',
      '--logdir', job.logdir,
      '--distributed',
      '--init-bn0',
      '--no-bn-wd',
  ]

  params = ['--phases', schedules[args.machines]]
  training_args = default_params + params
  training_args = ' '.join(map(launch_utils_lib.format_args, training_args))

  # TODO: simplify args processing
  # Run tasks
  task_cmds = []
  for i, task in enumerate(job.tasks):
    dist_args = f'--nproc_per_node=8 --nnodes={args.machines} --node_rank={i} --master_addr={job.tasks[0].ip} --master_port={6006}'
    cmd = f'{nccl_args} python -m torch.distributed.launch {dist_args} training/train_imagenet_nv.py {training_args}'
    task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
    task.run(cmd, non_blocking=True)

  print(f"Logging to {job.logdir}")
def main():
  # job launches are asynchronous, can spin up multiple jobs in parallel
  job = ncluster.make_job('ray', num_tasks=2, install_script=INSTALL_SCRIPT)
  job.join()

  head_task = job.tasks[0]
  head_task.run(f"ray start --head --redis-port={DEFAULT_PORT}")

  slave_task = job.tasks[1]
  slave_task.run(f"ray start --redis-address={head_task.ip}:{DEFAULT_PORT}")

  script_name = os.path.basename(BENCHMARK_URL)
  slave_task.run("rm -f " + script_name)
  slave_task.run("wget " + BENCHMARK_URL)
  slave_task.run("python " + script_name)

  print("To see results:")
  print(slave_task.connect_instructions)
def run_launcher():
  import ncluster

  job = ncluster.make_job('tf_adder_tb', num_tasks=2)
  job.upload(__file__)
  this_file = os.path.basename(__file__)

  sender, receiver = job.tasks
  if ncluster.get_backend() == 'aws':
    # on AWS we are probably running in the DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  job.tasks[1].run(f'python {this_file} --role=receiver {ip_config}', non_blocking=True)
  job.tasks[0].run(f'python {this_file} --role=sender --logdir={job.logdir} {ip_config}')
  job.tasks[0].run(f'tensorboard --logdir={job.logdir}/..', non_blocking=True)
  print(f"Benchmark done, tensorboard at http://{job.tasks[0].public_ip}:6006")
def run_launcher():
  import ncluster

  if args.aws:
    ncluster.set_backend('aws')

  job = ncluster.make_job('tf_adder', num_tasks=2, image_name=args.image)
  job.upload(__file__)

  sender, receiver = job.tasks
  if ncluster.get_backend() == 'aws':
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  receiver.run(f'python tf_adder.py --role=receiver {ip_config}', non_blocking=True)
  sender.run(f'python tf_adder.py --role=sender {ip_config} --iters={args.iters}')
def main():
  job = ncluster.make_job(name=args.name,
                          instance_type=args.instance_type,
                          num_tasks=2,
                          disk_size=1000,
                          image_name=args.image_name)

  public_keys = {}
  for task in job.tasks:
    key_fn = '~/.ssh/id_rsa'
    task.run(f"yes | ssh-keygen -t rsa -f {key_fn} -N ''")
    public_keys[task] = task.read(key_fn + '.pub')

  for task1 in job.tasks:
    task1.run("""sudo bash -c 'echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config'""")
    for task2 in job.tasks:
      # allow passwordless SSH from task1 to task2
      task2.run(f'echo "{public_keys[task1]}" >> ~/.ssh/authorized_keys')
def launcher():
  job = ncluster.make_job(name=args.name,
                          num_tasks=args.machines,
                          image_name=args.image_name,
                          instance_type=args.instance_type,
                          spot=not args.nospot)
  print(f"Logging to {job.logdir}")

  tasks = job.tasks
  for i in range(args.num_procs):
    ip = tasks[0].ip
    port = 6006 + i
    tag = f"s{i}"
    tasks[0].switch_window(i)
    tasks[0].run(f'sudo iperf3 -s -p {port}', non_blocking=True)
    tasks[1].switch_window(i)
    tasks[1].run(f'sudo iperf3 -T {tag} -c {ip} -P {args.flows_per_proc} -i 1 -t {args.duration_sec} -V -p {port}',
                 non_blocking=True)
def main(): supported_regions = ['us-west-2', 'us-east-1', 'us-east-2', 'local'] assert ncluster.get_region( ) in supported_regions, f"required AMI {IMAGE_NAME} has only been made available in regions {supported_regions}, but your current region is {ncluster.get_region()}" assert args.machines in schedules, f"{args.machines} not supported, only support {schedules.keys()}" job = ncluster.make_job( name=args.name, run_name=f"{args.name}-{args.machines}", num_tasks=args.machines, image_name=IMAGE_NAME, instance_type=INSTANCE_TYPE, ) job.upload('training') if ncluster.get_region() == 'local': job.run('conda activate main') # specific to image else: job.run(f'conda activate fastai') nccl_params = get_nccl_params(args.machines, NUM_GPUS) # Training script args default_params = [ # '--load', f'/ncluster/models/{args.name}.pth', # '--save', f'/ncluster/models/{args.name}.pth' ] # params = ['--phases', schedules[args.machines]] params = [] training_params = default_params + params training_params = ' '.join(map(format_params, training_params)) train_script = 'training/train.py' # TODO: simplify args processing, or give link to actual commands run for i, task in enumerate(job.tasks): dist_params = f'--nproc_per_node={NUM_GPUS} --nnodes={args.machines} --node_rank={i} --master_addr={job.tasks[0].ip} --master_port={6006}' cmd = f'{nccl_params} python -m torch.distributed.launch {dist_params} {train_script} {training_params}' # task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd') # save command-line task.run(cmd, non_blocking=True)
def main(): supported_regions = ['us-west-2', 'us-east-1', 'us-east-2', 'local'] assert ncluster.get_region( ) in supported_regions, f"required AMI {IMAGE_NAME} has only been made available in regions {supported_regions}, but your current region is {ncluster.get_region()}" job = ncluster.make_job( name=args.name, run_name=f"{args.name}-{args.machines}", num_tasks=args.machines, image_name=IMAGE_NAME, instance_type=INSTANCE_TYPE, ) job.upload('src') job.upload('scripts') job.run(f'conda activate midi') job.run('cd scripts') nccl_params = get_nccl_params(args.machines, NUM_GPUS) # Training script args default_params = [ # '--load', f'/ncluster/models/{args.name}.pth', '--path', '~/data/midi/v10/midi_encode/' ] params = [ '--save', f'large_single/lq/1_ep44', '--cache', 'tmp/lq', '--batch_size', '8', '--large', '--single_stream', '--epochs', '44', '--lr', '.008' ] training_params = default_params + params training_params = ' '.join(map(format_params, training_params)) train_script = 'run_txl_npenc.py' # TODO: simplify args processing, or give link to actual commands run for i, task in enumerate(job.tasks): dist_params = f'--nproc_per_node={NUM_GPUS} --nnodes={args.machines} --node_rank={i} --master_addr={job.tasks[0].ip} --master_port={6006}' cmd = f'{nccl_params} python -m torch.distributed.launch {dist_params} {train_script} {training_params}' # task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd') # save command-line task.run(cmd, non_blocking=True)
def run_launcher():
  import ncluster

  if args.aws:
    ncluster.set_backend('aws')

  if args.nightly:
    # running locally on MacOS
    if 'Darwin' in util.ossystem('uname') and not args.aws:
      install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
    else:
      install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
  else:
    install_script = 'pip install ray'

  job = ncluster.make_job(name=args.name,
                          install_script=install_script,
                          image_name=args.image,
                          num_tasks=args.num_workers + args.num_ps)
  if not ncluster.running_locally():
    job._run_raw('killall python', ignore_errors=True)

  job.upload(__file__)
  job.upload('util.py')
  if args.xray:
    job.run('export RAY_USE_XRAY=1')
  job.run('ray stop')

  head = job.tasks[0]

  # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
  worker_resource = """--resources='{"worker": 1}'"""
  head.run(f"ray start --head {worker_resource} --redis-port=6379")

  for task in job.tasks[1:]:
    task.run(f"ray start --redis-address={head.ip}:6379 {worker_resource}")

  head.run(f'python {__file__} --role=driver --ip={head.ip}:6379 --size-mb={args.size_mb} --iters={args.iters} --num-workers={args.num_workers} --num-ps={args.num_ps}')
  print(head.read('out'))
def run_launcher():
  import ncluster

  if args.aws:
    ncluster.set_backend('aws')

  job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
  job.upload(__file__)
  job.upload('util.py')

  # kill stale python processes in case tmux session reuse is on
  if not ncluster.running_locally():
    job._run_raw('killall python', ignore_errors=True)

  if ncluster.get_backend() == 'aws':
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  hosts = [task.public_ip for task in job.tasks]
  host_str = ','.join(hosts)
  os.system(f'mpirun -np 2 --host {host_str} python {__file__} --role=worker')
  print(job.tasks[0].read('/tmp/out'))
def launcher():
  job = ncluster.make_job('worker', instance_type=args.instance_type, num_tasks=2)
  job.join()

  print("Job ready for connection, to connect to most recent task:")
  print("../connect " + args.name)
  print("Alternatively run")
  print(job.connect_instructions)
  print()

  job.upload(__file__)
  job.run('source activate pytorch_p36')

  script_name = os.path.basename(__file__)
  job.tasks[0].run_async(
      f'python {script_name} --internal-role=worker --rank=0 --size=2 --master-addr={job.tasks[0].ip}')
  job.tasks[1].run_async(
      f'python {script_name} --internal-role=worker --rank=1 --size=2 --master-addr={job.tasks[0].ip}')
def main(): assert args.machines in schedules, f"{args.machines} not supported, only support {schedules.keys()}" os.environ['NCLUSTER_AWS_FAST_ROOTDISK'] = '1' # use io2 disk on AWS job = ncluster.make_job(name=args.name, run_name=f"{args.name}-{args.machines}", num_tasks=args.machines, image_name=IMAGE_NAME, instance_type=INSTANCE_TYPE, install_script=open('setup.sh').read()) job.upload('training') job.run(f'source activate pytorch_source') nccl_params = get_nccl_params(args.machines, NUM_GPUS) # Training script args default_params = [ '~/data/imagenet', '--fp16', '--logdir', job.logdir, '--distributed', '--init-bn0', '--no-bn-wd', ] params = ['--phases', schedules[args.machines]] training_params = default_params + params training_params = ' '.join(map(format_params, training_params)) # TODO: simplify args processing, or give link to actual commands run for i, task in enumerate(job.tasks): dist_params = f'--nproc_per_node=8 --nnodes={args.machines} --node_rank={i} --master_addr={job.tasks[0].ip} --master_port={6006}' cmd = f'{nccl_params} python -m torch.distributed.launch {dist_params} training/train_imagenet_nv.py {training_params}' task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd') # save command-line task.run(cmd, async=True) print(f"Logging to {job.logdir}")
def run_launcher():
  import ncluster

  job = ncluster.make_job(args.name,
                          num_tasks=2,
                          image_name=args.image,
                          instance_type=args.instance_type)
  job.upload(__file__)
  job.upload('util.py')

  # kill stale python processes in case tmux session reuse is on
  if not ncluster.running_locally():
    job._run_raw('killall python', ignore_errors=True)

  if ncluster.get_backend() == 'aws':
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    # TODO(y): switch to PyTorch-enabled env
    job.run('source activate tensorflow_p36')

  # TODO(y): this should be the private ip
  hosts = [task.ip for task in job.tasks]
  host_str = ','.join(hosts)
  os.system(f'/usr/local/mpi/bin/mpirun -np 2 --host {host_str} python {__file__} --role=worker')
  print(job.tasks[0].read('/tmp/out'))
def launcher():
  # todo: flag for skip setup
  import ncluster

  job = ncluster.make_job(**vars(args))
  print(f"Logging to {job.logdir}")

  nccl_params = _get_nccl_params()

  # pass through launcher params to worker script
  assert '--role=launcher' in sys.argv, "how did you get here?"
  worker_params = sys.argv[1:]
  worker_params.remove('--role=launcher')
  worker_params.extend([f'--logdir {job.logdir}'])
  worker_params = ' '.join(worker_params)  # pass through all args

  dist_params0 = (f'--nproc_per_node={args.nproc_per_node} '
                  f'--nnodes={args.num_tasks} '
                  f'--master_addr={job.tasks[0].ip} '
                  f'--master_port={6016} ')

  job.rsync('.')
  worker_script_fn = os.path.basename(__file__)  # remote location

  job.run(f'killall -9 python || echo skipping && source activate {args.conda_env}')
  for i, task in enumerate(job.tasks):
    dist_params = dist_params0 + f'--node_rank={i} '
    cmd = (f'{nccl_params} python -m torch.distributed.launch {dist_params} {worker_script_fn} '
           f'{worker_params} ')
    task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
    task.run(cmd, non_blocking=True)

  job.tasks[0].join()
  print(job.tasks[0].output)
def main():
  start_time = time.time()

  # 1. Create infrastructure
  supported_regions = [
      'cn-huhehaote', 'cn-zhangjiakou', 'cn-shanghai', 'cn-hangzhou', 'cn-beijing'
  ]
  assert ncluster.get_region() in supported_regions, f"required AMI {IMAGE_NAME} has only been made available in regions {supported_regions}, but your current region is {ncluster.get_region()} (set $ALIYUN_DEFAULT_REGION)"

  ncluster_globals.set_should_disable_nas(True)

  job = ncluster.make_job(name=args.name,
                          run_name=f"{args.name}-{args.machines}",
                          num_tasks=args.machines,
                          disable_nas=True,
                          spot=True,
                          instance_type=INSTANCE_TYPE)

  # 2. Upload perseus bert code.
  job.run('yum install -y unzip')
  job.upload('perseus-bert')
  job.run('conda activate tensorflow_1.14_cu10.0_py36')

  # 3. Download pretrained model and dataset.
  BERT_CHINESE_BASE_DIR = '/root/chinese_L-12_H-768_A-12'
  DATA_DIR = '/root/toutiao_data'
  job.run('wget -c -t 10 https://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/chinese_L-12_H-768_A-12.zip && unzip chinese_L-12_H-768_A-12.zip')
  job.run('wget -c -t 10 https://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/toutiao_data.tgz && tar xvf toutiao_data.tgz')

  # 4. Run the training job.
  job.run('cd perseus-bert')
  hosts = [task.ip + f':{NUM_GPUS}' for task in job.tasks]
  host_str = ','.join(hosts)

  mpi_cmd = [
      'mpirun --allow-run-as-root',
      f'-np {args.machines * NUM_GPUS}',
      f'--npernode {NUM_GPUS}',
      f'--host {host_str}',
      '--bind-to none',
      '-x NCCL_DEBUG=INFO',
      '-x PATH',
      '-x PYTHONPATH',
      '-x LD_LIBRARY_PATH',
      '-x XLA_FLAGS'
  ]

  bert_classifier_cmd = [
      'python run_classifier.py',
      '--task_name=news',
      '--do_train=true',
      '--do_eval=true',
      f'--data_dir={DATA_DIR}',
      f'--vocab_file={BERT_CHINESE_BASE_DIR}/vocab.txt',
      f'--bert_config_file={BERT_CHINESE_BASE_DIR}/bert_config.json',
      f'--init_checkpoint={BERT_CHINESE_BASE_DIR}/bert_model.ckpt',
      '--max_seq_length=128',
      '--train_batch_size=48',
      '--learning_rate=8e-5',
      '--num_train_epochs=3.0',
      '--warmup_proportion=0.8',
      '--output_dir=/root/output_dir',
      '--use_amp=true',
      '--use_perseus=true',
      '--use_xla=true'
  ]

  cmd = mpi_cmd + bert_classifier_cmd
  cmd = " ".join(cmd)
  job.tasks[0].run(f'echo {cmd} > {job.logdir}/task-cmd')
  job.tasks[0].run(cmd, non_blocking=True)
  print(f"Logging to {job.logdir}")

  elapsed_time = time.time() - start_time
  print(f'training deploy time is: {elapsed_time} s.')

  job.stop()
def main():
  config = AttrDefault(lambda: None, config_defaults)
  assert args.config in globals(), f"unknown config {args.config}"
  config.update(eval(args.config))

  job = ncluster.make_job(name=args.name,
                          run_name=f"{args.name}",
                          num_tasks=config.machines,
                          image_name=config.image_name,
                          instance_type=config.instance_type,
                          spot=not args.nospot,
                          skip_setup=args.skip_setup)
  job.rsync('.')
  job.run(f'killall python || echo failed && '  # kill previous run
          f'source activate {config.conda_env} && ' +
          f'pip install -r requirements.txt')

  instance_info = ncluster.aws_backend.INSTANCE_INFO[config.instance_type]
  num_gpus_per_machine = instance_info['gpus']

  total_gpus = num_gpus_per_machine * config.machines
  global_batch_size = config.batch_size * total_gpus

  # linear LR scaling (https://arxiv.org/abs/1706.02677)
  lr = config.base_lr * (global_batch_size / BASE_LR_BATCHSIZE)

  # TODO(y): change dataset location to /data/transformer-xl-data after image is cut
  # worker parameters with training setup
  worker_params = {
      'seed': 1111,
      'data': 'data/wikitext-103',
      'dataset': 'wt103',
      'adaptive': True,
      'log_interval': 100,
      'eval_interval': 1000,
      'logdir': job.logdir,
      'lr': lr,
      'fp16': True,
      'dynamic_loss_scale': True,
      'batch_size': config.batch_size,
  }

  if config.architecture == 'wt103_large':
    worker_params.update(wt103_large)
  elif config.architecture == 'wt103_base':
    worker_params.update(wt103_base)
  else:
    assert False, f"Unknown architecture {config.architecture}"

  nccl_params = f'NCCL_DEBUG=VERSION NCCL_MIN_NRINGS={config.num_rings} '

  for i, task in enumerate(job.tasks):
    dist_params = \
        f'--nproc_per_node={num_gpus_per_machine} ' \
        f'--nnodes={config.machines} --node_rank={i} ' \
        f'--master_addr={job.tasks[0].ip} --master_port={6016} '
    cmd = f'{nccl_params} python -m torch.distributed.launch {dist_params} ' \
          f'train.py {util.dict_to_args(worker_params)}'
    task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
    task.run(cmd, non_blocking=True)

  print(f"Logging to {job.logdir}")

  if args.launch_tensorboard:
    task = ncluster.make_task('tensorboard',
                              instance_type='r5.large',
                              image_name=args.image_name)
    task.run('source activate tensorflow_p36')
    task.run(f'tensorboard --logdir={ncluster.get_logdir_root()} --port=6006',
             non_blocking=True)
    print(f'TensorBoard at http://{task.public_ip}:6006')
def main():
  if args.image_name == 'pytorch.imagenet.source.v7':
    supported_regions = ['us-west-2', 'us-east-1', 'us-east-2']
    assert ncluster.get_region() in supported_regions, f"required AMI {args.image_name} has only been made available in regions {supported_regions}, but your current region is {ncluster.get_region()} (set $AWS_DEFAULT_REGION)"

  assert args.machines in schedules, f"{args.machines} not supported, only support {schedules.keys()}"

  if args.mount_imagenet:
    datadir = '/data/imagenet'
  else:
    datadir = '~/data/imagenet'
    os.environ['NCLUSTER_AWS_FAST_ROOTDISK'] = '1'  # use io2 disk on AWS

  if args.num_tasks >= 16:
    assert args.simple_ring_setup, "must use --simple_ring_setup, otherwise NCCL_RINGS env var exceeds cmd-line limit"

  job = ncluster.make_job(
      name=args.name,
      run_name=args.run_name,
      num_tasks=args.machines,
      image_name=args.image_name,
      instance_type=args.instance_type,
      disk_size=500,
      spot=args.spot,
      skip_setup=args.skip_setup,
  )

  task0 = job.tasks[0]
  _logdir = task0.logdir  # workaround for race condition in creating logdir

  config = {}
  for key in os.environ:
    if re.match(r"^NCLUSTER", key):
      config['env_' + key] = os.getenv(key)
  config.update(vars(args))

  CUDA_HOME = f'/usr/local/cuda'
  EFA_HOME = f'/opt/amazon/efa'
  MPI_HOME = EFA_HOME
  NPROC_PER_NODE = args.nproc_per_node
  assert NPROC_PER_NODE <= task0.num_gpus, f"requested {NPROC_PER_NODE} processes, but only {task0.num_gpus} gpus present"
  NUM_GPUS = NPROC_PER_NODE * args.num_tasks

  config['NUM_GPUS'] = NUM_GPUS
  config['internal_id'] = u.get_account_number()
  config['internal_alias'] = u.get_account_name()
  config['region'] = u.get_region()
  config['zone'] = u.get_zone()
  config['launch_user'] = os.environ.get('USER', '')
  config['cmd'] = ' '.join(sys.argv)
  config['launcher_conda'] = util.ossystem('echo ${CONDA_PREFIX:-"$(dirname $(which conda))/../"}')
  config['launcher_cmd'] = 'python ' + ' '.join(sys.argv)
  config['logdir'] = job.logdir

  pickled_config = util.text_pickle(config)
  if args.log_all_workers:
    job.write(args.internal_config_fn, pickled_config)
  else:
    job.tasks[0].write(args.internal_config_fn, pickled_config)

  if args.mount_imagenet:
    assert u.get_zone(), "Must specify zone when reusing EBS volumes"
    mount_imagenet(job)

  if not args.skip_setup:
    job.run('rm -f *.py')  # remove files baked into imagenet18 release image
    job.run('conda init')  # missing .bashrc
    job.run(f'{{ source activate {args.conda_env} && bash setup.sh && pip install -U protobuf ; }} && {{ killall python || echo hi ; }} ')
    if args.pytorch_nightly:
      job.run('conda install -y -c pytorch pytorch-nightly && bash setup.sh')
  else:
    job.run([f'source ~/.bashrc && conda activate {args.conda_env}',
             f'killall python || echo hi'])

  job.rsync('.')

  if args.efa:
    assert 'efa' in args.image_name  # make sure we use EFA-enabled image
    hosts_str, hosts_file_str = util.setup_mpi(job, skip_ssh_setup=args.skip_setup)
    if not args.skip_setup:
      task0.write(HOSTS_SLOTS_FN, hosts_file_str)

  env_params = get_nccl_params(args.machines, args.nproc_per_node)
  if args.cuda_debug:
    env_params += 'CUDA_LAUNCH_BLOCKING=1 NCCL_DEBUG=INFO '
  else:
    env_params += 'NCCL_DEBUG=INFO '

  env_params += " OMP_NUM_THREADS=1 "
  if args.pytorch_use_spawn:
    assert args.pytorch_nightly
    env_params += " PYTORCH_USE_SPAWN=1 "
  if 'WANDB_API_KEY' in os.environ:
    env_params += f" WANDB_API_KEY={os.environ.get('WANDB_API_KEY')} "

  # Training script args
  default_params = [
      datadir,
      '--fp16',
      '--logdir', job.logdir,
      '--name', f'{args.run_name}-{util.random_id()}',
      '--distributed',
      '--init-bn0',
      '--no-bn-wd',
      '--log_all_workers', args.log_all_workers,
  ]

  params = ['--phases', util.text_pickle(schedules[args.machines])]
  training_params = default_params + params
  training_params = ' '.join(map(format_params, training_params))

  if not args.efa:
    # TODO: simplify args processing, or give link to actual commands run
    for i, task in enumerate(job.tasks):
      dist_params = f'--nproc_per_node={args.nproc_per_node} --nnodes={args.machines} --node_rank={i} --master_addr={job.tasks[0].ip} --master_port={6006}'
      cmd = f'{env_params} python -m torch.distributed.launch {dist_params} training/train_imagenet_nv.py {training_params}'
      task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
      task.run(cmd, non_blocking=True)
  else:
    FI_PROVIDER = 'efa'
    if args.pseudo_efa:
      FI_PROVIDER = 'sockets'

    local_env = util.format_env_export(
        LOCAL_RANK='$OMPI_COMM_WORLD_LOCAL_RANK',
        RANK='$OMPI_COMM_WORLD_RANK',
        WORLD_SIZE='$OMPI_COMM_WORLD_SIZE',
        MASTER_ADDR=task0.ip,
        MASTER_PORT=6016)

    mpi_env = util.format_env_x(
        FI_PROVIDER=FI_PROVIDER,  # Enables running nccl-tests using EFA provider.
        FI_OFI_RXR_RX_COPY_UNEXP=1,  # Disables using bounce buffers for unexpected messages.
        FI_OFI_RXR_RX_COPY_OOO=1,  # Disables using bounce buffers for out of order messages.
        FI_EFA_MR_CACHE_ENABLE=1,  # Enables memory region caching.
        FI_OFI_RXR_INLINE_MR_ENABLE=1,  # Enables inline memory registration of data buffers.
        NCCL_TREE_THRESHOLD=10 * 4294967296,  # force tree for everything under 40GB
        LD_LIBRARY_PATH=f'{CUDA_HOME}/lib:{CUDA_HOME}/lib64:{EFA_HOME}/lib64',
        NCCL_DEBUG='INFO',
        OMP_NUM_THREADS=1,
        WANDB_API_KEY=os.environ.get('WANDB_API_KEY', ''),
        PYTORCH_USE_SPAWN=args.pytorch_use_spawn,
        NO_WANDB=args.pytorch_use_spawn,
    )

    if args.no_op:
      worker_script_fn = 'training/env_test.py'
    else:
      worker_script_fn = 'training/train_imagenet_nv.py'

    local_cmd = [
        f"{local_env} && source ~/.bashrc && conda activate {args.conda_env} && ",
        f'python {worker_script_fn} {training_params} --local_rank=$OMPI_COMM_WORLD_LOCAL_RANK'
    ]
    local_cmd = ' '.join(local_cmd)

    cmd = [
        f"{MPI_HOME}/bin/mpirun -n {NUM_GPUS} -N {NPROC_PER_NODE} --hostfile {HOSTS_SLOTS_FN} ",
        f'{mpi_env} ',
        f'--mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 ',
        f'--bind-to none ',
        f"bash -c '{local_cmd}'"
    ]
    cmd = ' '.join(cmd)

    task0.run(cmd, non_blocking=True)

  print(f"Logging to {job.logdir}")