def run_launcher():
  import scluster

  if args.aws:
    scluster.set_backend('aws')

  script = os.path.basename(__file__)
  assert script in os.listdir('.')
  job = scluster.make_job(install_script='pip install ray',
                          image_name=args.image,
                          instance_type='c5.large',
                          num_tasks=2)
  job.upload(script)
  job.run('export RAY_USE_XRAY=1')
  job.run('ray stop')

  # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
  ps_resource = """--resources='{"ps": 1}'"""
  worker_resource = """--resources='{"worker": 1}'"""
  ps, worker = job.tasks
  ps.run(f"ray start --head {ps_resource} --redis-port=6379")
  worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")
  worker.run(f'./{script} --role=driver --ip={ps.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}')

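# The launchers in this excerpt all reference a module-level `args`; the
# scripts they come from presumably build it with argparse at import time.
# A minimal sketch of that assumed setup -- flag names are inferred from
# usage in this file, defaults are illustrative, not the originals:
import argparse
import os
import time

parser = argparse.ArgumentParser()
parser.add_argument('--aws', action='store_true', help='run on AWS instead of locally')
parser.add_argument('--name', type=str, default='benchmark', help='job/task name')
parser.add_argument('--image', type=str, default='Deep Learning AMI (Ubuntu) Version 15.0')
parser.add_argument('--role', type=str, default='launcher',
                    help='launcher/driver/sender/receiver/worker')
parser.add_argument('--iters', type=int, default=10)
parser.add_argument('--size-mb', type=int, default=100)
parser.add_argument('--shards', type=int, default=1)
parser.add_argument('--nightly', action='store_true', help='use nightly Ray wheels')
parser.add_argument('--xray', action='store_true', help='enable RAY_USE_XRAY')
args = parser.parse_args()
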
def run_launcher():
  import scluster

  if args.aws:
    scluster.set_backend('aws')

  job = scluster.make_job(args.name, num_tasks=2, image_name=args.image)
  job.upload(__file__)
  job.upload('util.py')
  sender, receiver = job.tasks

  # kill python just for when tmux session reuse is on
  if not scluster.running_locally():
    sender._run_raw('killall python', ignore_errors=True)
    receiver._run_raw('killall python', ignore_errors=True)

  if scluster.get_backend() == 'aws':
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  receiver.run(f'python {__file__} --role=receiver {ip_config}', non_blocking=True)
  sender.run(f'python {__file__} --role=sender {ip_config} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}')
  print(sender.read('out'))

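# The sender/receiver processes started above are not shown in this excerpt.
# A rough sketch of what such a pair typically looks like with TF1's
# distributed runtime -- job names and ports here are assumptions, not the
# original benchmark code:
import tensorflow as tf

def receiver_sketch(sender_ip, receiver_ip, port=8000):
  # both processes share one cluster spec; the receiver just serves tensors
  cluster = tf.train.ClusterSpec({'sender': [f'{sender_ip}:{port}'],
                                  'receiver': [f'{receiver_ip}:{port + 1}']})
  server = tf.train.Server(cluster, job_name='receiver', task_index=0)
  server.join()  # block forever; the sender drives the session
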
def run_launcher():
  import scluster

  scluster.util.assert_script_in_current_directory()

  if args.aws:
    scluster.set_backend('aws')

  # use 4GB instance, 0.5GB not enough
  worker = scluster.make_task(args.name, image_name=args.image,
                              instance_type='t3.medium')
  worker.upload(__file__)
  worker.upload('util.py')

  # kill python just for when tmux session reuse is on
  if not scluster.running_locally():
    worker._run_raw('killall python', ignore_errors=True)
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    worker.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={worker.ip} --receiver-ip={worker.ip}'
  worker.run(f'python {__file__} --role=receiver {ip_config}', non_blocking=True)
  worker.switch_window(1)  # run in new tmux window
  if not scluster.running_locally():
    worker.run('source activate tensorflow_p36')
  worker.run(f'python {__file__} --role=sender {ip_config} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}')
  print(worker.read('out'))

def run_launcher():
  import scluster

  if args.aws:
    scluster.set_backend('aws')

  if args.nightly:
    # running locally on MacOS
    if 'Darwin' in util.ossystem('uname') and not args.aws:
      install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
    else:
      install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
  else:
    install_script = 'pip install ray'

  worker = scluster.make_task(name=args.name,
                              install_script=install_script,
                              image_name=args.image)
  if not scluster.running_locally():
    worker._run_raw('killall python', ignore_errors=True)
  worker.upload(__file__)
  worker.upload('util.py')
  if args.xray:
    worker.run('export RAY_USE_XRAY=1')
  worker.run('ray stop')
  resources = """--resources='{"ps": 1, "worker": 1}'"""
  worker.run(f"ray start --head {resources} --redis-port=6379")
  # worker.run(f"ray start --redis-address={worker.ip}:6379 {resources}")
  worker.run(f'./{__file__} --role=driver --ip={worker.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}')
  print(worker.read('out'))

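# The --role=driver branch is not shown here. Presumably it connects to the
# Redis address passed above and pins work to the custom "ps"/"worker"
# resources registered with `ray start`. A hedged sketch in the Ray 0.5-era
# API -- class and method names are illustrative:
import ray

@ray.remote(resources={'ps': 1})  # schedules this actor on the "ps" node
class ParameterServerSketch:
  def __init__(self):
    self.params = None

  def push(self, params):
    self.params = params

  def pull(self):
    return self.params

def driver_sketch(redis_address):
  ray.init(redis_address=redis_address)  # ray>=1.0 renamed this to `address`
  ps = ParameterServerSketch.remote()
  ray.get(ps.push.remote([0.0]))
  print(ray.get(ps.pull.remote()))
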
def launcher():
  import scluster

  if args.aws:
    scluster.set_backend('aws')

  job = scluster.make_job(args.name, num_tasks=2, image_name=args.image)
  job.upload(__file__)
  job.upload('util.py')
  if args.aws:
    job.run('source activate pytorch_p36')
  else:
    job.run('source deactivate')
    job.run('source activate ncluster-test3')

  script_name = os.path.basename(__file__)
  common_args = f'--size=2 --master-addr={job.tasks[0].ip} --iters={args.iters} --size-mb={args.size_mb}'
  job.tasks[0].run(f'python {script_name} --role=worker --rank=0 ' + common_args,
                   non_blocking=True)
  job.tasks[1].run(f'python {script_name} --role=worker --rank=1 ' + common_args,
                   non_blocking=True)
  job.tasks[0].join()
  print(job.tasks[0].read('out'))

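# The --role=worker branch is not shown here. Presumably the two workers
# rendezvous via torch.distributed and time collective ops. A minimal sketch
# of that worker side -- backend, port, and tensor contents are assumptions:
import torch
import torch.distributed as dist

def worker_sketch(rank, world_size, master_addr, size_mb, port=6000):
  dist.init_process_group('gloo',
                          init_method=f'tcp://{master_addr}:{port}',
                          rank=rank, world_size=world_size)
  # size_mb MiB of float32 values (4 bytes each), allreduced across workers
  tensor = torch.ones(size_mb * 256 * 1024)
  dist.all_reduce(tensor)
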
def main():
  scluster.set_backend('local')
  job = scluster.make_job(num_tasks=2)

  start_time = time.time()
  job.run('sleep 1')
  print(f"waited for {time.time()-start_time} seconds")

def main():
  scluster.set_backend('aws')

  start_time = time.time()
  job = scluster.make_job(num_tasks=16)
  print(f"waited for startup for {time.time()-start_time} seconds")

  start_time = time.time()
  job.run('sleep 10')
  print(f"waited for exec for {time.time()-start_time} seconds")

def run_launcher():
  import scluster

  if args.aws:
    scluster.set_backend('aws')

  job = scluster.make_job('tf_adder', num_tasks=2, image_name=args.image)
  job.upload(__file__)
  sender, receiver = job.tasks
  if scluster.get_backend() == 'aws':
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  receiver.run(f'python tf_adder.py --role=receiver {ip_config}', non_blocking=True)
  sender.run(f'python tf_adder.py --role=sender {ip_config} --iters={args.iters}')

def run_launcher():
  import scluster

  if args.aws:
    scluster.set_backend('aws')

  if args.nightly:
    print(f"uname: {util.ossystem('uname')}")
    # running locally on MacOS
    if 'Darwin' in util.ossystem('uname') and not args.aws:
      install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
      print(f"using MacOS nightly install script: {install_script}")
    else:
      install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
  else:
    install_script = 'pip install ray'

  job = scluster.make_job(name=args.name,
                          install_script=install_script,
                          image_name=args.image,
                          num_tasks=2)
  ps, worker = job.tasks
  if not scluster.running_locally():
    ps._run_raw('killall python', ignore_errors=True)
    worker._run_raw('killall python', ignore_errors=True)

  job.upload(__file__)
  job.upload('util.py')
  if args.xray:
    job.run('export RAY_USE_XRAY=1')
  job.run('ray stop')

  # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
  ps_resource = """--resources='{"ps": 1}'"""
  worker_resource = """--resources='{"worker": 1}'"""
  ps.run(f"ray start --head {ps_resource} --redis-port=6379")
  worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")
  worker.run(f'./{__file__} --role=driver --ip={ps.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}')
  print(worker.read('out'))

def run_launcher():
  import scluster

  if args.aws:
    scluster.set_backend('aws')

  job = scluster.make_job(args.name, num_tasks=2, image_name=args.image)
  job.upload(__file__)
  job.upload('util.py')

  # kill python just for when tmux session reuse is on
  if not scluster.running_locally():
    job._run_raw('killall python', ignore_errors=True)

  if scluster.get_backend() == 'aws':
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  hosts = [task.public_ip for task in job.tasks]
  host_str = ','.join(hosts)
  os.system(f'mpirun -np 2 --host {host_str} python {__file__} --role=worker')
  print(job.tasks[0].read('/tmp/out'))

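# The --role=worker processes started by mpirun are not shown. They
# presumably communicate over MPI and write results to /tmp/out, which the
# launcher reads above. A sketch using mpi4py -- message contents are
# assumptions:
from mpi4py import MPI

def mpi_worker_sketch():
  comm = MPI.COMM_WORLD
  rank = comm.Get_rank()
  data = 'hello from rank 0' if rank == 0 else None
  data = comm.bcast(data, root=0)  # broadcast from rank 0 to all ranks
  with open('/tmp/out', 'a') as f:
    f.write(f'rank {rank} got: {data}\n')
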
def run_launcher():
  import scluster

  if args.aws:
    scluster.set_backend('aws')

  job = scluster.make_job('tf_adder_tb', num_tasks=2, image_name=args.image)
  job.upload(__file__)
  this_file = os.path.basename(__file__)

  sender, receiver = job.tasks
  if scluster.get_backend() == 'aws':
    # on AWS probably are running in DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  job.tasks[1].run(f'python {this_file} --role=receiver {ip_config}', non_blocking=True)
  job.tasks[0].run(f'python {this_file} --role=sender --logdir={job.logdir} {ip_config}')
  job.tasks[0].run(f'tensorboard --logdir={job.logdir}/..', non_blocking=True)
  print(f"Benchmark done, tensorboard at http://{job.tasks[0].public_ip}:6006")

help="instance name") parser.add_argument('--image-name', type=str, default='Deep Learning AMI (Ubuntu) Version 15.0', help="name of AMI to use ") parser.add_argument('--instance-type', type=str, default='p3.2xlarge', help="type of instance") parser.add_argument('--password', default='DefaultNotebookPasswordPleaseChange', help='password to use for jupyter notebook') parser.add_argument("--aws", action="store_true", help="enable to run on AWS") args = parser.parse_args() module_path = os.path.dirname(os.path.abspath(__file__)) if args.aws: scluster.set_backend('aws') def main(): task = scluster.make_task(name=args.name, instance_type=args.instance_type, image_name=args.image_name) # upload notebook config with provided password jupyter_config_fn = _create_jupyter_config(args.password) remote_config_fn = '~/.jupyter/jupyter_notebook_config.py' task.upload(jupyter_config_fn, remote_config_fn) # upload sample notebook and start Jupyter server task.run('mkdir -p /ncluster/notebooks') task.upload(f'{module_path}/gpubox_sample.ipynb', '/ncluster/notebooks/gpubox_sample.ipynb',