def run_launcher():
  import scluster

  if args.aws:
    scluster.set_backend('aws')

  script = os.path.basename(__file__)
  assert script in os.listdir('.')
  job = scluster.make_job(install_script='pip install ray',
                          image_name=args.image,
                          instance_type='c5.large',
                          num_tasks=2)
  job.upload(script)
  job.run('export RAY_USE_XRAY=1')
  job.run('ray stop')

  # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
  ps_resource = """--resources='{"ps": 1}'"""
  worker_resource = """--resources='{"worker": 1}'"""
  ps, worker = job.tasks
  ps.run(f"ray start --head {ps_resource} --redis-port=6379")
  worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")
  worker.run(f'./{script} --role=driver --ip={ps.ip}:6379 '
             f'--size-mb={args.size_mb} --iters={args.iters}')
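
# All of these launchers reference a module-level `args`. Each script defines
# its own parser; the sketch below shows the shared flags, inferred from the
# call sites in this file. Defaults are illustrative assumptions, and
# script-specific flags (--shards, --nightly, --xray, --rank, --size,
# --master-addr, --sender-ip, --receiver-ip, --ip) are added per script.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--role', type=str, default='launcher',
                    help='launcher on the local machine, or the remote role '
                         '(driver/sender/receiver/worker)')
parser.add_argument('--aws', action='store_true', help='run on AWS backend')
parser.add_argument('--image', type=str, default='',
                    help='image/AMI to launch instances from')
parser.add_argument('--name', type=str, default='benchmark')
parser.add_argument('--iters', type=int, default=10)
parser.add_argument('--size-mb', type=int, default=100)
args = parser.parse_args()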
def test_multiple_logdir_tasks():
  import random
  import threading
  import wrapt

  n = 10
  dummy_task = scluster.make_task()
  logdir1 = scluster.get_logdir_root() + '/test1'
  dummy_task.run(f'rm -Rf {logdir1}')
  job = scluster.make_job(run_name='test1', num_tasks=n)

  obtained_logdirs = []

  @wrapt.synchronized
  def query(i):
    obtained_logdirs.append(job.tasks[i].logdir)

  # query logdirs concurrently and in scrambled order to check thread safety
  threads = [threading.Thread(target=query, args=(i,)) for i in range(n)]
  for thread in reversed(threads):
    thread.start()
  random.shuffle(threads)
  for thread in threads:
    thread.join()

  # all tasks of a job must agree on a single logdir
  assert len(set(obtained_logdirs)) == 1
  assert obtained_logdirs[0] == logdir1
def run_launcher():
  import scluster

  if args.aws:
    scluster.set_backend('aws')

  job = scluster.make_job(args.name, num_tasks=2, image_name=args.image)
  job.upload(__file__)
  job.upload('util.py')
  sender, receiver = job.tasks

  # kill leftover python processes, for when tmux session reuse is on
  if not scluster.running_locally():
    sender._run_raw('killall python', ignore_errors=True)
    receiver._run_raw('killall python', ignore_errors=True)

  if scluster.get_backend() == 'aws':
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  receiver.run(f'python {__file__} --role=receiver {ip_config}',
               non_blocking=True)
  sender.run(f'python {__file__} --role=sender {ip_config} '
             f'--iters={args.iters} --size-mb={args.size_mb} '
             f'--shards={args.shards}')
  print(sender.read('out'))
def launcher():
  import scluster

  if args.aws:
    scluster.set_backend('aws')

  job = scluster.make_job(args.name, num_tasks=2, image_name=args.image)
  job.upload(__file__)
  job.upload('util.py')

  if args.aws:
    job.run('source activate pytorch_p36')
  else:
    job.run('source deactivate')
    job.run('source activate ncluster-test3')

  script_name = os.path.basename(__file__)
  common_args = (f'--size=2 --master-addr={job.tasks[0].ip} '
                 f'--iters={args.iters} --size-mb={args.size_mb}')
  job.tasks[0].run(f'python {script_name} --role=worker --rank=0 ' + common_args,
                   non_blocking=True)
  job.tasks[1].run(f'python {script_name} --role=worker --rank=1 ' + common_args,
                   non_blocking=True)
  job.tasks[0].join()
  print(job.tasks[0].read('out'))
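
# Worker-side counterpart for the PyTorch launcher above -- a hedged sketch of
# what the `--role=worker` branch might do, based on the flags passed in
# common_args (--rank, --size, --master-addr, --iters, --size-mb). The port
# and the exact benchmark loop are illustrative assumptions.
def run_worker_sketch():
  import torch
  import torch.distributed as dist

  dist.init_process_group(backend='gloo',
                          init_method=f'tcp://{args.master_addr}:23456',
                          rank=args.rank,
                          world_size=args.size)
  tensor = torch.ones(args.size_mb * 256 * 1024)  # ~size_mb MiB of float32
  for _ in range(args.iters):
    dist.all_reduce(tensor)  # sums `tensor` across both ranks, in place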
def main():
  scluster.set_backend('local')
  job = scluster.make_job(num_tasks=2)

  start_time = time.time()
  job.run('sleep 1')
  print(f"waited for {time.time() - start_time} seconds")
def main():
  scluster.set_backend('aws')

  start_time = time.time()
  job = scluster.make_job(num_tasks=16)
  print(f"waited for startup for {time.time() - start_time} seconds")

  start_time = time.time()
  job.run('sleep 10')
  print(f"waited for exec for {time.time() - start_time} seconds")
def run_launcher():
  import scluster

  if args.aws:
    scluster.set_backend('aws')

  job = scluster.make_job('tf_adder', num_tasks=2, image_name=args.image)
  job.upload(__file__)
  sender, receiver = job.tasks

  if scluster.get_backend() == 'aws':
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  receiver.run(f'python tf_adder.py --role=receiver {ip_config}',
               non_blocking=True)
  sender.run(f'python tf_adder.py --role=sender {ip_config} --iters={args.iters}')
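
# tf_adder.py itself is not shown; as a rough, assumed sketch, each role would
# start a tf.train.Server from a shared ClusterSpec built out of the
# --sender-ip/--receiver-ip flags passed above, so the sender can run ops
# against variables pinned to the receiver. Job names and ports here are
# illustrative, not taken from the original script.
def make_tf_server_sketch(role):
  import tensorflow as tf

  cluster = tf.train.ClusterSpec({
      'sender': [f'{args.sender_ip}:8000'],
      'receiver': [f'{args.receiver_ip}:8001'],
  })
  # one task per job in this two-node setup, so task_index is always 0
  return tf.train.Server(cluster, job_name=role, task_index=0)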
def run_launcher():
  import scluster

  if args.aws:
    scluster.set_backend('aws')

  if args.nightly:
    # pick the nightly wheel matching the local platform
    if 'Darwin' in util.ossystem('uname') and not args.aws:
      install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
    else:
      install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
    print(f"Using nightly install script: {install_script}")
  else:
    install_script = 'pip install ray'

  job = scluster.make_job(name=args.name,
                          install_script=install_script,
                          image_name=args.image,
                          num_tasks=2)
  ps, worker = job.tasks

  # kill leftover python processes, for when tmux session reuse is on
  if not scluster.running_locally():
    ps._run_raw('killall python', ignore_errors=True)
    worker._run_raw('killall python', ignore_errors=True)

  job.upload(__file__)
  job.upload('util.py')
  if args.xray:
    job.run('export RAY_USE_XRAY=1')
  job.run('ray stop')

  # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
  ps_resource = """--resources='{"ps": 1}'"""
  worker_resource = """--resources='{"worker": 1}'"""
  ps.run(f"ray start --head {ps_resource} --redis-port=6379")
  worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")
  worker.run(f'./{__file__} --role=driver --ip={ps.ip}:6379 '
             f'--size-mb={args.size_mb} --iters={args.iters}')
  print(worker.read('out'))
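
# Driver-side counterpart for the Ray launchers above -- a hedged sketch of
# the `--role=driver` branch: connect to the already-running cluster and use
# the custom "ps"/"worker" resources registered in the `ray start` commands to
# place work. `redis_address` is the Ray 0.5-era keyword matching the pinned
# wheels above; the ParameterServer class and push/pull loop are illustrative.
def run_driver_sketch():
  import numpy as np
  import ray

  ray.init(redis_address=args.ip)  # e.g. "172.31.x.y:6379"

  @ray.remote(resources={'ps': 1})
  class ParameterServer:
    def __init__(self, dim):
      self.params = np.zeros(dim, dtype=np.float32)

    def push(self, grad):
      self.params += grad

    def pull(self):
      return self.params

  dim = args.size_mb * 250 * 1000  # ~size_mb MB of float32
  ps = ParameterServer.remote(dim)
  grad = np.ones(dim, dtype=np.float32)
  for _ in range(args.iters):
    ps.push.remote(grad)
    ray.get(ps.pull.remote())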
def run_launcher():
  import scluster

  if args.aws:
    scluster.set_backend('aws')

  job = scluster.make_job(args.name, num_tasks=2, image_name=args.image)
  job.upload(__file__)
  job.upload('util.py')

  # kill leftover python processes, for when tmux session reuse is on
  if not scluster.running_locally():
    job._run_raw('killall python', ignore_errors=True)

  if scluster.get_backend() == 'aws':
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  hosts = [task.public_ip for task in job.tasks]
  host_str = ','.join(hosts)
  os.system(f'mpirun -np 2 --host {host_str} python {__file__} --role=worker')
  print(job.tasks[0].read('/tmp/out'))
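
# Worker-side counterpart for the MPI launcher above. The real worker is
# presumably TensorFlow-based (the launcher activates tensorflow_p36); this is
# only a minimal mpi4py sketch of the mpirun wiring, including the /tmp/out
# file the launcher reads back. mpi4py itself is an assumption.
def run_mpi_worker_sketch():
  from mpi4py import MPI

  comm = MPI.COMM_WORLD
  total = comm.allreduce(comm.rank)  # sums ranks across the 2 processes
  if comm.rank == 0:
    with open('/tmp/out', 'w') as f:
      f.write(f'allreduce over {comm.size} ranks gave {total}\n')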
def run_launcher():
  import scluster

  if args.aws:
    scluster.set_backend('aws')

  job = scluster.make_job('tf_adder_tb', num_tasks=2, image_name=args.image)
  job.upload(__file__)
  this_file = os.path.basename(__file__)
  sender, receiver = job.tasks

  if scluster.get_backend() == 'aws':
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  receiver.run(f'python {this_file} --role=receiver {ip_config}',
               non_blocking=True)
  sender.run(f'python {this_file} --role=sender --logdir={job.logdir} {ip_config}')
  sender.run(f'tensorboard --logdir={job.logdir}/..', non_blocking=True)
  print(f"Benchmark done, tensorboard at http://{sender.public_ip}:6006")