def test_multiple_logdirs():
    """Check logdir assignment across runs.

    A fresh run name gets the bare logdir; a run whose logdir already
    exists on the machine gets a '.01' de-duplication suffix.
    """
    logdir1 = scluster.get_logdir_root() + '/test1'
    scratch = scluster.make_task()          # helper task for filesystem setup
    scratch.run(f'rm -Rf {logdir1}')        # ensure a clean slate for run 'test1'
    first_run = scluster.make_task(run_name='test1')
    assert first_run.logdir == logdir1

    logdir2 = scluster.get_logdir_root() + '/test2'
    second_run = scluster.make_task(run_name='test2')
    # Pre-create the directory so the logdir machinery must de-duplicate.
    scratch.run(f'rm -Rf {logdir2}*')
    scratch.run(f'mkdir {logdir2}')
    assert second_run.logdir == logdir2 + '.01'
def test_multiple_logdir_tasks():
    """All tasks of one job must report the same logdir, even when queried
    concurrently from many threads."""
    num_tasks = 10
    scratch = scluster.make_task()          # helper task for filesystem setup
    logdir1 = scluster.get_logdir_root() + '/test1'
    scratch.run(f'rm -Rf {logdir1}')
    job = scluster.make_job(run_name='test1', num_tasks=num_tasks)

    obtained_logdirs = []

    import wrapt

    # Serialize appends so the shared list is safe under concurrent access.
    @wrapt.synchronized
    def record(idx):
        obtained_logdirs.append(job.tasks[idx].logdir)

    workers = [threading.Thread(target=record, args=(i,))
               for i in range(num_tasks)]
    # Start in reverse and join in shuffled order to stress ordering assumptions.
    for worker in reversed(workers):
        worker.start()
    random.shuffle(workers)
    for worker in workers:
        worker.join()

    assert len(set(obtained_logdirs)) == 1
    assert obtained_logdirs[0] == logdir1
def run_launcher():
    """Provision a worker, install Ray, start a single-node head, and run
    this script remotely in driver role, then print its output."""
    import scluster

    if args.aws:
        scluster.set_backend('aws')

    # Pick the Ray install command: nightly wheel (platform-specific) or PyPI.
    if args.nightly:
        # Nightly wheels differ between local macOS runs and Linux machines.
        on_local_mac = 'Darwin' in util.ossystem('uname') and not args.aws
        if on_local_mac:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
        else:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
    else:
        install_script = 'pip install ray'

    worker = scluster.make_task(name=args.name,
                                install_script=install_script,
                                image_name=args.image)

    # Kill stale interpreters when reusing a remote machine.
    if not scluster.running_locally():
        worker._run_raw('killall python', ignore_errors=True)

    worker.upload(__file__)
    worker.upload('util.py')

    if args.xray:
        worker.run('export RAY_USE_XRAY=1')
    worker.run('ray stop')

    resources = """--resources='{"ps": 1, "worker": 1}'"""
    worker.run(f"ray start --head {resources} --redis-port=6379")
    worker.run(
        f'./{__file__} --role=driver --ip={worker.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}')
    print(worker.read('out'))
def run_launcher():
    """Launch the transfer benchmark: start a receiver and a sender role of
    this script on one worker (two tmux windows), then print the result."""
    import scluster
    scluster.util.assert_script_in_current_directory()

    if args.aws:
        scluster.set_backend('aws')

    # use 4GB instance, 0.5GB not enough
    worker = scluster.make_task(args.name,
                                image_name=args.image,
                                instance_type='t3.medium')
    worker.upload(__file__)
    worker.upload('util.py')

    # kill python just for when tmux session reuse is on
    if not scluster.running_locally():
        # on AWS probably running in conda DLAMI, switch into TF-enabled env
        worker._run_raw('killall python', ignore_errors=True)
        worker.run('source activate tensorflow_p36')

    # Both roles run on the same machine, so sender and receiver share one IP.
    ip_flags = f'--sender-ip={worker.ip} --receiver-ip={worker.ip}'
    worker.run(f'python {__file__} --role=receiver {ip_flags}',
               non_blocking=True)

    worker.switch_window(1)  # run in new tmux window
    if not scluster.running_locally():
        worker.run('source activate tensorflow_p36')
    worker.run(
        f'python {__file__} --role=sender {ip_flags} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}'
    )
    print(worker.read('out'))
def test():
    """A failing remote command is tolerated with ignore_errors=True and
    raises RuntimeError without it."""
    machine = scluster.make_task(
        image_name=scluster.aws_backend.GENERIC_SMALL_IMAGE)

    # First attempt: swallow the failure, just surface the error message.
    machine.run("mkdir /illegal", non_blocking=True)
    machine.join(ignore_errors=True)

    # Second attempt: the identical failure must now propagate.
    machine.run("mkdir /illegal", non_blocking=True)
    with pytest.raises(RuntimeError):
        machine.join()
def main():
    """Bring up a machine running a password-protected Jupyter server and
    print the URL to reach it."""
    box = scluster.make_task(name=args.name,
                             instance_type=args.instance_type,
                             image_name=args.image_name)

    # upload notebook config with provided password
    jupyter_config_fn = _create_jupyter_config(args.password)
    remote_config_fn = '~/.jupyter/jupyter_notebook_config.py'
    box.upload(jupyter_config_fn, remote_config_fn)

    # upload sample notebook and start Jupyter server
    box.run('mkdir -p /ncluster/notebooks')
    box.upload(f'{module_path}/gpubox_sample.ipynb',
               '/ncluster/notebooks/gpubox_sample.ipynb',
               dont_overwrite=True)
    box.run('cd /ncluster/notebooks')
    box.run('jupyter notebook', non_blocking=True)
    print(f'Jupyter notebook will be at http://{box.public_ip}:8888')
#!/bin/env python
# Two-mode script: with no arguments it provisions a machine, installs
# TensorFlow, and re-runs itself there in "worker" mode; in worker mode it
# runs a small matmul and powers the instance down.
import sys

if not sys.argv[1:]:
    # Launcher mode: provision and re-invoke this file remotely.
    import scluster
    box = scluster.make_task(instance_type='t3.micro')
    box.upload(__file__)
    box.run('pip install tensorflow')
    box.run(f'python {__file__} worker')
elif sys.argv[1] == 'worker':
    # Worker mode: TF1-style session running a 1000x1000 matmul.
    import tensorflow as tf
    import os
    session = tf.Session()
    matrix = tf.ones((1000, 1000))
    product = session.run(tf.matmul(matrix, matrix))
    print(f"matmul gave {product.sum()}")
    # NOTE(review): flag combination `-h -P 10` intends a 10-minute delayed
    # poweroff — exact time-argument parsing depends on the image's shutdown
    # implementation; confirm on the target AMI.
    os.system('sudo shutdown -h -P 10')  # shut down the instance in 10 mins
#!/usr/bin/env python
# Minimal smoke test: bring up a machine with all defaults and show its
# network configuration.
import scluster

# allocate default machine type and default image
machine = scluster.make_task()
net_info = machine.run('ifconfig')
print(f"Task ifconfig returned {net_info}")