Example #1
def run_launcher():
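    """Bring up a two-task Ray cluster (parameter server + worker), start the
    head node on the ps task, and re-run this script on the worker as the Ray
    driver.  Assumes a module-level ``os`` import and argparse ``args``."""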
    import scluster

    if args.aws:
        scluster.set_backend('aws')

    script = os.path.basename(__file__)
    assert script in os.listdir('.')
    job = scluster.make_job(install_script='pip install ray',
                            image_name=args.image,
                            instance_type='c5.large',
                            num_tasks=2)
    job.upload(script)
    job.run('export RAY_USE_XRAY=1')
    job.run('ray stop')

    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    ps_resource = """--resources='{"ps": 1}'"""
    worker_resource = """--resources='{"worker": 1}'"""
    ps, worker = job.tasks
    ps.run(f"ray start --head {ps_resource} --redis-port=6379")
    worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")
    worker.run(
        f'./{script} --role=driver --ip={ps.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}'
    )
Example #2
def test_multiple_logdir_tasks():
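    """Query the logdir of every task of a job concurrently from many threads
    and check that all queries resolve to the same directory.  Assumes
    module-level ``scluster``, ``threading`` and ``random`` imports."""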
    n = 10
    dummy_task = scluster.make_task()
    logdir1 = scluster.get_logdir_root() + '/test1'
    dummy_task.run(f'rm -Rf {logdir1}')
    job = scluster.make_job(run_name='test1', num_tasks=n)

    obtained_logdirs = []

    import wrapt

    @wrapt.synchronized
    def query(i):
        obtained_logdirs.append(job.tasks[i].logdir)

    threads = [threading.Thread(target=query, args=(i, )) for i in range(n)]
    for thread in reversed(threads):
        thread.start()

    random.shuffle(threads)
    for thread in threads:
        thread.join()

    assert len(set(obtained_logdirs)) == 1
    assert obtained_logdirs[0] == logdir1
Example #3
def run_launcher():
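  """Two-task send/receive benchmark: start the receiver non-blocking, run the
  sender to completion and print its 'out' file.  Assumes a module-level
  argparse ``args``."""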
  import scluster
  if args.aws:
    scluster.set_backend('aws')

  job = scluster.make_job(args.name, num_tasks=2, image_name=args.image)
  job.upload(__file__)
  job.upload('util.py')

  sender, receiver = job.tasks
  # kill stray python processes in case a tmux session is being reused
  if not scluster.running_locally():
    sender._run_raw('killall python', ignore_errors=True)
    receiver._run_raw('killall python', ignore_errors=True)

  if scluster.get_backend() == 'aws':
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  receiver.run(f'python {__file__} --role=receiver {ip_config}',
               non_blocking=True)
  sender.run(
    f'python {__file__} --role=sender {ip_config} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}')
  print(sender.read('out'))
Example #4
def launcher():
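  """Launch a two-rank distributed run with task 0 as master, wait for rank 0
  to finish and print its output.  Assumes a module-level ``os`` import and
  argparse ``args``."""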
  import scluster
  
  if args.aws:
    scluster.set_backend('aws')

  job = scluster.make_job(args.name, num_tasks=2, image_name=args.image)
  job.upload(__file__)
  job.upload('util.py')

  if args.aws:
    job.run('source activate pytorch_p36')
  else:
    job.run('source deactivate')
    job.run('source activate ncluster-test3')

  script_name = os.path.basename(__file__)
  common_args = f'--size=2 --master-addr={job.tasks[0].ip} --iters={args.iters} --size-mb={args.size_mb}'
  job.tasks[0].run(f'python {script_name} --role=worker --rank=0 '+common_args,
                   non_blocking=True)
  job.tasks[1].run(f'python {script_name} --role=worker --rank=1 '+common_args,
                   non_blocking=True)

  job.tasks[0].join()
  print(job.tasks[0].read('out'))
Example #5
def main():
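  """Measure how long ``sleep 1`` takes across a two-task job on the local
  backend.  Assumes module-level ``scluster`` and ``time`` imports."""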
  scluster.set_backend('local')

  job = scluster.make_job(num_tasks=2)

  start_time = time.time()
  job.run('sleep 1')
  print(f"waited for {time.time()-start_time} seconds")
Example #6
def main():
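    """Measure startup time of a 16-task AWS job and the time to run
    ``sleep 10`` across it.  Assumes module-level ``scluster`` and ``time``
    imports."""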
    scluster.set_backend('aws')

    start_time = time.time()
    job = scluster.make_job(num_tasks=16)
    print(f"waited for startup for {time.time()-start_time} seconds")

    start_time = time.time()
    job.run('sleep 10')
    print(f"waited for exec for {time.time()-start_time} seconds")
Example #7
def run_launcher():
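    """Launch the two-task tf_adder benchmark: the receiver starts
    non-blocking, the sender runs to completion.  Assumes a module-level
    argparse ``args``."""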
    import scluster
    if args.aws:
        scluster.set_backend('aws')

    job = scluster.make_job('tf_adder', num_tasks=2, image_name=args.image)
    job.upload(__file__)

    sender, receiver = job.tasks
    if scluster.get_backend() == 'aws':
        # on AWS probably running in conda DLAMI, switch into TF-enabled env
        job.run('source activate tensorflow_p36')

    ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
    receiver.run(f'python tf_adder.py --role=receiver {ip_config}',
                 non_blocking=True)
    sender.run(
        f'python tf_adder.py --role=sender {ip_config} --iters={args.iters}')
Example #8
def run_launcher():
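    """Parameter-server/worker Ray benchmark; picks the install script
    (nightly macOS/Linux wheel or plain ``pip install ray``) from the flags,
    then re-runs this script on the worker as the driver.  Assumes a
    module-level ``util`` import and argparse ``args``."""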
    import scluster

    if args.aws:
        scluster.set_backend('aws')

    if args.nightly:
        # pick the macOS nightly wheel when running locally on macOS
        print(f"uname: {util.ossystem('uname')}")
        if 'Darwin' in util.ossystem('uname') and not args.aws:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
            print(f"asdfasdf got install script {install_script}")
        else:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
    else:
        install_script = 'pip install ray'

    job = scluster.make_job(name=args.name,
                            install_script=install_script,
                            image_name=args.image,
                            num_tasks=2)
    ps, worker = job.tasks
    if not scluster.running_locally():
        ps._run_raw('killall python', ignore_errors=True)
        worker._run_raw('killall python', ignore_errors=True)

    job.upload(__file__)
    job.upload('util.py')
    if args.xray:
        job.run('export RAY_USE_XRAY=1')
    job.run('ray stop')

    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    ps_resource = """--resources='{"ps": 1}'"""
    worker_resource = """--resources='{"worker": 1}'"""

    ps.run(f"ray start --head {ps_resource} --redis-port=6379")
    worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")
    worker.run(
        f'./{__file__} --role=driver --ip={ps.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}'
    )
    print(worker.read('out'))
Example #9
def run_launcher():
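    """Run this script under ``mpirun`` (2 processes) across the tasks'
    public IPs and print /tmp/out from task 0.  Assumes a module-level ``os``
    import and argparse ``args``."""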
    import scluster
    if args.aws:
        scluster.set_backend('aws')

    job = scluster.make_job(args.name, num_tasks=2, image_name=args.image)
    job.upload(__file__)
    job.upload('util.py')

    # kill stray python processes in case a tmux session is being reused
    if not scluster.running_locally():
        job._run_raw('killall python', ignore_errors=True)

    if scluster.get_backend() == 'aws':
        # on AWS probably running in conda DLAMI, switch into TF-enabled env
        job.run('source activate tensorflow_p36')

    hosts = [task.public_ip for task in job.tasks]
    host_str = ','.join(hosts)
    os.system(
        f'mpirun -np 2 --host {host_str} python {__file__} --role=worker')
    print(job.tasks[0].read('/tmp/out'))
Example #10
def run_launcher():
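    """tf_adder benchmark with TensorBoard: run the receiver/sender pair,
    then start TensorBoard over the job's log directory and print its URL.
    Assumes a module-level ``os`` import and argparse ``args``."""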
    import scluster

    if args.aws:
        scluster.set_backend('aws')
    job = scluster.make_job('tf_adder_tb', num_tasks=2, image_name=args.image)
    job.upload(__file__)
    this_file = os.path.basename(__file__)

    sender, receiver = job.tasks
    if scluster.get_backend() == 'aws':
        # on AWS probably running in conda DLAMI, switch into TF-enabled env
        job.run('source activate tensorflow_p36')

    ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
    job.tasks[1].run(f'python {this_file} --role=receiver {ip_config}',
                     non_blocking=True)
    job.tasks[0].run(
        f'python {this_file} --role=sender --logdir={job.logdir} {ip_config}')
    job.tasks[0].run(f'tensorboard --logdir={job.logdir}/..',
                     non_blocking=True)
    print(
        f"Benchmark done, tensorboard at http://{job.tasks[0].public_ip}:6006")