Example #1
def run_launcher():
    import os
    import scluster

    if args.aws:
        scluster.set_backend('aws')

    script = os.path.basename(__file__)
    assert script in os.listdir('.')
    job = scluster.make_job(install_script='pip install ray',
                            image_name=args.image,
                            instance_type='c5.large',
                            num_tasks=2)
    job.upload(script)
    job.run('export RAY_USE_XRAY=1')
    job.run('ray stop')

    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    ps_resource = """--resources='{"ps": 1}'"""
    worker_resource = """--resources='{"worker": 1}'"""
    ps, worker = job.tasks
    ps.run(f"ray start --head {ps_resource} --redis-port=6379")
    worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")
    worker.run(
        f'./{script} --role=driver --ip={ps.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}'
    )
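All of these launchers read a module-level `args` object that the snippets never show. A minimal sketch of the assumed scaffolding (the flag set and defaults are hypothetical, inferred from usage; later examples add flags such as `--nightly`, `--xray`, and `--shards` the same way):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--aws', action='store_true', help='run on AWS instead of locally')
parser.add_argument('--image', type=str, default='Deep Learning AMI (Ubuntu) Version 15.0')  # hypothetical default
parser.add_argument('--name', type=str, default='benchmark')  # hypothetical default
parser.add_argument('--iters', type=int, default=10)          # hypothetical default
parser.add_argument('--size-mb', type=int, default=100)       # argparse exposes this as args.size_mb
args = parser.parse_args()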
Example #2
def run_launcher():
  import scluster
  if args.aws:
    scluster.set_backend('aws')

  job = scluster.make_job(args.name, num_tasks=2, image_name=args.image)
  job.upload(__file__)
  job.upload('util.py')

  sender, receiver = job.tasks
  # kill any leftover python processes, in case tmux session reuse is enabled
  if not scluster.running_locally():
    sender._run_raw('killall python', ignore_errors=True)
    receiver._run_raw('killall python', ignore_errors=True)

  if scluster.get_backend() == 'aws':
    # on AWS we are probably running in the conda DLAMI, so switch into the TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  receiver.run(f'python {__file__} --role=receiver {ip_config}',
               non_blocking=True)
  sender.run(
    f'python {__file__} --role=sender {ip_config} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}')
  print(sender.read('out'))
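The same file serves as both launcher and worker: `job.upload(__file__)` ships it to each task, which then re-runs it with `--role=sender` or `--role=receiver`. A sketch of the dispatch this implies at the bottom of the script (`run_sender` and `run_receiver` are hypothetical names):

def main():
  if args.role == 'launcher':
    run_launcher()
  elif args.role == 'sender':
    run_sender()      # hypothetical worker entry point
  elif args.role == 'receiver':
    run_receiver()    # hypothetical worker entry point
  else:
    assert False, f'unknown role: {args.role}'

if __name__ == '__main__':
  main()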
Example #3
def run_launcher():
    import scluster
    scluster.util.assert_script_in_current_directory()

    if args.aws:
        scluster.set_backend('aws')

    # use 4GB instance, 0.5GB not enough
    worker = scluster.make_task(args.name,
                                image_name=args.image,
                                instance_type='t3.medium')
    worker.upload(__file__)
    worker.upload('util.py')

    # kill any leftover python processes, in case tmux session reuse is enabled
    if not scluster.running_locally():
        worker._run_raw('killall python', ignore_errors=True)
        # on AWS we are probably running in the conda DLAMI, so switch into the TF-enabled env
        worker.run('source activate tensorflow_p36')

    ip_config = f'--sender-ip={worker.ip} --receiver-ip={worker.ip}'
    worker.run(f'python {__file__} --role=receiver {ip_config}',
               non_blocking=True)
    worker.switch_window(1)  # run in new tmux window
    if not scluster.running_locally():
        worker.run('source activate tensorflow_p36')
    worker.run(
        f'python {__file__} --role=sender {ip_config} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}'
    )
    print(worker.read('out'))
Example #4
def run_launcher():
  import scluster
  import util  # local helper module, uploaded alongside this script

  if args.aws:
    scluster.set_backend('aws')

  if args.nightly:
    # running locally on macOS
    if 'Darwin' in util.ossystem('uname') and not args.aws:
      install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
    else:
      install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
  else:
    install_script = 'pip install ray'

  worker = scluster.make_task(name=args.name,
                              install_script=install_script,
                              image_name=args.image)
  if not scluster.running_locally():
    worker._run_raw('killall python', ignore_errors=True)
  worker.upload(__file__)
  worker.upload('util.py')
  if args.xray:
    worker.run('export RAY_USE_XRAY=1')
  worker.run('ray stop')

  resources = """--resources='{"ps": 1, "worker": 1}'"""
  worker.run(f"ray start --head {resources} --redis-port=6379")
  #  worker.run(f"ray start --redis-address={worker.ip}:6379 {resources}")
  worker.run(
    f'./{__file__} --role=driver --ip={worker.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}')
  print(worker.read('out'))
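In the `--role=driver` branch, the script would attach to the Ray head it just started and target the labeled resources. A minimal sketch against the Ray 0.5 API (the function below is illustrative, not from the source):

import ray

def run_driver():
  ray.init(redis_address=args.ip)  # args.ip carries 'host:6379' from the launcher

  @ray.remote(resources={'worker': 1})
  def ping():
    return 'pong'

  print(ray.get(ping.remote()))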
Example #5
def launcher():
  import os
  import scluster
  
  if args.aws:
    scluster.set_backend('aws')

  job = scluster.make_job(args.name, num_tasks=2, image_name=args.image)
  job.upload(__file__)
  job.upload('util.py')

  if args.aws:
    job.run('source activate pytorch_p36')
  else:
    job.run('source deactivate')
    job.run('source activate ncluster-test3')

  script_name = os.path.basename(__file__)
  common_args = f'--size=2 --master-addr={job.tasks[0].ip} --iters={args.iters} --size-mb={args.size_mb}'
  job.tasks[0].run(f'python {script_name} --role=worker --rank=0 '+common_args,
                   non_blocking=True)
  job.tasks[1].run(f'python {script_name} --role=worker --rank=1 '+common_args,
                   non_blocking=True)

  job.tasks[0].join()
  print(job.tasks[0].read('out'))
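On the worker side, `--master-addr`, `--rank`, and `--size` map straight onto `torch.distributed` initialization. A sketch of the assumed setup (the port is hypothetical; the source does not show it):

import torch.distributed as dist

def run_worker():
  dist.init_process_group('gloo',
                          init_method=f'tcp://{args.master_addr}:6000',  # hypothetical port
                          rank=args.rank,
                          world_size=args.size)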
Example #6
def main():
  import time
  import scluster
  scluster.set_backend('local')

  job = scluster.make_job(num_tasks=2)

  start_time = time.time()
  job.run('sleep 1')
  print(f"waited for {time.time()-start_time} seconds")
Example #7
def main():
    import time
    import scluster
    scluster.set_backend('aws')

    start_time = time.time()
    job = scluster.make_job(num_tasks=16)
    print(f"waited for startup for {time.time()-start_time} seconds")

    start_time = time.time()
    job.run('sleep 10')
    print(f"waited for exec for {time.time()-start_time} seconds")
Example #8
def run_launcher():
    import scluster
    if args.aws:
        scluster.set_backend('aws')

    job = scluster.make_job('tf_adder', num_tasks=2, image_name=args.image)
    job.upload(__file__)

    sender, receiver = job.tasks
    if scluster.get_backend() == 'aws':
        # on AWS we are probably running in the conda DLAMI, so switch into the TF-enabled env
        job.run('source activate tensorflow_p36')

    ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
    receiver.run(f'python tf_adder.py --role=receiver {ip_config}',
                 non_blocking=True)
    sender.run(
        f'python tf_adder.py --role=sender {ip_config} --iters={args.iters}')
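Inside tf_adder.py, the two roles would typically build a TF 1.x cluster from the IPs passed in `ip_config`. A sketch under that assumption (the port is hypothetical):

import tensorflow as tf

def make_server(role):
    port = 8000  # hypothetical port
    cluster = tf.train.ClusterSpec({
        'sender': [f'{args.sender_ip}:{port}'],
        'receiver': [f'{args.receiver_ip}:{port + 1}'],
    })
    return tf.train.Server(cluster, job_name=role, task_index=0)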
Example #9
def run_launcher():
    import scluster
    import util  # local helper module, uploaded alongside this script

    if args.aws:
        scluster.set_backend('aws')

    if args.nightly:
        print(f"detected platform: {util.ossystem('uname')}")
        # running locally on macOS
        if 'Darwin' in util.ossystem('uname') and not args.aws:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
            print(f"using macOS nightly wheel: {install_script}")
        else:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
    else:
        install_script = 'pip install ray'

    job = scluster.make_job(name=args.name,
                            install_script=install_script,
                            image_name=args.image,
                            num_tasks=2)
    ps, worker = job.tasks
    if not scluster.running_locally():
        ps._run_raw('killall python', ignore_errors=True)
        worker._run_raw('killall python', ignore_errors=True)

    job.upload(__file__)
    job.upload('util.py')
    if args.xray:
        job.run('export RAY_USE_XRAY=1')
    job.run('ray stop')

    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    ps_resource = """--resources='{"ps": 1}'"""
    worker_resource = """--resources='{"worker": 1}'"""

    ps.run(f"ray start --head {ps_resource} --redis-port=6379")
    worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")
    worker.run(
        f'./{__file__} --role=driver --ip={ps.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}'
    )
    print(worker.read('out'))
Example #10
def run_launcher():
    import os
    import scluster
    if args.aws:
        scluster.set_backend('aws')

    job = scluster.make_job(args.name, num_tasks=2, image_name=args.image)
    job.upload(__file__)
    job.upload('util.py')

    # kill any leftover python processes, in case tmux session reuse is enabled
    if not scluster.running_locally():
        job._run_raw('killall python', ignore_errors=True)

    if scluster.get_backend() == 'aws':
        # on AWS we are probably running in the conda DLAMI, so switch into the TF-enabled env
        job.run('source activate tensorflow_p36')

    hosts = [task.public_ip for task in job.tasks]
    host_str = ','.join(hosts)
    os.system(
        f'mpirun -np 2 --host {host_str} python {__file__} --role=worker')
    print(job.tasks[0].read('/tmp/out'))
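Here `mpirun` fans the script out to both hosts with `--role=worker`; each rank would discover its identity via MPI and write to `/tmp/out`, which the launcher then reads. A sketch of the assumed worker branch, using mpi4py:

from mpi4py import MPI

def run_worker():
    comm = MPI.COMM_WORLD
    with open('/tmp/out', 'a') as f:
        f.write(f'hello from rank {comm.Get_rank()} of {comm.Get_size()}\n')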
Example #11
def run_launcher():
    import os
    import scluster

    if args.aws:
        scluster.set_backend('aws')
    job = scluster.make_job('tf_adder_tb', num_tasks=2, image_name=args.image)
    job.upload(__file__)
    this_file = os.path.basename(__file__)

    sender, receiver = job.tasks
    if scluster.get_backend() == 'aws':
        # on AWS we are probably running in the DLAMI, so switch into the TF-enabled env
        job.run('source activate tensorflow_p36')

    ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
    job.tasks[1].run(f'python {this_file} --role=receiver {ip_config}',
                     non_blocking=True)
    job.tasks[0].run(
        f'python {this_file} --role=sender --logdir={job.logdir} {ip_config}')
    job.tasks[0].run(f'tensorboard --logdir={job.logdir}/..',
                     non_blocking=True)
    print(
        f"Benchmark done, tensorboard at http://{job.tasks[0].public_ip}:6006")
Example #12
                    help="instance name")
parser.add_argument('--image-name', type=str,
                    default='Deep Learning AMI (Ubuntu) Version 15.0',
                    help="name of AMI to use")
parser.add_argument('--instance-type', type=str, default='p3.2xlarge',
                    help="type of instance")
parser.add_argument('--password',
                    default='DefaultNotebookPasswordPleaseChange',
                    help='password to use for jupyter notebook')
parser.add_argument("--aws", action="store_true", help="enable to run on AWS")

args = parser.parse_args()
module_path = os.path.dirname(os.path.abspath(__file__))

if args.aws:
  scluster.set_backend('aws')

def main():
  task = scluster.make_task(name=args.name,
                            instance_type=args.instance_type,
                            image_name=args.image_name)

  # upload notebook config with provided password
  jupyter_config_fn = _create_jupyter_config(args.password)
  remote_config_fn = '~/.jupyter/jupyter_notebook_config.py'
  task.upload(jupyter_config_fn, remote_config_fn)

  # upload sample notebook and start Jupyter server
  task.run('mkdir -p /ncluster/notebooks')
  task.upload(f'{module_path}/gpubox_sample.ipynb',
              '/ncluster/notebooks/gpubox_sample.ipynb')
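  # presumably the notebook server is then started; the command and port below
  # are assumptions, not shown in the source
  task.run('jupyter notebook --notebook-dir=/ncluster/notebooks',
           non_blocking=True)
  print(f'Jupyter at http://{task.public_ip}:8888')  # 8888 is Jupyter's default port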