示例#1
0
def run_launcher():
    """Bring up a Ray cluster through ncluster and run this script as driver.

    Reads the module-level ``args`` for backend, install variant, job name,
    image, instance type and worker count.  Task 0 is started as the Ray
    head, every other task joins it with a ``worker`` resource, and finally
    this script is executed on the head task with ``--role=driver``.
    """
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    script = os.path.basename(__file__)

    # Choose the pip command that installs Ray on each task.
    if not args.nightly:
        install_script = 'pip install ray'
    elif args.macos:
        install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
    else:
        install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'

    # One extra task on top of the workers hosts the head/driver.
    job_options = dict(name=args.name,
                       install_script=install_script,
                       image_name=args.image,
                       instance_type=args.instance,
                       num_tasks=args.num_workers + 1)
    job = ncluster.make_job(**job_options)
    job.upload(script)
    if args.xray:
        job.run('export RAY_USE_XRAY=1')
    job.run('ray stop')

    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    worker_resource = """--resources='{"worker": 1}'"""
    head = job.tasks[0]
    head.run("ray start --head --redis-port=6379")
    for node in job.tasks[1:]:
        node.run(f"ray start --redis-address={head.ip}:6379 {worker_resource}")

    head.run(f'./{script} --role=driver --ip={head.ip}:6379')
示例#2
0
def run_launcher():
  """Launch a two-task sender/receiver run through ncluster.

  Uploads this file and ``util.py`` to both tasks, starts the receiver role
  without blocking, runs the sender role to completion and prints what the
  sender task reads back from ``out``.
  """
  import ncluster

  if args.aws:
    ncluster.set_backend('aws')

  job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
  for path in (__file__, 'util.py'):
    job.upload(path)

  sender, receiver = job.tasks

  # With tmux session reuse an earlier python process may still be around.
  if not ncluster.running_locally():
    for task in (sender, receiver):
      task._run_raw('killall python', ignore_errors=True)

  # On AWS we are probably on a conda DLAMI; switch into the TF-enabled env.
  if ncluster.get_backend() == 'aws':
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'

  receiver_cmd = f'python {__file__} --role=receiver {ip_config}'
  receiver.run(receiver_cmd, non_blocking=True)

  sender_cmd = (f'python {__file__} --role=sender {ip_config} '
                f'--iters={args.iters} --size-mb={args.size_mb} '
                f'--shards={args.shards}')
  sender.run(sender_cmd)

  print(sender.read('out'))
def run_launcher():
    """Launch a two-task Ray parameter-server benchmark through ncluster.

    Reads the module-level ``args``.  Installs Ray on both tasks, starts the
    Ray head on the ``ps`` task and joins the ``worker`` task to it, then runs
    this script on the worker with ``--role=driver`` and prints what the
    worker task reads back from ``out``.
    """
    import ncluster

    if args.nightly:
        install_script = 'pip install --no-cache-dir -U ray --find-links ' \
                         'https://s3-us-west-2.amazonaws.com/ray-wheels/latest/'
    else:
        install_script = 'pip install -U ray'

    if args.local:
        ncluster.set_backend('local')

    # Every parsed command-line option is forwarded as a keyword argument.
    job = ncluster.make_job(**vars(args))
    job.run(install_script)

    ps, worker = job.tasks
    if not ncluster.running_locally():
        # Clear leftovers from a previous run.  killall needs the process
        # name; the echo fallback keeps the command from failing when no
        # python process exists.
        ps.run('killall python || echo no python found')
        worker.run('killall python || echo no python found')
        job.run('ray stop')

    job.upload(__file__)
    job.upload('util.py')

    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    ps_resource = """--resources='{"ps": 1}'"""
    worker_resource = """--resources='{"worker": 1}'"""

    ps.run(f"ray start --head {ps_resource} --redis-port=6379")
    worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")
    worker.run(f'python {__file__} --role=driver --ip={ps.ip}:6379 '
               f'--hidden_size={args.hidden_size} --num_layers={args.num_layers} '
               f'--iters={args.iters}')
    print(worker.read('out'))
示例#4
0
def launcher():
    """Start a two-rank worker run through ncluster and print rank 0's output.

    Uploads this file and ``util.py``, activates a conda environment that
    depends on the backend, launches ``--role=worker`` with ranks 0 and 1
    without blocking, waits for the first task and prints what it reads
    back from ``out``.
    """
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
    for path in (__file__, 'util.py'):
        job.upload(path)

    # Environment setup differs between the AWS image and other backends.
    if args.aws:
        env_commands = ['source activate pytorch_p36']
    else:
        env_commands = ['source deactivate', 'source activate ncluster-test3']
    for env_command in env_commands:
        job.run(env_command)

    script_name = os.path.basename(__file__)
    common_args = (f'--size=2 --master-addr={job.tasks[0].ip} '
                   f'--iters={args.iters} --size-mb={args.size_mb}')

    # The task index doubles as the worker rank.
    for rank in (0, 1):
        job.tasks[rank].run(
            f'python {script_name} --role=worker --rank={rank} {common_args}',
            non_blocking=True)

    first_task = job.tasks[0]
    first_task.join()
    print(first_task.read('out'))
示例#5
0
def run_launcher():
  """Run receiver and sender roles on one ncluster task, in two tmux windows.

  The receiver is started without blocking in the initial window; the sender
  runs in window 1 against the same IP.  Prints what the task reads back
  from ``out`` afterwards.
  """
  import ncluster
  ncluster.util.assert_script_in_current_directory()

  if args.aws:
    ncluster.set_backend('aws')

  # use 4GB instance, 0.5GB not enough
  worker = ncluster.make_task(args.name,
                              image_name=args.image,
                              instance_type='t3.medium')
  for path in (__file__, 'util.py'):
    worker.upload(path)

  activate_env = 'source activate tensorflow_p36'
  if not ncluster.running_locally():
    # kill python just for when tmux session reuse is on
    worker._run_raw('killall python', ignore_errors=True)
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    worker.run(activate_env)

  ip_config = f'--sender-ip={worker.ip} --receiver-ip={worker.ip}'
  receiver_cmd = f'python {__file__} --role=receiver {ip_config}'
  worker.run(receiver_cmd, non_blocking=True)

  # The new tmux window needs the environment activated again.
  worker.switch_window(1)
  if not ncluster.running_locally():
    worker.run(activate_env)

  sender_cmd = (f'python {__file__} --role=sender {ip_config} '
                f'--iters={args.iters} --size-mb={args.size_mb} '
                f'--shards={args.shards}')
  worker.run(sender_cmd)
  print(worker.read('out'))
示例#6
0
def run_launcher():
    """Launch a two-task Ray ps/worker run on c5.large instances.

    Asserts that this script is in the current directory, uploads it, starts
    the Ray head on the ``ps`` task, joins the ``worker`` task and runs the
    script there with ``--role=driver``.
    """
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    script = os.path.basename(__file__)
    assert script in os.listdir('.')

    job = ncluster.make_job(
        install_script='pip install ray',
        image_name=args.image,
        instance_type='c5.large',
        num_tasks=2,
    )
    job.upload(script)
    for setup_cmd in ('export RAY_USE_XRAY=1', 'ray stop'):
        job.run(setup_cmd)

    ps, worker = job.tasks

    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    ps_flag = """--resources='{"ps": 1}'"""
    worker_flag = """--resources='{"worker": 1}'"""
    ps.run(f"ray start --head {ps_flag} --redis-port=6379")
    worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_flag}")

    driver_cmd = (f'./{script} --role=driver --ip={ps.ip}:6379 '
                  f'--size-mb={args.size_mb} --iters={args.iters}')
    worker.run(driver_cmd)
示例#7
0
def run_launcher():
    """Run a single-task Ray benchmark through ncluster.

    One task hosts the Ray head with both ``ps`` and ``worker`` resources and
    then runs this script with ``--role=driver``.  Prints what the task reads
    back from ``out``.
    """
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    # Choose the pip command; the macOS wheel is used only for a nightly
    # install when `uname` reports Darwin and the AWS backend is not selected.
    if not args.nightly:
        install_script = 'pip install ray'
    elif 'Darwin' in util.ossystem('uname') and not args.aws:
        install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
    else:
        install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'

    worker = ncluster.make_task(
        name=args.name,
        install_script=install_script,
        image_name=args.image,
    )
    if not ncluster.running_locally():
        worker._run_raw('killall python', ignore_errors=True)

    for path in (__file__, 'util.py'):
        worker.upload(path)

    if args.xray:
        worker.run('export RAY_USE_XRAY=1')
    worker.run('ray stop')

    resources = """--resources='{"ps": 1, "worker": 1}'"""
    worker.run(f"ray start --head {resources} --redis-port=6379")

    driver_cmd = (f'./{__file__} --role=driver --ip={worker.ip}:6379 '
                  f'--size-mb={args.size_mb} --iters={args.iters}')
    worker.run(driver_cmd)
    print(worker.read('out'))
示例#8
0
def main():
    """Time ``sleep 1`` on a two-task job using the local backend."""
    ncluster.set_backend('local')

    job = ncluster.make_job(num_tasks=2)

    began = time.time()
    job.run('sleep 1')
    elapsed = time.time() - began
    print(f"waited for {elapsed} seconds")
示例#9
0
def main():
    """Time the startup of a 16-task AWS job and a ``sleep 10`` run on it."""
    ncluster.set_backend('aws')

    # Phase 1: creating the job.
    launch_began = time.time()
    job = ncluster.make_job(num_tasks=16)
    startup_secs = time.time() - launch_began
    print(f"waited for startup for {startup_secs} seconds")

    # Phase 2: running a command on the job.
    exec_began = time.time()
    job.run('sleep 10')
    exec_secs = time.time() - exec_began
    print(f"waited for exec for {exec_secs} seconds")
示例#10
0
def run_launcher():
  """Launch the two-task ``tf_adder`` job: receiver in background, then sender.

  Uploads this file to both tasks and runs ``tf_adder.py`` on each with the
  matching ``--role`` and both task IPs.
  """
  import ncluster

  if args.aws:
    ncluster.set_backend('aws')

  job = ncluster.make_job('tf_adder', num_tasks=2, image_name=args.image)
  job.upload(__file__)

  sender, receiver = job.tasks

  # on AWS probably running in conda DLAMI, switch into TF-enabled env
  if ncluster.get_backend() == 'aws':
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'

  receiver_cmd = f'python tf_adder.py --role=receiver {ip_config}'
  receiver.run(receiver_cmd, non_blocking=True)

  sender_cmd = f'python tf_adder.py --role=sender {ip_config} --iters={args.iters}'
  sender.run(sender_cmd)
示例#11
0
def run_launcher():
    """Launch a multi-task Ray job and run this script as driver on the head.

    The job has ``args.num_workers + args.num_ps`` tasks.  Task 0 starts the
    Ray head and the remaining tasks join it; every node advertises a
    ``worker`` resource.  Prints what the head task reads back from ``out``.
    """
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    # Choose the pip command; the macOS wheel is used only for a nightly
    # install when `uname` reports Darwin and the AWS backend is not selected.
    if not args.nightly:
        install_script = 'pip install ray'
    elif 'Darwin' in util.ossystem('uname') and not args.aws:
        install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
    else:
        install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'

    job = ncluster.make_job(
        name=args.name,
        install_script=install_script,
        image_name=args.image,
        num_tasks=args.num_workers + args.num_ps,
    )
    if not ncluster.running_locally():
        job._run_raw('killall python', ignore_errors=True)

    for path in (__file__, 'util.py'):
        job.upload(path)

    if args.xray:
        job.run('export RAY_USE_XRAY=1')
    job.run('ray stop')

    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    worker_resource = """--resources='{"worker": 1}'"""

    head = job.tasks[0]
    head.run(f"ray start --head {worker_resource} --redis-port=6379")
    for node in job.tasks[1:]:
        node.run(f"ray start --redis-address={head.ip}:6379 {worker_resource}")

    driver_cmd = (f'python {__file__} --role=driver --ip={head.ip}:6379 '
                  f'--size-mb={args.size_mb} --iters={args.iters} '
                  f'--num-workers={args.num_workers} --num-ps={args.num_ps}')
    head.run(driver_cmd)

    print(head.read('out'))
示例#12
0
def run_launcher():
    """Create a two-task job and drive it with a locally invoked ``mpirun``.

    Uploads this file and ``util.py`` to the tasks, then calls ``mpirun`` via
    ``os.system`` with the tasks' public IPs as hosts and prints what the
    first task reads back from ``/tmp/out``.
    """
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
    for path in (__file__, 'util.py'):
        job.upload(path)

    # kill python just for when tmux session reuse is on
    if not ncluster.running_locally():
        job._run_raw('killall python', ignore_errors=True)

    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    if ncluster.get_backend() == 'aws':
        job.run('source activate tensorflow_p36')

    host_str = ','.join(task.public_ip for task in job.tasks)
    mpi_cmd = f'mpirun -np 2 --host {host_str} python {__file__} --role=worker'
    os.system(mpi_cmd)

    print(job.tasks[0].read('/tmp/out'))
示例#13
0
def run_launcher():
    """Launch the two-task ``tf_adder_tb`` job and start TensorBoard.

    The receiver role runs in the background on the second task; the sender
    role runs on the first task with ``--logdir`` set to the job's logdir.
    TensorBoard is then started on the first task and its URL is printed.
    """
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    job = ncluster.make_job('tf_adder_tb', num_tasks=2, image_name=args.image)
    job.upload(__file__)
    this_file = os.path.basename(__file__)

    sender, receiver = job.tasks

    # on AWS probably are running in DLAMI, switch into TF-enabled env
    if ncluster.get_backend() == 'aws':
        job.run('source activate tensorflow_p36')

    ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'

    receiver_cmd = f'python {this_file} --role=receiver {ip_config}'
    receiver.run(receiver_cmd, non_blocking=True)

    sender_cmd = f'python {this_file} --role=sender --logdir={job.logdir} {ip_config}'
    sender.run(sender_cmd)

    # TensorBoard is pointed at the parent of the job's logdir.
    sender.run(f'tensorboard --logdir={job.logdir}/..', non_blocking=True)

    tensorboard_url = f"http://{sender.public_ip}:6006"
    print(f"Benchmark done, tensorboard at {tensorboard_url}")
#!/usr/bin/env python

import argparse
import ncluster
import os
import time
from ncluster import ncluster_globals

# Launch parameters: instance type and GPU count.
INSTANCE_TYPE = 'ecs.gn6v-c10g1.20xlarge'
NUM_GPUS = 8

# Select the Aliyun backend before any tasks are created.
ncluster.set_backend('aliyun')

# Command-line interface of the launcher.
parser = argparse.ArgumentParser()
parser.add_argument('--name',
                    type=str,
                    default='fastgpu-perseus-bert',
                    help="name of the current run, used for machine naming "
                         "and tensorboard visualization")
parser.add_argument('--machines', type=int, default=1,
                    help="how many machines to use")
args = parser.parse_args()


def main():
    start_time = time.time()
    # 1. Create infrastructure
示例#15
0
#!/usr/bin/env python

import argparse
import ncluster
import os

# Launch parameters: image name, instance type and GPU count.
IMAGE_NAME = 'pytorch.imagenet.source.v7'
INSTANCE_TYPE = 'p3.16xlarge'
NUM_GPUS = 8

# Select the AWS backend before any tasks are created.
ncluster.set_backend('aws')

# Command-line interface of the launcher.
parser = argparse.ArgumentParser()
parser.add_argument('--name',
                    type=str,
                    default='imagenet',
                    help="name of the current run, used for machine naming "
                         "and tensorboard visualization")
parser.add_argument('--machines', type=int, default=16,
                    help="how many machines to use")
args = parser.parse_args()

# Reference run kept from the original author:
# 109:12 to 93.00
# events: https://s3.amazonaws.com/yaroslavvb/logs/imagenet-1
# logs: https://s3.amazonaws.com/yaroslavvb/logs/imagenet1.tar
lr = 1.0
# NOTE(review): presumably per-image-size scale factors relative to 512;
# confirm against the code that consumes them.
scale_224 = 224 / 512
scale_288 = 128 / 512
示例#16
0
def main():
    """Launch a distributed ``train.py`` run on AWS through ncluster.

    The run configuration comes either from a module-level config named by
    ``args.config`` or from the ``--instance_type`` / ``--machines`` flags.
    The function creates the job, syncs the current directory, installs
    requirements, derives the learning rate from the global batch size, and
    starts ``torch.distributed.launch`` on every task without blocking.

    Raises:
        AssertionError: if the config name is invalid or unknown, if a config
            is combined with the instance/machine flags, or if neither a
            config nor both flags are given.
    """
    ncluster.set_backend('aws')

    if args.config:
        assert not args.instance_type, "specify instance_type as part of config"
        assert not args.machines, "specify number of machines as part of config"
        # The whole name must consist of word characters, not just a prefix.
        assert re.fullmatch(r'\w+', args.config), \
            f'invalid config name {args.config!r}'
        assert args.config in globals(), f'no config called {args.config}'
        # Direct lookup of the named module-level config; avoids evaluating
        # command-line text as code.
        config = globals()[args.config]

    else:  # setting config vars through command-line flags
        assert args.instance_type
        assert args.machines
        config = {'base_lr': 0.000125 * 5 / 3,
                  'local_batch_size': 96,
                  'instance_type': args.instance_type,
                  'machines': args.machines}

    # Easier access to dictionary entries.
    # NOTE(review): presumably missing keys default to str() == '' (falsy),
    # which the optional-key checks below rely on -- confirm.
    config = AttrDefault(str, config)
    config.image_name = IMAGE_NAME
    config.conda_env = CONDA_ENV

    # Command-line overrides for environment and image.
    if args.conda_env:
        config.conda_env = args.conda_env
        print("Using non-standard conda env ", config.conda_env)
    if args.image_name:
        config.image_name = args.image_name
        print("Using non-standard image ", config.image_name)

    instance_info = ncluster.aws_backend.INSTANCE_INFO[config.instance_type]
    num_gpus_per_machine = instance_info['gpus']

    job = ncluster.make_job(name=args.name,
                            run_name=f"{args.name}",
                            num_tasks=config.machines,
                            image_name=config.image_name,
                            instance_type=config.instance_type,
                            spot=not args.nospot,
                            skip_setup=args.skip_setup)

    job.rsync('.')
    job.run(f'killall python || echo failed && '  # kill previous run
            f'source activate {config.conda_env} && ' +
            f'pip install -r requirements.txt')

    local_batch_size = config.local_batch_size
    base_lr = config.base_lr

    # One worker process per GPU across all machines.
    num_workers = num_gpus_per_machine * config.machines
    global_batch_size = local_batch_size * num_workers
    print("using global batch ", global_batch_size)  # 512=8*32*2*1

    # linear LR scaling (https://arxiv.org/abs/1706.02677)
    lr = base_lr * (global_batch_size / BASE_LR_BATCHSIZE)

    # worker parameters with training setup
    worker_params = {
        'seed': 1111,
        'data': 'data/wikitext-103',
        'dataset': 'wt103',
        'adaptive': True,
        'log_interval': 100,
        'eval_interval': 500,
        'max_tokens': int(1.5e9),
        'logdir': job.logdir,
        'lr': lr,
        'batch_size': local_batch_size,
        'eta_min': lr / 10,
    }

    worker_params.update(LARGE_ARGS if config.large else SMALL_ARGS)

    user_params = {}
    # pass through some user-provided settings that were arguments to the launcher script
    if args.checkpoint_each_epoch:
        user_params['checkpoint_each_epoch'] = args.checkpoint_each_epoch
    if config.warmup_tokens:
        user_params['warmup_tokens'] = config.warmup_tokens

    if args.checkpoint or config.checkpoint:
        user_params['checkpoint'] = util.one_of([args.checkpoint, config.checkpoint])

    if args.wiki:
        worker_params.update({
            'data': 'data/wikiextracted',
            'dataset': 'wiki',
            'dropatt': 0.1,
            'dropout': 0.1,
        })

    if args.bpe:
        worker_params.update({
            'div_val': 1,
            'bpe': True,
            'adaptive': False,
        })

    # User-provided settings are applied after the wiki/bpe presets, and
    # config-level extras after that, so later updates override earlier ones.
    worker_params.update(user_params)

    if config.extra_worker_params:
        worker_params.update(config.extra_worker_params)

    nccl_params = _get_nccl_params()

    # The first task acts as the rendezvous master; the task index is the
    # node rank.
    for i, task in enumerate(job.tasks):
        dist_params = \
            f'--nproc_per_node={num_gpus_per_machine} ' \
            f'--nnodes={config.machines} --node_rank={i} ' \
            f'--master_addr={job.tasks[0].ip} --master_port=6016'
        cmd = f'{nccl_params} python -m torch.distributed.launch {dist_params} train.py {dict_to_args(worker_params)}'
        task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
        task.run(cmd, non_blocking=True)

    print(f"Logging to {job.logdir}")
#!/usr/bin/env python
# Run crashing TensorFlow SVD example

import ncluster
# Select the AWS backend immediately after importing ncluster.
ncluster.set_backend('aws')

import argparse
# Command-line options: --instance and --image, each with a default value.
parser = argparse.ArgumentParser(description='launch')
parser.add_argument('--instance', default='c5.9xlarge')
parser.add_argument('--image', default="Deep Learning AMI (Amazon Linux) Version 13.0")
args = parser.parse_args()

def main():
  """Run ``tensorflow_svd_crash.py`` on a new task and print its output.

  Creates one task with the instance type and image from ``args``, activates
  the ``tensorflow_p36`` environment, uploads the script, runs it and prints
  the captured stdout and stderr.
  """
  task = ncluster.make_task(
    instance_type=args.instance,
    image_name=args.image,
  )
  task.run('source activate tensorflow_p36')
  task.upload('tensorflow_svd_crash.py')

  out, err = task.run_with_output('python tensorflow_svd_crash.py')
  print(out, err)


if __name__ == '__main__':
  main()
示例#18
0
    '--name',
    type=str,
    default='txl',
    help=
    "name of the current run, used for machine naming and tensorboard visualization"
)
parser.add_argument('--machines',
                    type=int,
                    default=1,
                    help="how many machines to use")
# --local opts out of AWS: the AWS backend is selected below only when this
# flag is absent.
parser.add_argument("--local",
                    action="store_true",
                    help="run locally instead of on AWS")
args = parser.parse_args()

if not args.local:
    ncluster.set_backend('aws')


# routines to build NCCL ring orders
def get_nccl_params(num_tasks, num_gpus):
    """Return the NCCL environment-variable prefix for a launch command.

    Args:
        num_tasks: number of tasks in the job; values of 1 or less get only
            the debug setting.
        num_gpus: accepted for interface compatibility; not used.

    Returns:
        A space-separated string of ``NAME=value`` assignments.
    """
    debug_flag = 'NCCL_DEBUG=VERSION'
    ring_flags = 'NCCL_MIN_NRINGS=4 NCCL_SINGLE_RING_THRESHOLD=10'
    return debug_flag if num_tasks <= 1 else f'{ring_flags} {debug_flag}'


def format_params(arg):
    """Render *arg* as text, wrapping lists and dicts in double quotes.

    Lists and dicts (including subclasses) are converted with ``str`` and
    enclosed in ``"``; every other value is returned as ``str(arg)``.
    """
    if not isinstance(arg, (list, dict)):
        return str(arg)
    return f'"{arg!s}"'