Example #1
def run_launcher():
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    script = os.path.basename(__file__)
    assert script in os.listdir('.')
    job = ncluster.make_job(install_script='pip install ray',
                            image_name=args.image,
                            instance_type='c5.large',
                            num_tasks=2)
    job.upload(script)
    job.run('export RAY_USE_XRAY=1')
    job.run('ray stop')

    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    ps_resource = """--resources='{"ps": 1}'"""
    worker_resource = """--resources='{"worker": 1}'"""
    ps, worker = job.tasks
    ps.run(f"ray start --head {ps_resource} --redis-port=6379")
    worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")
    worker.run(
        f'./{script} --role=driver --ip={ps.ip}:6379 --size-mb={args.size_mb} --iters={args.iters}'
    )
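A minimal sketch (not from the source) of the driver side that Example #1 re-invokes with --role=driver. It assumes the ray 0.5-era ray.init(redis_address=...) API and the same argparse flags (--ip, --size-mb, --iters) the launcher passes; the resources= labels pin the actor to the node started above.

def run_driver_sketch():
    import numpy as np
    import ray

    ray.init(redis_address=args.ip)  # args.ip is "<head-ip>:6379" from the launcher

    # toy parameter server pinned to the node started with --resources='{"ps": 1}'
    @ray.remote(resources={'ps': 1})
    class ParameterServer:
        def __init__(self, size_mb):
            self.params = np.zeros(size_mb * 250_000, dtype=np.float32)  # ~size_mb MB

        def push(self, grad):
            self.params += grad

        def pull(self):
            return self.params

    ps = ParameterServer.remote(args.size_mb)
    grad = np.ones(args.size_mb * 250_000, dtype=np.float32)
    for _ in range(args.iters):
        ray.get(ps.push.remote(grad))
        ray.get(ps.pull.remote())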
Example #2
def run_launcher():
  import ncluster
  if args.aws:
    ncluster.set_backend('aws')

  job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
  job.upload(__file__)
  job.upload('util.py')

  sender, receiver = job.tasks
  # kill any leftover python processes, needed when tmux session reuse is on
  if not ncluster.running_locally():
    sender._run_raw('killall python', ignore_errors=True)
    receiver._run_raw('killall python', ignore_errors=True)

  if ncluster.get_backend() == 'aws':
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  receiver.run(f'python {__file__} --role=receiver {ip_config}',
               non_blocking=True)
  sender.run(
    f'python {__file__} --role=sender {ip_config} --iters={args.iters} --size-mb={args.size_mb} --shards={args.shards}')
  print(sender.read('out'))
Example #3
def launcher():
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
    job.upload(__file__)
    job.upload('util.py')

    if args.aws:
        job.run('source activate pytorch_p36')
    else:
        job.run('source deactivate')
        job.run('source activate ncluster-test3')

    script_name = os.path.basename(__file__)
    common_args = f'--size=2 --master-addr={job.tasks[0].ip} --iters={args.iters} --size-mb={args.size_mb}'
    job.tasks[0].run(f'python {script_name} --role=worker --rank=0 ' +
                     common_args,
                     non_blocking=True)
    job.tasks[1].run(f'python {script_name} --role=worker --rank=1 ' +
                     common_args,
                     non_blocking=True)

    job.tasks[0].join()
    print(job.tasks[0].read('out'))
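A hypothetical sketch of the --role=worker side that Example #3 launches. It assumes torch.distributed with the gloo backend, a made-up rendezvous port, and the flags (--rank, --size, --master-addr, --iters, --size-mb) passed by the launcher above.

def run_worker_sketch():
    import torch
    import torch.distributed as dist

    dist.init_process_group(
        backend='gloo',
        init_method=f'tcp://{args.master_addr}:6000',  # hypothetical port
        rank=args.rank,
        world_size=args.size)

    tensor = torch.ones(args.size_mb * 250_000)  # ~size_mb MB of float32
    for _ in range(args.iters):
        dist.all_reduce(tensor)  # the collective being benchmarked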
Example #4
def main():
    print('start job ...')
    start_time = time.time()

    # 1. create infrastructure
    supported_regions = [
        'cn-huhehaote', 'cn-shanghai', 'cn-zhangjiakou', 'cn-hangzhou',
        'cn-beijing'
    ]
    assert ncluster.get_region(
    ) in supported_regions, f"required AMI {IMAGE_NAME} has only been made available in regions {supported_regions}, but your current region is {ncluster.get_region()} (set $ALIYUN_DEFAULT_REGION)"

    ncluster_globals.set_should_disable_nas(True)

    job = ncluster.make_job(name=args.name,
                            run_name=f"{args.name}-{args.machines}",
                            num_tasks=args.machines,
                            instance_type=INSTANCE_TYPE,
                            disable_nas=True,
                            spot=True,
                            install_script='')

    init_ncluster = time.time()
    print('init ncluster:', init_ncluster - start_time)

    # 2. upload GTC code
    job.run('yum install -y unzip')
    job.upload('GTC')
    job.run(
        'cd GTC && wget http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/gtc-demo/dataset.zip '
        +
        '&& wget http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/gtc-demo/test.JPG '
        +
        '&& wget http://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/gtc-demo/resnet50-19c8e357.pth '
        + '&& conda activate torch_1.3_cu10.0_py36')
    upload_data = time.time()
    print('upload_data time:', upload_data - init_ncluster)

    # 3. prepare the dataset
    job.run('unzip -o dataset.zip')
    unzip_time = time.time()
    print('unzip data:', unzip_time - upload_data)

    # 4. run the training job
    job.tasks[0].run('conda activate torch_1.3_cu10.0_py36')
    job.tasks[0].run('./run-perseus.sh 2>&1 | tee logs.log',
                     non_blocking=False)
    train_time = time.time()
    print('training time:', train_time - unzip_time)

    # 5. run the inference job
    job.tasks[0].run('python inference.py 2>&1 | tee logs.inference.log',
                     non_blocking=False)
    print('inference time:', time.time() - train_time)

    elapsed_time = time.time() - start_time
    print(f'training and inference deploy time is: {elapsed_time} s.')

    # 6. stop the instance (optional)
    job.stop()
Example #5
def test_multiple_logdir_tasks():
  n = 10
  dummy_task = ncluster.make_task()
  logdir1 = ncluster.get_logdir_root() + '/test1'
  dummy_task.run(f'rm -Rf {logdir1}')
  job = ncluster.make_job(run_name='test1', num_tasks=n)

  obtained_logdirs = []

  import wrapt

  @wrapt.synchronized
  def query(i):
    obtained_logdirs.append(job.tasks[i].logdir)

  threads = [threading.Thread(target=query, args=(i,)) for i in range(n)]
  for thread in reversed(threads):
    thread.start()

  random.shuffle(threads)
  for thread in threads:
    thread.join()

  assert len(set(obtained_logdirs)) == 1
  assert obtained_logdirs[0] == logdir1
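For reference, the wrapt.synchronized decorator in the test above serializes concurrent calls to query(); a rough standard-library equivalent (an illustration, not the test's actual code) would guard the append with an explicit lock:

lock = threading.Lock()

def query(i):
    with lock:  # same effect as @wrapt.synchronized: one thread at a time
        obtained_logdirs.append(job.tasks[i].logdir)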
Example #6
def run_launcher():
    import ncluster

    if args.nightly:
        install_script = 'pip install --no-cache-dir -U ray --find-links ' \
                         'https://s3-us-west-2.amazonaws.com/ray-wheels/latest/'
    else:
        install_script = 'pip install -U ray'

    if args.local:
        ncluster.set_backend('local')

    job = ncluster.make_job(**vars(args))
    job.run(install_script)

    ps, worker = job.tasks
    if not ncluster.running_locally():
        ps.run('killall python || echo no python found')
        worker.run('killall python || echo no python found')
        job.run('ray stop') 

    job.upload(__file__)
    job.upload('util.py')

    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    ps_resource = """--resources='{"ps": 1}'"""
    worker_resource = """--resources='{"worker": 1}'"""

    ps.run(f"ray start --head {ps_resource} --redis-port=6379")
    worker.run(f"ray start --redis-address={ps.ip}:6379 {worker_resource}")
    worker.run(f'python {__file__} --role=driver --ip={ps.ip}:6379 '
               f'--hidden_size={args.hidden_size} --num_layers={args.num_layers} '
               f'--iters={args.iters}')
    print(worker.read('out'))
Example #7
def run_launcher():
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    script = os.path.basename(__file__)
    if args.nightly:
        if args.macos:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
        else:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
    else:
        install_script = 'pip install ray'

    job = ncluster.make_job(name=args.name,
                            install_script=install_script,
                            image_name=args.image,
                            instance_type=args.instance,
                            num_tasks=args.num_workers + 1)
    job.upload(script)
    if args.xray:
        job.run('export RAY_USE_XRAY=1')
    job.run('ray stop')

    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    driver = job.tasks[0]
    driver.run(f"ray start --head --redis-port=6379")
    for worker_task in job.tasks[1:]:
        worker_resource = """--resources='{"worker": 1}'"""
        worker_task.run(f"ray start --redis-address={driver.ip}:6379 "
                        f"{worker_resource}")
    driver.run(f'./{script} --role=driver --ip={driver.ip}:6379')
Example #8
def main():
    ncluster.set_backend('local')

    job = ncluster.make_job(num_tasks=2)

    start_time = time.time()
    job.run('sleep 1')
    print(f"waited for {time.time()-start_time} seconds")
Example #9
def main():
    ncluster.set_backend('aws')

    start_time = time.time()
    job = ncluster.make_job(num_tasks=16)
    print(f"waited for startup for {time.time()-start_time} seconds")

    start_time = time.time()
    job.run('sleep 10')
    print(f"waited for exec for {time.time()-start_time} seconds")
Example #10
def launcher():
    # run this test from the root directory of ncluster so rsync picks up .git and requirements.txt
    script_fn = 'tests/integration_test.py'

    import ncluster
    job = ncluster.make_job(**vars(args))
    job.rsync('.')
    job.run('pip install -r requirements.txt')
    task0 = job.tasks[0]

    task0.run(
        f'python {script_fn} --role=worker --name={args.name}-{random_id()} --local_rank=0',
        stream_output=True)
Example #11
def run_launcher():
  import ncluster

  job = ncluster.make_job('tf_adder', num_tasks=2)
  job.upload(__file__)
  sender, receiver = job.tasks
  if ncluster.get_backend() == 'aws':
    # on AWS we are probably running in the DLAMI, so switch into the TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  job.tasks[1].run(f'python tf_adder.py --role=receiver {ip_config}', non_blocking=True)
  job.tasks[0].run(f'python tf_adder.py --role=sender {ip_config}')
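A minimal sketch (not in the source) of what the tf_adder sender/receiver roles in Example #11 might look like, assuming TF1's tf.train.Server API and hypothetical port numbers:

def run_worker_sketch():
    import tensorflow as tf

    cluster = tf.train.ClusterSpec({
        'receiver': [f'{args.receiver_ip}:8000'],  # hypothetical ports
        'sender': [f'{args.sender_ip}:8001'],
    })
    server = tf.train.Server(cluster, job_name=args.role, task_index=0)

    if args.role == 'receiver':
        server.join()  # host the variable and serve the sender's requests
    else:
        with tf.device('/job:receiver/task:0'):
            acc = tf.Variable(tf.zeros(10))  # lives on the receiver
        add_op = acc.assign_add(tf.ones(10))
        with tf.Session(server.target) as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(add_op)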
Example #12
def launcher():
    job = ncluster.make_job(**vars(args))
    job.run('sudo apt install -y iperf3 nload')
    tasks = job.tasks

    for i in range(args.num_procs):
        ip = tasks[0].ip
        port = 6006+i
        tag = f"s{i}"
        tasks[0].switch_window(i)
        tasks[0].run(f'sudo iperf3 -s -p {port}', non_blocking=True)
        tasks[1].switch_window(i)
        tasks[1].run(f'sudo iperf3 -T {tag} -c {ip} -P {args.flows_per_proc} -i 1 -t {args.duration_sec} -V -p {port}',
                     non_blocking=True)
Example #13
def main():
    supported_regions = ['us-west-2', 'us-east-1', 'us-east-2']
    assert ncluster.get_region(
    ) in supported_regions, f"required AMI {IMAGE_NAME} has only been made available in regions {supported_regions}, but your current region is {ncluster.get_region()} (set $AWS_DEFAULT_REGION)"
    assert args.machines in schedules, f"{args.machines} not supported, only support {schedules.keys()}"

    os.environ['NCLUSTER_AWS_FAST_ROOTDISK'] = '1'  # use io2 disk on AWS
    job = ncluster.make_job(name=args.name,
                            run_name=f"{args.name}-{args.machines}",
                            num_tasks=args.machines,
                            image_name=IMAGE_NAME,
                            instance_type=INSTANCE_TYPE)

    job.rsync('.')
    #  job.upload('setup.sh')
    #  job.upload('worker_requirements.txt')  # todo(y): replace with rsync
    job.run('bash setup.sh')
    #  job.upload('training')
    job.run(f'source activate pytorch_source')

    nccl_params = get_nccl_params(args.machines, NUM_GPUS)

    # Training script args
    default_params = [
        '~/data/imagenet',
        '--fp16',
        '--logdir',
        job.logdir,
        '--name',
        args.name,
        '--distributed',
        '--init-bn0',
        '--no-bn-wd',
    ]

    params = ['--phases', schedules[args.machines]]
    training_params = default_params + params
    training_params = ' '.join(map(format_params, training_params))

    # TODO: simplify args processing, or give link to actual commands run
    for i, task in enumerate(job.tasks):
        dist_params = f'--nproc_per_node=8 --nnodes={args.machines} --node_rank={i} --master_addr={job.tasks[0].ip} --master_port={6006}'
        cmd = f'{nccl_params} python -m torch.distributed.launch {dist_params} training/train_imagenet_nv.py {training_params}'
        task.run(
            f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
        task.run(cmd, non_blocking=True)

    print(f"Logging to {job.logdir}")
Example #14
def main():
    assert args.machines in schedules, f"{args.machines} not supported, only support {schedules.keys()}"
    # since the conda env name is configurable, modify the install script
    # to run inside that conda env
    install_script = open(INSTALL_SCRIPT_FN).read()
    install_script = f'source activate {ENV_NAME}\n' + install_script

    os.environ['NCLUSTER_AWS_FAST_ROOTDISK'] = '1'
    job = ncluster.make_job(name=args.name,
                            run_name=args.name,
                            num_tasks=args.machines,
                            image_name=IMAGE_NAME,
                            instance_type=INSTANCE_TYPE,
                            install_script=install_script,
                            preemptible=args.preemptible)
    job.upload('training')
    job.run(f'source activate {ENV_NAME}')

    world_size = NUM_GPUS * args.machines
    nccl_args = launch_utils_lib.get_nccl_args(args.machines, NUM_GPUS)

    # Training script args
    default_params = [
        '~/data/imagenet',
        '--fp16',
        '--logdir',
        job.logdir,
        '--distributed',
        '--init-bn0',
        '--no-bn-wd',
    ]

    params = ['--phases', schedules[args.machines]]

    training_args = default_params + params
    training_args = ' '.join(map(launch_utils_lib.format_args, training_args))

    # TODO: simplify args processing
    # Run tasks
    task_cmds = []
    for i, task in enumerate(job.tasks):
        dist_args = f'--nproc_per_node=8 --nnodes={args.machines} --node_rank={i} --master_addr={job.tasks[0].ip} --master_port={6006}'
        cmd = f'{nccl_args} python -m torch.distributed.launch {dist_args} training/train_imagenet_nv.py {training_args}'
        task.run(
            f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
        task.run(cmd, non_blocking=True)

    print(f"Logging to {job.logdir}")
Example #15
def main():
    # job launches are asynchronous, so multiple jobs can be spun up in parallel
    job = ncluster.make_job('ray', num_tasks=2, install_script=INSTALL_SCRIPT)
    job.join()

    head_task = job.tasks[0]
    head_task.run(f"ray start --head --redis-port={DEFAULT_PORT}")

    slave_task = job.tasks[1]
    slave_task.run("ray start --redis-address {head_task.ip}:{DEFAULT_PORT}")
    script_name = os.path.basename(BENCHMARK_URL)
    slave_task.run("rm -f " + script_name)
    slave_task.run("wget " + BENCHMARK_URL)
    slave_task.run("python " + script_name)

    print("To see results:")
    print(slave_task.connect_instructions)
Example #16
def run_launcher():
  import ncluster

  job = ncluster.make_job('tf_adder_tb', num_tasks=2)
  job.upload(__file__)
  this_file = os.path.basename(__file__)

  sender, receiver = job.tasks
  if ncluster.get_backend() == 'aws':
  # on AWS we are probably running in the DLAMI, so switch into the TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  job.tasks[1].run(f'python {this_file} --role=receiver {ip_config}', non_blocking=True)
  job.tasks[0].run(f'python {this_file} --role=sender --logdir={job.logdir} {ip_config}')
  job.tasks[0].run(f'tensorboard --logdir={job.logdir}/..', non_blocking=True)
  print(f"Benchmark done, tensorboard at http://{job.tasks[0].public_ip}:6006")
Example #17
def run_launcher():
  import ncluster
  if args.aws:
    ncluster.set_backend('aws')

  job = ncluster.make_job('tf_adder', num_tasks=2, image_name=args.image)
  job.upload(__file__)
  
  sender, receiver = job.tasks
  if ncluster.get_backend() == 'aws':
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    job.run('source activate tensorflow_p36')

  ip_config = f'--sender-ip={sender.ip} --receiver-ip={receiver.ip}'
  receiver.run(f'python tf_adder.py --role=receiver {ip_config}',
               non_blocking=True)
  sender.run(f'python tf_adder.py --role=sender {ip_config} --iters={args.iters}')
Example #18
def main():
    job = ncluster.make_job(name=args.name,
                            instance_type=args.instance_type,
                            num_tasks=2,
                            disk_size=1000,
                            image_name=args.image_name)
    public_keys = {}
    for task in job.tasks:
        key_fn = '~/.ssh/id_rsa'
        task.run(f"yes | ssh-keygen -t rsa -f {key_fn} -N ''")
        public_keys[task] = task.read(key_fn + '.pub')

    for task1 in job.tasks:
        task1.run(
            """sudo bash -c 'echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config'"""
        )
        for task2 in job.tasks:
            # allow passwordless SSH from task1 to task2
            task2.run(f'echo "{public_keys[task1]}" >> ~/.ssh/authorized_keys')
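A small follow-up check (not in the source) that the key exchange in Example #18 worked: with BatchMode=yes, ssh fails instead of prompting if passwordless login is not set up.

# e.g., appended at the end of main() above:
job.tasks[0].run(f'ssh -o BatchMode=yes {job.tasks[1].ip} hostname')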
Example #19
def launcher():
    job = ncluster.make_job(name=args.name,
                            num_tasks=args.machines,
                            image_name=args.image_name,
                            instance_type=args.instance_type,
                            spot=not args.nospot)
    print(f"Logging to {job.logdir}")
    tasks = job.tasks

    for i in range(args.num_procs):
        ip = tasks[0].ip
        port = 6006 + i
        tag = f"s{i}"
        tasks[0].switch_window(i)
        tasks[0].run(f'sudo iperf3 -s -p {port}', non_blocking=True)
        tasks[1].switch_window(i)
        tasks[1].run(
            f'sudo iperf3 -T {tag} -c {ip} -P {args.flows_per_proc} -i 1 -t {args.duration_sec} -V -p {port}',
            non_blocking=True)
Example #20
def main():
    supported_regions = ['us-west-2', 'us-east-1', 'us-east-2', 'local']
    assert ncluster.get_region(
    ) in supported_regions, f"required AMI {IMAGE_NAME} has only been made available in regions {supported_regions}, but your current region is {ncluster.get_region()}"
    assert args.machines in schedules, f"{args.machines} not supported, only support {schedules.keys()}"

    job = ncluster.make_job(
        name=args.name,
        run_name=f"{args.name}-{args.machines}",
        num_tasks=args.machines,
        image_name=IMAGE_NAME,
        instance_type=INSTANCE_TYPE,
    )

    job.upload('training')
    if ncluster.get_region() == 'local':
        job.run('conda activate main')
    else:
        # specific to image
        job.run('conda activate fastai')

    nccl_params = get_nccl_params(args.machines, NUM_GPUS)

    # Training script args
    default_params = [
        # '--load', f'/ncluster/models/{args.name}.pth',
        #   '--save', f'/ncluster/models/{args.name}.pth'
    ]

    #   params = ['--phases', schedules[args.machines]]
    params = []
    training_params = default_params + params
    training_params = ' '.join(map(format_params, training_params))
    train_script = 'training/train.py'

    # TODO: simplify args processing, or give link to actual commands run
    for i, task in enumerate(job.tasks):

        dist_params = f'--nproc_per_node={NUM_GPUS} --nnodes={args.machines} --node_rank={i} --master_addr={job.tasks[0].ip} --master_port={6006}'
        cmd = f'{nccl_params} python -m torch.distributed.launch {dist_params} {train_script} {training_params}'
        # task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
        task.run(cmd, non_blocking=True)
Example #21
def main():
    supported_regions = ['us-west-2', 'us-east-1', 'us-east-2', 'local']
    assert ncluster.get_region(
    ) in supported_regions, f"required AMI {IMAGE_NAME} has only been made available in regions {supported_regions}, but your current region is {ncluster.get_region()}"

    job = ncluster.make_job(
        name=args.name,
        run_name=f"{args.name}-{args.machines}",
        num_tasks=args.machines,
        image_name=IMAGE_NAME,
        instance_type=INSTANCE_TYPE,
    )

    job.upload('src')
    job.upload('scripts')
    job.run(f'conda activate midi')
    job.run('cd scripts')

    nccl_params = get_nccl_params(args.machines, NUM_GPUS)

    # Training script args
    default_params = [
        # '--load', f'/ncluster/models/{args.name}.pth',
        '--path',
        '~/data/midi/v10/midi_encode/'
    ]
    params = [
        '--save', f'large_single/lq/1_ep44', '--cache', 'tmp/lq',
        '--batch_size', '8', '--large', '--single_stream', '--epochs', '44',
        '--lr', '.008'
    ]
    training_params = default_params + params
    training_params = ' '.join(map(format_params, training_params))
    train_script = 'run_txl_npenc.py'

    # TODO: simplify args processing, or give link to actual commands run
    for i, task in enumerate(job.tasks):

        dist_params = f'--nproc_per_node={NUM_GPUS} --nnodes={args.machines} --node_rank={i} --master_addr={job.tasks[0].ip} --master_port={6006}'
        cmd = f'{nccl_params} python -m torch.distributed.launch {dist_params} {train_script} {training_params}'
        # task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
        task.run(cmd, non_blocking=True)
Example #22
def run_launcher():
    import ncluster

    if args.aws:
        ncluster.set_backend('aws')

    if args.nightly:
        # running locally MacOS
        if 'Darwin' in util.ossystem('uname') and not args.aws:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-macosx_10_6_intel.whl'
        else:
            install_script = 'pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.5.2-cp36-cp36m-manylinux1_x86_64.whl'
    else:
        install_script = 'pip install ray'

    job = ncluster.make_job(name=args.name,
                            install_script=install_script,
                            image_name=args.image,
                            num_tasks=args.num_workers + args.num_ps)
    if not ncluster.running_locally():
        job._run_raw('killall python', ignore_errors=True)

    job.upload(__file__)
    job.upload('util.py')
    if args.xray:
        job.run('export RAY_USE_XRAY=1')
    job.run('ray stop')

    head = job.tasks[0]

    # https://ray.readthedocs.io/en/latest/resources.html?highlight=resources
    worker_resource = """--resources='{"worker": 1}'"""
    head.run(f"ray start --head {worker_resource} --redis-port=6379")

    for task in job.tasks[1:]:
        task.run(f"ray start --redis-address={head.ip}:6379 {worker_resource}")

    head.run(
        f'python {__file__} --role=driver --ip={head.ip}:6379 --size-mb={args.size_mb} --iters={args.iters} --num-workers={args.num_workers} --num-ps={args.num_ps}'
    )

    print(head.read('out'))
Example #23
def run_launcher():
    import ncluster
    if args.aws:
        ncluster.set_backend('aws')

    job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image)
    job.upload(__file__)
    job.upload('util.py')

    # kill any leftover python processes, needed when tmux session reuse is on
    if not ncluster.running_locally():
        job._run_raw('killall python', ignore_errors=True)

    if ncluster.get_backend() == 'aws':
        # on AWS probably running in conda DLAMI, switch into TF-enabled env
        job.run('source activate tensorflow_p36')

    hosts = [task.public_ip for task in job.tasks]
    host_str = ','.join(hosts)
    os.system(
        f'mpirun -np 2 --host {host_str} python {__file__} --role=worker')
    print(job.tasks[0].read('/tmp/out'))
Example #24
def launcher():

    job = ncluster.make_job('worker',
                            instance_type=args.instance_type,
                            num_tasks=2)
    job.join()

    print("Job ready for connection, to connect to most recent task:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    print()

    job.upload(__file__)

    job.run('source activate pytorch_p36')
    script_name = os.path.basename(__file__)
    job.tasks[0].run_async(
        f'python {script_name} --internal-role=worker --rank=0 --size=2 --master-addr={job.tasks[0].ip}'
    )
    job.tasks[1].run_async(
        f'python {script_name} --internal-role=worker --rank=1 --size=2 --master-addr={job.tasks[0].ip}'
    )
Example #25
def main():
  assert args.machines in schedules, f"{args.machines} not supported, only support {schedules.keys()}"

  os.environ['NCLUSTER_AWS_FAST_ROOTDISK'] = '1'  # use io2 disk on AWS
  job = ncluster.make_job(name=args.name,
                          run_name=f"{args.name}-{args.machines}",
                          num_tasks=args.machines,
                          image_name=IMAGE_NAME,
                          instance_type=INSTANCE_TYPE,
                          install_script=open('setup.sh').read())
  job.upload('training')
  job.run(f'source activate pytorch_source')

  nccl_params = get_nccl_params(args.machines, NUM_GPUS)

  # Training script args
  default_params = [
    '~/data/imagenet',
    '--fp16',
    '--logdir', job.logdir,
    '--distributed',
    '--init-bn0',
    '--no-bn-wd',
  ]

  params = ['--phases', schedules[args.machines]]
  training_params = default_params + params
  training_params = ' '.join(map(format_params, training_params))

  # TODO: simplify args processing, or give link to actual commands run
  for i, task in enumerate(job.tasks):
    dist_params = f'--nproc_per_node=8 --nnodes={args.machines} --node_rank={i} --master_addr={job.tasks[0].ip} --master_port={6006}'
    cmd = f'{nccl_params} python -m torch.distributed.launch {dist_params} training/train_imagenet_nv.py {training_params}'
    task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
    task.run(cmd, non_blocking=True)

  print(f"Logging to {job.logdir}")
Example #26
def run_launcher():
  import ncluster

  job = ncluster.make_job(args.name, num_tasks=2, image_name=args.image,
                          instance_type=args.instance_type)
  job.upload(__file__)
  job.upload('util.py')

  # kill any leftover python processes, needed when tmux session reuse is on
  if not ncluster.running_locally():
    job._run_raw('killall python', ignore_errors=True)

  if ncluster.get_backend() == 'aws':
    # on AWS probably running in conda DLAMI, switch into TF-enabled env
    # TODO(y) switch to PyTorch enabled
    job.run('source activate tensorflow_p36')

  # TODO(y): this should be private ip
  hosts = [task.ip for task in job.tasks]
  host_str = ','.join(hosts)
  os.system(f'/usr/local/mpi/bin/mpirun -np 2 --host {host_str} python {__file__} --role=worker')
  print(job.tasks[0].read('/tmp/out'))
Example #27
def launcher():
    # todo: flag for skip setup

    import ncluster
    job = ncluster.make_job(**vars(args))
    print(f"Logging to {job.logdir}")

    nccl_params = _get_nccl_params()

    # pass through launcher params to worker script
    assert '--role=launcher' in sys.argv, "how did you get here?"
    worker_params = sys.argv[1:]
    worker_params.remove('--role=launcher')
    worker_params.extend([f'--logdir {job.logdir}'])

    worker_params = ' '.join(worker_params)  # pass through all args

    dist_params0 = (f'--nproc_per_node={args.nproc_per_node} '
                    f'--nnodes={args.num_tasks} '
                    f'--master_addr={job.tasks[0].ip} '
                    f'--master_port={6016} ')

    job.rsync('.')
    worker_script_fn = os.path.basename(__file__)  # remote location

    job.run(f'killall -9 python || echo skipping && source activate {args.conda_env}')

    for i, task in enumerate(job.tasks):
        dist_params = dist_params0 + f'--node_rank={i} '
        cmd = (f'{nccl_params} python -m torch.distributed.launch {dist_params} {worker_script_fn} '
               f'{worker_params} ')
        task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
        task.run(cmd, non_blocking=True)

    job.tasks[0].join()
    print(job.tasks[0].output)
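A hypothetical sketch of the worker entry point for the launcher above: torch.distributed.launch adds --local_rank and sets MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE in the environment, so env:// rendezvous works; the NCCL backend and the use of args.logdir are assumptions.

def run_worker_sketch():
    import torch
    import torch.distributed as dist

    torch.cuda.set_device(args.local_rank)  # --local_rank is injected by torch.distributed.launch
    dist.init_process_group(backend='nccl', init_method='env://')
    # ... build the model, wrap it in torch.nn.parallel.DistributedDataParallel,
    # train, and write logs under args.logdir (passed through by the launcher) ...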
Example #28
def main():
    start_time = time.time()
    # 1. Create infrastructure
    supported_regions = [
        'cn-huhehaote', 'cn-zhangjiakou', 'cn-shanghai', 'cn-hangzhou',
        'cn-beijing'
    ]
    assert ncluster.get_region(
    ) in supported_regions, f"required AMI {IMAGE_NAME} has only been made available in regions {supported_regions}, but your current region is {ncluster.get_region()} (set $ALIYUN_DEFAULT_REGION)"

    ncluster_globals.set_should_disable_nas(True)

    job = ncluster.make_job(name=args.name,
                            run_name=f"{args.name}-{args.machines}",
                            num_tasks=args.machines,
                            disable_nas=True,
                            spot=True,
                            instance_type=INSTANCE_TYPE)

    # 2. Upload perseus bert code.
    job.run('yum install -y unzip')
    job.upload('perseus-bert')
    job.run('conda activate tensorflow_1.14_cu10.0_py36')

    # 3. Download pretrain model and dataset.
    BERT_CHINESE_BASE_DIR = '/root/chinese_L-12_H-768_A-12'
    DATA_DIR = '/root/toutiao_data'
    job.run(
        'wget -c -t 10 https://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/chinese_L-12_H-768_A-12.zip  && unzip chinese_L-12_H-768_A-12.zip'
    )
    job.run(
        'wget -c -t 10 https://public-ai-datasets.oss-cn-huhehaote.aliyuncs.com/toutiao_data.tgz && tar xvf toutiao_data.tgz'
    )

    # 4. Run the training job.
    job.run('cd perseus-bert')
    hosts = [task.ip + f':{NUM_GPUS}' for task in job.tasks]
    host_str = ','.join(hosts)

    mpi_cmd = [
        'mpirun --allow-run-as-root', f'-np {args.machines * NUM_GPUS}',
        f'--npernode {NUM_GPUS}', f'--host {host_str}', '--bind-to none',
        '-x NCCL_DEBUG=INFO', '-x PATH', '-x PYTHONPATH', '-x LD_LIBRARY_PATH',
        '-x XLA_FLAGS'
    ]

    bert_classifier_cmd = [
        'python run_classifier.py', '--task_name=news', '--do_train=true',
        '--do_eval=true', f'--data_dir={DATA_DIR}',
        f'--vocab_file={BERT_CHINESE_BASE_DIR}/vocab.txt',
        f'--bert_config_file={BERT_CHINESE_BASE_DIR}/bert_config.json',
        f'--init_checkpoint={BERT_CHINESE_BASE_DIR}/bert_model.ckpt',
        '--max_seq_length=128', '--train_batch_size=48',
        '--learning_rate=8e-5', '--num_train_epochs=3.0',
        '--warmup_proportion=0.8', '--output_dir=/root/output_dir',
        '--use_amp=true', '--use_perseus=true', '--use_xla=true'
    ]

    cmd = mpi_cmd + bert_classifier_cmd
    cmd = " ".join(cmd)
    job.tasks[0].run(f'echo {cmd} > {job.logdir}/task-cmd')
    job.tasks[0].run(cmd, non_blocking=True)
    print(f"Logging to {job.logdir}")

    elapsed_time = time.time() - start_time
    print(f'training deploy time is: {elapsed_time} s.')

    job.stop()
Example #29
def main():
    config = AttrDefault(lambda: None, config_defaults)

    assert args.config in globals(), f"unknown config {args.config}"
    config.update(eval(args.config))

    job = ncluster.make_job(name=args.name,
                            run_name=f"{args.name}",
                            num_tasks=config.machines,
                            image_name=config.image_name,
                            instance_type=config.instance_type,
                            spot=not args.nospot,
                            skip_setup=args.skip_setup)

    job.rsync('.')
    job.run(f'killall python || echo failed && '  # kill previous run
            f'source activate {config.conda_env} && ' +
            f'pip install -r requirements.txt')

    instance_info = ncluster.aws_backend.INSTANCE_INFO[config.instance_type]
    num_gpus_per_machine = instance_info['gpus']

    total_gpus = num_gpus_per_machine * config.machines
    global_batch_size = config.batch_size * total_gpus

    # linear LR scaling (https://arxiv.org/abs/1706.02677)
    lr = config.base_lr * (global_batch_size / BASE_LR_BATCHSIZE)

    # TODO(y): change dataset location to /data/transformer-xl-data after
    # image is cut
    # worker parameters with training setup
    worker_params = {
        'seed': 1111,
        'data': 'data/wikitext-103',
        'dataset': 'wt103',
        'adaptive': True,
        'log_interval': 100,
        'eval_interval': 1000,
        'logdir': job.logdir,
        'lr': lr,
        'fp16': True,
        'dynamic_loss_scale': True,
        'batch_size': config.batch_size,
    }

    if config.architecture == 'wt103_large':
        worker_params.update(wt103_large)
    elif config.architecture == 'wt103_base':
        worker_params.update(wt103_base)
    else:
        assert False, f"Uknown architecture {config.architecture}"

    nccl_params = f'NCCL_DEBUG=VERSION NCCL_MIN_NRINGS={config.num_rings} '

    for i, task in enumerate(job.tasks):
        dist_params = \
            f'--nproc_per_node={num_gpus_per_machine} ' \
            f'--nnodes={config.machines} --node_rank={i} ' \
            f'--master_addr={job.tasks[0].ip} --master_port={6016} '
        cmd = f'{nccl_params} python -m torch.distributed.launch {dist_params} ' \
            f'train.py {util.dict_to_args(worker_params)}'
        task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
        task.run(cmd, non_blocking=True)

    print(f"Logging to {job.logdir}")

    if args.launch_tensorboard:
        task = ncluster.make_task('tensorboard',
                                  instance_type='r5.large',
                                  image_name=args.image_name)

        task.run('source activate tensorflow_p36')
        task.run(f'tensorboard --logdir={ncluster.get_logdir_root()} --port=6006',
                 non_blocking=True)
        print(f'TensorBoard at http://{task.public_ip}:6006')
Example #30
def main():
    if args.image_name == 'pytorch.imagenet.source.v7':
        supported_regions = ['us-west-2', 'us-east-1', 'us-east-2']
        assert ncluster.get_region(
        ) in supported_regions, f"required AMI {args.image_name} has only been made available in regions {supported_regions}, but your current region is {ncluster.get_region()} (set $AWS_DEFAULT_REGION)"
    assert args.machines in schedules, f"{args.machines} not supported, only support {schedules.keys()}"

    if args.mount_imagenet:
        datadir = '/data/imagenet'
    else:
        datadir = '~/data/imagenet'
        os.environ['NCLUSTER_AWS_FAST_ROOTDISK'] = '1'  # use io2 disk on AWS

    if args.num_tasks >= 16:
        assert args.simple_ring_setup, "must use --simple_ring_setup, otherwise NCCL_RINGS env var exceeds cmd-line limit"

    job = ncluster.make_job(
        name=args.name,
        run_name=args.run_name,
        num_tasks=args.machines,
        image_name=args.image_name,
        instance_type=args.instance_type,
        disk_size=500,
        spot=args.spot,
        skip_setup=args.skip_setup,
    )

    task0 = job.tasks[0]
    _logdir = task0.logdir  # workaround for race condition in creating logdir

    config = {}
    for key in os.environ:
        if re.match(r"^NCLUSTER", key):
            config['env_' + key] = os.getenv(key)
    config.update(vars(args))

    CUDA_HOME = f'/usr/local/cuda'
    EFA_HOME = f'/opt/amazon/efa'
    MPI_HOME = EFA_HOME
    NPROC_PER_NODE = args.nproc_per_node
    assert NPROC_PER_NODE <= task0.num_gpus, f"requested {NPROC_PER_NODE} processes, but only {task0.num_gpus} gpus present"
    NUM_GPUS = NPROC_PER_NODE * args.num_tasks

    config['NUM_GPUS'] = NUM_GPUS

    config['internal_id'] = u.get_account_number()
    config['internal_alias'] = u.get_account_name()
    config['region'] = u.get_region()
    config['zone'] = u.get_zone()
    config['launch_user'] = os.environ.get('USER', '')
    config['cmd'] = ' '.join(sys.argv)
    config['launcher_conda'] = util.ossystem(
        'echo ${CONDA_PREFIX:-"$(dirname $(which conda))/../"}')
    config['launcher_cmd'] = 'python ' + ' '.join(sys.argv)
    config['logdir'] = job.logdir

    pickled_config = util.text_pickle(config)
    if args.log_all_workers:
        job.write(args.internal_config_fn, pickled_config)
    else:
        job.tasks[0].write(args.internal_config_fn, pickled_config)

    if args.mount_imagenet:
        assert u.get_zone(), "Must specify zone when reusing EBS volumes"
        mount_imagenet(job)

    if not args.skip_setup:
        job.run(
            'rm -f *.py')  # remove files baked into imagenet18 release image
        job.run('conda init')  # missing .bashrc
        job.run(
            f'{{ source activate {args.conda_env} && bash setup.sh && pip install -U protobuf ; }}  && {{ killall python || echo hi ; }} '
        )
        if args.pytorch_nightly:
            job.run(
                'conda install -y -c pytorch pytorch-nightly && bash setup.sh')
    else:
        job.run([
            f'source ~/.bashrc && conda activate {args.conda_env}',
            f'killall python || echo hi'
        ])

    job.rsync('.')

    if args.efa:
        assert 'efa' in args.image_name  # make sure we use EFA-enabled image
        hosts_str, hosts_file_str = util.setup_mpi(
            job, skip_ssh_setup=args.skip_setup)
        if not args.skip_setup:
            task0.write(HOSTS_SLOTS_FN, hosts_file_str)

    env_params = get_nccl_params(args.machines, args.nproc_per_node)
    if args.cuda_debug:
        env_params += 'CUDA_LAUNCH_BLOCKING=1 NCCL_DEBUG=INFO '
    else:
        env_params += 'NCCL_DEBUG=INFO '

    env_params += " OMP_NUM_THREADS=1 "
    if args.pytorch_use_spawn:
        assert args.pytorch_nightly
        env_params += " PYTORCH_USE_SPAWN=1 "
    if 'WANDB_API_KEY' in os.environ:
        env_params += f" WANDB_API_KEY={os.environ.get('WANDB_API_KEY')} "

    # Training script args
    default_params = [
        datadir,
        '--fp16',
        '--logdir',
        job.logdir,
        '--name',
        f'{args.run_name}-{util.random_id()}',
        '--distributed',
        '--init-bn0',
        '--no-bn-wd',
        '--log_all_workers',
        args.log_all_workers,
    ]

    params = ['--phases', util.text_pickle(schedules[args.machines])]
    training_params = default_params + params
    training_params = ' '.join(map(format_params, training_params))

    if not args.efa:
        # TODO: simplify args processing, or give link to actual commands run
        for i, task in enumerate(job.tasks):
            dist_params = f'--nproc_per_node={args.nproc_per_node} --nnodes={args.machines} --node_rank={i} --master_addr={job.tasks[0].ip} --master_port={6006}'
            cmd = f'{env_params} python -m torch.distributed.launch {dist_params} training/train_imagenet_nv.py {training_params}'
            task.run(
                f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
            task.run(cmd, non_blocking=True)
    else:
        FI_PROVIDER = 'efa'
        if args.pseudo_efa:
            FI_PROVIDER = 'sockets'

        local_env = util.format_env_export(
            LOCAL_RANK='$OMPI_COMM_WORLD_LOCAL_RANK',
            RANK='$OMPI_COMM_WORLD_RANK',
            WORLD_SIZE='$OMPI_COMM_WORLD_SIZE',
            MASTER_ADDR=task0.ip,
            MASTER_PORT=6016)

        mpi_env = util.format_env_x(
            FI_PROVIDER=FI_PROVIDER,  # enables running nccl-tests using the EFA provider
            FI_OFI_RXR_RX_COPY_UNEXP=1,  # disables bounce buffers for unexpected messages
            FI_OFI_RXR_RX_COPY_OOO=1,  # disables bounce buffers for out-of-order messages
            FI_EFA_MR_CACHE_ENABLE=1,  # enables memory region caching
            FI_OFI_RXR_INLINE_MR_ENABLE=1,  # enables inline memory registration of data buffers
            NCCL_TREE_THRESHOLD=10 * 4294967296,  # force tree for everything under 40GB
            LD_LIBRARY_PATH=f'{CUDA_HOME}/lib:{CUDA_HOME}/lib64:{EFA_HOME}/lib64',
            NCCL_DEBUG='INFO',
            OMP_NUM_THREADS=1,
            WANDB_API_KEY=os.environ.get('WANDB_API_KEY', ''),
            PYTORCH_USE_SPAWN=args.pytorch_use_spawn,
            NO_WANDB=args.pytorch_use_spawn,
        )
        if args.no_op:
            worker_script_fn = 'training/env_test.py'
        else:
            worker_script_fn = 'training/train_imagenet_nv.py'

        local_cmd = [
            f"{local_env} && source ~/.bashrc && conda activate {args.conda_env} && ",
            f'python {worker_script_fn} {training_params} --local_rank=$OMPI_COMM_WORLD_LOCAL_RANK'
        ]
        local_cmd = ' '.join(local_cmd)

        cmd = [
            f"{MPI_HOME}/bin/mpirun -n {NUM_GPUS} -N {NPROC_PER_NODE} --hostfile {HOSTS_SLOTS_FN} ",
            f'{mpi_env} ',
            f'--mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 ',
            f'--bind-to none ', f"bash -c '{local_cmd}'"
        ]
        cmd = ' '.join(cmd)

        task0.run(cmd, non_blocking=True)

    print(f"Logging to {job.logdir}")