Example #1
def main():
    if args.role == 'launcher':
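        # launcher role: provision an EC2 instance, copy this script onto it, and re-run it there with --role=worker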
        assert "AWS_DEFAULT_REGION" in os.environ
        assert os.environ.get("AWS_DEFAULT_REGION") in ami_dict
        AMI = ami_dict[os.environ.get("AWS_DEFAULT_REGION")]

        import aws
        job = aws.simple_job(args.run,
                             num_tasks=1,
                             instance_type=args.instance_type,
                             install_script=INSTALL_SCRIPT,
                             ami=AMI,
                             linux_type=LINUX_TYPE)
        task = job.tasks[0]
        job.initialize()
        job.wait_until_ready()

        task.run('cd ~')
        task.upload(__file__)  # copies current script onto machine
        task.run("python %s --role=worker" % (__file__, )
                 )  # runs script and streams output locally to file in /temp
        print("To connect:")
        print(task.connect_instructions)

    elif args.role == 'worker':
        import sys, time
        print('Python version is ' + str(sys.version))
Example #2
def main():
    import aws
    import os

    # job launches are asynchronous, can spin up multiple jobs in parallel
    job = aws.simple_job('ray', num_tasks=2, install_script=INSTALL_SCRIPT)

    # block until things launch to run commands
    job.wait_until_ready()

    head_task = job.tasks[0]
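    # task 0 serves as the Ray head node; task 1 connects to it and runs the benchmark script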
    head_task.run("ray start --head --redis-port=%d" % (DEFAULT_PORT, ))

    slave_task = job.tasks[1]
    slave_task.run("ray start --redis-address %s:%d" %
                   (head_task.ip, DEFAULT_PORT))
    script_name = os.path.basename(BENCHMARK_URL)
    slave_task.run("rm -f " + script_name)
    slave_task.run("wget " + BENCHMARK_URL)
    slave_task.run("python " + script_name)

    print("To see results:")
    print("ssh -i %s -o StrictHostKeyChecking=no ubuntu@%s" %
          (os.environ['SSH_KEY_PATH'], slave_task.public_ip))
    print("tmux a -t tmux")
Example #3
def main():
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import aws

    # job launches are asynchronous, can spin up multiple jobs in parallel
    if args.placement:
        placement_name = args.run
    else:
        placement_name = ''
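    # a placement group keeps the instances physically close together for lower-latency networking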
    print("Launching job")
    job = aws.simple_job(args.run,
                         num_tasks=3,
                         instance_type='c5.18xlarge',
                         install_script=INSTALL_SCRIPT,
                         placement_group=placement_name)

    # block until things launch to run commands
    job.wait_until_ready()

    # start ray on head node
    head_task = job.tasks[0]
    head_task.run('ray stop   || echo "ray not started, ignoring"')
    head_task.run("ray start --head --redis-port=%d --num-gpus=0 \
                           --num-cpus=10000 --num-workers=10" %
                  (DEFAULT_PORT, ))

    # start ray on slave node1
    slave_task1 = job.tasks[1]
    slave_task1.run('ray stop   || echo "ray not started, ignoring"')
    slave_task1.run(
        "ray start --redis-address %s:%d --num-gpus=4 --num-cpus=4 \
                            --num-workers=0" % (head_task.ip, DEFAULT_PORT))

    # start ray on slave node2
    slave_task2 = job.tasks[2]
    slave_task2.run('ray stop   || echo "ray not started, ignoring"')
    slave_task2.run(
        "ray start --redis-address %s:%d --num-gpus=4 --num-cpus=4 \
                            --num-workers=0" % (head_task.ip, DEFAULT_PORT))

    # upload benchmark script and execute it on head node
    head_task.run("rm -f " + SCRIPT_NAME)
    head_task.upload(SCRIPT_NAME)
    head_task.run("python %s \
                  --num-workers=1 \
                  --num-parameter-servers=1 \
                  --dim=25000 \
                  --redis-address=%s:%d" %
                  (SCRIPT_NAME, head_task.ip, DEFAULT_PORT))

    print("To see results:")
    print(head_task.connect_instructions)
Example #4
def main():
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import aws

    # make sure we are using the correct aws module (it moved one level up)
    import inspect
    current_location = os.path.dirname(os.path.abspath(__file__))
    aws_location = os.path.dirname(os.path.abspath(inspect.getsourcefile(aws)))
    assert len(aws_location) < len(
        current_location
    ), "Using wrong aws module, delete aws.py in current directory."

    # job launches are asynchronous, can spin up multiple jobs in parallel
    if args.placement:
        placement_name = args.run
    else:
        placement_name = ''
    print("Launching job")
    job = aws.simple_job(args.run,
                         num_tasks=2,
                         instance_type='c5.18xlarge',
                         install_script=INSTALL_SCRIPT,
                         placement_group=placement_name)

    # block until things launch to run commands
    job.wait_until_ready()

    # start ray on head node
    head_task = job.tasks[0]
    head_task.run('ray stop   || echo "ray not started, ignoring"')
    head_task.run("ray start --head --redis-port=%d --num-gpus=0 \
                           --num-cpus=10000 --num-workers=10" %
                  (DEFAULT_PORT, ))

    # start ray on slave node
    slave_task = job.tasks[1]
    slave_task.run('ray stop   || echo "ray not started, ignoring"')
    slave_task.run("ray start --redis-address %s:%d --num-gpus=4 --num-cpus=4 \
                            --num-workers=0" % (head_task.ip, DEFAULT_PORT))

    # upload benchmark script and execute it on slave node
    slave_task.run("rm -f " + SCRIPT_NAME)
    slave_task.upload(SCRIPT_NAME)
    slave_task.run("python %s \
                    --redis-address=%s:%d --num-workers=10 \
                    --num-parameter-servers=4 \
                    --data-size=100000000" %
                   (SCRIPT_NAME, head_task.ip, DEFAULT_PORT))

    print("To see results:")
    print(slave_task.connect_instructions)
Example #5
def main():
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import aws

    assert 'AWS_DEFAULT_REGION' in os.environ
    assert os.environ.get("AWS_DEFAULT_REGION") in ami_dict
    assert os.environ.get(
        "AWS_DEFAULT_REGION"
    ) == 'us-west-2', "Currently EFS is hardwired to us-west-2 region"

    AMI = 'ami-0def3275'  # Ubuntu 16.04 in us-west-2
    worker_job = aws.simple_job(args.name + '-worker',
                                num_tasks=1,
                                instance_type=args.worker_instance,
                                install_script=INSTALL_SCRIPT,
                                ami=AMI)
    tb_job = aws.simple_job(args.name + '-tb',
                            num_tasks=1,
                            instance_type=args.tb_instance,
                            install_script=INSTALL_SCRIPT,
                            ami=AMI)

    # block until things launch to run commands
    worker_job.wait_until_ready()
    tb_job.wait_until_ready()

    worker = worker_job.tasks[0]
    tb = tb_job.tasks[0]

    logdir = '/efs/runs/' + args.name
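    # both instances mount the same EFS share, so TensorBoard can read the event files the worker writes under /efs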
    worker.run('python cifar10_main.py --model_dir=' + logdir)
    tb.run('tensorboard --logdir=' + logdir)

    print("Connect to worker:")
    print(worker.connect_instructions)
    print("See tensorboard at http://%s:%d" % (tb.public_ip, 6006))
Example #6
def main():
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import aws
    import time

    job = aws.simple_job(args.name,
                         num_tasks=args.num_tasks,
                         instance_type=args.instance_type,
                         install_script=INSTALL_SCRIPT,
                         ami=AMI,
                         linux_type='debian')

    # block until things launch to run commands
    job.wait_until_ready()

    for task_num, task in enumerate(job.tasks):
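        # ping each worker's internal DNS name to confirm the cluster hostnames resolve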
        for i in range(args.num_tasks):
            task.run("ping worker%d.tf.local -c 1" % (i, ))
            time.sleep(1)
        print("To connect to task %d: " % (i, ))
        print(task.connect_instructions)
Example #7
def main():
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import aws

    # job launches are asynchronous, can spin up multiple jobs in parallel
    if args.placement:
        placement_name = args.run
    else:
        placement_name = ''
    job = aws.simple_job(args.run,
                         num_tasks=2,
                         instance_type='c5.18xlarge',
                         install_script=INSTALL_SCRIPT,
                         placement_group=placement_name,
                         ami=AMI,
                         linux_type='debian')

    # block until things launch to run commands
    job.wait_until_ready()

    head_task = job.tasks[0]
    head_task.run(
        "ray start --head --redis-port=%d --num-gpus=0 --num-cpus=10000 --num-workers=10"
        % (DEFAULT_PORT, ))

    slave_task = job.tasks[1]
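    # the second task fetches the benchmark script, joins the head node's Redis, and runs the multi-node benchmark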
    script_name = os.path.basename(BENCHMARK_URL)
    slave_task.run("rm -f " + script_name)
    slave_task.run("wget " + BENCHMARK_URL)

    slave_task.run(
        "ray start --redis-address %s:%d --num-gpus=4 --num-cpus=4 --num-workers=0"
        % (head_task.ip, DEFAULT_PORT))
    slave_task.run(
        "python async_sgd_benchmark_multinode.py --redis-address=%s:%d --num-workers=10 --num-parameter-servers=4 --data-size=100000000"
        % (head_task.ip, DEFAULT_PORT))

    print("To see results:")
    print(slave_task.connect_instructions)
Example #8
def main():
  if args.role == 'launcher':
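    # launcher role: start an instance, upload this script, and re-run it remotely as a worker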
    add_parent_path()
    import aws
    job = aws.simple_job('simple', num_tasks=1, install_script=INSTALL_SCRIPT)
    task = job.tasks[0]
    job.wait_until_ready()
    task.upload(__file__)   # copies current script onto machine
    
    task.run("python %s --role=worker" % (__file__,)) # runs script and streams output locally to file in /temp
    print("To connect:")
    print(task.connect_instructions)
  elif args.role == 'worker':
    import sys, time
    print('hello world')
    print('Python version is '+str(sys.version))
    for i in range(10000):
      time.sleep(1)
      print("step %d"%(i,))
    
  else:
    print('Unknown role')
Example #9
def main():
    import aws
    import os

    # job launches are asynchronous, can spin up multiple jobs in parallel
    job = aws.simple_job('ray', num_tasks=2, install_script=INSTALL_SCRIPT)

    # block until things launch to run commands
    job.wait_until_ready()

    head_task = job.tasks[0]
    head_task.run("ray start --head --redis-port=%d" % (DEFAULT_PORT, ))

    slave_task = job.tasks[1]
    slave_task.run("ray start --redis-address %s:%d" %
                   (head_task.ip, DEFAULT_PORT))
    script_name = os.path.basename(BENCHMARK_URL)
    slave_task.run("rm -f " + script_name)
    slave_task.run("wget " + BENCHMARK_URL)
    slave_task.run("python " + script_name)

    print("To see results:")
    print(slave_task.connect_instructions)