def main():
    """Run as the launcher (start one AWS task) or as the worker on that task.

    The launcher looks up the AMI for the current ``AWS_DEFAULT_REGION``,
    starts a single-task job, uploads this script to the task and re-invokes
    it there with ``--role=worker``. The worker prints the Python version.
    """
    role = args.role

    if role == 'launcher':
        # The AMI is chosen per region, so the region must be set and known.
        assert "AWS_DEFAULT_REGION" in os.environ
        region = os.environ.get("AWS_DEFAULT_REGION")
        assert region in ami_dict
        region_ami = ami_dict[region]

        import aws
        job = aws.simple_job(
            args.run,
            num_tasks=1,
            instance_type=args.instance_type,
            install_script=INSTALL_SCRIPT,
            ami=region_ami,
            linux_type=LINUX_TYPE,
        )
        task = job.tasks[0]
        job.initialize()
        job.wait_until_ready()

        task.run('cd ~')

        # Copy this script onto the machine, then start it there as a worker.
        # NOTE(review): __file__ may carry a directory prefix; confirm that
        # upload() keeps the same relative path on the remote side.
        task.upload(__file__)
        worker_command = "python %s --role=worker" % (__file__, )
        # Per the original author's note, output is streamed to a local file.
        task.run(worker_command)

        print("To connect:")
        print(task.connect_instructions)
        return

    if role == 'worker':
        import sys
        import time
        print('Python version is ' + str(sys.version))
def main():
    """Start a two-task Ray cluster and run the benchmark on the second task.

    Task 0 becomes the Ray head; task 1 joins it, fetches the script named by
    ``BENCHMARK_URL`` and runs it. Finally an ssh/tmux hint for reaching
    task 1 is printed, which requires ``SSH_KEY_PATH`` in the environment.
    """
    import aws
    import os

    # Launching is asynchronous (several jobs could be started in parallel);
    # wait_until_ready() blocks until commands can be run.
    ray_job = aws.simple_job('ray', num_tasks=2, install_script=INSTALL_SCRIPT)
    ray_job.wait_until_ready()

    head_node = ray_job.tasks[0]
    head_command = "ray start --head --redis-port=%d" % (DEFAULT_PORT, )
    head_node.run(head_command)

    worker_node = ray_job.tasks[1]
    join_command = "ray start --redis-address %s:%d" % (head_node.ip,
                                                        DEFAULT_PORT)
    worker_node.run(join_command)

    # Drop any stale copy of the benchmark, fetch a fresh one, then run it.
    benchmark_file = os.path.basename(BENCHMARK_URL)
    benchmark_steps = (
        "rm -f " + benchmark_file,
        "wget " + BENCHMARK_URL,
        "python " + benchmark_file,
    )
    for step in benchmark_steps:
        worker_node.run(step)

    print("To see results:")
    ssh_hint = "ssh -i %s -o StrictHostKeyChecking=no ubuntu@%s" % (
        os.environ['SSH_KEY_PATH'], worker_node.public_ip)
    print(ssh_hint)
    print("tmux a -t tmux")
def main():
    """Launch a three-task Ray cluster and run SCRIPT_NAME on the head task.

    Task 0 is started as the Ray head; tasks 1 and 2 are started as workers
    pointing at the head's redis address. The benchmark script is then
    uploaded to the head task and executed there.

    Reads ``args.run`` (job name) and ``args.placement`` (when truthy, the
    job name is also used as the placement group name).
    """
    module_path = os.path.dirname(os.path.abspath(__file__))
    # The aws helper module lives one directory above this script.
    sys.path.append(module_path + '/..')
    import aws

    placement_name = args.run if args.placement else ''

    # Job launches are asynchronous, can spin up multiple jobs in parallel.
    print("Launching job")
    job = aws.simple_job(args.run,
                         num_tasks=3,
                         instance_type='c5.18xlarge',
                         install_script=INSTALL_SCRIPT,
                         placement_group=placement_name)

    # Block until things launch to run commands.
    job.wait_until_ready()

    # Stop any ray instance left over from a previous run; a failure to stop
    # is deliberately ignored.
    stop_ray = 'ray stop || echo "ray not started, ignoring"'

    # Start ray on the head node. Adjacent string literals are used instead
    # of a backslash continuation inside the literal, so that source
    # indentation does not end up inside the shell command.
    head_task = job.tasks[0]
    head_task.run(stop_ray)
    head_task.run("ray start --head --redis-port=%d --num-gpus=0 "
                  "--num-cpus=10000 --num-workers=10" % (DEFAULT_PORT, ))

    # Start ray on both worker nodes with identical settings.
    worker_start = ("ray start --redis-address %s:%d --num-gpus=4 "
                    "--num-cpus=4 --num-workers=0" % (head_task.ip,
                                                      DEFAULT_PORT))
    for worker_index in (1, 2):
        worker_task = job.tasks[worker_index]
        worker_task.run(stop_ray)
        worker_task.run(worker_start)

    # Upload the benchmark script and execute it on the head node.
    head_task.run("rm -f " + SCRIPT_NAME)
    head_task.upload(SCRIPT_NAME)
    head_task.run("python %s "
                  "--num-workers=1 "
                  "--num-parameter-servers=1 "
                  "--dim=25000 "
                  "--redis-address=%s:%d" % (SCRIPT_NAME, head_task.ip,
                                             DEFAULT_PORT))

    print("To see results:")
    print(head_task.connect_instructions)
def main():
    """Launch a two-task Ray cluster and run SCRIPT_NAME on the worker task.

    Task 0 is started as the Ray head and task 1 as a worker pointing at the
    head's redis address. The benchmark script is uploaded to the worker task
    and executed there against the head's redis address.

    Reads ``args.run`` (job name) and ``args.placement`` (when truthy, the
    job name is also used as the placement group name).

    Raises:
        AssertionError: if the imported ``aws`` module does not live in a
            directory whose path is shorter than this script's directory.
    """
    module_path = os.path.dirname(os.path.abspath(__file__))
    # The aws helper module lives one directory above this script.
    sys.path.append(module_path + '/..')
    import aws

    # Make sure the correct aws module is used (it moved one level up): its
    # directory path must be shorter than this script's directory path.
    import inspect
    aws_location = os.path.dirname(
        os.path.abspath(inspect.getsourcefile(aws)))
    assert len(aws_location) < len(
        module_path
    ), "Using wrong aws module, delete aws.py in current directory."

    placement_name = args.run if args.placement else ''

    # Job launches are asynchronous, can spin up multiple jobs in parallel.
    print("Launching job")
    job = aws.simple_job(args.run,
                         num_tasks=2,
                         instance_type='c5.18xlarge',
                         install_script=INSTALL_SCRIPT,
                         placement_group=placement_name)

    # Block until things launch to run commands.
    job.wait_until_ready()

    # Stop any ray instance left over from a previous run; a failure to stop
    # is deliberately ignored.
    stop_ray = 'ray stop || echo "ray not started, ignoring"'

    # Start ray on the head node. Adjacent string literals are used instead
    # of a backslash continuation inside the literal, so that source
    # indentation does not end up inside the shell command.
    head_task = job.tasks[0]
    head_task.run(stop_ray)
    head_task.run("ray start --head --redis-port=%d --num-gpus=0 "
                  "--num-cpus=10000 --num-workers=10" % (DEFAULT_PORT, ))

    # Start ray on the worker node.
    slave_task = job.tasks[1]
    slave_task.run(stop_ray)
    slave_task.run("ray start --redis-address %s:%d --num-gpus=4 "
                   "--num-cpus=4 --num-workers=0" % (head_task.ip,
                                                     DEFAULT_PORT))

    # Upload the benchmark script and execute it on the worker node.
    slave_task.run("rm -f " + SCRIPT_NAME)
    slave_task.upload(SCRIPT_NAME)
    slave_task.run("python %s "
                   "--redis-address=%s:%d --num-workers=10 "
                   "--num-parameter-servers=4 "
                   "--data-size=100000000" % (SCRIPT_NAME, head_task.ip,
                                              DEFAULT_PORT))

    print("To see results:")
    print(slave_task.connect_instructions)
def main():
    """Launch a training worker and a TensorBoard task sharing one log dir.

    Two single-task jobs are started, named ``args.name`` plus ``-worker``
    and ``-tb``. The worker runs ``cifar10_main.py`` writing to
    ``/efs/runs/<args.name>`` and the other task runs TensorBoard on the same
    directory. Only the ``us-west-2`` region is accepted.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    # The aws helper module lives one directory above this script.
    sys.path.append(script_dir + '/..')
    import aws

    assert 'AWS_DEFAULT_REGION' in os.environ
    region = os.environ.get("AWS_DEFAULT_REGION")
    assert region in ami_dict
    assert region == 'us-west-2', (
        "Currently EFS is hardwired to us-west-2 region")

    ubuntu_ami = 'ami-0def3275'  # Ubuntu 16.04 in us-west-2

    # Settings shared by both jobs; only name and instance type differ.
    shared_settings = dict(num_tasks=1,
                           install_script=INSTALL_SCRIPT,
                           ami=ubuntu_ami)
    worker_job = aws.simple_job(args.name + '-worker',
                                instance_type=args.worker_instance,
                                **shared_settings)
    tb_job = aws.simple_job(args.name + '-tb',
                            instance_type=args.tb_instance,
                            **shared_settings)

    # Both launches were issued above; now block until each is usable.
    for launched in (worker_job, tb_job):
        launched.wait_until_ready()

    worker = worker_job.tasks[0]
    tb = tb_job.tasks[0]

    logdir = '/efs/runs/' + args.name
    worker.run('python cifar10_main.py --model_dir=' + logdir)
    tb.run('tensorboard --logdir=' + logdir)

    print("Connect to worker:")
    print(worker.connect_instructions)
    tensorboard_port = 6006
    print("See tensorboard at http://%s:%d" % (tb.public_ip,
                                               tensorboard_port))
def main():
    """Launch ``args.num_tasks`` tasks and have each one ping every worker.

    From every task, ``worker<i>.tf.local`` is pinged once for each ``i`` in
    ``range(args.num_tasks)``, pausing one second between pings. After a
    task's pings are issued, its connection instructions are printed.
    """
    module_path = os.path.dirname(os.path.abspath(__file__))
    # The aws helper module lives one directory above this script.
    sys.path.append(module_path + '/..')
    import aws

    job = aws.simple_job(args.name,
                         num_tasks=args.num_tasks,
                         instance_type=args.instance_type,
                         install_script=INSTALL_SCRIPT,
                         ami=AMI,
                         linux_type='debian')

    # Block until things launch to run commands.
    job.wait_until_ready()

    for task_num, task in enumerate(job.tasks):
        for peer_num in range(args.num_tasks):
            task.run("ping worker%d.tf.local -c 1" % (peer_num, ))
            time.sleep(1)
        # Label the instructions with this task's own index; the previous
        # code formatted the inner loop variable, so every task was reported
        # under the last peer index.
        print("To connect to task %d: " % (task_num, ))
        print(task.connect_instructions)
def main():
    """Launch a two-task Ray cluster and run the downloaded benchmark.

    Task 0 is started as the Ray head. Task 1 fetches the script named by
    ``BENCHMARK_URL``, joins the head's redis address as a Ray worker and
    then runs the fetched script against that address.

    Reads ``args.run`` (job name) and ``args.placement`` (when truthy, the
    job name is also used as the placement group name).
    """
    module_path = os.path.dirname(os.path.abspath(__file__))
    # The aws helper module lives one directory above this script.
    sys.path.append(module_path + '/..')
    import aws

    placement_name = args.run if args.placement else ''

    # Job launches are asynchronous, can spin up multiple jobs in parallel.
    job = aws.simple_job(args.run,
                         num_tasks=2,
                         instance_type='c5.18xlarge',
                         install_script=INSTALL_SCRIPT,
                         placement_group=placement_name,
                         ami=AMI,
                         linux_type='debian')

    # Block until things launch to run commands.
    job.wait_until_ready()

    head_task = job.tasks[0]
    head_task.run("ray start --head --redis-port=%d --num-gpus=0 "
                  "--num-cpus=10000 --num-workers=10" % (DEFAULT_PORT, ))

    slave_task = job.tasks[1]
    # Remove any stale copy first so wget saves under the plain file name.
    script_name = os.path.basename(BENCHMARK_URL)
    slave_task.run("rm -f " + script_name)
    slave_task.run("wget " + BENCHMARK_URL)

    slave_task.run("ray start --redis-address %s:%d --num-gpus=4 "
                   "--num-cpus=4 --num-workers=0" % (head_task.ip,
                                                     DEFAULT_PORT))
    # Run the file that was just downloaded rather than a hard-coded name,
    # so the command stays correct when BENCHMARK_URL changes.
    slave_task.run("python %s --redis-address=%s:%d --num-workers=10 "
                   "--num-parameter-servers=4 "
                   "--data-size=100000000" % (script_name, head_task.ip,
                                              DEFAULT_PORT))

    print("To see results:")
    print(slave_task.connect_instructions)
def main():
    """Dispatch on ``args.role``: launch a remote worker, or be that worker.

    As ``launcher``, a single-task job named ``simple`` is started, this
    script is uploaded to it and re-run there with ``--role=worker``. As
    ``worker``, a greeting and the Python version are printed, followed by
    one "step" line per second for 10000 steps. Any other role is reported
    as unknown.
    """
    role = args.role

    if role == 'launcher':
        add_parent_path()
        import aws
        job = aws.simple_job('simple',
                             num_tasks=1,
                             install_script=INSTALL_SCRIPT)
        task = job.tasks[0]
        job.wait_until_ready()

        # Copy this script onto the machine and start it in worker mode.
        # NOTE(review): __file__ may carry a directory prefix; confirm that
        # upload() keeps the same relative path on the remote side.
        task.upload(__file__)
        worker_command = "python %s --role=worker" % (__file__,)
        # Per the original author's note, output is streamed to a local file.
        task.run(worker_command)

        print("To connect:")
        print(task.connect_instructions)
        return

    if role == 'worker':
        import sys
        import time
        print('hello world')
        print('Python version is '+str(sys.version))
        step = 0
        while step < 10000:
            time.sleep(1)
            print("step %d"%(step,))
            step += 1
        return

    print('Unknown role')
def main():
    """Bring up a two-task Ray cluster and run the benchmark on task 1.

    Task 0 is started as the Ray head on ``DEFAULT_PORT``; task 1 joins it,
    fetches the script named by ``BENCHMARK_URL`` and runs it. Connection
    instructions for task 1 are printed at the end.
    """
    import aws
    import os

    # Launches are asynchronous (several jobs could be started in parallel),
    # so block here until the tasks accept commands.
    cluster = aws.simple_job('ray', num_tasks=2, install_script=INSTALL_SCRIPT)
    cluster.wait_until_ready()

    head = cluster.tasks[0]
    head.run("ray start --head --redis-port=%d" % (DEFAULT_PORT, ))

    worker = cluster.tasks[1]
    redis_endpoint = (head.ip, DEFAULT_PORT)
    worker.run("ray start --redis-address %s:%d" % redis_endpoint)

    # Replace any stale copy of the benchmark with a fresh download, then
    # run it.
    downloaded = os.path.basename(BENCHMARK_URL)
    worker.run("rm -f " + downloaded)
    worker.run("wget " + BENCHMARK_URL)
    worker.run("python " + downloaded)

    print("To see results:")
    print(worker.connect_instructions)