def main():
    """Launch the resnet.b512 baseline training run on AWS.

    Creates a single-instance job, wipes the previous EFS log directory,
    starts tensorboard in a detached tmux session, then uploads the model
    sources and kicks off training asynchronously.
    """
    import aws_backend

    training_run = aws_backend.make_run(args.name,
                                        ami=args.ami,
                                        availability_zone=args.zone,
                                        linux_type=args.linux_type)
    job = training_run.make_job('main', instance_type=args.instance_type)
    job.wait_until_ready()
    print(job.connect_instructions)

    # If tensorboard is running, kill it: it holds the efs logdir open and
    # would prevent it from being deleted below.
    job.run("tmux kill-session -t tb || echo ok")
    logdir = '/efs/runs/%s/%s' % (args.group, args.name)
    job.run('rm -Rf %s || echo failed' % (logdir,))  # delete prev logs

    # Launch the tensorboard visualizer in a separate tmux session.
    job.run("tmux new-session -s tb -n 0 -d")
    job.run("tmux send-keys -t tb:0 'source activate mxnet_p36' Enter")
    job.run("tmux send-keys -t tb:0 'tensorboard --logdir %s' Enter" % (logdir,))

    job.run('source activate mxnet_p36')
    job.run('killall python || echo failed')  # kill previous run
    job.run('pip install -U https://s3.amazonaws.com/inferno-dlami/tensorflow/p3/tensorflow-1.5.0-cp36-cp36m-linux_x86_64.whl')

    for script in ('imagenet_utils.py', 'resnet_model.py', 'resnet.b512.baseline.py'):
        job.upload(script)
    job.run_async('python resnet.b512.baseline.py --logdir=%s' % (logdir,))
def main():
    """Create a run from command-line args and spawn its worker job."""
    worker_run = aws_backend.make_run(args.name,
                                      ami_name=args.ami_name,
                                      availability_zone=args.zone,
                                      linux_type=args.linux_type,
                                      skip_efs_mount=args.skip_efs_mount)
    create_job(worker_run, 'worker', args.num_tasks)
def main():
    """Launch the mnist-convnet example on AWS with a tensorboard session.

    Clears the previous run's EFS logs, starts tensorboard in a detached
    tmux session, uploads the script, and runs it asynchronously.
    """
    import aws_backend

    # TODO: add API to create jobs with default run
    mnist_run = aws_backend.make_run(args.name,
                                     ami=args.ami,
                                     availability_zone=args.zone,
                                     linux_type=args.linux_type)
    job = mnist_run.make_job('main', instance_type=args.instance_type)
    job.wait_until_ready()
    print(job.connect_instructions)

    # If tensorboard is running, kill it: it would prevent the efs logdir
    # from being deleted.
    job.run("tmux kill-session -t tb || echo ok")
    job.run('rm -Rf /efs/runs/yuxin_numpy/mnist-convnet || echo failed')  # delete prev logs

    # Launch the tensorboard visualizer in a separate tmux session.
    job.run("tmux new-session -s tb -n 0 -d")
    job.run("tmux send-keys -t tb:0 'source activate mxnet_p36' Enter")
    job.run("tmux send-keys -t tb:0 'tensorboard --logdir /efs/runs/yuxin_numpy' Enter")

    job.run('source activate mxnet_p36')
    job.upload(module_path + '/mnist-convnet.py')
    job.run('killall python || echo failed')  # kill previous run
    job.run_async('python mnist-convnet.py')
def main():
    """Create a run from command-line args and spawn the requested job."""
    import aws_backend

    new_run = aws_backend.make_run(args.name,
                                   ami=args.ami,
                                   availability_zone=args.zone,
                                   linux_type=args.linux_type)
    create_job(new_run, args.job_name, args.num_tasks)
def launcher():
    """Bring up a gpubox instance and benchmark raw volume read throughput.

    Uses fio to do a full sequential read of the root device, which also
    serves to warm up ("initialize") the volume.
    """
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()
    region = u.get_region()
    assert args.zone.startswith(region), (
        "Availability zone %s must be in default region %s. Default region is "
        "taken from environment variable AWS_DEFAULT_REGION" % (args.zone, region))

    install_script = ''
    ami = args.ami

    # TODO: add API to create jobs with default run
    bench_run = aws_backend.make_run(args.name,
                                     install_script=install_script,
                                     ami=ami,
                                     availability_zone=args.zone,
                                     linux_type=args.linux_type)
    job = bench_run.make_job('gpubox', instance_type=args.instance)
    job.wait_until_ready()

    job.run('source activate mxnet_p36')
    job.run('sudo apt install -y fio')
    # NOTE(review): assumes consecutive job.run commands share one shell
    # session so $volume persists to the next command — confirm in backend.
    job.run('volume=/dev/xvda1')
    job.run('time sudo fio --filename=$volume --rw=read --bs=128k --iodepth=32 '
            '--ioengine=libaio --direct=1 --name=volume-initialize')
def main():
    """Launch two worker and two ps instances and measure network bandwidth.

    Worker task 0 runs an iperf3 server; worker task 1 drives a 60-second,
    10-stream benchmark against it.

    Fix: the final status printout referenced an undefined name `job`
    (NameError) — only `worker_job` and `ps_job` exist here; print the
    worker job's connect instructions instead.
    """
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()
    region = u.get_region()
    assert args.zone.startswith(region), (
        "Availability zone %s must be in default region %s. Default region is "
        "taken from environment variable AWS_DEFAULT_REGION" % (args.zone, region))

    # Pick install script + AMI mapping for the requested linux flavor.
    if args.linux_type == 'ubuntu':
        install_script = INSTALL_SCRIPT_UBUNTU
        ami_dict = ami_dict_ubuntu
    elif args.linux_type == 'amazon':
        install_script = INSTALL_SCRIPT_AMAZON
        ami_dict = ami_dict_amazon
    else:
        assert False, "Unknown linux type " + args.linux_type

    if args.ami:
        print("Warning, using provided AMI, make sure that --linux-type argument "
              "is set correctly")
        ami = args.ami
    else:
        assert region in ami_dict, "Define proper AMI mapping for this region."
        ami = ami_dict[region]

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script=install_script,
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)
    worker_job = run.make_job('worker', instance_type=args.instance_type,
                              num_tasks=2)
    ps_job = run.make_job('ps', instance_type=args.instance_type, num_tasks=2)
    worker_job.wait_until_ready()
    ps_job.wait_until_ready()

    # Server runs asynchronously on task 0; task 1 runs the client benchmark.
    worker_job.tasks[0].run_async('sudo iperf3 -s -p 6006')
    worker_job.tasks[1].run('sudo iperf3 -c %s -P 10 -i 1 -t 60 -V -p 6006' %
                            (worker_job.tasks[0].ip,))

    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(worker_job.connect_instructions)  # was undefined `job` — NameError
    print()
    print()
    print()
    print()
def main():
    """Launch a worker box and start either jupyter or a tf benchmark on it.

    In jupyter mode, uploads a notebook config carrying the hashed password
    and a sample notebook, then starts the server asynchronously.

    Fix: the original computed `sha = passwd(args.password)` but never used
    it — the config was written with the literal string '******', so the
    user-supplied password never took effect. Write the real hash.
    """
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import util as u

    u.maybe_create_resources()

    run = aws_backend.make_run(args.name, ami_name=args.ami_name)
    job = run.make_job('worker', instance_type=args.instance_type,
                       use_spot=args.spot)
    job.wait_until_ready()

    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    print()
    print()
    print()
    print()

    if args.mode == 'jupyter':
        # upload notebook config with provided password
        from notebook.auth import passwd
        sha = passwd(args.password)
        local_config_fn = f'{module_path}/jupyter_notebook_config.py'
        temp_config_fn = '/tmp/' + os.path.basename(local_config_fn)
        remote_config_fn = f'/home/ubuntu/.jupyter/{os.path.basename(local_config_fn)}'
        os.system(f'cp {local_config_fn} {temp_config_fn}')
        # Use the computed hash (the original wrote a literal '******' here).
        _replace_lines(temp_config_fn, 'c.NotebookApp.password',
                       f"c.NotebookApp.password = '{sha}'")
        job.upload(temp_config_fn, remote_config_fn)

        # upload sample notebook and start server
        job.run('mkdir -p /efs/notebooks')
        job.upload(f'{module_path}/sample.ipynb', '/efs/notebooks/sample.ipynb',
                   dont_overwrite=True)
        job.run('cd /efs/notebooks')
        job.run_async('jupyter notebook')
        print(f'Jupyter notebook will be at http://{job.public_ip}:8888')
    elif args.mode == 'tf-benchmark':
        job.run('source activate tensorflow_p36')
        job.upload(__file__)
        job.run('killall python || echo pass')  # kill previous run
        job.run_async('python launch.py --internal-role=worker')
    else:
        assert False, "Unknown --mode, must be jupyter or tf-benchmark."
def main():
    """Create a run/job pair and kick off training with user-supplied params."""
    training_run = aws_backend.make_run(args.name,
                                        ami=args.ami,
                                        ami_name=args.ami_name,
                                        availability_zone=args.zone,
                                        linux_type=args.linux_type,
                                        skip_efs_mount=(not args.mount_efs))
    job = create_job(training_run, args.job_name, args.num_tasks)

    # Define custom params for training or use a preset above.
    # SECURITY: eval() on a command-line string executes arbitrary code;
    # tolerable only because this is a developer-facing launcher.
    params = eval(args.params)
    start_training(job, params, save_tag='testing_refactor')
def main():
    """Create a run/job from the parameters embedded in --params and train.

    AMI name, task count, and conda env are all extracted from the params
    blob; the legacy --num-tasks / --ami-name flags are rejected.
    """
    # SECURITY: eval() on a command-line string executes arbitrary code;
    # tolerable only because this is a developer-facing launcher.
    params = eval(args.params)
    assert args.num_tasks == -1, (
        "num-tasks is deprecated, it's now specified along with training "
        "parameters as --num-tasks.")
    assert args.ami_name == '-1', (
        "ami_name is deprecated, it's now specified along with training "
        "parameters as --ami-name.")

    ami_name = _extract_ami_name(params)
    num_tasks = _extract_num_tasks(params)
    env_name = _extract_env_name(params)

    run = aws_backend.make_run(args.name,
                               ami_name=ami_name,
                               skip_efs_mount=args.skip_efs_mount)
    job = create_job(run, 'worker', num_tasks, env_name)
    run.setup_logdir()  # must happen after first job is created and ready

    # Define custom params for training or use a preset above
    # TODO: move "save_tag" into command-line parameter
    start_training(job, params, save_tag=args.name)
def launcher():
    """Bring up a gpubox with custom boot-time user-data, install nightly TF,
    and start this script remotely in the worker role.

    Note: the pip-install URL string was split across two physical lines in
    the flattened source; it is reconstructed here as a single literal.
    """
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()
    region = u.get_region()
    assert args.zone.startswith(region), (
        "Availability zone %s must be in default region %s. Default region is "
        "taken from environment variable AWS_DEFAULT_REGION" % (args.zone, region))

    ami_dict = ami_dict_ubuntu
    if args.ami:
        print("Warning, using provided AMI, make sure that --linux-type argument "
              "is set correctly")
        ami = args.ami
    else:
        assert region in ami_dict, "Define proper AMI mapping for this region."
        ami = ami_dict[region]

    # Boot-time script: prepare /efs mount point, activate pytorch env,
    # install ray; the echo-to-file lines are boot-progress breadcrumbs.
    user_data = """#!/bin/bash
sudo mkdir -p /efs
sudo chmod 777 /efs
echo 'Running user-data!'
echo 'test' > /home/ubuntu/test.txt
echo 'activating pytorch_p36'
source /home/ubuntu/anaconda3/bin/activate pytorch_p36
echo $PS1
echo $PS1 > /home/ubuntu/test2.txt
pip install ray
echo 'INSTALLED ray'
echo 'INSTALLED ray' > /home/ubuntu/test3.txt
"""

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script='',
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type,
                               user_data=user_data)
    job = run.make_job('gpubox', instance_type=args.instance)
    job.wait_until_ready()

    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    print()
    print()
    print()
    print()

    job.run('source activate mxnet_p36')
    # as of Jan 26, official version gives incompatible numpy error, so pin to nightly
    # job.run('pip install tensorflow-gpu')
    # job.run('pip install -U https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.6.0.dev20180126-cp36-cp36m-manylinux1_x86_64.whl')
    job.run('pip install -U http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp36-cp36m-linux_x86_64.whl')
    job.upload(__file__)
    job.run('killall python || echo failed')  # kill previous run
    job.run_async('python %s --role=worker' % (os.path.basename(__file__),))
def launcher():
    """Bring up a gpubox and start this launcher remotely in the worker role.

    Note: the commented-out pip-install line was split across two physical
    lines in the flattened source; it is reconstructed here as one comment.
    """
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()
    region = u.get_region()
    assert args.zone.startswith(region), (
        "Availability zone %s must be in default region %s. Default region is "
        "taken from environment variable AWS_DEFAULT_REGION" % (args.zone, region))

    # Pick install script + AMI mapping for the requested linux flavor.
    if args.linux_type == 'ubuntu':
        install_script = INSTALL_SCRIPT_UBUNTU
        ami_dict = ami_dict_ubuntu
    elif args.linux_type == 'amazon':
        install_script = INSTALL_SCRIPT_AMAZON
        ami_dict = ami_dict_amazon
    else:
        assert False, "Unknown linux type " + args.linux_type

    if args.ami:
        print("Warning, using provided AMI, make sure that --linux-type argument "
              "is set correctly")
        ami = args.ami
    else:
        assert region in ami_dict, "Define proper AMI mapping for this region."
        ami = ami_dict[region]

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script=install_script,
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)
    job = run.make_job('gpubox', instance_type=args.instance)
    job.wait_until_ready()

    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    print()
    print()
    print()
    print()

    job.run('source activate mxnet_p36')
    # as of Jan 26, official version gives incompatible numpy error, so pin to nightly
    # job.run('pip install tensorflow-gpu')
    # job.run('pip install -U https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.6.0.dev20180126-cp36-cp36m-manylinux1_x86_64.whl')
    # job.run('pip install --default-timeout=100 -U http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp36-cp36m-linux_x86_64.whl')
    job.upload(__file__)
    job.run('killall python || echo failed')  # kill previous run
    job.run_async('python launch.py --role=worker')
def launcher():
    """Bring up a worker box and run tf_numpy_benchmark.py on it.

    Installs cython/ray/htop/google-perftools and preloads tcmalloc before
    running the benchmark.

    Fix: 'sudo apt install htop' had no -y, so apt could block waiting for
    interactive confirmation on the remote session (the adjacent
    google-perftools line already works around this with 'yes |').
    """
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()
    region = u.get_region()
    assert args.zone.startswith(region), (
        "Availability zone %s must be in default region %s. Default region is "
        "taken from environment variable AWS_DEFAULT_REGION" % (args.zone, region))

    # Pick install script + AMI mapping for the requested linux flavor.
    if args.linux_type == 'ubuntu':
        install_script = INSTALL_SCRIPT_UBUNTU
        ami_dict = ami_dict_ubuntu
    elif args.linux_type == 'amazon':
        install_script = INSTALL_SCRIPT_AMAZON
        ami_dict = ami_dict_amazon
    else:
        assert False, "Unknown linux type " + args.linux_type

    if args.ami:
        print("Warning, using provided AMI, make sure that --linux-type argument "
              "is set correctly")
        ami = args.ami
    else:
        assert region in ami_dict, "Define proper AMI mapping for this region."
        ami = ami_dict[region]

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script=install_script,
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)
    job = run.make_job('worker', instance_type=args.instance_type)
    job.wait_until_ready()

    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    print()
    print()
    print()
    print()

    job.run('source activate tensorflow_p36')
    job.run('pip install cython')
    job.run('pip install ray')
    # below can fail on
    # E: Could not get lock /var/lib/dpkg/lock - open (11: Resource temporarily unavailable)
    job.run('sudo apt install -y htop')  # -y added: avoid interactive prompt hang
    job.run('yes | sudo apt-get install google-perftools')
    # NOTE(review): assumes job.run commands share one shell session so this
    # export is visible to the benchmark below — confirm in backend.
    job.run('export LD_PRELOAD="/usr/lib/libtcmalloc.so.4"')
    job.upload(__file__)
    job.upload('tf_numpy_benchmark.py')
    job.run('killall python || echo failed')  # kill previous run
    job.run('python tf_numpy_benchmark.py')