Exemplo n.º 1
0
def launcher():
    """Launch one 'gpubox' job on AWS and time an fio read of /dev/xvda1.

    Creates the AWS resources, checks that ``args.zone`` lies in the default
    region, starts a single job on ``args.instance`` and, once it is ready,
    installs ``fio`` and runs a timed read job named ``volume-initialize``.

    Raises:
        AssertionError: if ``args.zone`` does not start with the region name.
    """
    # The backend modules are imported from the directory above this script.
    here = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(here + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()

    default_region = u.get_region()
    assert args.zone.startswith(default_region), (
        "Availability zone %s must be in default region %s. "
        "Default region is taken from environment variable AWS_DEFAULT_REGION"
        % (args.zone, default_region))

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(
        args.name,
        install_script='',
        ami=args.ami,
        availability_zone=args.zone,
        linux_type=args.linux_type,
    )
    job = run.make_job('gpubox', instance_type=args.instance)
    job.wait_until_ready()

    # The fio command reads $volume, which is set by the command just before it.
    fio_cmd = (
        'time sudo fio --filename=$volume --rw=read --bs=128k --iodepth=32 '
        '--ioengine=libaio --direct=1 --name=volume-initialize')
    remote_cmds = (
        'source activate mxnet_p36',
        'sudo apt install -y fio',
        'volume=/dev/xvda1',
        fio_cmd,
    )
    for remote_cmd in remote_cmds:
        job.run(remote_cmd)
Exemplo n.º 2
0
def launcher():
    """Start a two-task 'mpi' job and launch one worker process on each task.

    With ``args.run_local`` the job is created through ``tmux_backend``;
    otherwise AWS resources are created and ``aws_backend`` is used with a
    fixed AMI. After the job is ready, connection hints and task IPs are
    printed, this file is uploaded, and ``launch_mpi_test.py`` is started
    asynchronously as rank 0 and rank 1, both pointing at task 0's IP.
    """
    # The backend modules are imported from the directory above this script.
    here = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(here + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    # NOTE(review): the run below is created from args.name; confirm that
    # args.run is really the intended placement group name.
    placement_group = args.run if args.placement else ''

    if not args.run_local:
        create_resources_lib.create_resources()
        u.get_region()  # result is not used further in this function
        run = aws_backend.make_run(args.name,
                                   ami='ami-e580c79d',
                                   availability_zone=args.zone)
    else:
        run = tmux_backend.make_run(args.name)

    job = run.make_job(
        'mpi',
        instance_type=args.instance,
        num_tasks=2,
        placement_group=placement_group,
    )
    job.wait_until_ready()

    print("Job ready for connection, to connect to most recent task, "
          "run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    for _ in range(4):
        print()

    print("Task internal IPs")
    for mpi_task in job.tasks:
        print(mpi_task.ip)

    job.upload(__file__)
    if not args.run_local:
        job.run('killall python || echo failed')  # kill previous run
        job.run('source activate pytorch_p36')

    # Both ranks receive task 0's address as the master address.
    for rank in range(2):
        job.tasks[rank].run(
            'python launch_mpi_test.py --role=worker --rank=%d --size=2 --master-addr='
            % rank + job.tasks[0].ip,
            sync=False)
Exemplo n.º 3
0
def main():
    """Launch 'worker' and 'ps' jobs on AWS and run iperf3 between the workers.

    Creates the AWS resources, validates ``args.zone`` against the default
    region, picks the install script and AMI table for ``args.linux_type``
    (an explicit ``args.ami`` overrides the table), and starts two jobs of two
    tasks each. An iperf3 server is started on worker task 0 and a client on
    worker task 1 connects to it; connection hints are printed afterwards.

    Raises:
        AssertionError: if the zone is outside the default region, the linux
            type is unknown, or no AMI is mapped for the region.
    """
    # The backend modules are imported from the directory above this script.
    module_path = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(module_path + '/..')
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()
    region = u.get_region()
    assert args.zone.startswith(
        region
    ), "Availability zone %s must be in default region %s. Default region is taken from environment variable AWS_DEFAULT_REGION" % (
        args.zone, region)

    if args.linux_type == 'ubuntu':
        install_script = INSTALL_SCRIPT_UBUNTU
        ami_dict = ami_dict_ubuntu
    elif args.linux_type == 'amazon':
        install_script = INSTALL_SCRIPT_AMAZON
        ami_dict = ami_dict_amazon
    else:
        assert False, "Unknown linux type " + args.linux_type

    if args.ami:
        print(
            "Warning, using provided AMI, make sure that --linux-type argument "
            "is set correctly")
        ami = args.ami
    else:
        assert region in ami_dict, "Define proper AMI mapping for this region."
        ami = ami_dict[region]

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(args.name,
                               install_script=install_script,
                               ami=ami,
                               availability_zone=args.zone,
                               linux_type=args.linux_type)
    worker_job = run.make_job('worker',
                              instance_type=args.instance_type,
                              num_tasks=2)
    # The 'ps' job is created and awaited, but no command is run on it here.
    ps_job = run.make_job('ps', instance_type=args.instance_type, num_tasks=2)
    worker_job.wait_until_ready()
    ps_job.wait_until_ready()

    # iperf3 server on worker 0 (async), client on worker 1 targeting it.
    worker_job.tasks[0].run_async('sudo iperf3 -s -p 6006')
    worker_job.tasks[1].run('sudo iperf3 -c %s -P 10 -i 1 -t 60 -V -p 6006' %
                            (worker_job.tasks[0].ip, ))
    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    # Fix: there is no `job` in this function; the original line raised
    # NameError. The worker job is the one the iperf3 test runs on.
    print(worker_job.connect_instructions)
    print()
    print()
    print()
    print()
Exemplo n.º 4
0
def maybe_create_resources():
    """Create the AWS resources only when some of them appear to be missing."""

    def do_create_resources():
        """Return True if the keypair, the VPC or the gateway is not found."""
        name = u.get_resource_name()

        keypair_missing = u.get_keypair_name() not in u.get_keypair_dict()
        if keypair_missing:
            return True

        vpc_by_name = u.get_vpc_dict()
        if name not in vpc_by_name:
            return True

        # Gateways are looked up for the VPC that carries the resource name.
        return name not in u.get_gateway_dict(vpc_by_name[name])

    if not do_create_resources():
        return

    import create_resources as create_resources_lib
    create_resources_lib.create_resources()
Exemplo n.º 5
0
def launcher():
    """Launch a 'gpubox' job with a user-data script and start a worker on it.

    Creates the AWS resources, validates ``args.zone`` against the default
    region and resolves the AMI (``args.ami`` if given, otherwise the Ubuntu
    table entry for the region). The run is created with a bash user-data
    script; once the job is ready, connection hints are printed, a TensorFlow
    nightly wheel is installed, this file is uploaded and re-run on the job
    with ``--role=worker``.

    Raises:
        AssertionError: if the zone is outside the default region or no AMI
            is mapped for the region.
    """
    # The backend modules are imported from the directory above this script.
    here = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(here + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()

    region = u.get_region()
    assert args.zone.startswith(region), (
        "Availability zone %s must be in default region %s. "
        "Default region is taken from environment variable AWS_DEFAULT_REGION"
        % (args.zone, region))

    region_to_ami = ami_dict_ubuntu

    if not args.ami:
        assert region in region_to_ami, "Define proper AMI mapping for this region."
        ami = region_to_ami[region]
    else:
        print("Warning, using provided AMI, make sure that --linux-type "
              "argument is set correctly")
        ami = args.ami

    # Passed to make_run as user_data; content must stay flush-left.
    user_data = """#!/bin/bash
sudo mkdir -p /efs
sudo chmod 777 /efs
echo 'Running user-data!'
echo 'test' > /home/ubuntu/test.txt
echo 'activating pytorch_p36'
source /home/ubuntu/anaconda3/bin/activate pytorch_p36
echo $PS1
echo $PS1 > /home/ubuntu/test2.txt
pip install ray
echo 'INSTALLED ray'
echo 'INSTALLED ray' > /home/ubuntu/test3.txt
"""

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(
        args.name,
        install_script='',
        ami=ami,
        availability_zone=args.zone,
        linux_type=args.linux_type,
        user_data=user_data,
    )
    job = run.make_job('gpubox', instance_type=args.instance)
    job.wait_until_ready()

    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    for _ in range(4):
        print()

    job.run('source activate mxnet_p36')
    # As of Jan 26 the official tensorflow-gpu release gave an incompatible
    # numpy error, so a nightly build is installed instead.
    nightly_wheel = (
        'http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/'
        'TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,'
        'label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/'
        'tf_nightly_gpu-1.head-cp36-cp36m-linux_x86_64.whl')
    job.run('pip install -U ' + nightly_wheel)

    job.upload(__file__)
    job.run('killall python || echo failed')  # kill previous run
    this_script = os.path.basename(__file__)
    job.run_async('python %s --role=worker' % (this_script, ))
Exemplo n.º 6
0
def launch(backend, install_script='', init_cmd=''):
    """Start a ray cluster on a 'worker' job and run ray_adder.py on its head.

    Args:
        backend: backend module; it is treated as local unless its
            ``__name__`` is ``'aws_backend'``.
        install_script: passed as ``install_script`` to a local run, or as
            ``user_data`` to an AWS run.
        init_cmd: command executed on the whole run before ray is started.

    Raises:
        AssertionError: on AWS, if ``args.zone`` does not start with the
            region name.
    """
    placement_group = args.name if args.placement else ''
    num_tasks = args.workers + args.ps
    run_local = backend.__name__ != 'aws_backend'

    if not run_local:
        region = u.get_region()
        assert args.zone.startswith(region), (
            "Your specified zone is %s but your region (from AWS_DEFAULT_REGION) is %s, "
            "please specify zone correctly, such as --zone=%sa"
            % (args.zone, region, region))
        create_resources_lib.create_resources()
        ami = ami_dict_ubuntu[u.get_region()]
        run = backend.make_run(
            args.name,
            user_data=install_script,
            ami=ami,
            availability_zone=args.zone,
            skip_efs_mount=True,
        )
        job = run.make_job(
            'worker',
            num_tasks=num_tasks,
            instance_type=args.instance,
            placement_group=placement_group,
        )
    else:
        run = backend.make_run(args.name, install_script=install_script)
        job = run.make_job('worker', num_tasks)

    # The loop rebinds `job`; everything below works with the last entry of
    # run.jobs.
    for job in run.jobs:
        job.wait_until_ready()

    head_task = job.tasks[0]  # worker 0 is also the head node
    for local_file in ('ray_adder.py', '../util.py'):  # util.py just in case?
        head_task.upload(local_file)

    # todo: use task.port instead of DEFAULT_PORT
    run.run(init_cmd)
    run.run('ray stop || echo "ignoring error"')
    if args.omp_threads:
        run.run('export OMP_NUM_THREADS=' + str(args.omp_threads))

    # Head node start. A local run advertises more gpus because all workers
    # share one machine.
    head_cmd = "ray start --head --redis-port=%d --num-workers=0" % (
        DEFAULT_PORT, )
    head_cmd += ' --num-gpus=10' if run_local else ' --num-gpus=1'
    head_task.run(head_cmd)

    # Remaining tasks join the head node (AWS runs only).
    if not run_local:
        leaf_cmd = "ray start --redis-address %s:%d --num-gpus=1 --num-workers=0" % (
            head_task.ip, DEFAULT_PORT)
        for leaf_task in job.tasks[1:]:
            leaf_task.run(leaf_cmd)

    client_parts = [
        'python ray_adder.py --redis-address %s:%d --size-mb %d' % (
            head_task.ip, DEFAULT_PORT, args.size_mb),
        ' --iters %d --workers %d --ps %d' % (args.iters, args.workers,
                                              args.ps),
    ]
    if args.memcpy_threads:
        client_parts.append(' --memcpy-threads %d' % (args.memcpy_threads, ))
    if not run_local:
        client_parts.append(' --enforce-different-ips=1')
    client_cmd = ''.join(client_parts)

    head_task.run('rm log.txt || echo nevermind')
    head_task.run(client_cmd, sync=False)

    log("Streaming log.txt of task[0]")
    job.tasks[0].stream_file('log.txt')
def etl():
    """Run the ETL pipeline, then interactively offer validation or teardown.

    Steps: create the AWS resources, connect to the Redshift cluster, drop and
    re-create the tables, load the staging tables, fill the fact and dimension
    tables, and close the connection. Afterwards the user is prompted in a
    loop: ``y`` runs the validation queries, ``n`` asks for confirmation and,
    if confirmed, deletes the AWS resources and exits the process.

    The prompt loop has no other exit, so this function only ends through
    ``sys.exit()`` or an unhandled exception.
    """
    print('Creating Resources...')
    create_resources()
    print('AWS Resources have been created.')

    print('Initiate ETL...')
    print('Connecting to Redshift Cluster...')
    cur, conn = create_connection()

    # Close the connection even when one of the ETL steps raises, so a failed
    # run does not leave it open.
    try:
        print('Dropping Tables...')
        drop_tables(cur, conn)

        print('Creating Tables...')
        create_tables(cur, conn)

        print('Loading Staging Tables...')
        load_staging_tables(cur, conn)

        print('Loading Fact & Dimension Tables...')
        insert_tables(cur, conn)
    finally:
        print('Closing Cluster Connection...')
        conn.close()

    # Accepted (upper-cased) answers to the yes/no prompts.
    answer_list = ['Y', 'N']

    # Keep prompting until the user confirms teardown, which exits the program.
    while True:
        try:
            answer = input(
                "Would you like to run some validation queries? Please enter [y] or [n]: "
            ).upper()
            if answer == 'N':
                confirm = input(
                    "This will delete all AWS Resources. Do you want to proceed? Please enter [y] or [n]: "
                ).upper()
                if confirm == 'Y':
                    # Delete resources before leaving the program.
                    print('Deleting Resources...')
                    delete_resources()

                    print('Exiting Script... Goodbye! \n')
                    sys.exit()
                # Any other reply falls through and re-asks the first question.
            elif answer not in answer_list:
                print(
                    "Error! This is not a answer. These are valids ANSWERS {} "
                    .format(answer_list))
            else:
                validation_queries()

        # The prompts themselves do not raise ValueError; presumably this
        # guards input parsing inside validation_queries() -- TODO confirm.
        except ValueError:
            print("Error! This is not a letter. Try again.")
Exemplo n.º 8
0
def launcher():
    """Start a 'worker' job and run this script as a worker on every task.

    Without ``args.zone`` the job is created through ``tmux_backend``. With a
    zone, the zone is validated against the default region, the AMI is taken
    from ``args.ami`` or the region table, AWS resources are created and
    ``aws_backend`` is used. After the job is ready, connection hints and task
    IPs are printed, this file is uploaded, and one asynchronous worker
    command per machine is started, each pointing at task 0's IP.

    Raises:
        AssertionError: if the zone is outside the default region or no AMI
            is mapped for the region.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(here + '/..')  # aws_backend.py is one level up
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    placement_group = args.name if args.placement else ''

    if args.zone:
        region = u.get_region()
        print("Using region", region)
        assert args.zone.startswith(region), (
            "Availability zone %s must be in default region %s. "
            "Default region is taken from environment variable AWS_DEFAULT_REGION"
            % (args.zone, region))

        if not args.ami:
            assert region in ami_dict, "Define proper AMI mapping for this region."
            ami = ami_dict[region]
        else:
            print("Warning, using provided AMI, make sure that --linux-type "
                  "argument is set correctly")
            ami = args.ami

        create_resources_lib.create_resources()
        region = u.get_region()  # queried again after resource creation
        run = aws_backend.make_run(
            args.name,
            ami=ami,
            availability_zone=args.zone,
            linux_type=args.linux_type,
        )
    else:
        run = tmux_backend.make_run(args.name)

    job = run.make_job(
        'worker',
        instance_type=args.instance_type,
        num_tasks=args.num_machines,
        placement_group=placement_group,
    )
    job.wait_until_ready()

    print("Job ready for connection, to connect to most recent task, "
          "run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    for _ in range(4):
        print()

    print("Task internal IPs")
    for worker_task in job.tasks:
        print(worker_task.ip)

    job.upload(__file__)
    if args.zone:
        job.run('killall python || echo failed')  # kill previous run
        job.run('source activate pytorch_p36')

    script_name = os.path.basename(__file__)
    cmd_template = ('python %s --role=worker --rank=%d --data-size-mb=%d '
                    '--num-machines=%d --master-addr=%s')
    for rank in range(args.num_machines):
        worker_cmd = cmd_template % (script_name, rank, args.data_size_mb,
                                     args.num_machines, job.tasks[0].ip)
        job.tasks[rank].run(worker_cmd, sync=False)
Exemplo n.º 9
0
def launcher():
    """Launch a 'gpubox' job on AWS and start ``launch.py`` as a worker on it.

    Creates the AWS resources, validates ``args.zone`` against the default
    region, picks the install script and AMI table for ``args.linux_type``
    (``args.ami`` overrides the table), starts the job, prints connection
    hints, uploads this file and runs ``python launch.py --role=worker``
    asynchronously.

    Raises:
        AssertionError: if the zone is outside the default region, the linux
            type is unknown, or no AMI is mapped for the region.
    """
    # The backend modules are imported from the directory above this script.
    here = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(here + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()

    region = u.get_region()
    assert args.zone.startswith(region), (
        "Availability zone %s must be in default region %s. "
        "Default region is taken from environment variable AWS_DEFAULT_REGION"
        % (args.zone, region))

    linux_type = args.linux_type
    if linux_type == 'amazon':
        install_script = INSTALL_SCRIPT_AMAZON
        ami_dict = ami_dict_amazon
    elif linux_type == 'ubuntu':
        install_script = INSTALL_SCRIPT_UBUNTU
        ami_dict = ami_dict_ubuntu
    else:
        assert False, "Unknown linux type " + args.linux_type

    if not args.ami:
        assert region in ami_dict, "Define proper AMI mapping for this region."
        ami = ami_dict[region]
    else:
        print("Warning, using provided AMI, make sure that --linux-type "
              "argument is set correctly")
        ami = args.ami

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(
        args.name,
        install_script=install_script,
        ami=ami,
        availability_zone=args.zone,
        linux_type=args.linux_type,
    )
    job = run.make_job('gpubox', instance_type=args.instance)
    job.wait_until_ready()

    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    for _ in range(4):
        print()

    # No TensorFlow install happens here; the environment is only activated.
    job.run('source activate mxnet_p36')

    job.upload(__file__)
    job.run('killall python || echo failed')  # kill previous run
    job.run_async('python launch.py --role=worker')
Exemplo n.º 10
0
def launcher():
    """Launch a 'worker' job on AWS and run ``tf_numpy_benchmark.py`` on it.

    Creates the AWS resources, validates ``args.zone`` against the default
    region, picks the install script and AMI table for ``args.linux_type``
    (``args.ami`` overrides the table) and starts the job. Once it is ready,
    connection hints are printed, a few packages are installed, this file and
    the benchmark script are uploaded, and the benchmark is run.

    Raises:
        AssertionError: if the zone is outside the default region, the linux
            type is unknown, or no AMI is mapped for the region.
    """
    # The backend modules are imported from the directory above this script.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    sys.path.append(script_dir + '/..')
    import tmux_backend
    import aws_backend
    import create_resources as create_resources_lib
    import util as u

    create_resources_lib.create_resources()

    region = u.get_region()
    zone_in_region = args.zone.startswith(region)
    assert zone_in_region, (
        "Availability zone %s must be in default region %s. "
        "Default region is taken from environment variable AWS_DEFAULT_REGION"
        % (args.zone, region))

    if args.linux_type == 'ubuntu':
        install_script, ami_dict = INSTALL_SCRIPT_UBUNTU, ami_dict_ubuntu
    elif args.linux_type == 'amazon':
        install_script, ami_dict = INSTALL_SCRIPT_AMAZON, ami_dict_amazon
    else:
        assert False, "Unknown linux type " + args.linux_type

    if args.ami:
        print("Warning, using provided AMI, make sure that --linux-type "
              "argument is set correctly")
        ami = args.ami
    else:
        assert region in ami_dict, "Define proper AMI mapping for this region."
        ami = ami_dict[region]

    # TODO: add API to create jobs with default run
    run = aws_backend.make_run(
        args.name,
        install_script=install_script,
        ami=ami,
        availability_zone=args.zone,
        linux_type=args.linux_type,
    )
    job = run.make_job('worker', instance_type=args.instance_type)
    job.wait_until_ready()

    print("Job ready for connection, run the following:")
    print("../connect " + args.name)
    print("Alternatively run")
    print(job.connect_instructions)
    print("\n" * 3)

    setup_cmds = (
        'source activate tensorflow_p36',
        'pip install cython',
        'pip install ray',
        # The apt call below can fail with:
        # E: Could not get lock /var/lib/dpkg/lock - open (11: Resource temporarily unavailable)
        'sudo apt install htop',
        'yes | sudo apt-get install google-perftools',
        'export LD_PRELOAD="/usr/lib/libtcmalloc.so.4"',
    )
    for setup_cmd in setup_cmds:
        job.run(setup_cmd)

    for local_file in (__file__, 'tf_numpy_benchmark.py'):
        job.upload(local_file)

    job.run('killall python || echo failed')  # kill previous run
    job.run('python tf_numpy_benchmark.py')