Example #1
def test_stream_output():
  name = "testjob"
  job = aws.tf_job(name, 4)
  job.wait_until_ready()
  task = job.tasks[0]
  task.run('cd Dropbox && ls')
  time.sleep(0.5)  # async ... todo: expose thread and join instead of sleep?
  os.system('cat '+task.last_stdout)
Example #2
def test_new_job():
  name = "testjob"
  instances = toby_aws.LookupAwsInstances(instance_tag=name)
  assert not instances, "Instances already exist, kill them first"

  job = aws.tf_job(name, 2)
  instances = toby_aws.LookupAwsInstances(instance_tag=name)
  assert len(instances) == 2
Example #3
def test_stream_output():
    name = "testjob"
    job = aws.tf_job(name, 4)
    job.wait_until_ready()
    task = job.tasks[0]
    task.run('cd Dropbox && ls')
    time.sleep(0.5)  # async ... todo: expose thread and join instead of sleep?
    os.system('cat ' + task.last_stdout)
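The sleep in test_stream_output is exactly the race its own todo flags. Until the task exposes a thread to join, a polling helper can stand in; this is a minimal sketch using only the standard library, assuming task.last_stdout is a local file path that the async run appends to:

import os
import time

def wait_for_output(path, timeout=10.0, poll=0.1):
    # Poll until the stdout file exists and is non-empty, or time out.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if os.path.exists(path) and os.path.getsize(path) > 0:
            return True
        time.sleep(poll)
    return False

With that helper, time.sleep(0.5) becomes assert wait_for_output(task.last_stdout), which fails loudly on a slow task instead of cat-ing an empty file.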
Example #4
def test_new_job():
    name = "testjob"
    instances = toby_aws.LookupAwsInstances(instance_tag=name)
    assert not instances, "Instances already exist, kill them first"

    job = aws.tf_job(name, 2)
    instances = toby_aws.LookupAwsInstances(instance_tag=name)
    assert len(instances) == 2
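LookupAwsInstances itself is never shown in these examples. A plausible sketch over boto3 follows; the tag:Name key and the running-state filter are assumptions, not confirmed by the source:

import boto3

def lookup_aws_instances(instance_tag):
    # Hypothetical: return running EC2 instances whose Name tag matches.
    ec2 = boto3.resource('ec2')
    filters = [{'Name': 'tag:Name', 'Values': [instance_tag]},
               {'Name': 'instance-state-name', 'Values': ['running']}]
    return list(ec2.instances.filter(Filters=filters))

Filtering on instance state matters here: recently terminated instances still match the tag for a while and would otherwise trip the "kill them first" assertion.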
Example #5
def launch_aws():
    ps_job = aws.tf_job(FLAGS.run + '-ps', FLAGS.num_ps)
    worker_job = aws.tf_job(FLAGS.run + '-worker',
                            FLAGS.num_workers,
                            placement_group='tf')
    tb_job = aws.tf_job(FLAGS.run + '-tb', 1, placement_group='tf')

    # wait for everything to come up

    # todo: private IPs may be known before instances are ready
    ps_job.wait_until_ready()
    worker_job.wait_until_ready()

    # TODO: orchestration may be easier if I save server spec to a predictable
    # location on AWS rather than passing it to each worker through command-line

    # Orchestration: every worker needs to know:
    # 1. their own role (task_spec), ie {type: worker, index: 0}
    # 2. role->ip mapping of all machines (cluster_spec), ie
    #    {"worker": ["localhost:24724"], "ps": ["localhost:15960"]}}
    ps_hosts = ["%s:%d" % (task.ip, task.port) for task in ps_job.tasks]
    worker_hosts = [
        "%s:%d" % (task.ip, task.port) for task in worker_job.tasks
    ]
    cluster_spec = {'worker': worker_hosts, 'ps': ps_hosts}

    # launch parameter server tasks
    task_type = 'ps'
    for task in ps_job.tasks:
        task_spec = {'type': task_type, 'index': task.id}
        task.run(generate_tf_env_setup_cmd(cluster_spec, task_spec))
        task.run(PS_CMD)

    # launch worker tasks
    task_type = 'worker'  # task type can also be "chief", overlapping with worker
    for task in worker_job.tasks:
        task_spec = {'type': task_type, 'index': task.id}
        task.run(generate_tf_env_setup_cmd(cluster_spec, task_spec))
        task.run(WORKER_CMD)

    # launch tensorboard visualizer
    task = tb_job.tasks[0]
    task.run('tensorboard --port=%d --logdir=%s' % (task.port, logdir))
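generate_tf_env_setup_cmd is not shown. TensorFlow's distributed runtime conventionally reads a TF_CONFIG environment variable holding a JSON object with 'cluster' and 'task' keys, so here is a minimal sketch, assuming that convention is what the helper emits:

import json

def generate_tf_env_setup_cmd(cluster_spec, task_spec):
    # Build a shell command that exports TF_CONFIG for one task.
    tf_config = json.dumps({'cluster': cluster_spec, 'task': task_spec})
    return "export TF_CONFIG='%s'" % tf_config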
Example #6
def test_send_file():
  name = "testjob"
  job = aws.tf_job(name, 4)
  job.wait_until_ready()
  task0 = job.tasks[0]
  secret_word = "testfile3"
  os.system("echo '%s' > upload_test.txt"%(secret_word,))
  task0.upload('upload_test.txt')
  stdout,stderr = task0.run_sync("cat upload_test.txt")
  print(stdout)    # => testfile3
  assert stdout.strip() == secret_word
Example #7
def test_send_file():
    name = "testjob"
    job = aws.tf_job(name, 4)
    job.wait_until_ready()
    task0 = job.tasks[0]
    secret_word = "testfile3"
    os.system("echo '%s' > upload_test.txt" % (secret_word, ))
    task0.upload('upload_test.txt')
    stdout, stderr = task0.run_sync("cat upload_test.txt")
    print(stdout)  # => testfile3
    assert stdout.strip() == secret_word
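Task.upload is likewise opaque in these examples. As a rough stand-in, a copy over scp with key-based SSH access; host, key_path, and the ubuntu login are illustrative assumptions:

import subprocess

def upload_via_scp(host, key_path, local_path, remote_path='~/'):
    # Copy a local file to the remote home directory over scp.
    subprocess.check_call(
        ['scp', '-i', key_path, local_path,
         'ubuntu@%s:%s' % (host, remote_path)])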
Example #8
def launch_aws():
  ps_job = aws.tf_job(FLAGS.run+'-ps', FLAGS.num_ps)
  worker_job = aws.tf_job(FLAGS.run+'-worker', FLAGS.num_workers, placement_group='tf')
  tb_job = aws.tf_job(FLAGS.run+'-tb', 1, placement_group='tf')

  # wait for everything to come up

  # todo: private IPs may be known before instances are ready
  ps_job.wait_until_ready()
  worker_job.wait_until_ready()
  
  # TODO: orchestration may be easier if I save server spec to a predictable
  # location on AWS rather than passing it to each worker through command-line
  
  # Orchestration: every worker needs to know:
  # 1. their own role (task_spec), ie {type: worker, index: 0}
  # 2. role->ip mapping of all machines (cluster_spec), ie
  #    {"worker": ["localhost:24724"], "ps": ["localhost:15960"]}}
  ps_hosts = ["%s:%d"%(task.ip, task.port) for task in ps_job.tasks]
  worker_hosts = ["%s:%d"%(task.ip, task.port) for task in worker_job.tasks]
  cluster_spec = {'worker': worker_hosts, 'ps': ps_hosts}

  # launch parameter server tasks
  task_type = 'ps'  
  for task in ps_job.tasks:
    task_spec = {'type': task_type, 'index': task.id}
    task.run(generate_tf_env_setup_cmd(cluster_spec, task_spec))
    task.run(PS_CMD)

  # launch worker tasks
  task_type = 'worker' # task type can also be "chief", overlapping with worker
  for task in worker_job.tasks:
    task_spec = {'type': task_type, 'index': task.id}
    task.run(generate_tf_env_setup_cmd(cluster_spec, task_spec))
    task.run(WORKER_CMD)

  # launch tensorboard visualizer
  task = tb_job.tasks[0]
  task.run('tensorboard --port=%d --logdir=%s'%(task.port, logdir))
Example #9
def launcher(do_local=False):
  if FLAGS.cluster == 'local':
    import tmux
    job = tmux.tf_job('myjob', 1)
  elif FLAGS.cluster == 'aws':
    import aws
    job = aws.tf_job('myjob', 1)
  else:
    assert False, "Unknown cluster "+FLAGS.cluster

  task = job.tasks[0]
  task.upload(__file__)   # copies current script onto machine
  setup_cmd = ("source ~/.bashrc && export PATH=~/anaconda3/bin:$PATH && "
                "source activate tf")
  task.run("%s && python %s --role=worker" % (setup_cmd, __file__,))
  
  print("To see the output: tail -f %s" %(task.last_stdout))
  print("To interact with the task, do "+task.connect_instructions)
Example #10
def cnn_launcher():
  """Experiment launcher."""

  import boto3
  ec2 = boto3.client('ec2')
  
  if not FLAGS.disable_placement:
    placement_group = FLAGS.run
    try:
      response = ec2.create_placement_group(GroupName=placement_group,
                                            Strategy='cluster')
    except Exception as e:
      if 'Duplicate' in e.response['Error']['Code']:
        print("Warning, placement group %s already exists, skipping" %(placement_group,))
        print("Got message "+str(e))
      else:
        raise
  else:
    placement_group = ''

  ps_job = aws.tf_job(FLAGS.run+'/ps', FLAGS.num_ps,
                      instance_type=FLAGS.ps_type,
                      placement_group=placement_group)
  worker_job = aws.tf_job(FLAGS.run+'/worker', FLAGS.num_workers,
                          instance_type=FLAGS.worker_type,
                          placement_group=placement_group)

  ps_job.wait_until_ready()
  worker_job.wait_until_ready()

  # Orchestration: every worker needs to know:
  # 1. their own role (task_spec), ie {type: worker, index: 0}
  # 2. role->ip mapping of all machines (cluster_spec), ie
  #    {"worker": ["localhost:24724"], "ps": ["localhost:15960"]}}

  ps_hosts = ["%s:%d"%(task.ip, task.port) for task in ps_job.tasks]
  ps_hosts_str = ','.join(ps_hosts)
  worker_hosts = ["%s:%d"%(task.ip, task.port) for task in worker_job.tasks]
  worker_hosts_str = ','.join(worker_hosts)
  cluster_spec = {'worker': worker_hosts, 'ps': ps_hosts}

  setup_cmd = "source ~/.bashrc && export PATH=~/anaconda3/bin:$PATH && source activate py2 && cd ~/git0/benchmarks/scripts/tf_cnn_benchmarks"

  # kill previous running processes in case we are reusing instances
  for task in ps_job.tasks:
    task.run("killall python")
  for task in worker_job.tasks:
    task.run("killall python")

  time.sleep(5)

  # launch parameter server tasks
  task_type = 'ps'
  ps_cmd_tmpl = "CUDA_VISIBLE_DEVICES='' python tf_cnn_benchmarks.py --local_parameter_device=gpu --worker_hosts=%(worker_hosts)s --ps_hosts=%(ps_hosts)s --job_name=ps --task_index=%(task_index)s"
  for task in ps_job.tasks:
    cmds = []
    task_spec = {'type': task_type, 'index': task.id}
    cmds.append(setup_cmd)
    cmds.append(tf_config_cmd(cluster_spec, task_spec))
    task.upload("variable_mgr.py",
                "/home/ubuntu/Dropbox/git0/benchmarks/scripts/tf_cnn_benchmarks/variable_mgr.py")
    cmds.append(ps_cmd_tmpl % {"worker_hosts": worker_hosts_str,
                               "ps_hosts": ps_hosts_str,
                               "job_name": task_type,
                               "task_index": task.id})
    task.run(' && '.join(cmds))
    print("To see the output: tail -f %s" %(task.last_stdout))

  # launch worker tasks
  task_type = 'worker'
  worker_cmd_tmpl = "python tf_cnn_benchmarks.py --data_format=NCHW --batch_size=64 --num_batches=1000 --model=resnet50 --optimizer=sgd --variable_update=distributed_replicated --cross_replica_sync=True --local_parameter_device=gpu --num_gpus=8 --nodistortions --display_every=10 --worker_hosts=%(worker_hosts)s --ps_hosts=%(ps_hosts)s --job_name=worker --task_index=%(task_index)s"

  for task in worker_job.tasks:
    cmds = []
    task_spec = {'type': task_type, 'index': task.id}
    cmds.append(setup_cmd)

    cmds.append(tf_config_cmd(cluster_spec, task_spec))
    task.upload("variable_mgr.py",
                "/home/ubuntu/Dropbox/git0/benchmarks/scripts/tf_cnn_benchmarks/variable_mgr.py")
    cmds.append(worker_cmd_tmpl % {"worker_hosts": worker_hosts_str,
                                   "ps_hosts": ps_hosts_str,
                                   "job_name": task_type,
                                   "task_index": task.id})
    task.run(' && '.join(cmds))
    print("To see the output of %s: tail -f %s" %(task.id,
                                                  task.last_stdout))
Example #11
def cnn_launcher():
    """Experiment launcher."""

    import boto3
    ec2 = boto3.client('ec2')

    if not FLAGS.disable_placement:
        placement_group = FLAGS.run
        try:
            response = ec2.create_placement_group(GroupName=placement_group,
                                                  Strategy='cluster')
        except Exception as e:
            if 'Duplicate' in e.response['Error']['Code']:
                print("Warning, placement group %s already exists, skipping" %
                      (placement_group, ))
                print("Got message " + str(e))
            else:
                raise
    else:
        placement_group = ''

    ps_job = aws.tf_job(FLAGS.run + '/ps',
                        FLAGS.num_ps,
                        instance_type=FLAGS.ps_type,
                        placement_group=placement_group)
    worker_job = aws.tf_job(FLAGS.run + '/worker',
                            FLAGS.num_workers,
                            instance_type=FLAGS.worker_type,
                            placement_group=placement_group)

    ps_job.wait_until_ready()
    worker_job.wait_until_ready()

    # Orchestration: every worker needs to know:
    # 1. their own role (task_spec), ie {type: worker, index: 0}
    # 2. role->ip mapping of all machines (cluster_spec), ie
    #    {"worker": ["localhost:24724"], "ps": ["localhost:15960"]}}

    ps_hosts = ["%s:%d" % (task.ip, task.port) for task in ps_job.tasks]
    ps_hosts_str = ','.join(ps_hosts)
    worker_hosts = [
        "%s:%d" % (task.ip, task.port) for task in worker_job.tasks
    ]
    worker_hosts_str = ','.join(worker_hosts)
    cluster_spec = {'worker': worker_hosts, 'ps': ps_hosts}

    setup_cmd = "source ~/.bashrc && export PATH=~/anaconda3/bin:$PATH && source activate py2 && cd ~/git0/benchmarks/scripts/tf_cnn_benchmarks"

    # kill previous running processes in case we are reusing instances
    for task in ps_job.tasks:
        task.run("killall python")
    for task in worker_job.tasks:
        task.run("killall python")

    time.sleep(5)

    # launch parameter server tasks
    task_type = 'ps'
    ps_cmd_tmpl = "CUDA_VISIBLE_DEVICES='' python tf_cnn_benchmarks.py --local_parameter_device=gpu --worker_hosts=%(worker_hosts)s --ps_hosts=%(ps_hosts)s --job_name=ps --task_index=%(task_index)s"
    for task in ps_job.tasks:
        cmds = []
        task_spec = {'type': task_type, 'index': task.id}
        cmds.append(setup_cmd)
        cmds.append(tf_config_cmd(cluster_spec, task_spec))
        task.upload(
            "variable_mgr.py",
            "/home/ubuntu/Dropbox/git0/benchmarks/scripts/tf_cnn_benchmarks/variable_mgr.py"
        )
        cmds.append(
            ps_cmd_tmpl % {
                "worker_hosts": worker_hosts_str,
                "ps_hosts": ps_hosts_str,
                "job_name": task_type,
                "task_index": task.id
            })
        task.run(' && '.join(cmds))
        print("To see the output: tail -f %s" % (task.last_stdout))

    # launch worker tasks
    task_type = 'worker'
    worker_cmd_tmpl = "python tf_cnn_benchmarks.py --data_format=NCHW --batch_size=64 --num_batches=1000 --model=resnet50 --optimizer=sgd --variable_update=distributed_replicated --cross_replica_sync=True --local_parameter_device=gpu --num_gpus=8 --nodistortions --display_every=10 --worker_hosts=%(worker_hosts)s --ps_hosts=%(ps_hosts)s --job_name=worker --task_index=%(task_index)s"

    for task in worker_job.tasks:
        cmds = []
        task_spec = {'type': task_type, 'index': task.id}
        cmds.append(setup_cmd)

        cmds.append(tf_config_cmd(cluster_spec, task_spec))
        task.upload(
            "variable_mgr.py",
            "/home/ubuntu/Dropbox/git0/benchmarks/scripts/tf_cnn_benchmarks/variable_mgr.py"
        )
        cmds.append(
            worker_cmd_tmpl % {
                "worker_hosts": worker_hosts_str,
                "ps_hosts": ps_hosts_str,
                "job_name": task_type,
                "task_index": task.id
            })
        task.run(' && '.join(cmds))
        print("To see the output of %s: tail -f %s" %
              (task.id, task.last_stdout))
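On the receiving end, each tf_cnn_benchmarks process learns its role from the environment set up by tf_config_cmd (also not shown). A sketch of the consuming side, assuming the standard TF_CONFIG JSON layout:

import json
import os
import tensorflow as tf

def cluster_from_env():
    # Rebuild cluster and task identity from the TF_CONFIG variable.
    tf_config = json.loads(os.environ['TF_CONFIG'])
    cluster = tf.train.ClusterSpec(tf_config['cluster'])
    task = tf_config['task']
    return cluster, task['type'], task['index']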
Example #12
def test_reuse_job():
  name = "testjob"
  job = aws.tf_job(name, 2)
Example #13
def test_reuse_job():
    name = "testjob"
    job = aws.tf_job(name, 2)
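test_reuse_job stops before asserting anything. A hedged extension follows, assuming a second tf_job call with the same tag attaches to the already-running instances rather than launching new ones (that reuse semantic is an assumption, not confirmed by the source):

def test_reuse_job_extended():
    name = "testjob"
    job1 = aws.tf_job(name, 2)   # may launch fresh instances
    job2 = aws.tf_job(name, 2)   # expected to attach to the same tagged instances
    job1.wait_until_ready()
    job2.wait_until_ready()
    assert (sorted(t.ip for t in job1.tasks) ==
            sorted(t.ip for t in job2.tasks))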