Example #1
File: job.py  Project: SinisaG/AmazonEMR
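This excerpt assumes that an EMR connection and the S3 bucket/script variables are defined earlier in job.py; that setup is not shown. A minimal sketch of what it might look like with boto 2.x, where every name below is a placeholder rather than the project's actual value:

import time
import boto.emr
from boto.emr.step import StreamingStep

conn = boto.emr.connect_to_region('us-east-1')  # assumed region; credentials come from the usual AWS config
bucket_name = 'my-emr-bucket'                   # placeholder S3 bucket holding the scripts and input
mapper = 'mapper.py'                            # placeholder streaming mapper key
reducer = 'reducer.py'                          # placeholder streaming reducer key
input_folder = 'input'                          # placeholder input prefix
result = 'output'                               # placeholder output prefix
jobname = 'word-count-example'                  # placeholder job flow name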
step = StreamingStep(name='My word example',
                     mapper='s3n://%s/%s' % (bucket_name, mapper),
                     reducer='s3n://%s/%s' % (bucket_name, reducer),
                     input='s3n://%s/%s' % (bucket_name, input_folder),
                     output='s3n://%s/%s' % (bucket_name, result))

# To reuse a job flow, save its job id and keep adding steps to it; otherwise set keep_alive to
# False so the job flow (and its EC2 instances) shuts down after the step finishes.
# It is usually best to run all your calculations and only then terminate the job flow: Amazon
# bills by the hour, so a job that runs for 5 minutes is still charged for a full hour, and
# reusing a running job flow also avoids the 5-7 minute wait for new instances to be provisioned.
try:
    jobid = conn.list_clusters(cluster_states="WAITING").clusters[0].id
    print "We have an existing job waiting - Id: %s" % jobid
    conn.add_jobflow_steps(jobid, [step])
except IndexError:
    print "Starting EMR job %s" % jobname
    jobid = conn.run_jobflow(name=jobname,
                             steps=[step],
                             log_uri="s3://" + bucket_name + "/logs/",
                             enable_debugging=True,
                             keep_alive=True)

# Wait a moment so the job flow state has time to refresh before polling
time.sleep(10)

status = conn.describe_jobflow(jobid).state
while status not in ['WAITING', 'COMPLETED', 'FAILED']:  # also stop polling if the job flow failed
    status = conn.describe_jobflow(jobid).state
    print "Status: %s, Job Id: %s" % (status, jobid)
    time.sleep(10)

print "Job finished"

# Wait a few more seconds for the output files to be written to S3
time.sleep(10)
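Once the step has completed, the streaming output sits under the result prefix of the same bucket. A hedged sketch of collecting it with boto's S3 API; the part-file naming is the usual Hadoop Streaming convention, not something this snippet defines:

import boto

s3 = boto.connect_s3()
bucket = s3.get_bucket(bucket_name)
# Hadoop Streaming writes one part-NNNNN file per reducer under the output prefix
for key in bucket.list(prefix=result + '/part-'):
    print "%s:" % key.name
    print key.get_contents_as_string()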
Example #2
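Example #2 is an excerpt that starts inside an if/else; the connection, bucket, key pair and bootstrap-action setup it relies on is not shown. A hedged sketch of what that missing preamble could look like; every name below is an assumption, and step is assumed to be a StreamingStep built as in Example #1:

import time
import boto.emr
from boto.emr.bootstrap_action import BootstrapAction

conn = boto.emr.connect_to_region('us-east-1')  # assumed EMR connection
bucket_emr = 'my-emr-bucket'                    # placeholder bucket for logs and scripts
EC2_KEYNAME = 'my-keypair'                      # placeholder EC2 key pair for SSH access
jobname = 'my-emr-job'                          # placeholder cluster name

# Placeholder bootstrap action; the real script path and arguments are not part of the excerpt
bootstrap_action = BootstrapAction('configure hadoop',
                                   's3://elasticmapreduce/bootstrap-actions/configure-hadoop',
                                   ['-m', 'mapred.tasktracker.map.tasks.maximum=2'])

# Reuse a WAITING cluster if one exists; the excerpt below branches on job_id
waiting = conn.list_clusters(cluster_states=['WAITING']).clusters
job_id = waiting[0].id if waiting else None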
    print "A waiting cluster is already setup.  Running Job on : %s" % job_id
    conn.add_jobflow_steps(job_id, [step])
  else:
    print 'No waiting EMR clusters found.  Starting new cluster with name: %s' % jobname
    answer = raw_input('Do you want to start a new cluster? [N/y]: ')
    if 'y' not in answer.lower():
      print 'exiting'
      exit()

    print 'creating a new EMR cluster'

    job_id = conn.run_jobflow(name=jobname,
              steps=[step],
              log_uri="s3://"+bucket_emr+"/logs/",
              bootstrap_actions=[bootstrap_action],
              enable_debugging=True,
              keep_alive=True,
              master_instance_type='m1.small', # m3.xlarge is also a good choice: high networking performance at about $0.35/instance/hour
              slave_instance_type='m1.small',
              num_instances=4,
              ec2_keyname=EC2_KEYNAME)
              # OPTIONAL: spot capacity can be added afterwards via add_instance_groups (see below)

  # OPTIONAL: add spot task instances to the running job flow
  # from boto.emr.instance_group import InstanceGroup
  # ig = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07', '0.07')
  # conn.add_instance_groups(job_id, [ig])

  time.sleep(20)
  status = conn.describe_jobflow(job_id).state
  while status not in ['WAITING', 'COMPLETED', 'FAILED']:  # also stop polling if the job flow failed
    status = conn.describe_jobflow(job_id).state
    print "Status: %s for Job Id: %s" % (status, job_id)
    time.sleep(10)