Пример #1
0
# Submit a streaming step to Amazon EMR and block until the job flow settles.
# Relies on names defined outside this snippet: AWS_ACCESS_KEY_ID,
# AWS_SECRET_ACCESS_KEY, bucket_name, mapper, reducer, input_folder,
# output_folder and jobname.
print("Init emr connection")
conn = EmrConnection(aws_access_key_id=AWS_ACCESS_KEY_ID,
                     aws_secret_access_key=AWS_SECRET_ACCESS_KEY)

print("Setting up streamStep")
# Suffix the output folder with the current timestamp so every run writes to
# a distinct S3 location.
result = output_folder + str(time.time())
step = StreamingStep(name='My word example',
                     mapper='s3n://%s/%s' % (bucket_name, mapper),
                     reducer='s3n://%s/%s' % (bucket_name, reducer),
                     input='s3n://%s/%s' % (bucket_name, input_folder),
                     output='s3n://%s/%s' % (bucket_name, result))

# Reuse a cluster that is already idle (WAITING) when there is one: new steps
# can simply be appended to it. Otherwise start a new job flow with
# keep_alive=True so it stays up for later steps; with keep_alive=False the
# job (and its EC2 instances) stops after running.
# It is probably best to do all calculations and then terminate the job:
# Amazon charges hourly, so a 5 minute job still costs a full hour, and
# reusing a job avoids the 5-7 minutes needed to set up fresh instances.
waiting_clusters = conn.list_clusters(cluster_states="WAITING").clusters
if waiting_clusters:
    jobid = waiting_clusters[0].id
    print("We have an existing job waiting - Id: %s" % jobid)
    conn.add_jobflow_steps(jobid, [step])
else:
    print("Starting ERM job %s" % jobname)
    jobid = conn.run_jobflow(name=jobname,
                             steps=[step],
                             log_uri="s3://" + bucket_name + "/logs/",
                             enable_debugging=True,
                             keep_alive=True)

# Seconds between two status requests.
poll_interval = 10
# States in which polling stops. FAILED and TERMINATED are included so the
# loop cannot spin forever when the job flow dies instead of finishing.
finished_states = ('WAITING', 'COMPLETED', 'FAILED', 'TERMINATED')

# Give EMR a moment to refresh the job flow state before the first poll.
time.sleep(poll_interval)

while True:
    status = conn.describe_jobflow(jobid).state
    print("Status: %s, Job Id: %s" % (status, jobid))
    if status in finished_states:
        break
    time.sleep(poll_interval)
Пример #2
0
  #   mapred.max.split.size=1
  #   mapred.min.split.size
  #   mapred.map.tasks = 20
  #   mapred.task.timeout=800000
  #   mapred.tasktracker.map.tasks.maximum
  #   mapred.tasktracker.reduce.tasks.maximum

  # Define an action that will bootstrap these machines and install our needed
  #  dependencies
  bootstrap_action = BootstrapAction("Install Dependiences",
      's3://%s/%s' % (bucket_emr, file_bootstrapper),
      None)


  # if an existing cluster is running, let's use that
  running_clusters = conn.list_clusters(cluster_states="WAITING").clusters
  if len(running_clusters) > 0:
    job_id = running_clusters[0].id
    print "A waiting cluster is already setup.  Running Job on : %s" % job_id
    conn.add_jobflow_steps(job_id, [step])
  else:
    print 'No waiting EMR clusters found.  Starting new cluster with name: %s' % jobname
    answer = raw_input('Do you want to start a new cluster? [N/y]: ')
    if 'y' not in answer.lower():
      print 'exiting'
      exit()

    print 'creating a new EMR cluster'

    job_id = conn.run_jobflow(name=jobname,
              steps=[step],