Пример #1
0
    def dumpEMRClusters(self):
        """Dump EMR cluster info (jobflows and the cluster list) to stdout
        and the generic logging routine.

        Uses the configured boto profile when it is not "default".
        Errors are logged and swallowed (best-effort dump).
        """
        try:
            if self.botoprfl[0] != "default":
                conn = boto.connect_emr(profile_name=self.botoprfl)
            else:
                conn = boto.connect_emr()
            if conn:
                print("\n<Start of EMR clusters>\n")
                print(" Jobflows: %s" % conn.describe_jobflows())
                self.opygenericroutines.prntLogErrWarnInfo('',
                                                           'info',
                                                           bresume=True)
                for c in conn.list_clusters().clusters:
                    ec = " %s" % c
                    self.opygenericroutines.prntLogErrWarnInfo(str(ec),
                                                               'info',
                                                               bresume=True)
                self.opygenericroutines.prntLogErrWarnInfo('',
                                                           'info',
                                                           bresume=True)
                print("\n<End of EMR clusters>\n")
        # Was "except Exception, e" -- Python 2-only syntax that is a
        # SyntaxError on Python 3; "as e" works on Python 2.6+ and 3.x.
        except Exception as e:
            serr = (
                '%s :: dumpEMRClusters(...) : connect_emr,list_clusters(...).clusters, '
                '%s' % (self.sclsnme, str(e)))
            # NOTE(review): bare prntErrWarnInfo differs from the
            # self.opygenericroutines.prntLogErrWarnInfo used above --
            # confirm it is actually in scope at module level.
            prntErrWarnInfo(serr, bresume=True)
Пример #2
0
def test_create_instance_groups():
    """An instance group added to a jobflow is reflected in DescribeJobFlows."""
    emr = boto.connect_emr()

    wordcount = StreamingStep(
        name="My wordcount example",
        mapper="s3n://elasticmapreduce/samples/wordcount/wordSplitter.py",
        reducer="aggregate",
        input="s3n://elasticmapreduce/samples/wordcount/input",
        output="s3n://output_bucket/output/wordcount_output")

    flow_id = emr.run_jobflow(
        name="My jobflow",
        log_uri="s3://some_bucket/jobflow_logs",
        steps=[wordcount])

    task_group = InstanceGroup(6, "TASK", "c1.medium", "SPOT", "spot-0.07", "0.07")
    added = emr.add_instance_groups(flow_id, [task_group])
    group_id = added.instancegroupids

    flow = emr.describe_jobflows()[0]
    int(flow.instancecount).should.equal(6)

    group = flow.instancegroups[0]
    group.instancegroupid.should.equal(group_id)
    int(group.instancerunningcount).should.equal(6)
    group.instancerole.should.equal("TASK")
    group.instancetype.should.equal("c1.medium")
    group.market.should.equal("SPOT")
    group.name.should.equal("spot-0.07")
    group.bidprice.should.equal("0.07")
Пример #3
0
	def get_elapsed_time_emr(job, emrid):
		"""Get elapsed time for EMR job with job flow id emrid, based on EMR job information.

		Returns a datetime.timedelta.  Steps are matched to the job when
		their name parses as "<prefix>-<job.id>"; otherwise falls back to
		the most recent step / jobflow-level timestamps.
		"""
		emr = boto.connect_emr()
		jobflow = emr.describe_jobflow(emrid)
		emr.close()
	
		try:
			# Assumes step names look like "<prefix>-<job.id>" -- TODO confirm
			# the naming convention with whatever submits the steps.
			steps = [s for s in jobflow.steps if int(s.name.split("-")[1]) == job.id]
		except IndexError:
			# Names don't follow the pattern: estimate from recent steps instead.
			try:
				# Skip a trailing "SimpleJoin" step when picking the start step.
				stepcount = -2 if jobflow.steps[-1].name == "SimpleJoin" else -1
				starttime = datetime.datetime.strptime(jobflow.steps[stepcount].creationdatetime, '%Y-%m-%dT%H:%M:%SZ')
			except AttributeError as e:
				# Step has no creationdatetime yet: use the jobflow start time.
				starttime = datetime.datetime.strptime(jobflow.startdatetime, '%Y-%m-%dT%H:%M:%SZ')
			except:
				# Any other failure: fall back to the last step's creation time.
				starttime = datetime.datetime.strptime(jobflow.steps[-1].creationdatetime, '%Y-%m-%dT%H:%M:%SZ')
	
			try:
				endtime = datetime.datetime.strptime(jobflow.steps[-1].enddatetime, '%Y-%m-%dT%H:%M:%SZ')
			except AttributeError:
				# Last step still running (no enddatetime): measure up to now.
				endtime = datetime.datetime.today()
			except:
				endtime = datetime.datetime.strptime(jobflow.steps[-1].enddatetime, '%Y-%m-%dT%H:%M:%SZ')
		else:
			# Matched steps: span first match's creation to last match's end.
			starttime = datetime.datetime.strptime(steps[0].creationdatetime, '%Y-%m-%dT%H:%M:%SZ')
			endtime = datetime.datetime.strptime(steps[-1].enddatetime, '%Y-%m-%dT%H:%M:%SZ')

		return (endtime-starttime)
Пример #4
0
def test_bootstrap_actions():
    """Bootstrap actions passed to run_jobflow appear in both the jobflow
    description and ListBootstrapActions."""
    actions = [
        BootstrapAction(
            name="bs1",
            path="path/to/script",
            bootstrap_action_args=["arg1", "arg2&arg3"]),
        BootstrapAction(
            name="bs2",
            path="path/to/anotherscript",
            bootstrap_action_args=[]),
    ]

    emr = boto.connect_emr()
    cluster_id = emr.run_jobflow(bootstrap_actions=actions, **run_jobflow_args)

    flow = emr.describe_jobflow(cluster_id)
    for got, want in zip(flow.bootstrapactions, actions):
        got.name.should.equal(want.name)
        got.path.should.equal(want.path)
        [a.value for a in got.args].should.equal(want.args())

    listing = emr.list_bootstrap_actions(cluster_id)
    for idx, want in enumerate(actions):
        got = listing.actions[idx]
        got.name.should.equal(want.name)
        got.scriptpath.should.equal(want.path)
        [a.value for a in got.args].should.equal(want.args())
Пример #5
0
def test_run_jobflow_with_visible_to_all_users():
    """The visible_to_all_users flag round-trips as a lowercase string."""
    emr = boto.connect_emr()
    for flag in (True, False):
        flow_id = emr.run_jobflow(visible_to_all_users=flag,
                                  **run_jobflow_args)
        flow = emr.describe_jobflow(flow_id)
        flow.visibletoallusers.should.equal(str(flag).lower())
Пример #6
0
def test_bootstrap_actions():
    """Bootstrap actions survive the round trip through DescribeJobFlow
    and ListBootstrapActions."""
    requested = [
        BootstrapAction(
            name='bs1',
            path='path/to/script',
            bootstrap_action_args=['arg1', 'arg2&arg3']),
        BootstrapAction(
            name='bs2',
            path='path/to/anotherscript',
            bootstrap_action_args=[]),
    ]

    emr = boto.connect_emr()
    cluster_id = emr.run_jobflow(bootstrap_actions=requested,
                                 **run_jobflow_args)

    flow = emr.describe_jobflow(cluster_id)
    for reported, wanted in zip(flow.bootstrapactions, requested):
        reported.name.should.equal(wanted.name)
        reported.path.should.equal(wanted.path)
        [v.value for v in reported.args].should.equal(wanted.args())

    listing = emr.list_bootstrap_actions(cluster_id)
    for pos, wanted in enumerate(requested):
        reported = listing.actions[pos]
        reported.name.should.equal(wanted.name)
        reported.scriptpath.should.equal(wanted.path)
        [v.value for v in reported.args].should.equal(wanted.args())
Пример #7
0
def test_create_instance_groups():
    """A SPOT task group added after launch shows up in the jobflow."""
    emr = boto.connect_emr()

    step = StreamingStep(
        name='My wordcount example',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input',
        output='s3n://output_bucket/output/wordcount_output')

    flow_id = emr.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        steps=[step])

    requested = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07',
                              '0.07')
    added = emr.add_instance_groups(flow_id, [requested])
    added_id = added.instancegroupids

    flow = emr.describe_jobflows()[0]
    int(flow.instancecount).should.equal(6)

    reported = flow.instancegroups[0]
    reported.instancegroupid.should.equal(added_id)
    int(reported.instancerunningcount).should.equal(6)
    reported.instancerole.should.equal('TASK')
    reported.instancetype.should.equal('c1.medium')
    reported.market.should.equal('SPOT')
    reported.name.should.equal('spot-0.07')
    reported.bidprice.should.equal('0.07')
Пример #8
0
def test_create_instance_groups():
    """Attributes of an added instance group match what was requested."""
    emr = boto.connect_emr()

    wc_step = StreamingStep(
        name='My wordcount example',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input',
        output='s3n://output_bucket/output/wordcount_output')

    jf_id = emr.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        steps=[wc_step])

    spot_group = InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07', '0.07')
    result = emr.add_instance_groups(jf_id, [spot_group])
    result_id = result.instancegroupids

    jf = emr.describe_jobflows()[0]
    int(jf.instancecount).should.equal(6)

    ig = jf.instancegroups[0]
    ig.instancegroupid.should.equal(result_id)
    int(ig.instancerunningcount).should.equal(6)
    ig.instancerole.should.equal('TASK')
    ig.instancetype.should.equal('c1.medium')
    ig.market.should.equal('SPOT')
    ig.name.should.equal('spot-0.07')
    ig.bidprice.should.equal('0.07')
Пример #9
0
def test_modify_instance_groups():
    """ModifyInstanceGroups resizes each group and the flow's total count."""
    emr = boto.connect_emr()

    step = StreamingStep(
        name="My wordcount example",
        mapper="s3n://elasticmapreduce/samples/wordcount/wordSplitter.py",
        reducer="aggregate",
        input="s3n://elasticmapreduce/samples/wordcount/input",
        output="s3n://output_bucket/output/wordcount_output")

    flow_id = emr.run_jobflow(
        name="My jobflow",
        log_uri="s3://some_bucket/jobflow_logs",
        steps=[step])

    groups = [
        InstanceGroup(6, "TASK", "c1.medium", "SPOT", "spot-0.07", "0.07"),
        InstanceGroup(6, "TASK", "c1.medium", "SPOT", "spot-0.07", "0.07"),
    ]
    added = emr.add_instance_groups(flow_id, groups)
    group_ids = added.instancegroupids.split(",")

    flow = emr.describe_jobflows()[0]
    int(flow.instancecount).should.equal(12)
    int(flow.instancegroups[0].instancerunningcount).should.equal(6)

    emr.modify_instance_groups(group_ids, [2, 3])

    flow = emr.describe_jobflows()[0]
    int(flow.instancecount).should.equal(5)
    for gid, size in zip(group_ids, (2, 3)):
        group = [g for g in flow.instancegroups if g.instancegroupid == gid][0]
        int(group.instancerunningcount).should.equal(size)
Пример #10
0
def test_add_steps_to_flow():
    """Steps added via add_jobflow_steps appear after the original step,
    with the expected streaming args."""
    emr = boto.connect_emr()

    first = StreamingStep(
        name='My wordcount example',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input',
        output='s3n://output_bucket/output/wordcount_output')

    flow_id = emr.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        steps=[first])

    flow = emr.describe_jobflow(flow_id)
    flow.state.should.equal('STARTING')
    flow.jobflowid.should.equal(flow_id)
    flow.name.should.equal('My jobflow')
    flow.loguri.should.equal('s3://some_bucket/jobflow_logs')

    second = StreamingStep(
        name='My wordcount example2',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input2',
        output='s3n://output_bucket/output/wordcount_output2')

    emr.add_jobflow_steps(flow_id, [second])

    flow = emr.describe_jobflow(flow_id)

    step_one = flow.steps[0]
    step_one.name.should.equal('My wordcount example')
    step_one.state.should.equal('STARTING')
    [a.value for a in step_one.args].should.equal([
        '-mapper',
        's3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        '-reducer',
        'aggregate',
        '-input',
        's3n://elasticmapreduce/samples/wordcount/input',
        '-output',
        's3n://output_bucket/output/wordcount_output',
    ])

    step_two = flow.steps[1]
    step_two.name.should.equal('My wordcount example2')
    step_two.state.should.equal('PENDING')
    [a.value for a in step_two.args].should.equal([
        '-mapper',
        's3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
        '-reducer',
        'aggregate',
        '-input',
        's3n://elasticmapreduce/samples/wordcount/input2',
        '-output',
        's3n://output_bucket/output/wordcount_output2',
    ])
Пример #11
0
def test_terminate_job_flow():
    """Terminating a fresh jobflow moves it from STARTING to TERMINATED."""
    emr = boto.connect_emr()
    flow_id = emr.run_jobflow(
        name="My jobflow", log_uri="s3://some_bucket/jobflow_logs", steps=[])

    emr.describe_jobflows()[0].state.should.equal("STARTING")
    emr.terminate_jobflow(flow_id)
    emr.describe_jobflows()[0].state.should.equal("TERMINATED")
Пример #12
0
def test_run_jobflow_with_visible_to_all_users():
    """Both True and False round-trip as lowercase string values."""
    emr = boto.connect_emr()
    for visible in (True, False):
        jf_id = emr.run_jobflow(visible_to_all_users=visible,
                                **run_jobflow_args)
        jf = emr.describe_jobflow(jf_id)
        jf.visibletoallusers.should.equal(str(visible).lower())
Пример #13
0
def test_terminate_jobflow():
    """A WAITING jobflow becomes TERMINATED after terminate_jobflow."""
    emr = boto.connect_emr()
    flow_id = emr.run_jobflow(**run_jobflow_args)
    emr.describe_jobflows()[0].state.should.equal("WAITING")

    emr.terminate_jobflow(flow_id)
    emr.describe_jobflows()[0].state.should.equal("TERMINATED")
Пример #14
0
def test_terminate_jobflow():
    """terminate_jobflow transitions the flow's state to TERMINATED."""
    client = boto.connect_emr()
    jf_id = client.run_jobflow(**run_jobflow_args)

    current = client.describe_jobflows()[0]
    current.state.should.equal("WAITING")

    client.terminate_jobflow(jf_id)
    current = client.describe_jobflows()[0]
    current.state.should.equal("TERMINATED")
Пример #15
0
def run_tests(things, tests):
    """Launch an EMR jobflow that runs *tests* against the artifacts in
    *things*, poll until it reaches a terminal state, then sync and report
    the output."""
    if len(tests) == 0:
        raise Exception("no tests")
    before = len(tests)
    tests = fix_suites(tests)
    print("tests expanded from %d to %d" % (before, len(tests)))
    print("things:%s\ntests:%s\n" % (things, tests))

    emr = boto.connect_emr(settings.emr_id, settings.emr_key)

    def http(path):
        # Public-HTTP URL for an object in the EMR bucket.
        return "http://%s.s3.amazonaws.com/%s" % (settings.emr_bucket, path)

    stamp = datetime.datetime.today().strftime("%Y%m%d-%H%M")
    run_s3_path = "emr/%s/%s/%s/" % (os.getenv("USER"), os.getenv("HOST"), stamp)
    run_s3_root = "s3n://%s/%s/" % (settings.emr_bucket, run_s3_path)

    out = run_s3_root + "out"
    logs = run_s3_root + "logs"

    jar = "s3n://%s/%s" % (settings.emr_bucket, things[2])
    step_args = [http(things[0]), http(things[1]), out, ",".join(tests)]
    step = boto.emr.step.JarStep("emr main", jar=jar, step_args=step_args)
    print("jar:%s\nargs:%s" % (jar, step_args))

    setup = boto.emr.BootstrapAction(
        "setup", "s3n://%s/%s" % (settings.emr_bucket, things[3]), [])

    jobid = emr.run_jobflow(
        name="Mongo EMR for %s from %s" % (os.getenv("USER"), os.getenv("HOST")),
        ec2_keyname="emr1",
        slave_instance_type="m1.large",
        ami_version="latest",
        num_instances=5,
        log_uri=logs,
        bootstrap_actions=[setup],
        steps=[step])

    print("%s jobid: %s" % (datetime.datetime.today(), jobid))

    # Poll every 30s until the flow reaches a terminal state.
    while True:
        flow = emr.describe_jobflow(jobid)
        print("%s status: %s" % (datetime.datetime.today(), flow.state))
        if flow.state in ("COMPLETED", "FAILED"):
            break
        time.sleep(30)

    syncdir = "build/emrout/" + jobid + "/"
    sync_s3(run_s3_path, syncdir)

    final_out = "build/emrout/" + jobid + "/"
    print("output in: " + final_out)
    do_output(final_out)
Пример #16
0
def run_tests(things, tests):
    """Run the given tests on EMR: expand suites, start a jobflow, wait for
    completion, then pull down and report the results."""
    if len(tests) == 0:
        raise Exception("no tests")
    original_count = len(tests)
    tests = fix_suites(tests)
    print("tests expanded from %d to %d" % (original_count, len(tests)))

    print("things:%s\ntests:%s\n" % (things, tests))

    emr = boto.connect_emr(settings.emr_id, settings.emr_key)

    def http(path):
        # Public-HTTP URL for an object in the EMR bucket.
        return "http://%s.s3.amazonaws.com/%s" % (settings.emr_bucket, path)

    run_s3_path = "emr/%s/%s/%s/" % (
        os.getenv("USER"),
        os.getenv("HOST"),
        datetime.datetime.today().strftime("%Y%m%d-%H%M"))

    run_s3_root = "s3n://%s/%s/" % (settings.emr_bucket, run_s3_path)
    out_uri = run_s3_root + "out"
    log_uri = run_s3_root + "logs"

    jar = "s3n://%s/%s" % (settings.emr_bucket, things[2])
    step_args = [http(things[0]), http(things[1]), out_uri, ",".join(tests)]

    main_step = boto.emr.step.JarStep("emr main", jar=jar, step_args=step_args)
    print("jar:%s\nargs:%s" % (jar, step_args))

    setup_action = boto.emr.BootstrapAction(
        "setup", "s3n://%s/%s" % (settings.emr_bucket, things[3]), [])

    jobid = emr.run_jobflow(
        name="Mongo EMR for %s from %s" % (os.getenv("USER"), os.getenv("HOST")),
        ec2_keyname="emr1",
        slave_instance_type="m1.large",
        ami_version="latest",
        num_instances=5,
        log_uri=log_uri,
        bootstrap_actions=[setup_action],
        steps=[main_step])

    print("%s jobid: %s" % (datetime.datetime.today(), jobid))

    # Poll until the jobflow finishes (successfully or not).
    while True:
        flow = emr.describe_jobflow(jobid)
        print("%s status: %s" % (datetime.datetime.today(), flow.state))
        if flow.state == "COMPLETED" or flow.state == "FAILED":
            break
        time.sleep(30)

    syncdir = "build/emrout/" + jobid + "/"
    sync_s3(run_s3_path, syncdir)

    final_out = "build/emrout/" + jobid + "/"
    print("output in: " + final_out)
    do_output(final_out)
Пример #17
0
def test_terminate_job_flow():
    """An empty-steps jobflow goes STARTING -> TERMINATED on termination."""
    emr = boto.connect_emr()
    flow_id = emr.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        steps=[])

    emr.describe_jobflows()[0].state.should.equal('STARTING')
    emr.terminate_jobflow(flow_id)
    emr.describe_jobflows()[0].state.should.equal('TERMINATED')
Пример #18
0
def test_create_job_flow_visible_to_all_users():
    """run_jobflow(visible_to_all_users=True) is reported as 'True'."""
    emr = boto.connect_emr()

    flow_id = emr.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        steps=[],
        visible_to_all_users=True)

    emr.describe_jobflow(flow_id).visibletoallusers.should.equal('True')
Пример #19
0
def test_create_job_flow_visible_to_all_users():
    """A flow created with visible_to_all_users=True reports 'True'."""
    client = boto.connect_emr()

    jf_id = client.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        steps=[],
        visible_to_all_users=True)

    jf = client.describe_jobflow(jf_id)
    jf.visibletoallusers.should.equal('True')
Пример #20
0
def test_create_job_flow_visible_to_all_users():
    """visible_to_all_users=True is reported even when a job_flow_role is set."""
    emr = boto.connect_emr()

    flow_id = emr.run_jobflow(
        name="My jobflow",
        log_uri="s3://some_bucket/jobflow_logs",
        job_flow_role="some-role-arn",
        steps=[],
        visible_to_all_users=True)

    emr.describe_jobflow(flow_id).visibletoallusers.should.equal("True")
Пример #21
0
def test_create_job_flow_with_new_params():
    """run_jobflow accepts newer parameters such as job_flow_role without error."""
    emr = boto.connect_emr()

    emr.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        master_instance_type='m1.medium',
        slave_instance_type='m1.small',
        job_flow_role='some-role-arn',
        steps=[])
Пример #22
0
def test_describe_cluster():
    """DescribeCluster reports name, zero normalized hours and RUNNING state."""
    emr = boto.connect_emr()
    flow_id = emr.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        steps=[])

    cluster = emr.describe_cluster(flow_id)
    cluster.name.should.equal("My jobflow")
    cluster.normalizedinstancehours.should.equal('0')
    cluster.status.state.should.equal("RUNNING")
Пример #23
0
def test_create_job_flow_with_new_params():
    """Newer run_jobflow keyword arguments are accepted without error."""
    client = boto.connect_emr()

    client.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        master_instance_type='m1.medium',
        slave_instance_type='m1.small',
        job_flow_role='some-role-arn',
        steps=[])
Пример #24
0
def test_describe_cluster():
    """A freshly launched cluster is RUNNING with zero normalized hours."""
    client = boto.connect_emr()
    cluster_id = client.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        steps=[])

    described = client.describe_cluster(cluster_id)
    described.name.should.equal("My jobflow")
    described.normalizedinstancehours.should.equal('0')
    described.status.state.should.equal("RUNNING")
Пример #25
0
def test_terminate_job_flow():
    """terminate_jobflow moves an empty jobflow to TERMINATED."""
    client = boto.connect_emr()
    jf_id = client.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        steps=[])

    current = client.describe_jobflows()[0]
    current.state.should.equal('STARTING')

    client.terminate_jobflow(jf_id)

    current = client.describe_jobflows()[0]
    current.state.should.equal('TERMINATED')
Пример #26
0
def test_set_termination_protection():
    """Toggling termination protection round-trips as 'true'/'false'."""
    emr = boto.connect_emr()
    flow_id = emr.run_jobflow(**run_jobflow_args)
    emr.describe_jobflow(flow_id).terminationprotected.should.equal("false")

    for flag, expected in ((True, "true"), (False, "false")):
        emr.set_termination_protection(flow_id, flag)
        emr.describe_jobflow(flow_id).terminationprotected.should.equal(expected)
Пример #27
0
def step_completed(emrid):
	"""Check if EMR job with jobflow id emrid has completed.

	Looks only at the jobflow's most recent step; returns True exactly
	when that step's state is "COMPLETED".
	"""
	emr = boto.connect_emr()
	job = emr.describe_jobflow(emrid)
	step = job.steps[-1]
	emr.close()

	# Return the comparison directly instead of the if/else that
	# returned True/False (identical behavior, idiomatic form).
	return step.state == "COMPLETED"
Пример #28
0
def test_set_termination_protection():
    """set_termination_protection flips the 'terminationprotected' flag."""
    client = boto.connect_emr()
    jf_id = client.run_jobflow(**run_jobflow_args)

    jf = client.describe_jobflow(jf_id)
    jf.terminationprotected.should.equal("false")

    client.set_termination_protection(jf_id, True)
    jf = client.describe_jobflow(jf_id)
    jf.terminationprotected.should.equal("true")

    client.set_termination_protection(jf_id, False)
    jf = client.describe_jobflow(jf_id)
    jf.terminationprotected.should.equal("false")
Пример #29
0
def test_run_jobflow():
    """A new jobflow reflects the run_jobflow arguments and default flags."""
    emr = boto.connect_emr()
    params = run_jobflow_args.copy()
    flow_id = emr.run_jobflow(**params)

    flow = emr.describe_jobflow(flow_id)
    flow.state.should.equal("WAITING")
    flow.jobflowid.should.equal(flow_id)
    flow.name.should.equal(params["name"])
    flow.masterinstancetype.should.equal(params["master_instance_type"])
    flow.slaveinstancetype.should.equal(params["slave_instance_type"])
    flow.loguri.should.equal(params["log_uri"])
    flow.visibletoallusers.should.equal("false")
    int(flow.normalizedinstancehours).should.equal(0)
    flow.steps.should.have.length_of(0)
Пример #30
0
  def dumpEMRClusters(self):
    """Dump EMR cluster info (jobflows and the cluster list) to stdout and
    the generic logging routine.

    Uses the configured boto profile when it is not "default".
    Errors are logged and swallowed (best-effort dump).
    """
    try:
      if self.botoprfl[0] != "default":
        conn = boto.connect_emr(profile_name = self.botoprfl)
      else:
        conn = boto.connect_emr()
      if conn:
        print("\n<Start of EMR clusters>\n")
        print(" Jobflows: %s" %conn.describe_jobflows())
        self.opygenericroutines.prntLogErrWarnInfo('', 'info', bresume = True)
        for c in conn.list_clusters().clusters:
          ec = " %s" %c
          self.opygenericroutines.prntLogErrWarnInfo(str(ec), 'info', bresume = True)
        self.opygenericroutines.prntLogErrWarnInfo('', 'info', bresume = True)
        print("\n<End of EMR clusters>\n")
    # Was "except Exception, e" -- Python 2-only syntax that is a
    # SyntaxError on Python 3; "as e" works on Python 2.6+ and 3.x.
    except Exception as e:
      serr = ('%s :: dumpEMRClusters(...) : connect_emr,list_clusters(...).clusters, '
              '%s' %(self.sclsnme, str(e)))
      # NOTE(review): bare prntErrWarnInfo differs from
      # self.opygenericroutines.prntLogErrWarnInfo used above -- confirm
      # it is actually in scope at module level.
      prntErrWarnInfo(serr, bresume = True)
Пример #31
0
def test_describe_jobflows():
    """DescribeJobFlows lists all flows and can filter by jobflow id."""
    emr = boto.connect_emr()
    first_id = emr.run_jobflow(**run_jobflow_args)
    second_id = emr.run_jobflow(**run_jobflow_args)

    emr.describe_jobflows().should.have.length_of(2)

    filtered = emr.describe_jobflows(jobflow_ids=[second_id])
    filtered.should.have.length_of(1)
    filtered[0].jobflowid.should.equal(second_id)

    emr.describe_jobflow(first_id).jobflowid.should.equal(first_id)
Пример #32
0
def get_jobflow_status(emr_id):
    """Get the EMR jobflow state for EMR jobflow id emr_id.

    Returns (status, details, url); details and url are both empty strings
    when either the last-state-change reason or the master DNS name is not
    yet available on the jobflow.
    """
    emr = boto.connect_emr()
    flow = emr.describe_jobflow(emr_id)

    status = flow.state
    try:
        details = flow.laststatechangereason
        url = "http://%s:9100" % flow.masterpublicdnsname
    except AttributeError:
        # Either attribute may be absent on a young jobflow; blank out both,
        # matching the original behavior.
        details, url = "", ""

    return status, details, url
Пример #33
0
def test_tags():
    """Tags can be added to, read from, and removed from a cluster."""
    tags = {"tag1": "val1", "tag2": "val2"}

    emr = boto.connect_emr()
    cluster_id = emr.run_jobflow(**run_jobflow_args)

    emr.add_tags(cluster_id, tags)
    cluster = emr.describe_cluster(cluster_id)
    cluster.tags.should.have.length_of(2)
    {t.key: t.value for t in cluster.tags}.should.equal(tags)

    emr.remove_tags(cluster_id, list(tags))
    emr.describe_cluster(cluster_id).tags.should.have.length_of(0)
Пример #34
0
def test_run_jobflow():
    """The described jobflow mirrors run_jobflow's arguments and defaults."""
    client = boto.connect_emr()
    kwargs = run_jobflow_args.copy()
    jf_id = client.run_jobflow(**kwargs)

    jf = client.describe_jobflow(jf_id)
    jf.state.should.equal("WAITING")
    jf.jobflowid.should.equal(jf_id)
    jf.name.should.equal(kwargs["name"])
    jf.masterinstancetype.should.equal(kwargs["master_instance_type"])
    jf.slaveinstancetype.should.equal(kwargs["slave_instance_type"])
    jf.loguri.should.equal(kwargs["log_uri"])
    jf.visibletoallusers.should.equal("false")
    int(jf.normalizedinstancehours).should.equal(0)
    jf.steps.should.have.length_of(0)
Пример #35
0
def test_tags():
    """add_tags / remove_tags round-trip through DescribeCluster."""
    wanted = {"tag1": "val1", "tag2": "val2"}

    client = boto.connect_emr()
    cid = client.run_jobflow(**run_jobflow_args)

    client.add_tags(cid, wanted)
    described = client.describe_cluster(cid)
    described.tags.should.have.length_of(2)
    dict((t.key, t.value) for t in described.tags).should.equal(wanted)

    client.remove_tags(cid, list(wanted.keys()))
    described = client.describe_cluster(cid)
    described.tags.should.have.length_of(0)
Пример #36
0
def test_describe_jobflows():
    """DescribeJobFlows supports id, state, and creation-time filters."""
    emr = boto.connect_emr()
    params = run_jobflow_args.copy()
    expected = {}

    for idx in range(4):
        name = "cluster" + str(idx)
        params["name"] = name
        cid = emr.run_jobflow(**params)
        expected[cid] = {"id": cid, "name": name, "state": "WAITING"}

    # need sleep since it appears the timestamp is always rounded to
    # the nearest second internally
    time.sleep(1)
    timestamp = datetime.now(pytz.utc)
    time.sleep(1)

    for idx in range(4, 6):
        name = "cluster" + str(idx)
        params["name"] = name
        cid = emr.run_jobflow(**params)
        emr.terminate_jobflow(cid)
        expected[cid] = {"id": cid, "name": name, "state": "TERMINATED"}

    emr.describe_jobflows().should.have.length_of(6)

    for cid in expected:
        found = emr.describe_jobflows(jobflow_ids=[cid])
        found.should.have.length_of(1)
        found[0].jobflowid.should.equal(cid)

    waiting = emr.describe_jobflows(states=["WAITING"])
    waiting.should.have.length_of(4)
    for flow in waiting:
        flow.state.should.equal("WAITING")

    emr.describe_jobflows(created_before=timestamp).should.have.length_of(4)
    emr.describe_jobflows(created_after=timestamp).should.have.length_of(2)
Пример #37
0
def test_describe_jobflows():
    """Listing, id filtering, state filtering and timestamp filtering all
    behave as expected for DescribeJobFlows."""
    client = boto.connect_emr()
    kwargs = run_jobflow_args.copy()
    launched = {}

    def start(idx):
        cluster_name = 'cluster' + str(idx)
        kwargs['name'] = cluster_name
        return cluster_name, client.run_jobflow(**kwargs)

    for idx in range(4):
        cluster_name, cid = start(idx)
        launched[cid] = {
            'id': cid,
            'name': cluster_name,
            'state': 'WAITING'
        }

    # need sleep since it appears the timestamp is always rounded to
    # the nearest second internally
    time.sleep(1)
    cutoff = datetime.now(pytz.utc)
    time.sleep(1)

    for idx in range(4, 6):
        cluster_name, cid = start(idx)
        client.terminate_jobflow(cid)
        launched[cid] = {
            'id': cid,
            'name': cluster_name,
            'state': 'TERMINATED'
        }

    client.describe_jobflows().should.have.length_of(6)

    for cid, meta in launched.items():
        found = client.describe_jobflows(jobflow_ids=[cid])
        found.should.have.length_of(1)
        found[0].jobflowid.should.equal(cid)

    waiting = client.describe_jobflows(states=['WAITING'])
    waiting.should.have.length_of(4)
    for flow in waiting:
        flow.state.should.equal('WAITING')

    client.describe_jobflows(created_before=cutoff).should.have.length_of(4)
    client.describe_jobflows(created_after=cutoff).should.have.length_of(2)
Пример #38
0
def test_describe_jobflows():
    """DescribeJobFlows honors jobflow_ids, states, created_before and
    created_after filters."""
    emr = boto.connect_emr()
    flow_kwargs = run_jobflow_args.copy()
    records = {}

    for n in range(4):
        label = 'cluster' + str(n)
        flow_kwargs['name'] = label
        flow_id = emr.run_jobflow(**flow_kwargs)
        records[flow_id] = {
            'id': flow_id,
            'name': label,
            'state': 'WAITING'
        }

    # need sleep since it appears the timestamp is always rounded to
    # the nearest second internally
    time.sleep(1)
    split_point = datetime.now(pytz.utc)
    time.sleep(1)

    for n in range(4, 6):
        label = 'cluster' + str(n)
        flow_kwargs['name'] = label
        flow_id = emr.run_jobflow(**flow_kwargs)
        emr.terminate_jobflow(flow_id)
        records[flow_id] = {
            'id': flow_id,
            'name': label,
            'state': 'TERMINATED'
        }

    emr.describe_jobflows().should.have.length_of(6)

    for flow_id, record in records.items():
        matches = emr.describe_jobflows(jobflow_ids=[flow_id])
        matches.should.have.length_of(1)
        matches[0].jobflowid.should.equal(flow_id)

    still_waiting = emr.describe_jobflows(states=['WAITING'])
    still_waiting.should.have.length_of(4)
    for flow in still_waiting:
        flow.state.should.equal('WAITING')

    emr.describe_jobflows(created_before=split_point).should.have.length_of(4)
    emr.describe_jobflows(created_after=split_point).should.have.length_of(2)
Пример #39
0
def test_run_jobflow_with_instance_groups():
    """run_jobflow with explicit instance groups reflects each group's config."""
    groups_by_name = dict((g.name, g) for g in input_instance_groups)
    conn = boto.connect_emr()
    job_id = conn.run_jobflow(instance_groups=input_instance_groups, **run_jobflow_args)
    flow = conn.describe_jobflow(job_id)

    total_instances = sum(g.num_instances for g in input_instance_groups)
    int(flow.instancecount).should.equal(total_instances)

    for group in flow.instancegroups:
        spec = groups_by_name[group.name]
        group.should.have.property("instancegroupid")
        int(group.instancerunningcount).should.equal(spec.num_instances)
        group.instancerole.should.equal(spec.role)
        group.instancetype.should.equal(spec.type)
        group.market.should.equal(spec.market)
        # Only SPOT groups carry a bid price.
        if hasattr(spec, "bidprice"):
            group.bidprice.should.equal(spec.bidprice)
Пример #40
0
def test_set_visible_to_all_users():
    """Toggle jobflow visibility and verify describe_jobflow reflects it."""
    conn = boto.connect_emr()
    args = dict(run_jobflow_args, visible_to_all_users=False)
    job_id = conn.run_jobflow(**args)
    conn.describe_jobflow(job_id).visibletoallusers.should.equal("false")

    # Flip visibility on, then back off; each change must be observable.
    conn.set_visible_to_all_users(job_id, True)
    conn.describe_jobflow(job_id).visibletoallusers.should.equal("true")

    conn.set_visible_to_all_users(job_id, False)
    conn.describe_jobflow(job_id).visibletoallusers.should.equal("false")
Пример #41
0
def test_list_clusters():
    """A freshly started jobflow shows up in list_clusters as RUNNING."""
    conn = boto.connect_emr()
    conn.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        steps=[],
    )

    clusters = conn.list_clusters().clusters
    clusters.should.have.length_of(1)
    first = clusters[0]
    first.name.should.equal("My jobflow")
    first.normalizedinstancehours.should.equal('0')
    first.status.state.should.equal("RUNNING")
Пример #42
0
def test_list_clusters():
    """list_clusters reports the single started jobflow as RUNNING."""
    conn = boto.connect_emr()
    conn.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        steps=[],
    )

    result = conn.list_clusters().clusters
    result.should.have.length_of(1)
    only = result[0]
    only.name.should.equal("My jobflow")
    only.normalizedinstancehours.should.equal('0')
    only.status.state.should.equal("RUNNING")
Пример #43
0
def create_job_flow(steps, job):
    """Submit *steps* to EMR, reusing an idle jobflow when one is big enough.

    Scans jobflows in the WAITING state for one with at least job.nodes
    instances; if found, appends the steps to it. Otherwise launches a new
    keep-alive jobflow sized from the job. Returns the jobflow id used.
    """
    conn = boto.connect_emr()

    for flow in conn.describe_jobflows(['WAITING']):
        if int(flow.instancecount) >= int(job.nodes):
            # Reuse this idle cluster instead of paying for a new one.
            conn.add_jobflow_steps(flow.jobflowid, steps)
            jobid = flow.jobflowid
            break
    else:
        # No suitable idle cluster found: launch a fresh keep-alive flow.
        jobid = conn.run_jobflow("nsr web jobflow", log_uri="s3n://nsr-logs", master_instance_type=str(job.node_size), slave_instance_type=str(job.node_size), num_instances=job.nodes, action_on_failure="CONTINUE", steps=steps, keep_alive=True)

    conn.close()
    return jobid
Пример #44
0
def test_set_visible_to_all_users():
    """Visibility flag round-trips through set_visible_to_all_users."""
    conn = boto.connect_emr()
    args = run_jobflow_args.copy()
    args["visible_to_all_users"] = False
    job_id = conn.run_jobflow(**args)

    # (toggle, expected rendering) — None means "check the launch value".
    for toggle, rendered in [(None, "false"), (True, "true"), (False, "false")]:
        if toggle is not None:
            conn.set_visible_to_all_users(job_id, toggle)
        conn.describe_jobflow(job_id).visibletoallusers.should.equal(rendered)
Пример #45
0
def test_run_jobflow_with_instance_groups():
    """Each configured instance group appears in the jobflow with its settings."""
    conn = boto.connect_emr()
    job_id = conn.run_jobflow(instance_groups=input_instance_groups,
                              **run_jobflow_args)
    flow = conn.describe_jobflow(job_id)

    expected_total = sum(g.num_instances for g in input_instance_groups)
    int(flow.instancecount).should.equal(expected_total)

    by_name = dict((g.name, g) for g in input_instance_groups)
    for actual in flow.instancegroups:
        wanted = by_name[actual.name]
        actual.should.have.property('instancegroupid')
        int(actual.instancerunningcount).should.equal(wanted.num_instances)
        actual.instancerole.should.equal(wanted.role)
        actual.instancetype.should.equal(wanted.type)
        actual.market.should.equal(wanted.market)
        # Only SPOT groups define a bid price.
        if hasattr(wanted, 'bidprice'):
            actual.bidprice.should.equal(wanted.bidprice)
Пример #46
0
def test_create_job_flow_with_instance_groups():
    """Two 6-node TASK groups yield a 12-instance jobflow."""
    conn = boto.connect_emr()

    groups = [
        InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07', '0.07')
        for _ in range(2)
    ]
    job_id = conn.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        steps=[],
        instance_groups=groups
    )

    flow = conn.describe_jobflow(job_id)
    # Total count is the sum of both groups; each group reports its own 6.
    int(flow.instancecount).should.equal(12)
    int(flow.instancegroups[0].instancerunningcount).should.equal(6)
Пример #47
0
def test_describe_job_flows():
    """describe_jobflows lists all flows and can filter by jobflow id."""
    conn = boto.connect_emr()
    flow_ids = [
        conn.run_jobflow(name='My jobflow',
                         log_uri='s3://some_bucket/jobflow_logs',
                         steps=[])
        for _ in range(2)
    ]
    job1_id, job2_id = flow_ids

    conn.describe_jobflows().should.have.length_of(2)

    # Filtering by id narrows the result to the requested flow.
    only_second = conn.describe_jobflows(jobflow_ids=[job2_id])
    only_second.should.have.length_of(1)
    only_second[0].jobflowid.should.equal(job2_id)

    conn.describe_jobflow(job1_id).jobflowid.should.equal(job1_id)
Пример #48
0
def cancel_job(job):
	"""Cancel job job by terminating the EMR job flow or killing the single machine process."""
	if job.job_type == 'e':
		# EMR job: terminating the jobflow stops all of its steps.
		c = boto.connect_emr()
		c.terminate_jobflow(job.jobflowid)
	else:
		# Single-machine job: kill the remote process matched by the
		# job's input file name via pkill over SSH.
		filename = job.get_input_file().name.split('/')[-1]		
		kill_cmd = "pkill -f %s" % filename

		# NOTE(review): host IP, user, and key paths are hard-coded here —
		# confirm they still match the deployment before relying on this.
		client = paramiko.SSHClient()
		client.load_host_keys('/var/www/known_hosts')
		client.connect('10.203.87.100', 22, 'ec2-user', key_filename='/var/www/nsr-dev.pem')
		stdin, stdout, stderr = client.exec_command(kill_cmd)
		# for line in stdout:
		# 	print line
		# for line in stderr:
		# 	print line
		# Record the cancellation in the job's remote status log.
		client.exec_command("echo CANCELLED > ~/status-output/status-s-%s.log" % filename)		
Пример #49
0
def get_step_status(emrid):	
	"""Get percentage complete of EMR job with jobflow id emrid.

	This screen scrapes the EMR tracker page, which is available at the job's masterpublicdnsname on port 9100. Accessing to this page is limited to whitelisted IPs, which can be set in the AWS Security Group settings page.
	"""
	# Resolve the cluster's master node hostname via the EMR API.
	emr = boto.connect_emr()
	job = emr.describe_jobflow(emrid)
	url = job.masterpublicdnsname	
	emr.close()
	
	c = httplib.HTTPConnection(url, 9100)
	c.request("GET", "/jobtracker.jsp")
	response = c.getresponse().read().split("\n")
	# NOTE(review): hard-coded line index into the scraped page — this breaks
	# if the jobtracker page layout changes; confirm against the live page.
	status_line = response[36]
	
	# Sum the map/reduce percentages and rescale: 200% combined maps to 90
	# (the remaining 10 is presumably reserved for post-processing — verify).
	statuses = map(float, re.findall("<td>([0-9.]*)%<table", status_line))
	# print >> sys.stderr, statuses
	return sum(statuses)/200. * 90
Пример #50
0
def test_set_termination_protection():
    """Termination protection is unset at launch and follows later toggles."""
    conn = boto.connect_emr()

    job_id = conn.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        steps=[]
    )
    # Never set -> the attribute renders as the string 'None'.
    conn.describe_jobflow(job_id).terminationprotected.should.equal(u'None')

    conn.set_termination_protection(job_id, True)
    conn.describe_jobflow(job_id).terminationprotected.should.equal('true')

    conn.set_termination_protection(job_id, False)
    conn.describe_jobflow(job_id).terminationprotected.should.equal('false')
Пример #51
0
def test_modify_instance_groups():
    """Adding, then resizing, instance groups updates the jobflow's counts."""
    conn = boto.connect_emr()

    step1 = StreamingStep(
        name='My wordcount example',
        mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
        reducer='aggregate',
        input='s3n://elasticmapreduce/samples/wordcount/input',
        output='s3n://output_bucket/output/wordcount_output')

    job_id = conn.run_jobflow(name='My jobflow',
                              log_uri='s3://some_bucket/jobflow_logs',
                              steps=[step1])

    new_groups = [
        InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07', '0.07'),
        InstanceGroup(6, 'TASK', 'c1.medium', 'SPOT', 'spot-0.07', '0.07'),
    ]
    added = conn.add_instance_groups(job_id, new_groups)
    group_ids = added.instancegroupids.split(",")

    flow = conn.describe_jobflows()[0]
    int(flow.instancecount).should.equal(12)
    int(flow.instancegroups[0].instancerunningcount).should.equal(6)

    # Shrink the two new groups to 2 and 3 instances respectively.
    conn.modify_instance_groups(group_ids, [2, 3])

    flow = conn.describe_jobflows()[0]
    int(flow.instancecount).should.equal(5)
    groups_by_id = dict((g.instancegroupid, g) for g in flow.instancegroups)
    int(groups_by_id[group_ids[0]].instancerunningcount).should.equal(2)
    int(groups_by_id[group_ids[1]].instancerunningcount).should.equal(3)
Пример #52
0
def test_describe_jobflows():
    """describe_jobflows filtering at scale: 400 WAITING + 200 TERMINATED flows."""
    conn = boto.connect_emr()
    kwargs = run_jobflow_args.copy()
    expected = {}

    # First batch: 400 clusters left running in the WAITING state.
    for n in range(400):
        name = "cluster" + str(n)
        kwargs["name"] = name
        flow_id = conn.run_jobflow(**kwargs)
        expected[flow_id] = {"id": flow_id, "name": name, "state": "WAITING"}

    # need sleep since it appears the timestamp is always rounded to
    # the nearest second internally
    time.sleep(1)
    timestamp = datetime.now(pytz.utc)
    time.sleep(1)

    # Second batch: 200 clusters terminated immediately.
    for n in range(400, 600):
        name = "cluster" + str(n)
        kwargs["name"] = name
        flow_id = conn.run_jobflow(**kwargs)
        conn.terminate_jobflow(flow_id)
        expected[flow_id] = {"id": flow_id, "name": name, "state": "TERMINATED"}

    # Only 512 of the 600 flows come back unfiltered — presumably the
    # API's per-call result cap.
    conn.describe_jobflows().should.have.length_of(512)

    for flow_id in expected:
        resp = conn.describe_jobflows(jobflow_ids=[flow_id])
        resp.should.have.length_of(1)
        resp[0].jobflowid.should.equal(flow_id)

    waiting = conn.describe_jobflows(states=["WAITING"])
    waiting.should.have.length_of(400)
    for flow in waiting:
        flow.state.should.equal("WAITING")

    # The timestamp between the batches splits them 400 / 200.
    conn.describe_jobflows(created_before=timestamp).should.have.length_of(400)
    conn.describe_jobflows(created_after=timestamp).should.have.length_of(200)
Пример #53
0
def test_set_visible_to_all_users():
    """Visibility starts False when requested at launch and tracks updates."""
    conn = boto.connect_emr()

    job_id = conn.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        job_flow_role='some-role-arn',
        steps=[],
        visible_to_all_users=False,
    )
    # This variant renders the flag capitalized ('False'/'True').
    conn.describe_jobflow(job_id).visibletoallusers.should.equal('False')

    conn.set_visible_to_all_users(job_id, True)
    conn.describe_jobflow(job_id).visibletoallusers.should.equal('True')

    conn.set_visible_to_all_users(job_id, False)
    conn.describe_jobflow(job_id).visibletoallusers.should.equal('False')
Пример #54
0
def test_list_clusters():
    """list_clusters shows live and terminated flows with status timelines."""
    conn = boto.connect_emr()

    args = run_jobflow_args.copy()
    args['name'] = 'jobflow1'
    cluster1_id = conn.run_jobflow(**args)
    args['name'] = 'jobflow2'
    cluster2_id = conn.run_jobflow(**args)
    conn.terminate_jobflow(cluster2_id)

    clusters = conn.list_clusters().clusters
    clusters.should.have.length_of(2)

    expected = {
        cluster1_id: {'id': cluster1_id,
                      'name': 'jobflow1',
                      'normalizedinstancehours': 0,
                      'state': 'WAITING'},
        cluster2_id: {'id': cluster2_id,
                      'name': 'jobflow2',
                      'normalizedinstancehours': 0,
                      'state': 'TERMINATED'},
    }

    for cluster in clusters:
        want = expected[cluster.id]
        cluster.id.should.equal(want['id'])
        cluster.name.should.equal(want['name'])
        int(cluster.normalizedinstancehours).should.equal(want['normalizedinstancehours'])
        cluster.status.state.should.equal(want['state'])
        cluster.status.timeline.creationdatetime.should.be.a(six.string_types)
        if want['state'] == 'TERMINATED':
            # Only finished clusters carry an end timestamp.
            cluster.status.timeline.enddatetime.should.be.a(six.string_types)
        else:
            cluster.status.timeline.shouldnt.have.property('enddatetime')
        cluster.status.timeline.readydatetime.should.be.a(six.string_types)
Пример #55
0
def test_cluster_tagging():
    """Tags can be added to and removed from a cluster."""
    conn = boto.connect_emr()
    cluster_id = conn.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        steps=[],
    )
    conn.add_tags(cluster_id, {"tag1": "val1", "tag2": "val2"})

    described = conn.describe_cluster(cluster_id)
    described.tags.should.have.length_of(2)
    tag_map = dict((t.key, t.value) for t in described.tags)
    tag_map['tag1'].should.equal('val1')
    tag_map['tag2'].should.equal('val2')

    # Remove a tag
    conn.remove_tags(cluster_id, ["tag1"])
    described = conn.describe_cluster(cluster_id)
    described.tags.should.have.length_of(1)
    tag_map = dict((t.key, t.value) for t in described.tags)
    tag_map['tag2'].should.equal('val2')
Пример #56
0
	if o in ('--spot-bid'):
		params['spot_bid_price']=a
	if o in ('--test'):
		params['test_mode']=True
	
# Options that must have been supplied on the command line.
required = ['aws_key','secret','keypair']

for pname in required:
    if not params.get(pname, None):
        print '\nERROR:%s is required' % pname
        usage()

# Echo the effective configuration for operator sanity-checking.
for p, v in params.iteritems():
	print "param:" + `p`+ " value:" + `v`

conn = boto.connect_emr(params['aws_key'],params['secret'])

# Bootstrap actions: install CommonCrawl tooling, tune Hadoop task/JVM
# settings, and enlarge the jobtracker heap.
bootstrap_step1 = BootstrapAction("install_cc", "s3://commoncrawl-public/config64.sh",[params['aws_key'], params['secret']])
bootstrap_step2 = BootstrapAction("configure_hadoop", "s3://elasticmapreduce/bootstrap-actions/configure-hadoop",
	[
	"-m","mapred.tasktracker.map.tasks.maximum=8",
	"-m","mapred.child.java.opts=-XX:ErrorFile=/tmp/hs_err_${mapred.tip.id}.log -Xmx700m -XX:+UseParNewGC -XX:ParallelGCThreads=8 -XX:NewSize=100m -XX:+UseConcMarkSweepGC -XX:+UseTLAB -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:CMSIncrementalDutyCycle=10"
	])
bootstrap_step3 = BootstrapAction("configure_jobtrackerheap", "s3://elasticmapreduce/bootstrap-actions/configure-daemons",["--jobtracker-heap-size=12096"])

# One on-demand master plus params['num_core'] on-demand core nodes.
namenode_instance_group = InstanceGroup(1,"MASTER","c1.xlarge","ON_DEMAND","MASTER_GROUP")
core_instance_group = InstanceGroup(params['num_core'],"CORE","c1.xlarge","ON_DEMAND","CORE_GROUP")

# When no spot instances are requested, run entirely on-demand.
instance_groups=[]
if params['num_spot'] <= 0:
	instance_groups=[namenode_instance_group,core_instance_group]
Пример #57
0
def test_create_job_flow():
    """Create a two-step streaming jobflow and verify its describe output."""
    conn = boto.connect_emr()

    steps = [
        StreamingStep(
            name='My wordcount example',
            mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
            reducer='aggregate',
            input='s3n://elasticmapreduce/samples/wordcount/input',
            output='s3n://output_bucket/output/wordcount_output'),
        StreamingStep(
            name='My wordcount example2',
            mapper='s3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
            reducer='aggregate',
            input='s3n://elasticmapreduce/samples/wordcount/input2',
            output='s3n://output_bucket/output/wordcount_output2'),
    ]

    job_id = conn.run_jobflow(
        name='My jobflow',
        log_uri='s3://some_bucket/jobflow_logs',
        master_instance_type='m1.medium',
        slave_instance_type='m1.small',
        steps=steps,
    )

    flow = conn.describe_jobflow(job_id)
    flow.state.should.equal('STARTING')
    flow.jobflowid.should.equal(job_id)
    flow.name.should.equal('My jobflow')
    flow.masterinstancetype.should.equal('m1.medium')
    flow.slaveinstancetype.should.equal('m1.small')
    flow.loguri.should.equal('s3://some_bucket/jobflow_logs')
    flow.visibletoallusers.should.equal('False')
    int(flow.normalizedinstancehours).should.equal(0)

    # The first step starts immediately; the second waits behind it.
    expectations = [
        ('My wordcount example', 'STARTING',
         's3n://elasticmapreduce/samples/wordcount/wordSplitter.py',
         's3n://elasticmapreduce/samples/wordcount/input',
         's3n://output_bucket/output/wordcount_output'),
        ('My wordcount example2', 'PENDING',
         's3n://elasticmapreduce/samples/wordcount/wordSplitter2.py',
         's3n://elasticmapreduce/samples/wordcount/input2',
         's3n://output_bucket/output/wordcount_output2'),
    ]
    for step, (name, state, mapper, inp, outp) in zip(flow.steps, expectations):
        step.name.should.equal(name)
        step.state.should.equal(state)
        args = [arg.value for arg in step.args]
        args.should.equal([
            '-mapper',
            mapper,
            '-reducer',
            'aggregate',
            '-input',
            inp,
            '-output',
            outp,
        ])
Пример #58
0
def test_instance_groups():
    """End-to-end instance-group lifecycle: create, add, list, and resize."""
    input_groups = dict((g.name, g) for g in input_instance_groups)

    conn = boto.connect_emr()
    args = run_jobflow_args.copy()
    # Instance counts/types come from the explicit groups, so drop the
    # scalar sizing arguments to avoid conflicting specifications.
    for key in [
            "master_instance_type", "slave_instance_type", "num_instances"
    ]:
        del args[key]
    args["instance_groups"] = input_instance_groups[:2]
    job_id = conn.run_jobflow(**args)

    jf = conn.describe_jobflow(job_id)
    base_instance_count = int(jf.instancecount)

    # Attach the remaining groups after the flow is already running.
    conn.add_instance_groups(job_id, input_instance_groups[2:])

    jf = conn.describe_jobflow(job_id)
    int(jf.instancecount).should.equal(
        sum(g.num_instances for g in input_instance_groups))
    # describe_jobflow view: every group matches its input specification.
    for x in jf.instancegroups:
        y = input_groups[x.name]
        if hasattr(y, "bidprice"):
            x.bidprice.should.equal(y.bidprice)
        x.creationdatetime.should.be.a(str)
        # x.enddatetime.should.be.a(str)
        x.should.have.property("instancegroupid")
        int(x.instancerequestcount).should.equal(y.num_instances)
        x.instancerole.should.equal(y.role)
        int(x.instancerunningcount).should.equal(y.num_instances)
        x.instancetype.should.equal(y.type)
        x.laststatechangereason.should.be.a(str)
        x.market.should.equal(y.market)
        x.name.should.be.a(str)
        x.readydatetime.should.be.a(str)
        x.startdatetime.should.be.a(str)
        x.state.should.equal("RUNNING")

    # list_instance_groups view: the same groups via the newer API shape.
    for x in conn.list_instance_groups(job_id).instancegroups:
        y = input_groups[x.name]
        if hasattr(y, "bidprice"):
            x.bidprice.should.equal(y.bidprice)
        # Configurations
        # EbsBlockDevices
        # EbsOptimized
        x.should.have.property("id")
        x.instancegrouptype.should.equal(y.role)
        x.instancetype.should.equal(y.type)
        x.market.should.equal(y.market)
        x.name.should.equal(y.name)
        int(x.requestedinstancecount).should.equal(y.num_instances)
        int(x.runninginstancecount).should.equal(y.num_instances)
        # ShrinkPolicy
        x.status.state.should.equal("RUNNING")
        x.status.statechangereason.code.should.be.a(str)
        x.status.statechangereason.message.should.be.a(str)
        x.status.timeline.creationdatetime.should.be.a(str)
        # x.status.timeline.enddatetime.should.be.a(str)
        x.status.timeline.readydatetime.should.be.a(str)

    igs = dict((g.name, g) for g in jf.instancegroups)

    # Resize the two task groups and check the total reflects 2 + 3.
    conn.modify_instance_groups(
        [igs["task-1"].instancegroupid, igs["task-2"].instancegroupid], [2, 3])
    jf = conn.describe_jobflow(job_id)
    int(jf.instancecount).should.equal(base_instance_count + 5)
    igs = dict((g.name, g) for g in jf.instancegroups)
    int(igs["task-1"].instancerunningcount).should.equal(2)
    int(igs["task-2"].instancerunningcount).should.equal(3)
Пример #59
0
import datetime
import os

import boto
from boto.emr.instance_group import InstanceGroup
from boto.emr.step import InstallPigStep, PigStep


# Connect using credentials from the environment/boto config.
conn = boto.connect_emr()

# One spot master and two spot core nodes at a $0.10 bid.
# NOTE(review): the group-name fields look like scrubbed placeholders
# ('[email protected]') — confirm the intended values before running.
instance_groups = [
    InstanceGroup(1, 'MASTER', 'm1.small', 'SPOT', '[email protected]', '0.10'),
    InstanceGroup(2, 'CORE', 'm1.small', 'SPOT', '[email protected]', '0.10'),
]

pig_file = 's3://elasticmapreduce/samples/pig-apache/do-reports2.pig'
INPUT = 's3://elasticmapreduce/samples/pig-apache/input/'
# Epoch-timestamped output prefix so repeated runs don't collide.
OUTPUT = ('s3://org.unencrypted.emr.output/apache_sample/%s' %
          datetime.datetime.utcnow().strftime("%s"))

print """\
Running pig job with settings:

    SCRIPT={script}
    INPUT={input}
    OUPUT={output}
""".format(script=pig_file, input=INPUT, output=OUTPUT)

# '-p' parameters substitute $INPUT/$OUTPUT inside the pig script.
pig_args = ['-p', 'INPUT=%s' % INPUT,
            '-p', 'OUTPUT=%s' % OUTPUT]
Пример #60
0
def test_steps():
    """Streaming steps: add at launch and afterwards, then verify the
    describe_jobflow, list_steps, and describe_step views all agree."""
    input_steps = [
        StreamingStep(
            name="My wordcount example",
            mapper="s3n://elasticmapreduce/samples/wordcount/wordSplitter.py",
            reducer="aggregate",
            input="s3n://elasticmapreduce/samples/wordcount/input",
            output="s3n://output_bucket/output/wordcount_output",
        ),
        StreamingStep(
            name="My wordcount example & co.",
            mapper="s3n://elasticmapreduce/samples/wordcount/wordSplitter2.py",
            reducer="aggregate",
            input="s3n://elasticmapreduce/samples/wordcount/input2",
            output="s3n://output_bucket/output/wordcount_output2",
        ),
    ]

    # TODO: implementation and test for cancel_steps

    conn = boto.connect_emr()
    # Launch with only the first step, then append the second.
    cluster_id = conn.run_jobflow(steps=[input_steps[0]], **run_jobflow_args)

    jf = conn.describe_jobflow(cluster_id)
    jf.steps.should.have.length_of(1)

    conn.add_jobflow_steps(cluster_id, [input_steps[1]])

    jf = conn.describe_jobflow(cluster_id)
    jf.steps.should.have.length_of(2)
    # Legacy describe_jobflow view of the steps.
    for step in jf.steps:
        step.actiononfailure.should.equal("TERMINATE_JOB_FLOW")
        # 4 flag/value pairs: -mapper/-reducer/-input/-output.
        list(arg.value for arg in step.args).should.have.length_of(8)
        step.creationdatetime.should.be.a(str)
        # step.enddatetime.should.be.a(str)
        step.jar.should.equal(
            "/home/hadoop/contrib/streaming/hadoop-streaming.jar")
        step.laststatechangereason.should.be.a(str)
        step.mainclass.should.equal("")
        step.name.should.be.a(str)
        # step.readydatetime.should.be.a(str)
        # step.startdatetime.should.be.a(str)
        step.state.should.be.within(["RUNNING", "PENDING"])

    expected = dict((s.name, s) for s in input_steps)

    # Newer list_steps view: args/jar/mainclass live under .config.
    steps = conn.list_steps(cluster_id).steps
    for x in steps:
        y = expected[x.name]
        # actiononfailure
        list(arg.value for arg in x.config.args).should.equal([
            "-mapper",
            y.mapper,
            "-reducer",
            y.reducer,
            "-input",
            y.input,
            "-output",
            y.output,
        ])
        x.config.jar.should.equal(
            "/home/hadoop/contrib/streaming/hadoop-streaming.jar")
        x.config.mainclass.should.equal("")
        # properties
        x.should.have.property("id").should.be.a(str)
        x.name.should.equal(y.name)
        x.status.state.should.be.within(["RUNNING", "PENDING"])
        # x.status.statechangereason
        x.status.timeline.creationdatetime.should.be.a(str)
        # x.status.timeline.enddatetime.should.be.a(str)
        # x.status.timeline.startdatetime.should.be.a(str)

        # describe_step must return the same data for each step id.
        x = conn.describe_step(cluster_id, x.id)
        list(arg.value for arg in x.config.args).should.equal([
            "-mapper",
            y.mapper,
            "-reducer",
            y.reducer,
            "-input",
            y.input,
            "-output",
            y.output,
        ])
        x.config.jar.should.equal(
            "/home/hadoop/contrib/streaming/hadoop-streaming.jar")
        x.config.mainclass.should.equal("")
        # properties
        x.should.have.property("id").should.be.a(str)
        x.name.should.equal(y.name)
        x.status.state.should.be.within(["RUNNING", "PENDING"])
        # x.status.statechangereason
        x.status.timeline.creationdatetime.should.be.a(str)
        # x.status.timeline.enddatetime.should.be.a(str)
        # x.status.timeline.startdatetime.should.be.a(str)

    # Nested so the boto-version guard applies while sharing this test's
    # cluster; defined and invoked inline rather than collected by pytest.
    @requires_boto_gte("2.39")
    def test_list_steps_with_states():
        # boto's list_steps prior to 2.39 has a bug that ignores
        # step_states argument.
        steps = conn.list_steps(cluster_id).steps
        step_id = steps[0].id
        steps = conn.list_steps(cluster_id, step_states=["RUNNING"]).steps
        steps.should.have.length_of(1)
        steps[0].id.should.equal(step_id)

    test_list_steps_with_states()