示例#1
0
    def test_create_scratch_uri(self):
        # "walrus" bucket will be ignored; it doesn't start with "mrjob-"
        self.add_mock_s3_data({'walrus': {}, 'zebra': {}})

        runner = EMRJobRunner(conf_path=False, s3_sync_wait_time=0.01)

        # bucket name should be mrjob- plus 16 random hex digits
        s3_scratch_uri = runner._opts['s3_scratch_uri']
        assert_equal(s3_scratch_uri[:11], 's3://mrjob-')
        assert_equal(s3_scratch_uri[27:], '/tmp/')

        # bucket shouldn't actually exist yet
        scratch_bucket, _ = parse_s3_uri(s3_scratch_uri)
        assert_not_in(scratch_bucket, self.mock_s3_fs.keys())

        # need to do something to ensure that the bucket actually gets
        # created. let's launch a (mock) job flow
        jfid = runner.make_persistent_job_flow()
        assert_in(scratch_bucket, self.mock_s3_fs.keys())
        runner.make_emr_conn().terminate_jobflow(jfid)

        # once our scratch bucket is created, we should re-use it
        runner2 = EMRJobRunner(conf_path=False)
        assert_equal(runner2._opts['s3_scratch_uri'], s3_scratch_uri)
        s3_scratch_uri = runner._opts['s3_scratch_uri']
示例#2
0
    def test_create_scratch_uri(self):
        # "walrus" bucket will be ignored; it doesn't start with "mrjob-"
        self.add_mock_s3_data({'walrus': {}, 'zebra': {}})

        runner = EMRJobRunner(conf_path=False, s3_sync_wait_time=0.01)

        # bucket name should be mrjob- plus 16 random hex digits
        s3_scratch_uri = runner._opts['s3_scratch_uri']
        assert_equal(s3_scratch_uri[:11], 's3://mrjob-')
        assert_equal(s3_scratch_uri[27:], '/tmp/')

        # bucket shouldn't actually exist yet
        scratch_bucket, _ = parse_s3_uri(s3_scratch_uri)
        assert_not_in(scratch_bucket, self.mock_s3_fs.keys())

        # need to do something to ensure that the bucket actually gets
        # created. let's launch a (mock) job flow
        jfid = runner.make_persistent_job_flow()
        assert_in(scratch_bucket, self.mock_s3_fs.keys())
        runner.make_emr_conn().terminate_jobflow(jfid)

        # once our scratch bucket is created, we should re-use it
        runner2 = EMRJobRunner(conf_path=False)
        assert_equal(runner2._opts['s3_scratch_uri'], s3_scratch_uri)
        s3_scratch_uri = runner._opts['s3_scratch_uri']
示例#3
0
 def test_no_region(self):
     runner = EMRJobRunner(conf_path=False)
     assert_equal(runner.make_emr_conn().endpoint,
                  'elasticmapreduce.amazonaws.com')
     assert_equal(runner.make_s3_conn().endpoint,
                  's3.amazonaws.com')
     assert_equal(runner._aws_region, '')
示例#4
0
def main():
    # parser command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args()

    if len(args) != 1:
        option_parser.error('takes exactly one argument')
    emr_job_flow_id = args[0]

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # create the persistent job
    runner = EMRJobRunner(conf_path=options.conf_path)
    log.debug('Terminating job flow %s' % emr_job_flow_id)
    runner.make_emr_conn().terminate_jobflow(emr_job_flow_id)
    log.info('Terminated job flow %s' % emr_job_flow_id)
示例#5
0
文件: emr_test.py 项目: hblanks/mrjob
    def test_local_bootstrap_action(self):
        # make sure that local bootstrap action scripts get uploaded to S3
        action_path = os.path.join(self.tmp_dir, 'apt-install.sh')
        with open(action_path, 'w') as f:
            f.write('for $pkg in $@; do sudo apt-get install $pkg; done\n')

        bootstrap_actions = [
            action_path + ' python-scipy mysql-server']

        runner = EMRJobRunner(conf_path=False,
                              bootstrap_actions=bootstrap_actions,
                              s3_sync_wait_time=0.01)

        job_flow_id = runner.make_persistent_job_flow()

        emr_conn = runner.make_emr_conn()
        job_flow = emr_conn.describe_jobflow(job_flow_id)
        actions = job_flow.bootstrapactions

        assert_equal(len(actions), 2)

        assert actions[0].path.startswith('s3://mrjob-')
        assert actions[0].path.endswith('/apt-install.sh')
        assert_equal(actions[0].name, 'apt-install.sh')
        assert_equal(actions[0].args, ['python-scipy', 'mysql-server'])

        # check for master boostrap script
        assert actions[1].path.startswith('s3://mrjob-')
        assert actions[1].path.endswith('b.py')
        assert_equal(actions[1].args, [])
        assert_equal(actions[1].name, 'master')

        # make sure master bootstrap script is on S3
        assert runner.path_exists(actions[1].path)
示例#6
0
def main(cl_args=None):
    # parser command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args(cl_args)

    if len(args) != 1:
        option_parser.error('This tool takes exactly one argument.')
    cluster_id = args[0]

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # create the persistent job
    runner = EMRJobRunner(**runner_kwargs(options))
    log.debug('Terminating job flow %s' % cluster_id)
    runner.make_emr_conn().terminate_jobflow(cluster_id)
    log.info('Terminated job flow %s' % cluster_id)
示例#7
0
def find_waiting_flow(aws_access_key_id,aws_secret_access_key,ssh_key_pair_file=''):
    # print (aws_access_key_id,aws_secret_access_key)
    JobRunner = EMRJobRunner(aws_access_key_id=aws_access_key_id,aws_secret_access_key=aws_secret_access_key)
    emr_conn = JobRunner.make_emr_conn()
    job_flows=emr_conn.describe_jobflows()
    job_id='NONE'
    d = {'WAITING':0,'STARTING':1,'RUNNING':2}
    waiting_flows=[]
    for flow in job_flows:
        try:
            if flow.state in d.keys():
                job_id=flow.jobflowid
                ip_address=flow.masterpublicdnsname
                waiting_flows.append([d[flow.state],job_id,ip_address,flow.state])
                if ssh_key_pair_file != '':
                    print 'ssh -i %s hadoop@%s'%(ssh_key_pair_file,ip_address)
                    job_id=flow.jobflowid
        except Exception:
            continue
    waiting_flows = sorted(waiting_flows, key=itemgetter(0))
    waiting_flows = [i[1:] for i in waiting_flows] #An index was added at the beginning for the sorting. Removing that index in this step
    waiting_flows_dict = [{'flow_id':i[0],'node':i[1],'flow_state':i[2]} for i in waiting_flows] #Converting a list of lists to a list of dicts
    
    #Printing
    index = 0
    for flow_dict in waiting_flows_dict:
        print index, flow_dict['flow_id'], flow_dict['node'], flow_dict['flow_state']
        index+=1
    
    return waiting_flows_dict
示例#8
0
 def test_explicit_endpoints(self):
     runner = EMRJobRunner(conf_path=False,
                           aws_region='EU',
                           s3_endpoint='s3-proxy',
                           emr_endpoint='emr-proxy')
     assert_equal(runner.make_emr_conn().endpoint, 'emr-proxy')
     assert_equal(runner.make_s3_conn().endpoint, 's3-proxy')
示例#9
0
def find_all_flows(aws_access_key_id, aws_secret_access_key):
    JobRunner = EMRJobRunner(aws_access_key_id=aws_access_key_id,
                             aws_secret_access_key=aws_secret_access_key)
    print 'got job runner'
    emr_conn = JobRunner.make_emr_conn()
    print 'made EMR connection'
    return emr_conn.describe_jobflows()
示例#10
0
 def test_no_region(self):
     runner = EMRJobRunner(conf_path=False)
     assert_equal(runner.make_emr_conn().endpoint,
                  'elasticmapreduce.amazonaws.com')
     assert_equal(runner.make_s3_conn().endpoint,
                  's3.amazonaws.com')
     assert_equal(runner._aws_region, '')
示例#11
0
def main(cl_args=None):
    # parser command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args(cl_args)

    if len(args) != 1:
        option_parser.error('This tool takes exactly one argument.')
    emr_job_flow_id = args[0]

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # create the persistent job
    runner = EMRJobRunner(conf_paths=options.conf_paths)
    log.debug('Terminating job flow %s' % emr_job_flow_id)
    runner.make_emr_conn().terminate_jobflow(emr_job_flow_id)
    log.info('Terminated job flow %s' % emr_job_flow_id)
示例#12
0
def main(cl_args=None):
    # parser command-line args
    option_parser = _make_option_parser()
    options, args = option_parser.parse_args(cl_args)

    if len(args) != 1:
        option_parser.error('This tool takes exactly one argument.')
    cluster_id = args[0]

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # create the persistent job
    runner = EMRJobRunner(**_runner_kwargs(options))
    log.debug('Terminating cluster %s' % cluster_id)
    runner.make_emr_conn().terminate_jobflow(cluster_id)
    log.info('Terminated cluster %s' % cluster_id)
示例#13
0
 def test_blank_region(self):
     # blank region should be treated the same as no region
     runner = EMRJobRunner(conf_path=False, aws_region='')
     assert_equal(runner.make_emr_conn().endpoint,
                  'elasticmapreduce.amazonaws.com')
     assert_equal(runner.make_s3_conn().endpoint, 's3.amazonaws.com')
     assert_equal(runner._aws_region, '')
示例#14
0
 def test_blank_region(self):
     # blank region should be treated the same as no region
     runner = EMRJobRunner(conf_path=False, aws_region='')
     assert_equal(runner.make_emr_conn().endpoint,
                  'elasticmapreduce.amazonaws.com')
     assert_equal(runner.make_s3_conn().endpoint,
                  's3.amazonaws.com')
     assert_equal(runner._aws_region, '')
示例#15
0
def main():
    # parser command-line args
    option_parser = make_option_parser()
    options, args = option_parser.parse_args()

    if len(args) != 1:
        option_parser.error('takes exactly one argument')
    emr_job_flow_id = args[0]

    # set up logging
    if not options.quiet:
        log_to_stream(name='mrjob', debug=options.verbose)

    # create the persistent job
    runner = EMRJobRunner(conf_path=options.conf_path)
    log.debug('Terminating job flow %s' % emr_job_flow_id)
    runner.make_emr_conn().terminate_jobflow(emr_job_flow_id)
    log.info('Terminated job flow %s' % emr_job_flow_id)
def find_waiting_flow(aws_access_key_id,aws_secret_access_key):
    JobRunner = EMRJobRunner(aws_access_key_id=aws_access_key_id,aws_secret_access_key=aws_secret_access_key)
    emr_conn = JobRunner.make_emr_conn()
    job_flows=emr_conn.describe_jobflows()
    job_id='NONE'
    for flow in job_flows:
        if flow.state=='WAITING':
            print flow,flow.name,flow.jobflowid,flow.state
            job_id=flow.jobflowid
    return job_id
示例#17
0
def find_waiting_flow(aws_access_key_id,aws_secret_access_key,ssh_key_pair_file=''):
    print (aws_access_key_id,aws_secret_access_key)
    JobRunner = EMRJobRunner(aws_access_key_id=aws_access_key_id,aws_secret_access_key=aws_secret_access_key)
    emr_conn = JobRunner.make_emr_conn()
    job_flows=emr_conn.describe_jobflows()
    job_id='NONE'
    waiting_flows=[]
    for flow in job_flows:
        if flow.state=='WAITING':
            waiting_flows.append(flow)
            print flow.jobflowid,flow.state
            ip_address=flow.masterpublicdnsname
            if ssh_key_pair_file != '':
                print 'ssh -i %s hadoop@%s'%(ssh_key_pair_file,ip_address)
            job_id=flow.jobflowid
    return job_id
示例#18
0
文件: emr_test.py 项目: hblanks/mrjob
    def test_bootstrap_actions_get_added(self):
        bootstrap_actions = [
            's3://elasticmapreduce/bootstrap-actions/configure-hadoop -m,mapred.tasktracker.map.tasks.maximum=1',
            's3://foo/bar#xyzzy', # use alternate name for script
        ]

        runner = EMRJobRunner(conf_path=False,
                              bootstrap_actions=bootstrap_actions,
                              s3_sync_wait_time=0.01)

        job_flow_id = runner.make_persistent_job_flow()

        emr_conn = runner.make_emr_conn()
        job_flow = emr_conn.describe_jobflow(job_flow_id)
        actions = job_flow.bootstrapactions

        assert_equal(len(actions), 3)

        assert_equal(
            actions[0].path,
            's3://elasticmapreduce/bootstrap-actions/configure-hadoop')
        assert_equal(
            actions[0].args,
            ['-m,mapred.tasktracker.map.tasks.maximum=1'])
        assert_equal(actions[0].name, 'configure-hadoop')

        assert_equal(actions[1].path, 's3://foo/bar')
        assert_equal(actions[1].args, [])
        assert_equal(actions[1].name, 'xyzzy')

        # check for master bootstrap script
        assert actions[2].path.startswith('s3://mrjob-')
        assert actions[2].path.endswith('b.py')
        assert_equal(actions[2].args, [])
        assert_equal(actions[2].name, 'master')

        # make sure master bootstrap script is on S3
        assert runner.path_exists(actions[2].path)
示例#19
0
def find_all_flows(aws_access_key_id,aws_secret_access_key):
    JobRunner = EMRJobRunner(aws_access_key_id=aws_access_key_id,aws_secret_access_key=aws_secret_access_key)
    print 'got job runner'
    emr_conn = JobRunner.make_emr_conn()
    print 'made EMR connection'
    return emr_conn.describe_jobflows()
示例#20
0
def inspect_and_maybe_terminate_job_flows(conf_paths=None,
                                          dry_run=False,
                                          max_hours_idle=None,
                                          mins_to_end_of_hour=None,
                                          now=None,
                                          pool_name=None,
                                          pooled_only=False,
                                          unpooled_only=False,
                                          max_mins_locked=None,
                                          quiet=False,
                                          **kwargs):

    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(conf_paths=conf_paths, **kwargs)
    emr_conn = runner.make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # a list of tuples of job flow id, name, idle time (as a timedelta)
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # check if job flow is bootstrapping
        elif is_job_flow_bootstrapping(jf):
            num_bootstrapping += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif not is_job_flow_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            time_idle = now - time_last_active(jf)
            time_to_end_of_hour = est_time_to_hour(jf, now=now)
            _, pool = pool_hash_and_name(jf)
            pending = job_flow_has_pending_steps(jf)

            if pending:
                num_pending += 1
            else:
                num_idle += 1

            log.debug('Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                      (jf.jobflowid, 'pending' if pending else 'idle',
                       strip_microseconds(time_idle),
                       strip_microseconds(time_to_end_of_hour),
                       ('unpooled' if pool is None else 'in %s pool' % pool),
                       jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None
                    and time_idle <= timedelta(hours=max_hours_idle)):

                continue

            # mins_to_end_of_hour doesn't apply to jobs with pending steps
            if (mins_to_end_of_hour is not None
                    and (pending or time_to_end_of_hour >=
                         timedelta(minutes=mins_to_end_of_hour))):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append((jf, pending, time_idle, time_to_end_of_hour))

    log.info(
        'Job flow statuses: %d bootstrapping, %d running, %d pending, %d idle,'
        ' %d active non-streaming, %d done' %
        (num_running, num_bootstrapping, num_pending, num_idle,
         num_non_streaming, num_done))

    terminate_and_notify(runner,
                         to_terminate,
                         dry_run=dry_run,
                         max_mins_locked=max_mins_locked,
                         quiet=quiet)
示例#21
0
def _maybe_terminate_clusters(dry_run=False,
                              max_hours_idle=None,
                              mins_to_end_of_hour=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = _DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_conn = runner.make_emr_conn()

    num_starting = 0
    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_pending = 0
    num_running = 0

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for cluster_summary in _yield_all_clusters(emr_conn):
        cluster_id = cluster_summary.id

        # check if cluster is done
        if _is_cluster_done(cluster_summary):
            num_done += 1
            continue

        # check if cluster is starting
        if _is_cluster_starting(cluster_summary):
            num_starting += 1
            continue

        # check if cluster is bootstrapping
        if _is_cluster_bootstrapping(cluster_summary):
            num_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        steps = _list_all_steps(emr_conn, cluster_id)

        if any(_is_step_running(step) for step in steps):
            num_running += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(cluster_summary, steps)
        time_to_end_of_hour = _est_time_to_hour(cluster_summary, now=now)
        is_pending = _cluster_has_pending_steps(steps)

        bootstrap_actions = list(_yield_all_bootstrap_actions(
            emr_conn, cluster_id))
        _, pool = _pool_hash_and_name(bootstrap_actions)

        if is_pending:
            num_pending += 1
        else:
            num_idle += 1

        log.debug(
            'cluster %s %s for %s, %s to end of hour, %s (%s)' %
            (cluster_id,
             'pending' if is_pending else 'idle',
             strip_microseconds(time_idle),
             strip_microseconds(time_to_end_of_hour),
             ('unpooled' if pool is None else 'in %s pool' % pool),
             cluster_summary.name))

        # filter out clusters that don't meet our criteria
        if (max_hours_idle is not None and
                time_idle <= timedelta(hours=max_hours_idle)):
            continue

        # mins_to_end_of_hour doesn't apply to jobs with pending steps
        if (mins_to_end_of_hour is not None and
            (is_pending or
             time_to_end_of_hour >= timedelta(
                minutes=mins_to_end_of_hour))):
            continue

        if (pooled_only and pool is None):
            continue

        if (unpooled_only and pool is not None):
            continue

        if (pool_name is not None and pool != pool_name):
            continue

        # terminate idle cluster
        _terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=cluster_summary.name,
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            time_to_end_of_hour=time_to_end_of_hour,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info(
        'Cluster statuses: %d starting, %d bootstrapping, %d running,'
        ' %d pending, %d idle, %d done' % (
            num_starting, num_bootstrapping, num_running,
            num_pending, num_idle, num_done))
def inspect_and_maybe_terminate_job_flows(
    conf_path=None,
    dry_run=False,
    max_hours_idle=None,
    mins_to_end_of_hour=None,
    now=None,
    pool_name=None,
    pooled_only=False,
    unpooled_only=False,
    max_mins_locked=None,
    quiet=False,
    **kwargs
):

    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(conf_path=conf_path, **kwargs)
    emr_conn = runner.make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # a list of tuples of job flow id, name, idle time (as a timedelta)
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # check if job flow is bootstrapping
        elif is_job_flow_bootstrapping(jf):
            num_bootstrapping += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif not is_job_flow_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            time_idle = now - time_last_active(jf)
            time_to_end_of_hour = est_time_to_hour(jf, now=now)
            _, pool = pool_hash_and_name(jf)
            pending = job_flow_has_pending_steps(jf)

            if pending:
                num_pending += 1
            else:
                num_idle += 1

            log.debug(
                'Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                      (jf.jobflowid,
                       'pending' if pending else 'idle',
                       strip_microseconds(time_idle),
                       strip_microseconds(time_to_end_of_hour),
                       ('unpooled' if pool is None else 'in %s pool' % pool),
                       jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None and
                time_idle <= timedelta(hours=max_hours_idle)):
                continue

            # mins_to_end_of_hour doesn't apply to jobs with pending steps
            if (mins_to_end_of_hour is not None and
                (pending or
                 time_to_end_of_hour >= timedelta(
                    minutes=mins_to_end_of_hour))):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append((jf, pending, time_idle, time_to_end_of_hour))

    log.info(
        'Job flow statuses: %d bootstrapping, %d running, %d pending, %d idle,'
        ' %d active non-streaming, %d done' % (
        num_running, num_bootstrapping, num_pending, num_idle,
        num_non_streaming, num_done))

    terminate_and_notify(runner, to_terminate, dry_run=dry_run,
                         max_mins_locked=max_mins_locked, quiet=quiet)
示例#23
0
 def test_us_west_1(self):
     runner = EMRJobRunner(conf_path=False, aws_region='us-west-1')
     assert_equal(runner.make_emr_conn().endpoint,
                  'us-west-1.elasticmapreduce.amazonaws.com')
     assert_equal(runner.make_s3_conn().endpoint,
                  's3-us-west-1.amazonaws.com')
示例#24
0
 def test_explicit_endpoints(self):
     runner = EMRJobRunner(conf_path=False, aws_region='EU',
                           s3_endpoint='s3-proxy', emr_endpoint='emr-proxy')
     assert_equal(runner.make_emr_conn().endpoint, 'emr-proxy')
     assert_equal(runner.make_s3_conn().endpoint, 's3-proxy')
示例#25
0
def _maybe_terminate_clusters(dry_run=False,
                              max_hours_idle=None,
                              mins_to_end_of_hour=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = _DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_conn = runner.make_emr_conn()

    num_starting = 0
    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for cluster_summary in _yield_all_clusters(emr_conn):
        cluster_id = cluster_summary.id

        # check if cluster is done
        if _is_cluster_done(cluster_summary):
            num_done += 1
            continue

        # check if cluster is starting
        if _is_cluster_starting(cluster_summary):
            num_starting += 1
            continue

        # check if cluster is bootstrapping
        if _is_cluster_bootstrapping(cluster_summary):
            num_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        steps = _list_all_steps(emr_conn, cluster_id)

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        if _is_cluster_non_streaming(steps):
            num_non_streaming += 1
            continue

        if any(_is_step_running(step) for step in steps):
            num_running += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(cluster_summary, steps)
        time_to_end_of_hour = _est_time_to_hour(cluster_summary, now=now)
        is_pending = _cluster_has_pending_steps(steps)

        bootstrap_actions = list(
            _yield_all_bootstrap_actions(emr_conn, cluster_id))
        _, pool = _pool_hash_and_name(bootstrap_actions)

        if is_pending:
            num_pending += 1
        else:
            num_idle += 1

        log.debug('cluster %s %s for %s, %s to end of hour, %s (%s)' %
                  (cluster_id, 'pending' if is_pending else 'idle',
                   strip_microseconds(time_idle),
                   strip_microseconds(time_to_end_of_hour),
                   ('unpooled' if pool is None else 'in %s pool' % pool),
                   cluster_summary.name))

        # filter out clusters that don't meet our criteria
        if (max_hours_idle is not None
                and time_idle <= timedelta(hours=max_hours_idle)):
            continue

        # mins_to_end_of_hour doesn't apply to jobs with pending steps
        if (mins_to_end_of_hour is not None and
            (is_pending or
             time_to_end_of_hour >= timedelta(minutes=mins_to_end_of_hour))):
            continue

        if (pooled_only and pool is None):
            continue

        if (unpooled_only and pool is not None):
            continue

        if (pool_name is not None and pool != pool_name):
            continue

        # terminate idle cluster
        _terminate_and_notify(runner=runner,
                              cluster_id=cluster_id,
                              cluster_name=cluster_summary.name,
                              num_steps=len(steps),
                              is_pending=is_pending,
                              time_idle=time_idle,
                              time_to_end_of_hour=time_to_end_of_hour,
                              dry_run=dry_run,
                              max_mins_locked=max_mins_locked,
                              quiet=quiet)

    log.info('Cluster statuses: %d starting, %d bootstrapping, %d running,'
             ' %d pending, %d idle, %d active non-streaming, %d done' %
             (num_starting, num_bootstrapping, num_running, num_pending,
              num_idle, num_non_streaming, num_done))
示例#26
0
 def test_us_west_1(self):
     runner = EMRJobRunner(conf_path=False, aws_region='us-west-1')
     assert_equal(runner.make_emr_conn().endpoint,
                  'us-west-1.elasticmapreduce.amazonaws.com')
     assert_equal(runner.make_s3_conn().endpoint,
                  's3-us-west-1.amazonaws.com')