Exemplo n.º 1
0
def main(args=None):
    """Print a report on long-running jobs found on active EMR clusters.

    :param args: command-line arguments, handed straight to the option
                 parser's ``parse_args()``. Any positional argument left
                 over after parsing is reported as a usage error.
    """
    # snapshot the time up front; it is handed to
    # _find_long_running_jobs() below as *now*
    started_at = _boto3_now()

    parser = _make_option_parser()
    options, leftover_args = parser.parse_args(args)
    if leftover_args:
        parser.error('takes no arguments')

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)
    log.info('getting information about running jobs')

    # threshold passed on to _find_long_running_jobs()
    min_time = timedelta(hours=options.min_hours)

    runner = EMRJobRunner(**_runner_kwargs(options))
    emr_client = runner.make_emr_client()

    # only ask EMR for clusters in one of these states
    active_states = ['STARTING', 'BOOTSTRAPPING', 'RUNNING']
    cluster_summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters', ClusterStates=active_states)

    _print_report(_find_long_running_jobs(
        emr_client, cluster_summaries, min_time, now=started_at))
Exemplo n.º 2
0
def _maybe_terminate_clusters(dry_run=False,
                              max_mins_idle=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    """Look through WAITING/RUNNING clusters and terminate the idle ones.

    A cluster is handed to ``_terminate_and_notify()`` only if it has no
    running step, has been idle for longer than *max_mins_idle*, passes
    the pool filters, and is not termination-protected.

    :param dry_run: passed through to ``_terminate_and_notify()``
    :param max_mins_idle: idle time, in minutes, a cluster must exceed;
                          ``None`` means ``_DEFAULT_MAX_MINS_IDLE``
    :param now: reference time for computing idle time; defaults to
                ``_boto3_now()``
    :param pool_name: if not ``None``, only consider clusters in this pool
    :param pooled_only: if true, skip clusters that are not in a pool
    :param unpooled_only: if true, skip clusters that are in a pool
    :param max_mins_locked: passed through to ``_terminate_and_notify()``
    :param quiet: passed through to ``_terminate_and_notify()``
    :param kwargs: keyword arguments for the ``EMRJobRunner`` constructor
    """
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_mins_idle is None:
        max_mins_idle = _DEFAULT_MAX_MINS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    # how many clusters ended up in each category (for the summary line)
    tally = dict.fromkeys(
        ('starting', 'bootstrapping', 'running', 'pending', 'idle', 'done'),
        0)

    # include RUNNING to catch clusters with PENDING jobs that
    # never ran (see #365).
    summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters',
        ClusterStates=['WAITING', 'RUNNING'])

    for summary in summaries:
        cluster_id = summary['Id']

        # categories we can decide on from the summary alone
        if _is_cluster_done(summary):
            tally['done'] += 1
            continue

        if _is_cluster_starting(summary):
            tally['starting'] += 1
            continue

        if _is_cluster_bootstrapping(summary):
            tally['bootstrapping'] += 1
            continue

        # everything else needs the cluster's steps, in the reverse of
        # the order the API returned them
        steps = list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cluster_id))
        steps.reverse()

        if any(_is_step_running(step) for step in steps):
            tally['running'] += 1
            continue

        # no step is running, so the cluster is idle (possibly with
        # pending steps)
        time_idle = now - _time_last_active(summary, steps)
        is_pending = _cluster_has_pending_steps(steps)

        # the summary doesn't include tags; fetch the full cluster
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']
        _, pool = _pool_hash_and_name(cluster)

        status = 'pending' if is_pending else 'idle'
        tally[status] += 1

        idle_desc = strip_microseconds(time_idle)
        pool_desc = 'unpooled' if pool is None else 'in %s pool' % pool
        cluster_name = summary['Name']
        is_protected = cluster['TerminationProtected']

        log.debug('cluster %s %s for %s, %s (%s) - %s' % (
            cluster_id,
            status,
            idle_desc,
            pool_desc,
            cluster_name,
            'protected' if is_protected else 'unprotected',
        ))

        # hasn't been idle long enough yet
        if (max_mins_idle is not None and
                time_idle <= timedelta(minutes=max_mins_idle)):
            continue

        # doesn't match the pool filters we were given
        excluded_by_pool = (
            (pooled_only and pool is None) or
            (unpooled_only and pool is not None) or
            (pool_name is not None and pool != pool_name))

        if excluded_by_pool or is_protected:
            continue

        # cluster met every criterion; terminate it
        _terminate_and_notify(runner=runner,
                              cluster_id=cluster_id,
                              cluster_name=cluster_name,
                              num_steps=len(steps),
                              is_pending=is_pending,
                              time_idle=time_idle,
                              dry_run=dry_run,
                              max_mins_locked=max_mins_locked,
                              quiet=quiet)

    log.info('Cluster statuses: %d starting, %d bootstrapping, %d running,'
             ' %d pending, %d idle, %d done' %
             (tally['starting'], tally['bootstrapping'], tally['running'],
              tally['pending'], tally['idle'], tally['done']))
Exemplo n.º 3
0
    def setUp(self):
        # run the parent fixture first, then attach a runner bound to a
        # fixed job flow ID with no config files and no S3 sync wait
        super(LogFetchingTestCase, self).setUp()

        runner_opts = dict(conf_paths=[],
                           s3_sync_wait_time=0,
                           emr_job_flow_id='j-MOCKCLUSTER0')
        self.runner = EMRJobRunner(**runner_opts)
Exemplo n.º 4
0
def main(args=None):
    """Create a persistent job flow and print its ID to ``sys.stdout``.

    *args* is forwarded to ``runner_kwargs()`` to build the keyword
    arguments for ``EMRJobRunner`` (presumably falling back to
    ``sys.argv`` when it is ``None`` -- confirm in ``runner_kwargs()``).
    """
    emr_options = runner_kwargs(args)
    print(EMRJobRunner(**emr_options).make_persistent_job_flow())
Exemplo n.º 5
0
def inspect_and_maybe_terminate_job_flows(conf_paths=None,
                                          dry_run=False,
                                          max_hours_idle=None,
                                          mins_to_end_of_hour=None,
                                          now=None,
                                          pool_name=None,
                                          pooled_only=False,
                                          unpooled_only=False,
                                          max_mins_locked=None,
                                          quiet=False,
                                          **kwargs):
    """Classify every job flow and terminate the idle ones that qualify.

    Job flows that are done, bootstrapping, non-streaming, or running are
    only counted. The remaining (idle or pending) job flows are filtered
    by idle time, time to the end of the hour, and pool, and the
    survivors are handed to ``terminate_and_notify()``.

    :param conf_paths: passed to the ``EMRJobRunner`` constructor
    :param dry_run: passed through to ``terminate_and_notify()``
    :param max_hours_idle: idle time, in hours, a job flow must exceed.
                           If this and *mins_to_end_of_hour* are both
                           ``None``, ``DEFAULT_MAX_HOURS_IDLE`` is used.
    :param mins_to_end_of_hour: if set, skip job flows with pending steps
                                and job flows whose estimated time to the
                                end of the hour is this many minutes or
                                more
    :param now: reference time; defaults to ``datetime.utcnow()``
    :param pool_name: if not ``None``, only consider job flows in this
                      pool
    :param pooled_only: if true, skip job flows that are not in a pool
    :param unpooled_only: if true, skip job flows that are in a pool
    :param max_mins_locked: passed through to ``terminate_and_notify()``
    :param quiet: passed through to ``terminate_and_notify()``
    :param kwargs: extra keyword arguments for ``EMRJobRunner``
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(conf_paths=conf_paths, **kwargs)
    emr_conn = runner.make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # a list of tuples of (job flow, whether it has pending steps,
    # idle time, estimated time to end of hour); both times are timedeltas
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # check if job flow is bootstrapping
        elif is_job_flow_bootstrapping(jf):
            num_bootstrapping += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif not is_job_flow_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            time_idle = now - time_last_active(jf)
            time_to_end_of_hour = est_time_to_hour(jf, now=now)
            _, pool = pool_hash_and_name(jf)
            pending = job_flow_has_pending_steps(jf)

            if pending:
                num_pending += 1
            else:
                num_idle += 1

            log.debug('Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                      (jf.jobflowid, 'pending' if pending else 'idle',
                       strip_microseconds(time_idle),
                       strip_microseconds(time_to_end_of_hour),
                       ('unpooled' if pool is None else 'in %s pool' % pool),
                       jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None
                    and time_idle <= timedelta(hours=max_hours_idle)):
                continue

            # mins_to_end_of_hour doesn't apply to jobs with pending steps
            if (mins_to_end_of_hour is not None
                    and (pending or time_to_end_of_hour >=
                         timedelta(minutes=mins_to_end_of_hour))):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append((jf, pending, time_idle, time_to_end_of_hour))

    # the argument order must match the labels in the format string
    # (bootstrapping first, then running)
    log.info(
        'Job flow statuses: %d bootstrapping, %d running, %d pending, %d idle,'
        ' %d active non-streaming, %d done' %
        (num_bootstrapping, num_running, num_pending, num_idle,
         num_non_streaming, num_done))

    terminate_and_notify(runner,
                         to_terminate,
                         dry_run=dry_run,
                         max_mins_locked=max_mins_locked,
                         quiet=quiet)
Exemplo n.º 6
0
 def test_ap_southeast_1(self):
     # S3 should resolve to the region-specific endpoint, while asking
     # for an EMR connection in this region is expected to raise
     region = 'ap-southeast-1'
     runner = EMRJobRunner(conf_path=False, aws_region=region)

     s3_endpoint = runner.make_s3_conn().endpoint
     assert_equal(s3_endpoint, 's3-ap-southeast-1.amazonaws.com')

     assert_raises(Exception, runner.make_emr_conn)
Exemplo n.º 7
0
 def make_runner(self):
     # build a runner that reads no config file and does not wait on S3,
     # then point its job log URI at the test log directory
     runner_opts = dict(conf_path=False,
                        s3_scratch_uri='s3://walrus/tmp',
                        s3_sync_wait_time=0)
     self.runner = EMRJobRunner(**runner_opts)

     log_uri = BUCKET_URI + LOG_DIR
     self.runner._s3_job_log_uri = log_uri
Exemplo n.º 8
0
def main(args=None):
    """Create a persistent cluster and print its ID to ``sys.stdout``.

    *args* is forwarded to ``_runner_kwargs()`` to build the keyword
    arguments for ``EMRJobRunner`` (presumably falling back to
    ``sys.argv`` when it is ``None`` -- confirm in ``_runner_kwargs()``).
    """
    emr_options = _runner_kwargs(args)
    print(EMRJobRunner(**emr_options).make_persistent_cluster())
Exemplo n.º 9
0
def main():
    """Command-line entry point: list, cat, and parse logs of a job flow.

    Options are parsed from the process's command line
    (``parse_args()`` is called without arguments). The first positional
    argument is the job flow ID; an ``EMRJobRunner`` is built for it and
    each action requested through the flags is run in turn.

    Exits with a usage error if no JOB_FLOW_ID is given.
    """
    usage = 'usage: %prog [options] JOB_FLOW_ID'
    description = (
        'List, display, and parse Hadoop logs associated with EMR job flows.'
        ' Useful for debugging failed jobs for which mrjob did not display a'
        ' useful error message or for inspecting jobs whose output has been'
        ' lost.')

    option_parser = OptionParser(usage=usage, description=description)

    option_parser.add_option('-f', '--find-failure', dest='find_failure',
                             action='store_true', default=False,
                             help=('Search the logs for information about why'
                                   ' the job failed'))
    option_parser.add_option('-l', '--list', dest='list_relevant',
                             action="store_true", default=False,
                             help='List log files MRJob finds relevant')

    option_parser.add_option('-L', '--list-all', dest='list_all',
                             action="store_true", default=False,
                             help='List all log files')

    option_parser.add_option('-a', '--cat', dest='cat_relevant',
                             action="store_true", default=False,
                             help='Cat log files MRJob finds relevant')

    option_parser.add_option('-A', '--cat-all', dest='cat_all',
                             action="store_true", default=False,
                             help='Cat all log files to JOB_FLOW_ID/')

    option_parser.add_option('-s', '--step-num', dest='step_num',
                             action='store', type='int', default=None,
                             help=('Limit results to a single step. To be used'
                                   ' with --list and --cat.'))
    option_parser.add_option('--counters', dest='get_counters',
                             action='store_true', default=False,
                             help='Show counters from the job flow')

    # options to copy from MRJob's own option groups onto our parser
    assignments = {
        option_parser: ('conf_path', 'quiet', 'verbose',
                        'ec2_key_pair_file')
    }

    mr_job = MRJob()
    job_option_groups = (mr_job.option_parser, mr_job.mux_opt_group,
                         mr_job.proto_opt_group, mr_job.runner_opt_group,
                         mr_job.hadoop_emr_opt_group, mr_job.emr_opt_group,
                         mr_job.hadoop_opts_opt_group)
    scrape_options_into_new_groups(job_option_groups, assignments)

    options, args = option_parser.parse_args()

    # report a missing JOB_FLOW_ID as a usage error instead of letting
    # args[0] below fail with an IndexError
    if not args:
        option_parser.error('You must specify a JOB_FLOW_ID')

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # note: this is a truthiness test, so a step number of 0 is treated
    # the same as no step number at all
    if options.step_num:
        step_nums = [options.step_num]
    else:
        step_nums = None

    # everything left in the options after removing this tool's own
    # flags is passed to the EMRJobRunner constructor
    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('quiet', 'verbose', 'list_relevant', 'list_all',
                       'cat_relevant', 'cat_all', 'get_counters', 'step_num',
                       'find_failure'):
        del runner_kwargs[unused_arg]

    with EMRJobRunner(emr_job_flow_id=args[0], **runner_kwargs) as runner:
        if options.list_relevant:
            list_relevant(runner, step_nums)

        if options.list_all:
            list_all(runner)

        if options.cat_relevant:
            cat_relevant(runner, step_nums)

        if options.cat_all:
            cat_all(runner)

        if options.get_counters:
            desc = runner._describe_jobflow()
            runner._set_s3_job_log_uri(desc)
            # fetch counters for steps 1 through the number of steps in
            # the job flow description
            runner._fetch_counters(
                xrange(1, len(desc.steps) + 1), skip_s3_wait=True)
            runner.print_counters()

        if options.find_failure:
            find_failure(runner, options.step_num)
Exemplo n.º 10
0
    def test_pick_scratch_uri(self):
        # with two mock buckets available, the runner's scratch URI
        # should land in the 'mrjob-walrus' one
        mock_buckets = {'mrjob-walrus': {}, 'zebra': {}}
        self.add_mock_s3_data(mock_buckets)

        scratch_uri = EMRJobRunner(conf_path=False)._opts['s3_scratch_uri']
        assert_equal(scratch_uri, 's3://mrjob-walrus/tmp/')
Exemplo n.º 11
0
 def test_explicit_endpoints(self):
     # endpoints given explicitly should be used as-is, even though a
     # region is also set
     runner_opts = dict(conf_path=False,
                        aws_region='EU',
                        s3_endpoint='s3-proxy',
                        emr_endpoint='emr-proxy')
     runner = EMRJobRunner(**runner_opts)

     emr_endpoint = runner.make_emr_conn().endpoint
     assert_equal(emr_endpoint, 'emr-proxy')

     s3_endpoint = runner.make_s3_conn().endpoint
     assert_equal(s3_endpoint, 's3-proxy')
Exemplo n.º 12
0
def _maybe_terminate_clusters(dry_run=False,
                              max_hours_idle=None,
                              mins_to_end_of_hour=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    """Look through all clusters and terminate the idle ones that qualify.

    Clusters that are done, starting, bootstrapping, or have a running
    step are only counted. The rest are filtered by idle time, time to
    the end of the hour, and pool; the survivors are handed to
    ``_terminate_and_notify()``.

    :param dry_run: passed through to ``_terminate_and_notify()``
    :param max_hours_idle: idle time, in hours, a cluster must exceed.
                           If this and *mins_to_end_of_hour* are both
                           ``None``, ``_DEFAULT_MAX_HOURS_IDLE`` is used.
    :param mins_to_end_of_hour: if set, skip clusters with pending steps
                                and clusters whose estimated time to the
                                end of the hour is this many minutes or
                                more
    :param now: reference time; defaults to ``_boto3_now()``
    :param pool_name: if not ``None``, only consider clusters in this pool
    :param pooled_only: if true, skip clusters that are not in a pool
    :param unpooled_only: if true, skip clusters that are in a pool
    :param max_mins_locked: passed through to ``_terminate_and_notify()``
    :param quiet: passed through to ``_terminate_and_notify()``
    :param kwargs: keyword arguments for the ``EMRJobRunner`` constructor
    """
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = _DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    # per-category cluster counts for the summary line at the end
    counts = {
        'starting': 0,
        'bootstrapping': 0,
        'running': 0,
        'pending': 0,
        'idle': 0,
        'done': 0,
    }

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    all_summaries = _boto3_paginate('Clusters', emr_client, 'list_clusters')

    for summary in all_summaries:
        cluster_id = summary['Id']

        # categories that can be decided from the summary alone
        if _is_cluster_done(summary):
            counts['done'] += 1
            continue

        if _is_cluster_starting(summary):
            counts['starting'] += 1
            continue

        if _is_cluster_bootstrapping(summary):
            counts['bootstrapping'] += 1
            continue

        # anything further requires the cluster's steps, in the reverse
        # of the order the API returned them
        steps = list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cluster_id))
        steps.reverse()

        if any(_is_step_running(step) for step in steps):
            counts['running'] += 1
            continue

        # no step is running: the cluster is idle (or has pending steps)
        time_idle = now - _time_last_active(summary, steps)
        time_to_end_of_hour = _est_time_to_hour(summary, now=now)
        is_pending = _cluster_has_pending_steps(steps)

        # tags aren't in the summary, so fetch the full cluster
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']
        _, pool = _pool_hash_and_name(cluster)

        status = 'pending' if is_pending else 'idle'
        counts[status] += 1

        idle_desc = strip_microseconds(time_idle)
        to_hour_desc = strip_microseconds(time_to_end_of_hour)
        pool_desc = 'unpooled' if pool is None else 'in %s pool' % pool
        cluster_name = summary['Name']

        log.debug('cluster %s %s for %s, %s to end of hour, %s (%s)' %
                  (cluster_id, status, idle_desc, to_hour_desc, pool_desc,
                   cluster_name))

        # hasn't been idle long enough yet
        if (max_hours_idle is not None and
                time_idle <= timedelta(hours=max_hours_idle)):
            continue

        # mins_to_end_of_hour doesn't apply to jobs with pending steps
        if mins_to_end_of_hour is not None:
            if (is_pending or time_to_end_of_hour >=
                    timedelta(minutes=mins_to_end_of_hour)):
                continue

        # doesn't match the pool filters we were given
        excluded_by_pool = (
            (pooled_only and pool is None) or
            (unpooled_only and pool is not None) or
            (pool_name is not None and pool != pool_name))

        if excluded_by_pool:
            continue

        # cluster met every criterion; terminate it
        _terminate_and_notify(runner=runner,
                              cluster_id=cluster_id,
                              cluster_name=cluster_name,
                              num_steps=len(steps),
                              is_pending=is_pending,
                              time_idle=time_idle,
                              time_to_end_of_hour=time_to_end_of_hour,
                              dry_run=dry_run,
                              max_mins_locked=max_mins_locked,
                              quiet=quiet)

    log.info('Cluster statuses: %d starting, %d bootstrapping, %d running,'
             ' %d pending, %d idle, %d done' %
             (counts['starting'], counts['bootstrapping'], counts['running'],
              counts['pending'], counts['idle'], counts['done']))
Exemplo n.º 13
0
def main():
    """Command-line entry point for inspecting job flow pools.

    Parses options from the process's command line, builds an
    ``EMRJobRunner`` from them, and then, depending on the flags given,
    lists the pools, prints the job flow that would be chosen for the
    given configuration, and/or terminates job flows.
    """
    usage = '%prog [options]'
    description = (
        'Inspect available job flow pools or identify job flows suitable for'
        ' running a job with the specified options.')
    option_parser = OptionParser(usage=usage, description=description)

    # register an extra field on boto's JobFlow before it is used
    import boto.emr.connection
    boto.emr.connection.JobFlow.Fields.add('HadoopVersion')

    def make_option_group(title):
        # create a titled option group and attach it to our parser
        group = OptionGroup(option_parser, title)
        option_parser.add_option_group(group)
        return group

    ec2_opt_group = make_option_group('EC2 instance configuration')
    hadoop_opt_group = make_option_group('Hadoop configuration')
    job_opt_group = make_option_group('Job flow configuration')

    # which of MRJob's options (by dest) go into which parser/group
    assignments = {
        option_parser: (
            'conf_path',
            'emr_job_flow_pool_name',
            'quiet',
            'verbose',
        ),
        ec2_opt_group: (
            'aws_availability_zone',
            'ec2_instance_type',
            'ec2_key_pair',
            'ec2_key_pair_file',
            'ec2_master_instance_type',
            'ec2_slave_instance_type',
            'emr_endpoint',
            'num_ec2_instances',
        ),
        hadoop_opt_group: (
            'hadoop_version',
            'label',
            'owner',
        ),
        job_opt_group: (
            'bootstrap_actions',
            'bootstrap_cmds',
            'bootstrap_files',
            'bootstrap_mrjob',
            'bootstrap_python_packages',
        ),
    }

    option_parser.add_option('-a', '--all', action='store_true',
                             default=False, dest='list_all',
                             help=('List all available job flows without'
                                   ' filtering by configuration'))
    option_parser.add_option('-f', '--find', action='store_true',
                             default=False, dest='find',
                             help=('Find a job flow matching the pool name,'
                                   ' bootstrap configuration, and instance'
                                   ' number/type as specified on the command'
                                   ' line and in the configuration files'))
    # NOTE(review): the metavar says JOB_FLOW_ID but the help text talks
    # about a pool -- confirm which one terminate() actually expects
    option_parser.add_option('-t', '--terminate', action='store',
                             default=None, dest='terminate',
                             metavar='JOB_FLOW_ID',
                             help=('Terminate all job flows in the given pool'
                                   ' (defaults to pool "default")'))

    # Scrape options from MRJob and index them by dest
    mr_job = MRJob()
    scrape_options_into_new_groups(mr_job.all_option_groups(), assignments)
    options, args = option_parser.parse_args()

    log_to_stream(name='mrjob', debug=options.verbose)

    # whatever remains after dropping this tool's own flags is passed to
    # the EMRJobRunner constructor
    tool_only_opts = ('quiet', 'verbose', 'list_all', 'find', 'terminate')
    runner_kwargs = options.__dict__.copy()
    for opt_name in tool_only_opts:
        del runner_kwargs[opt_name]

    runner = EMRJobRunner(**runner_kwargs)

    if options.list_all:
        pprint_pools(runner)

    if options.find:
        candidates = runner.usable_job_flows()

        if not candidates:
            print('No idle job flows match criteria')
        else:
            # the last entry of the sorted list is the one to recommend
            print('You should use this one:')
            pprint_job_flow(candidates[-1])

    if options.terminate:
        terminate(runner, options.terminate)
Exemplo n.º 14
0
def inspect_and_maybe_terminate_job_flows(
    conf_path=None,
    dry_run=False,
    max_hours_idle=None,
    mins_to_end_of_hour=None,
    now=None,
    pool_name=None,
    pooled_only=False,
    unpooled_only=False,
):
    """Classify every job flow and terminate the idle ones that qualify.

    Job flows that are done, non-streaming, or running are only counted.
    Idle job flows are filtered by idle time, time to the end of the
    hour, and pool; the survivors are handed to
    ``terminate_and_notify()``.

    :param conf_path: passed to the ``EMRJobRunner`` constructor
    :param dry_run: passed through to ``terminate_and_notify()``
    :param max_hours_idle: idle time, in hours, a job flow must exceed.
                           If this and *mins_to_end_of_hour* are both
                           ``None``, ``DEFAULT_MAX_HOURS_IDLE`` is used.
    :param mins_to_end_of_hour: if set, skip job flows whose time to the
                                end of the hour is this many minutes or
                                more
    :param now: reference time; defaults to ``datetime.utcnow()``
    :param pool_name: if not ``None``, only consider job flows in this
                      pool
    :param pooled_only: if true, skip job flows that are not in a pool
    :param unpooled_only: if true, skip job flows that are in a pool
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    emr_conn = EMRJobRunner(conf_path=conf_path).make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    all_job_flows = describe_all_job_flows(emr_conn)

    # per-category job flow counts for the summary line at the end
    tally = {'running': 0, 'idle': 0, 'non_streaming': 0, 'done': 0}

    # tuples of (job flow id, name, idle time, time to end of hour);
    # both times are timedeltas
    to_terminate = []

    for job_flow in all_job_flows:

        if is_job_flow_done(job_flow):
            tally['done'] += 1
            continue

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        if is_job_flow_non_streaming(job_flow):
            tally['non_streaming'] += 1
            continue

        if is_job_flow_running(job_flow):
            tally['running'] += 1
            continue

        # anything left over is idle
        tally['idle'] += 1
        time_idle = time_job_flow_idle(job_flow, now=now)
        time_to_end_of_hour = time_to_end_of_hour_for_job_flow(
            job_flow, now=now)
        pool = job_flow_pool_name(job_flow)

        pool_desc = 'unpooled' if pool is None else 'in %s pool' % pool
        log.debug(
            'Job flow %-15s idle for %s, %s to end of hour, %s (%s)' %
            (job_flow.jobflowid, strip_microseconds(time_idle),
             strip_microseconds(time_to_end_of_hour),
             pool_desc,
             job_flow.name))

        # hasn't been idle long enough yet
        if (max_hours_idle is not None and
                time_idle <= timedelta(hours=max_hours_idle)):
            continue

        # too far from the end of the hour
        if (mins_to_end_of_hour is not None and
                time_to_end_of_hour >=
                timedelta(minutes=mins_to_end_of_hour)):
            continue

        # doesn't match the pool filters we were given
        excluded_by_pool = (
            (pooled_only and pool is None) or
            (unpooled_only and pool is not None) or
            (pool_name is not None and pool != pool_name))

        if excluded_by_pool:
            continue

        to_terminate.append(
            (job_flow.jobflowid, job_flow.name, time_idle,
             time_to_end_of_hour))

    log.info('Job flow statuses: %d running, %d idle, %d active non-streaming,'
             ' %d done' % (tally['running'], tally['idle'],
                           tally['non_streaming'], tally['done']))

    terminate_and_notify(emr_conn, to_terminate, dry_run=dry_run)
Exemplo n.º 15
0
 def test_no_region(self):
     # with no region configured, both connections should use the
     # region-less endpoints and the runner's region should be empty
     runner = EMRJobRunner(conf_path=False)

     emr_endpoint = runner.make_emr_conn().endpoint
     assert_equal(emr_endpoint, 'elasticmapreduce.amazonaws.com')

     s3_endpoint = runner.make_s3_conn().endpoint
     assert_equal(s3_endpoint, 's3.amazonaws.com')

     assert_equal(runner._aws_region, '')
Exemplo n.º 16
0
def print_report(options):
    """Print a usage report for EMR job flows to stdout.

    Fetches job flow descriptions, computes per-job-flow usage and an
    estimate of time billed but not used, and prints totals, daily
    statistics, rankings by job and by user, and a details table.

    :param options: object with a ``conf_path`` attribute (passed to
                    ``EMRJobRunner``) and a ``max_days_ago`` attribute
                    (if not ``None``, only job flows created within that
                    many days are fetched)
    """

    emr_conn = EMRJobRunner(conf_path=options.conf_path).make_emr_conn()

    log.info('getting job flow history...')
    # microseconds just make our report messy
    now = datetime.datetime.utcnow().replace(microsecond=0)

    # if --max-days-ago is set, only look at recent jobs
    created_after = None
    if options.max_days_ago is not None:
        created_after = now - datetime.timedelta(days=options.max_days_ago)

    job_flows = describe_all_job_flows(emr_conn, created_after=created_after)

    # one dict of report fields per job flow
    job_flow_infos = []
    for jf in job_flows:
        job_flow_info = {}

        job_flow_info['id'] = jf.jobflowid

        job_flow_info['name'] = jf.name

        job_flow_info['created'] = to_datetime(jf.creationdatetime)

        # a job flow without a start time has run for no time at all;
        # one without an end time is measured up to now
        start_time = to_datetime(getattr(jf, 'startdatetime', None))
        if start_time:
            end_time = to_datetime(getattr(jf, 'enddatetime', None)) or now
            job_flow_info['ran'] = end_time - start_time
        else:
            job_flow_info['ran'] = datetime.timedelta(0)

        job_flow_info['state'] = jf.state

        job_flow_info['num_steps'] = len(jf.steps or [])

        # this looks to be an integer, but let's protect against
        # future changes
        job_flow_info['hours'] = float(jf.normalizedinstancehours)

        # estimate hours billed but not used
        job_flow_info['hours_bbnu'] = (
            job_flow_info['hours'] *
            estimate_proportion_billed_but_not_used(jf))

        # split out mr job name and user
        # jobs flows created by MRJob have names like:
        # mr_word_freq_count.dave.20101103.121249.638552
        match = JOB_NAME_RE.match(jf.name)
        if match:
            job_flow_info['mr_job_name'] = match.group(1)
            job_flow_info['user'] = match.group(2)
        else:
            # not run by mrjob
            job_flow_info['mr_job_name'] = None
            job_flow_info['user'] = None

        job_flow_infos.append(job_flow_info)

    # NOTE(review): this message mentions "two months" even when
    # options.max_days_ago narrowed the query -- confirm that is intended
    if not job_flow_infos:
        print 'No job flows created in the past two months!'
        return

    # creation times of the oldest and newest job flows in the report
    earliest = min(info['created'] for info in job_flow_infos)
    latest = max(info['created'] for info in job_flow_infos)

    print 'Total # of Job Flows: %d' % len(job_flow_infos)
    print

    print '* All times are in UTC.'
    print

    print 'Min create time: %s' % earliest
    print 'Max create time: %s' % latest
    print '   Current time: %s' % now
    print

    print '* All usage is measured in Normalized Instance Hours, which are'
    print '  roughly equivalent to running an m1.small instance for an hour.'
    print

    # total compute-unit hours used
    total_hours = sum(info['hours'] for info in job_flow_infos)
    print 'Total Usage: %d' % total_hours
    print

    print '* Time billed but not used is estimated, and may not match'
    print "  Amazon's billing system exactly."
    print

    total_hours_bbnu = sum(info['hours_bbnu'] for info in job_flow_infos)
    print 'Total time billed but not used (waste): %.2f' % total_hours_bbnu
    print

    # usage and waste, summed by the date each job flow was created
    date_to_hours = defaultdict(float)
    date_to_hours_bbnu = defaultdict(float)
    for info in job_flow_infos:
        date_created = info['created'].date()
        date_to_hours[date_created] += info['hours']
        date_to_hours_bbnu[date_created] += info['hours_bbnu']
    print 'Daily statistics:'
    print
    print ' date        usage     waste'
    # walk backwards one day at a time from the newest creation date to
    # the oldest; days with no job flows print as zero (defaultdict)
    d = latest.date()
    while d >= earliest.date():
        print ' %10s %6d %9.2f' % (d, date_to_hours[d], date_to_hours_bbnu[d])
        d -= datetime.timedelta(days=1)
    print

    def fmt(mr_job_name_or_user):
        # display placeholder for job flows whose name did not match
        # JOB_NAME_RE (job name and user are None for those)
        if mr_job_name_or_user:
            return mr_job_name_or_user
        else:
            return '(not started by mrjob)'

    print '* Job flows are considered to belong to the user and job that'
    print '  started them (even if other jobs use the job flow).'
    print

    # Top jobs
    # (each ranking below sorts by hours descending, then by name)
    print 'Top jobs, by total usage:'
    mr_job_name_to_hours = defaultdict(float)
    for info in job_flow_infos:
        mr_job_name_to_hours[info['mr_job_name']] += info['hours']
    for mr_job_name, hours in sorted(mr_job_name_to_hours.iteritems(),
                                     key=lambda (n, h): (-h, n)):
        print '  %6d %s' % (hours, fmt(mr_job_name))
    print

    print 'Top jobs, by time billed but not used:'
    mr_job_name_to_hours_bbnu = defaultdict(float)
    for info in job_flow_infos:
        mr_job_name_to_hours_bbnu[info['mr_job_name']] += info['hours_bbnu']
    for mr_job_name, hours_bbnu in sorted(
            mr_job_name_to_hours_bbnu.iteritems(), key=lambda (n, h): (-h, n)):
        print '  %9.2f %s' % (hours_bbnu, fmt(mr_job_name))
    print

    # Top users
    print 'Top users, by total usage:'
    user_to_hours = defaultdict(float)
    for info in job_flow_infos:
        user_to_hours[info['user']] += info['hours']
    for user, hours in sorted(user_to_hours.iteritems(),
                              key=lambda (n, h): (-h, n)):
        print '  %6d %s' % (hours, fmt(user))
    print

    print 'Top users, by time billed but not used:'
    user_to_hours_bbnu = defaultdict(float)
    for info in job_flow_infos:
        user_to_hours_bbnu[info['user']] += info['hours_bbnu']
    for user, hours_bbnu in sorted(user_to_hours_bbnu.iteritems(),
                                   key=lambda (n, h): (-h, n)):
        print '  %9.2f %s' % (hours_bbnu, fmt(user))
    print

    # Top job flows
    print 'All job flows, by total usage:'
    top_job_flows = sorted(job_flow_infos,
                           key=lambda i: (-i['hours'], i['name']))
    for info in top_job_flows:
        print '  %6d %-15s %s' % (info['hours'], info['id'], info['name'])
    print

    print 'All job flows, by time billed but not used:'
    top_job_flows_bbnu = sorted(job_flow_infos,
                                key=lambda i: (-i['hours_bbnu'], i['name']))
    for info in top_job_flows_bbnu:
        print '  %9.2f %-15s %s' % (info['hours_bbnu'], info['id'],
                                    info['name'])
    print

    # full details table, most recently created job flow first
    print 'Details for all job flows:'
    print
    print ' id              state         created             steps        time ran  usage     waste   user   name'

    all_job_flows = sorted(job_flow_infos,
                           key=lambda i: i['created'],
                           reverse=True)
    for info in all_job_flows:
        print ' %-15s %-13s %19s %3d %17s %6d %9.2f %8s %s' % (
            info['id'], info['state'], info['created'], info['num_steps'],
            info['ran'], info['hours'], info['hours_bbnu'],
            (info['user'] or ''), fmt(info['mr_job_name']))
Exemplo n.º 17
0
 def test_us_west_1(self):
     # both EMR and S3 should resolve to their us-west-1 endpoints
     region = 'us-west-1'
     runner = EMRJobRunner(conf_path=False, aws_region=region)

     emr_endpoint = runner.make_emr_conn().endpoint
     assert_equal(emr_endpoint, 'us-west-1.elasticmapreduce.amazonaws.com')

     s3_endpoint = runner.make_s3_conn().endpoint
     assert_equal(s3_endpoint, 's3-us-west-1.amazonaws.com')
Exemplo n.º 18
0
def _maybe_terminate_clusters(dry_run=False,
                              max_hours_idle=None,
                              mins_to_end_of_hour=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    """Walk over every cluster, classify it, and pass each idle or
    pending cluster that survives all the filters below to
    ``_terminate_and_notify()``. A one-line summary of how many clusters
    fell into each category is logged at the end.

    dry_run, max_mins_locked, quiet -- forwarded unchanged to
        ``_terminate_and_notify()``
    max_hours_idle -- if set, leave alone clusters that have been idle
        for this many hours or less. When both this and
        *mins_to_end_of_hour* are ``None``, it falls back to
        ``_DEFAULT_MAX_HOURS_IDLE``.
    mins_to_end_of_hour -- if set, leave alone clusters with pending
        steps, and clusters whose estimated time to the end of the hour
        is this many minutes or more
    now -- reference time for the idle calculations; defaults to
        ``datetime.utcnow()``
    pool_name -- if set, leave alone clusters whose pool has a different
        name (including unpooled clusters)
    pooled_only -- leave alone clusters that aren't in a pool
    unpooled_only -- leave alone clusters that are in a pool
    kwargs -- passed through to the ``EMRJobRunner`` constructor
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior: with no criteria at all, use the idle cutoff
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = _DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_conn = runner.make_emr_conn()

    # per-category cluster counts, reported in this order at the end
    categories = ('starting', 'bootstrapping', 'running', 'pending',
                  'idle', 'non_streaming', 'done')
    tally = dict.fromkeys(categories, 0)

    # No filtering by cluster state here, so this keeps working even
    # if Amazon adds another kind of idle state.
    for summary in _yield_all_clusters(emr_conn):
        cluster_id = summary.id

        # clusters that are done, starting or bootstrapping are only
        # counted; nothing else is looked up for them
        if _is_cluster_done(summary):
            early_state = 'done'
        elif _is_cluster_starting(summary):
            early_state = 'starting'
        elif _is_cluster_bootstrapping(summary):
            early_state = 'bootstrapping'
        else:
            early_state = None

        if early_state is not None:
            tally[early_state] += 1
            continue

        # everything past this point needs the cluster's steps
        cluster_steps = _list_all_steps(emr_conn, cluster_id)

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        if _is_cluster_non_streaming(cluster_steps):
            tally['non_streaming'] += 1
            continue

        if any(_is_step_running(s) for s in cluster_steps):
            tally['running'] += 1
            continue

        # nothing is running, so the cluster is either pending or idle
        last_active = _time_last_active(summary, cluster_steps)
        time_idle = now - last_active
        time_to_end_of_hour = _est_time_to_hour(summary, now=now)
        is_pending = _cluster_has_pending_steps(cluster_steps)

        _pool_hash, pool = _pool_hash_and_name(
            list(_yield_all_bootstrap_actions(emr_conn, cluster_id)))

        status = 'pending' if is_pending else 'idle'
        tally[status] += 1

        idle_text = strip_microseconds(time_idle)
        to_hour_text = strip_microseconds(time_to_end_of_hour)
        pool_text = 'unpooled' if pool is None else 'in %s pool' % pool
        log.debug('cluster %s %s for %s, %s to end of hour, %s (%s)' %
                  (cluster_id, status, idle_text, to_hour_text, pool_text,
                   summary.name))

        # --- filters: any match means the cluster is left running ---

        too_recently_active = (
            max_hours_idle is not None and
            time_idle <= timedelta(hours=max_hours_idle))
        if too_recently_active:
            continue

        # mins_to_end_of_hour doesn't apply to jobs with pending steps
        if mins_to_end_of_hour is not None:
            if is_pending:
                continue
            if time_to_end_of_hour >= timedelta(minutes=mins_to_end_of_hour):
                continue

        wrong_pool = (
            (pooled_only and pool is None) or
            (unpooled_only and pool is not None) or
            (pool_name is not None and pool != pool_name))
        if wrong_pool:
            continue

        # cluster passed every filter
        _terminate_and_notify(runner=runner,
                              cluster_id=cluster_id,
                              cluster_name=summary.name,
                              num_steps=len(cluster_steps),
                              is_pending=is_pending,
                              time_idle=time_idle,
                              time_to_end_of_hour=time_to_end_of_hour,
                              dry_run=dry_run,
                              max_mins_locked=max_mins_locked,
                              quiet=quiet)

    log.info('Cluster statuses: %d starting, %d bootstrapping, %d running,'
             ' %d pending, %d idle, %d active non-streaming, %d done' %
             tuple(tally[name] for name in categories))
Exemplo n.º 19
0
    def test_cleanup(self):
        """``_s3_cleanup()`` should remove only keys under the given
        prefix that are older than the cutoff, and remove nothing at all
        when called with ``dry_run=True``."""
        runner = EMRJobRunner(conf_paths=[], cloud_fs_sync_secs=0.01)

        # --- mock data ---

        # foo is current
        self.add_mock_s3_data({'walrus': {'data/foo': b'foo\n'}})

        # bar and baz are very old (but baz isn't in data/)
        very_old = {'walrus': {'data/bar': b'bar\n', 'other/baz': b'baz\n'}}
        self.add_mock_s3_data(very_old, age=timedelta(days=45))

        # qux is a little more than two days old
        self.add_mock_s3_data(
            {'walrus': {'data/qux': b'qux\n'}}, age=timedelta(hours=50))

        bar_uri = 's3://walrus/data/bar'
        foo_uri = 's3://walrus/data/foo'
        qux_uri = 's3://walrus/data/qux'
        baz_uri = 's3://walrus/other/baz'

        def bucket_listing():
            # everything currently in the bucket, in sorted order
            return sorted(runner.fs.ls('s3://walrus/'))

        # sanity check: all four keys are there to begin with
        self.assertEqual(bucket_listing(),
                         [bar_uri, foo_uri, qux_uri, baz_uri])

        # try a dry run, which shouldn't delete anything
        _s3_cleanup('s3://walrus/data/',
                    timedelta(days=30),
                    dry_run=True,
                    conf_paths=[])
        self.assertEqual(bucket_listing(),
                         [bar_uri, foo_uri, qux_uri, baz_uri])

        # now do it for real. should hit bar (baz isn't in data/)
        _s3_cleanup('s3://walrus/data', timedelta(days=30), conf_paths=[])
        self.assertEqual(bucket_listing(), [foo_uri, qux_uri, baz_uri])

        # now try to delete qux too
        _s3_cleanup('s3://walrus/data', timedelta(hours=48), conf_paths=[])
        self.assertEqual(bucket_listing(), [foo_uri, baz_uri])