Exemplo n.º 1
0
def collect_active_job_flows(conf_paths):
    """Fetch descriptions of the job flows that are currently active on EMR.

    "Active" here means the job flow is in one of the ``STARTING``,
    ``BOOTSTRAPPING``, ``WAITING`` or ``RUNNING`` states.

    :param conf_paths: passed straight through to ``EMRJobRunner`` as its
                       ``conf_paths`` argument (presumably the
                       :py:mod:`mrjob.conf` path(s) to read -- confirm
                       against the runner's documentation)

    Returns whatever ``describe_all_job_flows()`` returns for those states.
    """
    runner = EMRJobRunner(conf_paths=conf_paths)
    return describe_all_job_flows(
        runner.make_emr_conn(),
        states=['STARTING', 'BOOTSTRAPPING', 'WAITING', 'RUNNING'])
Exemplo n.º 2
0
def collect_active_job_flows(conf_paths):
    """Return the job flows that EMR reports as still active.

    Only job flows in the ``STARTING``, ``BOOTSTRAPPING``, ``WAITING`` or
    ``RUNNING`` states are requested.

    :param conf_paths: handed unchanged to ``EMRJobRunner`` as
                       ``conf_paths`` (presumably the :py:mod:`mrjob.conf`
                       path(s) to read -- confirm against the runner's
                       documentation)

    The return value is the result of ``describe_all_job_flows()``.
    """
    wanted_states = ['STARTING', 'BOOTSTRAPPING', 'WAITING', 'RUNNING']
    connection = EMRJobRunner(conf_paths=conf_paths).make_emr_conn()
    return describe_all_job_flows(connection, states=wanted_states)
Exemplo n.º 3
0
def inspect_and_maybe_terminate_job_flows(
        conf_path, max_hours_idle, now, dry_run):
    """Tally all job flows by status and terminate those idle for too long.

    :param conf_path: forwarded to ``EMRJobRunner`` as ``conf_path``
    :param max_hours_idle: job flows idle for strictly more than this many
                           hours are queued for termination
    :param now: reference time handed to ``time_job_flow_idle()``
    :param dry_run: forwarded to ``terminate_and_notify()``
    """
    emr_conn = EMRJobRunner(conf_path=conf_path).make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')

    done_count = 0
    non_streaming_count = 0
    running_count = 0
    idle_count = 0
    # (job flow id, name, idle time as a timedelta) per flow to terminate
    candidates = []

    for flow in describe_all_job_flows(emr_conn):
        if is_job_flow_done(flow):
            done_count += 1
            continue

        # idleness of non-streaming jobs can't be determined reliably, so
        # they are left alone (see Issue #60)
        if is_job_flow_non_streaming(flow):
            non_streaming_count += 1
            continue

        if is_job_flow_running(flow):
            running_count += 1
            continue

        # anything that reaches this point is treated as idle
        idle_count += 1
        raw_idle = time_job_flow_idle(flow, now=now)
        # drop the sub-second part of the idle time
        idle_for = timedelta(raw_idle.days, raw_idle.seconds)

        log.debug('Job flow %s (%s) idle for %s' %
                  (flow.jobflowid, flow.name, idle_for))

        if idle_for > timedelta(hours=max_hours_idle):
            candidates.append((flow.jobflowid, flow.name, idle_for))

    summary = (
        'Job flow statuses: %d running, %d idle, %d active non-streaming,'
        ' %d done' % (
            running_count, idle_count, non_streaming_count, done_count))
    log.info(summary)

    terminate_and_notify(emr_conn, candidates, dry_run=dry_run)
Exemplo n.º 4
0
def inspect_and_maybe_terminate_job_flows(
        conf_path, max_hours_idle, now, dry_run):
    """Count job flows per status and terminate the ones idle too long.

    :param conf_path: forwarded to ``EMRJobRunner`` as ``conf_path``
    :param max_hours_idle: a job flow is queued for termination when its
                           idle time is strictly greater than this many
                           hours
    :param now: reference time handed to ``time_job_flow_idle()``
    :param dry_run: forwarded to ``terminate_and_notify()``
    """
    emr_conn = EMRJobRunner(conf_path=conf_path).make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')

    done_count = 0
    non_streaming_count = 0
    running_count = 0
    idle_count = 0
    # (job flow id, name, idle time as a timedelta) per flow to terminate
    candidates = []

    for flow in describe_all_job_flows(emr_conn):
        if is_job_flow_done(flow):
            done_count += 1
            continue

        # idleness of non-streaming jobs can't be determined reliably, so
        # they are left alone (see Issue #60)
        if is_job_flow_non_streaming(flow):
            non_streaming_count += 1
            continue

        if is_job_flow_running(flow):
            running_count += 1
            continue

        # anything that reaches this point is treated as idle
        idle_count += 1
        raw_idle = time_job_flow_idle(flow, now=now)
        # drop the sub-second part of the idle time
        idle_for = timedelta(raw_idle.days, raw_idle.seconds)

        log.debug('Job flow %s (%s) idle for %s' %
                  (flow.jobflowid, flow.name, idle_for))

        if idle_for > timedelta(hours=max_hours_idle):
            candidates.append((flow.jobflowid, flow.name, idle_for))

    summary = (
        'Job flow statuses: %d running, %d idle, %d active non-streaming,'
        ' %d done' % (
            running_count, idle_count, non_streaming_count, done_count))
    log.info(summary)

    terminate_and_notify(emr_conn, candidates, dry_run=dry_run)
Exemplo n.º 5
0
def main(args):
    """Command-line entry point: report on long-running EMR jobs.

    Parses *args* with the parser from ``make_option_parser()``, rejects any
    positional arguments, configures logging, fetches the job flows in the
    ``RUNNING`` state and prints a report on those that
    ``find_long_running_jobs()`` selects for ``options.min_hours``.

    :param args: list of command-line arguments to parse
    """
    parser = make_option_parser()
    options, leftover = parser.parse_args(args)
    if leftover:
        parser.error('takes no arguments')

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    log.info('getting information about running jobs')
    runner = EMRJobRunner(conf_path=options.conf_path)
    running_flows = describe_all_job_flows(
        runner.make_emr_conn(), states=['RUNNING'])

    threshold = timedelta(hours=options.min_hours)
    print_report(find_long_running_jobs(running_flows, threshold))
Exemplo n.º 6
0
def get_job_flows(conf_path, max_days_ago=None, now=None):
    """Fetch job flow descriptions from EMR, optionally only recent ones.

    :param conf_path: forwarded to ``EMRJobRunner`` as ``conf_path``
                      (per the original documentation: an alternate path
                      to read :py:mod:`mrjob.conf` from, or ``False`` to
                      ignore all config files).
    :param float max_days_ago: when given, only job flows created within
                               this many days before *now* are requested.
    :param now: the current UTC time as a
                :py:class:`datetime.datetime`; defaults to
                ``datetime.utcnow()``.

    Returns the result of ``describe_all_job_flows()``.
    """
    if now is None:
        now = datetime.utcnow()

    connection = EMRJobRunner(conf_path=conf_path).make_emr_conn()

    # without max_days_ago there is no lower bound on the creation time
    cutoff = (
        None if max_days_ago is None else now - timedelta(days=max_days_ago))

    return describe_all_job_flows(connection, created_after=cutoff)
Exemplo n.º 7
0
def get_job_flows(conf_paths, max_days_ago=None, now=None):
    """Fetch job flow descriptions from EMR, optionally only recent ones.

    :param conf_paths: forwarded to ``EMRJobRunner`` as ``conf_paths``
                       (presumably the :py:mod:`mrjob.conf` path(s) to
                       read -- confirm against the runner's documentation).
    :param float max_days_ago: when given, only job flows created within
                               this many days before *now* are requested.
    :param now: the current UTC time as a
                :py:class:`datetime.datetime`; defaults to
                ``datetime.utcnow()``.

    Returns the result of ``describe_all_job_flows()``.
    """
    if now is None:
        now = datetime.utcnow()

    connection = EMRJobRunner(conf_paths=conf_paths).make_emr_conn()

    # without max_days_ago there is no lower bound on the creation time
    cutoff = (
        None if max_days_ago is None else now - timedelta(days=max_days_ago))

    return describe_all_job_flows(connection, created_after=cutoff)
Exemplo n.º 8
0
def main(args, now=None):
    """Command-line entry point: report on long-running EMR jobs.

    Parses *args* with the parser from ``make_option_parser()``, rejects any
    positional arguments, configures logging, fetches the job flows in the
    ``BOOTSTRAPPING`` or ``RUNNING`` states and prints a report on those
    that ``find_long_running_jobs()`` selects for ``options.min_hours``.

    :param args: list of command-line arguments to parse
    :param now: reference time passed on to ``find_long_running_jobs()``;
                defaults to ``datetime.utcnow()``
    """
    if now is None:
        now = datetime.utcnow()

    parser = make_option_parser()
    options, leftover = parser.parse_args(args)
    if leftover:
        parser.error('takes no arguments')

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    log.info('getting information about running jobs')
    runner = EMRJobRunner(conf_paths=options.conf_paths)
    busy_flows = describe_all_job_flows(
        runner.make_emr_conn(), states=['BOOTSTRAPPING', 'RUNNING'])

    threshold = timedelta(hours=options.min_hours)
    print_report(find_long_running_jobs(busy_flows, threshold, now=now))
Exemplo n.º 9
0
    def test_can_get_all_job_flows(self):
        """describe_all_job_flows() returns every flow, beyond the API cap.

        Registers more mock job flows than DEFAULT_MAX_JOB_FLOWS_RETURNED,
        checks that a plain describe_jobflows() call is truncated at that
        cap, and that describe_all_job_flows() still returns all of them.
        """
        base_time = datetime.datetime.utcnow()

        NUM_JOB_FLOWS = 2222
        assert_gt(NUM_JOB_FLOWS, DEFAULT_MAX_JOB_FLOWS_RETURNED)

        expected_ids = ['j-%04d' % n for n in range(NUM_JOB_FLOWS)]

        # flow number k is created k minutes before base_time
        for age_minutes, flow_id in enumerate(expected_ids):
            created = base_time - datetime.timedelta(minutes=age_minutes)
            self.mock_emr_job_flows[flow_id] = MockEmrObject(
                creationdatetime=to_iso8601(created),
                jobflowid=flow_id)

        emr_conn = EMRJobRunner().make_emr_conn()

        # a plain describe_jobflows() call stops at the job flow limit
        capped = emr_conn.describe_jobflows()
        assert_equal(len(capped), DEFAULT_MAX_JOB_FLOWS_RETURNED)

        everything = describe_all_job_flows(emr_conn)
        assert_equal(len(everything), NUM_JOB_FLOWS)
        assert_equal(sorted(flow.jobflowid for flow in everything),
                     expected_ids)
Exemplo n.º 10
0
    def test_can_get_all_job_flows(self):
        """describe_all_job_flows() returns every flow, beyond the API cap.

        Registers more mock job flows than DEFAULT_MAX_JOB_FLOWS_RETURNED,
        checks that a plain describe_jobflows() call is truncated at that
        cap, and that describe_all_job_flows() still returns all of them.
        """
        base_time = datetime.datetime.utcnow()

        NUM_JOB_FLOWS = 2222
        assert_gt(NUM_JOB_FLOWS, DEFAULT_MAX_JOB_FLOWS_RETURNED)

        expected_ids = ['j-%04d' % n for n in range(NUM_JOB_FLOWS)]

        # flow number k is created k minutes before base_time
        for age_minutes, flow_id in enumerate(expected_ids):
            created = base_time - datetime.timedelta(minutes=age_minutes)
            self.mock_emr_job_flows[flow_id] = MockEmrObject(
                creationdatetime=to_iso8601(created),
                jobflowid=flow_id)

        emr_conn = EMRJobRunner().make_emr_conn()

        # a plain describe_jobflows() call stops at the job flow limit
        capped = emr_conn.describe_jobflows()
        assert_equal(len(capped), DEFAULT_MAX_JOB_FLOWS_RETURNED)

        everything = describe_all_job_flows(emr_conn)
        assert_equal(len(everything), NUM_JOB_FLOWS)
        assert_equal(sorted(flow.jobflowid for flow in everything),
                     expected_ids)
def inspect_and_maybe_terminate_job_flows(
    conf_path=None,
    dry_run=False,
    max_hours_idle=None,
    mins_to_end_of_hour=None,
    now=None,
    pool_name=None,
    pooled_only=False,
    unpooled_only=False,
    max_mins_locked=None,
    quiet=False,
    **kwargs
):
    """Tally all job flows by status and terminate the idle/pending ones
    that match the given criteria.

    :param conf_path: forwarded to ``EMRJobRunner`` as ``conf_path``
    :param dry_run: forwarded to ``terminate_and_notify()``
    :param max_hours_idle: if not ``None``, skip job flows whose idle time
                           is less than or equal to this many hours. When
                           both this and *mins_to_end_of_hour* are
                           ``None``, it defaults to
                           ``DEFAULT_MAX_HOURS_IDLE``.
    :param mins_to_end_of_hour: if not ``None``, skip job flows that have
                                pending steps, or whose estimated time to
                                the end of the hour is at least this many
                                minutes
    :param now: reference time for the idle calculations; defaults to
                ``datetime.utcnow()``
    :param pool_name: if not ``None``, skip job flows whose pool name
                      differs from it
    :param pooled_only: if true, skip job flows that are not in a pool
    :param unpooled_only: if true, skip job flows that are in a pool
    :param max_mins_locked: forwarded to ``terminate_and_notify()``
    :param quiet: forwarded to ``terminate_and_notify()``
    :param kwargs: extra keyword arguments forwarded to ``EMRJobRunner``
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(conf_path=conf_path, **kwargs)
    emr_conn = runner.make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # a list of tuples of (job flow, whether it has pending steps,
    # idle time, time to end of hour); both times are timedeltas
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # check if job flow is bootstrapping
        elif is_job_flow_bootstrapping(jf):
            num_bootstrapping += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif not is_job_flow_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            time_idle = now - time_last_active(jf)
            time_to_end_of_hour = est_time_to_hour(jf, now=now)
            _, pool = pool_hash_and_name(jf)
            pending = job_flow_has_pending_steps(jf)

            if pending:
                num_pending += 1
            else:
                num_idle += 1

            log.debug(
                'Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                (jf.jobflowid,
                 'pending' if pending else 'idle',
                 strip_microseconds(time_idle),
                 strip_microseconds(time_to_end_of_hour),
                 ('unpooled' if pool is None else 'in %s pool' % pool),
                 jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None and
                    time_idle <= timedelta(hours=max_hours_idle)):
                continue

            # mins_to_end_of_hour doesn't apply to jobs with pending steps
            if (mins_to_end_of_hour is not None and
                    (pending or
                     time_to_end_of_hour >= timedelta(
                         minutes=mins_to_end_of_hour))):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append((jf, pending, time_idle, time_to_end_of_hour))

    # the argument order must match the labels in the format string; the
    # bootstrapping and running counts used to be passed the wrong way round
    log.info(
        'Job flow statuses: %d bootstrapping, %d running, %d pending, %d idle,'
        ' %d active non-streaming, %d done' % (
            num_bootstrapping, num_running, num_pending, num_idle,
            num_non_streaming, num_done))

    terminate_and_notify(runner, to_terminate, dry_run=dry_run,
                         max_mins_locked=max_mins_locked, quiet=quiet)
Exemplo n.º 12
0
def print_report(options):
    """Print a usage report about EMR job flows to standard output.

    Fetches job flows via ``describe_all_job_flows()``, derives per-flow
    figures (creation time, time ran, state, step count, normalized
    instance hours, and estimated hours billed but not used), then prints
    totals, per-day statistics, rankings by job name and by user, and a
    detail line for every job flow.

    :param options: parsed command-line options; this function reads
                    ``options.conf_path`` and ``options.max_days_ago``
    """

    emr_conn = EMRJobRunner(conf_path=options.conf_path).make_emr_conn()

    log.info('getting job flow history...')
    # microseconds just make our report messy
    now = datetime.datetime.utcnow().replace(microsecond=0)

    # if --max-days-ago is set, only look at recent jobs
    created_after = None
    if options.max_days_ago is not None:
        created_after = now - datetime.timedelta(days=options.max_days_ago)

    job_flows = describe_all_job_flows(emr_conn, created_after=created_after)

    # one dict of report fields per job flow
    job_flow_infos = []
    for jf in job_flows:
        job_flow_info = {}

        job_flow_info['id'] = jf.jobflowid

        job_flow_info['name'] = jf.name

        job_flow_info['created'] = to_datetime(jf.creationdatetime)

        # a job flow without a start time is reported as having run for
        # zero time; one without an end time is measured up to now
        start_time = to_datetime(getattr(jf, 'startdatetime', None))
        if start_time:
            end_time = to_datetime(getattr(jf, 'enddatetime', None)) or now
            job_flow_info['ran'] = end_time - start_time
        else:
            job_flow_info['ran'] = datetime.timedelta(0)

        job_flow_info['state'] = jf.state

        job_flow_info['num_steps'] = len(jf.steps or [])

        # this looks to be an integer, but let's protect against
        # future changes
        job_flow_info['hours'] = float(jf.normalizedinstancehours)

        # estimate hours billed but not used
        job_flow_info['hours_bbnu'] = (
            job_flow_info['hours'] *
            estimate_proportion_billed_but_not_used(jf))

        # split out mr job name and user
        # jobs flows created by MRJob have names like:
        # mr_word_freq_count.dave.20101103.121249.638552
        match = JOB_NAME_RE.match(jf.name)
        if match:
            job_flow_info['mr_job_name'] = match.group(1)
            job_flow_info['user'] = match.group(2)
        else:
            # not run by mrjob
            job_flow_info['mr_job_name'] = None
            job_flow_info['user'] = None

        job_flow_infos.append(job_flow_info)

    # NOTE(review): this message is printed whenever nothing was fetched,
    # including when --max-days-ago narrowed the window -- confirm the
    # "two months" wording is intended in that case
    if not job_flow_infos:
        print 'No job flows created in the past two months!'
        return

    earliest = min(info['created'] for info in job_flow_infos)
    latest = max(info['created'] for info in job_flow_infos)

    print 'Total # of Job Flows: %d' % len(job_flow_infos)
    print

    print '* All times are in UTC.'
    print

    print 'Min create time: %s' % earliest
    print 'Max create time: %s' % latest
    print '   Current time: %s' % now
    print

    print '* All usage is measured in Normalized Instance Hours, which are'
    print '  roughly equivalent to running an m1.small instance for an hour.'
    print

    # total compute-unit hours used
    total_hours = sum(info['hours'] for info in job_flow_infos)
    print 'Total Usage: %d' % total_hours
    print

    print '* Time billed but not used is estimated, and may not match'
    print "  Amazon's billing system exactly."
    print

    total_hours_bbnu = sum(info['hours_bbnu'] for info in job_flow_infos)
    print 'Total time billed but not used (waste): %.2f' % total_hours_bbnu
    print

    # usage and waste are attributed to the date the job flow was created
    date_to_hours = defaultdict(float)
    date_to_hours_bbnu = defaultdict(float)
    for info in job_flow_infos:
        date_created = info['created'].date()
        date_to_hours[date_created] += info['hours']
        date_to_hours_bbnu[date_created] += info['hours_bbnu']
    print 'Daily statistics:'
    print
    print ' date        usage     waste'
    # walk backwards one day at a time from the latest creation date to
    # the earliest; days with no job flows show up as zeros
    d = latest.date()
    while d >= earliest.date():
        print ' %10s %6d %9.2f' % (d, date_to_hours[d], date_to_hours_bbnu[d])
        d -= datetime.timedelta(days=1)
    print

    # display helper: substitutes a placeholder for a missing (falsy)
    # job name or user
    def fmt(mr_job_name_or_user):
        if mr_job_name_or_user:
            return mr_job_name_or_user
        else:
            return '(not started by mrjob)'

    print '* Job flows are considered to belong to the user and job that'
    print '  started them (even if other jobs use the job flow).'
    print

    # Top jobs
    # (each ranking below sorts by descending hours, ties broken by name)
    print 'Top jobs, by total usage:'
    mr_job_name_to_hours = defaultdict(float)
    for info in job_flow_infos:
        mr_job_name_to_hours[info['mr_job_name']] += info['hours']
    for mr_job_name, hours in sorted(mr_job_name_to_hours.iteritems(),
                                     key=lambda (n, h): (-h, n)):
        print '  %6d %s' % (hours, fmt(mr_job_name))
    print

    print 'Top jobs, by time billed but not used:'
    mr_job_name_to_hours_bbnu = defaultdict(float)
    for info in job_flow_infos:
        mr_job_name_to_hours_bbnu[info['mr_job_name']] += info['hours_bbnu']
    for mr_job_name, hours_bbnu in sorted(
            mr_job_name_to_hours_bbnu.iteritems(), key=lambda (n, h): (-h, n)):
        print '  %9.2f %s' % (hours_bbnu, fmt(mr_job_name))
    print

    # Top users
    print 'Top users, by total usage:'
    user_to_hours = defaultdict(float)
    for info in job_flow_infos:
        user_to_hours[info['user']] += info['hours']
    for user, hours in sorted(user_to_hours.iteritems(),
                              key=lambda (n, h): (-h, n)):
        print '  %6d %s' % (hours, fmt(user))
    print

    print 'Top users, by time billed but not used:'
    user_to_hours_bbnu = defaultdict(float)
    for info in job_flow_infos:
        user_to_hours_bbnu[info['user']] += info['hours_bbnu']
    for user, hours_bbnu in sorted(user_to_hours_bbnu.iteritems(),
                                   key=lambda (n, h): (-h, n)):
        print '  %9.2f %s' % (hours_bbnu, fmt(user))
    print

    # Top job flows
    print 'All job flows, by total usage:'
    top_job_flows = sorted(job_flow_infos,
                           key=lambda i: (-i['hours'], i['name']))
    for info in top_job_flows:
        print '  %6d %-15s %s' % (info['hours'], info['id'], info['name'])
    print

    print 'All job flows, by time billed but not used:'
    top_job_flows_bbnu = sorted(job_flow_infos,
                                key=lambda i: (-i['hours_bbnu'], i['name']))
    for info in top_job_flows_bbnu:
        print '  %9.2f %-15s %s' % (info['hours_bbnu'], info['id'],
                                    info['name'])
    print

    print 'Details for all job flows:'
    print
    print ' id              state         created             steps        time ran  usage     waste   user   name'

    # most recently created job flows first
    all_job_flows = sorted(job_flow_infos,
                           key=lambda i: i['created'],
                           reverse=True)
    for info in all_job_flows:
        print ' %-15s %-13s %19s %3d %17s %6d %9.2f %8s %s' % (
            info['id'], info['state'], info['created'], info['num_steps'],
            info['ran'], info['hours'], info['hours_bbnu'],
            (info['user'] or ''), fmt(info['mr_job_name']))
Exemplo n.º 13
0
def inspect_and_maybe_terminate_job_flows(
    conf_path=None,
    dry_run=False,
    max_hours_idle=None,
    mins_to_end_of_hour=None,
    now=None,
    pool_name=None,
    pooled_only=False,
    unpooled_only=False,
):
    """Tally all job flows by status and terminate the idle ones that match
    the given criteria.

    :param conf_path: forwarded to ``EMRJobRunner`` as ``conf_path``
    :param dry_run: forwarded to ``terminate_and_notify()``
    :param max_hours_idle: if not ``None``, skip job flows whose idle time
                           is less than or equal to this many hours. When
                           both this and *mins_to_end_of_hour* are
                           ``None``, it defaults to
                           ``DEFAULT_MAX_HOURS_IDLE``.
    :param mins_to_end_of_hour: if not ``None``, skip job flows whose time
                                to the end of the hour is at least this
                                many minutes
    :param now: reference time for the idle calculations; defaults to
                ``datetime.utcnow()``
    :param pool_name: if not ``None``, skip job flows whose pool name
                      differs from it
    :param pooled_only: if true, skip job flows that are not in a pool
    :param unpooled_only: if true, skip job flows that are in a pool
    """
    if now is None:
        now = datetime.utcnow()

    # historical default: with no criteria at all, use an idle-time limit
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    emr_conn = EMRJobRunner(conf_path=conf_path).make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')

    done_count = 0
    non_streaming_count = 0
    running_count = 0
    idle_count = 0
    # (job flow id, name, idle time, time to end of hour) per flow to
    # terminate; both times are timedeltas
    candidates = []

    # Deliberately not filtering by state, so that this keeps working if
    # Amazon adds another kind of idle state.
    for flow in describe_all_job_flows(emr_conn):
        if is_job_flow_done(flow):
            done_count += 1
            continue

        # idleness of non-streaming jobs can't be determined reliably, so
        # they are left alone (see Issue #60)
        if is_job_flow_non_streaming(flow):
            non_streaming_count += 1
            continue

        if is_job_flow_running(flow):
            running_count += 1
            continue

        # anything that reaches this point is treated as idle
        idle_count += 1
        idle_for = time_job_flow_idle(flow, now=now)
        to_hour_end = time_to_end_of_hour_for_job_flow(flow, now=now)
        pool = job_flow_pool_name(flow)

        log.debug(
            'Job flow %-15s idle for %s, %s to end of hour, %s (%s)' % (
                flow.jobflowid,
                strip_microseconds(idle_for),
                strip_microseconds(to_hour_end),
                ('unpooled' if pool is None else 'in %s pool' % pool),
                flow.name))

        # not idle for long enough yet
        if (max_hours_idle is not None and
                idle_for <= timedelta(hours=max_hours_idle)):
            continue

        # too far from the end of the hour
        if (mins_to_end_of_hour is not None and
                to_hour_end >= timedelta(minutes=mins_to_end_of_hour)):
            continue

        # pool-based filters
        if ((pooled_only and pool is None) or
                (unpooled_only and pool is not None) or
                (pool_name is not None and pool != pool_name)):
            continue

        candidates.append(
            (flow.jobflowid, flow.name, idle_for, to_hour_end))

    summary = (
        'Job flow statuses: %d running, %d idle, %d active non-streaming,'
        ' %d done' % (
            running_count, idle_count, non_streaming_count, done_count))
    log.info(summary)

    terminate_and_notify(emr_conn, candidates, dry_run=dry_run)
Exemplo n.º 14
0
def print_report(options):
    """Print a usage report about EMR job flows to standard output.

    Fetches job flows via ``describe_all_job_flows()``, derives per-flow
    figures (creation time, time ran, state, step count, normalized
    instance hours, and estimated hours billed but not used), then prints
    totals, per-day statistics, rankings by job name and by user, and a
    detail line for every job flow.

    :param options: parsed command-line options; this function reads
                    ``options.conf_path`` and ``options.max_days_ago``
    """

    emr_conn = EMRJobRunner(conf_path=options.conf_path).make_emr_conn()

    log.info('getting job flow history...')
    # microseconds just make our report messy
    now = datetime.datetime.utcnow().replace(microsecond=0)

    # if --max-days-ago is set, only look at recent jobs
    created_after = None
    if options.max_days_ago is not None:
        created_after = now - datetime.timedelta(days=options.max_days_ago)

    job_flows = describe_all_job_flows(emr_conn, created_after=created_after)

    # one dict of report fields per job flow
    job_flow_infos = []
    for jf in job_flows:
        job_flow_info = {}

        job_flow_info['id'] = jf.jobflowid

        job_flow_info['name'] = jf.name

        job_flow_info['created'] = to_datetime(jf.creationdatetime)

        # a job flow without a start time is reported as having run for
        # zero time; one without an end time is measured up to now
        start_time = to_datetime(getattr(jf, 'startdatetime', None))
        if start_time:
            end_time = to_datetime(getattr(jf, 'enddatetime', None)) or now
            job_flow_info['ran'] = end_time - start_time
        else:
            job_flow_info['ran'] = datetime.timedelta(0)

        job_flow_info['state'] = jf.state

        job_flow_info['num_steps'] = len(jf.steps or [])

        # this looks to be an integer, but let's protect against
        # future changes
        job_flow_info['hours'] = float(jf.normalizedinstancehours)

        # estimate hours billed but not used
        job_flow_info['hours_bbnu'] = (
            job_flow_info['hours'] *
            estimate_proportion_billed_but_not_used(jf))

        # split out mr job name and user
        # jobs flows created by MRJob have names like:
        # mr_word_freq_count.dave.20101103.121249.638552
        match = JOB_NAME_RE.match(jf.name)
        if match:
            job_flow_info['mr_job_name'] = match.group(1)
            job_flow_info['user'] = match.group(2)
        else:
            # not run by mrjob
            job_flow_info['mr_job_name'] = None
            job_flow_info['user'] = None

        job_flow_infos.append(job_flow_info)

    # NOTE(review): this message is printed whenever nothing was fetched,
    # including when --max-days-ago narrowed the window -- confirm the
    # "two months" wording is intended in that case
    if not job_flow_infos:
        print 'No job flows created in the past two months!'
        return

    earliest = min(info['created'] for info in job_flow_infos)
    latest = max(info['created'] for info in job_flow_infos)

    print 'Total # of Job Flows: %d' % len(job_flow_infos)
    print

    print '* All times are in UTC.'
    print


    print 'Min create time: %s' % earliest
    print 'Max create time: %s' % latest
    print '   Current time: %s' % now
    print

    print '* All usage is measured in Normalized Instance Hours, which are'
    print '  roughly equivalent to running an m1.small instance for an hour.'
    print

    # total compute-unit hours used
    total_hours = sum(info['hours'] for info in job_flow_infos)
    print 'Total Usage: %d' % total_hours
    print

    print '* Time billed but not used is estimated, and may not match'
    print "  Amazon's billing system exactly."
    print

    total_hours_bbnu = sum(info['hours_bbnu'] for info in job_flow_infos)
    print 'Total time billed but not used (waste): %.2f' % total_hours_bbnu
    print

    # usage and waste are attributed to the date the job flow was created
    date_to_hours = defaultdict(float)
    date_to_hours_bbnu = defaultdict(float)
    for info in job_flow_infos:
        date_created = info['created'].date()
        date_to_hours[date_created] += info['hours']
        date_to_hours_bbnu[date_created] += info['hours_bbnu']
    print 'Daily statistics:'
    print
    print ' date        usage     waste'
    # walk backwards one day at a time from the latest creation date to
    # the earliest; days with no job flows show up as zeros
    d = latest.date()
    while d >= earliest.date():
        print ' %10s %6d %9.2f' % (d, date_to_hours[d], date_to_hours_bbnu[d])
        d -= datetime.timedelta(days=1)
    print

    # display helper: substitutes a placeholder for a missing (falsy)
    # job name or user
    def fmt(mr_job_name_or_user):
        if mr_job_name_or_user:
            return mr_job_name_or_user
        else:
            return '(not started by mrjob)'

    print '* Job flows are considered to belong to the user and job that'
    print '  started them (even if other jobs use the job flow).'
    print

    # Top jobs
    # (each ranking below sorts by descending hours, ties broken by name)
    print 'Top jobs, by total usage:'
    mr_job_name_to_hours = defaultdict(float)
    for info in job_flow_infos:
        mr_job_name_to_hours[info['mr_job_name']] += info['hours']
    for mr_job_name, hours in sorted(mr_job_name_to_hours.iteritems(),
                                     key=lambda (n, h): (-h, n)):
        print '  %6d %s' % (hours, fmt(mr_job_name))
    print

    print 'Top jobs, by time billed but not used:'
    mr_job_name_to_hours_bbnu = defaultdict(float)
    for info in job_flow_infos:
        mr_job_name_to_hours_bbnu[info['mr_job_name']] += info['hours_bbnu']
    for mr_job_name, hours_bbnu in sorted(mr_job_name_to_hours_bbnu.iteritems(),
                                     key=lambda (n, h): (-h, n)):
        print '  %9.2f %s' % (hours_bbnu, fmt(mr_job_name))
    print

    # Top users
    print 'Top users, by total usage:'
    user_to_hours = defaultdict(float)
    for info in job_flow_infos:
        user_to_hours[info['user']] += info['hours']
    for user, hours in sorted(user_to_hours.iteritems(),
                              key=lambda (n, h): (-h, n)):
        print '  %6d %s' % (hours, fmt(user))
    print

    print 'Top users, by time billed but not used:'
    user_to_hours_bbnu = defaultdict(float)
    for info in job_flow_infos:
        user_to_hours_bbnu[info['user']] += info['hours_bbnu']
    for user, hours_bbnu in sorted(user_to_hours_bbnu.iteritems(),
                              key=lambda (n, h): (-h, n)):
        print '  %9.2f %s' % (hours_bbnu, fmt(user))
    print

    # Top job flows
    print 'All job flows, by total usage:'
    top_job_flows = sorted(job_flow_infos,
                           key=lambda i: (-i['hours'], i['name']))
    for info in top_job_flows:
        print '  %6d %-15s %s' % (info['hours'], info['id'], info['name'])
    print

    print 'All job flows, by time billed but not used:'
    top_job_flows_bbnu = sorted(job_flow_infos,
                           key=lambda i: (-i['hours_bbnu'], i['name']))
    for info in top_job_flows_bbnu:
        print '  %9.2f %-15s %s' % (
            info['hours_bbnu'], info['id'], info['name'])
    print

    print 'Details for all job flows:'
    print
    print ' id              state         created             steps        time ran  usage     waste   user   name'

    # most recently created job flows first
    all_job_flows = sorted(job_flow_infos, key=lambda i: i['created'],
                           reverse=True)
    for info in all_job_flows:
        print ' %-15s %-13s %19s %3d %17s %6d %9.2f %8s %s' % (
            info['id'], info['state'], info['created'], info['num_steps'],
            info['ran'], info['hours'], info['hours_bbnu'],
            (info['user'] or ''), fmt(info['mr_job_name']))
Exemplo n.º 15
0
def inspect_and_maybe_terminate_job_flows(conf_paths=None,
                                          dry_run=False,
                                          max_hours_idle=None,
                                          mins_to_end_of_hour=None,
                                          now=None,
                                          pool_name=None,
                                          pooled_only=False,
                                          unpooled_only=False,
                                          max_mins_locked=None,
                                          quiet=False,
                                          **kwargs):
    """Tally all job flows by status and terminate the idle/pending ones
    that match the given criteria.

    :param conf_paths: forwarded to ``EMRJobRunner`` as ``conf_paths``
    :param dry_run: forwarded to ``terminate_and_notify()``
    :param max_hours_idle: if not ``None``, skip job flows whose idle time
                           is less than or equal to this many hours. When
                           both this and *mins_to_end_of_hour* are
                           ``None``, it defaults to
                           ``DEFAULT_MAX_HOURS_IDLE``.
    :param mins_to_end_of_hour: if not ``None``, skip job flows that have
                                pending steps, or whose estimated time to
                                the end of the hour is at least this many
                                minutes
    :param now: reference time for the idle calculations; defaults to
                ``datetime.utcnow()``
    :param pool_name: if not ``None``, skip job flows whose pool name
                      differs from it
    :param pooled_only: if true, skip job flows that are not in a pool
    :param unpooled_only: if true, skip job flows that are in a pool
    :param max_mins_locked: forwarded to ``terminate_and_notify()``
    :param quiet: forwarded to ``terminate_and_notify()``
    :param kwargs: extra keyword arguments forwarded to ``EMRJobRunner``
    """

    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(conf_paths=conf_paths, **kwargs)
    emr_conn = runner.make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # a list of tuples of (job flow, whether it has pending steps,
    # idle time, time to end of hour); both times are timedeltas
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # check if job flow is bootstrapping
        elif is_job_flow_bootstrapping(jf):
            num_bootstrapping += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif not is_job_flow_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            time_idle = now - time_last_active(jf)
            time_to_end_of_hour = est_time_to_hour(jf, now=now)
            _, pool = pool_hash_and_name(jf)
            pending = job_flow_has_pending_steps(jf)

            if pending:
                num_pending += 1
            else:
                num_idle += 1

            log.debug('Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                      (jf.jobflowid, 'pending' if pending else 'idle',
                       strip_microseconds(time_idle),
                       strip_microseconds(time_to_end_of_hour),
                       ('unpooled' if pool is None else 'in %s pool' % pool),
                       jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None
                    and time_idle <= timedelta(hours=max_hours_idle)):
                continue

            # mins_to_end_of_hour doesn't apply to jobs with pending steps
            if (mins_to_end_of_hour is not None
                    and (pending or time_to_end_of_hour >=
                         timedelta(minutes=mins_to_end_of_hour))):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append((jf, pending, time_idle, time_to_end_of_hour))

    # the argument order must match the labels in the format string; the
    # bootstrapping and running counts used to be passed the wrong way round
    log.info(
        'Job flow statuses: %d bootstrapping, %d running, %d pending, %d idle,'
        ' %d active non-streaming, %d done' %
        (num_bootstrapping, num_running, num_pending, num_idle,
         num_non_streaming, num_done))

    terminate_and_notify(runner,
                         to_terminate,
                         dry_run=dry_run,
                         max_mins_locked=max_mins_locked,
                         quiet=quiet)
Exemplo n.º 16
0
def inspect_and_maybe_terminate_job_flows(
    conf_path=None,
    dry_run=False,
    max_hours_idle=None,
    mins_to_end_of_hour=None,
    now=None,
    pool_name=None,
    pooled_only=False,
    unpooled_only=False,
):
    """Tally all job flows by status and terminate the idle ones that match
    the given criteria.

    :param conf_path: forwarded to ``EMRJobRunner`` as ``conf_path``
    :param dry_run: forwarded to ``terminate_and_notify()``
    :param max_hours_idle: if not ``None``, skip job flows whose idle time
                           is less than or equal to this many hours. When
                           both this and *mins_to_end_of_hour* are
                           ``None``, it defaults to
                           ``DEFAULT_MAX_HOURS_IDLE``.
    :param mins_to_end_of_hour: if not ``None``, skip job flows whose time
                                to the end of the hour is at least this
                                many minutes
    :param now: reference time for the idle calculations; defaults to
                ``datetime.utcnow()``
    :param pool_name: if not ``None``, skip job flows whose pool name
                      differs from it
    :param pooled_only: if true, skip job flows that are not in a pool
    :param unpooled_only: if true, skip job flows that are in a pool
    """
    if now is None:
        now = datetime.utcnow()

    # historical default: with no criteria at all, use an idle-time limit
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    emr_conn = EMRJobRunner(conf_path=conf_path).make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')

    done_count = 0
    non_streaming_count = 0
    running_count = 0
    idle_count = 0
    # (job flow id, name, idle time, time to end of hour) per flow to
    # terminate; both times are timedeltas
    candidates = []

    # Deliberately not filtering by state, so that this keeps working if
    # Amazon adds another kind of idle state.
    for flow in describe_all_job_flows(emr_conn):
        if is_job_flow_done(flow):
            done_count += 1
            continue

        # idleness of non-streaming jobs can't be determined reliably, so
        # they are left alone (see Issue #60)
        if is_job_flow_non_streaming(flow):
            non_streaming_count += 1
            continue

        if is_job_flow_running(flow):
            running_count += 1
            continue

        # anything that reaches this point is treated as idle
        idle_count += 1
        idle_for = time_job_flow_idle(flow, now=now)
        to_hour_end = time_to_end_of_hour_for_job_flow(flow, now=now)
        pool = job_flow_pool_name(flow)

        log.debug(
            'Job flow %-15s idle for %s, %s to end of hour, %s (%s)' % (
                flow.jobflowid,
                strip_microseconds(idle_for),
                strip_microseconds(to_hour_end),
                ('unpooled' if pool is None else 'in %s pool' % pool),
                flow.name))

        # not idle for long enough yet
        if (max_hours_idle is not None and
                idle_for <= timedelta(hours=max_hours_idle)):
            continue

        # too far from the end of the hour
        if (mins_to_end_of_hour is not None and
                to_hour_end >= timedelta(minutes=mins_to_end_of_hour)):
            continue

        # pool-based filters
        if ((pooled_only and pool is None) or
                (unpooled_only and pool is not None) or
                (pool_name is not None and pool != pool_name)):
            continue

        candidates.append(
            (flow.jobflowid, flow.name, idle_for, to_hour_end))

    summary = (
        'Job flow statuses: %d running, %d idle, %d active non-streaming,'
        ' %d done' % (
            running_count, idle_count, non_streaming_count, done_count))
    log.info(summary)

    terminate_and_notify(emr_conn, candidates, dry_run=dry_run)