Exemplo n.º 1
0
def _terminate_and_notify(runner, cluster_id, cluster_name, num_steps,
                          is_pending, time_idle, time_to_end_of_hour,
                          dry_run=False, max_mins_locked=None, quiet=False):

    fmt = ('Terminated cluster %s (%s); was %s for %s, %s to end of hour')
    msg = fmt % (
        cluster_id, cluster_name,
        'pending' if is_pending else 'idle',
        strip_microseconds(time_idle),
        strip_microseconds(time_to_end_of_hour))

    did_terminate = False
    if dry_run:
        did_terminate = True
    else:
        status = _attempt_to_acquire_lock(
            runner.fs.make_s3_conn(),
            runner._lock_uri(cluster_id, num_steps),
            runner._opts['cloud_fs_sync_secs'],
            '%s (%s)' % (msg,
                         runner._make_unique_job_key(label='terminate')),
            mins_to_expiration=max_mins_locked,
        )
        if status:
            runner.make_emr_conn().terminate_jobflow(cluster_id)
            did_terminate = True
        elif not quiet:
            log.info('%s was locked between getting cluster info and'
                     ' trying to terminate it; skipping' % cluster_id)

    if did_terminate and not quiet:
        print(msg)
Exemplo n.º 2
0
def terminate_and_notify(runner,
                         to_terminate,
                         dry_run=False,
                         max_mins_locked=None,
                         quiet=False):
    if not to_terminate:
        return

    for jf, pending, time_idle, time_to_end_of_hour in to_terminate:
        fmt = ('Terminated job flow %s (%s); was %s for %s, %s to end of hour')
        msg = fmt % (jf.jobflowid, jf.name, 'pending' if pending else 'idle',
                     strip_microseconds(time_idle),
                     strip_microseconds(time_to_end_of_hour))

        did_terminate = False
        if not dry_run:
            status = attempt_to_acquire_lock(
                runner.make_s3_conn(),
                runner._lock_uri(jf),
                runner._opts['s3_sync_wait_time'],
                '%s (%s)' %
                (msg, runner._make_unique_job_key(label='terminate')),
                mins_to_expiration=max_mins_locked,
            )
            if status:
                runner.make_emr_conn().terminate_jobflow(jf.jobflowid)
                did_terminate = True
            elif not quiet:
                log.info('%s was locked between getting job flow info and'
                         ' trying to terminate it; skipping' % jf.jobflowid)

        if did_terminate and not quiet:
            print(msg)
def terminate_and_notify(runner, to_terminate, dry_run=False,
                         max_mins_locked=None, quiet=False):
    if not to_terminate:
        return

    for jf, pending, time_idle, time_to_end_of_hour in to_terminate:
        fmt = ('Terminated job flow %s (%s); was %s for %s, %s to end of hour')
        msg = fmt % (
                jf.jobflowid, jf.name,
                'pending' if pending else 'idle',
                strip_microseconds(time_idle),
                strip_microseconds(time_to_end_of_hour))

        did_terminate = False
        if not dry_run:
            status = attempt_to_acquire_lock(
                runner.make_s3_conn(),
                runner._lock_uri(jf),
                runner._opts['s3_sync_wait_time'],
                '%s (%s)' % (msg,
                             runner._make_unique_job_name(label='terminate')),
                mins_to_expiration=max_mins_locked,
            )
            if status:
                runner.make_emr_conn().terminate_jobflow(jf.jobflowid)
                did_terminate = True
            elif not quiet:
                log.info('%s was locked between getting job flow info and'
                         ' trying to terminate it; skipping' % jf.jobflowid)

        if did_terminate and not quiet:
            print msg
Exemplo n.º 4
0
def terminate_and_notify(runner, to_terminate, dry_run=False,
                         max_mins_locked=None, quiet=False):
    if not to_terminate:
        return

    for jf, pending, time_idle, time_to_end_of_hour in to_terminate:
        did_terminate = False
        if not dry_run:
            status = attempt_to_acquire_lock(
                runner.make_s3_conn(),
                runner._lock_uri(jf),
                runner._opts['s3_sync_wait_time'],
                runner._make_unique_job_name(label='terminate'),
                mins_to_expiration=max_mins_locked,
            )
            if status:
                runner.make_emr_conn().terminate_jobflow(jf.jobflowid)
                did_terminate = True

        if did_terminate and not quiet:
            fmt = ('Terminated job flow %s (%s); was %s for %s, %s to end of'
                   ' hour')
            print fmt % (
                    jf.jobflowid, jf.name,
                    'pending' if pending else 'idle',
                    strip_microseconds(time_idle),
                    strip_microseconds(time_to_end_of_hour))
Exemplo n.º 5
0
def _terminate_and_notify(runner, cluster_id, cluster_name, num_steps,
                          is_pending, time_idle, time_to_end_of_hour,
                          dry_run=False, max_mins_locked=None, quiet=False):

    fmt = ('Terminated cluster %s (%s); was %s for %s, %s to end of hour')
    msg = fmt % (
        cluster_id, cluster_name,
        'pending' if is_pending else 'idle',
        strip_microseconds(time_idle),
        strip_microseconds(time_to_end_of_hour))

    did_terminate = False
    if dry_run:
        did_terminate = True
    else:
        status = _attempt_to_acquire_lock(
            runner.fs,
            runner._lock_uri(cluster_id, num_steps),
            runner._opts['cloud_fs_sync_secs'],
            '%s (%s)' % (msg,
                         runner._make_unique_job_key(label='terminate')),
            mins_to_expiration=max_mins_locked,
        )
        if status:
            runner.make_emr_conn().terminate_jobflow(cluster_id)
            did_terminate = True
        elif not quiet:
            log.info('%s was locked between getting cluster info and'
                     ' trying to terminate it; skipping' % cluster_id)

    if did_terminate and not quiet:
        print(msg)
Exemplo n.º 6
0
def terminate_and_notify(emr_conn, to_terminate, dry_run=False):
    if not to_terminate:
        return

    for job_flow_id, name, time_idle, time_to_end_of_hour in to_terminate:
        if not dry_run:
            emr_conn.terminate_jobflow(job_flow_id)
        print ('Terminated job flow %s (%s); was idle for %s,'
               ' %s to end of hour' %
               (job_flow_id, name, strip_microseconds(time_idle),
                strip_microseconds(time_to_end_of_hour)))
Exemplo n.º 7
0
def terminate_and_notify(emr_conn, to_terminate, dry_run=False):
    if not to_terminate:
        return

    for job_flow_id, name, time_idle, time_to_end_of_hour in to_terminate:
        if not dry_run:
            emr_conn.terminate_jobflow(job_flow_id)
        print('Terminated job flow %s (%s); was idle for %s,'
              ' %s to end of hour' %
              (job_flow_id, name, strip_microseconds(time_idle),
               strip_microseconds(time_to_end_of_hour)))
def terminate_and_notify(emr_conn, to_terminate, dry_run=False):
    if not to_terminate:
        return

    for id, name, pending, time_idle, time_to_end_of_hour in to_terminate:
        if not dry_run:
            emr_conn.terminate_jobflow(id)
        print ('Terminated job flow %s (%s); was %s for %s, %s to end of hour'
               % (id, name,
                  'pending' if pending else 'idle',
                  strip_microseconds(time_idle),
                  strip_microseconds(time_to_end_of_hour)))
Exemplo n.º 9
0
def pprint_job_flow(jf):
    """Print a job flow to stdout in this form::

        job.flow.name
        j-JOB_FLOW_ID: 2 instances (master=m1.small, slaves=m1.small, 20 \
minutes to the hour)
    """
    instance_count = int(jf.instancecount)

    nosep_segments = [
        '%d instance' % instance_count,
    ]
    if instance_count > 1:
        nosep_segments.append('s')

    comma_segments = [
        'master=%s' % jf.masterinstancetype,
    ]

    if instance_count > 1:
        comma_segments.append('slaves=%s' % jf.slaveinstancetype)

    comma_segments.append('%s to end of hour' %
                          strip_microseconds(est_time_to_hour(jf)))

    nosep_segments += [
        ' (',
        ', '.join(comma_segments),
        ')',
    ]

    print '%s: %s' % (jf.jobflowid, jf.name)
    print ''.join(nosep_segments)
    print jf.state
    print
Exemplo n.º 10
0
def pprint_job_flow(jf):
    """Print a job flow to stdout in this form::

        job.flow.name
        j-JOB_FLOW_ID: 2 instances (master=m1.small, slaves=m1.small, 20 \
minutes to the hour)
    """
    instance_count = int(jf.instancecount)

    nosep_segments = [
        '%d instance' % instance_count,
    ]
    if instance_count > 1:
        nosep_segments.append('s')

    comma_segments = [
        'master=%s' % jf.masterinstancetype,
    ]

    if instance_count > 1:
        comma_segments.append('slaves=%s' % jf.slaveinstancetype)

    comma_segments.append('%s to end of hour' %
                          strip_microseconds(est_time_to_hour(jf)))

    nosep_segments += [
        ' (',
        ', '.join(comma_segments),
        ')',
    ]

    print('%s: %s' % (jf.jobflowid, jf.name))
    print(''.join(nosep_segments))
    print(jf.state)
    print()
Exemplo n.º 11
0
def _terminate_and_notify(runner,
                          cluster_id,
                          cluster_name,
                          num_steps,
                          is_pending,
                          time_idle,
                          dry_run=False,
                          max_mins_locked=None,
                          quiet=False):

    fmt = ('Terminated cluster %s (%s); was %s for %s')
    msg = fmt % (cluster_id, cluster_name, 'pending' if is_pending else 'idle',
                 strip_microseconds(time_idle))

    did_terminate = False
    if dry_run:
        did_terminate = True
    else:
        acquired_lock = runner._attempt_to_lock_cluster(cluster_id)
        if acquired_lock:
            runner.make_emr_client().terminate_job_flows(
                JobFlowIds=[cluster_id])
            did_terminate = True
            runner._release_any_cluster_lock_held()
        elif not quiet:
            log.info('%s was locked between getting cluster info and'
                     ' trying to terminate it; skipping' % cluster_id)

    if did_terminate and not quiet:
        print(msg)
Exemplo n.º 12
0
def _format_timedelta(time):
    """Format a timedelta for use in a columnar format. This just
    tweaks stuff like ``'3 days, 9:00:00'`` to line up with
    ``'3 days, 10:00:00'``
    """
    result = str(strip_microseconds(time))

    parts = result.split()
    if len(parts) == 3 and len(parts[-1]) == 7:
        return '%s %s  %s' % tuple(parts)
    else:
        return result
Exemplo n.º 13
0
def _format_timedelta(time):
    """Format a timedelta for use in a columnar format. This just
    tweaks stuff like ``'3 days, 9:00:00'`` to line up with
    ``'3 days, 10:00:00'``
    """
    result = str(strip_microseconds(time))

    parts = result.split()
    if len(parts) == 3 and len(parts[-1]) == 7:
        return '%s %s  %s' % tuple(parts)
    else:
        return result
Exemplo n.º 14
0
def _terminate_and_notify(runner, cluster_id, cluster_name, num_steps,
                          is_pending, time_idle,
                          dry_run=False, quiet=False):
    emr_client = runner.make_emr_client()

    if not dry_run:
        emr_client.terminate_job_flows(JobFlowIds=[cluster_id])

    if not quiet:
        fmt = ('Terminated cluster %s (%s); was %s for %s')
        msg = fmt % (
            cluster_id, cluster_name,
            'pending' if is_pending else 'idle',
            strip_microseconds(time_idle))

        print(msg)
def inspect_and_maybe_terminate_job_flows(
    conf_path=None,
    dry_run=False,
    max_hours_idle=None,
    mins_to_end_of_hour=None,
    now=None,
    pool_name=None,
    pooled_only=False,
    unpooled_only=False,
    max_mins_locked=None,
    quiet=False,
    **kwargs
):

    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(conf_path=conf_path, **kwargs)
    emr_conn = runner.make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # a list of tuples of job flow id, name, idle time (as a timedelta)
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # check if job flow is bootstrapping
        elif is_job_flow_bootstrapping(jf):
            num_bootstrapping += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif not is_job_flow_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            time_idle = now - time_last_active(jf)
            time_to_end_of_hour = est_time_to_hour(jf, now=now)
            _, pool = pool_hash_and_name(jf)
            pending = job_flow_has_pending_steps(jf)

            if pending:
                num_pending += 1
            else:
                num_idle += 1

            log.debug(
                'Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                      (jf.jobflowid,
                       'pending' if pending else 'idle',
                       strip_microseconds(time_idle),
                       strip_microseconds(time_to_end_of_hour),
                       ('unpooled' if pool is None else 'in %s pool' % pool),
                       jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None and
                time_idle <= timedelta(hours=max_hours_idle)):
                continue

            # mins_to_end_of_hour doesn't apply to jobs with pending steps
            if (mins_to_end_of_hour is not None and
                (pending or
                 time_to_end_of_hour >= timedelta(
                    minutes=mins_to_end_of_hour))):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append((jf, pending, time_idle, time_to_end_of_hour))

    log.info(
        'Job flow statuses: %d bootstrapping, %d running, %d pending, %d idle,'
        ' %d active non-streaming, %d done' % (
        num_running, num_bootstrapping, num_pending, num_idle,
        num_non_streaming, num_done))

    terminate_and_notify(runner, to_terminate, dry_run=dry_run,
                         max_mins_locked=max_mins_locked, quiet=quiet)
Exemplo n.º 16
0
def print_report(stats, now=None):
    """Print final report.

    :param stats: a dictionary returned by :py:func:`clusters_to_stats`
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    """
    if now is None:
        now = datetime.utcnow()

    s = stats

    if not s['clusters']:
        print('No job flows created in the past two months!')
        return

    print('Total  # of Job Flows: %d' % len(s['clusters']))
    print()

    print('* All times are in UTC.')
    print()

    print('Min create time: %s' % min(cs['created'] for cs in s['clusters']))
    print('Max create time: %s' % max(cs['created'] for cs in s['clusters']))
    print('   Current time: %s' % now.replace(microsecond=0))
    print()

    print('* All usage is measured in Normalized Instance Hours, which are')
    print('  roughly equivalent to running an m1.medium instance for an hour.')
    print("  Billing is estimated, and may not match Amazon's system exactly.")
    print()

    # total compute-unit hours used
    def with_pct(usage):
        return (usage, percent(usage, s['nih_billed']))

    print('Total billed:  %9.2f  %5.1f%%' % with_pct(s['nih_billed']))
    print('  Total used:  %9.2f  %5.1f%%' % with_pct(s['nih_used']))
    print('    bootstrap: %9.2f  %5.1f%%' % with_pct(s['bootstrap_nih_used']))
    print('    jobs:      %9.2f  %5.1f%%' % with_pct(s['job_nih_used']))
    print('  Total waste: %9.2f  %5.1f%%' % with_pct(s['nih_bbnu']))
    print('    at end:    %9.2f  %5.1f%%' % with_pct(s['end_nih_bbnu']))
    print('    other:     %9.2f  %5.1f%%' % with_pct(s['other_nih_bbnu']))
    print()

    if s['date_to_nih_billed']:
        print('Daily statistics:')
        print()
        print(' date          billed      used     waste   % waste')
        d = max(s['date_to_nih_billed'])
        while d >= min(s['date_to_nih_billed']):
            print(' %10s %9.2f %9.2f %9.2f     %5.1f' % (
                d,
                s['date_to_nih_billed'].get(d, 0.0),
                s['date_to_nih_used'].get(d, 0.0),
                s['date_to_nih_bbnu'].get(d, 0.0),
                percent(s['date_to_nih_bbnu'].get(d, 0.0),
                        s['date_to_nih_billed'].get(d, 0.0))))
            d -= timedelta(days=1)
        print()

    if s['hour_to_nih_billed']:
        print('Hourly statistics:')
        print()
        print(' hour              billed      used     waste   % waste')
        h = max(s['hour_to_nih_billed'])
        while h >= min(s['hour_to_nih_billed']):
            print(' %13s  %9.2f %9.2f %9.2f     %5.1f' % (
                h.strftime('%Y-%m-%d %H'),
                s['hour_to_nih_billed'].get(h, 0.0),
                s['hour_to_nih_used'].get(h, 0.0),
                s['hour_to_nih_bbnu'].get(h, 0.0),
                percent(s['hour_to_nih_bbnu'].get(h, 0.0),
                        s['hour_to_nih_billed'].get(h, 0.0))))
            h -= timedelta(hours=1)
        print()

    print('* Job flows are considered to belong to the user and job that')
    print('  started them or last ran on them.')
    print()

    # Top jobs
    print('Top jobs, by total time used:')
    for label, nih_used in sorted(s['label_to_nih_used'].items(),
                                  key=lambda lb_nih: (-lb_nih[1], lb_nih[0])):
        print('  %9.2f %s' % (nih_used, label))
    print()

    print('Top jobs, by time billed but not used:')
    for label, nih_bbnu in sorted(s['label_to_nih_bbnu'].items(),
                                  key=lambda lb_nih1: (-lb_nih1[1], lb_nih1[0])):
        print('  %9.2f %s' % (nih_bbnu, label))
    print()

    # Top users
    print('Top users, by total time used:')
    for owner, nih_used in sorted(s['owner_to_nih_used'].items(),
                                  key=lambda o_nih: (-o_nih[1], o_nih[0])):
        print('  %9.2f %s' % (nih_used, owner))
    print()

    print('Top users, by time billed but not used:')
    for owner, nih_bbnu in sorted(s['owner_to_nih_bbnu'].items(),
                                  key=lambda o_nih2: (-o_nih2[1], o_nih2[0])):
        print('  %9.2f %s' % (nih_bbnu, owner))
    print()

    # Top job steps
    print('Top job steps, by total time used (step number first):')
    for (label, step_num), nih_used in sorted(
            s['job_step_to_nih_used'].items(),
            key=lambda k_nih: (-k_nih[1], k_nih[0])):

        if label:
            print('  %9.2f %3d %s' % (nih_used, step_num, label))
        else:
            print('  %9.2f     (non-mrjob step)' % (nih_used,))
    print()

    print('Top job steps, by total time billed but not used (un-pooled only):')
    for (label, step_num), nih_bbnu in sorted(
            s['job_step_to_nih_bbnu_no_pool'].items(),
            key=lambda k_nih3: (-k_nih3[1], k_nih3[0])):

        if label:
            print('  %9.2f %3d %s' % (nih_bbnu, step_num, label))
        else:
            print('  %9.2f     (non-mrjob step)' % (nih_bbnu,))
    print()

    # Top pools
    print('All pools, by total time billed:')
    for pool, nih_billed in sorted(s['pool_to_nih_billed'].items(),
                                   key=lambda p_nih: (-p_nih[1], p_nih[0])):
        print('  %9.2f %s' % (nih_billed, pool or '(not pooled)'))
    print()

    print('All pools, by total time billed but not used:')
    for pool, nih_bbnu in sorted(s['pool_to_nih_bbnu'].items(),
                                 key=lambda p_nih4: (-p_nih4[1], p_nih4[0])):
        print('  %9.2f %s' % (nih_bbnu, pool or '(not pooled)'))
    print()

    # Top job flows
    print('All job flows, by total time billed:')
    top_clusters = sorted(s['clusters'],
                          key=lambda cs: (-cs['nih_billed'], cs['name']))
    for cs in top_clusters:
        print('  %9.2f %-15s %s' % (
            cs['nih_billed'], cs['id'], cs['name']))
    print()

    print('All job flows, by time billed but not used:')
    top_clusters_bbnu = sorted(
        s['clusters'], key=lambda cs: (-cs['nih_bbnu'], cs['name']))
    for cs in top_clusters_bbnu:
        print('  %9.2f %-15s %s' % (
            cs['nih_bbnu'], cs['id'], cs['name']))
    print()

    # Details
    print('Details for all job flows:')
    print()
    print(' id              state                  created             steps'
          '        time ran     billed    waste   user   name')

    all_clusters = sorted(s['clusters'], key=lambda cs: cs['created'],
                          reverse=True)

    for cs in all_clusters:
        print(' %-15s %-22s %19s %3d %17s %9.2f %9.2f %8s %s' % (
            cs['id'], cs['state'], cs['created'], cs['num_steps'],
            strip_microseconds(cs['ran']), cs['nih_used'], cs['nih_bbnu'],
            (cs['owner'] or ''), (cs['label'] or ('not started by mrjob'))))
Exemplo n.º 17
0
def print_report(stats, now=None):
    """Print final report.

    :param stats: a dictionary returned by :py:func:`job_flows_to_stats`
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    """
    if now is None:
        now = datetime.utcnow()

    s = stats

    if not s['flows']:
        print 'No job flows created in the past two months!'
        return

    print 'Total  # of Job Flows: %d' % len(s['flows'])
    print

    print '* All times are in UTC.'
    print

    print 'Min create time: %s' % min(jf['created'] for jf in s['flows'])
    print 'Max create time: %s' % max(jf['created'] for jf in s['flows'])
    print '   Current time: %s' % now.replace(microsecond=0)
    print

    print '* All usage is measured in Normalized Instance Hours, which are'
    print '  roughly equivalent to running an m1.small instance for an hour.'
    print "  Billing is estimated, and may not match Amazon's system exactly."
    print

    # total compute-unit hours used
    def with_pct(usage):
        return (usage, percent(usage, s['nih_billed']))

    print 'Total billed:  %9.2f  %5.1f%%' % with_pct(s['nih_billed'])
    print '  Total used:  %9.2f  %5.1f%%' % with_pct(s['nih_used'])
    print '    bootstrap: %9.2f  %5.1f%%' % with_pct(s['bootstrap_nih_used'])
    print '    jobs:      %9.2f  %5.1f%%' % with_pct(s['job_nih_used'])
    print '  Total waste: %9.2f  %5.1f%%' % with_pct(s['nih_bbnu'])
    print '    at end:    %9.2f  %5.1f%%' % with_pct(s['end_nih_bbnu'])
    print '    other:     %9.2f  %5.1f%%' % with_pct(s['other_nih_bbnu'])
    print

    if s['date_to_nih_billed']:
        print 'Daily statistics:'
        print
        print ' date          billed      used     waste   % waste'
        d = max(s['date_to_nih_billed'])
        while d >= min(s['date_to_nih_billed']):
            print ' %10s %9.2f %9.2f %9.2f     %5.1f' % (
                d, s['date_to_nih_billed'][d], s['date_to_nih_used'].get(
                    d, 0.0), s['date_to_nih_bbnu'].get(d, 0.0),
                percent(s['date_to_nih_bbnu'].get(d, 0.0),
                        s['date_to_nih_billed'][d]))
            d -= timedelta(days=1)
        print

    print '* Job flows are considered to belong to the user and job that'
    print '  started them or last ran on them.'
    print

    # Top jobs
    print 'Top jobs, by total time used:'
    for label, nih_used in sorted(s['label_to_nih_used'].iteritems(),
                                  key=lambda (lb, nih): (-nih, lb)):
        print '  %9.2f %s' % (nih_used, label)
    print

    print 'Top jobs, by time billed but not used:'
    for label, nih_bbnu in sorted(s['label_to_nih_bbnu'].iteritems(),
                                  key=lambda (lb, nih): (-nih, lb)):
        print '  %9.2f %s' % (nih_bbnu, label)
    print

    # Top users
    print 'Top users, by total time used:'
    for owner, nih_used in sorted(s['owner_to_nih_used'].iteritems(),
                                  key=lambda (o, nih): (-nih, o)):
        print '  %9.2f %s' % (nih_used, owner)
    print

    print 'Top users, by time billed but not used:'
    for owner, nih_bbnu in sorted(s['owner_to_nih_bbnu'].iteritems(),
                                  key=lambda (o, nih): (-nih, o)):
        print '  %9.2f %s' % (nih_bbnu, owner)
    print

    # Top job steps
    print 'Top job steps, by total time used (step number first):'
    for (label,
         step_num), nih_used in sorted(s['job_step_to_nih_used'].iteritems(),
                                       key=lambda (k, nih): (-nih, k)):
        if label:
            print '  %9.2f %3d %s' % (nih_used, step_num, label)
        else:
            print '  %9.2f     (non-mrjob step)' % (nih_used, )
    print

    print 'Top job steps, by total time billed but not used (un-pooled only):'
    for (label, step_num), nih_bbnu in sorted(
            s['job_step_to_nih_bbnu_no_pool'].iteritems(),
            key=lambda (k, nih): (-nih, k)):

        if label:
            print '  %9.2f %3d %s' % (nih_bbnu, step_num, label)
        else:
            print '  %9.2f     (non-mrjob step)' % (nih_bbnu, )
    print

    # Top pools
    print 'All pools, by total time billed:'
    for pool, nih_billed in sorted(s['pool_to_nih_billed'].iteritems(),
                                   key=lambda (p, nih): (-nih, p)):
        print '  %9.2f %s' % (nih_billed, pool or '(not pooled)')
    print

    print 'All pools, by total time billed but not used:'
    for pool, nih_bbnu in sorted(s['pool_to_nih_bbnu'].iteritems(),
                                 key=lambda (p, nih): (-nih, p)):
        print '  %9.2f %s' % (nih_bbnu, pool or '(not pooled)')
    print

    # Top job flows
    print 'All job flows, by total time billed:'
    top_job_flows = sorted(s['flows'],
                           key=lambda jf: (-jf['nih_billed'], jf['name']))
    for jf in top_job_flows:
        print '  %9.2f %-15s %s' % (jf['nih_billed'], jf['id'], jf['name'])
    print

    print 'All job flows, by time billed but not used:'
    top_job_flows_bbnu = sorted(s['flows'],
                                key=lambda jf: (-jf['nih_bbnu'], jf['name']))
    for jf in top_job_flows_bbnu:
        print '  %9.2f %-15s %s' % (jf['nih_bbnu'], jf['id'], jf['name'])
    print

    # Details
    print 'Details for all job flows:'
    print
    print(
        ' id              state         created             steps'
        '        time ran     billed    waste   user   name')

    all_job_flows = sorted(s['flows'],
                           key=lambda jf: jf['created'],
                           reverse=True)
    for jf in all_job_flows:
        print ' %-15s %-13s %19s %3d %17s %9.2f %9.2f %8s %s' % (
            jf['id'], jf['state'], jf['created'], jf['num_steps'],
            strip_microseconds(jf['ran']), jf['nih_used'], jf['nih_bbnu'],
            (jf['owner'] or ''), (jf['label'] or ('not started by mrjob')))
Exemplo n.º 18
0
def _maybe_terminate_clusters(dry_run=False,
                              max_hours_idle=None,
                              mins_to_end_of_hour=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = _DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    num_starting = 0
    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_pending = 0
    num_running = 0

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for cluster_summary in _boto3_paginate('Clusters', emr_client,
                                           'list_clusters'):

        cluster_id = cluster_summary['Id']

        # check if cluster is done
        if _is_cluster_done(cluster_summary):
            num_done += 1
            continue

        # check if cluster is starting
        if _is_cluster_starting(cluster_summary):
            num_starting += 1
            continue

        # check if cluster is bootstrapping
        if _is_cluster_bootstrapping(cluster_summary):
            num_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        steps = list(
            reversed(
                list(
                    _boto3_paginate('Steps',
                                    emr_client,
                                    'list_steps',
                                    ClusterId=cluster_id))))

        if any(_is_step_running(step) for step in steps):
            num_running += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(cluster_summary, steps)
        time_to_end_of_hour = _est_time_to_hour(cluster_summary, now=now)
        is_pending = _cluster_has_pending_steps(steps)

        # need to get actual cluster to see tags
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']

        _, pool = _pool_hash_and_name(cluster)

        if is_pending:
            num_pending += 1
        else:
            num_idle += 1

        log.debug('cluster %s %s for %s, %s to end of hour, %s (%s)' %
                  (cluster_id, 'pending' if is_pending else 'idle',
                   strip_microseconds(time_idle),
                   strip_microseconds(time_to_end_of_hour),
                   ('unpooled' if pool is None else 'in %s pool' % pool),
                   cluster_summary['Name']))

        # filter out clusters that don't meet our criteria
        if (max_hours_idle is not None
                and time_idle <= timedelta(hours=max_hours_idle)):
            continue

        # mins_to_end_of_hour doesn't apply to jobs with pending steps
        if (mins_to_end_of_hour is not None and
            (is_pending or
             time_to_end_of_hour >= timedelta(minutes=mins_to_end_of_hour))):
            continue

        if (pooled_only and pool is None):
            continue

        if (unpooled_only and pool is not None):
            continue

        if (pool_name is not None and pool != pool_name):
            continue

        # terminate idle cluster
        _terminate_and_notify(runner=runner,
                              cluster_id=cluster_id,
                              cluster_name=cluster_summary['Name'],
                              num_steps=len(steps),
                              is_pending=is_pending,
                              time_idle=time_idle,
                              time_to_end_of_hour=time_to_end_of_hour,
                              dry_run=dry_run,
                              max_mins_locked=max_mins_locked,
                              quiet=quiet)

    log.info('Cluster statuses: %d starting, %d bootstrapping, %d running,'
             ' %d pending, %d idle, %d done' %
             (num_starting, num_bootstrapping, num_running, num_pending,
              num_idle, num_done))
Exemplo n.º 19
0
def inspect_and_maybe_terminate_job_flows(
    conf_path=None,
    dry_run=False,
    max_hours_idle=None,
    mins_to_end_of_hour=None,
    now=None,
    pool_name=None,
    pooled_only=False,
    unpooled_only=False,
):

    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    emr_conn = EMRJobRunner(conf_path=conf_path).make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_running = 0
    num_idle = 0
    num_done = 0
    num_non_streaming = 0
    # a list of tuples of job flow id, name, idle time (as a timedelta)
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif is_job_flow_non_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            num_idle += 1
            time_idle = time_job_flow_idle(jf, now=now)
            time_to_end_of_hour = time_to_end_of_hour_for_job_flow(jf, now=now)
            pool = job_flow_pool_name(jf)

            log.debug(
                'Job flow %-15s idle for %s, %s to end of hour, %s (%s)' %
                (jf.jobflowid, strip_microseconds(time_idle),
                 strip_microseconds(time_to_end_of_hour),
                 ('unpooled' if pool is None else 'in %s pool' % pool),
                 jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None
                    and time_idle <= timedelta(hours=max_hours_idle)):
                continue

            if (mins_to_end_of_hour is not None and time_to_end_of_hour >=
                    timedelta(minutes=mins_to_end_of_hour)):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append(
                (jf.jobflowid, jf.name, time_idle, time_to_end_of_hour))

    log.info('Job flow statuses: %d running, %d idle, %d active non-streaming,'
             ' %d done' % (num_running, num_idle, num_non_streaming, num_done))

    terminate_and_notify(emr_conn, to_terminate, dry_run=dry_run)
Exemplo n.º 20
0
def _maybe_terminate_clusters(dry_run=False,
                              max_hours_idle=None,
                              mins_to_end_of_hour=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = _DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_conn = runner.make_emr_conn()

    num_starting = 0
    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_pending = 0
    num_running = 0

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for cluster_summary in _yield_all_clusters(emr_conn):
        cluster_id = cluster_summary.id

        # check if cluster is done
        if _is_cluster_done(cluster_summary):
            num_done += 1
            continue

        # check if cluster is starting
        if _is_cluster_starting(cluster_summary):
            num_starting += 1
            continue

        # check if cluster is bootstrapping
        if _is_cluster_bootstrapping(cluster_summary):
            num_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        steps = _list_all_steps(emr_conn, cluster_id)

        if any(_is_step_running(step) for step in steps):
            num_running += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(cluster_summary, steps)
        time_to_end_of_hour = _est_time_to_hour(cluster_summary, now=now)
        is_pending = _cluster_has_pending_steps(steps)

        bootstrap_actions = list(_yield_all_bootstrap_actions(
            emr_conn, cluster_id))
        _, pool = _pool_hash_and_name(bootstrap_actions)

        if is_pending:
            num_pending += 1
        else:
            num_idle += 1

        log.debug(
            'cluster %s %s for %s, %s to end of hour, %s (%s)' %
            (cluster_id,
             'pending' if is_pending else 'idle',
             strip_microseconds(time_idle),
             strip_microseconds(time_to_end_of_hour),
             ('unpooled' if pool is None else 'in %s pool' % pool),
             cluster_summary.name))

        # filter out clusters that don't meet our criteria
        if (max_hours_idle is not None and
                time_idle <= timedelta(hours=max_hours_idle)):
            continue

        # mins_to_end_of_hour doesn't apply to jobs with pending steps
        if (mins_to_end_of_hour is not None and
            (is_pending or
             time_to_end_of_hour >= timedelta(
                minutes=mins_to_end_of_hour))):
            continue

        if (pooled_only and pool is None):
            continue

        if (unpooled_only and pool is not None):
            continue

        if (pool_name is not None and pool != pool_name):
            continue

        # terminate idle cluster
        _terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=cluster_summary.name,
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            time_to_end_of_hour=time_to_end_of_hour,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info(
        'Cluster statuses: %d starting, %d bootstrapping, %d running,'
        ' %d pending, %d idle, %d done' % (
            num_starting, num_bootstrapping, num_running,
            num_pending, num_idle, num_done))
Exemplo n.º 21
0
def _print_report(stats, now=None):
    """Print final report.

    :param stats: a dictionary returned by :py:func:`_clusters_to_stats`
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    """
    if now is None:
        now = datetime.utcnow()

    s = stats

    if not s['clusters']:
        print('No clusters created in the past two months!')
        return

    print('Total  # of Clusters: %d' % len(s['clusters']))
    print()

    print('* All times are in UTC.')
    print()

    print('Min create time: %s' % min(cs['created'] for cs in s['clusters']))
    print('Max create time: %s' % max(cs['created'] for cs in s['clusters']))
    print('   Current time: %s' % now.replace(microsecond=0))
    print()

    print('* All usage is measured in Normalized Instance Hours, which are')
    print('  roughly equivalent to running an m1.medium instance for an hour.')
    print("  Billing is estimated, and may not match Amazon's system exactly.")
    print()

    # total compute-unit hours used
    def with_pct(usage):
        return (usage, _percent(usage, s['nih_billed']))

    print('Total billed:  %9.2f  %5.1f%%' % with_pct(s['nih_billed']))
    print('  Total used:  %9.2f  %5.1f%%' % with_pct(s['nih_used']))
    print('    bootstrap: %9.2f  %5.1f%%' % with_pct(s['bootstrap_nih_used']))
    print('    jobs:      %9.2f  %5.1f%%' % with_pct(s['job_nih_used']))
    print('  Total waste: %9.2f  %5.1f%%' % with_pct(s['nih_bbnu']))
    print('    at end:    %9.2f  %5.1f%%' % with_pct(s['end_nih_bbnu']))
    print('    other:     %9.2f  %5.1f%%' % with_pct(s['other_nih_bbnu']))
    print()

    if s['date_to_nih_billed']:
        print('Daily statistics:')
        print()
        print(' date          billed      used     waste   % waste')
        d = max(s['date_to_nih_billed'])
        while d >= min(s['date_to_nih_billed']):
            print(' %10s %9.2f %9.2f %9.2f     %5.1f' %
                  (d, s['date_to_nih_billed'].get(
                      d, 0.0), s['date_to_nih_used'].get(
                          d, 0.0), s['date_to_nih_bbnu'].get(d, 0.0),
                   _percent(s['date_to_nih_bbnu'].get(d, 0.0),
                            s['date_to_nih_billed'].get(d, 0.0))))
            d -= timedelta(days=1)
        print()

    if s['hour_to_nih_billed']:
        print('Hourly statistics:')
        print()
        print(' hour              billed      used     waste   % waste')
        h = max(s['hour_to_nih_billed'])
        while h >= min(s['hour_to_nih_billed']):
            print(' %13s  %9.2f %9.2f %9.2f     %5.1f' %
                  (h.strftime('%Y-%m-%d %H'), s['hour_to_nih_billed'].get(
                      h, 0.0), s['hour_to_nih_used'].get(
                          h, 0.0), s['hour_to_nih_bbnu'].get(h, 0.0),
                   _percent(s['hour_to_nih_bbnu'].get(h, 0.0),
                            s['hour_to_nih_billed'].get(h, 0.0))))
            h -= timedelta(hours=1)
        print()

    print('* clusters are considered to belong to the user and job that')
    print('  started them or last ran on them.')
    print()

    # Top jobs
    print('Top jobs, by total time used:')
    for label, nih_used in sorted(s['label_to_nih_used'].items(),
                                  key=lambda lb_nih: (-lb_nih[1], lb_nih[0])):
        print('  %9.2f %s' % (nih_used, label))
    print()

    print('Top jobs, by time billed but not used:')
    for label, nih_bbnu in sorted(s['label_to_nih_bbnu'].items(),
                                  key=lambda lb_nih1:
                                  (-lb_nih1[1], lb_nih1[0])):
        print('  %9.2f %s' % (nih_bbnu, label))
    print()

    # Top users
    print('Top users, by total time used:')
    for owner, nih_used in sorted(s['owner_to_nih_used'].items(),
                                  key=lambda o_nih: (-o_nih[1], o_nih[0])):
        print('  %9.2f %s' % (nih_used, owner))
    print()

    print('Top users, by time billed but not used:')
    for owner, nih_bbnu in sorted(s['owner_to_nih_bbnu'].items(),
                                  key=lambda o_nih2: (-o_nih2[1], o_nih2[0])):
        print('  %9.2f %s' % (nih_bbnu, owner))
    print()

    # Top job steps
    print('Top job steps, by total time used (step number first):')
    for (label,
         step_num), nih_used in sorted(s['job_step_to_nih_used'].items(),
                                       key=lambda k_nih:
                                       (-k_nih[1], k_nih[0])):

        if label:
            print('  %9.2f %3d %s' % (nih_used, step_num, label))
        else:
            print('  %9.2f     (non-mrjob step)' % (nih_used, ))
    print()

    print('Top job steps, by total time billed but not used (un-pooled only):')
    for (label, step_num), nih_bbnu in sorted(
            s['job_step_to_nih_bbnu_no_pool'].items(),
            key=lambda k_nih3: (-k_nih3[1], k_nih3[0])):

        if label:
            print('  %9.2f %3d %s' % (nih_bbnu, step_num, label))
        else:
            print('  %9.2f     (non-mrjob step)' % (nih_bbnu, ))
    print()

    # Top pools
    print('All pools, by total time billed:')
    for pool, nih_billed in sorted(s['pool_to_nih_billed'].items(),
                                   key=lambda p_nih: (-p_nih[1], p_nih[0])):
        print('  %9.2f %s' % (nih_billed, pool or '(not pooled)'))
    print()

    print('All pools, by total time billed but not used:')
    for pool, nih_bbnu in sorted(s['pool_to_nih_bbnu'].items(),
                                 key=lambda p_nih4: (-p_nih4[1], p_nih4[0])):
        print('  %9.2f %s' % (nih_bbnu, pool or '(not pooled)'))
    print()

    # Top clusters
    print('All clusters, by total time billed:')
    top_clusters = sorted(s['clusters'],
                          key=lambda cs: (-cs['nih_billed'], cs['name']))
    for cs in top_clusters:
        print('  %9.2f %-15s %s' % (cs['nih_billed'], cs['id'], cs['name']))
    print()

    print('All clusters, by time billed but not used:')
    top_clusters_bbnu = sorted(s['clusters'],
                               key=lambda cs: (-cs['nih_bbnu'], cs['name']))
    for cs in top_clusters_bbnu:
        print('  %9.2f %-15s %s' % (cs['nih_bbnu'], cs['id'], cs['name']))
    print()

    # Details
    print('Details for all clusters:')
    print()
    print(' id              state                  created             steps'
          '        time ran     billed    waste   user   name')

    all_clusters = sorted(s['clusters'],
                          key=lambda cs: cs['created'],
                          reverse=True)

    for cs in all_clusters:
        print(' %-15s %-22s %19s %3d %17s %9.2f %9.2f %8s %s' %
              (cs['id'], cs['state'], cs['created'], cs['num_steps'],
               strip_microseconds(cs['ran']), cs['nih_used'], cs['nih_bbnu'],
               (cs['owner'] or ''), (cs['label'] or ('not started by mrjob'))))
Exemplo n.º 22
0
def inspect_and_maybe_terminate_job_flows(conf_paths=None,
                                          dry_run=False,
                                          max_hours_idle=None,
                                          mins_to_end_of_hour=None,
                                          now=None,
                                          pool_name=None,
                                          pooled_only=False,
                                          unpooled_only=False,
                                          max_mins_locked=None,
                                          quiet=False,
                                          **kwargs):

    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(conf_paths=conf_paths, **kwargs)
    emr_conn = runner.make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # a list of tuples of job flow id, name, idle time (as a timedelta)
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # check if job flow is bootstrapping
        elif is_job_flow_bootstrapping(jf):
            num_bootstrapping += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif not is_job_flow_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            time_idle = now - time_last_active(jf)
            time_to_end_of_hour = est_time_to_hour(jf, now=now)
            _, pool = pool_hash_and_name(jf)
            pending = job_flow_has_pending_steps(jf)

            if pending:
                num_pending += 1
            else:
                num_idle += 1

            log.debug('Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                      (jf.jobflowid, 'pending' if pending else 'idle',
                       strip_microseconds(time_idle),
                       strip_microseconds(time_to_end_of_hour),
                       ('unpooled' if pool is None else 'in %s pool' % pool),
                       jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None
                    and time_idle <= timedelta(hours=max_hours_idle)):

                continue

            # mins_to_end_of_hour doesn't apply to jobs with pending steps
            if (mins_to_end_of_hour is not None
                    and (pending or time_to_end_of_hour >=
                         timedelta(minutes=mins_to_end_of_hour))):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append((jf, pending, time_idle, time_to_end_of_hour))

    log.info(
        'Job flow statuses: %d bootstrapping, %d running, %d pending, %d idle,'
        ' %d active non-streaming, %d done' %
        (num_running, num_bootstrapping, num_pending, num_idle,
         num_non_streaming, num_done))

    terminate_and_notify(runner,
                         to_terminate,
                         dry_run=dry_run,
                         max_mins_locked=max_mins_locked,
                         quiet=quiet)
Exemplo n.º 23
0
def print_report(stats, now=None):
    """Print final report.

    :param stats: a dictionary returned by :py:func:`job_flows_to_stats`
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    """
    if now is None:
        now = datetime.utcnow()

    s = stats

    if not s["flows"]:
        print "No job flows created in the past two months!"
        return

    print "Total  # of Job Flows: %d" % len(s["flows"])
    print

    print "* All times are in UTC."
    print

    print "Min create time: %s" % min(jf["created"] for jf in s["flows"])
    print "Max create time: %s" % max(jf["created"] for jf in s["flows"])
    print "   Current time: %s" % now.replace(microsecond=0)
    print

    print "* All usage is measured in Normalized Instance Hours, which are"
    print "  roughly equivalent to running an m1.small instance for an hour."
    print "  Billing is estimated, and may not match Amazon's system exactly."
    print

    # total compute-unit hours used
    def with_pct(usage):
        return (usage, percent(usage, s["nih_billed"]))

    print "Total billed:  %9.2f  %5.1f%%" % with_pct(s["nih_billed"])
    print "  Total used:  %9.2f  %5.1f%%" % with_pct(s["nih_used"])
    print "    bootstrap: %9.2f  %5.1f%%" % with_pct(s["bootstrap_nih_used"])
    print "    jobs:      %9.2f  %5.1f%%" % with_pct(s["job_nih_used"])
    print "  Total waste: %9.2f  %5.1f%%" % with_pct(s["nih_bbnu"])
    print "    at end:    %9.2f  %5.1f%%" % with_pct(s["end_nih_bbnu"])
    print "    other:     %9.2f  %5.1f%%" % with_pct(s["other_nih_bbnu"])
    print

    if s["date_to_nih_billed"]:
        print "Daily statistics:"
        print
        print " date          billed      used     waste   % waste"
        d = max(s["date_to_nih_billed"])
        while d >= min(s["date_to_nih_billed"]):
            print " %10s %9.2f %9.2f %9.2f     %5.1f" % (
                d,
                s["date_to_nih_billed"].get(d, 0.0),
                s["date_to_nih_used"].get(d, 0.0),
                s["date_to_nih_bbnu"].get(d, 0.0),
                percent(s["date_to_nih_bbnu"].get(d, 0.0), s["date_to_nih_billed"].get(d, 0.0)),
            )
            d -= timedelta(days=1)
        print

    if s["hour_to_nih_billed"]:
        print "Hourly statistics:"
        print
        print " hour              billed      used     waste   % waste"
        h = max(s["hour_to_nih_billed"])
        while h >= min(s["hour_to_nih_billed"]):
            print " %13s  %9.2f %9.2f %9.2f     %5.1f" % (
                h.strftime("%Y-%m-%d %H"),
                s["hour_to_nih_billed"].get(h, 0.0),
                s["hour_to_nih_used"].get(h, 0.0),
                s["hour_to_nih_bbnu"].get(h, 0.0),
                percent(s["hour_to_nih_bbnu"].get(h, 0.0), s["hour_to_nih_billed"].get(h, 0.0)),
            )
            h -= timedelta(hours=1)
        print

    print "* Job flows are considered to belong to the user and job that"
    print "  started them or last ran on them."
    print

    # Top jobs
    print "Top jobs, by total time used:"
    for label, nih_used in sorted(s["label_to_nih_used"].iteritems(), key=lambda (lb, nih): (-nih, lb)):
        print "  %9.2f %s" % (nih_used, label)
    print

    print "Top jobs, by time billed but not used:"
    for label, nih_bbnu in sorted(s["label_to_nih_bbnu"].iteritems(), key=lambda (lb, nih): (-nih, lb)):
        print "  %9.2f %s" % (nih_bbnu, label)
    print

    # Top users
    print "Top users, by total time used:"
    for owner, nih_used in sorted(s["owner_to_nih_used"].iteritems(), key=lambda (o, nih): (-nih, o)):
        print "  %9.2f %s" % (nih_used, owner)
    print

    print "Top users, by time billed but not used:"
    for owner, nih_bbnu in sorted(s["owner_to_nih_bbnu"].iteritems(), key=lambda (o, nih): (-nih, o)):
        print "  %9.2f %s" % (nih_bbnu, owner)
    print

    # Top job steps
    print "Top job steps, by total time used (step number first):"
    for (label, step_num), nih_used in sorted(s["job_step_to_nih_used"].iteritems(), key=lambda (k, nih): (-nih, k)):
        if label:
            print "  %9.2f %3d %s" % (nih_used, step_num, label)
        else:
            print "  %9.2f     (non-mrjob step)" % (nih_used,)
    print

    print "Top job steps, by total time billed but not used (un-pooled only):"
    for (label, step_num), nih_bbnu in sorted(
        s["job_step_to_nih_bbnu_no_pool"].iteritems(), key=lambda (k, nih): (-nih, k)
    ):

        if label:
            print "  %9.2f %3d %s" % (nih_bbnu, step_num, label)
        else:
            print "  %9.2f     (non-mrjob step)" % (nih_bbnu,)
    print

    # Top pools
    print "All pools, by total time billed:"
    for pool, nih_billed in sorted(s["pool_to_nih_billed"].iteritems(), key=lambda (p, nih): (-nih, p)):
        print "  %9.2f %s" % (nih_billed, pool or "(not pooled)")
    print

    print "All pools, by total time billed but not used:"
    for pool, nih_bbnu in sorted(s["pool_to_nih_bbnu"].iteritems(), key=lambda (p, nih): (-nih, p)):
        print "  %9.2f %s" % (nih_bbnu, pool or "(not pooled)")
    print

    # Top job flows
    print "All job flows, by total time billed:"
    top_job_flows = sorted(s["flows"], key=lambda jf: (-jf["nih_billed"], jf["name"]))
    for jf in top_job_flows:
        print "  %9.2f %-15s %s" % (jf["nih_billed"], jf["id"], jf["name"])
    print

    print "All job flows, by time billed but not used:"
    top_job_flows_bbnu = sorted(s["flows"], key=lambda jf: (-jf["nih_bbnu"], jf["name"]))
    for jf in top_job_flows_bbnu:
        print "  %9.2f %-15s %s" % (jf["nih_bbnu"], jf["id"], jf["name"])
    print

    # Details
    print "Details for all job flows:"
    print
    print (
        " id              state         created             steps" "        time ran     billed    waste   user   name"
    )

    all_job_flows = sorted(s["flows"], key=lambda jf: jf["created"], reverse=True)
    for jf in all_job_flows:
        print " %-15s %-13s %19s %3d %17s %9.2f %9.2f %8s %s" % (
            jf["id"],
            jf["state"],
            jf["created"],
            jf["num_steps"],
            strip_microseconds(jf["ran"]),
            jf["nih_used"],
            jf["nih_bbnu"],
            (jf["owner"] or ""),
            (jf["label"] or ("not started by mrjob")),
        )
Exemplo n.º 24
0
def inspect_and_maybe_terminate_job_flows(
    conf_path=None,
    dry_run=False,
    max_hours_idle=None,
    mins_to_end_of_hour=None,
    now=None,
    pool_name=None,
    pooled_only=False,
    unpooled_only=False,
):

    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    emr_conn = EMRJobRunner(conf_path=conf_path).make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_running = 0
    num_idle = 0
    num_done = 0
    num_non_streaming = 0
    # a list of tuples of job flow id, name, idle time (as a timedelta)
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif is_job_flow_non_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            num_idle += 1
            time_idle = time_job_flow_idle(jf, now=now)
            time_to_end_of_hour = time_to_end_of_hour_for_job_flow(jf, now=now)
            pool = job_flow_pool_name(jf)

            log.debug(
                'Job flow %-15s idle for %s, %s to end of hour, %s (%s)' %
                      (jf.jobflowid,
                       strip_microseconds(time_idle),
                       strip_microseconds(time_to_end_of_hour),
                       ('unpooled' if pool is None else 'in %s pool' % pool),
                       jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None and
                time_idle <= timedelta(hours=max_hours_idle)):
                continue

            if (mins_to_end_of_hour is not None and
                time_to_end_of_hour >=
                    timedelta(minutes=mins_to_end_of_hour)):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append(
                (jf.jobflowid, jf.name, time_idle, time_to_end_of_hour))

    log.info(
        'Job flow statuses: %d running, %d idle, %d active non-streaming,'
        ' %d done' % (num_running, num_idle, num_non_streaming, num_done))

    terminate_and_notify(emr_conn, to_terminate, dry_run=dry_run)
Exemplo n.º 25
0
def _round_up_to_next_second(td):
    """Round up to the next second because that's how EMR bills."""
    if td.microseconds:
        return strip_microseconds(td) + timedelta(seconds=1)
    else:
        return td
Exemplo n.º 26
0
def _maybe_terminate_clusters(dry_run=False,
                              max_mins_idle=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_mins_idle is None:
        max_mins_idle = _DEFAULT_MAX_MINS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    num_starting = 0
    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_pending = 0
    num_running = 0

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for cluster_summary in _boto3_paginate(
            'Clusters', emr_client, 'list_clusters'):

        cluster_id = cluster_summary['Id']

        # check if cluster is done
        if _is_cluster_done(cluster_summary):
            num_done += 1
            continue

        # check if cluster is starting
        if _is_cluster_starting(cluster_summary):
            num_starting += 1
            continue

        # check if cluster is bootstrapping
        if _is_cluster_bootstrapping(cluster_summary):
            num_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        steps = list(reversed(list(_boto3_paginate(
            'Steps', emr_client, 'list_steps',
            ClusterId=cluster_id))))

        if any(_is_step_running(step) for step in steps):
            num_running += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(cluster_summary, steps)
        is_pending = _cluster_has_pending_steps(steps)

        # need to get actual cluster to see tags
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']

        _, pool = _pool_hash_and_name(cluster)

        if is_pending:
            num_pending += 1
        else:
            num_idle += 1

        log.debug(
            'cluster %s %s for %s, %s (%s)' %
            (cluster_id,
             'pending' if is_pending else 'idle',
             strip_microseconds(time_idle),
             ('unpooled' if pool is None else 'in %s pool' % pool),
             cluster_summary['Name']))

        # filter out clusters that don't meet our criteria
        if (max_mins_idle is not None and
                time_idle <= timedelta(minutes=max_mins_idle)):
            continue

        if (pooled_only and pool is None):
            continue

        if (unpooled_only and pool is not None):
            continue

        if (pool_name is not None and pool != pool_name):
            continue

        # terminate idle cluster
        _terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=cluster_summary['Name'],
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info(
        'Cluster statuses: %d starting, %d bootstrapping, %d running,'
        ' %d pending, %d idle, %d done' % (
            num_starting, num_bootstrapping, num_running,
            num_pending, num_idle, num_done))
Exemplo n.º 27
0
def _maybe_terminate_clusters(dry_run=False,
                              max_mins_idle=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_mins_idle is None:
        max_mins_idle = _DEFAULT_MAX_MINS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    num_starting = 0
    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_pending = 0
    num_running = 0

    # include RUNNING to catch clusters with PENDING jobs that
    # never ran (see #365).
    for cluster_summary in _boto3_paginate(
            'Clusters',
            emr_client,
            'list_clusters',
            ClusterStates=['WAITING', 'RUNNING']):

        cluster_id = cluster_summary['Id']

        # check if cluster is done
        if _is_cluster_done(cluster_summary):
            num_done += 1
            continue

        # check if cluster is starting
        if _is_cluster_starting(cluster_summary):
            num_starting += 1
            continue

        # check if cluster is bootstrapping
        if _is_cluster_bootstrapping(cluster_summary):
            num_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        steps = list(
            reversed(
                list(
                    _boto3_paginate('Steps',
                                    emr_client,
                                    'list_steps',
                                    ClusterId=cluster_id))))

        if any(_is_step_running(step) for step in steps):
            num_running += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(cluster_summary, steps)
        is_pending = _cluster_has_pending_steps(steps)

        # need to get actual cluster to see tags
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']

        _, pool = _pool_hash_and_name(cluster)

        if is_pending:
            num_pending += 1
        else:
            num_idle += 1

        log.debug('cluster %s %s for %s, %s (%s) - %s' % (
            cluster_id,
            'pending' if is_pending else 'idle',
            strip_microseconds(time_idle),
            ('unpooled' if pool is None else 'in %s pool' % pool),
            cluster_summary['Name'],
            'protected' if cluster['TerminationProtected'] else 'unprotected',
        ))

        # filter out clusters that don't meet our criteria
        if (max_mins_idle is not None
                and time_idle <= timedelta(minutes=max_mins_idle)):
            continue

        if (pooled_only and pool is None):
            continue

        if (unpooled_only and pool is not None):
            continue

        if (pool_name is not None and pool != pool_name):
            continue

        if cluster['TerminationProtected']:
            continue

        # terminate idle cluster
        _terminate_and_notify(runner=runner,
                              cluster_id=cluster_id,
                              cluster_name=cluster_summary['Name'],
                              num_steps=len(steps),
                              is_pending=is_pending,
                              time_idle=time_idle,
                              dry_run=dry_run,
                              max_mins_locked=max_mins_locked,
                              quiet=quiet)

    log.info('Cluster statuses: %d starting, %d bootstrapping, %d running,'
             ' %d pending, %d idle, %d done' %
             (num_starting, num_bootstrapping, num_running, num_pending,
              num_idle, num_done))
Exemplo n.º 28
0
def _round_up_to_next_second(td):
    """Round up to the next second because that's how EMR bills."""
    if td.microseconds:
        return strip_microseconds(td) + timedelta(seconds=1)
    else:
        return td
Exemplo n.º 29
0
def _maybe_terminate_clusters(dry_run=False,
                              max_hours_idle=None,
                              mins_to_end_of_hour=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = _DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_conn = runner.make_emr_conn()

    num_starting = 0
    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for cluster_summary in _yield_all_clusters(emr_conn):
        cluster_id = cluster_summary.id

        # check if cluster is done
        if _is_cluster_done(cluster_summary):
            num_done += 1
            continue

        # check if cluster is starting
        if _is_cluster_starting(cluster_summary):
            num_starting += 1
            continue

        # check if cluster is bootstrapping
        if _is_cluster_bootstrapping(cluster_summary):
            num_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        steps = _list_all_steps(emr_conn, cluster_id)

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        if _is_cluster_non_streaming(steps):
            num_non_streaming += 1
            continue

        if any(_is_step_running(step) for step in steps):
            num_running += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(cluster_summary, steps)
        time_to_end_of_hour = _est_time_to_hour(cluster_summary, now=now)
        is_pending = _cluster_has_pending_steps(steps)

        bootstrap_actions = list(
            _yield_all_bootstrap_actions(emr_conn, cluster_id))
        _, pool = _pool_hash_and_name(bootstrap_actions)

        if is_pending:
            num_pending += 1
        else:
            num_idle += 1

        log.debug('cluster %s %s for %s, %s to end of hour, %s (%s)' %
                  (cluster_id, 'pending' if is_pending else 'idle',
                   strip_microseconds(time_idle),
                   strip_microseconds(time_to_end_of_hour),
                   ('unpooled' if pool is None else 'in %s pool' % pool),
                   cluster_summary.name))

        # filter out clusters that don't meet our criteria
        if (max_hours_idle is not None
                and time_idle <= timedelta(hours=max_hours_idle)):
            continue

        # mins_to_end_of_hour doesn't apply to jobs with pending steps
        if (mins_to_end_of_hour is not None and
            (is_pending or
             time_to_end_of_hour >= timedelta(minutes=mins_to_end_of_hour))):
            continue

        if (pooled_only and pool is None):
            continue

        if (unpooled_only and pool is not None):
            continue

        if (pool_name is not None and pool != pool_name):
            continue

        # terminate idle cluster
        _terminate_and_notify(runner=runner,
                              cluster_id=cluster_id,
                              cluster_name=cluster_summary.name,
                              num_steps=len(steps),
                              is_pending=is_pending,
                              time_idle=time_idle,
                              time_to_end_of_hour=time_to_end_of_hour,
                              dry_run=dry_run,
                              max_mins_locked=max_mins_locked,
                              quiet=quiet)

    log.info('Cluster statuses: %d starting, %d bootstrapping, %d running,'
             ' %d pending, %d idle, %d active non-streaming, %d done' %
             (num_starting, num_bootstrapping, num_running, num_pending,
              num_idle, num_non_streaming, num_done))
Exemplo n.º 30
0
def print_report(stats, now=None):
    """Print final report.

    :param stats: a dictionary returned by :py:func:`job_flows_to_stats`
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    """
    if now is None:
        now = datetime.utcnow()

    s = stats

    if not s['flows']:
        print 'No job flows created in the past two months!'
        return

    print 'Total  # of Job Flows: %d' % len(s['flows'])
    print

    print '* All times are in UTC.'
    print

    print 'Min create time: %s' % min(jf['created'] for jf in s['flows'])
    print 'Max create time: %s' % max(jf['created'] for jf in s['flows'])
    print '   Current time: %s' % now.replace(microsecond=0)
    print

    print '* All usage is measured in Normalized Instance Hours, which are'
    print '  roughly equivalent to running an m1.small instance for an hour.'
    print "  Billing is estimated, and may not match Amazon's system exactly."
    print

    # total compute-unit hours used
    def with_pct(usage):
        return (usage, percent(usage, s['nih_billed']))

    print 'Total billed:  %9.2f  %5.1f%%' % with_pct(s['nih_billed'])
    print '  Total used:  %9.2f  %5.1f%%' % with_pct(s['nih_used'])
    print '    bootstrap: %9.2f  %5.1f%%' % with_pct(s['bootstrap_nih_used'])
    print '    jobs:      %9.2f  %5.1f%%' % with_pct(s['job_nih_used'])
    print '  Total waste: %9.2f  %5.1f%%' % with_pct(s['nih_bbnu'])
    print '    at end:    %9.2f  %5.1f%%' % with_pct(s['end_nih_bbnu'])
    print '    other:     %9.2f  %5.1f%%' % with_pct(s['other_nih_bbnu'])
    print

    if s['date_to_nih_billed']:
        print 'Daily statistics:'
        print
        print ' date          billed      used     waste   % waste'
        d = max(s['date_to_nih_billed'])
        while d >= min(s['date_to_nih_billed']):
            print ' %10s %9.2f %9.2f %9.2f     %5.1f' % (
                d,
                s['date_to_nih_billed'][d],
                s['date_to_nih_used'].get(d, 0.0),
                s['date_to_nih_bbnu'].get(d, 0.0),
                percent(s['date_to_nih_bbnu'].get(d, 0.0),
                        s['date_to_nih_billed'][d]))
            d -= timedelta(days=1)
        print

    print '* Job flows are considered to belong to the user and job that'
    print '  started them or last ran on them.'
    print

    # Top jobs
    print 'Top jobs, by total time used:'
    for label, nih_used in sorted(s['label_to_nih_used'].iteritems(),
                                    key=lambda (lb, nih): (-nih, lb)):
        print '  %9.2f %s' % (nih_used, label)
    print

    print 'Top jobs, by time billed but not used:'
    for label, nih_bbnu in sorted(s['label_to_nih_bbnu'].iteritems(),
                                  key=lambda (lb, nih): (-nih, lb)):
        print '  %9.2f %s' % (nih_bbnu, label)
    print

    # Top users
    print 'Top users, by total time used:'
    for owner, nih_used in sorted(s['owner_to_nih_used'].iteritems(),
                                    key=lambda (o, nih): (-nih, o)):
        print '  %9.2f %s' % (nih_used, owner)
    print

    print 'Top users, by time billed but not used:'
    for owner, nih_bbnu in sorted(s['owner_to_nih_bbnu'].iteritems(),
                                  key=lambda (o, nih): (-nih, o)):
        print '  %9.2f %s' % (nih_bbnu, owner)
    print

    # Top job steps
    print 'Top job steps, by total time used (step number first):'
    for (label, step_num), nih_used in sorted(
        s['job_step_to_nih_used'].iteritems(), key=lambda (k, nih): (-nih, k)):
        if label:
            print '  %9.2f %3d %s' % (nih_used, step_num, label)
        else:
            print '  %9.2f     (non-mrjob step)' % (nih_used,)
    print

    print 'Top job steps, by total time billed but not used (un-pooled only):'
    for (label, step_num), nih_bbnu in sorted(
        s['job_step_to_nih_bbnu_no_pool'].iteritems(),
        key=lambda (k, nih): (-nih, k)):

        if label:
            print '  %9.2f %3d %s' % (nih_bbnu, step_num, label)
        else:
            print '  %9.2f     (non-mrjob step)' % (nih_bbnu,)
    print

    # Top pools
    print 'All pools, by total time billed:'
    for pool, nih_billed in sorted(s['pool_to_nih_billed'].iteritems(),
                                   key=lambda (p, nih): (-nih, p)):
        print '  %9.2f %s' % (nih_billed, pool or '(not pooled)')
    print

    print 'All pools, by total time billed but not used:'
    for pool, nih_bbnu in sorted(s['pool_to_nih_bbnu'].iteritems(),
                                 key=lambda (p, nih): (-nih, p)):
        print '  %9.2f %s' % (nih_bbnu, pool or '(not pooled)')
    print

    # Top job flows
    print 'All job flows, by total time billed:'
    top_job_flows = sorted(s['flows'],
                           key=lambda jf: (-jf['nih_billed'], jf['name']))
    for jf in top_job_flows:
        print '  %9.2f %-15s %s' % (
            jf['nih_billed'], jf['id'], jf['name'])
    print

    print 'All job flows, by time billed but not used:'
    top_job_flows_bbnu = sorted(s['flows'],
                                key=lambda jf: (-jf['nih_bbnu'], jf['name']))
    for jf in top_job_flows_bbnu:
        print '  %9.2f %-15s %s' % (
            jf['nih_bbnu'], jf['id'], jf['name'])
    print

    # Details
    print 'Details for all job flows:'
    print
    print (' id              state         created             steps'
           '        time ran     billed    waste   user   name')

    all_job_flows = sorted(s['flows'], key=lambda jf: jf['created'],
                           reverse=True)
    for jf in all_job_flows:
        print ' %-15s %-13s %19s %3d %17s %9.2f %9.2f %8s %s' % (
            jf['id'], jf['state'], jf['created'], jf['num_steps'],
            strip_microseconds(jf['ran']), jf['nih_used'], jf['nih_bbnu'],
            (jf['owner'] or ''), (jf['label'] or ('not started by mrjob')))