def _terminate_and_notify(runner, cluster_id, cluster_name, num_steps,
                          is_pending, time_idle, time_to_end_of_hour,
                          dry_run=False, max_mins_locked=None, quiet=False):

    fmt = ('Terminated cluster %s (%s); was %s for %s, %s to end of hour')
    msg = fmt % (
        cluster_id, cluster_name,
        'pending' if is_pending else 'idle',
        strip_microseconds(time_idle),
        strip_microseconds(time_to_end_of_hour))

    did_terminate = False
    if dry_run:
        did_terminate = True
    else:
        status = _attempt_to_acquire_lock(
            runner.fs.make_s3_conn(),
            runner._lock_uri(cluster_id, num_steps),
            runner._opts['cloud_fs_sync_secs'],
            '%s (%s)' % (msg, runner._make_unique_job_key(label='terminate')),
            mins_to_expiration=max_mins_locked,
        )
        if status:
            runner.make_emr_conn().terminate_jobflow(cluster_id)
            did_terminate = True
        elif not quiet:
            log.info('%s was locked between getting cluster info and'
                     ' trying to terminate it; skipping' % cluster_id)

    if did_terminate and not quiet:
        print(msg)
def terminate_and_notify(runner, to_terminate, dry_run=False,
                         max_mins_locked=None, quiet=False):
    if not to_terminate:
        return

    for jf, pending, time_idle, time_to_end_of_hour in to_terminate:
        fmt = ('Terminated job flow %s (%s); was %s for %s, %s to end of'
               ' hour')
        msg = fmt % (jf.jobflowid, jf.name,
                     'pending' if pending else 'idle',
                     strip_microseconds(time_idle),
                     strip_microseconds(time_to_end_of_hour))

        did_terminate = False
        if not dry_run:
            status = attempt_to_acquire_lock(
                runner.make_s3_conn(),
                runner._lock_uri(jf),
                runner._opts['s3_sync_wait_time'],
                '%s (%s)' % (msg,
                             runner._make_unique_job_key(label='terminate')),
                mins_to_expiration=max_mins_locked,
            )
            if status:
                runner.make_emr_conn().terminate_jobflow(jf.jobflowid)
                did_terminate = True
            elif not quiet:
                log.info('%s was locked between getting job flow info and'
                         ' trying to terminate it; skipping' % jf.jobflowid)

        if did_terminate and not quiet:
            print(msg)
def terminate_and_notify(runner, to_terminate, dry_run=False,
                         max_mins_locked=None, quiet=False):
    if not to_terminate:
        return

    for jf, pending, time_idle, time_to_end_of_hour in to_terminate:
        fmt = ('Terminated job flow %s (%s); was %s for %s, %s to end of'
               ' hour')
        msg = fmt % (
            jf.jobflowid, jf.name,
            'pending' if pending else 'idle',
            strip_microseconds(time_idle),
            strip_microseconds(time_to_end_of_hour))

        did_terminate = False
        if not dry_run:
            status = attempt_to_acquire_lock(
                runner.make_s3_conn(),
                runner._lock_uri(jf),
                runner._opts['s3_sync_wait_time'],
                '%s (%s)' % (msg,
                             runner._make_unique_job_name(label='terminate')),
                mins_to_expiration=max_mins_locked,
            )
            if status:
                runner.make_emr_conn().terminate_jobflow(jf.jobflowid)
                did_terminate = True
            elif not quiet:
                log.info('%s was locked between getting job flow info and'
                         ' trying to terminate it; skipping' % jf.jobflowid)

        if did_terminate and not quiet:
            print msg
def terminate_and_notify(runner, to_terminate, dry_run=False,
                         max_mins_locked=None, quiet=False):
    if not to_terminate:
        return

    for jf, pending, time_idle, time_to_end_of_hour in to_terminate:
        did_terminate = False
        if not dry_run:
            status = attempt_to_acquire_lock(
                runner.make_s3_conn(),
                runner._lock_uri(jf),
                runner._opts['s3_sync_wait_time'],
                runner._make_unique_job_name(label='terminate'),
                mins_to_expiration=max_mins_locked,
            )
            if status:
                runner.make_emr_conn().terminate_jobflow(jf.jobflowid)
                did_terminate = True

        if did_terminate and not quiet:
            fmt = ('Terminated job flow %s (%s); was %s for %s, %s to end of'
                   ' hour')
            print fmt % (
                jf.jobflowid, jf.name,
                'pending' if pending else 'idle',
                strip_microseconds(time_idle),
                strip_microseconds(time_to_end_of_hour))
def _terminate_and_notify(runner, cluster_id, cluster_name, num_steps,
                          is_pending, time_idle, time_to_end_of_hour,
                          dry_run=False, max_mins_locked=None, quiet=False):

    fmt = ('Terminated cluster %s (%s); was %s for %s, %s to end of hour')
    msg = fmt % (
        cluster_id, cluster_name,
        'pending' if is_pending else 'idle',
        strip_microseconds(time_idle),
        strip_microseconds(time_to_end_of_hour))

    did_terminate = False
    if dry_run:
        did_terminate = True
    else:
        status = _attempt_to_acquire_lock(
            runner.fs,
            runner._lock_uri(cluster_id, num_steps),
            runner._opts['cloud_fs_sync_secs'],
            '%s (%s)' % (msg, runner._make_unique_job_key(label='terminate')),
            mins_to_expiration=max_mins_locked,
        )
        if status:
            runner.make_emr_conn().terminate_jobflow(cluster_id)
            did_terminate = True
        elif not quiet:
            log.info('%s was locked between getting cluster info and'
                     ' trying to terminate it; skipping' % cluster_id)

    if did_terminate and not quiet:
        print(msg)
def terminate_and_notify(emr_conn, to_terminate, dry_run=False):
    if not to_terminate:
        return

    for job_flow_id, name, time_idle, time_to_end_of_hour in to_terminate:
        if not dry_run:
            emr_conn.terminate_jobflow(job_flow_id)
        print ('Terminated job flow %s (%s); was idle for %s,'
               ' %s to end of hour' %
               (job_flow_id, name,
                strip_microseconds(time_idle),
                strip_microseconds(time_to_end_of_hour)))
def terminate_and_notify(emr_conn, to_terminate, dry_run=False):
    if not to_terminate:
        return

    for job_flow_id, name, time_idle, time_to_end_of_hour in to_terminate:
        if not dry_run:
            emr_conn.terminate_jobflow(job_flow_id)
        print('Terminated job flow %s (%s); was idle for %s,'
              ' %s to end of hour' %
              (job_flow_id, name,
               strip_microseconds(time_idle),
               strip_microseconds(time_to_end_of_hour)))
def terminate_and_notify(emr_conn, to_terminate, dry_run=False):
    if not to_terminate:
        return

    for id, name, pending, time_idle, time_to_end_of_hour in to_terminate:
        if not dry_run:
            emr_conn.terminate_jobflow(id)
        print ('Terminated job flow %s (%s); was %s for %s, %s to end of hour'
               % (id, name,
                  'pending' if pending else 'idle',
                  strip_microseconds(time_idle),
                  strip_microseconds(time_to_end_of_hour)))
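# A minimal dry-run sketch of the tuple-based variant above. It assumes
# terminate_and_notify (just above) and strip_microseconds are in scope;
# the job flow ID, name, and durations are made up for illustration.
from datetime import timedelta

to_terminate = [
    ('j-FAKEFLOWID', 'mr_word_count.alice.20240101.000000', False,
     timedelta(minutes=50), timedelta(minutes=10)),
]

# dry_run=True skips the terminate_jobflow() call entirely, so no real EMR
# connection is needed; the summary line is still printed.
terminate_and_notify(emr_conn=None, to_terminate=to_terminate, dry_run=True)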
def pprint_job_flow(jf):
    """Print a job flow to stdout in this form::

        job.flow.name
        j-JOB_FLOW_ID: 2 instances (master=m1.small, slaves=m1.small, 20 \
    minutes to the hour)
    """
    instance_count = int(jf.instancecount)

    nosep_segments = [
        '%d instance' % instance_count,
    ]
    if instance_count > 1:
        nosep_segments.append('s')

    comma_segments = [
        'master=%s' % jf.masterinstancetype,
    ]
    if instance_count > 1:
        comma_segments.append('slaves=%s' % jf.slaveinstancetype)

    comma_segments.append('%s to end of hour' %
                          strip_microseconds(est_time_to_hour(jf)))

    nosep_segments += [
        ' (',
        ', '.join(comma_segments),
        ')',
    ]

    print '%s: %s' % (jf.jobflowid, jf.name)
    print ''.join(nosep_segments)
    print jf.state
    print
def pprint_job_flow(jf):
    """Print a job flow to stdout in this form::

        job.flow.name
        j-JOB_FLOW_ID: 2 instances (master=m1.small, slaves=m1.small, 20 \
    minutes to the hour)
    """
    instance_count = int(jf.instancecount)

    nosep_segments = [
        '%d instance' % instance_count,
    ]
    if instance_count > 1:
        nosep_segments.append('s')

    comma_segments = [
        'master=%s' % jf.masterinstancetype,
    ]
    if instance_count > 1:
        comma_segments.append('slaves=%s' % jf.slaveinstancetype)

    comma_segments.append('%s to end of hour' %
                          strip_microseconds(est_time_to_hour(jf)))

    nosep_segments += [
        ' (',
        ', '.join(comma_segments),
        ')',
    ]

    print('%s: %s' % (jf.jobflowid, jf.name))
    print(''.join(nosep_segments))
    print(jf.state)
    print()
def _terminate_and_notify(runner, cluster_id, cluster_name, num_steps,
                          is_pending, time_idle,
                          dry_run=False, max_mins_locked=None, quiet=False):

    fmt = ('Terminated cluster %s (%s); was %s for %s')
    msg = fmt % (cluster_id, cluster_name,
                 'pending' if is_pending else 'idle',
                 strip_microseconds(time_idle))

    did_terminate = False
    if dry_run:
        did_terminate = True
    else:
        acquired_lock = runner._attempt_to_lock_cluster(cluster_id)

        if acquired_lock:
            runner.make_emr_client().terminate_job_flows(
                JobFlowIds=[cluster_id])
            did_terminate = True

            runner._release_any_cluster_lock_held()
        elif not quiet:
            log.info('%s was locked between getting cluster info and'
                     ' trying to terminate it; skipping' % cluster_id)

    if did_terminate and not quiet:
        print(msg)
def _format_timedelta(time):
    """Format a timedelta for use in a columnar format. This just
    tweaks stuff like ``'3 days, 9:00:00'`` to line up with
    ``'3 days, 10:00:00'``
    """
    result = str(strip_microseconds(time))

    parts = result.split()
    if len(parts) == 3 and len(parts[-1]) == 7:
        # pad a 7-character time with an extra space so it lines up
        # with an 8-character one like '10:00:00'
        return '%s %s  %s' % tuple(parts)
    else:
        return result
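# A small usage sketch of the alignment tweak, assuming _format_timedelta
# above (with the two-space padding in its format string) and
# strip_microseconds are in scope; the durations are arbitrary.
from datetime import timedelta

print(repr(_format_timedelta(timedelta(days=3, hours=9))))   # '3 days,  9:00:00'
print(repr(_format_timedelta(timedelta(days=3, hours=10))))  # '3 days, 10:00:00'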
def _terminate_and_notify(runner, cluster_id, cluster_name, num_steps,
                          is_pending, time_idle,
                          dry_run=False, quiet=False):
    emr_client = runner.make_emr_client()

    if not dry_run:
        emr_client.terminate_job_flows(JobFlowIds=[cluster_id])

    if not quiet:
        fmt = ('Terminated cluster %s (%s); was %s for %s')
        msg = fmt % (
            cluster_id, cluster_name,
            'pending' if is_pending else 'idle',
            strip_microseconds(time_idle))
        print(msg)
def inspect_and_maybe_terminate_job_flows(
        conf_path=None, dry_run=False, max_hours_idle=None,
        mins_to_end_of_hour=None, now=None, pool_name=None,
        pooled_only=False, unpooled_only=False, max_mins_locked=None,
        quiet=False, **kwargs):

    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(conf_path=conf_path, **kwargs)
    emr_conn = runner.make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_non_streaming = 0
    num_pending = 0
    num_running = 0

    # a list of tuples of job flow id, name, idle time (as a timedelta)
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # check if job flow is bootstrapping
        elif is_job_flow_bootstrapping(jf):
            num_bootstrapping += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif not is_job_flow_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            time_idle = now - time_last_active(jf)
            time_to_end_of_hour = est_time_to_hour(jf, now=now)
            _, pool = pool_hash_and_name(jf)
            pending = job_flow_has_pending_steps(jf)

            if pending:
                num_pending += 1
            else:
                num_idle += 1

            log.debug(
                'Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                (jf.jobflowid,
                 'pending' if pending else 'idle',
                 strip_microseconds(time_idle),
                 strip_microseconds(time_to_end_of_hour),
                 ('unpooled' if pool is None else 'in %s pool' % pool),
                 jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None and
                    time_idle <= timedelta(hours=max_hours_idle)):
                continue

            # mins_to_end_of_hour doesn't apply to jobs with pending steps
            if (mins_to_end_of_hour is not None and
                    (pending or
                     time_to_end_of_hour >= timedelta(
                         minutes=mins_to_end_of_hour))):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append((jf, pending, time_idle, time_to_end_of_hour))

    # note: counts are passed in the same order as the labels in the message
    log.info(
        'Job flow statuses: %d bootstrapping, %d running, %d pending, %d idle,'
        ' %d active non-streaming, %d done' % (
            num_bootstrapping, num_running, num_pending, num_idle,
            num_non_streaming, num_done))

    terminate_and_notify(runner, to_terminate, dry_run=dry_run,
                         max_mins_locked=max_mins_locked, quiet=quiet)
def print_report(stats, now=None):
    """Print final report.

    :param stats: a dictionary returned by :py:func:`clusters_to_stats`
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    """
    if now is None:
        now = datetime.utcnow()

    s = stats

    if not s['clusters']:
        print('No job flows created in the past two months!')
        return

    print('Total # of Job Flows: %d' % len(s['clusters']))
    print()

    print('* All times are in UTC.')
    print()

    print('Min create time: %s' % min(cs['created'] for cs in s['clusters']))
    print('Max create time: %s' % max(cs['created'] for cs in s['clusters']))
    print(' Current time: %s' % now.replace(microsecond=0))
    print()

    print('* All usage is measured in Normalized Instance Hours, which are')
    print(' roughly equivalent to running an m1.medium instance for an hour.')
    print(" Billing is estimated, and may not match Amazon's system exactly.")
    print()

    # total compute-unit hours used
    def with_pct(usage):
        return (usage, percent(usage, s['nih_billed']))

    print('Total billed: %9.2f %5.1f%%' % with_pct(s['nih_billed']))
    print(' Total used: %9.2f %5.1f%%' % with_pct(s['nih_used']))
    print(' bootstrap: %9.2f %5.1f%%' % with_pct(s['bootstrap_nih_used']))
    print(' jobs: %9.2f %5.1f%%' % with_pct(s['job_nih_used']))
    print(' Total waste: %9.2f %5.1f%%' % with_pct(s['nih_bbnu']))
    print(' at end: %9.2f %5.1f%%' % with_pct(s['end_nih_bbnu']))
    print(' other: %9.2f %5.1f%%' % with_pct(s['other_nih_bbnu']))
    print()

    if s['date_to_nih_billed']:
        print('Daily statistics:')
        print()
        print(' date billed used waste % waste')
        d = max(s['date_to_nih_billed'])
        while d >= min(s['date_to_nih_billed']):
            print(' %10s %9.2f %9.2f %9.2f %5.1f' % (
                d,
                s['date_to_nih_billed'].get(d, 0.0),
                s['date_to_nih_used'].get(d, 0.0),
                s['date_to_nih_bbnu'].get(d, 0.0),
                percent(s['date_to_nih_bbnu'].get(d, 0.0),
                        s['date_to_nih_billed'].get(d, 0.0))))
            d -= timedelta(days=1)
        print()

    if s['hour_to_nih_billed']:
        print('Hourly statistics:')
        print()
        print(' hour billed used waste % waste')
        h = max(s['hour_to_nih_billed'])
        while h >= min(s['hour_to_nih_billed']):
            print(' %13s %9.2f %9.2f %9.2f %5.1f' % (
                h.strftime('%Y-%m-%d %H'),
                s['hour_to_nih_billed'].get(h, 0.0),
                s['hour_to_nih_used'].get(h, 0.0),
                s['hour_to_nih_bbnu'].get(h, 0.0),
                percent(s['hour_to_nih_bbnu'].get(h, 0.0),
                        s['hour_to_nih_billed'].get(h, 0.0))))
            h -= timedelta(hours=1)
        print()

    print('* Job flows are considered to belong to the user and job that')
    print(' started them or last ran on them.')
    print()

    # Top jobs
    print('Top jobs, by total time used:')
    for label, nih_used in sorted(s['label_to_nih_used'].items(),
                                  key=lambda lb_nih: (-lb_nih[1], lb_nih[0])):
        print(' %9.2f %s' % (nih_used, label))
    print()

    print('Top jobs, by time billed but not used:')
    for label, nih_bbnu in sorted(s['label_to_nih_bbnu'].items(),
                                  key=lambda lb_nih1: (-lb_nih1[1],
                                                       lb_nih1[0])):
        print(' %9.2f %s' % (nih_bbnu, label))
    print()

    # Top users
    print('Top users, by total time used:')
    for owner, nih_used in sorted(s['owner_to_nih_used'].items(),
                                  key=lambda o_nih: (-o_nih[1], o_nih[0])):
        print(' %9.2f %s' % (nih_used, owner))
    print()

    print('Top users, by time billed but not used:')
    for owner, nih_bbnu in sorted(s['owner_to_nih_bbnu'].items(),
                                  key=lambda o_nih2: (-o_nih2[1], o_nih2[0])):
        print(' %9.2f %s' % (nih_bbnu, owner))
    print()

    # Top job steps
    print('Top job steps, by total time used (step number first):')
    for (label, step_num), nih_used in sorted(
            s['job_step_to_nih_used'].items(),
            key=lambda k_nih: (-k_nih[1], k_nih[0])):
        if label:
            print(' %9.2f %3d %s' % (nih_used, step_num, label))
        else:
            print(' %9.2f (non-mrjob step)' % (nih_used,))
    print()

    print('Top job steps, by total time billed but not used'
          ' (un-pooled only):')
    for (label, step_num), nih_bbnu in sorted(
            s['job_step_to_nih_bbnu_no_pool'].items(),
            key=lambda k_nih3: (-k_nih3[1], k_nih3[0])):
        if label:
            print(' %9.2f %3d %s' % (nih_bbnu, step_num, label))
        else:
            print(' %9.2f (non-mrjob step)' % (nih_bbnu,))
    print()

    # Top pools
    print('All pools, by total time billed:')
    for pool, nih_billed in sorted(s['pool_to_nih_billed'].items(),
                                   key=lambda p_nih: (-p_nih[1], p_nih[0])):
        print(' %9.2f %s' % (nih_billed, pool or '(not pooled)'))
    print()

    print('All pools, by total time billed but not used:')
    for pool, nih_bbnu in sorted(s['pool_to_nih_bbnu'].items(),
                                 key=lambda p_nih4: (-p_nih4[1], p_nih4[0])):
        print(' %9.2f %s' % (nih_bbnu, pool or '(not pooled)'))
    print()

    # Top job flows
    print('All job flows, by total time billed:')
    top_clusters = sorted(s['clusters'],
                          key=lambda cs: (-cs['nih_billed'], cs['name']))
    for cs in top_clusters:
        print(' %9.2f %-15s %s' % (
            cs['nih_billed'], cs['id'], cs['name']))
    print()

    print('All job flows, by time billed but not used:')
    top_clusters_bbnu = sorted(
        s['clusters'], key=lambda cs: (-cs['nih_bbnu'], cs['name']))
    for cs in top_clusters_bbnu:
        print(' %9.2f %-15s %s' % (
            cs['nih_bbnu'], cs['id'], cs['name']))
    print()

    # Details
    print('Details for all job flows:')
    print()
    print(' id state created steps'
          ' time ran billed waste user name')

    all_clusters = sorted(s['clusters'],
                          key=lambda cs: cs['created'], reverse=True)
    for cs in all_clusters:
        print(' %-15s %-22s %19s %3d %17s %9.2f %9.2f %8s %s' % (
            cs['id'], cs['state'], cs['created'], cs['num_steps'],
            strip_microseconds(cs['ran']), cs['nih_used'], cs['nih_bbnu'],
            (cs['owner'] or ''), (cs['label'] or ('not started by mrjob'))))
def print_report(stats, now=None): """Print final report. :param stats: a dictionary returned by :py:func:`job_flows_to_stats` :param now: the current UTC time, as a :py:class:`datetime.datetime`. Defaults to the current time. """ if now is None: now = datetime.utcnow() s = stats if not s['flows']: print 'No job flows created in the past two months!' return print 'Total # of Job Flows: %d' % len(s['flows']) print print '* All times are in UTC.' print print 'Min create time: %s' % min(jf['created'] for jf in s['flows']) print 'Max create time: %s' % max(jf['created'] for jf in s['flows']) print ' Current time: %s' % now.replace(microsecond=0) print print '* All usage is measured in Normalized Instance Hours, which are' print ' roughly equivalent to running an m1.small instance for an hour.' print " Billing is estimated, and may not match Amazon's system exactly." print # total compute-unit hours used def with_pct(usage): return (usage, percent(usage, s['nih_billed'])) print 'Total billed: %9.2f %5.1f%%' % with_pct(s['nih_billed']) print ' Total used: %9.2f %5.1f%%' % with_pct(s['nih_used']) print ' bootstrap: %9.2f %5.1f%%' % with_pct(s['bootstrap_nih_used']) print ' jobs: %9.2f %5.1f%%' % with_pct(s['job_nih_used']) print ' Total waste: %9.2f %5.1f%%' % with_pct(s['nih_bbnu']) print ' at end: %9.2f %5.1f%%' % with_pct(s['end_nih_bbnu']) print ' other: %9.2f %5.1f%%' % with_pct(s['other_nih_bbnu']) print if s['date_to_nih_billed']: print 'Daily statistics:' print print ' date billed used waste % waste' d = max(s['date_to_nih_billed']) while d >= min(s['date_to_nih_billed']): print ' %10s %9.2f %9.2f %9.2f %5.1f' % ( d, s['date_to_nih_billed'][d], s['date_to_nih_used'].get( d, 0.0), s['date_to_nih_bbnu'].get(d, 0.0), percent(s['date_to_nih_bbnu'].get(d, 0.0), s['date_to_nih_billed'][d])) d -= timedelta(days=1) print print '* Job flows are considered to belong to the user and job that' print ' started them or last ran on them.' 
print # Top jobs print 'Top jobs, by total time used:' for label, nih_used in sorted(s['label_to_nih_used'].iteritems(), key=lambda (lb, nih): (-nih, lb)): print ' %9.2f %s' % (nih_used, label) print print 'Top jobs, by time billed but not used:' for label, nih_bbnu in sorted(s['label_to_nih_bbnu'].iteritems(), key=lambda (lb, nih): (-nih, lb)): print ' %9.2f %s' % (nih_bbnu, label) print # Top users print 'Top users, by total time used:' for owner, nih_used in sorted(s['owner_to_nih_used'].iteritems(), key=lambda (o, nih): (-nih, o)): print ' %9.2f %s' % (nih_used, owner) print print 'Top users, by time billed but not used:' for owner, nih_bbnu in sorted(s['owner_to_nih_bbnu'].iteritems(), key=lambda (o, nih): (-nih, o)): print ' %9.2f %s' % (nih_bbnu, owner) print # Top job steps print 'Top job steps, by total time used (step number first):' for (label, step_num), nih_used in sorted(s['job_step_to_nih_used'].iteritems(), key=lambda (k, nih): (-nih, k)): if label: print ' %9.2f %3d %s' % (nih_used, step_num, label) else: print ' %9.2f (non-mrjob step)' % (nih_used, ) print print 'Top job steps, by total time billed but not used (un-pooled only):' for (label, step_num), nih_bbnu in sorted( s['job_step_to_nih_bbnu_no_pool'].iteritems(), key=lambda (k, nih): (-nih, k)): if label: print ' %9.2f %3d %s' % (nih_bbnu, step_num, label) else: print ' %9.2f (non-mrjob step)' % (nih_bbnu, ) print # Top pools print 'All pools, by total time billed:' for pool, nih_billed in sorted(s['pool_to_nih_billed'].iteritems(), key=lambda (p, nih): (-nih, p)): print ' %9.2f %s' % (nih_billed, pool or '(not pooled)') print print 'All pools, by total time billed but not used:' for pool, nih_bbnu in sorted(s['pool_to_nih_bbnu'].iteritems(), key=lambda (p, nih): (-nih, p)): print ' %9.2f %s' % (nih_bbnu, pool or '(not pooled)') print # Top job flows print 'All job flows, by total time billed:' top_job_flows = sorted(s['flows'], key=lambda jf: (-jf['nih_billed'], jf['name'])) for jf in top_job_flows: print ' %9.2f %-15s %s' % (jf['nih_billed'], jf['id'], jf['name']) print print 'All job flows, by time billed but not used:' top_job_flows_bbnu = sorted(s['flows'], key=lambda jf: (-jf['nih_bbnu'], jf['name'])) for jf in top_job_flows_bbnu: print ' %9.2f %-15s %s' % (jf['nih_bbnu'], jf['id'], jf['name']) print # Details print 'Details for all job flows:' print print( ' id state created steps' ' time ran billed waste user name') all_job_flows = sorted(s['flows'], key=lambda jf: jf['created'], reverse=True) for jf in all_job_flows: print ' %-15s %-13s %19s %3d %17s %9.2f %9.2f %8s %s' % ( jf['id'], jf['state'], jf['created'], jf['num_steps'], strip_microseconds(jf['ran']), jf['nih_used'], jf['nih_bbnu'], (jf['owner'] or ''), (jf['label'] or ('not started by mrjob')))
def _maybe_terminate_clusters(dry_run=False, max_hours_idle=None,
                              mins_to_end_of_hour=None, now=None,
                              pool_name=None, pooled_only=False,
                              unpooled_only=False, max_mins_locked=None,
                              quiet=False, **kwargs):
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = _DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    num_starting = 0
    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_pending = 0
    num_running = 0

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for cluster_summary in _boto3_paginate(
            'Clusters', emr_client, 'list_clusters'):

        cluster_id = cluster_summary['Id']

        # check if cluster is done
        if _is_cluster_done(cluster_summary):
            num_done += 1
            continue

        # check if cluster is starting
        if _is_cluster_starting(cluster_summary):
            num_starting += 1
            continue

        # check if cluster is bootstrapping
        if _is_cluster_bootstrapping(cluster_summary):
            num_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        steps = list(reversed(list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cluster_id))))

        if any(_is_step_running(step) for step in steps):
            num_running += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(cluster_summary, steps)
        time_to_end_of_hour = _est_time_to_hour(cluster_summary, now=now)
        is_pending = _cluster_has_pending_steps(steps)

        # need to get actual cluster to see tags
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']

        _, pool = _pool_hash_and_name(cluster)

        if is_pending:
            num_pending += 1
        else:
            num_idle += 1

        log.debug('cluster %s %s for %s, %s to end of hour, %s (%s)' %
                  (cluster_id,
                   'pending' if is_pending else 'idle',
                   strip_microseconds(time_idle),
                   strip_microseconds(time_to_end_of_hour),
                   ('unpooled' if pool is None else 'in %s pool' % pool),
                   cluster_summary['Name']))

        # filter out clusters that don't meet our criteria
        if (max_hours_idle is not None and
                time_idle <= timedelta(hours=max_hours_idle)):
            continue

        # mins_to_end_of_hour doesn't apply to jobs with pending steps
        if (mins_to_end_of_hour is not None and
                (is_pending or
                 time_to_end_of_hour >= timedelta(
                     minutes=mins_to_end_of_hour))):
            continue

        if (pooled_only and pool is None):
            continue

        if (unpooled_only and pool is not None):
            continue

        if (pool_name is not None and pool != pool_name):
            continue

        # terminate idle cluster
        _terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=cluster_summary['Name'],
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            time_to_end_of_hour=time_to_end_of_hour,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info('Cluster statuses: %d starting, %d bootstrapping, %d running,'
             ' %d pending, %d idle, %d done' %
             (num_starting, num_bootstrapping, num_running,
              num_pending, num_idle, num_done))
def inspect_and_maybe_terminate_job_flows(
        conf_path=None, dry_run=False, max_hours_idle=None,
        mins_to_end_of_hour=None, now=None, pool_name=None,
        pooled_only=False, unpooled_only=False):

    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    emr_conn = EMRJobRunner(conf_path=conf_path).make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_running = 0
    num_idle = 0
    num_done = 0
    num_non_streaming = 0

    # a list of tuples of job flow id, name, idle time (as a timedelta)
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif is_job_flow_non_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            num_idle += 1
            time_idle = time_job_flow_idle(jf, now=now)
            time_to_end_of_hour = time_to_end_of_hour_for_job_flow(
                jf, now=now)
            pool = job_flow_pool_name(jf)

            log.debug(
                'Job flow %-15s idle for %s, %s to end of hour, %s (%s)' %
                (jf.jobflowid,
                 strip_microseconds(time_idle),
                 strip_microseconds(time_to_end_of_hour),
                 ('unpooled' if pool is None else 'in %s pool' % pool),
                 jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None and
                    time_idle <= timedelta(hours=max_hours_idle)):
                continue

            if (mins_to_end_of_hour is not None and
                    time_to_end_of_hour >= timedelta(
                        minutes=mins_to_end_of_hour)):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append(
                (jf.jobflowid, jf.name, time_idle, time_to_end_of_hour))

    log.info('Job flow statuses: %d running, %d idle, %d active'
             ' non-streaming, %d done' %
             (num_running, num_idle, num_non_streaming, num_done))

    terminate_and_notify(emr_conn, to_terminate, dry_run=dry_run)
def _maybe_terminate_clusters(dry_run=False, max_hours_idle=None, mins_to_end_of_hour=None, now=None, pool_name=None, pooled_only=False, unpooled_only=False, max_mins_locked=None, quiet=False, **kwargs): if now is None: now = datetime.utcnow() # old default behavior if max_hours_idle is None and mins_to_end_of_hour is None: max_hours_idle = _DEFAULT_MAX_HOURS_IDLE runner = EMRJobRunner(**kwargs) emr_conn = runner.make_emr_conn() num_starting = 0 num_bootstrapping = 0 num_done = 0 num_idle = 0 num_pending = 0 num_running = 0 # We don't filter by cluster state because we want this to work even # if Amazon adds another kind of idle state. for cluster_summary in _yield_all_clusters(emr_conn): cluster_id = cluster_summary.id # check if cluster is done if _is_cluster_done(cluster_summary): num_done += 1 continue # check if cluster is starting if _is_cluster_starting(cluster_summary): num_starting += 1 continue # check if cluster is bootstrapping if _is_cluster_bootstrapping(cluster_summary): num_bootstrapping += 1 continue # need steps to learn more about cluster steps = _list_all_steps(emr_conn, cluster_id) if any(_is_step_running(step) for step in steps): num_running += 1 continue # cluster is idle time_idle = now - _time_last_active(cluster_summary, steps) time_to_end_of_hour = _est_time_to_hour(cluster_summary, now=now) is_pending = _cluster_has_pending_steps(steps) bootstrap_actions = list(_yield_all_bootstrap_actions( emr_conn, cluster_id)) _, pool = _pool_hash_and_name(bootstrap_actions) if is_pending: num_pending += 1 else: num_idle += 1 log.debug( 'cluster %s %s for %s, %s to end of hour, %s (%s)' % (cluster_id, 'pending' if is_pending else 'idle', strip_microseconds(time_idle), strip_microseconds(time_to_end_of_hour), ('unpooled' if pool is None else 'in %s pool' % pool), cluster_summary.name)) # filter out clusters that don't meet our criteria if (max_hours_idle is not None and time_idle <= timedelta(hours=max_hours_idle)): continue # mins_to_end_of_hour doesn't apply to jobs with pending steps if (mins_to_end_of_hour is not None and (is_pending or time_to_end_of_hour >= timedelta( minutes=mins_to_end_of_hour))): continue if (pooled_only and pool is None): continue if (unpooled_only and pool is not None): continue if (pool_name is not None and pool != pool_name): continue # terminate idle cluster _terminate_and_notify( runner=runner, cluster_id=cluster_id, cluster_name=cluster_summary.name, num_steps=len(steps), is_pending=is_pending, time_idle=time_idle, time_to_end_of_hour=time_to_end_of_hour, dry_run=dry_run, max_mins_locked=max_mins_locked, quiet=quiet) log.info( 'Cluster statuses: %d starting, %d bootstrapping, %d running,' ' %d pending, %d idle, %d done' % ( num_starting, num_bootstrapping, num_running, num_pending, num_idle, num_done))
def _print_report(stats, now=None): """Print final report. :param stats: a dictionary returned by :py:func:`_clusters_to_stats` :param now: the current UTC time, as a :py:class:`datetime.datetime`. Defaults to the current time. """ if now is None: now = datetime.utcnow() s = stats if not s['clusters']: print('No clusters created in the past two months!') return print('Total # of Clusters: %d' % len(s['clusters'])) print() print('* All times are in UTC.') print() print('Min create time: %s' % min(cs['created'] for cs in s['clusters'])) print('Max create time: %s' % max(cs['created'] for cs in s['clusters'])) print(' Current time: %s' % now.replace(microsecond=0)) print() print('* All usage is measured in Normalized Instance Hours, which are') print(' roughly equivalent to running an m1.medium instance for an hour.') print(" Billing is estimated, and may not match Amazon's system exactly.") print() # total compute-unit hours used def with_pct(usage): return (usage, _percent(usage, s['nih_billed'])) print('Total billed: %9.2f %5.1f%%' % with_pct(s['nih_billed'])) print(' Total used: %9.2f %5.1f%%' % with_pct(s['nih_used'])) print(' bootstrap: %9.2f %5.1f%%' % with_pct(s['bootstrap_nih_used'])) print(' jobs: %9.2f %5.1f%%' % with_pct(s['job_nih_used'])) print(' Total waste: %9.2f %5.1f%%' % with_pct(s['nih_bbnu'])) print(' at end: %9.2f %5.1f%%' % with_pct(s['end_nih_bbnu'])) print(' other: %9.2f %5.1f%%' % with_pct(s['other_nih_bbnu'])) print() if s['date_to_nih_billed']: print('Daily statistics:') print() print(' date billed used waste % waste') d = max(s['date_to_nih_billed']) while d >= min(s['date_to_nih_billed']): print(' %10s %9.2f %9.2f %9.2f %5.1f' % (d, s['date_to_nih_billed'].get( d, 0.0), s['date_to_nih_used'].get( d, 0.0), s['date_to_nih_bbnu'].get(d, 0.0), _percent(s['date_to_nih_bbnu'].get(d, 0.0), s['date_to_nih_billed'].get(d, 0.0)))) d -= timedelta(days=1) print() if s['hour_to_nih_billed']: print('Hourly statistics:') print() print(' hour billed used waste % waste') h = max(s['hour_to_nih_billed']) while h >= min(s['hour_to_nih_billed']): print(' %13s %9.2f %9.2f %9.2f %5.1f' % (h.strftime('%Y-%m-%d %H'), s['hour_to_nih_billed'].get( h, 0.0), s['hour_to_nih_used'].get( h, 0.0), s['hour_to_nih_bbnu'].get(h, 0.0), _percent(s['hour_to_nih_bbnu'].get(h, 0.0), s['hour_to_nih_billed'].get(h, 0.0)))) h -= timedelta(hours=1) print() print('* clusters are considered to belong to the user and job that') print(' started them or last ran on them.') print() # Top jobs print('Top jobs, by total time used:') for label, nih_used in sorted(s['label_to_nih_used'].items(), key=lambda lb_nih: (-lb_nih[1], lb_nih[0])): print(' %9.2f %s' % (nih_used, label)) print() print('Top jobs, by time billed but not used:') for label, nih_bbnu in sorted(s['label_to_nih_bbnu'].items(), key=lambda lb_nih1: (-lb_nih1[1], lb_nih1[0])): print(' %9.2f %s' % (nih_bbnu, label)) print() # Top users print('Top users, by total time used:') for owner, nih_used in sorted(s['owner_to_nih_used'].items(), key=lambda o_nih: (-o_nih[1], o_nih[0])): print(' %9.2f %s' % (nih_used, owner)) print() print('Top users, by time billed but not used:') for owner, nih_bbnu in sorted(s['owner_to_nih_bbnu'].items(), key=lambda o_nih2: (-o_nih2[1], o_nih2[0])): print(' %9.2f %s' % (nih_bbnu, owner)) print() # Top job steps print('Top job steps, by total time used (step number first):') for (label, step_num), nih_used in sorted(s['job_step_to_nih_used'].items(), key=lambda k_nih: (-k_nih[1], k_nih[0])): if label: print(' %9.2f %3d %s' % 
(nih_used, step_num, label)) else: print(' %9.2f (non-mrjob step)' % (nih_used, )) print() print('Top job steps, by total time billed but not used (un-pooled only):') for (label, step_num), nih_bbnu in sorted( s['job_step_to_nih_bbnu_no_pool'].items(), key=lambda k_nih3: (-k_nih3[1], k_nih3[0])): if label: print(' %9.2f %3d %s' % (nih_bbnu, step_num, label)) else: print(' %9.2f (non-mrjob step)' % (nih_bbnu, )) print() # Top pools print('All pools, by total time billed:') for pool, nih_billed in sorted(s['pool_to_nih_billed'].items(), key=lambda p_nih: (-p_nih[1], p_nih[0])): print(' %9.2f %s' % (nih_billed, pool or '(not pooled)')) print() print('All pools, by total time billed but not used:') for pool, nih_bbnu in sorted(s['pool_to_nih_bbnu'].items(), key=lambda p_nih4: (-p_nih4[1], p_nih4[0])): print(' %9.2f %s' % (nih_bbnu, pool or '(not pooled)')) print() # Top clusters print('All clusters, by total time billed:') top_clusters = sorted(s['clusters'], key=lambda cs: (-cs['nih_billed'], cs['name'])) for cs in top_clusters: print(' %9.2f %-15s %s' % (cs['nih_billed'], cs['id'], cs['name'])) print() print('All clusters, by time billed but not used:') top_clusters_bbnu = sorted(s['clusters'], key=lambda cs: (-cs['nih_bbnu'], cs['name'])) for cs in top_clusters_bbnu: print(' %9.2f %-15s %s' % (cs['nih_bbnu'], cs['id'], cs['name'])) print() # Details print('Details for all clusters:') print() print(' id state created steps' ' time ran billed waste user name') all_clusters = sorted(s['clusters'], key=lambda cs: cs['created'], reverse=True) for cs in all_clusters: print(' %-15s %-22s %19s %3d %17s %9.2f %9.2f %8s %s' % (cs['id'], cs['state'], cs['created'], cs['num_steps'], strip_microseconds(cs['ran']), cs['nih_used'], cs['nih_bbnu'], (cs['owner'] or ''), (cs['label'] or ('not started by mrjob'))))
def inspect_and_maybe_terminate_job_flows(conf_paths=None, dry_run=False, max_hours_idle=None, mins_to_end_of_hour=None, now=None, pool_name=None, pooled_only=False, unpooled_only=False, max_mins_locked=None, quiet=False, **kwargs): if now is None: now = datetime.utcnow() # old default behavior if max_hours_idle is None and mins_to_end_of_hour is None: max_hours_idle = DEFAULT_MAX_HOURS_IDLE runner = EMRJobRunner(conf_paths=conf_paths, **kwargs) emr_conn = runner.make_emr_conn() log.info( 'getting info about all job flows (this goes back about 2 months)') # We don't filter by job flow state because we want this to work even # if Amazon adds another kind of idle state. job_flows = describe_all_job_flows(emr_conn) num_bootstrapping = 0 num_done = 0 num_idle = 0 num_non_streaming = 0 num_pending = 0 num_running = 0 # a list of tuples of job flow id, name, idle time (as a timedelta) to_terminate = [] for jf in job_flows: # check if job flow is done if is_job_flow_done(jf): num_done += 1 # check if job flow is bootstrapping elif is_job_flow_bootstrapping(jf): num_bootstrapping += 1 # we can't really tell if non-streaming jobs are idle or not, so # let them be (see Issue #60) elif not is_job_flow_streaming(jf): num_non_streaming += 1 elif is_job_flow_running(jf): num_running += 1 else: time_idle = now - time_last_active(jf) time_to_end_of_hour = est_time_to_hour(jf, now=now) _, pool = pool_hash_and_name(jf) pending = job_flow_has_pending_steps(jf) if pending: num_pending += 1 else: num_idle += 1 log.debug('Job flow %s %s for %s, %s to end of hour, %s (%s)' % (jf.jobflowid, 'pending' if pending else 'idle', strip_microseconds(time_idle), strip_microseconds(time_to_end_of_hour), ('unpooled' if pool is None else 'in %s pool' % pool), jf.name)) # filter out job flows that don't meet our criteria if (max_hours_idle is not None and time_idle <= timedelta(hours=max_hours_idle)): continue # mins_to_end_of_hour doesn't apply to jobs with pending steps if (mins_to_end_of_hour is not None and (pending or time_to_end_of_hour >= timedelta(minutes=mins_to_end_of_hour))): continue if (pooled_only and pool is None): continue if (unpooled_only and pool is not None): continue if (pool_name is not None and pool != pool_name): continue to_terminate.append((jf, pending, time_idle, time_to_end_of_hour)) log.info( 'Job flow statuses: %d bootstrapping, %d running, %d pending, %d idle,' ' %d active non-streaming, %d done' % (num_running, num_bootstrapping, num_pending, num_idle, num_non_streaming, num_done)) terminate_and_notify(runner, to_terminate, dry_run=dry_run, max_mins_locked=max_mins_locked, quiet=quiet)
def print_report(stats, now=None): """Print final report. :param stats: a dictionary returned by :py:func:`job_flows_to_stats` :param now: the current UTC time, as a :py:class:`datetime.datetime`. Defaults to the current time. """ if now is None: now = datetime.utcnow() s = stats if not s["flows"]: print "No job flows created in the past two months!" return print "Total # of Job Flows: %d" % len(s["flows"]) print print "* All times are in UTC." print print "Min create time: %s" % min(jf["created"] for jf in s["flows"]) print "Max create time: %s" % max(jf["created"] for jf in s["flows"]) print " Current time: %s" % now.replace(microsecond=0) print print "* All usage is measured in Normalized Instance Hours, which are" print " roughly equivalent to running an m1.small instance for an hour." print " Billing is estimated, and may not match Amazon's system exactly." print # total compute-unit hours used def with_pct(usage): return (usage, percent(usage, s["nih_billed"])) print "Total billed: %9.2f %5.1f%%" % with_pct(s["nih_billed"]) print " Total used: %9.2f %5.1f%%" % with_pct(s["nih_used"]) print " bootstrap: %9.2f %5.1f%%" % with_pct(s["bootstrap_nih_used"]) print " jobs: %9.2f %5.1f%%" % with_pct(s["job_nih_used"]) print " Total waste: %9.2f %5.1f%%" % with_pct(s["nih_bbnu"]) print " at end: %9.2f %5.1f%%" % with_pct(s["end_nih_bbnu"]) print " other: %9.2f %5.1f%%" % with_pct(s["other_nih_bbnu"]) print if s["date_to_nih_billed"]: print "Daily statistics:" print print " date billed used waste % waste" d = max(s["date_to_nih_billed"]) while d >= min(s["date_to_nih_billed"]): print " %10s %9.2f %9.2f %9.2f %5.1f" % ( d, s["date_to_nih_billed"].get(d, 0.0), s["date_to_nih_used"].get(d, 0.0), s["date_to_nih_bbnu"].get(d, 0.0), percent(s["date_to_nih_bbnu"].get(d, 0.0), s["date_to_nih_billed"].get(d, 0.0)), ) d -= timedelta(days=1) print if s["hour_to_nih_billed"]: print "Hourly statistics:" print print " hour billed used waste % waste" h = max(s["hour_to_nih_billed"]) while h >= min(s["hour_to_nih_billed"]): print " %13s %9.2f %9.2f %9.2f %5.1f" % ( h.strftime("%Y-%m-%d %H"), s["hour_to_nih_billed"].get(h, 0.0), s["hour_to_nih_used"].get(h, 0.0), s["hour_to_nih_bbnu"].get(h, 0.0), percent(s["hour_to_nih_bbnu"].get(h, 0.0), s["hour_to_nih_billed"].get(h, 0.0)), ) h -= timedelta(hours=1) print print "* Job flows are considered to belong to the user and job that" print " started them or last ran on them." 
print # Top jobs print "Top jobs, by total time used:" for label, nih_used in sorted(s["label_to_nih_used"].iteritems(), key=lambda (lb, nih): (-nih, lb)): print " %9.2f %s" % (nih_used, label) print print "Top jobs, by time billed but not used:" for label, nih_bbnu in sorted(s["label_to_nih_bbnu"].iteritems(), key=lambda (lb, nih): (-nih, lb)): print " %9.2f %s" % (nih_bbnu, label) print # Top users print "Top users, by total time used:" for owner, nih_used in sorted(s["owner_to_nih_used"].iteritems(), key=lambda (o, nih): (-nih, o)): print " %9.2f %s" % (nih_used, owner) print print "Top users, by time billed but not used:" for owner, nih_bbnu in sorted(s["owner_to_nih_bbnu"].iteritems(), key=lambda (o, nih): (-nih, o)): print " %9.2f %s" % (nih_bbnu, owner) print # Top job steps print "Top job steps, by total time used (step number first):" for (label, step_num), nih_used in sorted(s["job_step_to_nih_used"].iteritems(), key=lambda (k, nih): (-nih, k)): if label: print " %9.2f %3d %s" % (nih_used, step_num, label) else: print " %9.2f (non-mrjob step)" % (nih_used,) print print "Top job steps, by total time billed but not used (un-pooled only):" for (label, step_num), nih_bbnu in sorted( s["job_step_to_nih_bbnu_no_pool"].iteritems(), key=lambda (k, nih): (-nih, k) ): if label: print " %9.2f %3d %s" % (nih_bbnu, step_num, label) else: print " %9.2f (non-mrjob step)" % (nih_bbnu,) print # Top pools print "All pools, by total time billed:" for pool, nih_billed in sorted(s["pool_to_nih_billed"].iteritems(), key=lambda (p, nih): (-nih, p)): print " %9.2f %s" % (nih_billed, pool or "(not pooled)") print print "All pools, by total time billed but not used:" for pool, nih_bbnu in sorted(s["pool_to_nih_bbnu"].iteritems(), key=lambda (p, nih): (-nih, p)): print " %9.2f %s" % (nih_bbnu, pool or "(not pooled)") print # Top job flows print "All job flows, by total time billed:" top_job_flows = sorted(s["flows"], key=lambda jf: (-jf["nih_billed"], jf["name"])) for jf in top_job_flows: print " %9.2f %-15s %s" % (jf["nih_billed"], jf["id"], jf["name"]) print print "All job flows, by time billed but not used:" top_job_flows_bbnu = sorted(s["flows"], key=lambda jf: (-jf["nih_bbnu"], jf["name"])) for jf in top_job_flows_bbnu: print " %9.2f %-15s %s" % (jf["nih_bbnu"], jf["id"], jf["name"]) print # Details print "Details for all job flows:" print print ( " id state created steps" " time ran billed waste user name" ) all_job_flows = sorted(s["flows"], key=lambda jf: jf["created"], reverse=True) for jf in all_job_flows: print " %-15s %-13s %19s %3d %17s %9.2f %9.2f %8s %s" % ( jf["id"], jf["state"], jf["created"], jf["num_steps"], strip_microseconds(jf["ran"]), jf["nih_used"], jf["nih_bbnu"], (jf["owner"] or ""), (jf["label"] or ("not started by mrjob")), )
def inspect_and_maybe_terminate_job_flows(
        conf_path=None, dry_run=False, max_hours_idle=None,
        mins_to_end_of_hour=None, now=None, pool_name=None,
        pooled_only=False, unpooled_only=False):

    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    emr_conn = EMRJobRunner(conf_path=conf_path).make_emr_conn()

    log.info(
        'getting info about all job flows (this goes back about 2 months)')
    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    job_flows = describe_all_job_flows(emr_conn)

    num_running = 0
    num_idle = 0
    num_done = 0
    num_non_streaming = 0

    # a list of tuples of job flow id, name, idle time (as a timedelta)
    to_terminate = []

    for jf in job_flows:

        # check if job flow is done
        if is_job_flow_done(jf):
            num_done += 1

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        elif is_job_flow_non_streaming(jf):
            num_non_streaming += 1

        elif is_job_flow_running(jf):
            num_running += 1

        else:
            num_idle += 1
            time_idle = time_job_flow_idle(jf, now=now)
            time_to_end_of_hour = time_to_end_of_hour_for_job_flow(
                jf, now=now)
            pool = job_flow_pool_name(jf)

            log.debug(
                'Job flow %-15s idle for %s, %s to end of hour, %s (%s)' %
                (jf.jobflowid,
                 strip_microseconds(time_idle),
                 strip_microseconds(time_to_end_of_hour),
                 ('unpooled' if pool is None else 'in %s pool' % pool),
                 jf.name))

            # filter out job flows that don't meet our criteria
            if (max_hours_idle is not None and
                    time_idle <= timedelta(hours=max_hours_idle)):
                continue

            if (mins_to_end_of_hour is not None and
                    time_to_end_of_hour >= timedelta(
                        minutes=mins_to_end_of_hour)):
                continue

            if (pooled_only and pool is None):
                continue

            if (unpooled_only and pool is not None):
                continue

            if (pool_name is not None and pool != pool_name):
                continue

            to_terminate.append(
                (jf.jobflowid, jf.name, time_idle, time_to_end_of_hour))

    log.info('Job flow statuses: %d running, %d idle, %d active'
             ' non-streaming, %d done' %
             (num_running, num_idle, num_non_streaming, num_done))

    terminate_and_notify(emr_conn, to_terminate, dry_run=dry_run)
def _round_up_to_next_second(td):
    """Round up to the next second because that's how EMR bills."""
    if td.microseconds:
        return strip_microseconds(td) + timedelta(seconds=1)
    else:
        return td
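# A quick sketch of the rounding rule, assuming _round_up_to_next_second
# and strip_microseconds from above are in scope; the durations are
# arbitrary.
from datetime import timedelta

# any fractional second of billed time counts as a whole second...
print(_round_up_to_next_second(timedelta(seconds=90, microseconds=1)))
# 0:01:31

# ...but an exact number of seconds is returned unchanged
print(_round_up_to_next_second(timedelta(seconds=90)))
# 0:01:30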
def _maybe_terminate_clusters(dry_run=False, max_mins_idle=None, now=None,
                              pool_name=None, pooled_only=False,
                              unpooled_only=False, max_mins_locked=None,
                              quiet=False, **kwargs):
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_mins_idle is None:
        max_mins_idle = _DEFAULT_MAX_MINS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    num_starting = 0
    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_pending = 0
    num_running = 0

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for cluster_summary in _boto3_paginate(
            'Clusters', emr_client, 'list_clusters'):

        cluster_id = cluster_summary['Id']

        # check if cluster is done
        if _is_cluster_done(cluster_summary):
            num_done += 1
            continue

        # check if cluster is starting
        if _is_cluster_starting(cluster_summary):
            num_starting += 1
            continue

        # check if cluster is bootstrapping
        if _is_cluster_bootstrapping(cluster_summary):
            num_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        steps = list(reversed(list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cluster_id))))

        if any(_is_step_running(step) for step in steps):
            num_running += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(cluster_summary, steps)
        is_pending = _cluster_has_pending_steps(steps)

        # need to get actual cluster to see tags
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']

        _, pool = _pool_hash_and_name(cluster)

        if is_pending:
            num_pending += 1
        else:
            num_idle += 1

        log.debug(
            'cluster %s %s for %s, %s (%s)' %
            (cluster_id,
             'pending' if is_pending else 'idle',
             strip_microseconds(time_idle),
             ('unpooled' if pool is None else 'in %s pool' % pool),
             cluster_summary['Name']))

        # filter out clusters that don't meet our criteria
        if (max_mins_idle is not None and
                time_idle <= timedelta(minutes=max_mins_idle)):
            continue

        if (pooled_only and pool is None):
            continue

        if (unpooled_only and pool is not None):
            continue

        if (pool_name is not None and pool != pool_name):
            continue

        # terminate idle cluster
        _terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=cluster_summary['Name'],
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info(
        'Cluster statuses: %d starting, %d bootstrapping, %d running,'
        ' %d pending, %d idle, %d done' % (
            num_starting, num_bootstrapping, num_running,
            num_pending, num_idle, num_done))
def _maybe_terminate_clusters(dry_run=False, max_mins_idle=None, now=None,
                              pool_name=None, pooled_only=False,
                              unpooled_only=False, max_mins_locked=None,
                              quiet=False, **kwargs):
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_mins_idle is None:
        max_mins_idle = _DEFAULT_MAX_MINS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    num_starting = 0
    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_pending = 0
    num_running = 0

    # include RUNNING to catch clusters with PENDING jobs that
    # never ran (see #365).
    for cluster_summary in _boto3_paginate(
            'Clusters', emr_client, 'list_clusters',
            ClusterStates=['WAITING', 'RUNNING']):

        cluster_id = cluster_summary['Id']

        # check if cluster is done
        if _is_cluster_done(cluster_summary):
            num_done += 1
            continue

        # check if cluster is starting
        if _is_cluster_starting(cluster_summary):
            num_starting += 1
            continue

        # check if cluster is bootstrapping
        if _is_cluster_bootstrapping(cluster_summary):
            num_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        steps = list(reversed(list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cluster_id))))

        if any(_is_step_running(step) for step in steps):
            num_running += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(cluster_summary, steps)
        is_pending = _cluster_has_pending_steps(steps)

        # need to get actual cluster to see tags
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']

        _, pool = _pool_hash_and_name(cluster)

        if is_pending:
            num_pending += 1
        else:
            num_idle += 1

        log.debug('cluster %s %s for %s, %s (%s) - %s' % (
            cluster_id,
            'pending' if is_pending else 'idle',
            strip_microseconds(time_idle),
            ('unpooled' if pool is None else 'in %s pool' % pool),
            cluster_summary['Name'],
            'protected' if cluster['TerminationProtected'] else 'unprotected',
        ))

        # filter out clusters that don't meet our criteria
        if (max_mins_idle is not None and
                time_idle <= timedelta(minutes=max_mins_idle)):
            continue

        if (pooled_only and pool is None):
            continue

        if (unpooled_only and pool is not None):
            continue

        if (pool_name is not None and pool != pool_name):
            continue

        if cluster['TerminationProtected']:
            continue

        # terminate idle cluster
        _terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=cluster_summary['Name'],
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info('Cluster statuses: %d starting, %d bootstrapping, %d running,'
             ' %d pending, %d idle, %d done' %
             (num_starting, num_bootstrapping, num_running,
              num_pending, num_idle, num_done))
def _maybe_terminate_clusters(dry_run=False, max_hours_idle=None, mins_to_end_of_hour=None, now=None, pool_name=None, pooled_only=False, unpooled_only=False, max_mins_locked=None, quiet=False, **kwargs): if now is None: now = datetime.utcnow() # old default behavior if max_hours_idle is None and mins_to_end_of_hour is None: max_hours_idle = _DEFAULT_MAX_HOURS_IDLE runner = EMRJobRunner(**kwargs) emr_conn = runner.make_emr_conn() num_starting = 0 num_bootstrapping = 0 num_done = 0 num_idle = 0 num_non_streaming = 0 num_pending = 0 num_running = 0 # We don't filter by cluster state because we want this to work even # if Amazon adds another kind of idle state. for cluster_summary in _yield_all_clusters(emr_conn): cluster_id = cluster_summary.id # check if cluster is done if _is_cluster_done(cluster_summary): num_done += 1 continue # check if cluster is starting if _is_cluster_starting(cluster_summary): num_starting += 1 continue # check if cluster is bootstrapping if _is_cluster_bootstrapping(cluster_summary): num_bootstrapping += 1 continue # need steps to learn more about cluster steps = _list_all_steps(emr_conn, cluster_id) # we can't really tell if non-streaming jobs are idle or not, so # let them be (see Issue #60) if _is_cluster_non_streaming(steps): num_non_streaming += 1 continue if any(_is_step_running(step) for step in steps): num_running += 1 continue # cluster is idle time_idle = now - _time_last_active(cluster_summary, steps) time_to_end_of_hour = _est_time_to_hour(cluster_summary, now=now) is_pending = _cluster_has_pending_steps(steps) bootstrap_actions = list( _yield_all_bootstrap_actions(emr_conn, cluster_id)) _, pool = _pool_hash_and_name(bootstrap_actions) if is_pending: num_pending += 1 else: num_idle += 1 log.debug('cluster %s %s for %s, %s to end of hour, %s (%s)' % (cluster_id, 'pending' if is_pending else 'idle', strip_microseconds(time_idle), strip_microseconds(time_to_end_of_hour), ('unpooled' if pool is None else 'in %s pool' % pool), cluster_summary.name)) # filter out clusters that don't meet our criteria if (max_hours_idle is not None and time_idle <= timedelta(hours=max_hours_idle)): continue # mins_to_end_of_hour doesn't apply to jobs with pending steps if (mins_to_end_of_hour is not None and (is_pending or time_to_end_of_hour >= timedelta(minutes=mins_to_end_of_hour))): continue if (pooled_only and pool is None): continue if (unpooled_only and pool is not None): continue if (pool_name is not None and pool != pool_name): continue # terminate idle cluster _terminate_and_notify(runner=runner, cluster_id=cluster_id, cluster_name=cluster_summary.name, num_steps=len(steps), is_pending=is_pending, time_idle=time_idle, time_to_end_of_hour=time_to_end_of_hour, dry_run=dry_run, max_mins_locked=max_mins_locked, quiet=quiet) log.info('Cluster statuses: %d starting, %d bootstrapping, %d running,' ' %d pending, %d idle, %d active non-streaming, %d done' % (num_starting, num_bootstrapping, num_running, num_pending, num_idle, num_non_streaming, num_done))
def print_report(stats, now=None): """Print final report. :param stats: a dictionary returned by :py:func:`job_flows_to_stats` :param now: the current UTC time, as a :py:class:`datetime.datetime`. Defaults to the current time. """ if now is None: now = datetime.utcnow() s = stats if not s['flows']: print 'No job flows created in the past two months!' return print 'Total # of Job Flows: %d' % len(s['flows']) print print '* All times are in UTC.' print print 'Min create time: %s' % min(jf['created'] for jf in s['flows']) print 'Max create time: %s' % max(jf['created'] for jf in s['flows']) print ' Current time: %s' % now.replace(microsecond=0) print print '* All usage is measured in Normalized Instance Hours, which are' print ' roughly equivalent to running an m1.small instance for an hour.' print " Billing is estimated, and may not match Amazon's system exactly." print # total compute-unit hours used def with_pct(usage): return (usage, percent(usage, s['nih_billed'])) print 'Total billed: %9.2f %5.1f%%' % with_pct(s['nih_billed']) print ' Total used: %9.2f %5.1f%%' % with_pct(s['nih_used']) print ' bootstrap: %9.2f %5.1f%%' % with_pct(s['bootstrap_nih_used']) print ' jobs: %9.2f %5.1f%%' % with_pct(s['job_nih_used']) print ' Total waste: %9.2f %5.1f%%' % with_pct(s['nih_bbnu']) print ' at end: %9.2f %5.1f%%' % with_pct(s['end_nih_bbnu']) print ' other: %9.2f %5.1f%%' % with_pct(s['other_nih_bbnu']) print if s['date_to_nih_billed']: print 'Daily statistics:' print print ' date billed used waste % waste' d = max(s['date_to_nih_billed']) while d >= min(s['date_to_nih_billed']): print ' %10s %9.2f %9.2f %9.2f %5.1f' % ( d, s['date_to_nih_billed'][d], s['date_to_nih_used'].get(d, 0.0), s['date_to_nih_bbnu'].get(d, 0.0), percent(s['date_to_nih_bbnu'].get(d, 0.0), s['date_to_nih_billed'][d])) d -= timedelta(days=1) print print '* Job flows are considered to belong to the user and job that' print ' started them or last ran on them.' 
print # Top jobs print 'Top jobs, by total time used:' for label, nih_used in sorted(s['label_to_nih_used'].iteritems(), key=lambda (lb, nih): (-nih, lb)): print ' %9.2f %s' % (nih_used, label) print print 'Top jobs, by time billed but not used:' for label, nih_bbnu in sorted(s['label_to_nih_bbnu'].iteritems(), key=lambda (lb, nih): (-nih, lb)): print ' %9.2f %s' % (nih_bbnu, label) print # Top users print 'Top users, by total time used:' for owner, nih_used in sorted(s['owner_to_nih_used'].iteritems(), key=lambda (o, nih): (-nih, o)): print ' %9.2f %s' % (nih_used, owner) print print 'Top users, by time billed but not used:' for owner, nih_bbnu in sorted(s['owner_to_nih_bbnu'].iteritems(), key=lambda (o, nih): (-nih, o)): print ' %9.2f %s' % (nih_bbnu, owner) print # Top job steps print 'Top job steps, by total time used (step number first):' for (label, step_num), nih_used in sorted( s['job_step_to_nih_used'].iteritems(), key=lambda (k, nih): (-nih, k)): if label: print ' %9.2f %3d %s' % (nih_used, step_num, label) else: print ' %9.2f (non-mrjob step)' % (nih_used,) print print 'Top job steps, by total time billed but not used (un-pooled only):' for (label, step_num), nih_bbnu in sorted( s['job_step_to_nih_bbnu_no_pool'].iteritems(), key=lambda (k, nih): (-nih, k)): if label: print ' %9.2f %3d %s' % (nih_bbnu, step_num, label) else: print ' %9.2f (non-mrjob step)' % (nih_bbnu,) print # Top pools print 'All pools, by total time billed:' for pool, nih_billed in sorted(s['pool_to_nih_billed'].iteritems(), key=lambda (p, nih): (-nih, p)): print ' %9.2f %s' % (nih_billed, pool or '(not pooled)') print print 'All pools, by total time billed but not used:' for pool, nih_bbnu in sorted(s['pool_to_nih_bbnu'].iteritems(), key=lambda (p, nih): (-nih, p)): print ' %9.2f %s' % (nih_bbnu, pool or '(not pooled)') print # Top job flows print 'All job flows, by total time billed:' top_job_flows = sorted(s['flows'], key=lambda jf: (-jf['nih_billed'], jf['name'])) for jf in top_job_flows: print ' %9.2f %-15s %s' % ( jf['nih_billed'], jf['id'], jf['name']) print print 'All job flows, by time billed but not used:' top_job_flows_bbnu = sorted(s['flows'], key=lambda jf: (-jf['nih_bbnu'], jf['name'])) for jf in top_job_flows_bbnu: print ' %9.2f %-15s %s' % ( jf['nih_bbnu'], jf['id'], jf['name']) print # Details print 'Details for all job flows:' print print (' id state created steps' ' time ran billed waste user name') all_job_flows = sorted(s['flows'], key=lambda jf: jf['created'], reverse=True) for jf in all_job_flows: print ' %-15s %-13s %19s %3d %17s %9.2f %9.2f %8s %s' % ( jf['id'], jf['state'], jf['created'], jf['num_steps'], strip_microseconds(jf['ran']), jf['nih_used'], jf['nih_bbnu'], (jf['owner'] or ''), (jf['label'] or ('not started by mrjob')))