def yield_clusters(max_days_ago=None, now=None, **runner_kwargs):
    """Get relevant cluster information from EMR.

    (Fixed stale docstring: this yields *clusters*, not "job flows" --
    the code below describes clusters and attaches their steps and
    bootstrap actions.)

    :param float max_days_ago: If set, don't fetch clusters created
                               longer than this many days ago.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    :param runner_kwargs: keyword args to pass through to
                          :py:class:`~mrjob.emr.EMRJobRunner`
    """
    if now is None:
        now = datetime.utcnow()

    emr_conn = EMRJobRunner(**runner_kwargs).make_emr_conn()

    # if --max-days-ago is set, only look at recent jobs
    created_after = None
    if max_days_ago is not None:
        created_after = now - timedelta(days=max_days_ago)

    for cluster_summary in _yield_all_clusters(
            emr_conn, created_after=created_after):
        cluster_id = cluster_summary.id

        # flesh out the summary: full description plus steps and
        # bootstrap actions, fetched with separate API calls
        cluster = patched_describe_cluster(emr_conn, cluster_id)
        cluster.steps = list(_yield_all_steps(emr_conn, cluster_id))
        cluster.bootstrapactions = list(
            _yield_all_bootstrap_actions(emr_conn, cluster_id))

        yield cluster
def yield_clusters(max_days_ago=None, now=None, **runner_kwargs):
    """Yield fully-described clusters from EMR, one at a time.

    :param float max_days_ago: If set, skip clusters created more than
                               this many days ago.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    :param runner_kwargs: keyword args to pass through to
                          :py:class:`~mrjob.emr.EMRJobRunner`
    """
    if now is None:
        now = datetime.utcnow()

    conn = EMRJobRunner(**runner_kwargs).make_emr_conn()

    # translate --max-days-ago into a creation-time cutoff, if set
    cutoff = (now - timedelta(days=max_days_ago)
              if max_days_ago is not None else None)

    for summary in _yield_all_clusters(conn, created_after=cutoff):
        # flesh out the summary with the full description, the cluster's
        # steps, and its bootstrap actions before yielding it
        cluster = patched_describe_cluster(conn, summary.id)
        cluster.steps = list(_yield_all_steps(conn, summary.id))
        cluster.bootstrapactions = list(
            _yield_all_bootstrap_actions(conn, summary.id))
        yield cluster
def _find_long_running_jobs(emr_conn, cluster_summaries, min_time, now=None):
    """Identify jobs that have been running or pending for a long time.

    :param cluster_summaries: a list of
                              :py:class:`boto.emr.emrobject.Cluster`
                              objects to inspect.
    :param min_time: a :py:class:`datetime.timedelta`: report jobs running
                     or pending longer than this
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    For each job that is running or pending longer than *min_time*, yields
    a dictionary with the following keys:

    * *cluster_id*: the cluster's unique ID (e.g. ``j-SOMECLUSTER``)
    * *name*: name of the step, or the cluster when bootstrapping
    * *state*: state of the step (``'RUNNING'`` or ``'PENDING'``) or, if
      there is no step, the cluster (``'STARTING'`` or ``'BOOTSTRAPPING'``)
    * *time*: amount of time step was running or pending, as a
      :py:class:`datetime.timedelta`
    """
    if now is None:
        now = datetime.utcnow()

    for cs in cluster_summaries:

        # special case for jobs that are taking a long time to bootstrap
        if cs.status.state in ('STARTING', 'BOOTSTRAPPING'):
            # there isn't a way to tell when the cluster stopped being
            # provisioned and started bootstrapping, so just measure
            # from cluster creation time
            created_timestamp = cs.status.timeline.creationdatetime
            created = iso8601_to_datetime(created_timestamp)

            time_running = now - created

            if time_running >= min_time:
                yield({'cluster_id': cs.id,
                       'name': cs.name,
                       'state': cs.status.state,
                       'time': time_running})

        # the default case: running clusters
        if cs.status.state != 'RUNNING':
            continue

        steps = list(_yield_all_steps(emr_conn, cs.id))

        running_steps = [
            step for step in steps if step.status.state == 'RUNNING']
        pending_steps = [
            step for step in steps if step.status.state == 'PENDING']

        if running_steps:
            # should be only one, but if not, we should know about it
            for step in running_steps:
                start_timestamp = step.status.timeline.startdatetime
                start = iso8601_to_datetime(start_timestamp)

                time_running = now - start

                if time_running >= min_time:
                    yield({'cluster_id': cs.id,
                           'name': step.name,
                           'state': step.status.state,
                           'time': time_running})

        # sometimes EMR says it's "RUNNING" but doesn't actually run steps!
        elif pending_steps:
            step = pending_steps[0]

            # PENDING job should have run starting when the cluster
            # became ready, or the previous step completed
            start_timestamp = cs.status.timeline.readydatetime
            # BUGFIX: iterate with a separate variable; the original
            # reused `step` here, so the yield below reported the *last*
            # step in the cluster instead of the first pending one
            for earlier_step in steps:
                if earlier_step.status.state == 'COMPLETED':
                    start_timestamp = (
                        earlier_step.status.timeline.enddatetime)
            start = iso8601_to_datetime(start_timestamp)

            time_pending = now - start

            if time_pending >= min_time:
                yield({'cluster_id': cs.id,
                       'name': step.name,
                       'state': step.status.state,
                       'time': time_pending})
def maybe_terminate_clusters(dry_run=False, max_hours_idle=None,
                             mins_to_end_of_hour=None, now=None,
                             pool_name=None, pooled_only=False,
                             unpooled_only=False, max_mins_locked=None,
                             quiet=False, **kwargs):
    """Scan all EMR clusters and terminate the ones that look idle,
    subject to the given filters. Logs a tally of cluster states at
    the end.
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_conn = runner.make_emr_conn()

    # tallies for the summary line logged at the end
    counts = dict(starting=0, bootstrapping=0, done=0, idle=0,
                  non_streaming=0, pending=0, running=0)

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for summary in _yield_all_clusters(emr_conn):
        cluster_id = summary.id

        # cheap state checks first; these don't require fetching steps
        if is_cluster_done(summary):
            counts['done'] += 1
            continue

        if is_cluster_starting(summary):
            counts['starting'] += 1
            continue

        if is_cluster_bootstrapping(summary):
            counts['bootstrapping'] += 1
            continue

        # need steps to learn more about cluster
        steps = list(_yield_all_steps(emr_conn, cluster_id))

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        if is_cluster_non_streaming(steps):
            counts['non_streaming'] += 1
            continue

        if any(is_step_running(step) for step in steps):
            counts['running'] += 1
            continue

        # cluster is idle
        time_idle = now - time_last_active(summary, steps)
        time_to_end_of_hour = _est_time_to_hour(summary, now=now)
        is_pending = cluster_has_pending_steps(steps)

        bootstrap_actions = list(_yield_all_bootstrap_actions(
            emr_conn, cluster_id))
        _, pool = _pool_hash_and_name(bootstrap_actions)

        counts['pending' if is_pending else 'idle'] += 1

        log.debug(
            'cluster %s %s for %s, %s to end of hour, %s (%s)' %
            (cluster_id,
             'pending' if is_pending else 'idle',
             strip_microseconds(time_idle),
             strip_microseconds(time_to_end_of_hour),
             ('unpooled' if pool is None else 'in %s pool' % pool),
             summary.name))

        # filter out clusters that don't meet our criteria
        if (max_hours_idle is not None and
                time_idle <= timedelta(hours=max_hours_idle)):
            continue

        # mins_to_end_of_hour doesn't apply to jobs with pending steps
        if (mins_to_end_of_hour is not None and
                (is_pending or
                 time_to_end_of_hour >= timedelta(
                     minutes=mins_to_end_of_hour))):
            continue

        if pooled_only and pool is None:
            continue

        if unpooled_only and pool is not None:
            continue

        if pool_name is not None and pool != pool_name:
            continue

        # terminate idle cluster
        terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=summary.name,
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            time_to_end_of_hour=time_to_end_of_hour,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info(
        'Cluster statuses: %d starting, %d bootstrapping, %d running,'
        ' %d pending, %d idle, %d active non-streaming, %d done' % (
            counts['starting'], counts['bootstrapping'], counts['running'],
            counts['pending'], counts['idle'], counts['non_streaming'],
            counts['done']))
def maybe_terminate_clusters(dry_run=False, max_hours_idle=None,
                             mins_to_end_of_hour=None, now=None,
                             pool_name=None, pooled_only=False,
                             unpooled_only=False, max_mins_locked=None,
                             quiet=False, **kwargs):
    """Terminate idle EMR job flows that pass the given filters, logging
    a per-state tally when done.
    """
    if now is None:
        now = datetime.utcnow()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    conn = runner.make_emr_conn()

    # per-state tallies, reported in the log line at the end
    n_starting = n_bootstrapping = n_done = 0
    n_idle = n_non_streaming = n_pending = n_running = 0

    # We don't filter by job flow state because we want this to work even
    # if Amazon adds another kind of idle state.
    for cs in _yield_all_clusters(conn):
        # quick checks that need only the cluster summary
        if is_cluster_done(cs):
            n_done += 1
            continue
        if is_cluster_starting(cs):
            n_starting += 1
            continue
        if is_cluster_bootstrapping(cs):
            n_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        steps = list(_yield_all_steps(conn, cs.id))

        # we can't really tell if non-streaming jobs are idle or not, so
        # let them be (see Issue #60)
        if is_cluster_non_streaming(steps):
            n_non_streaming += 1
            continue
        if any(is_step_running(s) for s in steps):
            n_running += 1
            continue

        # cluster is idle
        time_idle = now - time_last_active(cs, steps)
        time_to_end_of_hour = _est_time_to_hour(cs, now=now)
        is_pending = cluster_has_pending_steps(steps)

        actions = list(_yield_all_bootstrap_actions(conn, cs.id))
        _, pool = _pool_hash_and_name(actions)

        if is_pending:
            n_pending += 1
        else:
            n_idle += 1

        log.debug('Job flow %s %s for %s, %s to end of hour, %s (%s)' %
                  (cs.id,
                   'pending' if is_pending else 'idle',
                   strip_microseconds(time_idle),
                   strip_microseconds(time_to_end_of_hour),
                   ('unpooled' if pool is None else 'in %s pool' % pool),
                   cs.name))

        # filter out job flows that don't meet our criteria
        if (max_hours_idle is not None and
                time_idle <= timedelta(hours=max_hours_idle)):
            continue

        # mins_to_end_of_hour doesn't apply to jobs with pending steps
        if mins_to_end_of_hour is not None:
            if is_pending or (time_to_end_of_hour >=
                              timedelta(minutes=mins_to_end_of_hour)):
                continue

        if pooled_only and pool is None:
            continue
        if unpooled_only and pool is not None:
            continue
        if pool_name is not None and pool != pool_name:
            continue

        # terminate idle cluster
        terminate_and_notify(runner=runner,
                             cluster_id=cs.id,
                             cluster_name=cs.name,
                             num_steps=len(steps),
                             is_pending=is_pending,
                             time_idle=time_idle,
                             time_to_end_of_hour=time_to_end_of_hour,
                             dry_run=dry_run,
                             max_mins_locked=max_mins_locked,
                             quiet=quiet)

    log.info('Job flow statuses: %d starting, %d bootstrapping, %d running,'
             ' %d pending, %d idle, %d active non-streaming, %d done' %
             (n_starting, n_bootstrapping, n_running, n_pending,
              n_idle, n_non_streaming, n_done))
def find_long_running_jobs(emr_conn, cluster_summaries, min_time, now=None):
    """Identify jobs that have been running or pending for a long time.

    :param cluster_summaries: a list of
                              :py:class:`boto.emr.emrobject.Cluster`
                              objects to inspect.
    :param min_time: a :py:class:`datetime.timedelta`: report jobs running
                     or pending longer than this
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    For each job that is running or pending longer than *min_time*, yields
    a dictionary with the following keys:

    * *cluster_id*: the cluster's unique ID (e.g. ``j-SOMECLUSTER``)
    * *name*: name of the step, or the cluster when bootstrapping
    * *state*: state of the step (``'RUNNING'`` or ``'PENDING'``) or, if
      there is no step, the cluster (``'STARTING'`` or ``'BOOTSTRAPPING'``)
    * *time*: amount of time step was running or pending, as a
      :py:class:`datetime.timedelta`
    """
    if now is None:
        now = datetime.utcnow()

    for cs in cluster_summaries:

        # special case for jobs that are taking a long time to bootstrap
        if cs.status.state in ('STARTING', 'BOOTSTRAPPING'):
            # there isn't a way to tell when the cluster stopped being
            # provisioned and started bootstrapping, so just measure
            # from cluster creation time
            created_timestamp = cs.status.timeline.creationdatetime
            created = iso8601_to_datetime(created_timestamp)

            time_running = now - created

            if time_running >= min_time:
                yield({'cluster_id': cs.id,
                       'name': cs.name,
                       'state': cs.status.state,
                       'time': time_running})

        # the default case: running clusters
        if cs.status.state != 'RUNNING':
            continue

        steps = list(_yield_all_steps(emr_conn, cs.id))

        running_steps = [
            step for step in steps if step.status.state == 'RUNNING']
        pending_steps = [
            step for step in steps if step.status.state == 'PENDING']

        if running_steps:
            # should be only one, but if not, we should know about it
            for step in running_steps:
                start_timestamp = step.status.timeline.startdatetime
                start = iso8601_to_datetime(start_timestamp)

                time_running = now - start

                if time_running >= min_time:
                    yield({'cluster_id': cs.id,
                           'name': step.name,
                           'state': step.status.state,
                           'time': time_running})

        # sometimes EMR says it's "RUNNING" but doesn't actually run steps!
        elif pending_steps:
            step = pending_steps[0]

            # PENDING job should have run starting when the cluster
            # became ready, or the previous step completed
            start_timestamp = cs.status.timeline.readydatetime
            # BUGFIX: iterate with a separate variable; the original
            # reused `step` here, so the yield below reported the *last*
            # step in the cluster instead of the first pending one
            for earlier_step in steps:
                if earlier_step.status.state == 'COMPLETED':
                    start_timestamp = (
                        earlier_step.status.timeline.enddatetime)
            start = iso8601_to_datetime(start_timestamp)

            time_pending = now - start

            if time_pending >= min_time:
                yield({'cluster_id': cs.id,
                       'name': step.name,
                       'state': step.status.state,
                       'time': time_pending})