def _yield_clusters(max_days_ago=None, now=None, **runner_kwargs):
    """Generate full cluster descriptions from EMR, one at a time.

    :param float max_days_ago: skip clusters created more than this many
                               days before *now*.
    :param now: the current UTC time as a :py:class:`datetime.datetime`;
                defaults to the actual current time.
    :param runner_kwargs: keyword args passed straight through to
                          :py:class:`~mrjob.emr.EMRJobRunner`
    """
    if now is None:
        now = _boto3_now()

    emr_client = EMRJobRunner(**runner_kwargs).make_emr_client()

    # sleep _DELAY seconds after every API call (see #1091). Could become
    # a connection wrapper if it turns out to be more generally useful.
    list_clusters_kwargs = dict(_delay=_DELAY)

    # honor --max-days-ago by only listing recently created clusters
    if max_days_ago is not None:
        list_clusters_kwargs['CreatedAfter'] = (
            now - timedelta(days=max_days_ago))

    for summary in _boto3_paginate(
            'Clusters', emr_client, 'list_clusters', **list_clusters_kwargs):
        cluster_id = summary['Id']

        # the summary alone isn't enough; fetch the full description
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']
        sleep(_DELAY)

        # reversed() so steps end up oldest-first (the API appears to
        # page through them newest-first -- TODO confirm)
        step_pages = _boto3_paginate(
            'Steps', emr_client, 'list_steps',
            ClusterId=cluster_id, _delay=_DELAY)
        cluster['Steps'] = list(reversed(list(step_pages)))

        cluster['BootstrapActions'] = list(_boto3_paginate(
            'BootstrapActions', emr_client, 'list_bootstrap_actions',
            ClusterId=cluster_id, _delay=_DELAY))

        yield cluster
def get_or_create_mrjob_instance_profile(client):
    """Look for a usable instance profile for EMR, and if there is none,
    create one."""
    # scan existing instance profiles for one whose single role has the
    # expected policy document and attached role policy
    for profile in _boto3_paginate(
            'InstanceProfiles', client, 'list_instance_profiles'):
        roles = profile['Roles']
        if len(roles) == 1 and _role_matches(
                client, roles[0], _MRJOB_INSTANCE_PROFILE_ROLE,
                _EMR_INSTANCE_PROFILE_POLICY_ARN):
            return profile['InstanceProfileName']

    # nothing matched: create a fresh role, then wrap it in an instance
    # profile that shares its name
    name = _create_mrjob_role_with_attached_policy(
        client, _MRJOB_INSTANCE_PROFILE_ROLE,
        _EMR_INSTANCE_PROFILE_POLICY_ARN)

    client.create_instance_profile(InstanceProfileName=name)
    client.add_role_to_instance_profile(
        InstanceProfileName=name, RoleName=name)

    log.info('Auto-created instance profile %s' % name)
    return name
def main(args=None):
    """Entry point for the long-running-jobs report tool.

    :param args: command-line arguments (``None`` means ``sys.argv[1:]``,
                 per :py:mod:`argparse`)
    """
    now = _boto3_now()

    options = _make_arg_parser().parse_args(args)
    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    log.info('getting information about running jobs')

    emr_client = EMRJobRunner(**_runner_kwargs(options)).make_emr_client()

    # only clusters in these states can have active or queued work
    summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters',
        ClusterStates=['STARTING', 'BOOTSTRAPPING', 'RUNNING'])

    # apply --exclude tag filters, if any were given
    if options.exclude:
        summaries = _filter_clusters(summaries, emr_client, options.exclude)

    _print_report(_find_long_running_jobs(
        emr_client, summaries, timedelta(hours=options.min_hours), now=now))
def main(args=None):
    """Report clusters with jobs running or pending for a long time.

    :param args: command-line arguments (``None`` means ``sys.argv[1:]``)
    """
    now = _boto3_now()

    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    log.info('getting information about running jobs')

    # only jobs running/pending at least this long get reported
    min_time = timedelta(hours=options.min_hours)

    emr_client = EMRJobRunner(**_runner_kwargs(options)).make_emr_client()

    # only clusters in these states can have active or queued work
    cluster_summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters',
        ClusterStates=['STARTING', 'BOOTSTRAPPING', 'RUNNING'])

    if not options.exclude:
        filtered_cluster_summaries = cluster_summaries
    else:
        # drop clusters matching the --exclude tag filters
        filtered_cluster_summaries = _filter_clusters(
            cluster_summaries, emr_client, options.exclude)

    job_info = _find_long_running_jobs(
        emr_client, filtered_cluster_summaries, min_time, now=now)

    _print_report(job_info)
def get_or_create_mrjob_instance_profile(client): """Look for a usable instance profile for EMR, and if there is none, create one.""" # look for matching instance profile. Must point to a role with # the right policy document and attached role policy for profile in _boto3_paginate( 'InstanceProfiles', client, 'list_instance_profiles'): roles = profile['Roles'] if len(roles) != 1: continue if _role_matches(client, roles[0], _MRJOB_INSTANCE_PROFILE_ROLE, _EMR_INSTANCE_PROFILE_POLICY_ARN): return profile['InstanceProfileName'] # create a new role, and wrap it in an instance profile # with the same name name = _create_mrjob_role_with_attached_policy( client, _MRJOB_INSTANCE_PROFILE_ROLE, _EMR_INSTANCE_PROFILE_POLICY_ARN) client.create_instance_profile(InstanceProfileName=name) client.add_role_to_instance_profile(InstanceProfileName=name, RoleName=name) log.info('Auto-created instance profile %s' % name) return name
def _yield_clusters(max_days_ago=None, now=None, **runner_kwargs):
    """Get relevant cluster information from EMR.

    :param float max_days_ago: If set, don't fetch clusters created longer
                               than this many days ago.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    :param runner_kwargs: keyword args to pass through to
                          :py:class:`~mrjob.emr.EMRJobRunner`
    """
    if now is None:
        now = _boto3_now()

    emr_client = EMRJobRunner(**runner_kwargs).make_emr_client()

    # if --max-days-ago is set, only look at recent jobs
    created_after = None
    if max_days_ago is not None:
        created_after = now - timedelta(days=max_days_ago)

    # use _DELAY to sleep 1 second after each API call (see #1091). Could
    # implement some sort of connection wrapper for this if it becomes more
    # generally useful.
    list_clusters_kwargs = dict(_delay=_DELAY)
    if created_after is not None:
        list_clusters_kwargs['CreatedAfter'] = created_after

    for cluster_summary in _boto3_paginate(
            'Clusters', emr_client, 'list_clusters', **list_clusters_kwargs):

        cluster_id = cluster_summary['Id']

        # the summary alone isn't enough; fetch the full description
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']
        sleep(_DELAY)

        # reversed() so steps end up oldest-first (the API appears to
        # page through them newest-first -- TODO confirm)
        cluster['Steps'] = list(reversed(list(_boto3_paginate(
            'Steps', emr_client, 'list_steps',
            ClusterId=cluster_id, _delay=_DELAY))))

        cluster['BootstrapActions'] = list(_boto3_paginate(
            'BootstrapActions', emr_client, 'list_bootstrap_actions',
            ClusterId=cluster_id, _delay=_DELAY))

        yield cluster
def _get_step(emr_client, cluster_id, step_id=None):
    """Return the first step on *cluster_id* accepted by
    ``_step_matches()``, or log an error and return ``None``.

    No pre-filtering by step ID or status is done: we just walk the
    steps as the API returns them (backwards); usually the match is the
    last step anyhow.
    """
    steps = _boto3_paginate(
        'Steps', emr_client, 'list_steps', ClusterId=cluster_id)

    match = next(
        (step for step in steps if _step_matches(step, step_id=step_id)),
        None)
    if match is not None:
        return match

    # nothing matched; explain why
    if step_id:
        log.error('step %s not found on cluster %s' % (step_id, cluster_id))
    else:
        log.error('cluster %s has no failed steps' % cluster_id)
def _get_step(emr_client, cluster_id, step_id=None):
    """Return the first step on *cluster_id* that ``_step_matches()``
    accepts, or log an error (and implicitly return ``None``).

    :param step_id: if set, look for this specific step
    """
    # just iterate backwards through steps, rather than filtering
    # by step ID or status. usually it'll be the last step anyhow
    for step in _boto3_paginate('Steps', emr_client, 'list_steps',
                                ClusterId=cluster_id):
        if _step_matches(step, step_id=step_id):
            return step
    else:
        # for/else: the loop has no break, so this always runs unless
        # we returned a match above
        if step_id:
            log.error('step %s not found on cluster %s' % (
                step_id, cluster_id))
        else:
            log.error('cluster %s has no failed steps' % cluster_id)
def get_or_create_mrjob_service_role(client):
    """Find a usable service role for EMR, creating one if necessary.

    Either way, return that role's name."""
    # a usable role must have the same policy document and attached
    # role policy
    existing = next(
        (role['RoleName']
         for role in _boto3_paginate('Roles', client, 'list_roles')
         if _role_matches(client, role, _MRJOB_SERVICE_ROLE,
                          _EMR_SERVICE_ROLE_POLICY_ARN)),
        None)
    if existing is not None:
        return existing

    # nothing matched, so create it ourselves
    role_name = _create_mrjob_role_with_attached_policy(
        client, _MRJOB_SERVICE_ROLE, _EMR_SERVICE_ROLE_POLICY_ARN)

    log.info('Auto-created service role %s' % role_name)
    return role_name
def get_or_create_mrjob_service_role(client): """Look for a usable service role for EMR, and if there is none, create one. Either way, return that role's name.""" # look for matching role. Must have same policy document # and attached role policy for role in _boto3_paginate('Roles', client, 'list_roles'): if _role_matches(client, role, _MRJOB_SERVICE_ROLE, _EMR_SERVICE_ROLE_POLICY_ARN): return role['RoleName'] # no matches, create it ourselves role_name = _create_mrjob_role_with_attached_policy( client, _MRJOB_SERVICE_ROLE, _EMR_SERVICE_ROLE_POLICY_ARN) log.info('Auto-created service role %s' % role_name) return role_name
def test_retry_during_pagination(self):
    """A transient error raised mid-pagination should be retried."""
    # regression test for #2005
    bucket_names = ['walrus%02d' % i for i in range(100)]

    # must set side_effect before adding error
    self.list_buckets.side_effect = [dict(Buckets=bucket_names)]
    self.add_transient_error(socket.error(110, 'Connection timed out'))

    # our mock pagination somewhat messes with this test; rather than
    # getting called once per page of bucket names, list_buckets() only
    # gets called twice, once to fail with a transient error, and once to
    # get the full list of buckets, which mock pagination then breaks
    # into "pages". This still tests the important thing though, which is
    # that we can retry at all within pagination
    self.assertEqual(
        list(_boto3_paginate(
            'Buckets', self.wrapped_client, 'list_buckets')),
        bucket_names)
def _find_long_running_jobs(emr_client, cluster_summaries, min_time, now=None): """Identify jobs that have been running or pending for a long time. :param clusters: a list of :py:mod:`boto3` cluster summary data structures :param min_time: a :py:class:`datetime.timedelta`: report jobs running or pending longer than this :param now: the current UTC time, as a :py:class:`datetime.datetime`. Defaults to the current time. For each job that is running or pending longer than *min_time*, yields a dictionary with the following keys: * *cluster_id*: the cluster's unique ID (e.g. ``j-SOMECLUSTER``) * *name*: name of the step, or the cluster when bootstrapping * *state*: state of the step (``'RUNNING'`` or ``'PENDING'``) or, if there is no step, the cluster (``'STARTING'`` or ``'BOOTSTRAPPING'``) * *time*: amount of time step was running or pending, as a :py:class:`datetime.timedelta` """ if now is None: now = _boto3_now() for cs in cluster_summaries: # special case for jobs that are taking a long time to bootstrap if cs['Status']['State'] in ('STARTING', 'BOOTSTRAPPING'): # there isn't a way to tell when the cluster stopped being # provisioned and started bootstrapping, so just measure # from cluster creation time created = cs['Status']['Timeline']['CreationDateTime'] time_running = now - created if time_running >= min_time: yield({'cluster_id': cs['Id'], 'name': cs['Name'], 'state': cs['Status']['State'], 'time': time_running}) # the default case: running clusters if cs['Status']['State'] != 'RUNNING': continue steps = list(reversed(list(_boto3_paginate( 'Steps', emr_client, 'list_steps', ClusterId=cs['Id'])))) running_steps = [ step for step in steps if step['Status']['State'] == 'RUNNING'] pending_steps = [ step for step in steps if step['Status']['State'] == 'PENDING'] if running_steps: # should be only one, but if not, we should know about it for step in running_steps: start = step['Status']['Timeline']['StartDateTime'] time_running = now - start if time_running >= 
min_time: yield({'cluster_id': cs['Id'], 'name': step['Name'], 'state': step['Status']['State'], 'time': time_running}) # sometimes EMR says it's "RUNNING" but doesn't actually run steps! elif pending_steps: step = pending_steps[0] # PENDING job should have run starting when the cluster # became ready, or the previous step completed start = cs['Status']['Timeline']['ReadyDateTime'] for step in steps: if step['Status']['State'] == 'COMPLETED': start = step['Status']['Timeline']['EndDateTime'] time_pending = now - start if time_pending >= min_time: yield({'cluster_id': cs['Id'], 'name': step['Name'], 'state': step['Status']['State'], 'time': time_pending})
def _find_long_running_jobs(emr_client, cluster_summaries, min_time, now=None): """Identify jobs that have been running or pending for a long time. :param clusters: a list of :py:mod:`boto3` cluster summary data structures :param min_time: a :py:class:`datetime.timedelta`: report jobs running or pending longer than this :param now: the current UTC time, as a :py:class:`datetime.datetime`. Defaults to the current time. For each job that is running or pending longer than *min_time*, yields a dictionary with the following keys: * *cluster_id*: the cluster's unique ID (e.g. ``j-SOMECLUSTER``) * *name*: name of the step, or the cluster when bootstrapping * *state*: state of the step (``'RUNNING'`` or ``'PENDING'``) or, if there is no step, the cluster (``'STARTING'`` or ``'BOOTSTRAPPING'``) * *time*: amount of time step was running or pending, as a :py:class:`datetime.timedelta` """ if now is None: now = _boto3_now() for cs in cluster_summaries: # special case for jobs that are taking a long time to bootstrap if cs['Status']['State'] in ('STARTING', 'BOOTSTRAPPING'): # there isn't a way to tell when the cluster stopped being # provisioned and started bootstrapping, so just measure # from cluster creation time created = cs['Status']['Timeline']['CreationDateTime'] time_running = now - created if time_running >= min_time: yield({'cluster_id': cs['Id'], 'name': cs['Name'], 'state': cs['Status']['State'], 'time': time_running}) # the default case: running clusters if cs['Status']['State'] != 'RUNNING': continue steps = list(reversed(list(_boto3_paginate( 'Steps', emr_client, 'list_steps', ClusterId=cs['Id'])))) running_steps = [ step for step in steps if step['Status']['State'] == 'RUNNING'] pending_steps = [ step for step in steps if step['Status']['State'] == 'PENDING'] if running_steps: # should be only one, but if not, we should know about it for step in running_steps: start = step['Status']['Timeline']['StartDateTime'] time_running = now - start if time_running >= 
min_time: yield({'cluster_id': cs['Id'], 'name': step['Name'], 'state': step['Status']['State'], 'time': time_running}) # sometimes EMR says it's "RUNNING" but doesn't actually run steps! elif pending_steps: step = pending_steps[0] # PENDING job should have run starting when the cluster # became ready, or the previous step completed start = cs['Status']['Timeline']['ReadyDateTime'] for step in steps: if step['Status']['State'] == 'COMPLETED': start = step['Status']['Timeline']['EndDateTime'] time_pending = now - start if time_pending >= min_time: yield({'cluster_id': cs['Id'], 'name': step['Name'], 'state': step['Status']['State'], 'time': time_pending})
def _maybe_terminate_clusters(dry_run=False,
                              max_hours_idle=None,
                              mins_to_end_of_hour=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    """Scan all clusters, count them by state, and pass idle/pending ones
    that match our criteria to ``_terminate_and_notify()``.

    :param dry_run: passed through to ``_terminate_and_notify()``
    :param max_hours_idle: only act on clusters idle at least this long
    :param mins_to_end_of_hour: only act on clusters within this many
                                minutes of the end of a billing hour
    :param now: the current UTC time; defaults to the current time
    :param pool_name: only act on clusters in this pool
    :param pooled_only: only act on pooled clusters
    :param unpooled_only: only act on unpooled clusters
    :param max_mins_locked: passed through to ``_terminate_and_notify()``
    :param quiet: passed through to ``_terminate_and_notify()``
    :param kwargs: passed to :py:class:`EMRJobRunner`
    """
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = _DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    # per-state tallies, reported at the end
    num_starting = 0
    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_pending = 0
    num_running = 0

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for cluster_summary in _boto3_paginate(
            'Clusters', emr_client, 'list_clusters'):

        cluster_id = cluster_summary['Id']

        # check if cluster is done
        if _is_cluster_done(cluster_summary):
            num_done += 1
            continue

        # check if cluster is starting
        if _is_cluster_starting(cluster_summary):
            num_starting += 1
            continue

        # check if cluster is bootstrapping
        if _is_cluster_bootstrapping(cluster_summary):
            num_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        steps = list(reversed(list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cluster_id))))

        if any(_is_step_running(step) for step in steps):
            num_running += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(cluster_summary, steps)
        time_to_end_of_hour = _est_time_to_hour(cluster_summary, now=now)
        is_pending = _cluster_has_pending_steps(steps)

        # pool membership is recorded in the bootstrap actions here
        bootstrap_actions = list(_boto3_paginate(
            'BootstrapActions', emr_client, 'list_bootstrap_actions',
            ClusterId=cluster_id))
        _, pool = _pool_hash_and_name(bootstrap_actions)

        if is_pending:
            num_pending += 1
        else:
            num_idle += 1

        log.debug(
            'cluster %s %s for %s, %s to end of hour, %s (%s)' %
            (cluster_id,
             'pending' if is_pending else 'idle',
             strip_microseconds(time_idle),
             strip_microseconds(time_to_end_of_hour),
             ('unpooled' if pool is None else 'in %s pool' % pool),
             cluster_summary['Name']))

        # filter out clusters that don't meet our criteria
        if (max_hours_idle is not None and
                time_idle <= timedelta(hours=max_hours_idle)):
            continue

        # mins_to_end_of_hour doesn't apply to jobs with pending steps
        if (mins_to_end_of_hour is not None and
                (is_pending or
                 time_to_end_of_hour >= timedelta(
                     minutes=mins_to_end_of_hour))):
            continue

        if (pooled_only and pool is None):
            continue

        if (unpooled_only and pool is not None):
            continue

        if (pool_name is not None and pool != pool_name):
            continue

        # terminate idle cluster
        _terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=cluster_summary['Name'],
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            time_to_end_of_hour=time_to_end_of_hour,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info(
        'Cluster statuses: %d starting, %d bootstrapping, %d running,'
        ' %d pending, %d idle, %d done' % (
            num_starting, num_bootstrapping, num_running,
            num_pending, num_idle, num_done))
def _maybe_terminate_clusters(dry_run=False,
                              max_mins_idle=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    """Scan all clusters, count them by state, and pass idle/pending ones
    that match our criteria to ``_terminate_and_notify()``.

    :param dry_run: passed through to ``_terminate_and_notify()``
    :param max_mins_idle: only act on clusters idle at least this long
    :param now: the current UTC time; defaults to the current time
    :param pool_name: only act on clusters in this pool
    :param pooled_only: only act on pooled clusters
    :param unpooled_only: only act on unpooled clusters
    :param max_mins_locked: passed through to ``_terminate_and_notify()``
    :param quiet: passed through to ``_terminate_and_notify()``
    :param kwargs: passed to :py:class:`EMRJobRunner`
    """
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_mins_idle is None:
        max_mins_idle = _DEFAULT_MAX_MINS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    # per-state tallies, reported at the end
    num_starting = 0
    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_pending = 0
    num_running = 0

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for cluster_summary in _boto3_paginate(
            'Clusters', emr_client, 'list_clusters'):

        cluster_id = cluster_summary['Id']

        # check if cluster is done
        if _is_cluster_done(cluster_summary):
            num_done += 1
            continue

        # check if cluster is starting
        if _is_cluster_starting(cluster_summary):
            num_starting += 1
            continue

        # check if cluster is bootstrapping
        if _is_cluster_bootstrapping(cluster_summary):
            num_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        steps = list(reversed(list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cluster_id))))

        if any(_is_step_running(step) for step in steps):
            num_running += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(cluster_summary, steps)
        is_pending = _cluster_has_pending_steps(steps)

        # need to get actual cluster to see tags
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']
        _, pool = _pool_hash_and_name(cluster)

        if is_pending:
            num_pending += 1
        else:
            num_idle += 1

        log.debug(
            'cluster %s %s for %s, %s (%s)' %
            (cluster_id,
             'pending' if is_pending else 'idle',
             strip_microseconds(time_idle),
             ('unpooled' if pool is None else 'in %s pool' % pool),
             cluster_summary['Name']))

        # filter out clusters that don't meet our criteria
        if (max_mins_idle is not None and
                time_idle <= timedelta(minutes=max_mins_idle)):
            continue

        if (pooled_only and pool is None):
            continue

        if (unpooled_only and pool is not None):
            continue

        if (pool_name is not None and pool != pool_name):
            continue

        # terminate idle cluster
        _terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=cluster_summary['Name'],
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info(
        'Cluster statuses: %d starting, %d bootstrapping, %d running,'
        ' %d pending, %d idle, %d done' % (
            num_starting, num_bootstrapping, num_running,
            num_pending, num_idle, num_done))
def _maybe_terminate_clusters(dry_run=False,
                              max_mins_idle=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    """Scan WAITING/RUNNING clusters, count them by state, and pass
    idle/pending, unprotected ones matching our criteria to
    ``_terminate_and_notify()``.

    :param dry_run: passed through to ``_terminate_and_notify()``
    :param max_mins_idle: only act on clusters idle at least this long
    :param now: the current UTC time; defaults to the current time
    :param pool_name: only act on clusters in this pool
    :param pooled_only: only act on pooled clusters
    :param unpooled_only: only act on unpooled clusters
    :param max_mins_locked: passed through to ``_terminate_and_notify()``
    :param quiet: passed through to ``_terminate_and_notify()``
    :param kwargs: passed to :py:class:`EMRJobRunner`
    """
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_mins_idle is None:
        max_mins_idle = _DEFAULT_MAX_MINS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    # per-state tallies, reported at the end
    num_starting = 0
    num_bootstrapping = 0
    num_done = 0
    num_idle = 0
    num_pending = 0
    num_running = 0

    # include RUNNING to catch clusters with PENDING jobs that
    # never ran (see #365).
    for cluster_summary in _boto3_paginate(
            'Clusters', emr_client, 'list_clusters',
            ClusterStates=['WAITING', 'RUNNING']):

        cluster_id = cluster_summary['Id']

        # check if cluster is done
        if _is_cluster_done(cluster_summary):
            num_done += 1
            continue

        # check if cluster is starting
        if _is_cluster_starting(cluster_summary):
            num_starting += 1
            continue

        # check if cluster is bootstrapping
        if _is_cluster_bootstrapping(cluster_summary):
            num_bootstrapping += 1
            continue

        # need steps to learn more about cluster
        steps = list(reversed(list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cluster_id))))

        if any(_is_step_running(step) for step in steps):
            num_running += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(cluster_summary, steps)
        is_pending = _cluster_has_pending_steps(steps)

        # need to get actual cluster to see tags
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']
        _, pool = _pool_hash_and_name(cluster)

        if is_pending:
            num_pending += 1
        else:
            num_idle += 1

        log.debug('cluster %s %s for %s, %s (%s) - %s' % (
            cluster_id,
            'pending' if is_pending else 'idle',
            strip_microseconds(time_idle),
            ('unpooled' if pool is None else 'in %s pool' % pool),
            cluster_summary['Name'],
            'protected' if cluster['TerminationProtected'] else 'unprotected',
        ))

        # filter out clusters that don't meet our criteria
        if (max_mins_idle is not None and
                time_idle <= timedelta(minutes=max_mins_idle)):
            continue

        if (pooled_only and pool is None):
            continue

        if (unpooled_only and pool is not None):
            continue

        if (pool_name is not None and pool != pool_name):
            continue

        # never terminate clusters with termination protection on
        if cluster['TerminationProtected']:
            continue

        # terminate idle cluster
        _terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=cluster_summary['Name'],
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info(
        'Cluster statuses: %d starting, %d bootstrapping, %d running,'
        ' %d pending, %d idle, %d done' % (
            num_starting, num_bootstrapping, num_running,
            num_pending, num_idle, num_done))
def _attempt_to_lock_cluster(emr_client, cluster_id, job_key,
                             cluster=None, when_cluster_described=None):
    """Attempt to lock the given pooled cluster using EMR tags; return
    ``True`` on success, ``False`` otherwise.

    You may optionally include *cluster* (a cluster description) and
    *when_cluster_described*, to save an API call to ``DescribeCluster``

    If the cluster's StepConcurrencyLevel is 1, locking considers the
    cluster available if it's in the WAITING state. this means we should
    not release our lock until our step(s) have started running, which can
    take several seconds.

    Otherwise, steps can run concurrently, so locking considers the
    cluster available if it's in the WAITING or RUNNING state.
    Additionally, it makes a ``ListSteps`` API call to verify that the
    cluster doesn't already have as many active steps as it can run
    simultaneously. Because other jobs looking to join the cluster will
    also count steps, we can release our lock as soon as we add our steps.
    """
    log.debug('Attempting to lock cluster %s for %.1f seconds' % (
        cluster_id, _CLUSTER_LOCK_SECS))

    if cluster is None:
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']

    # *start* anchors all lock-expiry math below
    if when_cluster_described is None:
        start = time.time()
    else:
        start = when_cluster_described

    if cluster['StepConcurrencyLevel'] == 1:
        step_accepting_states = ['WAITING']
    else:
        step_accepting_states = ['RUNNING', 'WAITING']

    # check if there is a non-expired lock
    state = cluster['Status']['State']
    if state not in step_accepting_states:
        # this could happen if the cluster were TERMINATING, for example
        log.info(' cluster is not accepting steps, state is %s' % state)
        return False

    lock = _get_cluster_lock(cluster)

    if lock:
        expiry = None

        try:
            their_job_key, expiry = _parse_cluster_lock(lock)
        except ValueError:
            log.info(' ignoring invalid pool lock: %s' % lock)

        if expiry and expiry > start:
            log.info(' locked by %s for %.1f seconds' % (
                their_job_key, expiry - start))
            return False

    # add our lock
    our_lock = _make_cluster_lock(job_key, start + _CLUSTER_LOCK_SECS)
    log.debug(' adding tag to cluster %s:' % cluster_id)
    log.debug(' %s=%s' % (_POOL_LOCK_KEY, our_lock))
    emr_client.add_tags(
        ResourceId=cluster_id,
        Tags=[dict(Key=_POOL_LOCK_KEY, Value=our_lock)])

    if time.time() - start > _ADD_TAG_BEFORE:
        log.info(' took too long to tag cluster with lock')
        return False

    # wait, then check if our lock is still there
    log.info(" waiting %.1f seconds to ensure lock wasn't overwritten" %
             _WAIT_AFTER_ADD_TAG)
    time.sleep(_WAIT_AFTER_ADD_TAG)

    # check if our lock is still there
    cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']

    state = cluster['Status']['State']
    if state not in step_accepting_states:
        # this could happen if the cluster were TERMINATING, for example
        log.info(' cluster is not accepting steps, state is %s' % state)
        return False

    if cluster['StepConcurrencyLevel'] > 1:
        # is cluster already full of steps?
        num_active_steps = len(list(_boto3_paginate(
            'Steps', emr_client, 'list_steps',
            ClusterId=cluster_id,
            StepStates=['PENDING', 'RUNNING'])))

        if num_active_steps >= cluster['StepConcurrencyLevel']:
            log.info(' cluster already has %d active steps' %
                     num_active_steps)
            # BUGFIX: was a bare ``return`` (i.e. None); every other
            # failure path returns an explicit False
            return False

    lock = _get_cluster_lock(cluster)

    if lock is None:
        log.info(' lock was removed')
        return False
    elif lock != our_lock:
        their_job_desc = 'other job'

        try:
            their_job_desc, expiry = _parse_cluster_lock(lock)
        except ValueError:
            pass

        log.info(' lock was overwritten by %s' % their_job_desc)
        return False

    # make sure we have enough time to add steps and have them run
    # before the lock expires
    if time.time() > start + _CHECK_TAG_BEFORE:
        log.info(' took too long to check for lock')
        return False

    log.info(' lock acquired')
    return True