def test_now_is_automatically_set(self):
    """``_est_time_to_hour()`` falls back to the current time without *now*."""
    just_created = {
        'Status': {
            'Timeline': {'CreationDateTime': _boto3_now()},
        },
    }

    remaining = _est_time_to_hour(just_created)

    # a cluster created an instant ago has (almost) a full hour left
    self.assertLessEqual(remaining, timedelta(minutes=60))
    self.assertGreater(remaining, timedelta(minutes=59))
def main(cl_args=None):
    """Parse command-line options and terminate idle clusters.

    :param cl_args: command-line arguments, passed straight to the
                    argument parser's ``parse_args()``.
    """
    options = _make_arg_parser().parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # --max-mins-idle wins; otherwise fall back to the deprecated
    # --max-hours-idle, converted to minutes
    max_mins_idle = options.max_mins_idle
    if max_mins_idle is None:
        if options.max_hours_idle is not None:
            log.warning(
                '--max-hours-idle is deprecated and will be removed in'
                ' v0.7.0. Please use --max-mins-idle instead.')
            max_mins_idle = options.max_hours_idle * 60

    if options.mins_to_end_of_hour is not None:
        log.warning(
            '--mins-to-end-of-hour is deprecated as of v0.6.0 and does'
            ' nothing')

    _maybe_terminate_clusters(
        dry_run=options.dry_run,
        max_mins_idle=max_mins_idle,
        unpooled_only=options.unpooled_only,
        now=_boto3_now(),
        pool_name=options.pool_name,
        pooled_only=options.pooled_only,
        max_mins_locked=options.max_mins_locked,
        quiet=options.quiet,
        **_runner_kwargs(options)
    )
def main(args=None):
    """Report jobs that have been running or pending too long.

    :param args: command-line arguments, passed straight to the argument
                 parser's ``parse_args()``.
    """
    # capture the current time before doing anything else
    now = _boto3_now()

    options = _make_arg_parser().parse_args(args)
    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    log.info('getting information about running jobs')

    min_time = timedelta(hours=options.min_hours)

    runner = EMRJobRunner(**_runner_kwargs(options))
    emr_client = runner.make_emr_client()

    # only clusters that are still active can have long-running jobs
    clusters_to_check = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters',
        ClusterStates=['STARTING', 'BOOTSTRAPPING', 'RUNNING'])

    if options.exclude:
        clusters_to_check = _filter_clusters(
            clusters_to_check, emr_client, options.exclude)

    _print_report(_find_long_running_jobs(
        emr_client, clusters_to_check, min_time, now=now))
def main(cl_args=None):
    """Parse command-line options and terminate idle clusters.

    :param cl_args: command-line arguments, passed straight to the
                    argument parser's ``parse_args()``.
    """
    options = _make_arg_parser().parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # --max-mins-idle wins; otherwise fall back to the deprecated
    # --max-hours-idle, converted to minutes
    max_mins_idle = options.max_mins_idle
    if max_mins_idle is None:
        if options.max_hours_idle is not None:
            log.warning(
                '--max-hours-idle is deprecated and will be removed in'
                ' v0.7.0. Please use --max-mins-idle instead.')
            max_mins_idle = options.max_hours_idle * 60

    if options.mins_to_end_of_hour is not None:
        log.warning(
            '--mins-to-end-of-hour is deprecated as of v0.6.0 and does'
            ' nothing')

    _maybe_terminate_clusters(
        dry_run=options.dry_run,
        max_mins_idle=max_mins_idle,
        unpooled_only=options.unpooled_only,
        now=_boto3_now(),
        pool_name=options.pool_name,
        pooled_only=options.pooled_only,
        max_mins_locked=options.max_mins_locked,
        quiet=options.quiet,
        **_runner_kwargs(options)
    )
def main(args=None):
    """Report jobs that have been running or pending too long.

    :param args: command-line arguments, passed straight to the argument
                 parser's ``parse_args()``.
    """
    # capture the current time before doing anything else
    now = _boto3_now()

    options = _make_arg_parser().parse_args(args)
    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    log.info('getting information about running jobs')

    min_time = timedelta(hours=options.min_hours)

    runner = EMRJobRunner(**_runner_kwargs(options))
    emr_client = runner.make_emr_client()

    # only clusters that are still active can have long-running jobs
    clusters_to_check = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters',
        ClusterStates=['STARTING', 'BOOTSTRAPPING', 'RUNNING'])

    if options.exclude:
        clusters_to_check = _filter_clusters(
            clusters_to_check, emr_client, options.exclude)

    _print_report(_find_long_running_jobs(
        emr_client, clusters_to_check, min_time, now=now))
def upload_file(self, path, Config=None):
    """Mock upload: store the contents of the file at *path* under this key.

    :param path: local path of the file to read (opened in binary mode)
    :param Config: accepted for signature compatibility; not used here

    Raises ``S3UploadFailedError`` if this object's bucket is not in the
    client's mock S3 filesystem.
    """
    known_buckets = self.meta.client.mock_s3_fs

    if self.bucket_name not in known_buckets:
        # upload_file() is a higher-order operation, so the underlying
        # "no such bucket" error is wrapped in its own message
        reason = str(_no_such_bucket_error('PutObject'))
        raise S3UploadFailedError(
            'Failed to upload %s to %s/%s: %s' % (
                path, self.bucket_name, self.key, reason))

    bucket_keys = self._mock_bucket_keys('PutObject')

    with open(path, 'rb') as src:
        contents = src.read()
        bucket_keys[self.key] = (contents, _boto3_now())
def upload_file(self, path, Config=None):
    """Mock upload: store the contents of the file at *path* under this key.

    :param path: local path of the file to read (opened in binary mode)
    :param Config: accepted for signature compatibility; not used here

    Raises ``S3UploadFailedError`` if this object's bucket is not in the
    client's mock S3 filesystem.
    """
    known_buckets = self.meta.client.mock_s3_fs

    if self.bucket_name not in known_buckets:
        # upload_file() is a higher-order operation, so the underlying
        # "no such bucket" error is wrapped in its own message
        reason = str(_no_such_bucket_error('PutObject'))
        raise S3UploadFailedError(
            'Failed to upload %s to %s/%s: %s' % (
                path, self.bucket_name, self.key, reason))

    bucket_keys = self._mock_bucket_keys('PutObject')

    with open(path, 'rb') as src:
        contents = src.read()
        bucket_keys[self.key] = (contents, _boto3_now())
def _yield_clusters(max_days_ago=None, now=None, **runner_kwargs):
    """Yield a description of each relevant EMR cluster.

    Each yielded cluster is the ``Cluster`` dict from ``describe_cluster``,
    with two keys added: ``Steps`` (the ``list_steps`` results, in reversed
    order) and ``BootstrapActions``.

    :param float max_days_ago: If set, don't fetch clusters created longer
                               than this many days ago.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    :param runner_kwargs: keyword args to pass through to
                          :py:class:`~mrjob.emr.EMRJobRunner`
    """
    if now is None:
        now = _boto3_now()

    emr_client = EMRJobRunner(**runner_kwargs).make_emr_client()

    # pass _DELAY along so there is a pause after each API call (see
    # #1091). Could implement some sort of connection wrapper for this
    # if it becomes more generally useful.
    paginate_kwargs = {'_delay': _DELAY}

    # if --max-days-ago is set, only look at recent jobs
    if max_days_ago is not None:
        paginate_kwargs['CreatedAfter'] = now - timedelta(days=max_days_ago)

    summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters', **paginate_kwargs)

    for summary in summaries:
        cluster_id = summary['Id']

        response = emr_client.describe_cluster(ClusterId=cluster_id)
        cluster = response['Cluster']
        sleep(_DELAY)

        steps = list(_boto3_paginate(
            'Steps', emr_client, 'list_steps',
            ClusterId=cluster_id, _delay=_DELAY))
        steps.reverse()
        cluster['Steps'] = steps

        cluster['BootstrapActions'] = list(_boto3_paginate(
            'BootstrapActions', emr_client, 'list_bootstrap_actions',
            ClusterId=cluster_id, _delay=_DELAY))

        yield cluster
def put(self, Body):
    """Mock ``put()``: store *Body* as this key's contents.

    The key is recorded in the mock bucket as a ``(data, time_modified)``
    tuple, with the current time as the modification time.

    :param Body: the object's contents; must be ``bytes``. File-like
                 bodies are not supported by this mock.

    Raises ``NotImplementedError`` if *Body* is not ``bytes``.
    """
    # validate first, so bad input never touches the mock bucket
    if not isinstance(Body, bytes):
        raise NotImplementedError('mock put() only support bytes')

    mock_keys = self._mock_bucket_keys('PutObject')

    # Body is known to be bytes here; the old file-like (.read()) branch
    # and its follow-up type check could never run, so they are gone
    mock_keys[self.key] = (Body, _boto3_now())
def put(self, Body):
    """Mock ``put()``: store *Body* as this key's contents.

    The key is recorded in the mock bucket as a ``(data, time_modified)``
    tuple, with the current time as the modification time.

    :param Body: the object's contents; must be ``bytes``. File-like
                 bodies are not supported by this mock.

    Raises ``NotImplementedError`` if *Body* is not ``bytes``.
    """
    # validate first, so bad input never touches the mock bucket
    if not isinstance(Body, bytes):
        raise NotImplementedError('mock put() only support bytes')

    mock_keys = self._mock_bucket_keys('PutObject')

    # Body is known to be bytes here; the old file-like (.read()) branch
    # and its follow-up type check could never run, so they are gone
    mock_keys[self.key] = (Body, _boto3_now())
def main(cl_args=None):
    """Parse command-line options and terminate idle clusters.

    :param cl_args: command-line arguments, passed straight to the
                    argument parser's ``parse_args()``.
    """
    options = _make_arg_parser().parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # every option is handed through unchanged; "now" is the current time
    _maybe_terminate_clusters(
        dry_run=options.dry_run,
        max_hours_idle=options.max_hours_idle,
        mins_to_end_of_hour=options.mins_to_end_of_hour,
        unpooled_only=options.unpooled_only,
        now=_boto3_now(),
        pool_name=options.pool_name,
        pooled_only=options.pooled_only,
        max_mins_locked=options.max_mins_locked,
        quiet=options.quiet,
        **_runner_kwargs(options)
    )
def create_role(self, AssumeRolePolicyDocument, RoleName):
    """Mock ``CreateRole``: register a new role and return it.

    :param AssumeRolePolicyDocument: policy document as a JSON string;
                                     stored in parsed form
    :param RoleName: name of the role to create

    Returns ``{'Role': role}``. The ``Path`` parameter is not supported,
    and every mock role shares the same ``RoleId``.
    """
    self._check_role_does_not_exist(RoleName, 'CreateRole')

    new_role = {
        'Arn': 'arn:aws:iam::012345678901:role/%s' % RoleName,
        'AssumeRolePolicyDocument': json.loads(AssumeRolePolicyDocument),
        'CreateDate': _boto3_now(),
        'Path': '/',
        'RoleId': 'AROAMOCKMOCKMOCKMOCK',
        'RoleName': RoleName,
    }

    self.mock_iam_roles[RoleName] = new_role

    return {'Role': new_role}
def main(args=None):
    """Fetch cluster history, compile usage stats, and print a report.

    :param args: command-line arguments, passed straight to the argument
                 parser's ``parse_args()``.
    """
    # parse command-line args
    options = _make_arg_parser().parse_args(args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # one timestamp, shared by the fetch, the stats and the report
    now = _boto3_now()

    log.info('getting cluster history...')
    cluster_stream = _yield_clusters(
        max_days_ago=options.max_days_ago,
        now=now,
        **_runner_kwargs(options))
    clusters = list(cluster_stream)

    log.info('compiling cluster stats...')
    _print_report(_clusters_to_stats(clusters, now=now), now=now)
def _s3_cleanup(glob_path, time_old, dry_run=False, **runner_kwargs):
    """Delete every file matching *glob_path* that is older than *time_old*.

    :param glob_path: path/glob handed to the runner filesystem's ``_ls()``
    :param time_old: age threshold; files whose age exceeds it are deleted
    :param dry_run: if true, only log which files would be deleted
    :param runner_kwargs: keyword args to pass through to ``EMRJobRunner``
    """
    runner = EMRJobRunner(**runner_kwargs)

    log.info('Deleting all files in %s that are older than %s' %
             (glob_path, time_old))

    for path, key in runner.fs._ls(glob_path):
        age = _boto3_now() - key.last_modified

        # recent enough to keep
        if not age > time_old:
            continue

        log.info('Deleting %s; is %s old' % (path, age))

        if dry_run:
            continue

        key.delete()
def _s3_cleanup(glob_path, time_old, dry_run=False, **runner_kwargs):
    """Delete every file matching *glob_path* that is older than *time_old*.

    :param glob_path: path/glob handed to the S3 filesystem's ``_ls()``
    :param time_old: age threshold; files whose age exceeds it are deleted
    :param dry_run: if true, only log which files would be deleted
    :param runner_kwargs: keyword args to pass through to ``EMRJobRunner``
    """
    runner = EMRJobRunner(**runner_kwargs)

    log.info('Deleting all files in %s that are older than %s' %
             (glob_path, time_old))

    for path, key in runner.fs.s3._ls(glob_path):
        age = _boto3_now() - key.last_modified

        # recent enough to keep
        if not age > time_old:
            continue

        log.info('Deleting %s; is %s old' % (path, age))

        if dry_run:
            continue

        key.delete()
def _yield_clusters(max_days_ago=None, now=None, **runner_kwargs):
    """Yield a description of each relevant EMR cluster.

    Each yielded cluster is the ``Cluster`` dict from ``describe_cluster``,
    with two keys added: ``Steps`` (the ``list_steps`` results, in reversed
    order) and ``BootstrapActions``.

    :param float max_days_ago: If set, don't fetch clusters created longer
                               than this many days ago.
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    :param runner_kwargs: keyword args to pass through to
                          :py:class:`~mrjob.emr.EMRJobRunner`
    """
    if now is None:
        now = _boto3_now()

    emr_client = EMRJobRunner(**runner_kwargs).make_emr_client()

    # pass _DELAY along so there is a pause after each API call (see
    # #1091). Could implement some sort of connection wrapper for this
    # if it becomes more generally useful.
    paginate_kwargs = {'_delay': _DELAY}

    # if --max-days-ago is set, only look at recent jobs
    if max_days_ago is not None:
        paginate_kwargs['CreatedAfter'] = now - timedelta(days=max_days_ago)

    summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters', **paginate_kwargs)

    for summary in summaries:
        cluster_id = summary['Id']

        response = emr_client.describe_cluster(ClusterId=cluster_id)
        cluster = response['Cluster']
        sleep(_DELAY)

        steps = list(_boto3_paginate(
            'Steps', emr_client, 'list_steps',
            ClusterId=cluster_id, _delay=_DELAY))
        steps.reverse()
        cluster['Steps'] = steps

        cluster['BootstrapActions'] = list(_boto3_paginate(
            'BootstrapActions', emr_client, 'list_bootstrap_actions',
            ClusterId=cluster_id, _delay=_DELAY))

        yield cluster
def main(args=None):
    """Fetch cluster history, compile usage stats, and print a report.

    :param args: command-line arguments, passed straight to the argument
                 parser's ``parse_args()``.
    """
    # parse command-line args
    options = _make_arg_parser().parse_args(args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # one timestamp, shared by the fetch, the stats and the report
    now = _boto3_now()

    log.info('getting cluster history...')
    cluster_stream = _yield_clusters(
        max_days_ago=options.max_days_ago,
        now=now,
        **_runner_kwargs(options))
    clusters = list(cluster_stream)

    log.info('compiling cluster stats...')
    _print_report(_clusters_to_stats(clusters, now=now), now=now)
def create_instance_profile(self, InstanceProfileName):
    """Mock ``CreateInstanceProfile``: register a new, role-less profile.

    :param InstanceProfileName: name of the instance profile to create

    Returns ``{'InstanceProfile': profile}``. The ``Path`` parameter is
    not implemented, and every mock profile shares the same
    ``InstanceProfileId``.
    """
    self._check_instance_profile_does_not_exist(
        InstanceProfileName, 'CreateInstanceProfile')

    arn = ('arn:aws:iam::012345678901:instance-profile/%s' %
           InstanceProfileName)

    new_profile = {
        'Arn': arn,
        'CreateDate': _boto3_now(),
        'InstanceProfileId': 'AIPAMOCKMOCKMOCKMOCK',
        'InstanceProfileName': InstanceProfileName,
        'Path': '/',
        'Roles': [],
    }

    self.mock_iam_instance_profiles[InstanceProfileName] = new_profile

    return {'InstanceProfile': new_profile}
def _est_time_to_hour(cluster_summary, now=None):
    """Return the time left until the cluster completes its current hour.

    Hours are counted from the cluster's creation time, which matters for
    billing. A cluster that has run an exact whole number of hours gets a
    full hour, not zero.

    :param cluster_summary: cluster summary dict; the creation time is read
                            from ``Status -> Timeline -> CreationDateTime``
    :param now: the current time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a :py:class:`datetime.timedelta` (one hour if the creation
    time is unavailable).
    """
    if now is None:
        now = _boto3_now()

    status = cluster_summary.get('Status', {})
    created = status.get('Timeline', {}).get('CreationDateTime')

    if not created:
        # do something reasonable if creation time isn't set
        return timedelta(minutes=60)

    elapsed = now - created

    # seconds of the negated run time, modulo one hour, is the number of
    # whole seconds left before the next hour boundary
    seconds_left = (-elapsed).seconds % 3600.0
    if not seconds_left:
        seconds_left = 3600.0

    return timedelta(seconds=seconds_left)
def add_mock_s3_data(mock_s3_fs, data, age=None, location=None):
    """Merge *data* into *mock_s3_fs*.

    *data* maps bucket name to a map from key name to ``bytes``. Buckets
    are created as needed; each key is stored as a
    ``(data, time_modified)`` tuple.

    :param age: a timedelta; keys are recorded as modified this long ago
                (defaults to no age at all)
    :param location string: the bucket's location constraint (a region
                            name); applied to every bucket in *data*

    Raises ``TypeError`` if any key's data is not ``bytes``.
    """
    if not age:
        age = timedelta(0)

    time_modified = _boto3_now() - age

    for bucket_name, key_name_to_bytes in data.items():
        empty_bucket = {'keys': {}, 'location': ''}
        bucket = mock_s3_fs.setdefault(bucket_name, empty_bucket)

        for key_name, key_data in key_name_to_bytes.items():
            if isinstance(key_data, bytes):
                bucket['keys'][key_name] = (key_data, time_modified)
            else:
                raise TypeError('mock s3 data must be bytes')

        if location is not None:
            bucket['location'] = location
def add_mock_s3_data(mock_s3_fs, data, age=None, location=None):
    """Merge *data* into *mock_s3_fs*.

    *data* maps bucket name to a map from key name to ``bytes``. Buckets
    are created as needed (stamped with today's date as their creation
    date); each key is stored as a ``(data, time_modified)`` tuple.

    :param age: a timedelta; keys are recorded as modified this long ago
                (defaults to no age at all)
    :param location string: the bucket's location constraint (a region
                            name); applied to every bucket in *data*

    Raises ``TypeError`` if any key's data is not ``bytes``.
    """
    if not age:
        age = timedelta(0)

    time_modified = _boto3_now() - age

    for bucket_name, key_name_to_bytes in data.items():
        empty_bucket = dict(
            creation_date=_boto3_today(), keys={}, location='')
        bucket = mock_s3_fs.setdefault(bucket_name, empty_bucket)

        for key_name, key_data in key_name_to_bytes.items():
            if isinstance(key_data, bytes):
                bucket['keys'][key_name] = (key_data, time_modified)
            else:
                raise TypeError('mock s3 data must be bytes')

        if location is not None:
            bucket['location'] = location
def upload_file(self, path, Config=None):
    """Mock upload: store the contents of the file at *path* under this key.

    The key is recorded as a dict with ``body`` and ``time_modified``.

    :param path: local path of the file to read (opened in binary mode)
    :param Config: optional transfer config; if given, both its
                   ``multipart_chunksize`` and ``multipart_threshold``
                   must be truthy

    Raises ``S3UploadFailedError`` if this object's bucket is not in the
    client's mock S3 filesystem, and ``TypeError`` if *Config* has an
    empty part size.
    """
    known_buckets = self.meta.client.mock_s3_fs

    if self.bucket_name not in known_buckets:
        # upload_file() is a higher-order operation, so the underlying
        # "no such bucket" error is wrapped in its own message
        reason = str(_no_such_bucket_error('PutObject'))
        raise S3UploadFailedError(
            'Failed to upload %s to %s/%s: %s' % (
                path, self.bucket_name, self.key, reason))

    # verify that config doesn't have empty part size (see #2033)
    #
    # config is a boto3.s3.transfer.TransferConfig (we don't mock it),
    # which is actually part of s3transfer. Very old versions of
    # s3transfer (e.g. 0.10.0) disallow initializing TransferConfig with
    # part sizes that are zero or None
    if Config:
        sizes_set = (Config.multipart_chunksize and
                     Config.multipart_threshold)
        if not sizes_set:
            raise TypeError('part size may not be 0 or None')

    bucket_keys = self._mock_bucket_keys('PutObject')

    with open(path, 'rb') as src:
        contents = src.read()
        bucket_keys[self.key] = dict(
            body=contents, time_modified=_boto3_now())
def upload_file(self, path, Config=None):
    """Mock upload: store the contents of the file at *path* under this key.

    The key is recorded as a dict with ``body`` and ``time_modified``.

    :param path: local path of the file to read (opened in binary mode)
    :param Config: optional transfer config; if given, both its
                   ``multipart_chunksize`` and ``multipart_threshold``
                   must be truthy

    Raises ``S3UploadFailedError`` if this object's bucket is not in the
    client's mock S3 filesystem, and ``TypeError`` if *Config* has an
    empty part size.
    """
    known_buckets = self.meta.client.mock_s3_fs

    if self.bucket_name not in known_buckets:
        # upload_file() is a higher-order operation, so the underlying
        # "no such bucket" error is wrapped in its own message
        reason = str(_no_such_bucket_error('PutObject'))
        raise S3UploadFailedError(
            'Failed to upload %s to %s/%s: %s' % (
                path, self.bucket_name, self.key, reason))

    # verify that config doesn't have empty part size (see #2033)
    #
    # config is a boto3.s3.transfer.TransferConfig (we don't mock it),
    # which is actually part of s3transfer. Very old versions of
    # s3transfer (e.g. 0.10.0) disallow initializing TransferConfig with
    # part sizes that are zero or None
    if Config:
        sizes_set = (Config.multipart_chunksize and
                     Config.multipart_threshold)
        if not sizes_set:
            raise TypeError('part size may not be 0 or None')

    bucket_keys = self._mock_bucket_keys('PutObject')

    with open(path, 'rb') as src:
        contents = src.read()
        bucket_keys[self.key] = dict(
            body=contents, time_modified=_boto3_now())
def add_mock_s3_data(mock_s3_fs, data, age=None, location=None,
                     storage_class=None, restore=None):
    """Merge *data* into *mock_s3_fs*.

    *data* maps bucket name to a map from key name to ``bytes``. Buckets
    are created as needed (stamped with today's date as their creation
    date); each key is stored as a dict with ``body`` and
    ``time_modified``, plus ``storage_class`` and ``restore`` when given.

    :param age: a timedelta; keys are recorded as modified this long ago
                (defaults to no age at all)
    :param location string: the bucket's location constraint (a region
                            name); applied to every bucket in *data*
    :param storage_class string: storage class for all data added
    :param restore: x-amz-restore header (see
                    https://docs.aws.amazon.com/AmazonS3/latest/API/\
RESTObjectHEAD.html#RESTObjectHEAD-responses)

    Raises ``TypeError`` if any key's data is not ``bytes``.
    """
    if not age:
        age = timedelta(0)

    time_modified = _boto3_now() - age

    for bucket_name, key_name_to_bytes in data.items():
        empty_bucket = dict(
            creation_date=_boto3_today(), keys={}, location='')
        bucket = mock_s3_fs.setdefault(bucket_name, empty_bucket)

        for key_name, key_data in key_name_to_bytes.items():
            if not isinstance(key_data, bytes):
                raise TypeError('mock s3 data must be bytes')

            entry = {'body': key_data, 'time_modified': time_modified}

            # optional attributes are only stored when they were supplied
            if storage_class:
                entry['storage_class'] = storage_class
            if restore:
                entry['restore'] = restore

            bucket['keys'][key_name] = entry

        if location is not None:
            bucket['location'] = location
def add_mock_s3_data(mock_s3_fs, data, age=None, location=None,
                     storage_class=None, restore=None):
    """Merge *data* into *mock_s3_fs*.

    *data* maps bucket name to a map from key name to ``bytes``. Buckets
    are created as needed (stamped with today's date as their creation
    date); each key is stored as a dict with ``body`` and
    ``time_modified``, plus ``storage_class`` and ``restore`` when given.

    :param age: a timedelta; keys are recorded as modified this long ago
                (defaults to no age at all)
    :param location string: the bucket's location constraint (a region
                            name); applied to every bucket in *data*
    :param storage_class string: storage class for all data added
    :param restore: x-amz-restore header (see
                    https://docs.aws.amazon.com/AmazonS3/latest/API/\
RESTObjectHEAD.html#RESTObjectHEAD-responses)

    Raises ``TypeError`` if any key's data is not ``bytes``.
    """
    if not age:
        age = timedelta(0)

    time_modified = _boto3_now() - age

    for bucket_name, key_name_to_bytes in data.items():
        empty_bucket = dict(
            creation_date=_boto3_today(), keys={}, location='')
        bucket = mock_s3_fs.setdefault(bucket_name, empty_bucket)

        for key_name, key_data in key_name_to_bytes.items():
            if not isinstance(key_data, bytes):
                raise TypeError('mock s3 data must be bytes')

            entry = {'body': key_data, 'time_modified': time_modified}

            # optional attributes are only stored when they were supplied
            if storage_class:
                entry['storage_class'] = storage_class
            if restore:
                entry['restore'] = restore

            bucket['keys'][key_name] = entry

        if location is not None:
            bucket['location'] = location
def add_mock_ec2_image(self, image):
    """Register a mock EC2 Image (AMI), to be returned by mock
    :py:meth:`~tests.mock_boto3.ec2.MockEC2Client.describe_images`.

    A copy of *image* is stored, so the caller's dict is never modified.
    ``CreationDate`` is filled in with the current time if missing or
    empty. Other fields you might want to set include:

    * ``Architecture`` (e.g. ``'i386'``, ``'x86_64'``)
    * ``BlockDeviceMappings`` (e.g. ``[{'DeviceName': '/dev/sda1'}]``)
    * ``ImageOwnerAlias`` (e.g. ``'amazon'``, ``'aws-marketplace'``)
    * ``Name`` (e.g. ``amzn-ami-hvm-2017.09.1.20171120-x86_64-s3``)
    * ``RootDeviceType`` (e.g. ``'ebs'``, ``'instance-store'``)
    * ``VirtualizationType`` (e.g. ``'hvm'``, ``'paravirtual'``)
    """
    # TODO: will eventually need to add a mock user ID to support
    # filtering by owner == 'self'
    mock_image = dict(image)

    if not mock_image.get('CreationDate'):
        timestamp = _boto3_now().strftime('%Y-%m-%dT%H:%M:%S.%f')
        # drop the last three digits (microseconds -> milliseconds)
        # and append 'Z'
        mock_image['CreationDate'] = timestamp[:-3] + 'Z'

    self.mock_ec2_images.append(mock_image)
def _cluster_to_basic_summary(cluster, now=None):
    """Boil a cluster down to a flat dictionary of basic facts.

    :param cluster: a :py:mod:`boto3` cluster data structure, which must
                    also have ``Steps`` and (for legacy pool detection)
                    ``BootstrapActions`` filled in
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a dictionary with the following keys:

    * *id*: cluster ID
    * *name*: cluster name
    * *created*: UTC `datetime.datetime` that the cluster was created,
      or ``None``
    * *ready*: UTC `datetime.datetime` that the cluster finished
      bootstrapping, or ``None``
    * *end*: UTC `datetime.datetime` that the cluster finished, or ``None``
    * *ran*: How long the cluster ran, or has been running, as a
      :py:class:`datetime.timedelta`. This is ``timedelta(0)`` if the
      cluster has no creation time.
    * *state*: The cluster's state as a string (e.g. ``'RUNNING'``), or
      ``None``
    * *num_steps*: Number of steps in the cluster.
    * *pool*: pool name (e.g. ``'default'``) if the cluster is pooled,
      otherwise ``None``.
    * *label*: The label parsed from the cluster's name (usually the module
      name of the :py:class:`~mrjob.job.MRJob` script that started it), or
      ``None`` if the name doesn't look like an :py:mod:`mrjob` job key.
    * *owner*: The owner parsed from the cluster's name (usually the user
      that started it), or ``None`` likewise.
    * *nih*: the cluster's ``NormalizedInstanceHours`` as a float (``0.0``
      if unavailable)
    """
    if now is None:
        now = _boto3_now()

    summary = {}

    summary['id'] = cluster['Id']
    summary['name'] = cluster['Name']

    status = cluster['Status']
    timeline = status.get('Timeline', {})

    for field, timeline_key in (('created', 'CreationDateTime'),
                                ('ready', 'ReadyDateTime'),
                                ('end', 'EndDateTime')):
        summary[field] = timeline.get(timeline_key)

    if not summary['created']:
        summary['ran'] = timedelta(0)
    else:
        # still-running clusters are measured up to *now*
        stopped = summary['end'] or now
        summary['ran'] = stopped - summary['created']

    summary['state'] = status.get('State')

    summary['num_steps'] = len(cluster['Steps'])

    # prefer the current way of marking pooled clusters; fall back to
    # looking at bootstrap actions
    _, pool_name = _pool_hash_and_name(cluster)
    summary['pool'] = pool_name
    if not summary['pool']:
        _, legacy_pool_name = _legacy_pool_hash_and_name(
            cluster['BootstrapActions'])
        summary['pool'] = legacy_pool_name

    job_key_match = _JOB_KEY_RE.match(summary['name'] or '')
    if job_key_match:
        summary['label'] = job_key_match.group(1)
        summary['owner'] = job_key_match.group(2)
    else:
        summary['label'] = None
        summary['owner'] = None

    summary['nih'] = float(cluster.get('NormalizedInstanceHours', 0))

    return summary
def _find_long_running_jobs(emr_client, cluster_summaries, min_time, now=None):
    """Identify jobs that have been running or pending for a long time.

    :param emr_client: a :py:mod:`boto3` EMR client, used to list the steps
                       of each ``RUNNING`` cluster
    :param cluster_summaries: a list of :py:mod:`boto3` cluster summary
                              data structures
    :param min_time: a :py:class:`datetime.timedelta`: report jobs running
                     or pending longer than this
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    For each job that is running or pending longer than *min_time*, yields
    a dictionary with the following keys:

    * *cluster_id*: the cluster's unique ID (e.g. ``j-SOMECLUSTER``)
    * *name*: name of the step, or the cluster when bootstrapping
    * *state*: state of the step (``'RUNNING'`` or ``'PENDING'``) or, if
               there is no step, the cluster (``'STARTING'`` or
               ``'BOOTSTRAPPING'``)
    * *time*: amount of time step was running or pending, as a
              :py:class:`datetime.timedelta`

    Clusters in any other state are ignored.
    """
    if now is None:
        now = _boto3_now()

    for cs in cluster_summaries:
        cluster_state = cs['Status']['State']

        # special case for jobs that are taking a long time to bootstrap
        if cluster_state in ('STARTING', 'BOOTSTRAPPING'):
            # there isn't a way to tell when the cluster stopped being
            # provisioned and started bootstrapping, so just measure
            # from cluster creation time
            created = cs['Status']['Timeline']['CreationDateTime']

            time_running = now - created

            if time_running >= min_time:
                yield {'cluster_id': cs['Id'],
                       'name': cs['Name'],
                       'state': cluster_state,
                       'time': time_running}

        # the default case: running clusters
        if cluster_state != 'RUNNING':
            continue

        # walk the steps in the reverse of the order the API lists them
        steps = list(reversed(list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cs['Id']))))

        running_steps = [
            step for step in steps if step['Status']['State'] == 'RUNNING']
        pending_steps = [
            step for step in steps if step['Status']['State'] == 'PENDING']

        if running_steps:
            # should be only one, but if not, we should know about it
            for step in running_steps:

                start = step['Status']['Timeline']['StartDateTime']

                time_running = now - start

                if time_running >= min_time:
                    yield {'cluster_id': cs['Id'],
                           'name': step['Name'],
                           'state': step['Status']['State'],
                           'time': time_running}

        # sometimes EMR says it's "RUNNING" but doesn't actually run steps!
        elif pending_steps:
            first_pending = pending_steps[0]

            # PENDING job should have run starting when the cluster
            # became ready, or the previous step completed
            start = cs['Status']['Timeline']['ReadyDateTime']

            # use a separate loop variable: reusing the pending step's
            # name here would make us report on whichever step happens to
            # be last in *steps* rather than on the first pending one
            for other_step in steps:
                if other_step['Status']['State'] == 'COMPLETED':
                    start = other_step['Status']['Timeline']['EndDateTime']

            time_pending = now - start

            if time_pending >= min_time:
                yield {'cluster_id': cs['Id'],
                       'name': first_pending['Name'],
                       'state': first_pending['Status']['State'],
                       'time': time_pending}
def _boto3_today():
    """Return midnight at the start of the current day.

    The result carries the same ``tzinfo`` as ``_boto3_now()``.
    """
    now = _boto3_now()

    # keep only the date fields; time-of-day fields default to zero
    date_fields = dict(year=now.year, month=now.month, day=now.day)

    return datetime(tzinfo=now.tzinfo, **date_fields)
def create_fake_clusters(self):
    """Populate mock EMR and S3 with clusters in a variety of states.

    Sets ``self.now`` to the current time (truncated to whole seconds);
    every timestamp in the fixture is expressed relative to it. Also
    creates the ``my_bucket`` mock S3 bucket, which holds the lock files
    for the "locked" clusters.
    """
    self.now = _boto3_now().replace(microsecond=0)

    self.add_mock_s3_data({'my_bucket': {}})

    # create a timestamp the given number of *hours*, *minutes*, etc.
    # in the past (keyword args are passed straight to timedelta())
    def ago(**kwargs):
        return self.now - timedelta(**kwargs)

    # Build a step dict easily. Only timestamps that are actually given
    # end up in the step's Timeline.
    #
    # Note that *name* and any extra keyword arguments are accepted but
    # are not included in the returned dict.
    def step(jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar',
             args=self._DEFAULT_STEP_ARGS,
             state='COMPLETED',
             created=None,
             started=None,
             ended=None,
             name='Streaming Step',
             action_on_failure='TERMINATE_CLUSTER',
             **kwargs):

        timeline = dict()
        if created:
            timeline['CreationDateTime'] = created
        if started:
            timeline['StartDateTime'] = started
        if ended:
            timeline['EndDateTime'] = ended

        return dict(Config=dict(
            ActionOnFailure=action_on_failure,
            Args=args,
            Jar=jar,
        ), Status=dict(
            State=state,
            Timeline=timeline,
        ))

    # empty job (cluster still starting, no steps at all)
    self.add_mock_emr_cluster(
        dict(
            Id='j-EMPTY',
            Status=dict(
                State='STARTING',
                Timeline=dict(CreationDateTime=ago(hours=10)),
            ),
        ))

    # job that's bootstrapping
    self.add_mock_emr_cluster(
        dict(
            Id='j-BOOTSTRAPPING',
            Status=dict(
                State='BOOTSTRAPPING',
                Timeline=dict(CreationDateTime=ago(hours=10), ),
            ),
            _Steps=[step(created=ago(hours=10), state='PENDING')],
        ))

    # currently running job
    self.add_mock_emr_cluster(
        dict(Id='j-CURRENTLY_RUNNING',
             Status=dict(State='RUNNING',
                         Timeline=dict(CreationDateTime=ago(hours=4, minutes=15),
                                       ReadyDateTime=ago(hours=4, minutes=10))),
             _Steps=[step(started=ago(hours=4), state='RUNNING')]))

    # finished cluster
    self.add_mock_emr_cluster(
        dict(
            Id='j-DONE',
            Status=dict(
                State='TERMINATED',
                Timeline=dict(
                    CreationDateTime=ago(hours=10),
                    ReadyDateTime=ago(hours=8),
                    EndDateTime=ago(hours=5),
                ),
            ),
            _Steps=[step(started=ago(hours=8), ended=ago(hours=6))],
        ))

    # idle cluster (last step ended two hours ago)
    self.add_mock_emr_cluster(
        dict(
            Id='j-DONE_AND_IDLE',
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(hours=6),
                    ReadyDateTime=ago(hours=5, minutes=5),
                ),
            ),
            _Steps=[step(started=ago(hours=4), ended=ago(hours=2))],
        ))

    # idle cluster with 4.x step format. should still be
    # recognizable as a streaming step
    self.add_mock_emr_cluster(
        dict(
            Id='j-DONE_AND_IDLE_4_X',
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(hours=6),
                    ReadyDateTime=ago(hours=5, minutes=5),
                ),
            ),
            _Steps=[
                step(started=ago(hours=4),
                     ended=ago(hours=2),
                     jar='command-runner.jar',
                     args=['hadoop-streaming'] + self._DEFAULT_STEP_ARGS)
            ],
        ))

    # idle cluster with an active lock (lock file written just now)
    self.add_mock_emr_cluster(
        dict(
            Id='j-IDLE_AND_LOCKED',
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(hours=6),
                    ReadyDateTime=ago(hours=5, minutes=5),
                ),
            ),
            _Steps=[step(started=ago(hours=4), ended=ago(hours=2))],
        ))
    self.add_mock_s3_data({
        'my_bucket': {
            'locks/j-IDLE_AND_LOCKED/2': b'not_you',
        },
    })

    # idle cluster with an expired lock (lock file is backdated
    # five minutes)
    self.add_mock_emr_cluster(
        dict(
            Id='j-IDLE_AND_EXPIRED',
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(hours=6),
                    ReadyDateTime=ago(hours=5, minutes=5),
                ),
            ),
            _Steps=[step(started=ago(hours=4), ended=ago(hours=2))],
        ))
    self.add_mock_s3_data(
        {
            'my_bucket': {
                'locks/j-IDLE_AND_EXPIRED/2': b'not_you',
            },
        },
        age=timedelta(minutes=5))

    # idle cluster whose only step has a start time but no end time.
    # NOTE(review): end_hours_ago is not a parameter of step(); it is
    # swallowed by **kwargs and has no effect -- looks like a leftover
    # from an older helper signature.
    self.add_mock_emr_cluster(
        dict(
            Id='j-IDLE_BUT_INCOMPLETE_STEPS',
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(hours=6),
                    ReadyDateTime=ago(hours=5, minutes=5),
                ),
            ),
            _Steps=[step(started=ago(hours=4), end_hours_ago=None)],
        ))

    # custom hadoop streaming jar
    self.add_mock_emr_cluster(
        dict(
            Id='j-CUSTOM_DONE_AND_IDLE',
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(hours=6),
                    ReadyDateTime=ago(hours=5, minutes=5),
                ),
            ),
            _Steps=[
                step(
                    started=ago(hours=4),
                    ended=ago(hours=4),
                    jar=('s3://my_bucket/tmp/somejob/files/'
                         'oddjob-0.0.3-SNAPSHOT-standalone.jar'),
                    args=[],
                )
            ],
        ))

    # hadoop debugging without any other steps
    self.add_mock_emr_cluster(
        dict(
            Id='j-DEBUG_ONLY',
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(hours=3),
                    ReadyDateTime=ago(hours=2, minutes=55),
                ),
            ),
            _Steps=[
                step(jar='command-runner.jar',
                     name='Setup Hadoop Debugging',
                     args=['state-pusher-script'],
                     started=ago(hours=3),
                     ended=ago(hours=2))
            ],
        ))

    # hadoop debugging + actual job
    self.add_mock_emr_cluster(
        dict(
            Id='j-HADOOP_DEBUGGING',
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(hours=6),
                    ReadyDateTime=ago(hours=5, minutes=55),
                ),
            ),
            _Steps=[
                step(jar='command-runner.jar',
                     name='Setup Hadoop Debugging',
                     args=['state-pusher-script'],
                     started=ago(hours=5),
                     ended=ago(hours=4)),
                step(started=ago(hours=4), ended=ago(hours=2)),
            ],
        ))

    # should skip cancelled steps (the CANCELLED step has no timestamps)
    self.add_mock_emr_cluster(
        dict(
            Id='j-IDLE_AND_FAILED',
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(hours=6),
                    ReadyDateTime=ago(hours=5, minutes=5),
                ),
            ),
            _Steps=[
                step(started=ago(hours=4),
                     ended=ago(hours=3),
                     state='FAILED'),
                step(state='CANCELLED'),
            ],
        ))

    # pooled cluster reaching end of full hour (created 55 minutes ago,
    # pool information is in the 'master' bootstrap action's args)
    self.add_mock_emr_cluster(
        dict(
            _BootstrapActions=[
                dict(Args=[], Name='action 0'),
                dict(
                    Args=[
                        'pool-0123456789abcdef0123456789abcdef',
                        'reflecting'
                    ],
                    Name='master',
                ),
            ],
            Id='j-POOLED',
            Status=dict(
                State='WAITING',
                Timeline=dict(
                    CreationDateTime=ago(minutes=55),
                    ReadyDateTime=ago(minutes=50),
                ),
            ),
        ))

    # cluster that has had pending jobs but hasn't run them
    self.add_mock_emr_cluster(
        dict(
            Id='j-PENDING_BUT_IDLE',
            Status=dict(
                State='RUNNING',
                Timeline=dict(
                    CreationDateTime=ago(hours=3),
                    ReadyDateTime=ago(hours=2, minutes=50),
                ),
            ),
            _Steps=[step(created=ago(hours=3), state='PENDING')],
        ))
def _print_report(stats, now=None):
    """Print final report.

    The report is written to standard output, one section after another:
    cluster count and creation-time range, overall usage totals, daily
    and hourly breakdowns (when available), then rankings by job, user,
    job step, pool and cluster, and finally a detail line per cluster.

    If *stats* contains no clusters, a single "no clusters" message is
    printed instead.

    :param stats: a dictionary returned by :py:func:`_clusters_to_stats`
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.
    """
    if now is None:
        now = _boto3_now()

    s = stats

    # nothing to report on
    if not s['clusters']:
        print('No clusters created in the past two months!')
        return

    print('Total # of Clusters: %d' % len(s['clusters']))
    print()

    print('* All times are in UTC.')
    print()

    print('Min create time: %s' % min(cs['created'] for cs in s['clusters']))
    print('Max create time: %s' % max(cs['created'] for cs in s['clusters']))
    print(' Current time: %s' % now.replace(microsecond=0))
    print()

    print('* All usage is measured in Normalized Instance Hours, which are')
    print(' roughly equivalent to running an m1.medium instance for an hour.')
    print(" Billing is estimated, and may not match Amazon's system exactly.")
    print()

    # total compute-unit hours used

    # pair a usage figure with the result of _percent() relative to the
    # total billed, ready to feed to a two-field format string
    def with_pct(usage):
        return (usage, _percent(usage, s['nih_billed']))

    print('Total billed: %9.2f %5.1f%%' % with_pct(s['nih_billed']))
    print(' Total used: %9.2f %5.1f%%' % with_pct(s['nih_used']))
    print(' bootstrap: %9.2f %5.1f%%' % with_pct(s['bootstrap_nih_used']))
    print(' jobs: %9.2f %5.1f%%' % with_pct(s['job_nih_used']))
    print(' Total waste: %9.2f %5.1f%%' % with_pct(s['nih_bbnu']))
    print(' at end: %9.2f %5.1f%%' % with_pct(s['end_nih_bbnu']))
    print(' other: %9.2f %5.1f%%' % with_pct(s['other_nih_bbnu']))
    print()

    if s['date_to_nih_billed']:
        print('Daily statistics:')
        print()
        print(' date billed used waste % waste')
        # walk backwards from the latest billed date to the earliest, one
        # day at a time; dates with no entry are reported as 0.0
        d = max(s['date_to_nih_billed'])
        while d >= min(s['date_to_nih_billed']):
            print(' %10s %9.2f %9.2f %9.2f %5.1f' %
                  (d, s['date_to_nih_billed'].get(
                      d, 0.0), s['date_to_nih_used'].get(
                          d, 0.0), s['date_to_nih_bbnu'].get(d, 0.0),
                   _percent(s['date_to_nih_bbnu'].get(d, 0.0),
                            s['date_to_nih_billed'].get(d, 0.0))))
            d -= timedelta(days=1)
        print()

    if s['hour_to_nih_billed']:
        print('Hourly statistics:')
        print()
        print(' hour billed used waste % waste')
        # same as above, but stepping back one hour at a time
        h = max(s['hour_to_nih_billed'])
        while h >= min(s['hour_to_nih_billed']):
            print(' %13s %9.2f %9.2f %9.2f %5.1f' %
                  (h.strftime('%Y-%m-%d %H'), s['hour_to_nih_billed'].get(
                      h, 0.0), s['hour_to_nih_used'].get(
                          h, 0.0), s['hour_to_nih_bbnu'].get(h, 0.0),
                   _percent(s['hour_to_nih_bbnu'].get(h, 0.0),
                            s['hour_to_nih_billed'].get(h, 0.0))))
            h -= timedelta(hours=1)
        print()

    print('* clusters are considered to belong to the user and job that')
    print(' started them or last ran on them.')
    print()

    # All rankings below sort by usage, largest first, breaking ties by
    # name (or key) in ascending order.

    # Top jobs
    print('Top jobs, by total time used:')
    for label, nih_used in sorted(s['label_to_nih_used'].items(),
                                  key=lambda lb_nih: (-lb_nih[1], lb_nih[0])):
        print(' %9.2f %s' % (nih_used, label))
    print()

    print('Top jobs, by time billed but not used:')
    for label, nih_bbnu in sorted(s['label_to_nih_bbnu'].items(),
                                  key=lambda lb_nih1: (-lb_nih1[1], lb_nih1[0])):
        print(' %9.2f %s' % (nih_bbnu, label))
    print()

    # Top users
    print('Top users, by total time used:')
    for owner, nih_used in sorted(s['owner_to_nih_used'].items(),
                                  key=lambda o_nih: (-o_nih[1], o_nih[0])):
        print(' %9.2f %s' % (nih_used, owner))
    print()

    print('Top users, by time billed but not used:')
    for owner, nih_bbnu in sorted(s['owner_to_nih_bbnu'].items(),
                                  key=lambda o_nih2: (-o_nih2[1], o_nih2[0])):
        print(' %9.2f %s' % (nih_bbnu, owner))
    print()

    # Top job steps (keyed by (label, step_num); a falsy label is
    # reported as a non-mrjob step)
    print('Top job steps, by total time used (step number first):')
    for (label, step_num), nih_used in sorted(s['job_step_to_nih_used'].items(),
                                              key=lambda k_nih: (-k_nih[1], k_nih[0])):

        if label:
            print(' %9.2f %3d %s' % (nih_used, step_num, label))
        else:
            print(' %9.2f (non-mrjob step)' % (nih_used, ))
    print()

    print('Top job steps, by total time billed but not used (un-pooled only):')
    for (label, step_num), nih_bbnu in sorted(
            s['job_step_to_nih_bbnu_no_pool'].items(),
            key=lambda k_nih3: (-k_nih3[1], k_nih3[0])):

        if label:
            print(' %9.2f %3d %s' % (nih_bbnu, step_num, label))
        else:
            print(' %9.2f (non-mrjob step)' % (nih_bbnu, ))
    print()

    # Top pools (a falsy pool name is shown as "(not pooled)")
    print('All pools, by total time billed:')
    for pool, nih_billed in sorted(s['pool_to_nih_billed'].items(),
                                   key=lambda p_nih: (-p_nih[1], p_nih[0])):

        print(' %9.2f %s' % (nih_billed, pool or '(not pooled)'))
    print()

    print('All pools, by total time billed but not used:')
    for pool, nih_bbnu in sorted(s['pool_to_nih_bbnu'].items(),
                                 key=lambda p_nih4: (-p_nih4[1], p_nih4[0])):

        print(' %9.2f %s' % (nih_bbnu, pool or '(not pooled)'))
    print()

    # Top clusters
    print('All clusters, by total time billed:')
    top_clusters = sorted(s['clusters'],
                          key=lambda cs: (-cs['nih_billed'], cs['name']))
    for cs in top_clusters:
        print(' %9.2f %-15s %s' % (cs['nih_billed'], cs['id'], cs['name']))
    print()

    print('All clusters, by time billed but not used:')
    top_clusters_bbnu = sorted(s['clusters'],
                               key=lambda cs: (-cs['nih_bbnu'], cs['name']))
    for cs in top_clusters_bbnu:
        print(' %9.2f %-15s %s' % (cs['nih_bbnu'], cs['id'], cs['name']))
    print()

    # Details (one line per cluster, most recently created first)
    print('Details for all clusters:')
    print()
    print(' id state created steps'
          ' time ran billed waste user name')

    all_clusters = sorted(s['clusters'],
                          key=lambda cs: cs['created'],
                          reverse=True)
    for cs in all_clusters:
        print(' %-15s %-22s %19s %3d %17s %9.2f %9.2f %8s %s' %
              (cs['id'], cs['state'], cs['created'], cs['num_steps'],
               strip_microseconds(cs['ran']), cs['nih_used'], cs['nih_bbnu'],
               (cs['owner'] or ''), (cs['label'] or ('not started by mrjob'))))
def _boto3_today():
    """Return midnight at the start of the current day.

    The year, month, day and ``tzinfo`` are all taken from the value
    returned by :py:func:`_boto3_now`.
    """
    current = _boto3_now()
    return datetime(
        current.year, current.month, current.day, tzinfo=current.tzinfo)
def _maybe_terminate_clusters(dry_run=False, max_mins_idle=None, now=None,
                              pool_name=None, pooled_only=False,
                              unpooled_only=False, max_mins_locked=None,
                              quiet=False, **kwargs):
    """Find idle clusters and terminate the ones matching our criteria.

    Every cluster returned by ``list_clusters`` is tallied by status. Each
    cluster with no running step that passes the idle-time and pool filters
    is handed to :py:func:`_terminate_and_notify`.

    :param dry_run: passed through to :py:func:`_terminate_and_notify`
    :param max_mins_idle: skip clusters that have been idle for this many
                          minutes or less. ``None`` means
                          ``_DEFAULT_MAX_MINS_IDLE``.
    :param now: time to measure idleness against. Defaults to
                :py:func:`_boto3_now`.
    :param pool_name: if set, skip clusters whose pool name differs
    :param pooled_only: if true, skip clusters that aren't in a pool
    :param unpooled_only: if true, skip clusters that are in a pool
    :param max_mins_locked: passed through to
                            :py:func:`_terminate_and_notify`
    :param quiet: passed through to :py:func:`_terminate_and_notify`
    :param kwargs: keyword arguments for the :py:class:`EMRJobRunner` whose
                   EMR client we use
    """
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_mins_idle is None:
        max_mins_idle = _DEFAULT_MAX_MINS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    # how many clusters we saw in each status
    tally = dict(
        starting=0, bootstrapping=0, running=0, pending=0, idle=0, done=0)

    # states we can recognize from the summary alone, in the order checked
    summary_checks = (
        (_is_cluster_done, 'done'),
        (_is_cluster_starting, 'starting'),
        (_is_cluster_bootstrapping, 'bootstrapping'),
    )

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for summary in _boto3_paginate('Clusters', emr_client, 'list_clusters'):
        cluster_id = summary['Id']

        early_status = next(
            (key for check, key in summary_checks if check(summary)), None)
        if early_status is not None:
            tally[early_status] += 1
            continue

        # need steps to learn more about cluster
        steps = list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cluster_id))
        steps.reverse()

        if any(_is_step_running(s) for s in steps):
            tally['running'] += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(summary, steps)
        is_pending = _cluster_has_pending_steps(steps)

        # need to get actual cluster to see tags
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']
        _, pool = _pool_hash_and_name(cluster)

        activity = 'pending' if is_pending else 'idle'
        tally[activity] += 1

        pool_desc = 'unpooled' if pool is None else 'in %s pool' % pool
        log.debug(
            'cluster %s %s for %s, %s (%s)' % (
                cluster_id, activity, strip_microseconds(time_idle),
                pool_desc, summary['Name']))

        # filter out clusters that don't meet our criteria
        if ((max_mins_idle is not None and
             time_idle <= timedelta(minutes=max_mins_idle)) or
                (pooled_only and pool is None) or
                (unpooled_only and pool is not None) or
                (pool_name is not None and pool != pool_name)):
            continue

        # terminate idle cluster
        _terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=summary['Name'],
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info(
        'Cluster statuses: %d starting, %d bootstrapping, %d running,'
        ' %d pending, %d idle, %d done' % (
            tally['starting'], tally['bootstrapping'], tally['running'],
            tally['pending'], tally['idle'], tally['done']))
def _cluster_to_basic_summary(cluster, now=None):
    """Pull creation time, owner, pool, etc. out of a cluster description.

    :param cluster: a :py:mod:`boto3` cluster data structure
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a dictionary with the keys below. A value is ``None`` when the
    corresponding field isn't available from the cluster.

    * *created*: UTC `datetime.datetime` the cluster was created, or
      ``None``
    * *end*: UTC `datetime.datetime` the cluster finished, or ``None``
    * *id*: cluster ID
    * *label*: label parsed from the cluster name (usually the module name
      of the :py:class:`~mrjob.job.MRJob` script that started it), or
      ``None`` if the name doesn't look like an :py:mod:`mrjob` job key
    * *name*: cluster name
    * *nih*: number of normalized instance hours used by the cluster
      (``0`` if the cluster doesn't report it)
    * *num_steps*: number of steps in the cluster
    * *owner*: owner parsed from the cluster name (usually the user that
      started it), or ``None`` if the name doesn't look like an
      :py:mod:`mrjob` job key
    * *pool*: pool name (e.g. ``'default'``) if the cluster is pooled,
      otherwise ``None``
    * *ran*: how long the cluster ran, or has been running, as a
      :py:class:`datetime.timedelta`. This is ``timedelta(0)`` if the
      cluster has no creation time.
    * *ready*: UTC `datetime.datetime` the cluster finished bootstrapping,
      or ``None``
    * *state*: the cluster's state as a string (e.g. ``'RUNNING'``)
    """
    if now is None:
        now = _boto3_now()

    summary = {'id': cluster['Id'], 'name': cluster['Name']}

    status = cluster['Status']
    timeline = status.get('Timeline', {})

    created = timeline.get('CreationDateTime')
    ended = timeline.get('EndDateTime')

    summary.update(
        created=created,
        ready=timeline.get('ReadyDateTime'),
        end=ended,
    )

    # a cluster that hasn't finished is treated as running up until *now*
    summary['ran'] = ((ended or now) - created) if created else timedelta(0)

    summary['state'] = status.get('State')
    summary['num_steps'] = len(cluster['Steps'])

    # prefer the cluster's own pool info; fall back to the legacy lookup
    # based on bootstrap actions
    _, summary['pool'] = _pool_hash_and_name(cluster)
    if not summary['pool']:
        _, summary['pool'] = _legacy_pool_hash_and_name(
            cluster['BootstrapActions'])

    job_key_match = _JOB_KEY_RE.match(summary['name'] or '')
    summary['label'], summary['owner'] = (
        (job_key_match.group(1), job_key_match.group(2))
        if job_key_match else (None, None))

    summary['nih'] = cluster.get('NormalizedInstanceHours', 0)

    return summary
def _print_report(stats, now=None):
    """Print final report.

    :param stats: a dictionary returned by :py:func:`_clusters_to_stats`
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    If *stats* contains no clusters, prints a one-line notice and returns.
    """
    if now is None:
        now = _boto3_now()

    s = stats

    if not s['clusters']:
        print('No clusters created in the past two months!')
        return

    def with_pct(usage):
        # pair a usage figure with its share of total billed usage
        return (usage, _percent(usage, s['nih_billed']))

    def by_usage(item):
        # sort key for (name, usage) pairs: biggest usage first, ties
        # broken by name
        return (-item[1], item[0])

    print('Total # of Clusters: %d' % len(s['clusters']))
    print()
    print('* All times are in UTC.')
    print()

    print('Min create time: %s' % min(cs['created'] for cs in s['clusters']))
    print('Max create time: %s' % max(cs['created'] for cs in s['clusters']))
    print(' Current time: %s' % now.replace(microsecond=0))
    print()

    print('* All usage is measured in Normalized Instance Hours, which are')
    print(' roughly equivalent to running an m1.medium instance for an hour.')
    print(" Billing is estimated, and may not match Amazon's system exactly.")
    print()

    # total compute-unit hours used
    print('Total billed: %9.2f %5.1f%%' % with_pct(s['nih_billed']))
    print(' Total used: %9.2f %5.1f%%' % with_pct(s['nih_used']))
    print(' bootstrap: %9.2f %5.1f%%' % with_pct(s['bootstrap_nih_used']))
    print(' jobs: %9.2f %5.1f%%' % with_pct(s['job_nih_used']))
    print(' Total waste: %9.2f %5.1f%%' % with_pct(s['nih_bbnu']))
    print(' at end: %9.2f %5.1f%%' % with_pct(s['end_nih_bbnu']))
    print(' other: %9.2f %5.1f%%' % with_pct(s['other_nih_bbnu']))
    print()

    if s['date_to_nih_billed']:
        print('Daily statistics:')
        print()
        print(' date billed used waste % waste')
        # walk backwards one day at a time from the latest date to the
        # earliest, so dates with no usage still get a row. The bounds
        # are computed once, not on every iteration.
        first_day = min(s['date_to_nih_billed'])
        d = max(s['date_to_nih_billed'])
        while d >= first_day:
            print(' %10s %9.2f %9.2f %9.2f %5.1f' % (
                d,
                s['date_to_nih_billed'].get(d, 0.0),
                s['date_to_nih_used'].get(d, 0.0),
                s['date_to_nih_bbnu'].get(d, 0.0),
                _percent(s['date_to_nih_bbnu'].get(d, 0.0),
                         s['date_to_nih_billed'].get(d, 0.0))))
            d -= timedelta(days=1)
        print()

    if s['hour_to_nih_billed']:
        print('Hourly statistics:')
        print()
        print(' hour billed used waste % waste')
        # same backwards walk as above, one hour at a time
        first_hour = min(s['hour_to_nih_billed'])
        h = max(s['hour_to_nih_billed'])
        while h >= first_hour:
            print(' %13s %9.2f %9.2f %9.2f %5.1f' % (
                h.strftime('%Y-%m-%d %H'),
                s['hour_to_nih_billed'].get(h, 0.0),
                s['hour_to_nih_used'].get(h, 0.0),
                s['hour_to_nih_bbnu'].get(h, 0.0),
                _percent(s['hour_to_nih_bbnu'].get(h, 0.0),
                         s['hour_to_nih_billed'].get(h, 0.0))))
            h -= timedelta(hours=1)
        print()

    print('* clusters are considered to belong to the user and job that')
    print(' started them or last ran on them.')
    print()

    # Top jobs
    print('Top jobs, by total time used:')
    for label, nih_used in sorted(
            s['label_to_nih_used'].items(), key=by_usage):
        print(' %9.2f %s' % (nih_used, label))
    print()

    print('Top jobs, by time billed but not used:')
    for label, nih_bbnu in sorted(
            s['label_to_nih_bbnu'].items(), key=by_usage):
        print(' %9.2f %s' % (nih_bbnu, label))
    print()

    # Top users
    print('Top users, by total time used:')
    for owner, nih_used in sorted(
            s['owner_to_nih_used'].items(), key=by_usage):
        print(' %9.2f %s' % (nih_used, owner))
    print()

    print('Top users, by time billed but not used:')
    for owner, nih_bbnu in sorted(
            s['owner_to_nih_bbnu'].items(), key=by_usage):
        print(' %9.2f %s' % (nih_bbnu, owner))
    print()

    # Top job steps
    print('Top job steps, by total time used (step number first):')
    for (label, step_num), nih_used in sorted(
            s['job_step_to_nih_used'].items(), key=by_usage):
        if label:
            print(' %9.2f %3d %s' % (nih_used, step_num, label))
        else:
            print(' %9.2f (non-mrjob step)' % (nih_used,))
    print()

    print('Top job steps, by total time billed but not used (un-pooled only):')
    for (label, step_num), nih_bbnu in sorted(
            s['job_step_to_nih_bbnu_no_pool'].items(), key=by_usage):
        if label:
            print(' %9.2f %3d %s' % (nih_bbnu, step_num, label))
        else:
            print(' %9.2f (non-mrjob step)' % (nih_bbnu,))
    print()

    # Top pools
    print('All pools, by total time billed:')
    for pool, nih_billed in sorted(
            s['pool_to_nih_billed'].items(), key=by_usage):
        print(' %9.2f %s' % (nih_billed, pool or '(not pooled)'))
    print()

    print('All pools, by total time billed but not used:')
    for pool, nih_bbnu in sorted(
            s['pool_to_nih_bbnu'].items(), key=by_usage):
        print(' %9.2f %s' % (nih_bbnu, pool or '(not pooled)'))
    print()

    # Top clusters
    print('All clusters, by total time billed:')
    top_clusters = sorted(
        s['clusters'], key=lambda cs: (-cs['nih_billed'], cs['name']))
    for cs in top_clusters:
        print(' %9.2f %-15s %s' % (
            cs['nih_billed'], cs['id'], cs['name']))
    print()

    print('All clusters, by time billed but not used:')
    top_clusters_bbnu = sorted(
        s['clusters'], key=lambda cs: (-cs['nih_bbnu'], cs['name']))
    for cs in top_clusters_bbnu:
        print(' %9.2f %-15s %s' % (
            cs['nih_bbnu'], cs['id'], cs['name']))
    print()

    # Details
    print('Details for all clusters:')
    print()
    print(' id state created steps'
          ' time ran billed waste user name')

    # most recently created clusters first
    all_clusters = sorted(
        s['clusters'], key=lambda cs: cs['created'], reverse=True)
    for cs in all_clusters:
        print(' %-15s %-22s %19s %3d %17s %9.2f %9.2f %8s %s' % (
            cs['id'], cs['state'], cs['created'], cs['num_steps'],
            strip_microseconds(cs['ran']), cs['nih_used'],
            cs['nih_bbnu'], (cs['owner'] or ''),
            (cs['label'] or ('not started by mrjob'))))
def _simulate_progress(self, cluster_id, now=None):
    """Move the given mock cluster one step forward in its lifecycle.

    This is automatically run when we call :py:meth:`describe_step`,
    and, when the cluster is ``TERMINATING``, :py:meth:`describe_cluster`.

    Each call makes at most one transition: advance the cluster's state,
    start a pending step, or finish (or fail) the running step.

    :type cluster_id: str
    :param cluster_id: fake cluster ID
    :type now: py:class:`datetime.datetime`
    :param now: alternate time to use as the current time (should be UTC)
    """
    # TODO: this doesn't actually update steps to CANCELLED when
    # cluster is shut down
    if now is None:
        now = _boto3_now()

    cluster = self.mock_emr_clusters[cluster_id]

    # allow clusters to get stuck
    delay = cluster.get('_DelayProgressSimulation', 0)
    if delay > 0:
        cluster['_DelayProgressSimulation'] = delay - 1
        return

    # this code is pretty loose about updating StateChangeReason
    # (for the cluster, instance groups, and steps). Add this as needed.
    status = cluster['Status']

    def cancel_unfinished_steps():
        # steps that haven't finished are marked CANCELLED
        # (not INTERRUPTED)
        for unfinished in cluster['_Steps']:
            if unfinished['Status']['State'] in ('PENDING', 'RUNNING'):
                unfinished['Status']['State'] = 'CANCELLED'

    # if job is STARTING, move it along to BOOTSTRAPPING
    if status['State'] == 'STARTING':
        status['State'] = 'BOOTSTRAPPING'

        # master now has a hostname
        cluster['MasterPublicDnsName'] = 'master.%s.mock' % cluster['Id']

        # instances are now provisioned
        for group in cluster['_InstanceGroups']:
            group['RunningInstanceCount'] = group['RequestedInstanceCount']
            group['Status']['State'] = 'BOOTSTRAPPING'

        return

    # if job is TERMINATING, move along to terminated
    if status['State'] == 'TERMINATING':
        code = status['StateChangeReason'].get('Code')
        status['State'] = (
            'TERMINATED_WITH_ERRORS'
            if code and code.endswith('_FAILURE') else 'TERMINATED')
        return

    # if job is done, nothing to do
    if status['State'] in ('TERMINATED', 'TERMINATED_WITH_ERRORS'):
        return

    # if job is BOOTSTRAPPING, move it along to RUNNING and continue
    if status['State'] == 'BOOTSTRAPPING':
        status['State'] = 'RUNNING'
        for group in cluster['_InstanceGroups']:
            group['Status']['State'] = 'RUNNING'

    # at this point, should be RUNNING or WAITING
    assert status['State'] in ('RUNNING', 'WAITING')

    # simulate self-termination
    if cluster_id in self.mock_emr_self_termination:
        status['State'] = 'TERMINATING'
        status['StateChangeReason'] = dict(
            Code='INSTANCE_FAILURE',
            Message='The master node was terminated. ',  # sic
        )
        cancel_unfinished_steps()
        return

    # try to find the next step, and advance it
    for step_num, step in enumerate(cluster['_Steps']):
        step_status = step['Status']
        step_key = (cluster_id, step_num)

        # skip steps that are already done
        if step_status['State'] in (
                'COMPLETED', 'FAILED', 'CANCELLED', 'INTERRUPTED'):
            continue

        # found currently running step! handle it, then exit

        # start PENDING step
        if step_status['State'] == 'PENDING':
            step_status['State'] = 'RUNNING'
            step_status['Timeline']['StartDateTime'] = now
            return

        assert step_status['State'] == 'RUNNING'

        # check if we're supposed to have an error
        if step_key in self.mock_emr_failures:
            step_status['State'] = 'FAILED'

            if step['ActionOnFailure'] in (
                    'TERMINATE_CLUSTER', 'TERMINATE_JOB_FLOW'):
                status['State'] = 'TERMINATING'
                reason = status['StateChangeReason']
                reason['Code'] = 'STEP_FAILURE'
                reason['Message'] = 'Shut down as step failed'
                cancel_unfinished_steps()

            return

        # complete step
        step_status['State'] = 'COMPLETED'
        step_status['Timeline']['EndDateTime'] = now

        # create fake output if we're supposed to write to S3
        output_uri = self._get_step_output_uri(step['Config']['Args'])
        if output_uri and is_s3_uri(output_uri):
            parts = self.mock_emr_output.get(step_key) or [b'']

            bucket_name, key_name = parse_s3_uri(output_uri)

            # write output to S3
            for part_num, part in enumerate(parts):
                part_key = key_name + 'part-%05d' % part_num
                add_mock_s3_data(
                    self.mock_s3_fs, {bucket_name: {part_key: part}})
        elif step_key in self.mock_emr_output:
            raise AssertionError(
                "can't use output for cluster ID %s, step %d "
                "(it doesn't output to S3)" %
                (cluster_id, step_num))

        # done!
        # if this is the last step, continue to autotermination code, below
        if step_num < len(cluster['_Steps']) - 1:
            return

    # no pending steps. should we wait, or shut down?
    if cluster['AutoTerminate']:
        status['State'] = 'TERMINATING'
        status['StateChangeReason']['Code'] = 'ALL_STEPS_COMPLETED'
        status['StateChangeReason']['Message'] = 'Steps Completed'
    else:
        # just wait
        status['State'] = 'WAITING'
        status['StateChangeReason'] = {}

    return
def _find_long_running_jobs(emr_client, cluster_summaries, min_time, now=None):
    """Identify jobs that have been running or pending for a long time.

    :param emr_client: a :py:mod:`boto3` EMR client, used to list the
                       steps of clusters in the ``RUNNING`` state
    :param cluster_summaries: an iterable of :py:mod:`boto3` cluster
                              summary data structures
    :param min_time: a :py:class:`datetime.timedelta`: report jobs running
                     or pending longer than this
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    For each job that is running or pending longer than *min_time*, yields
    a dictionary with the following keys:

    * *cluster_id*: the cluster's unique ID (e.g. ``j-SOMECLUSTER``)
    * *name*: name of the step, or the cluster when bootstrapping
    * *state*: state of the step (``'RUNNING'`` or ``'PENDING'``) or, if
      there is no step, the cluster (``'STARTING'`` or ``'BOOTSTRAPPING'``)
    * *time*: amount of time step was running or pending, as a
      :py:class:`datetime.timedelta`

    Clusters in any other state are ignored.
    """
    if now is None:
        now = _boto3_now()

    for cs in cluster_summaries:
        cluster_state = cs['Status']['State']

        # special case for jobs that are taking a long time to bootstrap
        if cluster_state in ('STARTING', 'BOOTSTRAPPING'):
            # there isn't a way to tell when the cluster stopped being
            # provisioned and started bootstrapping, so just measure
            # from cluster creation time
            created = cs['Status']['Timeline']['CreationDateTime']

            time_running = now - created

            if time_running >= min_time:
                yield {'cluster_id': cs['Id'],
                       'name': cs['Name'],
                       'state': cluster_state,
                       'time': time_running}

        # the default case: running clusters
        if cluster_state != 'RUNNING':
            continue

        # list_steps output is reversed here so that later steps in the
        # list override earlier ones below
        steps = list(reversed(list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cs['Id']))))

        running_steps = [
            step for step in steps if step['Status']['State'] == 'RUNNING']
        pending_steps = [
            step for step in steps if step['Status']['State'] == 'PENDING']

        if running_steps:
            # should be only one, but if not, we should know about it
            for step in running_steps:
                start = step['Status']['Timeline']['StartDateTime']

                time_running = now - start

                if time_running >= min_time:
                    yield {'cluster_id': cs['Id'],
                           'name': step['Name'],
                           'state': step['Status']['State'],
                           'time': time_running}

        # sometimes EMR says it's "RUNNING" but doesn't actually run steps!
        elif pending_steps:
            pending_step = pending_steps[0]

            # PENDING job should have run starting when the cluster
            # became ready, or the previous step completed
            start = cs['Status']['Timeline']['ReadyDateTime']

            # use a separate loop variable so that we still report the
            # first pending step, not whichever step happens to be last
            for earlier_step in steps:
                if earlier_step['Status']['State'] == 'COMPLETED':
                    start = earlier_step['Status']['Timeline']['EndDateTime']

            time_pending = now - start

            if time_pending >= min_time:
                yield {'cluster_id': cs['Id'],
                       'name': pending_step['Name'],
                       'state': pending_step['Status']['State'],
                       'time': time_pending}
def run_job_flow(self, **kwargs):
    """Mock of the EMR ``RunJobFlow`` call: create a new fake cluster.

    Keyword arguments are the ``RunJobFlow`` parameters. ``Name``,
    ``JobFlowRole``, ``ServiceRole``, ``Instances`` and exactly one of
    ``AmiVersion``/``ReleaseLabel`` are required. The private ``_Now``
    keyword overrides the cluster's creation time.

    The new cluster is stored in ``self.mock_emr_clusters`` under its ID.

    Returns ``dict(JobFlowId=<new cluster ID>)``.

    Raises the validation error built by the local ``_error()`` helper for
    bad or conflicting parameters, and :py:class:`NotImplementedError` if
    any parameter is left over that this mock doesn't handle.
    """
    # going to pop params from kwargs as we process then, and raise
    # NotImplementedError at the end if any params are left

    # only ask for the current time if the caller didn't supply one
    if '_Now' in kwargs:
        now = kwargs.pop('_Now')
    else:
        now = _boto3_now()

    # our newly created cluster, as described by describe_cluster(), plus:
    #
    # _BootstrapActions: as described by list_bootstrap_actions()
    # _InstanceGroups: as described by list_instance_groups()
    # _Steps: as described by list_steps(), but not reversed
    #
    # TODO: at some point when we implement instance fleets,
    # _InstanceGroups will become optional
    cluster = dict(
        _BootstrapActions=[],
        _InstanceGroups=[],
        _Steps=[],
        Applications=[],
        AutoTerminate=True,
        Configurations=[],
        Ec2InstanceAttributes=dict(
            EmrManagedMasterSecurityGroup='sg-mockmaster',
            EmrManagedSlaveSecurityGroup='sg-mockslave',
            IamInstanceProfile='',
        ),
        Id='j-MOCKCLUSTER%d' % len(self.mock_emr_clusters),
        Name='',
        NormalizedInstanceHours=0,
        ScaleDownBehavior='TERMINATE_AT_TASK_COMPLETION',
        ServiceRole='',
        Status=dict(
            State='STARTING',
            StateChangeReason={},
            Timeline=dict(CreationDateTime=now),
        ),
        Tags=[],
        TerminationProtected=False,
        VisibleToAllUsers=False,
    )

    def _error(message):
        return _ValidationException('RunJobFlow', message)

    # Name (required)
    _validate_param(kwargs, 'Name', string_types)
    cluster['Name'] = kwargs.pop('Name')

    # LogUri
    if 'LogUri' in kwargs:
        _validate_param(kwargs, 'LogUri', string_types)
        cluster['LogUri'] = kwargs.pop('LogUri')

    # JobFlowRole and ServiceRole (required)
    _validate_param(kwargs, 'JobFlowRole', string_types)
    cluster['Ec2InstanceAttributes']['IamInstanceProfile'] = kwargs.pop(
        'JobFlowRole')

    if 'ServiceRole' not in kwargs:  # required by API, not boto3
        raise _error('ServiceRole is required for creating cluster.')
    _validate_param(kwargs, 'ServiceRole', string_types)
    cluster['ServiceRole'] = kwargs.pop('ServiceRole')

    # AmiVersion and ReleaseLabel
    for version_param in ('AmiVersion', 'ReleaseLabel'):
        if version_param in kwargs:
            _validate_param(kwargs, version_param, string_types)

    if 'AmiVersion' in kwargs:
        if 'ReleaseLabel' in kwargs:
            raise _error(
                'Only one AMI version and release label may be specified.'
                ' Provided AMI: %s, release label: %s.' % (
                    kwargs['AmiVersion'], kwargs['ReleaseLabel']))

        ami_version = kwargs.pop('AmiVersion')

        running_ami_version = AMI_VERSION_ALIASES.get(
            ami_version, ami_version)

        if version_gte(running_ami_version, '4'):
            raise _error('The supplied ami version is invalid.')
        elif not version_gte(running_ami_version, '2'):
            raise _error(
                'Job flow role is not compatible with the supplied'
                ' AMI version')

        cluster['RequestedAmiVersion'] = ami_version
        cluster['RunningAmiVersion'] = running_ami_version

    elif 'ReleaseLabel' in kwargs:
        release_label = kwargs.pop('ReleaseLabel')

        # remove the literal "emr-" prefix. (str.lstrip('emr-') would
        # instead strip any leading run of the characters e, m, r and -)
        release_prefix = 'emr-'
        if release_label.startswith(release_prefix):
            running_ami_version = release_label[len(release_prefix):]
        else:
            running_ami_version = release_label

        if not version_gte(running_ami_version, '4'):
            raise _error('The supplied release label is invalid: %s.' %
                         release_label)

        cluster['ReleaseLabel'] = release_label
    else:
        # note: you can't actually set Hadoop version through boto3
        raise _error('Must specify exactly one of the following:'
                     ' release label, AMI version, or Hadoop version.')

    # Applications
    hadoop_version = map_version(
        running_ami_version, AMI_HADOOP_VERSION_UPDATES)

    if version_gte(running_ami_version, '4'):
        application_names = set(
            a['Name'] for a in kwargs.pop('Applications', []))

        # if Applications is set but doesn't include Hadoop, the
        # cluster description won't either! (Even though Hadoop is
        # in fact installed.)
        if not application_names:
            application_names = set(['Hadoop'])

        for app_name in sorted(application_names):
            if app_name == 'Hadoop':
                version = hadoop_version
            else:
                version = DUMMY_APPLICATION_VERSION

            cluster['Applications'].append(
                dict(Name=app_name, Version=version))
    else:
        if kwargs.get('Applications'):
            raise _error(
                'Cannot specify applications when AMI version is used.'
                ' Specify supported products or new supported products'
                ' instead.')

        # 'hadoop' is lowercase if AmiVersion specified
        cluster['Applications'].append(
            dict(Name='hadoop', Version=hadoop_version))

    # Configurations
    if 'Configurations' in kwargs:
        _validate_param(kwargs, 'Configurations', (list, tuple))

        if kwargs['Configurations'] and not version_gte(
                running_ami_version, '4'):
            raise _error(
                'Cannot specify configurations when AMI version is used.')

        cluster['Configurations'] = _normalized_configurations(
            kwargs.pop('Configurations'))

    # VisibleToAllUsers
    if 'VisibleToAllUsers' in kwargs:
        _validate_param(kwargs, 'VisibleToAllUsers', bool)
        cluster['VisibleToAllUsers'] = kwargs.pop('VisibleToAllUsers')

    # pass BootstrapActions off to helper
    if 'BootstrapActions' in kwargs:
        self._add_bootstrap_actions(
            'RunJobFlow', kwargs.pop('BootstrapActions'), cluster)

    # pass Instances (required) off to helper
    _validate_param(kwargs, 'Instances')
    self._add_instances(
        'RunJobFlow', kwargs.pop('Instances'), cluster, now=now)

    # pass Steps off to helper
    if 'Steps' in kwargs:
        self._add_steps('RunJobFlow', kwargs.pop('Steps'), cluster)

    # pass Tags off to helper
    if 'Tags' in kwargs:
        self._add_tags('RunJobFlow', kwargs.pop('Tags'), cluster)

    # save AdditionalInfo
    if 'AdditionalInfo' in kwargs:
        cluster['_AdditionalInfo'] = kwargs.pop('AdditionalInfo')

    # catch extra params
    if kwargs:
        raise NotImplementedError(
            'mock RunJobFlow does not support these parameters: %s' %
            ', '.join(sorted(kwargs)))

    self.mock_emr_clusters[cluster['Id']] = cluster

    return dict(JobFlowId=cluster['Id'])
def _maybe_terminate_clusters(dry_run=False, max_mins_idle=None, now=None,
                              pool_name=None, pooled_only=False,
                              unpooled_only=False, max_mins_locked=None,
                              quiet=False, **kwargs):
    """Find idle clusters and terminate the ones matching our criteria.

    Only clusters listed as ``WAITING`` or ``RUNNING`` are examined. Each
    one is tallied by status, and every cluster with no running step that
    passes the idle-time, pool and termination-protection filters is
    handed to :py:func:`_terminate_and_notify`.

    :param dry_run: passed through to :py:func:`_terminate_and_notify`
    :param max_mins_idle: skip clusters that have been idle for this many
                          minutes or less. ``None`` means
                          ``_DEFAULT_MAX_MINS_IDLE``.
    :param now: time to measure idleness against. Defaults to
                :py:func:`_boto3_now`.
    :param pool_name: if set, skip clusters whose pool name differs
    :param pooled_only: if true, skip clusters that aren't in a pool
    :param unpooled_only: if true, skip clusters that are in a pool
    :param max_mins_locked: passed through to
                            :py:func:`_terminate_and_notify`
    :param quiet: passed through to :py:func:`_terminate_and_notify`
    :param kwargs: keyword arguments for the :py:class:`EMRJobRunner` whose
                   EMR client we use
    """
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_mins_idle is None:
        max_mins_idle = _DEFAULT_MAX_MINS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    # how many clusters we saw in each status
    tally = dict(
        starting=0, bootstrapping=0, running=0, pending=0, idle=0, done=0)

    # states we can recognize from the summary alone, in the order checked
    summary_checks = (
        (_is_cluster_done, 'done'),
        (_is_cluster_starting, 'starting'),
        (_is_cluster_bootstrapping, 'bootstrapping'),
    )

    # include RUNNING to catch clusters with PENDING jobs that
    # never ran (see #365).
    candidates = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters',
        ClusterStates=['WAITING', 'RUNNING'])

    for summary in candidates:
        cluster_id = summary['Id']

        early_status = next(
            (key for check, key in summary_checks if check(summary)), None)
        if early_status is not None:
            tally[early_status] += 1
            continue

        # need steps to learn more about cluster
        steps = list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cluster_id))
        steps.reverse()

        if any(_is_step_running(s) for s in steps):
            tally['running'] += 1
            continue

        # cluster is idle
        time_idle = now - _time_last_active(summary, steps)
        is_pending = _cluster_has_pending_steps(steps)

        # need to get actual cluster to see tags
        cluster = emr_client.describe_cluster(ClusterId=cluster_id)['Cluster']
        _, pool = _pool_hash_and_name(cluster)

        activity = 'pending' if is_pending else 'idle'
        tally[activity] += 1

        pool_desc = 'unpooled' if pool is None else 'in %s pool' % pool
        log.debug('cluster %s %s for %s, %s (%s) - %s' % (
            cluster_id,
            activity,
            strip_microseconds(time_idle),
            pool_desc,
            summary['Name'],
            'protected' if cluster['TerminationProtected'] else 'unprotected',
        ))

        # filter out clusters that don't meet our criteria
        if ((max_mins_idle is not None and
             time_idle <= timedelta(minutes=max_mins_idle)) or
                (pooled_only and pool is None) or
                (unpooled_only and pool is not None) or
                (pool_name is not None and pool != pool_name) or
                cluster['TerminationProtected']):
            continue

        # terminate idle cluster
        _terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=summary['Name'],
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info(
        'Cluster statuses: %d starting, %d bootstrapping, %d running,'
        ' %d pending, %d idle, %d done' % (
            tally['starting'], tally['bootstrapping'], tally['running'],
            tally['pending'], tally['idle'], tally['done']))
def _cluster_to_usage_data(cluster, basic_summary=None, now=None):
    r"""Break billing/usage information for a cluster down by job.

    :param cluster: a :py:mod:`boto3` cluster data structure
    :param basic_summary: a basic summary of the cluster, returned by
                          :py:func:`_cluster_to_basic_summary`. If this
                          is ``None``, we'll call
                          :py:func:`_cluster_to_basic_summary` ourselves
                          (with the same *now*).
    :param now: the current UTC time, as a :py:class:`datetime.datetime`.
                Defaults to the current time.

    Returns a list of dictionaries containing usage information, one for
    bootstrapping, and one for each step that ran or is currently running.
    If the cluster hasn't started yet, return ``[]``.

    Usage dictionaries have the following keys:

    * *end*: when the job finished running, or *now* if it's still running.
    * *end_billing*: the effective end of the job for billing purposes,
      either when the next job starts, the current time if the job
      is still running, or the end of the next full hour in the cluster.
    * *nih_billed*: normalized instances hours billed for this job or
      bootstrapping step
    * *nih_used*: normalized instance hours actually used for running
      the job or bootstrapping
    * *nih_bbnu*: usage billed but not used (`nih_billed - nih_used`)
    * *date_to_nih_\**: map from a :py:class:`datetime.date` to number
      of normalized instance hours billed/used/billed but not used on that
      date
    * *hour_to_nih_\**: map from a :py:class:`datetime.datetime` to number
      of normalized instance hours billed/used/billed but not used during
      the hour starting at that time
    * *label*: job's label (usually the module name of the job), or for the
      bootstrapping step, the label of the cluster
    * *owner*: job's owner (usually the user that started it), or for the
      bootstrapping step, the owner of the cluster
    * *start*: when the job or bootstrapping step started, as a
      :py:class:`datetime.datetime`
    * *step_num*: step number parsed from the step's name, or ``None`` for
      the bootstrapping interval and for steps whose name doesn't match
    """
    # resolve *now* first so that the basic summary (if we have to build
    # it) and the rest of this function agree on what time it is
    if now is None:
        now = _boto3_now()

    bcs = basic_summary or _cluster_to_basic_summary(cluster, now=now)

    if not bcs['created']:
        return []

    # Figure out billing rate per second for the job, given that
    # normalizedinstancehours is how much we're charged up until
    # the next full hour.
    full_hours = math.ceil(bcs['ran'].total_seconds() / 60.0 / 60.0)

    # a cluster that has run for no time at all has nothing to bill
    # (and would otherwise make us divide by zero)
    if full_hours > 0:
        nih_per_sec = bcs['nih'] / (full_hours * 3600.0)
    else:
        nih_per_sec = 0.0

    # Don't actually count a step as billed for the full hour until
    # the cluster finishes. This means that our total "nih_billed"
    # will be less than normalizedinstancehours in the cluster, but it
    # also keeps stats stable for steps that have already finished.
    if bcs['end']:
        cluster_end_billing = bcs['created'] + timedelta(hours=full_hours)
    else:
        cluster_end_billing = now

    def scaled(secs_by_key):
        # convert a map of seconds into a map of normalized instance hours
        return dict(
            (key, nih_per_sec * secs) for key, secs in secs_by_key.items())

    def billed_but_not_used(billed_by_key, used_by_key):
        # per-key difference between billed and used, omitting zeroes
        result = {}
        for key, nih_billed in billed_by_key.items():
            nih_bbnu = nih_billed - used_by_key.get(key, 0.0)
            if nih_bbnu:
                result[key] = nih_bbnu
        return result

    intervals = []

    # make a fake step for cluster startup and bootstrapping, so we don't
    # consider that wasted.
    intervals.append({
        'label': bcs['label'],
        'owner': bcs['owner'],
        'start': bcs['created'],
        'end': bcs['ready'] or bcs['end'] or now,
        'step_num': None,
    })

    for step in cluster['Steps']:
        timeline = step['Status'].get('Timeline', {})

        # we've reached the last step that's actually run
        if not timeline.get('StartDateTime'):
            break

        step_start = timeline['StartDateTime']

        step_end = timeline.get('EndDateTime')
        if step_end is None:
            # step started running and was cancelled. credit it for 0 usage
            if bcs['end']:
                step_end = step_start
            # step is still running
            else:
                step_end = now

        m = _STEP_NAME_RE.match(step['Name'])
        if m:
            step_label = m.group(1)
            step_owner = m.group(2)
            step_num = int(m.group(6))
        else:
            step_label, step_owner, step_num = None, None, None

        intervals.append({
            'label': step_label,
            'owner': step_owner,
            'start': step_start,
            'end': step_end,
            'step_num': step_num,
        })

    # fill in end_billing: each interval is billed until the next one
    # starts, and the last one until the cluster stops being billed
    for interval, next_interval in zip(intervals, intervals[1:]):
        interval['end_billing'] = next_interval['start']

    intervals[-1]['end_billing'] = cluster_end_billing

    # fill normalized usage information
    for interval in intervals:
        start = interval['start']
        end = interval['end']
        end_billing = interval['end_billing']

        interval['nih_used'] = nih_per_sec * (end - start).total_seconds()
        interval['date_to_nih_used'] = scaled(
            _subdivide_interval_by_date(start, end))
        interval['hour_to_nih_used'] = scaled(
            _subdivide_interval_by_hour(start, end))

        interval['nih_billed'] = (
            nih_per_sec * (end_billing - start).total_seconds())
        interval['date_to_nih_billed'] = scaled(
            _subdivide_interval_by_date(start, end_billing))
        interval['hour_to_nih_billed'] = scaled(
            _subdivide_interval_by_hour(start, end_billing))

        # time billed but not used
        interval['nih_bbnu'] = interval['nih_billed'] - interval['nih_used']
        interval['date_to_nih_bbnu'] = billed_but_not_used(
            interval['date_to_nih_billed'], interval['date_to_nih_used'])
        interval['hour_to_nih_bbnu'] = billed_but_not_used(
            interval['hour_to_nih_billed'], interval['hour_to_nih_used'])

    return intervals
def _maybe_terminate_clusters(dry_run=False,
                              max_hours_idle=None,
                              mins_to_end_of_hour=None,
                              now=None,
                              pool_name=None,
                              pooled_only=False,
                              unpooled_only=False,
                              max_mins_locked=None,
                              quiet=False,
                              **kwargs):
    """Look through all clusters and terminate the idle ones that match
    our criteria, then log how many clusters were in each state.

    :param dry_run: passed through to :py:func:`_terminate_and_notify`
    :param max_hours_idle: if set, leave alone clusters that have been idle
                           for this many hours or less
    :param mins_to_end_of_hour: if set, leave alone clusters that have
                                pending steps, or whose estimated time to
                                the end of the hour is at least this many
                                minutes
    :param now: the current time; defaults to :py:func:`_boto3_now`
    :param pool_name: if set, only terminate clusters in this pool
    :param pooled_only: if true, only terminate pooled clusters
    :param unpooled_only: if true, only terminate unpooled clusters
    :param max_mins_locked: passed through to
                            :py:func:`_terminate_and_notify`
    :param quiet: passed through to :py:func:`_terminate_and_notify`

    Any other keyword arguments are passed to the
    :py:class:`EMRJobRunner` constructor.
    """
    if now is None:
        now = _boto3_now()

    # old default behavior
    if max_hours_idle is None and mins_to_end_of_hour is None:
        max_hours_idle = _DEFAULT_MAX_HOURS_IDLE

    runner = EMRJobRunner(**kwargs)
    emr_client = runner.make_emr_client()

    # number of clusters seen in each state
    tally = dict(
        starting=0, bootstrapping=0, running=0, pending=0, idle=0, done=0)

    # states we can recognize from the cluster summary alone, in the
    # order we check for them
    summary_checks = (
        ('done', _is_cluster_done),
        ('starting', _is_cluster_starting),
        ('bootstrapping', _is_cluster_bootstrapping),
    )

    # We don't filter by cluster state because we want this to work even
    # if Amazon adds another kind of idle state.
    for summary in _boto3_paginate('Clusters', emr_client, 'list_clusters'):
        cluster_id = summary['Id']

        early_state = next(
            (state for state, matches in summary_checks if matches(summary)),
            None)
        if early_state is not None:
            tally[early_state] += 1
            continue

        # need steps (most recent last from the API, so reverse them)
        # to learn more about the cluster
        steps = list(_boto3_paginate(
            'Steps', emr_client, 'list_steps', ClusterId=cluster_id))
        steps.reverse()

        if any(_is_step_running(step) for step in steps):
            tally['running'] += 1
            continue

        # if we get this far, the cluster is idle
        time_idle = now - _time_last_active(summary, steps)
        time_to_end_of_hour = _est_time_to_hour(summary, now=now)
        is_pending = _cluster_has_pending_steps(steps)

        bootstrap_actions = list(_boto3_paginate(
            'BootstrapActions', emr_client, 'list_bootstrap_actions',
            ClusterId=cluster_id))
        _, pool = _pool_hash_and_name(bootstrap_actions)

        status = 'pending' if is_pending else 'idle'
        tally[status] += 1

        pool_desc = 'unpooled' if pool is None else 'in %s pool' % pool

        log.debug(
            'cluster %s %s for %s, %s to end of hour, %s (%s)' %
            (cluster_id,
             status,
             strip_microseconds(time_idle),
             strip_microseconds(time_to_end_of_hour),
             pool_desc,
             summary['Name']))

        # skip clusters that don't meet our criteria
        if max_hours_idle is not None:
            if time_idle <= timedelta(hours=max_hours_idle):
                continue

        if mins_to_end_of_hour is not None:
            # mins_to_end_of_hour doesn't apply to jobs with pending steps
            if is_pending:
                continue
            if time_to_end_of_hour >= timedelta(minutes=mins_to_end_of_hour):
                continue

        if pool is None:
            if pooled_only:
                continue
        elif unpooled_only:
            continue

        if pool_name is not None and pool != pool_name:
            continue

        # everything checks out; terminate the idle cluster
        _terminate_and_notify(
            runner=runner,
            cluster_id=cluster_id,
            cluster_name=summary['Name'],
            num_steps=len(steps),
            is_pending=is_pending,
            time_idle=time_idle,
            time_to_end_of_hour=time_to_end_of_hour,
            dry_run=dry_run,
            max_mins_locked=max_mins_locked,
            quiet=quiet)

    log.info(
        'Cluster statuses: %d starting, %d bootstrapping, %d running,'
        ' %d pending, %d idle, %d done' % (
            tally['starting'], tally['bootstrapping'], tally['running'],
            tally['pending'], tally['idle'], tally['done']))
def _cluster_to_usage_data(cluster, basic_summary=None, now=None): r"""Break billing/usage information for a cluster down by job. :param cluster: a :py:mod:`boto3` cluster data structure :param basic_summary: a basic summary of the cluster, returned by :py:func:`_cluster_to_basic_summary`. If this is ``None``, we'll call :py:func:`_cluster_to_basic_summary` ourselves. :param now: the current UTC time, as a :py:class:`datetime.datetime`. Defaults to the current time. Returns a list of dictionaries containing usage information, one for bootstrapping, and one for each step that ran or is currently running. If the cluster hasn't started yet, return ``[]``. Usage dictionaries have the following keys: * *end*: when the job finished running, or *now* if it's still running. * *end_billing*: the effective end of the job for billing purposes, either when the next job starts, the current time if the job is still running, or the end of the next full hour in the cluster. * *nih_billed*: normalized instances hours billed for this job or bootstrapping step * *nih_used*: normalized instance hours actually used for running the job or bootstrapping * *nih_bbnu*: usage billed but not used (`nih_billed - nih_used`) * *date_to_nih_\**: map from a :py:class:`datetime.date` to number of normalized instance hours billed/used/billed but not used on that date * *hour_to_nih_\**: map from a :py:class:`datetime.datetime` to number of normalized instance hours billed/used/billed but not used during the hour starting at that time * *label*: job's label (usually the module name of the job), or for the bootstrapping step, the label of the cluster * *owner*: job's owner (usually the user that started it), or for the bootstrapping step, the owner of the cluster * *start*: when the job or bootstrapping step started, as a :py:class:`datetime.datetime` """ bcs = basic_summary or _cluster_to_basic_summary(cluster) if now is None: now = _boto3_now() if not bcs['created']: return [] # EMR no longer bills 
by the full hour, but NormalizedInstanceHours # still works that way full_hours = math.ceil(timedelta.total_seconds(bcs['ran']) / 60.0 / 60.0) nih_per_sec = bcs['nih'] / (full_hours * 3600.0) # EMR bills by the full second, and at least one minute per cluster cluster_end_billing = bcs['created'] + max( _round_up_to_next_second(bcs['ran']), timedelta(minutes=1)) intervals = [] # make a fake step for cluster startup and bootstrapping, so we don't # consider that wasted. intervals.append({ 'label': bcs['label'], 'owner': bcs['owner'], 'start': bcs['created'], 'end': bcs['ready'] or bcs['end'] or now, 'step_num': None, }) for step in cluster['Steps']: Status = step['Status'] Timeline = Status.get('Timeline', {}) # we've reached the last step that's actually run if not Timeline.get('StartDateTime'): break step_start = Timeline['StartDateTime'] step_end = Timeline.get('EndDateTime') if step_end is None: # step started running and was cancelled. credit it for 0 usage if bcs['end']: step_end = step_start # step is still running else: step_end = now m = _STEP_NAME_RE.match(step['Name']) if m: step_label = m.group(1) step_owner = m.group(2) step_num = int(m.group(6)) else: step_label, step_owner, step_num = None, None, None intervals.append({ 'label': step_label, 'owner': step_owner, 'start': step_start, 'end': step_end, 'step_num': step_num, }) # fill in end_billing for i in range(len(intervals) - 1): intervals[i]['end_billing'] = intervals[i + 1]['start'] intervals[-1]['end_billing'] = cluster_end_billing # fill normalized usage information for interval in intervals: interval['nih_used'] = ( nih_per_sec * timedelta.total_seconds(interval['end'] - interval['start'])) interval['date_to_nih_used'] = dict( (d, nih_per_sec * secs) for d, secs in _subdivide_interval_by_date(interval['start'], interval['end']).items()) interval['hour_to_nih_used'] = dict( (d, nih_per_sec * secs) for d, secs in _subdivide_interval_by_hour(interval['start'], interval['end']).items()) 
interval['nih_billed'] = ( nih_per_sec * timedelta.total_seconds( interval['end_billing'] - interval['start'])) interval['date_to_nih_billed'] = dict( (d, nih_per_sec * secs) for d, secs in _subdivide_interval_by_date(interval['start'], interval['end_billing']).items()) interval['hour_to_nih_billed'] = dict( (d, nih_per_sec * secs) for d, secs in _subdivide_interval_by_hour(interval['start'], interval['end_billing']).items()) # time billed but not used interval['nih_bbnu'] = interval['nih_billed'] - interval['nih_used'] interval['date_to_nih_bbnu'] = {} for d, nih_billed in interval['date_to_nih_billed'].items(): nih_bbnu = nih_billed - interval['date_to_nih_used'].get(d, 0.0) if nih_bbnu: interval['date_to_nih_bbnu'][d] = nih_bbnu interval['hour_to_nih_bbnu'] = {} for d, nih_billed in interval['hour_to_nih_billed'].items(): nih_bbnu = nih_billed - interval['hour_to_nih_used'].get(d, 0.0) if nih_bbnu: interval['hour_to_nih_bbnu'][d] = nih_bbnu return intervals
def _add_instances(self, operation_name, Instances, cluster, now=None):
    """Handle Instances param from run_job_flow()

    Copies the supported settings from *Instances* onto the mock
    *cluster* (which is updated in place) and adds the cluster's
    instance groups, either from ``InstanceGroups`` or built from
    ``InstanceCount``, ``MasterInstanceType`` and ``SlaveInstanceType``.

    :param operation_name: name of the API operation, used in validation
                           errors
    :param Instances: the ``Instances`` param; not modified (we work on a
                      shallow copy)
    :param cluster: mock cluster to update
    :param now: creation time to use for the new instance groups.
                Defaults to the current time.

    Raises :py:class:`NotImplementedError` if *Instances* contains
    params the mock doesn't support.
    """
    if now is None:
        now = _boto3_now()

    _validate_param_type(Instances, dict)

    Instances = dict(Instances)  # going to pop params from Instances

    def _error(message):
        return _ValidationException(operation_name, message)

    # Ec2KeyName
    if 'Ec2KeyName' in Instances:
        _validate_param(Instances, 'Ec2KeyName', string_types)
        cluster['Ec2InstanceAttributes']['Ec2KeyName'] = Instances.pop(
            'Ec2KeyName')

    # Ec2SubnetId
    if 'Ec2SubnetId' in Instances:
        _validate_param(Instances, 'Ec2SubnetId', string_types)
        cluster['Ec2InstanceAttributes']['Ec2SubnetId'] = (
            Instances.pop('Ec2SubnetId'))

    # KeepJobFlowAliveWhenNoSteps
    if 'KeepJobFlowAliveWhenNoSteps' in Instances:
        _validate_param(Instances, 'KeepJobFlowAliveWhenNoSteps', bool)
        cluster['AutoTerminate'] = (
            not Instances.pop('KeepJobFlowAliveWhenNoSteps'))

    # Placement (availability zone)
    if 'Placement' in Instances:
        _validate_param(Instances, 'Placement', dict)
        Placement = Instances.pop('Placement')

        # mock_boto3 doesn't support the 'AvailabilityZones' param
        _validate_param(Placement, 'AvailabilityZone', string_types)
        cluster['Ec2InstanceAttributes']['Ec2AvailabilityZone'] = (
            Placement['AvailabilityZone'])

    if 'InstanceGroups' in Instances:
        if any(x in Instances for x in ('MasterInstanceType',
                                        'SlaveInstanceType',
                                        'InstanceCount')):
            raise _error(
                'Please configure instances using one and only one of the'
                ' following: instance groups; instance fleets; instance'
                ' count, master and slave instance type.')

        # pass *now* through so that explicitly specified instance groups
        # get the same creation time as ones we build ourselves
        self._add_instance_groups(
            operation_name, Instances.pop('InstanceGroups'), cluster,
            now=now)
    # TODO: will need to support instance fleets at some point
    else:
        # build our own instance groups
        instance_groups = []

        instance_count = Instances.pop('InstanceCount', 0)
        _validate_param_type(instance_count, integer_types)

        # note: boto3 actually lets 'null' fall through to the API here
        _validate_param(Instances, 'MasterInstanceType', string_types)
        instance_groups.append(
            dict(InstanceRole='MASTER',
                 InstanceType=Instances.pop('MasterInstanceType'),
                 InstanceCount=1))

        if 'SlaveInstanceType' in Instances:
            SlaveInstanceType = Instances.pop('SlaveInstanceType')
            _validate_param_type(SlaveInstanceType, string_types)

            # don't create a group with no instances!
            if instance_count > 1:
                instance_groups.append(
                    dict(InstanceRole='CORE',
                         InstanceType=SlaveInstanceType,
                         InstanceCount=instance_count - 1))

        self._add_instance_groups(
            operation_name, instance_groups, cluster, now=now)

    # anything still left in Instances is something we don't handle
    if Instances:
        raise NotImplementedError(
            'mock %s does not support these parameters: %s' % (
                operation_name,
                ', '.join('Instances.%s' % k for k in sorted(Instances))))
def _add_steps(self, operation_name, Steps, cluster, now=None):
    """Validate *Steps* and add them to the mock *cluster* as pending
    steps.

    :param operation_name: name of the API operation, used in validation
                           errors
    :param Steps: list (or tuple) of step definitions; not modified
    :param cluster: mock cluster to add steps to (its ``_Steps`` list is
                    extended in place, but only if every step is valid)
    :param now: creation time for the new steps. Defaults to the current
                time.

    Returns the list of IDs of the newly added steps.

    Raises a validation exception if the cluster is terminating/terminated
    or the step limit would be exceeded, and
    :py:class:`NotImplementedError` for step params the mock doesn't
    support.
    """
    if now is None:
        now = _boto3_now()

    _validate_param_type(Steps, (list, tuple))

    # only active job flows allowed
    if cluster['Status']['State'].startswith('TERMINAT'):
        raise _ValidationException(
            operation_name,
            'A job flow that is shutting down, terminated, or finished'
            ' may not be modified.')

    # no more than 256 steps allowed
    if cluster.get('RunningAmiVersion') and map_version(
            cluster['RunningAmiVersion'], LIFETIME_STEP_LIMIT_AMI_VERSIONS):
        # for very old AMIs, *all* steps count
        if len(cluster['_Steps']) + len(Steps) > STEP_ADD_LIMIT:
            raise _ValidationException(
                operation_name,
                'Maximum number of steps for job flow exceeded')
    else:
        # otherwise, only active and pending steps count.
        #
        # The EMR API's name for a step awaiting cancellation is
        # CANCEL_PENDING; PENDING_CANCELLED is also accepted so that
        # existing mock data using that spelling keeps counting.
        active_states = (
            'PENDING', 'CANCEL_PENDING', 'PENDING_CANCELLED', 'RUNNING')

        num_active_steps = sum(
            1 for step in cluster['_Steps']
            if step['Status']['State'] in active_states)

        if num_active_steps + len(Steps) > STEP_ADD_LIMIT:
            raise _ValidationException(
                operation_name,
                "Maximum number of active steps(State = 'Running',"
                " 'Pending' or 'Cancel_Pending') for cluster exceeded.")

    # don't update _Steps until every step has been validated
    new_steps = []

    for i, Step in enumerate(Steps):
        Step = dict(Step)  # going to pop params from Step

        new_step = dict(
            ActionOnFailure='TERMINATE_CLUSTER',
            Config=dict(
                Args=[],
                Jar={},
                Properties={},
            ),
            Id='s-MOCKSTEP%d' % (len(cluster['_Steps']) + i),
            Name='',
            Status=dict(
                State='PENDING',
                StateChangeReason={},
                Timeline=dict(CreationDateTime=now),
            ),
        )

        # Name (required)
        _validate_param(Step, 'Name', string_types)
        new_step['Name'] = Step.pop('Name')

        # ActionOnFailure
        if 'ActionOnFailure' in Step:
            _validate_param_enum(
                Step['ActionOnFailure'],
                ['CANCEL_AND_WAIT', 'CONTINUE',
                 'TERMINATE_JOB_FLOW', 'TERMINATE_CLUSTER'])

            new_step['ActionOnFailure'] = Step.pop('ActionOnFailure')

        # HadoopJarStep (required)
        _validate_param(Step, 'HadoopJarStep', dict)
        HadoopJarStep = dict(Step.pop('HadoopJarStep'))

        _validate_param(HadoopJarStep, 'Jar', string_types)
        new_step['Config']['Jar'] = HadoopJarStep.pop('Jar')

        if 'Args' in HadoopJarStep:
            Args = HadoopJarStep.pop('Args')
            _validate_param_type(Args, (list, tuple))
            for arg in Args:
                _validate_param_type(arg, string_types)
            new_step['Config']['Args'].extend(Args)

        if 'MainClass' in HadoopJarStep:
            _validate_param(HadoopJarStep, 'MainClass', string_types)
            new_step['Config']['MainClass'] = HadoopJarStep.pop(
                'MainClass')

        # we don't currently support Properties
        if HadoopJarStep:
            raise NotImplementedError(
                "mock_boto3 doesn't support these HadoopJarStep params: %s"
                % ', '.join(sorted(HadoopJarStep)))

        if Step:
            raise NotImplementedError(
                "mock_boto3 doesn't support these step params: %s" %
                ', '.join(sorted(Step)))

        new_steps.append(new_step)

    cluster['_Steps'].extend(new_steps)

    # add_job_flow_steps() needs to return step IDs
    return [new_step['Id'] for new_step in new_steps]
def _add_instance_groups(self, operation_name, InstanceGroups, cluster,
                         now=None):
    """Add instance groups from *InstanceGroups* to the mock
    cluster *cluster*.

    :param operation_name: name of the API operation, used in validation
                           errors
    :param InstanceGroups: list (or tuple) of instance group definitions;
                           not modified
    :param cluster: mock cluster to update (its ``_InstanceGroups`` list
                    is extended in place, but only if every group is
                    valid)
    :param now: creation time for the new instance groups. Defaults to
                the current time.

    Raises a validation exception for invalid instance groups, and
    :py:class:`NotImplementedError` if the cluster already has instance
    groups or a group uses params the mock doesn't support.
    """
    _validate_param_type(InstanceGroups, (list, tuple))

    def _error(message):
        return _ValidationException(operation_name, message)

    if now is None:
        now = _boto3_now()

    # currently, this is just a helper method for run_job_flow()
    if cluster.get('_InstanceGroups'):
        raise NotImplementedError(
            "mock_boto3 doesn't support adding instance groups")

    new_igs = []  # don't update _InstanceGroups if there's an error

    roles = set()  # roles already handled

    for i, InstanceGroup in enumerate(InstanceGroups):
        _validate_param_type(InstanceGroup, dict)
        InstanceGroup = dict(InstanceGroup)  # going to pop params

        # our new mock instance group
        ig = dict(
            Configurations=[],
            EbsBlockDevices=[],
            Id='ig-FAKE',
            InstanceGroupType='',
            Market='ON_DEMAND',
            RequestedInstanceCount=0,
            RunningInstanceCount=0,
            ShrinkPolicy={},
            Status=dict(
                State='PROVISIONING',
                StateChangeReason=dict(Message=''),
                Timeline=dict(CreationDateTime=now),
            ),
        )

        # InstanceRole (required)
        _validate_param(InstanceGroup, 'InstanceRole',
                        ['MASTER', 'CORE', 'TASK'])
        role = InstanceGroup.pop('InstanceRole')

        # check for duplicate roles
        if role in roles:
            raise _error('Multiple %s instance groups supplied, you'
                         ' must specify exactly one %s instance group' %
                         (role.lower(), role.lower()))
        roles.add(role)

        ig['InstanceGroupType'] = role

        # InstanceType (required)
        _validate_param(InstanceGroup, 'InstanceType', string_types)

        # 3.x AMIs (but not 4.x, etc.) reject m1.small explicitly
        if (InstanceGroup.get('InstanceType') == 'm1.small' and
                cluster.get('RunningAmiVersion', '').startswith('3.')):
            raise _error(
                'm1.small instance type is not supported with AMI version'
                ' %s.' % cluster['RunningAmiVersion'])

        ig['InstanceType'] = InstanceGroup.pop('InstanceType')

        # InstanceCount (required)
        _validate_param(InstanceGroup, 'InstanceCount', integer_types)
        InstanceCount = InstanceGroup.pop('InstanceCount')
        if InstanceCount < 1:
            raise _error(
                'An instance group must have at least one instance')

        if role == 'MASTER' and InstanceCount != 1:
            raise _error(
                'A master instance group must specify a single instance')
        ig['RequestedInstanceCount'] = InstanceCount

        # Name
        if 'Name' in InstanceGroup:
            _validate_param(InstanceGroup, 'Name', string_types)
            ig['Name'] = InstanceGroup.pop('Name')

        # Market (default set above)
        if 'Market' in InstanceGroup:
            _validate_param(InstanceGroup, 'Market', string_types)
            if InstanceGroup['Market'] not in ('ON_DEMAND', 'SPOT'):
                # report the rejected market value (not the role)
                raise _error(
                    "1 validation error detected: value '%s' at"
                    " 'instances.instanceGroups.%d.member.market' failed"
                    " to satify constraint: Member must satisfy enum value"
                    " set: [SPOT, ON_DEMAND]" %
                    (InstanceGroup['Market'], i + 1))
            ig['Market'] = InstanceGroup.pop('Market')

        # BidPrice
        if 'BidPrice' in InstanceGroup:
            # not float, surprisingly
            _validate_param(InstanceGroup, 'BidPrice', string_types)

            if ig['Market'] != 'SPOT':
                raise _error('Attempted to set bid price for on demand'
                             ' instance group.')

            # simulate bid price validation
            BidPrice = InstanceGroup.pop('BidPrice')
            try:
                if not float(BidPrice) > 0:
                    raise _error('The bid price is negative or zero.')
            except (TypeError, ValueError):
                raise _error(
                    'The bid price supplied for an instance group is'
                    ' invalid')

            if '.' in BidPrice and len(BidPrice.split('.', 1)[1]) > 3:
                raise _error('No more than 3 digits are allowed after'
                             ' decimal place in bid price')

            ig['BidPrice'] = BidPrice

        # anything still left in InstanceGroup is something we don't handle
        if InstanceGroup:
            raise NotImplementedError(
                'mock_boto3 does not support these InstanceGroup'
                ' params: %s' % ', '.join(sorted(InstanceGroup)))

        new_igs.append(ig)

    # TASK roles require CORE roles (to host HDFS)
    if 'TASK' in roles and 'CORE' not in roles:
        raise _error(
            'Clusters with task nodes must also define core nodes.')

    # MASTER role is required
    if 'MASTER' not in roles:
        raise _error('Zero master instance groups supplied, you must'
                     ' specify exactly one master instance group')

    cluster['_InstanceGroups'].extend(new_igs)
def create_fake_clusters(self):
    """Set ``self.now`` and fill the mock EMR/S3 with one cluster for
    each situation we want to test (starting, bootstrapping, running,
    done, idle in various ways, pooled, and pending).
    """
    self.now = _boto3_now().replace(microsecond=0)
    self.add_mock_s3_data({'my_bucket': {}})

    def ago(**kwargs):
        # timestamp the given number of *hours*, *minutes*, etc. in the past
        return self.now - timedelta(**kwargs)

    def step(jar='/home/hadoop/contrib/streaming/hadoop-streaming.jar',
             args=self._DEFAULT_STEP_ARGS,
             state='COMPLETED',
             created=None,
             started=None,
             ended=None,
             name='Streaming Step',
             action_on_failure='TERMINATE_CLUSTER',
             **kwargs):
        # build a mock step; only timestamps that are set go into the
        # timeline. (*name* and any extra keyword args are accepted but
        # don't end up in the step.)
        timestamps = (
            ('CreationDateTime', created),
            ('StartDateTime', started),
            ('EndDateTime', ended),
        )
        timeline = dict((key, when) for key, when in timestamps if when)

        return {
            'Config': {
                'ActionOnFailure': action_on_failure,
                'Args': args,
                'Jar': jar,
            },
            'Status': {
                'State': state,
                'Timeline': timeline,
            },
        }

    def ran_for_two_hours(**overrides):
        # a step that started four hours ago and finished two hours ago
        return step(started=ago(hours=4), ended=ago(hours=2), **overrides)

    def debugging_step(**times):
        # the step EMR adds to set up hadoop debugging
        return step(jar='command-runner.jar',
                    name='Setup Hadoop Debugging',
                    args=['state-pusher-script'],
                    **times)

    def idle_timeline():
        # created six hours ago, ready 55 minutes after that
        return {
            'CreationDateTime': ago(hours=6),
            'ReadyDateTime': ago(hours=5, minutes=5),
        }

    def add_cluster(cluster_id, state, timeline, steps=None,
                    protected=False):
        # add a mock cluster; leave out _Steps entirely if *steps* is None
        cluster = {
            'Id': cluster_id,
            'TerminationProtected': protected,
            'Status': {
                'State': state,
                'Timeline': timeline,
            },
        }
        if steps is not None:
            cluster['_Steps'] = steps

        self.add_mock_emr_cluster(cluster)

    # empty job
    add_cluster('j-EMPTY', 'STARTING',
                {'CreationDateTime': ago(hours=10)})

    # job that's bootstrapping
    add_cluster('j-BOOTSTRAPPING', 'BOOTSTRAPPING',
                {'CreationDateTime': ago(hours=10)},
                steps=[step(created=ago(hours=10), state='PENDING')])

    # currently running job
    add_cluster('j-CURRENTLY_RUNNING', 'RUNNING',
                {'CreationDateTime': ago(hours=4, minutes=15),
                 'ReadyDateTime': ago(hours=4, minutes=10)},
                steps=[step(started=ago(hours=4), state='RUNNING')])

    # finished cluster
    add_cluster('j-DONE', 'TERMINATED',
                {'CreationDateTime': ago(hours=10),
                 'ReadyDateTime': ago(hours=8),
                 'EndDateTime': ago(hours=5)},
                steps=[step(started=ago(hours=8), ended=ago(hours=6))])

    # idle cluster
    add_cluster('j-DONE_AND_IDLE', 'WAITING', idle_timeline(),
                steps=[ran_for_two_hours()])

    # idle cluster with 4.x step format. should still be
    # recognizable as a streaming step
    add_cluster('j-DONE_AND_IDLE_4_X', 'WAITING', idle_timeline(),
                steps=[ran_for_two_hours(
                    jar='command-runner.jar',
                    args=['hadoop-streaming'] + self._DEFAULT_STEP_ARGS)])

    # idle cluster with an active lock
    add_cluster('j-IDLE_AND_LOCKED', 'WAITING', idle_timeline(),
                steps=[ran_for_two_hours()])
    self.add_mock_s3_data({
        'my_bucket': {
            'locks/j-IDLE_AND_LOCKED/2': b'not_you',
        },
    })

    # idle cluster with an expired lock
    add_cluster('j-IDLE_AND_EXPIRED', 'WAITING', idle_timeline(),
                steps=[ran_for_two_hours()])
    self.add_mock_s3_data({
        'my_bucket': {
            'locks/j-IDLE_AND_EXPIRED/2': b'not_you',
        },
    }, age=timedelta(minutes=5))

    # idle cluster whose only step has a start time but no end time
    add_cluster('j-IDLE_BUT_INCOMPLETE_STEPS', 'WAITING', idle_timeline(),
                steps=[step(started=ago(hours=4), end_hours_ago=None)])

    # custom hadoop streaming jar
    add_cluster('j-CUSTOM_DONE_AND_IDLE', 'WAITING', idle_timeline(),
                steps=[step(
                    started=ago(hours=4),
                    ended=ago(hours=4),
                    jar=('s3://my_bucket/tmp/somejob/files/'
                         'oddjob-0.0.3-SNAPSHOT-standalone.jar'),
                    args=[],
                )])

    # idle cluster, termination protected
    add_cluster('j-IDLE_AND_PROTECTED', 'WAITING', idle_timeline(),
                steps=[ran_for_two_hours()],
                protected=True)

    # hadoop debugging without any other steps
    add_cluster('j-DEBUG_ONLY', 'WAITING',
                {'CreationDateTime': ago(hours=3),
                 'ReadyDateTime': ago(hours=2, minutes=55)},
                steps=[
                    debugging_step(started=ago(hours=3),
                                   ended=ago(hours=2)),
                ])

    # hadoop debugging + actual job
    add_cluster('j-HADOOP_DEBUGGING', 'WAITING',
                {'CreationDateTime': ago(hours=6),
                 'ReadyDateTime': ago(hours=5, minutes=55)},
                steps=[
                    debugging_step(started=ago(hours=5),
                                   ended=ago(hours=4)),
                    ran_for_two_hours(),
                ])

    # should skip cancelled steps
    add_cluster('j-IDLE_AND_FAILED', 'WAITING', idle_timeline(),
                steps=[
                    step(started=ago(hours=4), ended=ago(hours=3),
                         state='FAILED'),
                    step(state='CANCELLED'),
                ])

    # pooled cluster reaching end of full hour
    self.add_mock_emr_cluster({
        '_BootstrapActions': [
            {'Args': [], 'Name': 'action 0'},
            {
                'Args': ['pool-0123456789abcdef0123456789abcdef',
                         'reflecting'],
                'Name': 'master',
            },
        ],
        'Id': 'j-POOLED',
        'TerminationProtected': False,
        'Status': {
            'State': 'WAITING',
            'Timeline': {
                'CreationDateTime': ago(minutes=55),
                'ReadyDateTime': ago(minutes=50),
            },
        },
        'Tags': [
            {'Key': '__mrjob_pool_name',
             'Value': 'reflecting'},
            {'Key': '__mrjob_pool_hash',
             'Value': '0123456789abcdef0123456789abcdef'},
        ],
    })

    # cluster that has had pending jobs but hasn't run them
    add_cluster('j-PENDING_BUT_IDLE', 'RUNNING',
                {'CreationDateTime': ago(hours=3),
                 'ReadyDateTime': ago(hours=2, minutes=50)},
                steps=[step(created=ago(hours=3), state='PENDING')])