def parse_hadoop_counters_from_line(line, hadoop_version=None):
    """Parse Hadoop counter values from a log line.

    The counter log line format changed significantly between Hadoop
    0.18 and 0.20, so this function switches between parsers for them.

    :param line: log line containing counter data
    :type line: str
    :return: (counter_dict, step_num) or (None, None)
    """
    # Hadoop 2.x (and the 0.21 <= version < 1 series) log counters as
    # JSON; try that parser first, and also when version is unknown.
    uses_2_0_format = (
        hadoop_version is None or
        version_gte(hadoop_version, '2') or
        (version_gte(hadoop_version, '0.21') and
         not version_gte(hadoop_version, '1')))

    if uses_2_0_format:
        counters, step_num = _parse_counters_from_line_2_0(line)

        # if we found something, or the caller pinned a version, we're done
        if counters or hadoop_version:
            return counters, step_num

    match = _COUNTER_LINE_RE.match(line)
    if not match:
        return None, None

    if hadoop_version is None:
        # version unknown: try the 0.20 format, then fall back to 0.18
        counters_0_20, step_num_0_20 = parse_hadoop_counters_from_line(
            line, '0.20')
        if counters_0_20:
            return (counters_0_20, step_num_0_20)
        return parse_hadoop_counters_from_line(line, '0.18')

    if uses_020_counters(hadoop_version):
        parse_counters = _parse_counters_0_20
    else:
        parse_counters = _parse_counters_0_18

    # accumulate counter values by group, then by counter name
    counters = {}
    for group, counter, value in parse_counters(match.group('counters')):
        group_counters = counters.setdefault(group, {})
        group_counters[counter] = group_counters.get(counter, 0) + int(value)

    return counters, int(match.group('step_num'))
def _get_bucket(s3_conn, bucket_name):
    """Look up *bucket_name* via ``s3_conn.get_bucket()``.

    Validation is only requested on boto >= 2.25.0, which can validate
    buckets cheaply using HEAD requests (see Issue #865).
    """
    can_validate = version_gte(boto.Version, '2.25.0')
    return s3_conn.get_bucket(bucket_name, validate=can_validate)
def mkdir(self, path):
    """Create *path* on HDFS via ``hadoop fs -mkdir``.

    :raises IOError: if the hadoop command fails.
    """
    # -p (create intermediate dirs) is only passed on Hadoop 2.x+,
    # where the code expects it to be supported
    if version_gte(self.get_hadoop_version(), '2.0.0'):
        args = ['fs', '-mkdir', '-p', path]
    else:
        args = ['fs', '-mkdir', path]

    try:
        # "already exists" messages on stderr are tolerated
        self.invoke_hadoop(args, ok_stderr=[HADOOP_FILE_EXISTS_RE])
    except CalledProcessError:
        raise IOError('Could not mkdir %s' % path)
def run_job_flow(self, **kwargs):
    """Mock of the EMR ``RunJobFlow`` API call.

    Validates and pops each supported parameter from *kwargs*, builds a
    mock cluster description, stores it in ``self.mock_emr_clusters``,
    and raises :py:class:`NotImplementedError` for any leftover params.

    :return: dict with the new cluster's ``JobFlowId``
    """
    # going to pop params from kwargs as we process them, and raise
    # NotImplementedError at the end if any params are left
    now = kwargs.pop('_Now', _boto3_now())

    # our newly created cluster, as described by describe_cluster(), plus:
    #
    # _BootstrapActions: as described by list_bootstrap_actions()
    # _InstanceGroups: as described by list_instance_groups()
    # _Steps: as described by list_steps(), but not reversed
    #
    # TODO: at some point when we implement instance fleets,
    # _InstanceGroups will become optional
    cluster = dict(
        _BootstrapActions=[],
        _InstanceGroups=[],
        _Steps=[],
        Applications=[],
        AutoTerminate=True,
        Configurations=[],
        Ec2InstanceAttributes=dict(
            EmrManagedMasterSecurityGroup='sg-mockmaster',
            EmrManagedSlaveSecurityGroup='sg-mockslave',
            IamInstanceProfile='',
        ),
        Id='j-MOCKCLUSTER%d' % len(self.mock_emr_clusters),
        Name='',
        NormalizedInstanceHours=0,
        ScaleDownBehavior='TERMINATE_AT_TASK_COMPLETION',
        ServiceRole='',
        Status=dict(
            State='STARTING',
            StateChangeReason={},
            Timeline=dict(CreationDateTime=now),
        ),
        Tags=[],
        TerminationProtected=False,
        VisibleToAllUsers=False,
    )

    # shorthand for raising a RunJobFlow validation error
    def _error(message):
        return _ValidationException('RunJobFlow', message)

    # Name (required)
    _validate_param(kwargs, 'Name', string_types)
    cluster['Name'] = kwargs.pop('Name')

    # LogUri (optional)
    if 'LogUri' in kwargs:
        _validate_param(kwargs, 'LogUri', string_types)
        cluster['LogUri'] = kwargs.pop('LogUri')

    # JobFlowRole and ServiceRole (required)
    _validate_param(kwargs, 'JobFlowRole', string_types)
    cluster['Ec2InstanceAttributes']['IamInstanceProfile'] = kwargs.pop(
        'JobFlowRole')

    if 'ServiceRole' not in kwargs:  # required by API, not boto3
        raise _error('ServiceRole is required for creating cluster.')
    _validate_param(kwargs, 'ServiceRole', string_types)
    cluster['ServiceRole'] = kwargs.pop('ServiceRole')

    # AmiVersion and ReleaseLabel: validate types first, then require
    # exactly one of them below
    for version_param in ('AmiVersion', 'ReleaseLabel'):
        if version_param in kwargs:
            _validate_param(kwargs, version_param, string_types)

    if 'AmiVersion' in kwargs:
        if 'ReleaseLabel' in kwargs:
            raise _error(
                'Only one AMI version and release label may be specified.'
                ' Provided AMI: %s, release label: %s.' % (
                    kwargs['AmiVersion'], kwargs['ReleaseLabel']))

        AmiVersion = kwargs.pop('AmiVersion')

        # resolve aliases like 'latest' to a concrete version
        running_ami_version = AMI_VERSION_ALIASES.get(
            AmiVersion, AmiVersion)

        # 4.x+ must be specified via ReleaseLabel; < 2.x is rejected
        if version_gte(running_ami_version, '4'):
            raise _error('The supplied ami version is invalid.')
        elif not version_gte(running_ami_version, '2'):
            raise _error(
                'Job flow role is not compatible with the supplied'
                ' AMI version')

        cluster['RequestedAmiVersion'] = AmiVersion
        cluster['RunningAmiVersion'] = running_ami_version

    elif 'ReleaseLabel' in kwargs:
        ReleaseLabel = kwargs.pop('ReleaseLabel')
        # NOTE(review): lstrip('emr-') strips *characters*, not the
        # prefix; fine for labels like 'emr-5.8.0' since versions start
        # with a digit, but worth confirming this is intended
        running_ami_version = ReleaseLabel.lstrip('emr-')

        # release labels only exist for 4.x and later
        if not version_gte(running_ami_version, '4'):
            raise _error('The supplied release label is invalid: %s.' %
                         ReleaseLabel)

        cluster['ReleaseLabel'] = ReleaseLabel
    else:
        # note: you can't actually set Hadoop version through boto3
        raise _error('Must specify exactly one of the following:'
                     ' release label, AMI version, or Hadoop version.')

    # Applications
    hadoop_version = map_version(running_ami_version,
                                 AMI_HADOOP_VERSION_UPDATES)

    if version_gte(running_ami_version, '4'):
        application_names = set(
            a['Name'] for a in kwargs.pop('Applications', []))

        # if Applications is set but doesn't include Hadoop, the
        # cluster description won't either! (Even though Hadoop is
        # in fact installed.)
        if not application_names:
            application_names = set(['Hadoop'])

        for app_name in sorted(application_names):
            if app_name == 'Hadoop':
                version = hadoop_version
            else:
                version = DUMMY_APPLICATION_VERSION

            cluster['Applications'].append(
                dict(Name=app_name, Version=version))
    else:
        if kwargs.get('Applications'):
            raise _error(
                'Cannot specify applications when AMI version is used.'
                ' Specify supported products or new supported products'
                ' instead.')

        # 'hadoop' is lowercase if AmiVersion specified
        cluster['Applications'].append(
            dict(Name='hadoop', Version=hadoop_version))

    # Configurations (only allowed with a release label / 4.x+)
    if 'Configurations' in kwargs:
        _validate_param(kwargs, 'Configurations', (list, tuple))

        if kwargs['Configurations'] and not version_gte(
                running_ami_version, '4'):
            raise _ValidationException(
                'RunJobFlow',
                'Cannot specify configurations when AMI version is used.')

        cluster['Configurations'] = _normalized_configurations(
            kwargs.pop('Configurations'))

    # VisibleToAllUsers
    if 'VisibleToAllUsers' in kwargs:
        _validate_param(kwargs, 'VisibleToAllUsers', bool)
        cluster['VisibleToAllUsers'] = kwargs.pop('VisibleToAllUsers')

    # pass BootstrapActions off to helper
    if 'BootstrapActions' in kwargs:
        self._add_bootstrap_actions(
            'RunJobFlow', kwargs.pop('BootstrapActions'), cluster)

    # pass Instances (required) off to helper
    _validate_param(kwargs, 'Instances')
    self._add_instances(
        'RunJobFlow', kwargs.pop('Instances'), cluster, now=now)

    # pass Steps off to helper
    if 'Steps' in kwargs:
        self._add_steps('RunJobFlow', kwargs.pop('Steps'), cluster)

    # pass Tags off to helper
    if 'Tags' in kwargs:
        self._add_tags('RunJobFlow', kwargs.pop('Tags'), cluster)

    # save AdditionalInfo
    if 'AdditionalInfo' in kwargs:
        cluster['_AdditionalInfo'] = kwargs.pop('AdditionalInfo')

    # catch extra params
    if kwargs:
        raise NotImplementedError(
            'mock RunJobFlow does not support these parameters: %s' %
            ', '.join(sorted(kwargs)))

    self.mock_emr_clusters[cluster['Id']] = cluster

    return dict(JobFlowId=cluster['Id'])