Example #1
File: parse.py  Project: kartheek6/mrjob
def parse_hadoop_counters_from_line(line, hadoop_version=None):
    """Parse Hadoop counter values from a log line.

    The counter log line format changed significantly between Hadoop 0.18 and
    0.20, so this function switches between parsers for them.

    :param line: log line containing counter data
    :type line: str

    :return: (counter_dict, step_num) or (None, None)
    """
    # start with 2.x parsing, which parses the entire line as JSON
    if (hadoop_version is None or
        version_gte(hadoop_version, '2') or
        (version_gte(hadoop_version, '0.21') and
         not version_gte(hadoop_version, '1'))):

        counters, step_num = _parse_counters_from_line_2_0(line)

        # if we found something, or if hadoop_version isn't None, return it
        if counters or hadoop_version:
            return counters, step_num

    m = _COUNTER_LINE_RE.match(line)
    if not m:
        return None, None

    if hadoop_version is None:
        # try both if hadoop_version not specified
        counters_1, step_num_1 = parse_hadoop_counters_from_line(line, '0.20')
        if counters_1:
            return (counters_1, step_num_1)
        else:
            return parse_hadoop_counters_from_line(line, '0.18')

    if uses_020_counters(hadoop_version):
        parse_func = _parse_counters_0_20
    else:
        parse_func = _parse_counters_0_18

    counter_substring = m.group('counters')

    counters = {}
    for group, counter, value in parse_func(counter_substring):
        counters.setdefault(group, {})
        counters[group].setdefault(counter, 0)
        counters[group][counter] += int(value)
    return counters, int(m.group('step_num'))
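
All of these examples branch on version_gte(), which takes two dotted version strings and returns True when the first is at least the second. The helper itself is not shown on this page; as a rough mental model only (a minimal sketch assuming purely numeric components, which mrjob's actual implementation may handle more carefully), it behaves like this:

def version_gte(version, cmp_version_str):
    """Sketch: return True if version >= cmp_version_str.

    Compares numeric components left to right; suffixes such as 'b4' or
    '-amzn-2' are not handled here, though the real helper may accept them.
    """
    def to_tuple(v):
        return tuple(int(part) for part in v.split('.'))

    return to_tuple(version) >= to_tuple(cmp_version_str)

# version_gte('2.7.1', '2')      -> True  (Hadoop 2.x branch above)
# version_gte('0.20.205', '1')   -> False (falls through to 0.18/0.20 parsing)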
Example #2
File: s3.py  Project: cad106uk/mrjob
def _get_bucket(s3_conn, bucket_name):
    """Wrapper for s3_conn.get_bucket().

    This only validates buckets in boto >= 2.25.0, which features quick
    validation using HEAD requests (see Issue #865).
    """
    return s3_conn.get_bucket(bucket_name,
                              validate=version_gte(boto.Version, '2.25.0'))
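
A hypothetical call site for this wrapper (the connection setup, bucket name, and prefix below are illustrative, not taken from mrjob):

import boto

s3_conn = boto.connect_s3()                # credentials picked up from the environment
bucket = _get_bucket(s3_conn, 'walrus')    # validated via HEAD only on boto >= 2.25.0
for key in bucket.list(prefix='logs/'):
    print(key.name)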
Example #3
    def mkdir(self, path):
        args = ['fs', '-mkdir']
        if version_gte(self.get_hadoop_version(), "2.0.0"):
            args.append("-p")
        args.append(path)
        try:
            self.invoke_hadoop(args, ok_stderr=[HADOOP_FILE_EXISTS_RE])
        except CalledProcessError:
            raise IOError("Could not mkdir %s" % path)
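
For reference, the command this method builds is roughly `hadoop fs -mkdir -p <path>` on Hadoop 2.x (where -p creates missing parent directories) and `hadoop fs -mkdir <path>` on earlier versions. A hedged usage sketch, assuming a filesystem object fs with a configured Hadoop binary:

fs.mkdir('hdfs:///user/hadoop/output')
# On Hadoop >= 2.0.0 this invokes:  hadoop fs -mkdir -p hdfs:///user/hadoop/output
# On older versions it invokes:     hadoop fs -mkdir hdfs:///user/hadoop/output
# A directory that already exists is presumably tolerated, since stderr matching
# HADOOP_FILE_EXISTS_RE is passed as ok_stderr to invoke_hadoop().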
Example #4
File: emr.py  Project: iyangming/mrjob
    def run_job_flow(self, **kwargs):
        # going to pop params from kwargs as we process them, and raise
        # NotImplementedError at the end if any params are left
        now = kwargs.pop('_Now', _boto3_now())

        # our newly created cluster, as described by describe_cluster(), plus:
        #
        # _BootstrapActions: as described by list_bootstrap_actions()
        # _InstanceGroups: as described by list_instance_groups()
        # _Steps: as described by list_steps(), but not reversed
        #
        # TODO: at some point when we implement instance fleets,
        # _InstanceGroups will become optional
        cluster = dict(
            _BootstrapActions=[],
            _InstanceGroups=[],
            _Steps=[],
            Applications=[],
            AutoTerminate=True,
            Configurations=[],
            Ec2InstanceAttributes=dict(
                EmrManagedMasterSecurityGroup='sg-mockmaster',
                EmrManagedSlaveSecurityGroup='sg-mockslave',
                IamInstanceProfile='',
            ),
            Id='j-MOCKCLUSTER%d' % len(self.mock_emr_clusters),
            Name='',
            NormalizedInstanceHours=0,
            ScaleDownBehavior='TERMINATE_AT_TASK_COMPLETION',
            ServiceRole='',
            Status=dict(
                State='STARTING',
                StateChangeReason={},
                Timeline=dict(CreationDateTime=now),
            ),
            Tags=[],
            TerminationProtected=False,
            VisibleToAllUsers=False,
        )

        def _error(message):
            return _ValidationException('RunJobFlow', message)

        # Name (required)
        _validate_param(kwargs, 'Name', string_types)
        cluster['Name'] = kwargs.pop('Name')

        # LogUri
        if 'LogUri' in kwargs:
            _validate_param(kwargs, 'LogUri', string_types)
            cluster['LogUri'] = kwargs.pop('LogUri')

        # JobFlowRole and ServiceRole (required)
        _validate_param(kwargs, 'JobFlowRole', string_types)
        cluster['Ec2InstanceAttributes']['IamInstanceProfile'] = kwargs.pop(
            'JobFlowRole')

        if 'ServiceRole' not in kwargs:  # required by API, not boto3
            raise _error('ServiceRole is required for creating cluster.')
        _validate_param(kwargs, 'ServiceRole', string_types)
        cluster['ServiceRole'] = kwargs.pop('ServiceRole')

        # AmiVersion and ReleaseLabel
        for version_param in ('AmiVersion', 'ReleaseLabel'):
            if version_param in kwargs:
                _validate_param(kwargs, version_param, string_types)

        if 'AmiVersion' in kwargs:
            if 'ReleaseLabel' in kwargs:
                raise _error(
                    'Only one AMI version and release label may be specified.'
                    ' Provided AMI: %s, release label: %s.' %
                    (kwargs['AmiVersion'], kwargs['ReleaseLabel']))

            AmiVersion = kwargs.pop('AmiVersion')

            running_ami_version = AMI_VERSION_ALIASES.get(
                AmiVersion, AmiVersion)

            if version_gte(running_ami_version, '4'):
                raise _error('The supplied ami version is invalid.')
            elif not version_gte(running_ami_version, '2'):
                raise _error(
                    'Job flow role is not compatible with the supplied'
                    ' AMI version')

            cluster['RequestedAmiVersion'] = AmiVersion
            cluster['RunningAmiVersion'] = running_ami_version

        elif 'ReleaseLabel' in kwargs:
            ReleaseLabel = kwargs.pop('ReleaseLabel')
            running_ami_version = ReleaseLabel.lstrip('emr-')

            if not version_gte(running_ami_version, '4'):
                raise _error('The supplied release label is invalid: %s.' %
                             ReleaseLabel)

            cluster['ReleaseLabel'] = ReleaseLabel
        else:
            # note: you can't actually set Hadoop version through boto3
            raise _error('Must specify exactly one of the following:'
                         ' release label, AMI version, or Hadoop version.')

        # Applications
        hadoop_version = map_version(running_ami_version,
                                     AMI_HADOOP_VERSION_UPDATES)

        if version_gte(running_ami_version, '4'):
            application_names = set(a['Name']
                                    for a in kwargs.pop('Applications', []))

            # if Applications is set but doesn't include Hadoop, the
            # cluster description won't either! (Even though Hadoop is
            # in fact installed.)
            if not application_names:
                application_names = set(['Hadoop'])

            for app_name in sorted(application_names):
                if app_name == 'Hadoop':
                    version = hadoop_version
                else:
                    version = DUMMY_APPLICATION_VERSION

                cluster['Applications'].append(
                    dict(Name=app_name, Version=version))
        else:
            if kwargs.get('Applications'):
                raise _error(
                    'Cannot specify applications when AMI version is used.'
                    ' Specify supported products or new supported products'
                    ' instead.')

            # 'hadoop' is lowercase if AmiVersion specified
            cluster['Applications'].append(
                dict(Name='hadoop', Version=hadoop_version))

        # Configurations
        if 'Configurations' in kwargs:
            _validate_param(kwargs, 'Configurations', (list, tuple))

            if kwargs['Configurations'] and not version_gte(
                    running_ami_version, '4'):
                raise _ValidationException(
                    'RunJobFlow',
                    'Cannot specify configurations when AMI version is used.')

            cluster['Configurations'] = _normalized_configurations(
                kwargs.pop('Configurations'))

        # VisibleToAllUsers
        if 'VisibleToAllUsers' in kwargs:
            _validate_param(kwargs, 'VisibleToAllUsers', bool)
            cluster['VisibleToAllUsers'] = kwargs.pop('VisibleToAllUsers')

        # pass BootstrapActions off to helper
        if 'BootstrapActions' in kwargs:
            self._add_bootstrap_actions('RunJobFlow',
                                        kwargs.pop('BootstrapActions'),
                                        cluster)

        # pass Instances (required) off to helper
        _validate_param(kwargs, 'Instances')
        self._add_instances('RunJobFlow',
                            kwargs.pop('Instances'),
                            cluster,
                            now=now)

        # pass Steps off to helper
        if 'Steps' in kwargs:
            self._add_steps('RunJobFlow', kwargs.pop('Steps'), cluster)

        # pass Tags off to helper
        if 'Tags' in kwargs:
            self._add_tags('RunJobFlow', kwargs.pop('Tags'), cluster)

        # save AdditionalInfo
        if 'AdditionalInfo' in kwargs:
            cluster['_AdditionalInfo'] = kwargs.pop('AdditionalInfo')

        # catch extra params
        if kwargs:
            raise NotImplementedError(
                'mock RunJobFlow does not support these parameters: %s' %
                ', '.join(sorted(kwargs)))

        self.mock_emr_clusters[cluster['Id']] = cluster

        return dict(JobFlowId=cluster['Id'])
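
Pieced together from the validation above, a minimal call this mock would accept looks roughly like the following. The client constructor name and the exact Instances payload are assumptions (Instances is handed off to an _add_instances() helper that is not shown here); the field names in Instances are standard boto3 RunJobFlow parameters:

client = MockEMRClient()   # hypothetical constructor for the mock EMR client
resp = client.run_job_flow(
    Name='mock-cluster',
    JobFlowRole='EMR_EC2_DefaultRole',   # stored as Ec2InstanceAttributes.IamInstanceProfile
    ServiceRole='EMR_DefaultRole',       # required by the API, though not by boto3 itself
    ReleaseLabel='emr-5.8.0',            # or AmiVersion, but not both
    Instances=dict(                      # validated by _add_instances(), not shown here
        MasterInstanceType='m4.large',
        InstanceCount=1,
        KeepJobFlowAliveWhenNoSteps=False,
    ),
)
print(resp['JobFlowId'])   # first cluster gets Id 'j-MOCKCLUSTER0'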