Example #1
    def _args_for_jar_step(self, step_num):
        step = self._get_step(step_num)

        # special case for consistency with EMR runner.
        #
        # This might look less like duplicated code if we ever
        # implement #780 (fetching jars from URIs)
        if step['jar'].startswith('file:///'):
            jar = step['jar'][7:]  # keep leading slash
        else:
            jar = step['jar']

        args = (self._opts['hadoop_bin'] + ['jar', jar])

        if step.get('main_class'):
            args.append(step['main_class'])

        # TODO: merge with logic in mrjob/emr.py
        def interpolate(arg):
            if arg == mrjob.step.JarStep.INPUT:
                return ','.join(self._hdfs_step_input_files(step_num))
            elif arg == mrjob.step.JarStep.OUTPUT:
                return self._hdfs_step_output_dir(step_num)
            else:
                return arg

        if step.get('args'):
            args.extend(interpolate(arg) for arg in step['args'])

        return args
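
A quick standalone check of the slicing above (not from mrjob): len('file://') is 7, so dropping the first seven characters of a file:/// URI leaves the absolute path with its leading slash intact. The jar path below is invented for illustration.

    # standalone sketch: why [7:] keeps the leading slash
    jar_uri = 'file:///usr/lib/hadoop/hadoop-streaming.jar'  # hypothetical path
    assert len('file://') == 7
    assert jar_uri[7:] == '/usr/lib/hadoop/hadoop-streaming.jar'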
Example #2
File: hadoop.py  Project: ZhouYunan/mrjob
    def _args_for_jar_step(self, step_num):
        step = self._get_step(step_num)

        # special case for consistency with EMR runner.
        #
        # This might look less like duplicated code if we ever
        # implement #780 (fetching jars from URIs)
        if step["jar"].startswith("file:///"):
            jar = step["jar"][7:]  # keep leading slash
        else:
            jar = step["jar"]

        args = self._opts["hadoop_bin"] + ["jar", jar]

        if step.get("main_class"):
            args.append(step["main_class"])

        # TODO: merge with logic in mrjob/emr.py
        def interpolate(arg):
            if arg == mrjob.step.JarStep.INPUT:
                return ",".join(self._hdfs_step_input_files(step_num))
            elif arg == mrjob.step.JarStep.OUTPUT:
                return self._hdfs_step_output_dir(step_num)
            else:
                return arg

        if step.get("args"):
            args.extend(interpolate(arg) for arg in step["args"])

        return args
Example #3
    def _spark_submit_args(self, step_num):
        """Build a list of extra args to the spark-submit binary for
        the given spark or spark_script step."""
        step = self._get_step(step_num)

        if not _is_spark_step_type(step['type']):
            raise TypeError('non-Spark step: %r' % step)

        args = []

        # add --master
        if self._spark_master():
            args.extend(['--master', self._spark_master()])

        # add --deploy-mode
        if self._spark_deploy_mode():
            args.extend(['--deploy-mode', self._spark_deploy_mode()])

        # add --class (JAR steps)
        if step.get('main_class'):
            args.extend(['--class', step['main_class']])

        # add --jars, if any
        libjar_paths = self._libjar_paths()
        if libjar_paths:
            args.extend(['--jars', ','.join(libjar_paths)])

        # --conf arguments include python bin, cmdenv, jobconf. Make sure
        # that we can always override these manually
        jobconf = {}
        for key, value in self._spark_cmdenv(step_num).items():
            jobconf['spark.executorEnv.%s' % key] = value
            jobconf['spark.yarn.appMasterEnv.%s' % key] = value

        jobconf.update(self._jobconf_for_step(step_num))

        for key, value in sorted(jobconf.items()):
            args.extend(['--conf', '%s=%s' % (key, value)])

        # --files and --archives
        args.extend(self._spark_upload_args())

        # --py-files (Python only)
        if step['type'] in ('spark', 'spark_script'):
            py_file_uris = self._upload_uris(self._py_files())
            if py_file_uris:
                args.extend(['--py-files', ','.join(py_file_uris)])

        # spark_args option
        args.extend(self._opts['spark_args'])

        # step spark_args
        if step.get('spark_args'):
            args.extend(step['spark_args'])

        return args
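
A minimal sketch of how the loop above turns cmdenv entries into --conf switches; the logic mirrors the example, but the environment variable and its value are made up for illustration.

    # hedged sketch: how one cmdenv entry becomes two --conf switches
    cmdenv = {'PYSPARK_PYTHON': '/usr/bin/python3'}  # illustrative value
    jobconf = {}
    for key, value in cmdenv.items():
        jobconf['spark.executorEnv.%s' % key] = value
        jobconf['spark.yarn.appMasterEnv.%s' % key] = value

    args = []
    for key, value in sorted(jobconf.items()):
        args.extend(['--conf', '%s=%s' % (key, value)])

    # args == ['--conf', 'spark.executorEnv.PYSPARK_PYTHON=/usr/bin/python3',
    #          '--conf', 'spark.yarn.appMasterEnv.PYSPARK_PYTHON=/usr/bin/python3']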
Example #4
File: bin.py  Project: Affirm/mrjob
    def _substep_args(self, step_num, mrc):
        step = self._get_step(step_num)

        if step[mrc]['type'] == 'command':
            cmd = step[mrc]['command']

            # never wrap custom hadoop streaming commands in bash
            if isinstance(cmd, string_types):
                return shlex_split(cmd)
            else:
                return cmd

        elif step[mrc]['type'] == 'script':
            script_args = self._script_args_for_step(
                step_num, mrc, input_manifest=step.get('input_manifest'))

            if 'pre_filter' in step[mrc]:
                return self._sh_wrap(
                    '%s | %s' % (step[mrc]['pre_filter'],
                                 cmd_line(script_args)))
            else:
                return script_args
        else:
            raise ValueError("Invalid %s step %d: %r" % (
                mrc, step_num, step[mrc]))
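
A small illustration of the string-versus-list branch above; the standard library's shlex.split stands in here for mrjob's shlex_split helper, which is assumed to tokenize this input the same way.

    # sketch: a 'command' given as a string is tokenized; a list passes through
    import shlex

    cmd = "grep -v '^#'"              # hypothetical command string with quoting
    assert shlex.split(cmd) == ['grep', '-v', '^#']

    cmd_list = ['grep', '-v', '^#']   # already a list: returned unchanged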
Example #5
    def _check_steps(self, steps):
        """Raise an exception if there's something wrong with the step
        definition."""
        for step_num, step in enumerate(steps):
            if step['type'] not in STEP_TYPES:
                raise ValueError('unexpected step type %r in steps %r' %
                                 (step['type'], steps))

            if step.get('input_manifest') and step_num != 0:
                raise ValueError('only first step may take an input manifest')
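
A hedged illustration of the second check: an input_manifest flag on any step other than the first is rejected. The step dicts are minimal stand-ins, not complete mrjob step descriptions.

    # sketch: input_manifest is only allowed on step 0
    steps = [
        {'type': 'streaming'},
        {'type': 'streaming', 'input_manifest': True},  # not the first step
    ]
    try:
        for step_num, step in enumerate(steps):
            if step.get('input_manifest') and step_num != 0:
                raise ValueError('only first step may take an input manifest')
    except ValueError as ex:
        print(ex)  # only first step may take an input manifest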
Example #6
File: bin.py  Project: okomestudio/mrjob
    def _spark_submit_args(self, step_num):
        """Build a list of extra args to the spark-submit binary for
        the given spark or spark_script step."""
        step = self._get_step(step_num)

        if not _is_spark_step_type(step['type']):
            raise TypeError('non-Spark step: %r' % step)

        args = []

        # add runner-specific args
        args.extend(self._spark_submit_arg_prefix())

        # add --class (JAR steps)
        if step.get('main_class'):
            args.extend(['--class', step['main_class']])

        # add --jars, if any
        libjar_paths = self._libjar_paths()
        if libjar_paths:
            args.extend(['--jars', ','.join(libjar_paths)])

        # --conf arguments include python bin, cmdenv, jobconf. Make sure
        # that we can always override these manually
        jobconf = {}
        for key, value in self._spark_cmdenv(step_num).items():
            jobconf['spark.executorEnv.%s' % key] = value
            jobconf['spark.yarn.appMasterEnv.%s' % key] = value

        jobconf.update(self._jobconf_for_step(step_num))

        for key, value in sorted(jobconf.items()):
            if value is not None:
                args.extend(['--conf', '%s=%s' % (key, value)])

        # --files and --archives
        args.extend(self._spark_upload_args())

        # --py-files (Python only)
        if step['type'] in ('spark', 'spark_script'):
            py_files_arg = ','.join(self._spark_py_files())
            if py_files_arg:
                args.extend(['--py-files', py_files_arg])

        # spark_args option
        args.extend(self._opts['spark_args'])

        # step spark_args
        args.extend(step['spark_args'])

        return args
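
This variant skips conf entries whose value is None rather than rendering them as 'key=None'. A standalone sketch of that filter, with invented values:

    # sketch: None-valued jobconf entries are dropped from --conf
    jobconf = {
        'spark.executor.memory': '2g',            # illustrative value
        'spark.dynamicAllocation.enabled': None,  # explicitly unset
    }
    args = []
    for key, value in sorted(jobconf.items()):
        if value is not None:
            args.extend(['--conf', '%s=%s' % (key, value)])

    # args == ['--conf', 'spark.executor.memory=2g']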
Example #7
    def _jobconf_for_step(self, step_num):
        """Get the jobconf dictionary, optionally including step-specific
        jobconf info.

        Also translate jobconfs to the current Hadoop version, if necessary.
        """

        step = self._get_step(step_num)

        # _sort_values_jobconf() isn't relevant to Spark,
        # but it doesn't do any harm either

        jobconf = combine_dicts(self._sort_values_jobconf(),
                                self._opts['jobconf'], step.get('jobconf'))

        # if user is using the wrong jobconfs, add in the correct ones
        # and log a warning
        hadoop_version = self.get_hadoop_version()
        if hadoop_version:
            jobconf = translate_jobconf_dict(jobconf, hadoop_version)

        return jobconf
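
The merge order above means step-level jobconf overrides the runner-wide jobconf option, which in turn overrides the sort-values defaults. A standalone sketch of that precedence, assuming (as mrjob's combine_dicts is documented to behave) that later dictionaries win:

    # sketch of the precedence, with a stand-in for mrjob's combine_dicts
    def combine_dicts_sketch(*dicts):
        combined = {}
        for d in dicts:
            if d:
                combined.update(d)  # later dicts take precedence
        return combined

    merged = combine_dicts_sketch(
        {'mapreduce.job.reduces': '1'},  # runner-wide jobconf opt
        {'mapreduce.job.reduces': '4'},  # step-specific jobconf
    )
    assert merged == {'mapreduce.job.reduces': '4'}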
Example #8
    def _spark_submit_args(self, step_num):
        """Build a list of extra args to the spark-submit binary for
        the given spark or spark_script step."""
        step = self._get_step(step_num)

        args = []

        # --conf arguments include python bin, cmdenv, jobconf. Make sure
        # that we can always override these manually
        jobconf = {}
        for key, value in self._spark_cmdenv(step_num).items():
            jobconf['spark.executorEnv.%s' % key] = value
            if self._spark_master() == 'yarn':  # YARN only, see #1919
                jobconf['spark.yarn.appMasterEnv.%s' % key] = value

        jobconf.update(self._jobconf_for_step(step_num))

        for key, value in sorted(jobconf.items()):
            args.extend(['--conf', '%s=%s' % (key, value)])

        # add --class (JAR steps)
        if step.get('main_class'):
            args.extend(['--class', step['main_class']])

        # add --jars, if any
        libjar_paths = self._libjar_paths()
        if libjar_paths:
            args.extend(['--jars', ','.join(libjar_paths)])

        # spark-submit treats --master and --deploy-mode as aliases for
        # --conf spark.master=... and --conf spark.deploy-mode=... (see #2032).
        #
        # we never want jobconf to override spark master or deploy mode, so put
        # these switches after --conf

        # add --master
        if self._spark_master():
            args.extend(['--master', self._spark_master()])

        # add --deploy-mode
        if self._spark_deploy_mode():
            args.extend(['--deploy-mode', self._spark_deploy_mode()])

        # --files and --archives
        args.extend(self._spark_upload_args())

        # --py-files (Python only)
        # spark runner can run 'streaming' steps, so just exclude
        # non-Python steps
        if 'jar' not in step['type']:
            py_file_uris = self._py_files()

            if self._upload_mgr:
                # don't assume py_files are in _upload_mgr; for example,
                # spark-submit doesn't need to upload them
                path_to_uri = self._upload_mgr.path_to_uri()
                py_file_uris = [path_to_uri.get(p, p) for p in py_file_uris]

            if py_file_uris:
                args.extend(['--py-files', ','.join(py_file_uris)])

        # spark_args option
        args.extend(self._opts['spark_args'])

        # step spark_args
        if step.get('spark_args'):
            args.extend(step['spark_args'])

        return args
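
A small sketch of the path_to_uri.get(p, p) fallback above: paths the upload manager knows about are swapped for their uploaded URIs, and anything else passes through unchanged. All paths and URIs here are invented for illustration.

    # sketch: known local paths map to uploaded URIs; unknown entries pass through
    path_to_uri = {'/home/user/deps.zip': 'hdfs:///tmp/mrjob/files/deps.zip'}
    py_files = ['/home/user/deps.zip', 's3://bucket/already-remote.zip']

    py_file_uris = [path_to_uri.get(p, p) for p in py_files]
    # -> ['hdfs:///tmp/mrjob/files/deps.zip', 's3://bucket/already-remote.zip']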
Example #9
File: bin.py  Project: Affirm/mrjob
    def _spark_submit_args(self, step_num):
        """Build a list of extra args to the spark-submit binary for
        the given spark or spark_script step."""
        step = self._get_step(step_num)

        args = []

        # add --master
        if self._spark_master():
            args.extend(['--master', self._spark_master()])

        # add --deploy-mode
        if self._spark_deploy_mode():
            args.extend(['--deploy-mode', self._spark_deploy_mode()])

        # add --class (JAR steps)
        if step.get('main_class'):
            args.extend(['--class', step['main_class']])

        # add --jars, if any
        libjar_paths = self._libjar_paths()
        if libjar_paths:
            args.extend(['--jars', ','.join(libjar_paths)])

        # --conf arguments include python bin, cmdenv, jobconf. Make sure
        # that we can always override these manually
        jobconf = {}
        for key, value in self._spark_cmdenv(step_num).items():
            jobconf['spark.executorEnv.%s' % key] = value
            jobconf['spark.yarn.appMasterEnv.%s' % key] = value

        jobconf.update(self._jobconf_for_step(step_num))

        for key, value in sorted(jobconf.items()):
            args.extend(['--conf', '%s=%s' % (key, value)])

        # --files and --archives
        args.extend(self._spark_upload_args())

        # --py-files (Python only)
        # spark runner can run 'streaming' steps, so just exclude
        # non-Python steps
        if 'jar' not in step['type']:
            py_file_uris = self._py_files()

            if self._upload_mgr:
                # don't assume py_files are in _upload_mgr; for example,
                # spark-submit doesn't need to upload them
                path_to_uri = self._upload_mgr.path_to_uri()
                py_file_uris = [path_to_uri.get(p, p) for p in py_file_uris]

            if py_file_uris:
                args.extend(['--py-files', ','.join(py_file_uris)])

        # spark_args option
        args.extend(self._opts['spark_args'])

        # step spark_args
        if step.get('spark_args'):
            args.extend(step['spark_args'])

        return args