def _args_for_jar_step(self, step_num):
    step = self._get_step(step_num)

    # special case for consistency with EMR runner.
    #
    # This might look less like duplicated code if we ever
    # implement #780 (fetching jars from URIs)
    if step['jar'].startswith('file:///'):
        jar = step['jar'][7:]  # keep leading slash
    else:
        jar = step['jar']

    args = (self._opts['hadoop_bin'] + ['jar', jar])

    if step.get('main_class'):
        args.append(step['main_class'])

    # TODO: merge with logic in mrjob/emr.py
    def interpolate(arg):
        if arg == mrjob.step.JarStep.INPUT:
            return ','.join(self._hdfs_step_input_files(step_num))
        elif arg == mrjob.step.JarStep.OUTPUT:
            return self._hdfs_step_output_dir(step_num)
        else:
            return arg

    if step.get('args'):
        args.extend(interpolate(arg) for arg in step['args'])

    return args
def _args_for_jar_step(self, step_num):
    step = self._get_step(step_num)

    # special case for consistency with EMR runner.
    #
    # This might look less like duplicated code if we ever
    # implement #780 (fetching jars from URIs)
    if step["jar"].startswith("file:///"):
        jar = step["jar"][7:]  # keep leading slash
    else:
        jar = step["jar"]

    args = self._opts["hadoop_bin"] + ["jar", jar]

    if step.get("main_class"):
        args.append(step["main_class"])

    # TODO: merge with logic in mrjob/emr.py
    def interpolate(arg):
        if arg == mrjob.step.JarStep.INPUT:
            return ",".join(self._hdfs_step_input_files(step_num))
        elif arg == mrjob.step.JarStep.OUTPUT:
            return self._hdfs_step_output_dir(step_num)
        else:
            return arg

    if step.get("args"):
        args.extend(interpolate(arg) for arg in step["args"])

    return args
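# --- Illustrative sketch (not part of the runner) ------------------------
# A minimal, standalone version of the jar-path and argument handling in
# _args_for_jar_step() above. The step dict, hadoop_bin value, and paths
# are hypothetical values chosen for demonstration only.
def _example_jar_args():
    hadoop_bin = ['hadoop']  # hypothetical hadoop_bin option
    step = {
        'jar': 'file:///home/user/lib/wordcount.jar',  # hypothetical path
        'main_class': 'com.example.WordCount',         # hypothetical class
        'args': ['-verbose'],
    }

    # strip the 'file://' scheme but keep the leading slash, as above
    if step['jar'].startswith('file:///'):
        jar = step['jar'][7:]
    else:
        jar = step['jar']

    args = hadoop_bin + ['jar', jar]
    if step.get('main_class'):
        args.append(step['main_class'])
    args.extend(step.get('args', []))

    # -> ['hadoop', 'jar', '/home/user/lib/wordcount.jar',
    #     'com.example.WordCount', '-verbose']
    return args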
def _spark_submit_args(self, step_num):
    """Build a list of extra args to the spark-submit binary for
    the given spark or spark_script step."""
    step = self._get_step(step_num)

    if not _is_spark_step_type(step['type']):
        raise TypeError('non-Spark step: %r' % step)

    args = []

    # add --master
    if self._spark_master():
        args.extend(['--master', self._spark_master()])

    # add --deploy-mode
    if self._spark_deploy_mode():
        args.extend(['--deploy-mode', self._spark_deploy_mode()])

    # add --class (JAR steps)
    if step.get('main_class'):
        args.extend(['--class', step['main_class']])

    # add --jars, if any
    libjar_paths = self._libjar_paths()
    if libjar_paths:
        args.extend(['--jars', ','.join(libjar_paths)])

    # --conf arguments include python bin, cmdenv, jobconf. Make sure
    # that we can always override these manually
    jobconf = {}
    for key, value in self._spark_cmdenv(step_num).items():
        jobconf['spark.executorEnv.%s' % key] = value
        jobconf['spark.yarn.appMasterEnv.%s' % key] = value

    jobconf.update(self._jobconf_for_step(step_num))

    for key, value in sorted(jobconf.items()):
        args.extend(['--conf', '%s=%s' % (key, value)])

    # --files and --archives
    args.extend(self._spark_upload_args())

    # --py-files (Python only)
    if step['type'] in ('spark', 'spark_script'):
        py_file_uris = self._upload_uris(self._py_files())
        if py_file_uris:
            args.extend(['--py-files', ','.join(py_file_uris)])

    # spark_args option
    args.extend(self._opts['spark_args'])

    # step spark_args
    if step.get('spark_args'):
        args.extend(step['spark_args'])

    return args
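# --- Illustrative sketch (not part of the runner) ------------------------
# A standalone version of the cmdenv -> --conf mapping used in
# _spark_submit_args() above: each environment variable is exposed to
# executors (and, on YARN, to the application master) via prefixed conf
# keys, then expanded into sorted '--conf key=value' switches. The sample
# environment is hypothetical.
def _example_env_conf_args(cmdenv):
    jobconf = {}
    for key, value in cmdenv.items():
        jobconf['spark.executorEnv.%s' % key] = value
        jobconf['spark.yarn.appMasterEnv.%s' % key] = value

    args = []
    for key, value in sorted(jobconf.items()):
        args.extend(['--conf', '%s=%s' % (key, value)])
    return args

# _example_env_conf_args({'TZ': 'UTC'}) ==
#     ['--conf', 'spark.executorEnv.TZ=UTC',
#      '--conf', 'spark.yarn.appMasterEnv.TZ=UTC']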
def _substep_args(self, step_num, mrc):
    step = self._get_step(step_num)

    if step[mrc]['type'] == 'command':
        cmd = step[mrc]['command']

        # never wrap custom hadoop streaming commands in bash
        if isinstance(cmd, string_types):
            return shlex_split(cmd)
        else:
            return cmd

    elif step[mrc]['type'] == 'script':
        script_args = self._script_args_for_step(
            step_num, mrc, input_manifest=step.get('input_manifest'))

        if 'pre_filter' in step[mrc]:
            return self._sh_wrap(
                '%s | %s' % (step[mrc]['pre_filter'],
                             cmd_line(script_args)))
        else:
            return script_args
    else:
        raise ValueError("Invalid %s step %d: %r" % (
            mrc, step_num, step[mrc]))
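# --- Illustrative sketch (not part of the runner) ------------------------
# A minimal version of the pre_filter piping in _substep_args() above,
# with a hypothetical filter and script command. shlex.quote stands in
# for mrjob's cmd_line() helper, and the plain 'sh -c' wrapper is an
# assumption, not necessarily what _sh_wrap() produces.
from shlex import quote

def _example_pre_filter_cmd():
    pre_filter = 'grep -v "^#"'  # hypothetical filter command
    script_args = ['python', 'my_job.py', '--step-num=0', '--mapper']

    piped = '%s | %s' % (pre_filter, ' '.join(quote(a) for a in script_args))
    # -> 'grep -v "^#" | python my_job.py --step-num=0 --mapper'
    return ['sh', '-c', piped]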
def _check_steps(self, steps):
    """Raise an exception if there's something wrong with the step
    definition."""
    for step_num, step in enumerate(steps):
        if step['type'] not in STEP_TYPES:
            raise ValueError(
                'unexpected step type %r in steps %r' % (
                    step['type'], steps))

        if step.get('input_manifest') and step_num != 0:
            raise ValueError('only first step may take an input manifest')
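# Hedged example of the validation above, using hypothetical step dicts:
# an unknown step type, or an input manifest on any step but the first,
# raises ValueError.
#
#   _check_steps([
#       {'type': 'streaming', 'input_manifest': True},  # OK (step 0)
#       {'type': 'streaming', 'input_manifest': True},  # ValueError
#   ])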
def _spark_submit_args(self, step_num):
    """Build a list of extra args to the spark-submit binary for
    the given spark or spark_script step."""
    step = self._get_step(step_num)

    if not _is_spark_step_type(step['type']):
        raise TypeError('non-Spark step: %r' % step)

    args = []

    # add runner-specific args
    args.extend(self._spark_submit_arg_prefix())

    # add --class (JAR steps)
    if step.get('main_class'):
        args.extend(['--class', step['main_class']])

    # add --jars, if any
    libjar_paths = self._libjar_paths()
    if libjar_paths:
        args.extend(['--jars', ','.join(libjar_paths)])

    # --conf arguments include python bin, cmdenv, jobconf. Make sure
    # that we can always override these manually
    jobconf = {}
    for key, value in self._spark_cmdenv(step_num).items():
        jobconf['spark.executorEnv.%s' % key] = value
        jobconf['spark.yarn.appMasterEnv.%s' % key] = value

    jobconf.update(self._jobconf_for_step(step_num))

    for key, value in sorted(jobconf.items()):
        if value is not None:
            args.extend(['--conf', '%s=%s' % (key, value)])

    # --files and --archives
    args.extend(self._spark_upload_args())

    # --py-files (Python only)
    if step['type'] in ('spark', 'spark_script'):
        py_files_arg = ','.join(self._spark_py_files())
        if py_files_arg:
            args.extend(['--py-files', py_files_arg])

    # spark_args option
    args.extend(self._opts['spark_args'])

    # step spark_args
    args.extend(step['spark_args'])

    return args
def _jobconf_for_step(self, step_num):
    """Get the jobconf dictionary, optionally including step-specific
    jobconf info.

    Also translate jobconfs to the current Hadoop version, if necessary.
    """
    step = self._get_step(step_num)

    # _sort_values_jobconf() isn't relevant to Spark,
    # but it doesn't do any harm either
    jobconf = combine_dicts(self._sort_values_jobconf(),
                            self._opts['jobconf'],
                            step.get('jobconf'))

    # if user is using the wrong jobconfs, add in the correct ones
    # and log a warning
    hadoop_version = self.get_hadoop_version()
    if hadoop_version:
        jobconf = translate_jobconf_dict(jobconf, hadoop_version)

    return jobconf
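# --- Illustrative sketch (not part of the runner) ------------------------
# A simplified stand-in for the dict combining in _jobconf_for_step()
# above, showing only the precedence order (later dicts win, as with
# mrjob's combine_dicts()); the real helper also handles None arguments
# and other details. All keys and values are hypothetical.
def _example_jobconf():
    sort_values_jobconf = {
        'mapreduce.partition.keypartitioner.options': '-k1,1'}
    runner_jobconf = {'mapreduce.job.reduces': '1'}
    step_jobconf = {'mapreduce.job.reduces': '4'}  # step-level value wins

    jobconf = {}
    for d in (sort_values_jobconf, runner_jobconf, step_jobconf):
        jobconf.update(d or {})

    # -> {'mapreduce.partition.keypartitioner.options': '-k1,1',
    #     'mapreduce.job.reduces': '4'}
    return jobconf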
def _spark_submit_args(self, step_num):
    """Build a list of extra args to the spark-submit binary for
    the given spark or spark_script step."""
    step = self._get_step(step_num)

    args = []

    # --conf arguments include python bin, cmdenv, jobconf. Make sure
    # that we can always override these manually
    jobconf = {}
    for key, value in self._spark_cmdenv(step_num).items():
        jobconf['spark.executorEnv.%s' % key] = value
        if self._spark_master() == 'yarn':
            # YARN only, see #1919
            jobconf['spark.yarn.appMasterEnv.%s' % key] = value

    jobconf.update(self._jobconf_for_step(step_num))

    for key, value in sorted(jobconf.items()):
        args.extend(['--conf', '%s=%s' % (key, value)])

    # add --class (JAR steps)
    if step.get('main_class'):
        args.extend(['--class', step['main_class']])

    # add --jars, if any
    libjar_paths = self._libjar_paths()
    if libjar_paths:
        args.extend(['--jars', ','.join(libjar_paths)])

    # spark-submit treats --master and --deploy-mode as aliases for
    # --conf spark.master=... and --conf spark.submit.deployMode=...
    # (see #2032).
    #
    # we never want jobconf to override spark master or deploy mode,
    # so put these switches after --conf

    # add --master
    if self._spark_master():
        args.extend(['--master', self._spark_master()])

    # add --deploy-mode
    if self._spark_deploy_mode():
        args.extend(['--deploy-mode', self._spark_deploy_mode()])

    # --files and --archives
    args.extend(self._spark_upload_args())

    # --py-files (Python only)
    #
    # spark runner can run 'streaming' steps, so just exclude
    # non-Python steps
    if 'jar' not in step['type']:
        py_file_uris = self._py_files()

        if self._upload_mgr:
            # don't assume py_files are in _upload_mgr; for example,
            # spark-submit doesn't need to upload them
            path_to_uri = self._upload_mgr.path_to_uri()
            py_file_uris = [path_to_uri.get(p, p) for p in py_file_uris]

        if py_file_uris:
            args.extend(['--py-files', ','.join(py_file_uris)])

    # spark_args option
    args.extend(self._opts['spark_args'])

    # step spark_args
    if step.get('spark_args'):
        args.extend(step['spark_args'])

    return args
def _spark_submit_args(self, step_num):
    """Build a list of extra args to the spark-submit binary for
    the given spark or spark_script step."""
    step = self._get_step(step_num)

    args = []

    # add --master
    if self._spark_master():
        args.extend(['--master', self._spark_master()])

    # add --deploy-mode
    if self._spark_deploy_mode():
        args.extend(['--deploy-mode', self._spark_deploy_mode()])

    # add --class (JAR steps)
    if step.get('main_class'):
        args.extend(['--class', step['main_class']])

    # add --jars, if any
    libjar_paths = self._libjar_paths()
    if libjar_paths:
        args.extend(['--jars', ','.join(libjar_paths)])

    # --conf arguments include python bin, cmdenv, jobconf. Make sure
    # that we can always override these manually
    jobconf = {}
    for key, value in self._spark_cmdenv(step_num).items():
        jobconf['spark.executorEnv.%s' % key] = value
        jobconf['spark.yarn.appMasterEnv.%s' % key] = value

    jobconf.update(self._jobconf_for_step(step_num))

    for key, value in sorted(jobconf.items()):
        args.extend(['--conf', '%s=%s' % (key, value)])

    # --files and --archives
    args.extend(self._spark_upload_args())

    # --py-files (Python only)
    #
    # spark runner can run 'streaming' steps, so just exclude
    # non-Python steps
    if 'jar' not in step['type']:
        py_file_uris = self._py_files()

        if self._upload_mgr:
            # don't assume py_files are in _upload_mgr; for example,
            # spark-submit doesn't need to upload them
            path_to_uri = self._upload_mgr.path_to_uri()
            py_file_uris = [path_to_uri.get(p, p) for p in py_file_uris]

        if py_file_uris:
            args.extend(['--py-files', ','.join(py_file_uris)])

    # spark_args option
    args.extend(self._opts['spark_args'])

    # step spark_args
    if step.get('spark_args'):
        args.extend(step['spark_args'])

    return args
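# Hedged illustration (an assumption about usage, not taken from the
# runner): the args built by _spark_submit_args() sit between the
# spark-submit binary and the script or JAR being submitted, with the
# job's own arguments following the script. Every value below is
# hypothetical.
#
#   ['spark-submit',
#    '--master', 'yarn',
#    '--deploy-mode', 'client',
#    '--conf', 'spark.executorEnv.TZ=UTC',
#    '--py-files', 'hdfs:///user/me/tmp/mrjob/my_job/files/mrjob.zip',
#    'my_spark_job.py',
#    '--input', 'hdfs:///user/me/input']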