def _jobconf_for_step(self, step_num):
    """Get the jobconf dictionary, optionally including step-specific
    jobconf info.

    Also translate jobconfs to the current Hadoop version, if necessary.
    """
    step = self._get_step(step_num)
    jobconf = combine_dicts(self._opts['jobconf'], step.get('jobconf'))

    return add_translated_jobconf_for_hadoop_version(
        jobconf, self.get_hadoop_version())
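
# Illustrative sketch (the values below are made up, not from the runner):
# combine_dicts() gives later dicts priority, so a step-level jobconf
# overrides the runner-level one before version translation is applied.
#
#     self._opts['jobconf']  -> {'mapreduce.job.reduces': '1',
#                                'mapreduce.map.memory.mb': '1024'}
#     step.get('jobconf')    -> {'mapreduce.job.reduces': '4'}
#
#     combined result        -> {'mapreduce.job.reduces': '4',
#                                'mapreduce.map.memory.mb': '1024'}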
def _hadoop_conf_args(self, step, step_num, num_steps):
    """Build a list of extra arguments to the hadoop binary.

    This handles *cmdenv*, *hadoop_extra_args*, *hadoop_input_format*,
    *hadoop_output_format*, *jobconf*, and *partitioner*.

    This doesn't handle input, output, mappers, reducers, or uploading
    files.
    """
    assert 0 <= step_num < num_steps

    args = []

    jobconf = combine_dicts(self._opts['jobconf'], step.get('jobconf'))

    # hadoop_extra_args
    args.extend(self._opts['hadoop_extra_args'])

    # new-style jobconf
    version = self.get_hadoop_version()

    # translate the jobconf configuration names to match
    # the hadoop version
    jobconf = add_translated_jobconf_for_hadoop_version(jobconf, version)

    if uses_generic_jobconf(version):
        for key, value in sorted(jobconf.iteritems()):
            if value is not None:
                args.extend(['-D', '%s=%s' % (key, value)])
    # old-style jobconf
    else:
        for key, value in sorted(jobconf.iteritems()):
            if value is not None:
                args.extend(['-jobconf', '%s=%s' % (key, value)])

    # partitioner
    if self._partitioner:
        args.extend(['-partitioner', self._partitioner])

    # cmdenv
    for key, value in sorted(self._opts['cmdenv'].iteritems()):
        args.append('-cmdenv')
        args.append('%s=%s' % (key, value))

    # hadoop_input_format
    if (step_num == 0 and self._hadoop_input_format):
        args.extend(['-inputformat', self._hadoop_input_format])

    # hadoop_output_format
    if (step_num == num_steps - 1 and self._hadoop_output_format):
        args.extend(['-outputformat', self._hadoop_output_format])

    return args
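
# Illustrative sketch (an assumption, not output captured from the runner):
# for a hypothetical jobconf {'mapreduce.job.reduces': '2'}, the generated
# streaming arguments would look roughly like:
#
#     on Hadoop versions with generic jobconf support:
#         ['-D', 'mapreduce.job.reduces=2']
#     on older Hadoop versions:
#         ['-jobconf', 'mapred.reduce.tasks=2']
#
# The differing key name in the old-style case reflects the translation
# done by add_translated_jobconf_for_hadoop_version().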
def _subprocess_env(self, step_num, step_type, task_num, working_dir,
                    **split_kwargs):
    """Set up environment variables for a subprocess (mapper, etc.)

    This combines, in decreasing order of priority:

    * environment variables set by the **cmdenv** option
    * **jobconf** environment variables set by our job (e.g.
      ``mapreduce.task.ismap``)
    * environment variables from **jobconf** options, translated to
      whatever version of Hadoop we're emulating
    * the current environment
    * PYTHONPATH set to current working directory

    We use :py:func:`~mrjob.conf.combine_local_envs`, so ``PATH``
    environment variables are handled specially.
    """
    version = self.get_hadoop_version()

    # auto-translate jobconf variables from the wrong version, like
    # other runners do
    user_jobconf = add_translated_jobconf_for_hadoop_version(
        self._jobconf_for_step(step_num), version)

    simulated_jobconf = self._simulate_jobconf_for_step(
        step_num, step_type, task_num, working_dir, **split_kwargs)

    def to_env(jobconf):
        return dict((k.replace('.', '_'), str(v))
                    for k, v in jobconf.iteritems())

    # keep the current environment because we need PATH to find binaries
    # and make PYTHONPATH work
    return combine_local_envs(os.environ,
                              to_env(user_jobconf),
                              to_env(simulated_jobconf),
                              self._opts['cmdenv'])
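
# Illustrative sketch (made-up values): to_env() mirrors how Hadoop
# Streaming exposes jobconf to task subprocesses, replacing dots with
# underscores and stringifying values. For example, a simulated jobconf
# like
#
#     {'mapreduce.task.ismap': True, 'mapreduce.task.partition': 0}
#
# would become environment variables roughly like
#
#     {'mapreduce_task_ismap': 'True', 'mapreduce_task_partition': '0'}
#
# before being merged with os.environ and the cmdenv option by
# combine_local_envs(), where later dicts take priority.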