def test_uses_generic_jobconf(self):
    self.assertEqual(uses_generic_jobconf('0.18'), False)
    self.assertEqual(uses_generic_jobconf('0.20'), True)
    self.assertEqual(uses_generic_jobconf('0.21'), True)

    # default to True
    self.assertEqual(uses_generic_jobconf(None), True)
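# A minimal, self-contained sketch of the behavior the assertions above pin
# down. The real helper lives in mrjob.compat and may be implemented
# differently; the version parsing below is an assumption for illustration.
def uses_generic_jobconf(version):
    """Return True if this Hadoop version takes configuration via the
    generic -D option (0.20 and later) rather than -jobconf."""
    if version is None:
        return True  # unknown version: default to the newer, generic style
    parts = tuple(int(p) for p in version.split('.'))
    return parts >= (0, 20)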
def _hadoop_conf_args(self, step, step_num, num_steps):
    """Build a list of extra arguments to the hadoop binary.

    This handles *cmdenv*, *hadoop_extra_args*, *hadoop_input_format*,
    *hadoop_output_format*, *jobconf*, and *partitioner*.

    This doesn't handle input, output, mappers, reducers, or uploading files.
    """
    assert 0 <= step_num < num_steps

    args = []

    jobconf = combine_dicts(self._opts['jobconf'], step.get('jobconf'))

    # hadoop_extra_args
    args.extend(self._opts['hadoop_extra_args'])

    # new-style jobconf
    version = self.get_hadoop_version()
    if uses_generic_jobconf(version):
        for key, value in sorted(jobconf.iteritems()):
            args.extend(['-D', '%s=%s' % (key, value)])

    # partitioner
    if self._partitioner:
        args.extend(['-partitioner', self._partitioner])

    # cmdenv
    for key, value in sorted(self._get_cmdenv().iteritems()):
        args.append('-cmdenv')
        args.append('%s=%s' % (key, value))

    # hadoop_input_format
    if (step_num == 0 and self._hadoop_input_format):
        args.extend(['-inputformat', self._hadoop_input_format])

    # hadoop_output_format
    if (step_num == num_steps - 1 and self._hadoop_output_format):
        args.extend(['-outputformat', self._hadoop_output_format])

    # old-style jobconf
    if not uses_generic_jobconf(version):
        for key, value in sorted(jobconf.iteritems()):
            args.extend(['-jobconf', '%s=%s' % (key, value)])

    return args
def _hadoop_conf_args(self, step_num, num_steps):
    """Build a list of extra arguments to the hadoop binary.

    This handles *cmdenv*, *hadoop_extra_args*, *hadoop_input_format*,
    *hadoop_output_format*, *jobconf*, and *partitioner*.

    This doesn't handle input, output, mappers, reducers, or uploading files.
    """
    assert 0 <= step_num < num_steps

    args = []

    # hadoop_extra_args
    args.extend(self._opts['hadoop_extra_args'])

    # new-style jobconf
    version = self.get_hadoop_version()
    if compat.uses_generic_jobconf(version):
        for key, value in sorted(self._opts['jobconf'].iteritems()):
            args.extend(['-D', '%s=%s' % (key, value)])

    # partitioner
    if self._partitioner:
        args.extend(['-partitioner', self._partitioner])

    # cmdenv
    for key, value in sorted(self._get_cmdenv().iteritems()):
        args.append('-cmdenv')
        args.append('%s=%s' % (key, value))

    # hadoop_input_format
    if (step_num == 0 and self._hadoop_input_format):
        args.extend(['-inputformat', self._hadoop_input_format])

    # hadoop_output_format
    if (step_num == num_steps - 1 and self._hadoop_output_format):
        args.extend(['-outputformat', self._hadoop_output_format])

    # old-style jobconf
    if not compat.uses_generic_jobconf(version):
        for key, value in sorted(self._opts['jobconf'].iteritems()):
            args.extend(['-jobconf', '%s=%s' % (key, value)])

    return args
def _hadoop_conf_args(self, step, step_num, num_steps):
    """Build a list of extra arguments to the hadoop binary.

    This handles *cmdenv*, *hadoop_extra_args*, *hadoop_input_format*,
    *hadoop_output_format*, *jobconf*, and *partitioner*.

    This doesn't handle input, output, mappers, reducers, or uploading files.
    """
    assert 0 <= step_num < num_steps

    args = []

    jobconf = combine_dicts(self._opts['jobconf'], step.get('jobconf'))

    # hadoop_extra_args
    args.extend(self._opts['hadoop_extra_args'])

    # new-style jobconf
    version = self.get_hadoop_version()

    # translate the jobconf configuration names to match
    # the hadoop version
    jobconf = add_translated_jobconf_for_hadoop_version(jobconf, version)
    if uses_generic_jobconf(version):
        for key, value in sorted(jobconf.iteritems()):
            if value is not None:
                args.extend(['-D', '%s=%s' % (key, value)])
    # old-style jobconf
    else:
        for key, value in sorted(jobconf.iteritems()):
            if value is not None:
                args.extend(['-jobconf', '%s=%s' % (key, value)])

    # partitioner
    if self._partitioner:
        args.extend(['-partitioner', self._partitioner])

    # cmdenv
    for key, value in sorted(self._opts['cmdenv'].iteritems()):
        args.append('-cmdenv')
        args.append('%s=%s' % (key, value))

    # hadoop_input_format
    if (step_num == 0 and self._hadoop_input_format):
        args.extend(['-inputformat', self._hadoop_input_format])

    # hadoop_output_format
    if (step_num == num_steps - 1 and self._hadoop_output_format):
        args.extend(['-outputformat', self._hadoop_output_format])

    return args
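# Toy sketch of the name-translation step used above. Hadoop 0.21 renamed
# many mapred.* properties to mapreduce.*; the mapping and the version cutoff
# below are a small assumed subset for illustration, not the table that
# mrjob's add_translated_jobconf_for_hadoop_version actually uses.
_ASSUMED_TRANSLATIONS = {
    'mapred.job.name': 'mapreduce.job.name',
    'mapred.reduce.tasks': 'mapreduce.job.reduces',
}


def sketch_add_translated_jobconf(jobconf, version):
    """Return a copy of jobconf with new-style names added alongside any
    old-style keys we know how to map."""
    translated = dict(jobconf)
    if version is None or tuple(int(p) for p in version.split('.')) >= (0, 21):
        for old_key, new_key in _ASSUMED_TRANSLATIONS.items():
            if old_key in jobconf and new_key not in jobconf:
                translated[new_key] = jobconf[old_key]
    return translated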
def _hadoop_args_for_step(self, step_num):
    """Build a list of extra arguments to the hadoop binary.

    This handles *cmdenv*, *hadoop_extra_args*, *hadoop_input_format*,
    *hadoop_output_format*, *jobconf*, and *partitioner*.

    This doesn't handle input, output, mappers, reducers, or uploading files.
    """
    assert 0 <= step_num < self._num_steps()

    args = []

    # hadoop_extra_args
    args.extend(self._opts['hadoop_extra_args'])

    # new-style jobconf
    version = self.get_hadoop_version()

    # translate the jobconf configuration names to match
    # the hadoop version
    jobconf = self._jobconf_for_step(step_num)

    if uses_generic_jobconf(version):
        for key, value in sorted(jobconf.items()):
            if value is not None:
                args.extend(['-D', '%s=%s' % (key, value)])
    # old-style jobconf
    else:
        for key, value in sorted(jobconf.items()):
            if value is not None:
                args.extend(['-jobconf', '%s=%s' % (key, value)])

    # partitioner
    if self._partitioner:
        args.extend(['-partitioner', self._partitioner])

    # cmdenv
    for key, value in sorted(self._opts['cmdenv'].items()):
        args.append('-cmdenv')
        args.append('%s=%s' % (key, value))

    # hadoop_input_format
    if (step_num == 0 and self._hadoop_input_format):
        args.extend(['-inputformat', self._hadoop_input_format])

    # hadoop_output_format
    if (step_num == self._num_steps() - 1 and self._hadoop_output_format):
        args.extend(['-outputformat', self._hadoop_output_format])

    return args
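# Standalone sketch of the two jobconf argument styles the method above
# switches between; the example dict and calls in the comments are
# illustrative assumptions, not taken from the mrjob test suite.
def sketch_jobconf_args(jobconf, generic):
    """Render a jobconf dict as hadoop arguments in either style."""
    args = []
    for key, value in sorted(jobconf.items()):
        if value is None:
            continue  # unset values are skipped, as in the method above
        switch = '-D' if generic else '-jobconf'
        args.extend([switch, '%s=%s' % (key, value)])
    return args

# sketch_jobconf_args({'mapred.reduce.tasks': '2'}, generic=True)
#   -> ['-D', 'mapred.reduce.tasks=2']        (Hadoop 0.20+)
# sketch_jobconf_args({'mapred.reduce.tasks': '2'}, generic=False)
#   -> ['-jobconf', 'mapred.reduce.tasks=2']  (pre-0.20)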
def _hadoop_conf_args(self, step, step_num, num_steps):
    """Build a list of extra arguments to the hadoop binary.

    This handles *cmdenv*, *hadoop_extra_args*, *hadoop_input_format*,
    *hadoop_output_format*, *jobconf*, and *partitioner*.

    This doesn't handle input, output, mappers, reducers, or uploading files.
    """
    assert 0 <= step_num < num_steps

    args = []

    jobconf = combine_dicts(self._opts['jobconf'], step.get('jobconf'))

    job_name = jobconf.get('mapred.job.name', None)

    # Set a default job name
    if not job_name:
        job_name = "%s > %s" % (self._script_path, self._output_dir)

    # Add the step into the job name
    if num_steps > 1:
        job_name = "%s (step %s of %s)" % (job_name, step_num + 1, num_steps)

    jobconf['mapred.job.name'] = job_name

    # hadoop_extra_args
    args.extend(self._opts['hadoop_extra_args'])

    # new-style jobconf
    version = self.get_hadoop_version()

    # translate the jobconf configuration names to match
    # the hadoop version
    jobconf = add_translated_jobconf_for_hadoop_version(jobconf, version)
    if uses_generic_jobconf(version):
        for key, value in sorted(jobconf.iteritems()):
            args.extend(['-D', '%s=%s' % (key, value)])
    # old-style jobconf
    else:
        for key, value in sorted(jobconf.iteritems()):
            args.extend(['-jobconf', '%s=%s' % (key, value)])

    # partitioner
    if self._partitioner:
        args.extend(['-partitioner', self._partitioner])

    # cmdenv
    for key, value in sorted(self._opts['cmdenv'].iteritems()):
        args.append('-cmdenv')
        args.append('%s=%s' % (key, value))

    # hadoop_input_format
    if (step_num == 0 and self._hadoop_input_format):
        args.extend(['-inputformat', self._hadoop_input_format])

    # hadoop_output_format
    if (step_num == num_steps - 1 and self._hadoop_output_format):
        args.extend(['-outputformat', self._hadoop_output_format])

    return args
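# Minimal sketch of the default job-name logic in the variant above; the
# script path and output directory values in the example call are
# hypothetical.
def sketch_default_job_name(script_path, output_dir, step_num, num_steps,
                            job_name=None):
    """Mirror the defaulting done above: fall back to 'script > output_dir',
    then tag on the step number for multi-step jobs."""
    if not job_name:
        job_name = "%s > %s" % (script_path, output_dir)
    if num_steps > 1:
        job_name = "%s (step %s of %s)" % (job_name, step_num + 1, num_steps)
    return job_name

# sketch_default_job_name('mr_word_count.py', 'hdfs:///user/me/out', 0, 2)
#   -> 'mr_word_count.py > hdfs:///user/me/out (step 1 of 2)'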
def test_uses_generic_jobconf(self):
    assert_equal(uses_generic_jobconf('0.18'), False)
    assert_equal(uses_generic_jobconf('0.20'), True)
    assert_equal(uses_generic_jobconf('0.21'), True)