def test_translate_jobconf(self):
    assert_equal(translate_jobconf('user.name', '0.18'), 'user.name')
    assert_equal(translate_jobconf('mapreduce.job.user.name', '0.18'),
                 'user.name')
    assert_equal(translate_jobconf('user.name', '0.19'), 'user.name')
    assert_equal(translate_jobconf('mapreduce.job.user.name', '0.19.2'),
                 'user.name')
    assert_equal(translate_jobconf('user.name', '0.21'),
                 'mapreduce.job.user.name')
def _update_jobconf_for_hadoop_version(self, jobconf, hadoop_version):
    """If *jobconf* (a dict) contains jobconf variables from the wrong
    version of Hadoop, add variables for the right one.

    If *hadoop_version* is empty, do nothing.
    """
    if not hadoop_version:  # this happens for sim runner
        return

    translations = {}  # for warning, below

    for key, value in sorted(jobconf.items()):
        new_key = translate_jobconf(key, hadoop_version)
        if new_key not in jobconf:
            jobconf[new_key] = value
            translations[key] = new_key

    if translations:
        log.warning(
            "Detected hadoop configuration property names that"
            " do not match hadoop version %s:"
            "\nThey have been translated as follows\n %s",
            hadoop_version,
            '\n'.join([
                "%s: %s" % (key, new_key)
                for key, new_key in sorted(translations.items())]))
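# A minimal sketch (not from the original source) of the translation that
# _update_jobconf_for_hadoop_version() performs, using only behavior
# confirmed by the tests in this section: translate_jobconf() maps the
# pre-0.21 name 'user.name' to the 0.21+ name 'mapreduce.job.user.name'.
# The dict contents and username below are hypothetical.
from mrjob.compat import translate_jobconf

jobconf = {'user.name': 'dave'}  # user supplied the old-style key
hadoop_version = '0.21'          # but we're targeting 0.21+

for key, value in sorted(jobconf.items()):
    new_key = translate_jobconf(key, hadoop_version)
    if new_key not in jobconf:
        jobconf[new_key] = value  # add the translated key alongside

# jobconf == {'user.name': 'dave', 'mapreduce.job.user.name': 'dave'}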
def _simulate_jobconf_for_step(self, task_type, step_num, task_num,
                               map_split=None):
    j = {}

    # TODO: these are really poor imitations of Hadoop IDs. See #1254
    j['mapreduce.job.id'] = self._job_key
    j['mapreduce.task.id'] = 'task_%s_%s_%04d%d' % (
        self._job_key, task_type.lower(), step_num, task_num)
    j['mapreduce.task.attempt.id'] = 'attempt_%s_%s_%04d%d_0' % (
        self._job_key, task_type.lower(), step_num, task_num)

    j['mapreduce.task.ismap'] = str(task_type == 'mapper').lower()

    # TODO: is this the correct format?
    j['mapreduce.task.partition'] = str(task_num)

    j['mapreduce.task.output.dir'] = self._output_dir_for_step(step_num)

    working_dir = self._task_working_dir(task_type, step_num, task_num)
    j['mapreduce.job.local.dir'] = working_dir

    for x in ('archive', 'file'):
        named_paths = sorted(self._working_dir_mgr.name_to_path(x).items())

        # mapreduce.job.cache.archives
        # mapreduce.job.cache.files
        j['mapreduce.job.cache.%ss' % x] = ','.join(
            '%s#%s' % (path, name) for name, path in named_paths)

        # mapreduce.job.cache.local.archives
        # mapreduce.job.cache.local.files
        j['mapreduce.job.cache.local.%ss' % x] = ','.join(
            join(working_dir, name) for name, path in named_paths)

    if map_split:
        j['mapreduce.map.input.file'] = 'file://' + map_split['file']
        j['mapreduce.map.input.length'] = str(map_split['length'])
        j['mapreduce.map.input.start'] = str(map_split['start'])

    # translate to correct version

    # don't use translate_jobconf_dict(); that's meant to add keys
    # to user-supplied jobconf
    hadoop_version = self.get_hadoop_version()

    if hadoop_version:
        return {
            translate_jobconf(k, hadoop_version): v
            for k, v in j.items()
        }
    else:
        return {
            tk: v
            for k, v in j.items()
            for tk in translate_jobconf_for_all_versions(k)
        }
def _subprocess_env(self, step_type, step_num, task_num, input_file=None,
                    input_start=None, input_length=None):
    """Set up environment variables for a subprocess (mapper, etc.)

    This combines, in decreasing order of priority:

    * environment variables set by the **cmdenv** option
    * **jobconf** environment variables set by our job (e.g.
      ``mapreduce.task.ismap``)
    * environment variables from **jobconf** options, translated to
      whatever version of Hadoop we're emulating
    * the current environment
    * PYTHONPATH set to current working directory

    We use :py:func:`~mrjob.conf.combine_local_envs`, so ``PATH``
    environment variables are handled specially.
    """
    version = self.get_hadoop_version()

    jobconf_env = dict(
        (translate_jobconf(k, version).replace('.', '_'), str(v))
        for (k, v) in self._opts['jobconf'].iteritems())

    internal_jobconf = self._simulate_jobconf_for_step(
        step_type, step_num, task_num,
        input_file=input_file,
        input_start=input_start,
        input_length=input_length)

    internal_jobconf_env = dict(
        (translate_jobconf(k, version).replace('.', '_'), str(v))
        for (k, v) in internal_jobconf.iteritems())

    # keep the current environment because we need PATH to find binaries
    # and make PYTHONPATH work
    return combine_local_envs(os.environ,
                              jobconf_env,
                              internal_jobconf_env,
                              self._get_cmdenv())
def _subprocess_env(self, step_type, step_num, task_num, input_file=None,
                    input_start=None, input_length=None):
    """Set up environment variables for a subprocess (mapper, etc.)

    This combines, in decreasing order of priority:

    * environment variables set by the **cmdenv** option
    * **jobconf** environment variables set by our job (e.g.
      ``mapreduce.task.ismap``)
    * environment variables from **jobconf** options, translated to
      whatever version of Hadoop we're emulating
    * the current environment
    * PYTHONPATH set to current working directory

    We use :py:func:`~mrjob.conf.combine_local_envs`, so ``PATH``
    environment variables are handled specially.
    """
    version = self.get_hadoop_version()

    jobconf_env = dict(
        (translate_jobconf(k, version).replace('.', '_'), str(v))
        for (k, v) in self._opts['jobconf'].iteritems())

    internal_jobconf = self._simulate_jobconf_for_step(
        step_type, step_num, task_num,
        input_file=input_file,
        input_start=input_start,
        input_length=input_length)

    internal_jobconf_env = dict(
        (translate_jobconf(k, version).replace('.', '_'), str(v))
        for (k, v) in internal_jobconf.iteritems())

    ironpython_env = {'IRONPYTHONPATH': os.getcwd()} if is_ironpython \
        else {}

    # keep the current environment because we need PATH to find binaries
    # and make PYTHONPATH work
    return combine_local_envs({'PYTHONPATH': os.getcwd()},
                              ironpython_env,
                              os.environ,
                              jobconf_env,
                              internal_jobconf_env,
                              self._get_cmdenv())
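# A quick illustration (not part of the original code) of how the two
# _subprocess_env() variants above turn a jobconf key into an environment
# variable name: translate to the emulated Hadoop version, then replace
# '.' with '_'. The version string '2.0' is just an example; per the tests
# in this section, 2.0 uses the newer names, so the key passes through.
from mrjob.compat import translate_jobconf

env_name = translate_jobconf('mapreduce.task.ismap', '2.0').replace('.', '_')
# env_name == 'mapreduce_task_ismap'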
def _simulate_jobconf_for_step(
        self, task_type, step_num, task_num, map_split=None):
    j = {}

    # TODO: these are really poor imitations of Hadoop IDs. See #1254
    j['mapreduce.job.id'] = self._job_key
    j['mapreduce.task.id'] = 'task_%s_%s_%04d%d' % (
        self._job_key, task_type.lower(), step_num, task_num)
    j['mapreduce.task.attempt.id'] = 'attempt_%s_%s_%04d%d_0' % (
        self._job_key, task_type.lower(), step_num, task_num)

    j['mapreduce.task.ismap'] = str(task_type == 'mapper').lower()

    # TODO: is this the correct format?
    j['mapreduce.task.partition'] = str(task_num)

    j['mapreduce.task.output.dir'] = self._output_dir_for_step(step_num)

    working_dir = self._task_working_dir(task_type, step_num, task_num)
    j['mapreduce.job.local.dir'] = working_dir

    for x in ('archive', 'file'):
        named_paths = sorted(
            self._working_dir_mgr.name_to_path(x).items())

        # mapreduce.job.cache.archives
        # mapreduce.job.cache.files
        j['mapreduce.job.cache.%ss' % x] = ','.join(
            '%s#%s' % (path, name) for name, path in named_paths)

        # mapreduce.job.cache.local.archives
        # mapreduce.job.cache.local.files
        j['mapreduce.job.cache.local.%ss' % x] = ','.join(
            join(working_dir, name) for name, path in named_paths)

    if map_split:
        # mapreduce.map.input.file
        # mapreduce.map.input.start
        # mapreduce.map.input.length
        for key, value in map_split.items():
            j['mapreduce.map.input.' + key] = str(value)

    # translate to correct version

    # don't use translate_jobconf_dict(); that's meant to add keys
    # to user-supplied jobconf
    hadoop_version = self.get_hadoop_version()

    if hadoop_version:
        return {translate_jobconf(k, hadoop_version): v
                for k, v in j.items()}
    else:
        return {tk: v for k, v in j.items()
                for tk in translate_jobconf_for_all_versions(k)}
def _args_for_streaming_step(self, step_num):
    hadoop_streaming_jar = self.get_hadoop_streaming_jar()
    if not hadoop_streaming_jar:
        raise Exception('no Hadoop streaming jar')

    mapper, combiner, reducer = (self._hadoop_streaming_commands(step_num))

    args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar]

    # set up uploading from HDFS to the working dir
    args.extend(self._upload_args())

    # if no reducer, shut off reducer tasks. This has to come before
    # extra hadoop args, which could contain jar-specific args
    # (e.g. -outputformat). See #1331.
    #
    # might want to just integrate this into _hadoop_args_for_step?
    if not reducer:
        args.extend([
            '-D', ('%s=0' % translate_jobconf('mapreduce.job.reduces',
                                              self.get_hadoop_version()))
        ])

    # -libjars (#198)
    if self._opts['libjars']:
        args.extend(['-libjars', ','.join(self._opts['libjars'])])

    # Add extra hadoop args first as hadoop args could be a hadoop
    # specific argument (e.g. -libjars) which must come before job
    # specific args.
    args.extend(self._hadoop_args_for_step(step_num))

    # set up input
    for input_uri in self._step_input_uris(step_num):
        args.extend(['-input', input_uri])

    # set up output
    args.append('-output')
    args.append(self._step_output_uri(step_num))

    args.append('-mapper')
    args.append(mapper)

    if combiner:
        args.append('-combiner')
        args.append(combiner)

    if reducer:
        args.append('-reducer')
        args.append(reducer)

    return args
def test_translate_jobconf(self): self.assertEqual(translate_jobconf("user.name", "0.20"), "user.name") self.assertEqual(translate_jobconf("mapreduce.job.user.name", "0.20"), "user.name") self.assertEqual(translate_jobconf("mapreduce.job.user.name", "0.20.2"), "user.name") self.assertEqual(translate_jobconf("user.name", "0.21"), "mapreduce.job.user.name") self.assertEqual(translate_jobconf("user.name", "1.0"), "user.name") self.assertEqual(translate_jobconf("user.name", "2.0"), "mapreduce.job.user.name") self.assertEqual(translate_jobconf("foo.bar", "2.0"), "foo.bar")
def _args_for_streaming_step(self, step_num):
    hadoop_streaming_jar = self.get_hadoop_streaming_jar()
    if not hadoop_streaming_jar:
        raise Exception('no Hadoop streaming jar')

    mapper, combiner, reducer = (
        self._hadoop_streaming_commands(step_num))

    args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar]

    # set up uploading from HDFS to the working dir
    args.extend(
        self._upload_args(self._upload_mgr))

    # if no reducer, shut off reducer tasks. This has to come before
    # extra hadoop args, which could contain jar-specific args
    # (e.g. -outputformat). See #1331.
    #
    # might want to just integrate this into _hadoop_args_for_step?
    if not reducer:
        args.extend(['-D', ('%s=0' % translate_jobconf(
            'mapreduce.job.reduces', self.get_hadoop_version()))])

    # -libjars (#198)
    if self._opts['libjars']:
        args.extend(['-libjars', ','.join(self._opts['libjars'])])

    # Add extra hadoop args first as hadoop args could be a hadoop
    # specific argument (e.g. -libjars) which must come before job
    # specific args.
    args.extend(self._hadoop_args_for_step(step_num))

    # set up input
    for input_uri in self._hdfs_step_input_files(step_num):
        args.extend(['-input', input_uri])

    # set up output
    args.append('-output')
    args.append(self._hdfs_step_output_dir(step_num))

    args.append('-mapper')
    args.append(mapper)

    if combiner:
        args.append('-combiner')
        args.append(combiner)

    if reducer:
        args.append('-reducer')
        args.append(reducer)

    return args
def test_translate_jobconf(self):
    self.assertEqual(translate_jobconf('user.name', '0.18'), 'user.name')
    self.assertEqual(translate_jobconf('mapreduce.job.user.name', '0.18'),
                     'user.name')
    self.assertEqual(translate_jobconf('user.name', '0.19'), 'user.name')
    self.assertEqual(
        translate_jobconf('mapreduce.job.user.name', '0.19.2'),
        'user.name')
    self.assertEqual(translate_jobconf('user.name', '0.21'),
                     'mapreduce.job.user.name')
    self.assertEqual(translate_jobconf('user.name', '1.0'), 'user.name')
    self.assertEqual(translate_jobconf('user.name', '2.0'),
                     'mapreduce.job.user.name')
    self.assertEqual(translate_jobconf('foo.bar', '2.0'), 'foo.bar')
def _hadoop_streaming_jar_args(self, step_num):
    """The arguments that come after ``hadoop jar <streaming jar path>``
    when running a Hadoop streaming job."""
    args = []

    # get command for each part of the job
    mapper, combiner, reducer = (self._hadoop_streaming_commands(step_num))

    # set up uploading from HDFS/cloud storage to the working dir
    args.extend(self._upload_args())

    # if no reducer, shut off reducer tasks. This has to come before
    # extra hadoop args, which could contain jar-specific args
    # (e.g. -outputformat). See #1331.
    #
    # might want to just integrate this into _hadoop_args_for_step?
    if not reducer:
        args.extend([
            '-D', ('%s=0' % translate_jobconf('mapreduce.job.reduces',
                                              self.get_hadoop_version()))
        ])

    # Add extra hadoop args first as hadoop args could be a hadoop
    # specific argument which must come before job specific args.
    args.extend(self._hadoop_args_for_step(step_num))

    # set up input
    for input_uri in self._step_input_uris(step_num):
        args.extend(['-input', input_uri])

    # set up output
    args.append('-output')
    args.append(self._step_output_uri(step_num))

    args.append('-mapper')
    args.append(mapper)

    if combiner:
        args.append('-combiner')
        args.append(combiner)

    if reducer:
        args.append('-reducer')
        args.append(reducer)

    return args
def _hadoop_streaming_jar_args(self, step_num):
    """The arguments that come after ``hadoop jar <streaming jar path>``
    when running a Hadoop streaming job."""
    args = []

    # get command for each part of the job
    mapper, combiner, reducer = (
        self._hadoop_streaming_commands(step_num))

    # set up uploading from HDFS/cloud storage to the working dir
    args.extend(self._upload_args())

    # if no reducer, shut off reducer tasks. This has to come before
    # extra hadoop args, which could contain jar-specific args
    # (e.g. -outputformat). See #1331.
    #
    # might want to just integrate this into _hadoop_args_for_step?
    if not reducer:
        args.extend(['-D', ('%s=0' % translate_jobconf(
            'mapreduce.job.reduces', self.get_hadoop_version()))])

    # Add extra hadoop args first as hadoop args could be a hadoop
    # specific argument which must come before job specific args.
    args.extend(self._hadoop_args_for_step(step_num))

    # set up input
    for input_uri in self._step_input_uris(step_num):
        args.extend(['-input', input_uri])

    # set up output
    args.append('-output')
    args.append(self._step_output_uri(step_num))

    args.append('-mapper')
    args.append(mapper)

    if combiner:
        args.append('-combiner')
        args.append(combiner)

    if reducer:
        args.append('-reducer')
        args.append(reducer)

    return args
def _args_for_streaming_step(self, step_num):
    hadoop_streaming_jar = self.get_hadoop_streaming_jar()
    if not hadoop_streaming_jar:
        raise Exception('no Hadoop streaming jar')

    args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar]

    # set up uploading from HDFS to the working dir
    args.extend(self._upload_args(self._upload_mgr))

    # Add extra hadoop args first as hadoop args could be a hadoop
    # specific argument (e.g. -libjar) which must come before job
    # specific args.
    args.extend(self._hadoop_args_for_step(step_num))

    mapper, combiner, reducer = (self._hadoop_streaming_commands(step_num))

    # if no reducer, shut off reducer tasks
    if not reducer:
        args.extend([
            '-D', ('%s=0' % translate_jobconf('mapreduce.job.reduces',
                                              self.get_hadoop_version()))
        ])

    # set up input
    for input_uri in self._hdfs_step_input_files(step_num):
        args.extend(['-input', input_uri])

    # set up output
    args.append('-output')
    args.append(self._hdfs_step_output_dir(step_num))

    args.append('-mapper')
    args.append(mapper)

    if combiner:
        args.append('-combiner')
        args.append(combiner)

    if reducer:
        args.append('-reducer')
        args.append(reducer)

    return args
def _args_for_streaming_step(self, step_num):
    hadoop_streaming_jar = self.get_hadoop_streaming_jar()
    if not hadoop_streaming_jar:
        raise Exception('no Hadoop streaming jar')

    args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar]

    # set up uploading from HDFS to the working dir
    args.extend(
        self._upload_args(self._upload_mgr))

    # Add extra hadoop args first as hadoop args could be a hadoop
    # specific argument (e.g. -libjar) which must come before job
    # specific args.
    args.extend(self._hadoop_args_for_step(step_num))

    mapper, combiner, reducer = (
        self._hadoop_streaming_commands(step_num))

    # if no reducer, shut off reducer tasks
    if not reducer:
        args.extend(['-D', ('%s=0' % translate_jobconf(
            'mapreduce.job.reduces', self.get_hadoop_version()))])

    # set up input
    for input_uri in self._hdfs_step_input_files(step_num):
        args.extend(['-input', input_uri])

    # set up output
    args.append('-output')
    args.append(self._hdfs_step_output_dir(step_num))

    args.append('-mapper')
    args.append(mapper)

    if combiner:
        args.append('-combiner')
        args.append(combiner)

    if reducer:
        args.append('-reducer')
        args.append(reducer)

    return args
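# Illustrative only: roughly the argument list the streaming-step builders
# above produce for a map-only step (no reducer). Every path, URI, and
# command here is hypothetical; the real values come from _upload_args(),
# _hadoop_args_for_step(), the step input/output helpers, and
# _hadoop_streaming_commands().
args = [
    'hadoop', 'jar', '/path/to/hadoop-streaming.jar',
    '-files', 'hdfs:///tmp/mr_my_job.py#mr_my_job.py',   # upload args
    '-D', 'mapreduce.job.reduces=0',                     # no reducer
    '-input', 'hdfs:///tmp/my_job/input',
    '-output', 'hdfs:///tmp/my_job/step-0-output',
    '-mapper', 'python mr_my_job.py --step-num=0 --mapper',
]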
def _sort_values_jobconf(self):
    """Jobconf dictionary to enable sorting by value."""
    if not self._sort_values:
        return {}

    # translate _SORT_VALUES_JOBCONF to the correct Hadoop version,
    # without logging a warning
    hadoop_version = self.get_hadoop_version()

    jobconf = {}
    for k, v in _SORT_VALUES_JOBCONF.items():
        if hadoop_version:
            jobconf[translate_jobconf(k, hadoop_version)] = v
        else:
            for j in translate_jobconf_for_all_versions(k):
                jobconf[j] = v

    return jobconf
def _process_jobconf_args(self, jobconf):
    if jobconf:
        for (conf_arg, value) in jobconf.iteritems():
            # Internally, use one canonical Hadoop version
            canon_arg = translate_jobconf(conf_arg, '0.21')

            if canon_arg == 'mapreduce.job.maps':
                self._map_tasks = int(value)
                if self._map_tasks < 1:
                    raise ValueError('%s should be at least 1' % conf_arg)
            elif canon_arg == 'mapreduce.job.reduces':
                self._reduce_tasks = int(value)
                if self._reduce_tasks < 1:
                    raise ValueError('%s should be at least 1' % conf_arg)
            elif canon_arg == 'mapreduce.job.local.dir':
                # Hadoop supports multiple directories; we stick with
                # only one here
                if not os.path.isdir(value):
                    raise IOError("Directory %s does not exist" % value)
                self._working_dir = value
def _process_jobconf_args(self, jobconf):
    if jobconf:
        for (conf_arg, value) in jobconf.iteritems():
            # Internally, use one canonical Hadoop version
            canon_arg = translate_jobconf(conf_arg, '0.21')

            if canon_arg == 'mapreduce.job.maps':
                self._map_tasks = int(value)
                if self._map_tasks < 1:
                    raise ValueError(
                        '%s should be at least 1' % conf_arg)
            elif canon_arg == 'mapreduce.job.reduces':
                self._reduce_tasks = int(value)
                if self._reduce_tasks < 1:
                    raise ValueError('%s should be at least 1' % conf_arg)
            elif canon_arg == 'mapreduce.job.local.dir':
                # Hadoop supports multiple directories; we stick with
                # only one here
                if not os.path.isdir(value):
                    raise IOError("Directory %s does not exist" % value)
                self._working_dir = value
def _simulate_jobconf_for_step(self, step_num, step_type, task_num,
                               working_dir, input_file=None,
                               input_start=None, input_length=None):
    """Simulate jobconf variables set by Hadoop to indicate input files,
    files uploaded, working directory, etc. for a particular step.

    Returns a dictionary mapping jobconf variable name
    (e.g. ``'mapreduce.map.input.file'``) to its value, which is
    always a string.
    """
    # By convention, we use the newer (Hadoop 2) jobconf names and
    # translate them at the very end.
    j = {}

    j['mapreduce.job.id'] = self._job_key
    j['mapreduce.task.output.dir'] = self._output_dir
    j['mapreduce.job.local.dir'] = working_dir

    # archives and files for jobconf
    cache_archives = []
    cache_files = []
    cache_local_archives = []
    cache_local_files = []

    files = self._working_dir_mgr.name_to_path('file').items()
    for name, path in files:
        cache_files.append('%s#%s' % (path, name))
        cache_local_files.append(os.path.join(working_dir, name))

    archives = self._working_dir_mgr.name_to_path('archive').items()
    for name, path in archives:
        cache_archives.append('%s#%s' % (path, name))
        cache_local_archives.append(os.path.join(working_dir, name))

    # TODO: could add mtime info (e.g.
    # mapreduce.job.cache.archives.timestamps) here too
    j['mapreduce.job.cache.files'] = (','.join(cache_files))
    j['mapreduce.job.cache.local.files'] = (','.join(cache_local_files))
    j['mapreduce.job.cache.archives'] = (','.join(cache_archives))
    j['mapreduce.job.cache.local.archives'] = (
        ','.join(cache_local_archives))

    # task and attempt IDs

    # TODO: these are a crappy imitation of task/attempt IDs (see #1254)
    j['mapreduce.task.id'] = 'task_%s_%s_%04d%d' % (
        self._job_key, step_type.lower(), step_num, task_num)
    # (we only have one attempt)
    j['mapreduce.task.attempt.id'] = 'attempt_%s_%s_%04d%d_0' % (
        self._job_key, step_type.lower(), step_num, task_num)

    # not actually sure what's correct for combiners here. It'll
    # definitely be true if we're just using pipes to simulate a
    # combiner though
    j['mapreduce.task.ismap'] = str(
        step_type in ('mapper', 'combiner')).lower()

    j['mapreduce.task.partition'] = str(task_num)

    if input_file is not None:
        j['mapreduce.map.input.file'] = input_file

    if input_start is not None:
        j['mapreduce.map.input.start'] = str(input_start)

    if input_length is not None:
        j['mapreduce.map.input.length'] = str(input_length)

    version = self.get_hadoop_version()
    if version:
        # translate to correct version
        j = dict((translate_jobconf(k, version), v)
                 for k, v in j.items())
    else:
        # use all versions
        j = dict((variant, v)
                 for k, v in j.items()
                 for variant in translate_jobconf_for_all_versions(k))

    return j
def _simulate_jobconf_for_step(
        self, step_num, step_type, task_num, working_dir,
        input_file=None, input_start=None, input_length=None):
    """Simulate jobconf variables set by Hadoop to indicate input files,
    files uploaded, working directory, etc. for a particular step.

    Returns a dictionary mapping jobconf variable name
    (e.g. ``'mapreduce.map.input.file'``) to its value, which is
    always a string.
    """
    # By convention, we use the newer (Hadoop 0.21+) jobconf names and
    # translate them at the very end.
    j = {}

    j['mapreduce.job.id'] = self._job_name
    j['mapreduce.task.output.dir'] = self._output_dir
    j['mapreduce.job.local.dir'] = working_dir

    # archives and files for jobconf
    cache_archives = []
    cache_files = []
    cache_local_archives = []
    cache_local_files = []

    files = self._working_dir_mgr.name_to_path('file').iteritems()
    for name, path in files:
        cache_files.append('%s#%s' % (path, name))
        cache_local_files.append(os.path.join(working_dir, name))

    archives = self._working_dir_mgr.name_to_path('archive').iteritems()
    for name, path in archives:
        cache_archives.append('%s#%s' % (path, name))
        cache_local_archives.append(os.path.join(working_dir, name))

    # TODO: could add mtime info (e.g.
    # mapreduce.job.cache.archives.timestamps) here too
    j['mapreduce.job.cache.files'] = (','.join(cache_files))
    j['mapreduce.job.cache.local.files'] = (','.join(cache_local_files))
    j['mapreduce.job.cache.archives'] = (','.join(cache_archives))
    j['mapreduce.job.cache.local.archives'] = (
        ','.join(cache_local_archives))

    # task and attempt IDs
    j['mapreduce.task.id'] = 'task_%s_%s_%05d%d' % (
        self._job_name, step_type.lower(), step_num, task_num)
    # (we only have one attempt)
    j['mapreduce.task.attempt.id'] = 'attempt_%s_%s_%05d%d_0' % (
        self._job_name, step_type.lower(), step_num, task_num)

    # not actually sure what's correct for combiners here. It'll
    # definitely be true if we're just using pipes to simulate a
    # combiner though
    j['mapreduce.task.ismap'] = str(
        step_type in ('mapper', 'combiner')).lower()

    j['mapreduce.task.partition'] = str(task_num)

    if input_file is not None:
        j['mapreduce.map.input.file'] = input_file

    if input_start is not None:
        j['mapreduce.map.input.start'] = str(input_start)

    if input_length is not None:
        j['mapreduce.map.input.length'] = str(input_length)

    # translate to correct version
    version = self.get_hadoop_version()
    j = dict((translate_jobconf(k, version), v)
             for k, v in j.iteritems())

    return j
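# Illustrative only: given a made-up job name and mapper task 3 of step 0,
# this is how the untranslated (0.21+ style) task IDs built in the variant
# directly above come out. It mirrors that variant's %05d format; the
# earlier variant uses %04d and self._job_key instead.
job_name = 'mr_wc.dave.20240101.000000'
task_id = 'task_%s_%s_%05d%d' % (job_name, 'mapper', 0, 3)
attempt_id = 'attempt_%s_%s_%05d%d_0' % (job_name, 'mapper', 0, 3)
# task_id    == 'task_mr_wc.dave.20240101.000000_mapper_000003'
# attempt_id == 'attempt_mr_wc.dave.20240101.000000_mapper_000003_0'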