def test_supports_combiners(self):
    """Combiner support in Hadoop Streaming starts at version 0.20."""
    cases = [
        ('0.19', False),
        ('0.19.2', False),
        ('0.20', True),
        ('0.20.203', True),
    ]
    for version, expected in cases:
        self.assertEqual(
            supports_combiners_in_hadoop_streaming(version), expected)
# NOTE(review): this method appears to be a byte-for-byte duplicate of an
# identically-named test defined just above; if both live in the same class,
# the later definition silently shadows the earlier one — confirm and remove
# one of them.
def test_supports_combiners(self):
    """Versions before 0.20 lack combiner support; 0.20+ has it."""
    self.assertEqual(
        supports_combiners_in_hadoop_streaming('0.19'), False)
    self.assertEqual(
        supports_combiners_in_hadoop_streaming('0.19.2'), False)
    self.assertEqual(
        supports_combiners_in_hadoop_streaming('0.20'), True)
    self.assertEqual(
        supports_combiners_in_hadoop_streaming('0.20.203'), True)
def _hadoop_streaming_commands(self, step_num):
    """Render the mapper, combiner, and reducer command lines for one step.

    Returns a ``(mapper, combiner, reducer)`` tuple of command strings;
    *combiner* is ``None`` when the step has no usable combiner. On Hadoop
    versions without streaming-combiner support, the combiner is folded
    into the mapper via a shell pipeline instead.
    """
    hadoop_version = self.get_hadoop_version()

    # Hadoop streaming stuff
    mapper, wrap_mapper = self._render_substep(step_num, 'mapper')
    combiner, wrap_combiner = self._render_substep(step_num, 'combiner')
    reducer, wrap_reducer = self._render_substep(step_num, 'reducer')

    combiner_unsupported = (
        combiner is not None and
        not supports_combiners_in_hadoop_streaming(hadoop_version))

    if combiner_unsupported:
        # krazy hack to support combiners on hadoop <0.20: run the
        # combiner inside the mapper's shell pipeline, then hide the
        # combiner from hadoop so it isn't confused by it
        mapper = "%s | sort | %s" % (mapper, combiner)
        wrap_mapper = True
        combiner, wrap_combiner = None, False

    if wrap_mapper:
        mapper = bash_wrap(mapper)
    if wrap_combiner:
        combiner = bash_wrap(combiner)
    if wrap_reducer:
        reducer = bash_wrap(reducer)

    return mapper, combiner, reducer
def _run_job_in_hadoop(self):
    """Run every step of the job through the hadoop streaming jar.

    For each step: build the ``hadoop jar`` argument list, launch it with
    :class:`~subprocess.Popen`, stream its stderr for status, and either
    collect counters (success) or raise :class:`Exception` with the
    probable cause of failure appended (non-zero return code).

    Fixes vs. previous version: removed an unreachable
    ``raise CalledProcessError(...)`` that sat directly after
    ``raise Exception(msg)``, and a redundant second
    ``self.get_hadoop_version()`` call inside the combiner branch
    (``version`` is already fetched once before the loop).
    """
    # figure out local names for our files
    self._name_files()

    # send script and wrapper script (if any) to working dir
    assert self._script  # shouldn't be able to run if no script
    self._script['upload'] = 'file'
    if self._wrapper_script:
        self._wrapper_script['upload'] = 'file'

    self._counters = []
    steps = self._get_steps()

    version = self.get_hadoop_version()

    for step_num, step in enumerate(steps):
        log.debug('running step %d of %d' % (step_num + 1, len(steps)))

        streaming_args = (self._opts['hadoop_bin'] +
                          ['jar', self._opts['hadoop_streaming_jar']])

        # -files/-archives (generic options, new-style)
        if compat.supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            streaming_args.extend(self._upload_args())

        # Add extra hadoop args first as hadoop args could be a hadoop
        # specific argument (e.g. -libjar) which must come before job
        # specific args.
        streaming_args.extend(
            self._hadoop_conf_args(step_num, len(steps)))

        # set up input
        for input_uri in self._hdfs_step_input_files(step_num):
            streaming_args.extend(['-input', input_uri])

        # set up output
        streaming_args.append('-output')
        streaming_args.append(self._hdfs_step_output_dir(step_num))

        # -cacheFile/-cacheArchive (streaming options, old-style)
        if not compat.supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            streaming_args.extend(self._upload_args())

        # set up mapper and reducer
        if 'M' not in step:
            mapper = 'cat'
        else:
            mapper = cmd_line(self._mapper_args(step_num))

        if 'C' in step:
            combiner_cmd = cmd_line(self._combiner_args(step_num))
            if compat.supports_combiners_in_hadoop_streaming(version):
                combiner = combiner_cmd
            else:
                # pre-0.20 fallback: run the combiner inside the
                # mapper's shell pipeline and hide it from hadoop
                mapper = ("bash -c '%s | sort | %s'" %
                          (mapper, combiner_cmd))
                combiner = None
        else:
            combiner = None

        streaming_args.append('-mapper')
        streaming_args.append(mapper)

        if combiner:
            streaming_args.append('-combiner')
            streaming_args.append(combiner)

        if 'R' in step:
            streaming_args.append('-reducer')
            streaming_args.append(cmd_line(self._reducer_args(step_num)))
        else:
            streaming_args.extend(['-jobconf', 'mapred.reduce.tasks=0'])

        log.debug('> %s' % cmd_line(streaming_args))
        step_proc = Popen(streaming_args, stdout=PIPE, stderr=PIPE)

        # TODO: use a pty or something so that the hadoop binary
        # won't buffer the status messages
        self._process_stderr_from_streaming(step_proc.stderr)

        # there shouldn't be much output to STDOUT
        for line in step_proc.stdout:
            log.error('STDOUT: ' + line.strip('\n'))

        returncode = step_proc.wait()
        if returncode == 0:
            # parsing needs step number for whole job
            self._fetch_counters([step_num + self._start_step_num])
            # printing needs step number relevant to this run of mrjob
            self.print_counters([step_num + 1])
        else:
            msg = ('Job failed with return code %d: %s' %
                   (step_proc.returncode, streaming_args))
            log.error(msg)
            # look for a Python traceback
            cause = self._find_probable_cause_of_failure(
                [step_num + self._start_step_num])
            if cause:
                # log cause, and put it in exception
                cause_msg = []  # lines to log and put in exception
                cause_msg.append('Probable cause of failure (from %s):' %
                                 cause['log_file_uri'])
                cause_msg.extend(
                    line.strip('\n') for line in cause['lines'])
                if cause['input_uri']:
                    cause_msg.append('(while reading from %s)' %
                                     cause['input_uri'])

                for line in cause_msg:
                    log.error(line)

                # add cause_msg to exception message
                msg += '\n' + '\n'.join(cause_msg) + '\n'

            raise Exception(msg)