def test_cache_opts(self):
    self.assertEqual(supports_new_distributed_cache_options('0.18'), False)
    self.assertEqual(supports_new_distributed_cache_options('0.20'), False)
    self.assertEqual(
        supports_new_distributed_cache_options('0.20.203'), True)
    # default to True
    self.assertEqual(supports_new_distributed_cache_options(None), True)

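# NOTE: a minimal sketch of the compat helper exercised by the test above,
# assuming a simple LooseVersion comparison against 0.20.203 (the version at
# which -files/-archives became generic options, per the test expectations).
# This is illustrative only, not mrjob's actual implementation.
from distutils.version import LooseVersion


def supports_new_distributed_cache_options(version):
    """Return True if this Hadoop version accepts -files/-archives as
    generic options; default to True when the version is unknown."""
    if version is None:
        return True
    return LooseVersion(version) >= LooseVersion('0.20.203')
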
def _args_for_streaming_step(self, step_num):
    version = self.get_hadoop_version()

    hadoop_streaming_jar = self.get_hadoop_streaming_jar()
    if not hadoop_streaming_jar:
        raise Exception('no Hadoop streaming jar')

    args = self.get_hadoop_bin() + ['jar', hadoop_streaming_jar]

    # -files/-archives (generic options, new-style)
    if supports_new_distributed_cache_options(version):
        # set up uploading from HDFS to the working dir
        args.extend(self._upload_args(self._upload_mgr))

    # Add extra hadoop args first as hadoop args could be a hadoop
    # specific argument (e.g. -libjar) which must come before job
    # specific args.
    args.extend(self._hadoop_args_for_step(step_num))

    # set up input
    for input_uri in self._hdfs_step_input_files(step_num):
        args.extend(['-input', input_uri])

    # set up output
    args.append('-output')
    args.append(self._hdfs_step_output_dir(step_num))

    # -cacheFile/-cacheArchive (streaming options, old-style)
    if not supports_new_distributed_cache_options(version):
        # set up uploading from HDFS to the working dir
        args.extend(self._pre_0_20_upload_args(self._upload_mgr))

    mapper, combiner, reducer = (
        self._hadoop_streaming_commands(step_num))

    args.append('-mapper')
    args.append(mapper)

    if combiner:
        args.append('-combiner')
        args.append(combiner)

    if reducer:
        args.append('-reducer')
        args.append(reducer)
    else:
        args.extend(['-jobconf', 'mapred.reduce.tasks=0'])

    return args

def _streaming_args(self, step, step_num, num_steps):
    version = self.get_hadoop_version()

    streaming_args = (self._opts['hadoop_bin'] +
                      ['jar', self._opts['hadoop_streaming_jar']])

    # -files/-archives (generic options, new-style)
    if supports_new_distributed_cache_options(version):
        # set up uploading from HDFS to the working dir
        streaming_args.extend(
            self._new_upload_args(self._upload_mgr))

    # Add extra hadoop args first as hadoop args could be a hadoop
    # specific argument (e.g. -libjar) which must come before job
    # specific args.
    streaming_args.extend(
        self._hadoop_conf_args(step, step_num, num_steps))

    # set up input
    for input_uri in self._hdfs_step_input_files(step_num):
        streaming_args.extend(['-input', input_uri])

    # set up output
    streaming_args.append('-output')
    streaming_args.append(self._hdfs_step_output_dir(step_num))

    # -cacheFile/-cacheArchive (streaming options, old-style)
    if not supports_new_distributed_cache_options(version):
        # set up uploading from HDFS to the working dir
        streaming_args.extend(
            self._old_upload_args(self._upload_mgr))

    mapper, combiner, reducer = (
        self._hadoop_streaming_commands(step, step_num))

    streaming_args.append('-mapper')
    streaming_args.append(mapper)

    if combiner:
        streaming_args.append('-combiner')
        streaming_args.append(combiner)

    if reducer:
        streaming_args.append('-reducer')
        streaming_args.append(reducer)
    else:
        streaming_args.extend(['-jobconf', 'mapred.reduce.tasks=0'])

    return streaming_args

def _args_for_streaming_step(self, step_num):
    version = self.get_hadoop_version()

    args = (self._opts['hadoop_bin'] +
            ['jar', self._opts['hadoop_streaming_jar']])

    # -files/-archives (generic options, new-style)
    if supports_new_distributed_cache_options(version):
        # set up uploading from HDFS to the working dir
        args.extend(self._new_upload_args(self._upload_mgr))

    # Add extra hadoop args first as hadoop args could be a hadoop
    # specific argument (e.g. -libjar) which must come before job
    # specific args.
    args.extend(self._hadoop_args_for_step(step_num))

    # set up input
    for input_uri in self._hdfs_step_input_files(step_num):
        args.extend(['-input', input_uri])

    # set up output
    args.append('-output')
    args.append(self._hdfs_step_output_dir(step_num))

    # -cacheFile/-cacheArchive (streaming options, old-style)
    if not supports_new_distributed_cache_options(version):
        # set up uploading from HDFS to the working dir
        args.extend(self._old_upload_args(self._upload_mgr))

    mapper, combiner, reducer = (
        self._hadoop_streaming_commands(step_num))

    args.append('-mapper')
    args.append(mapper)

    if combiner:
        args.append('-combiner')
        args.append(combiner)

    if reducer:
        args.append('-reducer')
        args.append(reducer)
    else:
        args.extend(['-jobconf', 'mapred.reduce.tasks=0'])

    return args

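# For illustration only: the args assembled above correspond to a command
# line roughly like the following. The jar path, URIs, and mapper/reducer
# commands below are hypothetical examples, not values from the source.
#
#   hadoop jar /path/to/hadoop-streaming.jar \
#       -files 'hdfs:///tmp/mrjob/files/mr_job.py#mr_job.py' \
#       -input hdfs:///tmp/mrjob/step-0-input \
#       -output hdfs:///tmp/mrjob/step-0-output \
#       -mapper 'python mr_job.py --step-num=0 --mapper' \
#       -reducer 'python mr_job.py --step-num=0 --reducer'
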
def _upload_args(self):
    """Args to upload files from HDFS to the hadoop nodes."""
    args = []

    version = self.get_hadoop_version()
    if compat.supports_new_distributed_cache_options(version):
        # return list of strings ready for comma-joining for passing to the
        # hadoop binary
        def escaped_paths(file_dicts):
            return ["%s#%s" % (fd['hdfs_uri'], fd['name'])
                    for fd in file_dicts]

        # index by type
        all_files = {}
        for fd in self._files:
            all_files.setdefault(fd.get('upload'), []).append(fd)

        if 'file' in all_files:
            args.append('-files')
            args.append(','.join(escaped_paths(all_files['file'])))

        if 'archive' in all_files:
            args.append('-archives')
            args.append(','.join(escaped_paths(all_files['archive'])))
    else:
        for file_dict in self._files:
            if file_dict.get('upload') == 'file':
                args.append('-cacheFile')
                args.append(
                    '%s#%s' % (file_dict['hdfs_uri'], file_dict['name']))
            elif file_dict.get('upload') == 'archive':
                args.append('-cacheArchive')
                args.append(
                    '%s#%s' % (file_dict['hdfs_uri'], file_dict['name']))

    return args

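# For illustration only: given a hypothetical file dict such as
#   {'upload': 'file', 'hdfs_uri': 'hdfs:///tmp/mr_job.py', 'name': 'mr_job.py'}
# the new-style branch above yields
#   ['-files', 'hdfs:///tmp/mr_job.py#mr_job.py']
# while the old-style branch yields
#   ['-cacheFile', 'hdfs:///tmp/mr_job.py#mr_job.py']
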
def _run_job_in_hadoop(self):
    # figure out local names for our files
    self._name_files()

    # send script and wrapper script (if any) to working dir
    assert self._script  # shouldn't be able to run if no script
    self._script['upload'] = 'file'
    if self._wrapper_script:
        self._wrapper_script['upload'] = 'file'

    self._counters = []
    steps = self._get_steps()

    version = self.get_hadoop_version()

    for step_num, step in enumerate(steps):
        log.debug('running step %d of %d' % (step_num + 1, len(steps)))

        streaming_args = (self._opts['hadoop_bin'] +
                          ['jar', self._opts['hadoop_streaming_jar']])

        # -files/-archives (generic options, new-style)
        if compat.supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            streaming_args.extend(self._upload_args())

        # Add extra hadoop args first as hadoop args could be a hadoop
        # specific argument (e.g. -libjar) which must come before job
        # specific args.
        streaming_args.extend(
            self._hadoop_conf_args(step_num, len(steps)))

        # set up input
        for input_uri in self._hdfs_step_input_files(step_num):
            streaming_args.extend(['-input', input_uri])

        # set up output
        streaming_args.append('-output')
        streaming_args.append(self._hdfs_step_output_dir(step_num))

        # -cacheFile/-cacheArchive (streaming options, old-style)
        if not compat.supports_new_distributed_cache_options(version):
            # set up uploading from HDFS to the working dir
            streaming_args.extend(self._upload_args())

        # set up mapper and reducer
        if 'M' not in step:
            mapper = 'cat'
        else:
            mapper = cmd_line(self._mapper_args(step_num))

        if 'C' in step:
            combiner_cmd = cmd_line(self._combiner_args(step_num))
            version = self.get_hadoop_version()
            if compat.supports_combiners_in_hadoop_streaming(version):
                combiner = combiner_cmd
            else:
                mapper = ("bash -c '%s | sort | %s'" %
                          (mapper, combiner_cmd))
                combiner = None
        else:
            combiner = None

        streaming_args.append('-mapper')
        streaming_args.append(mapper)

        if combiner:
            streaming_args.append('-combiner')
            streaming_args.append(combiner)

        if 'R' in step:
            streaming_args.append('-reducer')
            streaming_args.append(cmd_line(self._reducer_args(step_num)))
        else:
            streaming_args.extend(['-jobconf', 'mapred.reduce.tasks=0'])

        log.debug('> %s' % cmd_line(streaming_args))

        step_proc = Popen(streaming_args, stdout=PIPE, stderr=PIPE)

        # TODO: use a pty or something so that the hadoop binary
        # won't buffer the status messages
        self._process_stderr_from_streaming(step_proc.stderr)

        # there shouldn't be much output to STDOUT
        for line in step_proc.stdout:
            log.error('STDOUT: ' + line.strip('\n'))

        returncode = step_proc.wait()

        if returncode == 0:
            # parsing needs step number for whole job
            self._fetch_counters([step_num + self._start_step_num])
            # printing needs step number relevant to this run of mrjob
            self.print_counters([step_num + 1])
        else:
            msg = ('Job failed with return code %d: %s' %
                   (step_proc.returncode, streaming_args))
            log.error(msg)
            # look for a Python traceback
            cause = self._find_probable_cause_of_failure(
                [step_num + self._start_step_num])
            if cause:
                # log cause, and put it in exception
                cause_msg = []  # lines to log and put in exception
                cause_msg.append('Probable cause of failure (from %s):' %
                                 cause['log_file_uri'])
                cause_msg.extend(
                    line.strip('\n') for line in cause['lines'])
                if cause['input_uri']:
                    cause_msg.append('(while reading from %s)' %
                                     cause['input_uri'])

                for line in cause_msg:
                    log.error(line)

                # add cause_msg to exception message
                msg += '\n' + '\n'.join(cause_msg) + '\n'

                raise Exception(msg)

            raise CalledProcessError(step_proc.returncode, streaming_args)