def create_filesystem(self, opts):
    hadoopopt = getopt(opts, 'hadoop', delete=False)
    hadoopshort = hadoopopt[0]
    hadoopdir = findhadoop(hadoopopt[0])
    allopts = configopts('streaming')
    allopts += configopts('streaming_' + hadoopshort)
    streamingjar = getopt(allopts, 'streamingjar')
    if streamingjar:
        streamingjar = streamingjar[0]
    return StreamingFileSystem(hadoopdir, streamingjar)
def create_filesystem(self, opts):
    # are we given a specific shell?
    shell = getopt(opts, "shell", delete=False)
    if shell:
        return UnixFileSystem(shell[0])
    else:
        return UnixFileSystem()
def cat(self, path, opts):
    addedopts = getopts(opts, ['libjar'], delete=False)
    streamingjar = findjar(self.hadoop, 'streaming')
    if not streamingjar:
        print >> sys.stderr, 'ERROR: Streaming jar not found'
        return 1
    hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'],
                    shortcuts=dict(configopts('jars')))
    try:
        import typedbytes
        ls = os.popen('%s %s dfs -ls %s' % (hadenv, self.hdfs, path))
        if sum(c in path for c in ("*", "?", "{")) > 0:
            # cat each file separately when the path contains special chars
            lineparts = (line.split()[-1] for line in ls)
            subpaths = [part for part in lineparts if part.startswith("/")]
        else:
            # we still do the ls even in this case to make sure we print errors
            subpaths = [path]
        ls.close()
        for subpath in subpaths:
            if subpath.endswith("/_logs"):
                continue
            dumptb = os.popen('%s %s/bin/hadoop jar %s dumptb %s 2> /dev/null'
                              % (hadenv, self.hadoop, streamingjar, subpath))
            ascodeopt = getopt(opts, 'ascode')
            if ascodeopt and ascodeopt[0] == 'yes':
                outputs = dumpcode(typedbytes.PairedInput(dumptb))
            else:
                outputs = dumptext(typedbytes.PairedInput(dumptb))
            for output in outputs:
                print '\t'.join(output)
            dumptb.close()
    except IOError:
        pass  # ignore
    return 0
def cat(self, path, opts):
    addedopts = getopts(opts, ['libjar'], delete=False)
    streamingjar = findjar(self.hadoop, 'streaming')
    if not streamingjar:
        print >> sys.stderr, 'ERROR: Streaming jar not found'
        return 1
    hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'],
                    shortcuts=dict(configopts('jars')))
    try:
        import typedbytes
        ls = os.popen('%s %s/bin/hadoop dfs -ls %s' % (hadenv, self.hadoop, path))
        if sum(c in path for c in ("*", "?", "{")) > 0:
            # cat each file separately when the path contains special chars
            lineparts = (line.split()[-1] for line in ls)
            subpaths = [part for part in lineparts if part.startswith("/")]
        else:
            # we still do the ls even in this case to make sure we print errors
            subpaths = [path]
        ls.close()
        for subpath in subpaths:
            dumptb = os.popen('%s %s/bin/hadoop jar %s dumptb %s 2> /dev/null'
                              % (hadenv, self.hadoop, streamingjar, subpath))
            ascodeopt = getopt(opts, 'ascode')
            if ascodeopt and ascodeopt[0] == 'yes':
                outputs = dumpcode(typedbytes.PairedInput(dumptb))
            else:
                outputs = dumptext(typedbytes.PairedInput(dumptb))
            for output in outputs:
                print '\t'.join(output)
            dumptb.close()
    except IOError:
        pass  # ignore
    return 0
def test_getopt(self):
    # Test for backward compatibility
    opts = []
    values = getopt(opts, 'input')
    self.assertEquals(values, [])
    self.assertEquals(opts, [])

    opts = [('param', 'p1'), ('param', 'p2'), ('input', '/dev/path')]
    values = getopt(opts, 'param')
    expected = ['p2', 'p1']
    self.assertEquals(set(values), set(expected))
    self.assertEquals(set(opts), set([('input', '/dev/path')]))

    opts = [('output', '/prod/path')]
    values = getopt(opts, 'output', delete=False)
    self.assertEquals(values, ['/prod/path'])
    self.assertEquals(opts, [('output', '/prod/path')])
    values = getopt(opts, 'output')
    self.assertEquals(values, ['/prod/path'])
    self.assertEquals(opts, [])
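# The test above pins down the getopt contract used throughout this code:
# opts is a list of (key, value) pairs, getopt returns every value recorded
# for a key, and it removes the matched pairs unless delete=False is passed.
# The following is a minimal sketch of that contract (not dumbo's actual
# implementation; the name getopt_sketch is illustrative only):
def getopt_sketch(opts, key, delete=True):
    values = [v for k, v in opts if k == key]
    if delete:
        # mutate the list in place so callers see the options disappear
        opts[:] = [(k, v) for k, v in opts if k != key]
    return values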
def launch(self, mapper, reducer=None, combiner=None, opts=None, *args, **kwargs):
    "Copied from dumbo.core.run"
    if not opts:
        opts = []
    if type(mapper) == str:
        opts.append(('mapper', mapper))
    elif hasattr(mapper, 'opts'):
        opts += mapper.opts
    if type(reducer) == str:
        opts.append(('reducer', reducer))
    elif hasattr(reducer, 'opts'):
        opts += reducer.opts
    if type(combiner) == str:
        opts.append(('combiner', combiner))
    opts += [
        ('param', 'FLOW_INPUTS=%s' % ';'.join(self.flow.inputs)),
        ('param', 'FLOW_OUTPUTS=%s' % ';'.join(self.flow.outputs)),
    ]
    opts += self.flow.opts
    opts = override_opts(opts, self.get_connect_opts())
    if not reducer:
        opts.append(('numreducetasks', '0'))
    progopt = getopt(opts, 'prog')
    hadoopopt = getopt(opts, 'hadoop', delete=False)
    if hadoopopt:
        retval = StreamingIteration(progopt[0], opts).run()
    else:
        retval = UnixIteration(progopt[0], opts).run()
    if retval == 127:
        print >> sys.stderr, 'ERROR: Are you sure that "python" is on your path?'
    if retval != 0:
        sys.exit(retval)
def matches(self, opts): return bool(getopt(opts, "hadoop", delete=False))
def run(self):
    retval = Iteration.run(self)
    if retval != 0:
        return retval
    if os.path.exists(self.prog):
        self.opts.append(('file', self.prog))
    addedopts = getopts(self.opts, ['hadoop', 'name', 'delinputs', 'libegg',
                                    'libjar', 'libjarstreaming', 'inputformat',
                                    'outputformat', 'nummaptasks',
                                    'numreducetasks', 'priority', 'queue',
                                    'cachefile', 'cachearchive', 'file',
                                    'codewritable', 'addpath', 'getpath',
                                    'python', 'streamoutput', 'pypath'])
    hadoop = findhadoop(addedopts['hadoop'][0])
    streamingjar = getopt(self.opts, 'streamingjar')
    if streamingjar is None or len(streamingjar) == 0:
        streamingjar = findjar(hadoop, 'streaming')
    else:
        streamingjar = streamingjar[0]
    if not streamingjar:
        print >> sys.stderr, 'ERROR: Streaming jar not found'
        return 1
    # add typedbytes to path
    try:
        import typedbytes
    except ImportError:
        print >> sys.stderr, 'ERROR: "typedbytes" module not found'
        return 1
    modpath = re.sub('\.egg.*$', '.egg', typedbytes.__file__)
    if modpath.endswith('.egg'):
        addedopts['libegg'].append(modpath)
    else:
        self.opts.append(('file', modpath))
    # add ctypedbytes to job
    try:
        import ctypedbytes
        print >> sys.stderr, 'INFO: "ctypedbytes" found!'
        modpath = re.sub('\.egg.*$', '.egg', ctypedbytes.__file__)
        if modpath.endswith('.egg'):
            addedopts['libegg'].append(modpath)
    except ImportError:
        pass
    self.opts.append(('jobconf', 'stream.map.input=typedbytes'))
    self.opts.append(('jobconf', 'stream.reduce.input=typedbytes'))
    if addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0':
        self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
        if addedopts['streamoutput']:
            id_ = addedopts['streamoutput'][0]
            self.opts.append(('jobconf', 'stream.map.output=' + id_))
        else:
            self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
    else:
        self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
        if addedopts['streamoutput']:
            id_ = addedopts['streamoutput'][0]
            self.opts.append(('jobconf', 'stream.reduce.output=' + id_))
        else:
            self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
    if not addedopts['name']:
        self.opts.append(('jobconf', 'mapred.job.name='
                          + self.prog.split('/')[-1]))
    else:
        self.opts.append(('jobconf', 'mapred.job.name=%s'
                          % addedopts['name'][0]))
    if addedopts['nummaptasks']:
        self.opts.append(('jobconf', 'mapred.map.tasks=%s'
                          % addedopts['nummaptasks'][0]))
    if addedopts['numreducetasks']:
        numreducetasks = int(addedopts['numreducetasks'][0])
        self.opts.append(('numReduceTasks', str(numreducetasks)))
    if addedopts['priority']:
        self.opts.append(('jobconf', 'mapred.job.priority=%s'
                          % addedopts['priority'][0]))
    if addedopts['queue']:
        self.opts.append(('jobconf', 'mapred.job.queue.name=%s'
                          % addedopts['queue'][0]))
    if addedopts['cachefile']:
        for cachefile in addedopts['cachefile']:
            self.opts.append(('cacheFile', cachefile))
    if addedopts['cachearchive']:
        for cachearchive in addedopts['cachearchive']:
            self.opts.append(('cacheArchive', cachearchive))
    if addedopts['file']:
        for file in addedopts['file']:
            if not '://' in file:
                if not os.path.exists(file):
                    raise ValueError('file "' + file + '" does not exist')
                file = 'file://' + os.path.abspath(file)
            self.opts.append(('file', file))
    if not addedopts['inputformat']:
        addedopts['inputformat'] = ['auto']
    inputformat_shortcuts = \
        {'code': 'org.apache.hadoop.streaming.AutoInputFormat',
         'text': 'org.apache.hadoop.mapred.TextInputFormat',
         'sequencefile': 'org.apache.hadoop.streaming.AutoInputFormat',
         'auto': 'org.apache.hadoop.streaming.AutoInputFormat'}
    inputformat_shortcuts.update(configopts('inputformats', self.prog))
    inputformat = addedopts['inputformat'][0]
    if inputformat_shortcuts.has_key(inputformat.lower()):
        inputformat = inputformat_shortcuts[inputformat.lower()]
    self.opts.append(('inputformat', inputformat))
    if not addedopts['outputformat']:
        addedopts['outputformat'] = ['sequencefile']
    if addedopts['getpath'] and addedopts['getpath'] != 'no':
        outputformat_shortcuts = \
            {'code': 'fm.last.feathers.output.MultipleSequenceFiles',
             'text': 'fm.last.feathers.output.MultipleTextFiles',
             'raw': 'fm.last.feathers.output.MultipleRawFileOutputFormat',
             'sequencefile': 'fm.last.feathers.output.MultipleSequenceFiles'}
    else:
        outputformat_shortcuts = \
            {'code': 'org.apache.hadoop.mapred.SequenceFileOutputFormat',
             'text': 'org.apache.hadoop.mapred.TextOutputFormat',
             'raw': 'fm.last.feathers.output.RawFileOutputFormat',
             'sequencefile': 'org.apache.hadoop.mapred.SequenceFileOutputFormat'}
    outputformat_shortcuts.update(configopts('outputformats', self.prog))
    outputformat = addedopts['outputformat'][0]
    if outputformat_shortcuts.has_key(outputformat.lower()):
        outputformat = outputformat_shortcuts[outputformat.lower()]
    self.opts.append(('outputformat', outputformat))
    if addedopts['addpath'] and addedopts['addpath'][0] != 'no':
        self.opts.append(('cmdenv', 'dumbo_addpath=true'))
    pyenv = envdef('PYTHONPATH', addedopts['libegg'], 'file', self.opts,
                   shortcuts=dict(configopts('eggs', self.prog)),
                   quote=False, trim=True, extrapaths=addedopts['pypath'])
    if pyenv:
        self.opts.append(('cmdenv', pyenv))
    if addedopts['libjarstreaming'] and addedopts['libjarstreaming'][0] != 'no':
        addedopts['libjar'].append(streamingjar)
    hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'], 'libjar',
                    self.opts, shortcuts=dict(configopts('jars', self.prog)))
    fileopt = getopt(self.opts, 'file')
    if fileopt:
        tmpfiles = []
        for file in fileopt:
            if file.startswith('file://'):
                self.opts.append(('file', file[7:]))
            else:
                tmpfiles.append(file)
        if tmpfiles:
            self.opts.append(('jobconf', 'tmpfiles=' + ','.join(tmpfiles)))
    libjaropt = getopt(self.opts, 'libjar')
    if libjaropt:
        tmpjars = []
        for jar in libjaropt:
            if jar.startswith('file://'):
                self.opts.append(('file', jar[7:]))
            else:
                tmpjars.append(jar)
        if tmpjars:
            self.opts.append(('jobconf', 'tmpjars=' + ','.join(tmpjars)))
    cmd = hadoop + '/bin/hadoop jar ' + streamingjar
    retval = execute(cmd, self.opts, hadenv)
    if addedopts['delinputs'] and addedopts['delinputs'][0] == 'yes':
        for (key, value) in self.opts:
            if key == 'input':
                if os.path.exists(hadoop + "/bin/hdfs"):
                    hdfs = hadoop + "/bin/hdfs"
                else:
                    hdfs = hadoop + "/bin/hadoop"
                execute("%s dfs -rmr '%s'" % (hdfs, value))
    return retval
def __init__(self, prog, opts):
    Iteration.__init__(self, prog, opts)
    self.opts += configopts('streaming', prog, self.opts)
    hadoop = getopt(self.opts, 'hadoop', delete=False)[0]
    self.opts += configopts('streaming_' + hadoop, prog, self.opts)
def create_filesystem(self, opts):
    hadoopopt = getopt(opts, 'hadoop', delete=False)
    return StreamingFileSystem(findhadoop(hadoopopt[0]))
def cat(self, path, opts):
    ascodeopt = getopt(opts, 'ascode')
    if ascodeopt and ascodeopt[0] == 'yes':
        return self._cat(path, opts, dumpcode, outputs=True)
    else:
        return self._cat(path, opts, dumptext, outputs=True)
def run(self):
    retval = Iteration.run(self)
    if retval != 0:
        return retval
    if os.path.exists(self.prog):
        self.opts.append(('file', self.prog))
    addedopts = getopts(self.opts, ['hadoop', 'name', 'delinputs', 'libegg',
                                    'libjar', 'inputformat', 'outputformat',
                                    'nummaptasks', 'numreducetasks',
                                    'priority', 'queue', 'cachefile',
                                    'cachearchive', 'file', 'codewritable',
                                    'addpath', 'getpath', 'python',
                                    'streamoutput', 'pypath'])
    hadoop = findhadoop(addedopts['hadoop'][0])
    streamingjar = findjar(hadoop, 'streaming')
    if not streamingjar:
        print >> sys.stderr, 'ERROR: Streaming jar not found'
        return 1
    try:
        import typedbytes
    except ImportError:
        print >> sys.stderr, 'ERROR: "typedbytes" module not found'
        return 1
    modpath = re.sub('\.egg.*$', '.egg', typedbytes.__file__)
    if modpath.endswith('.egg'):
        addedopts['libegg'].append(modpath)
    else:
        self.opts.append(('file', modpath))
    self.opts.append(('jobconf', 'stream.map.input=typedbytes'))
    self.opts.append(('jobconf', 'stream.reduce.input=typedbytes'))
    if addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0':
        self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
        if addedopts['streamoutput']:
            id_ = addedopts['streamoutput'][0]
            self.opts.append(('jobconf', 'stream.map.output=' + id_))
        else:
            self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
    else:
        self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
        if addedopts['streamoutput']:
            id_ = addedopts['streamoutput'][0]
            self.opts.append(('jobconf', 'stream.reduce.output=' + id_))
        else:
            self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
    if not addedopts['name']:
        self.opts.append(('jobconf', 'mapred.job.name='
                          + self.prog.split('/')[-1]))
    else:
        self.opts.append(('jobconf', 'mapred.job.name=%s'
                          % addedopts['name'][0]))
    if addedopts['nummaptasks']:
        self.opts.append(('jobconf', 'mapred.map.tasks=%s'
                          % addedopts['nummaptasks'][0]))
    if addedopts['numreducetasks']:
        numreducetasks = int(addedopts['numreducetasks'][0])
        self.opts.append(('numReduceTasks', str(numreducetasks)))
    if addedopts['priority']:
        self.opts.append(('jobconf', 'mapred.job.priority=%s'
                          % addedopts['priority'][0]))
    if addedopts['queue']:
        self.opts.append(('jobconf', 'mapred.job.queue.name=%s'
                          % addedopts['queue'][0]))
    if addedopts['cachefile']:
        for cachefile in addedopts['cachefile']:
            self.opts.append(('cacheFile', cachefile))
    if addedopts['cachearchive']:
        for cachearchive in addedopts['cachearchive']:
            self.opts.append(('cacheArchive', cachearchive))
    if addedopts['file']:
        for file in addedopts['file']:
            if not '://' in file:
                if not os.path.exists(file):
                    raise ValueError('file "' + file + '" does not exist')
                file = 'file://' + os.path.abspath(file)
            self.opts.append(('file', file))
    if not addedopts['inputformat']:
        addedopts['inputformat'] = ['auto']
    inputformat_shortcuts = \
        {'code': 'org.apache.hadoop.streaming.AutoInputFormat',
         'text': 'org.apache.hadoop.mapred.TextInputFormat',
         'sequencefile': 'org.apache.hadoop.streaming.AutoInputFormat',
         'auto': 'org.apache.hadoop.streaming.AutoInputFormat'}
    inputformat_shortcuts.update(configopts('inputformats', self.prog))
    inputformat = addedopts['inputformat'][0]
    if inputformat_shortcuts.has_key(inputformat.lower()):
        inputformat = inputformat_shortcuts[inputformat.lower()]
    self.opts.append(('inputformat', inputformat))
    if not addedopts['outputformat']:
        addedopts['outputformat'] = ['sequencefile']
    if addedopts['getpath'] and addedopts['getpath'] != 'no':
        outputformat_shortcuts = \
            {'code': 'fm.last.feathers.output.MultipleSequenceFiles',
             'text': 'fm.last.feathers.output.MultipleTextFiles',
             'raw': 'fm.last.feathers.output.MultipleRawFileOutputFormat',
             'sequencefile': 'fm.last.feathers.output.MultipleSequenceFiles'}
    else:
        outputformat_shortcuts = \
            {'code': 'org.apache.hadoop.mapred.SequenceFileOutputFormat',
             'text': 'org.apache.hadoop.mapred.TextOutputFormat',
             'raw': 'fm.last.feathers.output.RawFileOutputFormat',
             'sequencefile': 'org.apache.hadoop.mapred.SequenceFileOutputFormat'}
    outputformat_shortcuts.update(configopts('outputformats', self.prog))
    outputformat = addedopts['outputformat'][0]
    if outputformat_shortcuts.has_key(outputformat.lower()):
        outputformat = outputformat_shortcuts[outputformat.lower()]
    self.opts.append(('outputformat', outputformat))
    if addedopts['addpath'] and addedopts['addpath'][0] != 'no':
        self.opts.append(('cmdenv', 'dumbo_addpath=true'))
    pyenv = envdef('PYTHONPATH', addedopts['libegg'], 'file', self.opts,
                   shortcuts=dict(configopts('eggs', self.prog)),
                   quote=False, trim=True, extrapaths=addedopts['pypath'])
    if pyenv:
        self.opts.append(('cmdenv', pyenv))
    hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'], 'libjar',
                    self.opts, shortcuts=dict(configopts('jars', self.prog)))
    fileopt = getopt(self.opts, 'file')
    if fileopt:
        tmpfiles = []
        for file in fileopt:
            if file.startswith('file://'):
                self.opts.append(('file', file[7:]))
            else:
                tmpfiles.append(file)
        if tmpfiles:
            self.opts.append(('jobconf', 'tmpfiles=' + ','.join(tmpfiles)))
    libjaropt = getopt(self.opts, 'libjar')
    if libjaropt:
        tmpjars = []
        for jar in libjaropt:
            if jar.startswith('file://'):
                self.opts.append(('file', jar[7:]))
            else:
                tmpjars.append(jar)
        if tmpjars:
            self.opts.append(('jobconf', 'tmpjars=' + ','.join(tmpjars)))
    cmd = hadoop + '/bin/hadoop jar ' + streamingjar
    retval = execute(cmd, self.opts, hadenv)
    if addedopts['delinputs'] and addedopts['delinputs'][0] == 'yes':
        for (key, value) in self.opts:
            if key == 'input':
                if os.path.exists(hadoop + "/bin/hdfs"):
                    hdfs = hadoop + "/bin/hdfs"
                else:
                    hdfs = hadoop + "/bin/hadoop"
                execute("%s dfs -rmr '%s'" % (hdfs, value))
    return retval
def __init__(self, prog, opts): Iteration.__init__(self, prog, opts) self.opts += configopts("streaming", prog, self.opts) hadoop = getopt(self.opts, "hadoop", delete=False)[0] self.opts += configopts("streaming_" + hadoop, prog, self.opts)
def matches(self, opts):
    return bool(getopt(opts, 'punix', delete=False))
def main(module=None):
    if module is None:
        import __main__
        module = __main__
    intask = len(sys.argv) > 1 and sys.argv[1][0] != '-'
    opts = parseargs(sys.argv[1:])
    if intask:
        input_paths = os.environ['FLOW_INPUTS'].split(';')
        output_paths = os.environ['FLOW_OUTPUTS'].split(';')
    else:
        sequential = 'yes' in getopt(opts, 'seq')
        input_paths = getopt(opts, 'input')
        output_paths = getopt(opts, 'output')
        if any(';' in path for path in input_paths):
            print >> sys.stderr, "ERROR: Input paths cannot contain semi-colons"
            sys.exit(1)
        if any(';' in path for path in output_paths):
            print >> sys.stderr, "ERROR: Output paths cannot contain semi-colons"
            sys.exit(1)
        if any(',' in path for path in output_paths):
            print >> sys.stderr, "ERROR: Output paths cannot contain commas"
            sys.exit(1)
    print >> sys.stderr, "INFO: Flow inputs: %s" % input_paths
    print >> sys.stderr, "INFO: Flow outputs: %s" % output_paths
    flow = Flow(opts, input_paths, output_paths)

    # call special init function to initialize the flow
    positional_inputs = []
    named_inputs = {}
    for path_string in input_paths:
        if '=' in path_string:
            name, value = path_string.split('=', 1)
            named_inputs[name] = ResultSet.from_string(value)
        else:
            positional_inputs.append(ResultSet.from_string(path_string))
    outputs = module.init(flow, *positional_inputs, **named_inputs)
    if type(outputs) is ResultSet:
        outputs = [outputs]
    for resultset, path in zip(outputs, output_paths):
        if len(resultset) > 1:
            print >> sys.stderr, "ERROR: Final outputs must be singleton resultsets"
            sys.exit(1)
        output = resultset[0]
        output.path = path
        output.temporary = False
    if intask:
        iterarg = 0
        if len(sys.argv) > 2:
            iterarg = int(sys.argv[2])
        flow.run_task(iterarg)
    else:
        if sequential:
            flow.run_all_sequential()
        else:
            flow.run_all()
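# For context, main() expects the calling module to define an init() function
# that receives the Flow plus one ResultSet per -input path and returns the
# final ResultSet(s) to bind to the -output paths. A minimal, hypothetical
# flow script might look roughly like the sketch below; the file name
# wordcount_flow.py, the mapper/reducer callables, and the flow.map_reduce
# call are all illustrative assumptions, not the actual Flow API. It would be
# run as something like:
#   python wordcount_flow.py -input books.txt -output counts
def init(flow, documents):
    # documents is the ResultSet built from the -input path; the returned
    # ResultSet is bound to the -output path by main().
    return flow.map_reduce(documents, mapper, reducer)  # hypothetical method

if __name__ == '__main__':
    main()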
def create_iteration(self, opts):
    progopt = getopt(opts, 'prog')
    return UnixIteration(progopt[0], opts)
def create_iteration(self, opts): progopt = getopt(opts, "prog") return StreamingIteration(progopt[0], opts)
def matches(self, opts):
    return bool(getopt(opts, 'hadoop', delete=False))
def create_filesystem(self, opts): hadoopopt = getopt(opts, "hadoop", delete=False) return StreamingFileSystem(findhadoop(hadoopopt[0]))
def run(self):
    retval = Iteration.run(self)
    if retval != 0:
        return retval
    if os.path.exists(self.prog):
        self.opts.append(("file", self.prog))
    addedopts = getopts(
        self.opts,
        [
            "hadoop",
            "name",
            "delinputs",
            "libegg",
            "libjar",
            "inputformat",
            "outputformat",
            "nummaptasks",
            "numreducetasks",
            "priority",
            "queue",
            "cachefile",
            "cachearchive",
            "file",
            "codewritable",
            "addpath",
            "getpath",
            "python",
            "streamoutput",
            "pypath",
        ],
    )
    hadoop = findhadoop(addedopts["hadoop"][0])
    streamingjar = findjar(hadoop, "streaming")
    if not streamingjar:
        print >> sys.stderr, "ERROR: Streaming jar not found"
        return 1
    try:
        import typedbytes
    except ImportError:
        print >> sys.stderr, 'ERROR: "typedbytes" module not found'
        return 1
    modpath = re.sub("\.egg.*$", ".egg", typedbytes.__file__)
    if modpath.endswith(".egg"):
        addedopts["libegg"].append(modpath)
    else:
        self.opts.append(("file", modpath))
    self.opts.append(("jobconf", "stream.map.input=typedbytes"))
    self.opts.append(("jobconf", "stream.reduce.input=typedbytes"))
    if addedopts["numreducetasks"] and addedopts["numreducetasks"][0] == "0":
        self.opts.append(("jobconf", "stream.reduce.output=typedbytes"))
        if addedopts["streamoutput"]:
            id_ = addedopts["streamoutput"][0]
            self.opts.append(("jobconf", "stream.map.output=" + id_))
        else:
            self.opts.append(("jobconf", "stream.map.output=typedbytes"))
    else:
        self.opts.append(("jobconf", "stream.map.output=typedbytes"))
        if addedopts["streamoutput"]:
            id_ = addedopts["streamoutput"][0]
            self.opts.append(("jobconf", "stream.reduce.output=" + id_))
        else:
            self.opts.append(("jobconf", "stream.reduce.output=typedbytes"))
    if not addedopts["name"]:
        self.opts.append(("jobconf", "mapred.job.name=" + self.prog.split("/")[-1]))
    else:
        self.opts.append(("jobconf", "mapred.job.name=%s" % addedopts["name"][0]))
    if addedopts["nummaptasks"]:
        self.opts.append(("jobconf", "mapred.map.tasks=%s" % addedopts["nummaptasks"][0]))
    if addedopts["numreducetasks"]:
        numreducetasks = int(addedopts["numreducetasks"][0])
        self.opts.append(("numReduceTasks", str(numreducetasks)))
    if addedopts["priority"]:
        self.opts.append(("jobconf", "mapred.job.priority=%s" % addedopts["priority"][0]))
    if addedopts["queue"]:
        self.opts.append(("jobconf", "mapred.job.queue.name=%s" % addedopts["queue"][0]))
    if addedopts["cachefile"]:
        for cachefile in addedopts["cachefile"]:
            self.opts.append(("cacheFile", cachefile))
    if addedopts["cachearchive"]:
        for cachearchive in addedopts["cachearchive"]:
            self.opts.append(("cacheArchive", cachearchive))
    if addedopts["file"]:
        for file in addedopts["file"]:
            if not "://" in file:
                if not os.path.exists(file):
                    raise ValueError('file "' + file + '" does not exist')
                file = "file://" + os.path.abspath(file)
            self.opts.append(("file", file))
    if not addedopts["inputformat"]:
        addedopts["inputformat"] = ["auto"]
    inputformat_shortcuts = {
        "code": "org.apache.hadoop.streaming.AutoInputFormat",
        "text": "org.apache.hadoop.mapred.TextInputFormat",
        "sequencefile": "org.apache.hadoop.streaming.AutoInputFormat",
        "auto": "org.apache.hadoop.streaming.AutoInputFormat",
    }
    inputformat_shortcuts.update(configopts("inputformats", self.prog))
    inputformat = addedopts["inputformat"][0]
    if inputformat_shortcuts.has_key(inputformat.lower()):
        inputformat = inputformat_shortcuts[inputformat.lower()]
    self.opts.append(("inputformat", inputformat))
    if not addedopts["outputformat"]:
        addedopts["outputformat"] = ["sequencefile"]
    if addedopts["getpath"] and addedopts["getpath"] != "no":
        outputformat_shortcuts = {
            "code": "fm.last.feathers.output.MultipleSequenceFiles",
            "text": "fm.last.feathers.output.MultipleTextFiles",
            "raw": "fm.last.feathers.output.MultipleRawFileOutputFormat",
            "sequencefile": "fm.last.feathers.output.MultipleSequenceFiles",
        }
    else:
        outputformat_shortcuts = {
            "code": "org.apache.hadoop.mapred.SequenceFileOutputFormat",
            "text": "org.apache.hadoop.mapred.TextOutputFormat",
            "raw": "fm.last.feathers.output.RawFileOutputFormat",
            "sequencefile": "org.apache.hadoop.mapred.SequenceFileOutputFormat",
        }
    outputformat_shortcuts.update(configopts("outputformats", self.prog))
    outputformat = addedopts["outputformat"][0]
    if outputformat_shortcuts.has_key(outputformat.lower()):
        outputformat = outputformat_shortcuts[outputformat.lower()]
    self.opts.append(("outputformat", outputformat))
    if addedopts["addpath"] and addedopts["addpath"][0] != "no":
        self.opts.append(("cmdenv", "dumbo_addpath=true"))
    pyenv = envdef(
        "PYTHONPATH",
        addedopts["libegg"],
        "file",
        self.opts,
        shortcuts=dict(configopts("eggs", self.prog)),
        quote=False,
        trim=True,
        extrapaths=addedopts["pypath"],
    )
    if pyenv:
        self.opts.append(("cmdenv", pyenv))
    hadenv = envdef(
        "HADOOP_CLASSPATH", addedopts["libjar"], "libjar", self.opts, shortcuts=dict(configopts("jars", self.prog))
    )
    fileopt = getopt(self.opts, "file")
    if fileopt:
        tmpfiles = []
        for file in fileopt:
            if file.startswith("file://"):
                self.opts.append(("file", file[7:]))
            else:
                tmpfiles.append(file)
        if tmpfiles:
            self.opts.append(("jobconf", "tmpfiles=" + ",".join(tmpfiles)))
    libjaropt = getopt(self.opts, "libjar")
    if libjaropt:
        tmpjars = []
        for jar in libjaropt:
            if jar.startswith("file://"):
                self.opts.append(("file", jar[7:]))
            else:
                tmpjars.append(jar)
        if tmpjars:
            self.opts.append(("jobconf", "tmpjars=" + ",".join(tmpjars)))
    cmd = hadoop + "/bin/hadoop jar " + streamingjar
    retval = execute(cmd, self.opts, hadenv)
    if addedopts["delinputs"] and addedopts["delinputs"][0] == "yes":
        for (key, value) in self.opts:
            if key == "input":
                if os.path.exists(hadoop + "/bin/hdfs"):
                    hdfs = hadoop + "/bin/hdfs"
                else:
                    hdfs = hadoop + "/bin/hadoop"
                execute("%s dfs -rmr '%s'" % (hdfs, value))
    return retval