def run(self):
        retval = Iteration.run(self)
        if retval != 0:
            return retval
        addedopts = getopts(self.opts, [
            'input', 'output', 'mapper', 'reducer', 'libegg', 'delinputs',
            'cmdenv', 'pv', 'addpath', 'inputformat', 'outputformat',
            'numreducetasks', 'python', 'pypath', 'sorttmpdir', 'sortbufsize'
        ])
        (mapper, reducer) = (addedopts['mapper'][0], addedopts['reducer'][0])
        if not addedopts['input'] or not addedopts['output']:
            print >> sys.stderr, 'ERROR: input or output not specified'
            return 1
        inputs = reduce(operator.concat,
                        (input.split(' ') for input in addedopts['input']))
        output = addedopts['output'][0]
        pyenv = envdef('PYTHONPATH',
                       addedopts['libegg'],
                       shortcuts=dict(configopts('eggs', self.prog)),
                       extrapaths=addedopts['pypath'])
        cmdenv = ' '.join("%s='%s'" % tuple(arg.split('=', 1))
                          for arg in addedopts['cmdenv'])
        if addedopts['pv'] and addedopts['pv'][0] == 'yes':
            mpv = '| pv -s `du -b %s | cut -f 1` -cN map ' % ' '.join(inputs)
            (spv, rpv) = ('| pv -cN sort ', '| pv -cN reduce ')
        else:
            (mpv, spv, rpv) = ('', '', '')

        (sorttmpdir, sortbufsize) = ('', '')
        if addedopts['sorttmpdir']:
            sorttmpdir = "-T %s" % addedopts['sorttmpdir'][0]
        if addedopts['sortbufsize']:
            sortbufsize = "-S %s" % addedopts['sortbufsize'][0]

        python = addedopts['python'][0]
        encodepipe = pyenv + ' ' + python + \
                     ' -m dumbo.cmd encodepipe -file ' + ' -file '.join(inputs)
        if addedopts['inputformat'] and addedopts['inputformat'][0] == 'code':
            encodepipe += ' -alreadycoded yes'
        if addedopts['addpath'] and addedopts['addpath'][0] != 'no':
            encodepipe += ' -addpath yes'
        if addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0':
            retval = execute("%s | %s %s %s %s > '%s'" %
                             (encodepipe, pyenv, cmdenv, mapper, mpv, output))
        else:
            retval = execute(
                "%s | %s %s %s %s| LC_ALL=C sort %s %s %s| %s %s %s %s> '%s'" %
                (encodepipe, pyenv, cmdenv, mapper, mpv, sorttmpdir,
                 sortbufsize, spv, pyenv, cmdenv, reducer, rpv, output))
        if addedopts['delinputs'] and addedopts['delinputs'][0] == 'yes':
            for file in addedopts['input']:
                execute('rm ' + file)
        return retval
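
For orientation: this local-backend run() ultimately hands execute() a single shell string. A rough sketch of the two command shapes it builds (names in angle brackets stand for the variables above; execute() is assumed to run the string through a shell and return its exit status):

# map-only branch (-numreducetasks 0):
#   <encodepipe> | <pyenv> <cmdenv> <mapper> > '<output>'
# map/sort/reduce branch:
#   <encodepipe> | <pyenv> <cmdenv> <mapper> \
#       | LC_ALL=C sort [-T sorttmpdir] [-S sortbufsize] \
#       | <pyenv> <cmdenv> <reducer> > '<output>'
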
Example #2
File: cmd.py Project: CyaLiven/dumbo
def start(prog,
          opts,
          stdout=sys.stdout,
          stderr=sys.stderr):

    opts = Options(opts)
    opts += Options(configopts('common'))
    opts += Options(configopts('start'))

    pyenv = envdef('PYTHONPATH', opts['libegg'],
                   shortcuts=dict(configopts('eggs', prog)),
                   extrapaths=sys.path)

    if not opts['prog']:
        opts.add('prog', prog)

    if not os.path.exists(prog):
        if prog.endswith(".py"):
            print >> sys.stderr, 'ERROR:', prog, 'does not exist'
            return 1
        prog = '-m ' + prog

    return execute("%s %s" % (sys.executable, prog),
                   opts,
                   pyenv,
                   stdout=stdout,
                   stderr=stderr,
                   printcmd=False)
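
A hypothetical way to invoke this entry point (the program name and option pairs below are invented; Options is assumed to accept a list of (key, value) tuples, matching the opts.append(('file', ...)) calls in the other examples):

import sys
# 'wordcount.py' is a made-up dumbo program; 'brian.txt' a made-up input.
retval = start('wordcount.py', [('input', 'brian.txt'), ('output', 'counts')])
sys.exit(retval)
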
Example #3
File: cmd.py Project: soxofaan/dumbo
def start(prog, opts, stdout=sys.stdout, stderr=sys.stderr):

    opts = Options(opts)
    opts += Options(configopts('common'))
    opts += Options(configopts('start'))

    pyenv = envdef('PYTHONPATH',
                   opts['libegg'],
                   shortcuts=dict(configopts('eggs', prog)),
                   extrapaths=sys.path)

    if not opts['prog']:
        opts.add('prog', prog)

    if not os.path.exists(prog):
        if prog.endswith(".py"):
            print >> sys.stderr, 'ERROR:', prog, 'does not exist'
            return 1
        prog = '-m ' + prog

    return execute("%s %s" % (sys.executable, prog),
                   opts,
                   pyenv,
                   stdout=stdout,
                   stderr=stderr,
                   printcmd=False)
Example #4
File: punix.py Project: jso/dumbo
def doReduce(*args, **kwargs):
    retval = 1
    try:
        tmpdir, pyenv, cmdenv, reducer, output, shell, reducenum = args

        combinedInput = os.sep.join([tmpdir, "r-%d-all" % reducenum])

        retval = 0
        if os.path.exists(combinedInput):
            cmd = "LC_ALL=C sort -t $'\\t' --temporary-directory=%s --key=1 %s | %s %s %s > '%s'" % (tmpdir, combinedInput, pyenv, cmdenv, reducer, os.sep.join([output, "part-%05d" % reducenum]))

            cmdStderr = open(os.sep.join([tmpdir, "r-%d-status" % reducenum]), "w")
            retval = execute(cmd, stderr=cmdStderr, executable=shell)
            cmdStderr.close()

            f = open(os.sep.join([tmpdir, "r-%d-status" % reducenum]), "a")
            print >>f, "return code:", retval
            f.close()

            if not master_debug:
                # clean up
                os.remove(combinedInput)

    except Exception as e:
        f = open(os.sep.join([tmpdir, "r-%d-status" % reducenum]), "a")
        print >>f, type(e), str(e)
        f.close()

    if retval != 0:
        print "reduce %d failed" % reducenum

    return retval
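
To make the command concrete: with tmpdir='/tmp/job', reducenum=3 and reducer="python prog.py red" (all invented values, pyenv/cmdenv omitted), the cmd built above expands roughly to:

# LC_ALL=C sort -t $'\t' --temporary-directory=/tmp/job --key=1 /tmp/job/r-3-all \
#     | python prog.py red > '<output>/part-00003'
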
Example #5
File: punix.py Project: jso/dumbo
def doMap(*args, **kwargs):
    f = None
    retval = 1
    try:
        pyenv, python, cmdenv, mapper, nReducers, tmpdir, output, addedopts, shell, mapi, filename_list, doReduces = args

        filenames = " ".join(["-file %s" % x for x in filename_list])

        encodepipe = pyenv + ' ' + python + ' -m dumbo.cmd encodepipe ' + filenames
        if addedopts['inputformat'] and addedopts['inputformat'][0] == 'code':
            encodepipe += ' -alreadycoded yes'

        if doReduces:
            cmd = "%s | %s %s %s | python -m dumbo.backends.punixSplitter %d %d %s" % (encodepipe,
                                              pyenv,
                                              cmdenv,
                                              mapper,
                                              mapi,
                                              nReducers,
                                              tmpdir)

        else:
            outfile = os.sep.join([output, "part-%05d" % mapi])
            cmd = "%s | %s %s %s > '%s'" % (
                encodepipe, pyenv, cmdenv, mapper, outfile)

        cmdStderr = open(os.sep.join([tmpdir, "m-%d-status" % mapi]), "w")
        retval = execute(cmd, stderr=cmdStderr, executable=shell)
        cmdStderr.close()


        f = open(os.sep.join([tmpdir, "m-%d-status" % mapi]), "a")
        print >>f, "return code:", retval
        f.close()

        if retval != 0:
            print "map %d failed" % mapi

    except Exception as e:
        f = open(os.sep.join([tmpdir, "m-%d-status" % mapi]), "a")
        print >>f, type(e), str(e)
        f.close()

    return retval
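
doMap() is clearly meant to be fed one flat args tuple per map task by a worker pool. A hypothetical call, with every value invented, would look like:

# doMap(pyenv, 'python', cmdenv, 'python prog.py map',  # env defs, python, cmdenv, mapper
#       4, '/tmp/job', 'out', addedopts,                # nReducers, tmpdir, output, opts
#       '/bin/bash', 0, ['part-0.txt'], True)           # shell, map index, inputs, doReduces
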
Example #6
 def rm(self, path, opts):
     return execute("%s/bin/hadoop dfs -rmr '%s'" % (self.hadoop, path),
                    printcmd=False)
Example #7
File: streaming.py Project: CyaLiven/dumbo
    def run(self):
        retval = Iteration.run(self)
        if retval != 0:
            return retval
        opts = self.opts
        if os.path.exists(self.prog):
            opts.add('file', self.prog)

        keys = ['hadoop', 'name', 'delinputs', 'libegg', 'libjar',
            'inputformat', 'outputformat', 'nummaptasks', 'numreducetasks',
            'priority', 'queue', 'cachefile', 'cachearchive', 'file',
            'codewritable', 'addpath', 'getpath', 'python', 'streamoutput',
            'pypath', 'hadooplib']
        addedopts = opts.filter(keys)
        opts.remove(*keys)

        hadoop = findhadoop(addedopts['hadoop'][0])
        streamingjar = findjar(hadoop, 'streaming', addedopts['hadooplib'])
        if not streamingjar:
            print >> sys.stderr, 'ERROR: Streaming jar not found'
            return 1

        try:
            import typedbytes
        except ImportError:
            print >> sys.stderr, 'ERROR: "typedbytes" module not found'
            return 1
        modpath = re.sub(r'\.egg.*$', '.egg', typedbytes.__file__)
        if modpath.endswith('.egg'):
            addedopts.add('libegg', modpath)
        else:
            opts.add('file', modpath)
        opts.add('jobconf', 'stream.map.input=typedbytes')
        opts.add('jobconf', 'stream.reduce.input=typedbytes')

        if addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0':
            opts.add('jobconf', 'stream.reduce.output=typedbytes')
            if addedopts['streamoutput']:
                id_ = addedopts['streamoutput'][0]
                opts.add('jobconf', 'stream.map.output=' + id_)
            else:
                opts.add('jobconf', 'stream.map.output=typedbytes')
        else:
            opts.add('jobconf', 'stream.map.output=typedbytes')
            if addedopts['streamoutput']:
                id_ = addedopts['streamoutput'][0]
                opts.add('jobconf', 'stream.reduce.output=' + id_)
            else:
                opts.add('jobconf', 'stream.reduce.output=typedbytes')

        progname = self.prog.split('/')[-1] if not addedopts['name'] \
                                            else addedopts['name'][0]
        opts.add('jobconf', 'mapred.job.name=%s' % progname)

        nummaptasks = addedopts['nummaptasks']
        numreducetasks = addedopts['numreducetasks']
        if nummaptasks:
            opts.add('jobconf', 'mapred.map.tasks=%s' % nummaptasks[0])
        if numreducetasks:
            opts.add('numReduceTasks', numreducetasks[0])
        if addedopts['priority']:
            opts.add('jobconf', 'mapred.job.priority=%s' % addedopts['priority'][0])
        if addedopts['queue']:
            opts.add('jobconf', 'mapred.job.queue.name=%s' % addedopts['queue'][0])

        for cachefile in addedopts['cachefile']:
            opts.add('cacheFile', cachefile)

        for cachearchive in addedopts['cachearchive']:
            opts.add('cacheArchive', cachearchive)

        for _file in addedopts['file']:
            if not '://' in _file:
                if not os.path.exists(_file):
                    raise ValueError('file "%s" does not exist' % _file)
                _file = 'file://%s' % os.path.abspath(_file)
            opts.add('file', _file)

        if not addedopts['inputformat']:
            addedopts.add('inputformat', 'auto')

        inputformat_shortcuts = {
            'code': 'org.apache.hadoop.streaming.AutoInputFormat',
            'text': 'org.apache.hadoop.mapred.TextInputFormat',
            'sequencefile': 'org.apache.hadoop.streaming.AutoInputFormat',
            'auto': 'org.apache.hadoop.streaming.AutoInputFormat'
        }
        inputformat_shortcuts.update(configopts('inputformats', self.prog))

        inputformat = addedopts['inputformat'][0]
        if inputformat.lower() in inputformat_shortcuts:
            inputformat = inputformat_shortcuts[inputformat.lower()]
        opts.add('inputformat', inputformat)

        if not addedopts['outputformat']:
            addedopts.add('outputformat', 'sequencefile')

        if addedopts['getpath'] and 'no' not in addedopts['getpath']:
            outputformat_shortcuts = {
                'code': 'fm.last.feathers.output.MultipleSequenceFiles',
                'text': 'fm.last.feathers.output.MultipleTextFiles',
                'raw': 'fm.last.feathers.output.MultipleRawFileOutputFormat',
                'sequencefile': 'fm.last.feathers.output.MultipleSequenceFiles'
            }
        else:
            outputformat_shortcuts = {
                'code': 'org.apache.hadoop.mapred.SequenceFileOutputFormat',
                'text': 'org.apache.hadoop.mapred.TextOutputFormat',
                'raw': 'fm.last.feathers.output.RawFileOutputFormat',
                'sequencefile': 'org.apache.hadoop.mapred.SequenceFileOutputFormat'
            }
        outputformat_shortcuts.update(configopts('outputformats', self.prog))

        outputformat = addedopts['outputformat'][0]
        if outputformat.lower() in outputformat_shortcuts:
            outputformat = outputformat_shortcuts[outputformat.lower()]
        opts.add('outputformat', outputformat)

        if addedopts['addpath'] and 'no' not in addedopts['addpath']:
            opts.add('cmdenv', 'dumbo_addpath=true')

        pyenv = envdef('PYTHONPATH', addedopts['libegg'], 'file', self.opts,
            shortcuts=dict(configopts('eggs', self.prog)), quote=False, trim=True,
            extrapaths=addedopts['pypath'])
        if pyenv:
            opts.add('cmdenv', pyenv)

        hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'], 'libjar',
            self.opts, shortcuts=dict(configopts('jars', self.prog)))

        tmpfiles = []
        for _file in opts.pop('file'):
            if _file.startswith('file://'):
                opts.add('file', _file[7:])
            else:
                tmpfiles.append(_file)
        if tmpfiles:
            opts.add('jobconf', 'tmpfiles=%s' % ','.join(tmpfiles))

        tmpjars = []
        for jar in opts.pop('libjar'):
            if jar.startswith('file://'):
                opts.add('file', jar[7:])
            else:
                tmpjars.append(jar)
        if tmpjars:
            opts.add('jobconf', 'tmpjars=%s' % ','.join(tmpjars))

        cmd = hadoop + '/bin/hadoop jar ' + streamingjar
        retval = execute(cmd, opts, hadenv)

        if 'yes' in addedopts['delinputs']:
            inputs = opts['input']
            for path in inputs:
                execute("%s/bin/hadoop fs -rmr '%s'" % (hadoop, path))
        return retval
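
All the streaming run() variants end the same way: execute() receives '<hadoop>/bin/hadoop jar <streamingjar>' plus the accumulated opts, which execute() is assumed to append as '-key value' arguments. The effective command line is roughly (paths invented):

# /usr/lib/hadoop/bin/hadoop jar /usr/lib/hadoop/contrib/streaming/hadoop-streaming.jar \
#     -input ... -output ... -mapper ... -inputformat ... -outputformat ... \
#     -jobconf stream.map.input=typedbytes -jobconf stream.reduce.output=typedbytes ...
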
Example #8
 def get(self, path1, path2, opts):
     return execute("%s -get '%s' '%s'" % (self.hdfs, path1, path2),
                    printcmd=False)
Example #9
File: unix.py Project: jso/dumbo
 def rm(self, path, opts):
     return execute("rm -rf '%s'" % path, printcmd=False, executable=self.shell)
Example #10
File: unix.py Project: VickiFu/dumbo
    def run(self):
        retval = Iteration.run(self)
        if retval != 0:
            return retval
        addedopts = getopts(self.opts, ['input',
                                        'output',
                                        'mapper',
                                        'reducer',
                                        'libegg',
                                        'delinputs',
                                        'cmdenv',
                                        'pv',
                                        'addpath',
                                        'inputformat',
                                        'outputformat',
                                        'numreducetasks',
                                        'python',
                                        'pypath',
                                        'sorttmpdir',
                                        'sortbufsize'])
        (mapper, reducer) = (addedopts['mapper'][0], addedopts['reducer'][0])
        if not addedopts['input'] or not addedopts['output']:
            print >> sys.stderr, 'ERROR: input or output not specified'
            return 1
        inputs = reduce(operator.concat, (input.split(' ') for input in
                        addedopts['input']))
        output = addedopts['output'][0]
        pyenv = envdef('PYTHONPATH', addedopts['libegg'],
                       shortcuts=dict(configopts('eggs', self.prog)),
                       extrapaths=addedopts['pypath'])
        cmdenv = ' '.join("%s='%s'" % tuple(arg.split('=', 1)) for arg in
                          addedopts['cmdenv'])
        if addedopts['pv'] and addedopts['pv'][0] == 'yes':
            mpv = '| pv -s `du -b %s | cut -f 1` -cN map ' % ' '.join(inputs)
            (spv, rpv) = ('| pv -cN sort ', '| pv -cN reduce ')
        else:
            (mpv, spv, rpv) = ('', '', '')

        (sorttmpdir, sortbufsize) = ('', '')
        if addedopts['sorttmpdir']:
            sorttmpdir = "-T %s" % addedopts['sorttmpdir'][0]
        if addedopts['sortbufsize']:
            sortbufsize = "-S %s" % addedopts['sortbufsize'][0]

        python = addedopts['python'][0]
        encodepipe = pyenv + ' ' + python + \
                     ' -m dumbo.cmd encodepipe -file ' + ' -file '.join(inputs)
        if addedopts['inputformat'] and addedopts['inputformat'][0] == 'code':
            encodepipe += ' -alreadycoded yes'
        if addedopts['addpath'] and addedopts['addpath'][0] != 'no':
            encodepipe += ' -addpath yes'
        if addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0':
            retval = execute("%s | %s %s %s %s > '%s'" % (encodepipe,
                                                          pyenv,
                                                          cmdenv,
                                                          mapper,
                                                          mpv,
                                                          output))
        else:
            retval = execute("%s | %s %s %s %s| LC_ALL=C sort %s %s %s| %s %s %s %s> '%s'"
                             % (encodepipe,
                                pyenv,
                                cmdenv,
                                mapper,
                                mpv,
                                sorttmpdir,
                                sortbufsize,
                                spv,
                                pyenv,
                                cmdenv,
                                reducer,
                                rpv,
                                output))
        if addedopts['delinputs'] and addedopts['delinputs'][0] == 'yes':
            for file in addedopts['input']:
                execute('rm ' + file)
        return retval
Example #11
 def exists(self, path, opts):
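     # '-stat' exits 0 when the path exists, so the 1 - int(... == 0) below
     # flips that into the usual convention where a 0 return means "exists".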
     shellcmd = "%s -stat '%s' >/dev/null 2>&1"
     return 1 - int(
         execute(shellcmd % (self.hdfs, path), printcmd=False) == 0)
 def rm(self, path, opts):
     return execute("rm -rf '%s'" % path, printcmd=False)
Example #13
File: unix.py Project: VickiFu/dumbo
 def rm(self, path, opts):
     return execute("rm -rf '%s'" % path, printcmd=False)
 def ls(self, path, opts):
     return execute("ls -l '%s'" % path, printcmd=False)
 def exists(self, path, opts):
     return execute("test -e '%s'" % path, printcmd=False)
Example #16
File: unix.py Project: jso/dumbo
 def ls(self, path, opts):
     return execute("ls -l '%s'" % path, printcmd=False, executable=self.shell)
 def get(self, path1, path2, opts):
     return execute("cp '%s' '%s'" % (path1, path2), printcmd=False)
Example #18
File: unix.py Project: jso/dumbo
 def get(self, path1, path2, opts):
     return execute("cp '%s' '%s'" % (path1, path2), printcmd=False, executable=self.shell)
Example #19
File: unix.py Project: VickiFu/dumbo
 def ls(self, path, opts):
     return execute("ls -l '%s'" % path, printcmd=False)
Example #20
File: streaming.py Project: ebottabi/dumbo
 def put(self, path1, path2, opts):
     return execute("%s dfs -put '%s' '%s'" % (self.hdfs, path1, path2),
                    printcmd=False)
Example #21
File: unix.py Project: VickiFu/dumbo
 def exists(self, path, opts):
     return execute("test -e '%s'" % path, printcmd=False)
Example #22
 def get(self, path1, path2, opts):
     return execute("%s/bin/hadoop dfs -get '%s' '%s'" % (self.hadoop, path1,
                    path2), printcmd=False)
Example #23
File: unix.py Project: VickiFu/dumbo
 def get(self, path1, path2, opts):
     return execute("cp '%s' '%s'" % (path1, path2), printcmd=False)
Example #24
File: streaming.py Project: dgleich/dumbo
 def run(self):
     retval = Iteration.run(self)
     if retval != 0:
         return retval
     if os.path.exists(self.prog):
         self.opts.append(('file', self.prog))
     addedopts = getopts(self.opts, ['hadoop',
                                     'name',
                                     'delinputs',
                                     'libegg',
                                     'libjar',
                                     'libjarstreaming',
                                     'inputformat',
                                     'outputformat',
                                     'nummaptasks',
                                     'numreducetasks',
                                     'priority',
                                     'queue',
                                     'cachefile',
                                     'cachearchive',
                                     'file',
                                     'codewritable',
                                     'addpath',
                                     'getpath',
                                     'python',
                                     'streamoutput',
                                     'pypath'])
     hadoop = findhadoop(addedopts['hadoop'][0])
      streamingjar = getopt(self.opts, 'streamingjar')
      if streamingjar is None or len(streamingjar) == 0:
         streamingjar = findjar(hadoop,'streaming')
     else:
         streamingjar = streamingjar[0]
     if not streamingjar:
         print >> sys.stderr, 'ERROR: Streaming jar not found'
         return 1
         
     # add typedbytes to path
     try: 
         import typedbytes
     except ImportError:
         print >> sys.stderr, 'ERROR: "typedbytes" module not found'
         return 1
      modpath = re.sub(r'\.egg.*$', '.egg', typedbytes.__file__)
     if modpath.endswith('.egg'):            
         addedopts['libegg'].append(modpath)    
     else:
         self.opts.append(('file', modpath)) 
         
     # add ctypedbytes to job
     try: 
         import ctypedbytes
         print >>sys.stderr, 'INFO: "ctypedbytes" found!'
          modpath = re.sub(r'\.egg.*$', '.egg', ctypedbytes.__file__)
         if modpath.endswith('.egg'):            
             addedopts['libegg'].append(modpath)
      except ImportError:
          pass

     self.opts.append(('jobconf', 'stream.map.input=typedbytes'))
     self.opts.append(('jobconf', 'stream.reduce.input=typedbytes'))
     if addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0':
         self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
         if addedopts['streamoutput']:
             id_ = addedopts['streamoutput'][0]
             self.opts.append(('jobconf', 'stream.map.output=' + id_))
         else: 
             self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
     else:
         self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
         if addedopts['streamoutput']:
             id_ = addedopts['streamoutput'][0]
             self.opts.append(('jobconf', 'stream.reduce.output=' + id_))
         else:
             self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
     if not addedopts['name']:
         self.opts.append(('jobconf', 'mapred.job.name='
                           + self.prog.split('/')[-1]))
     else:
         self.opts.append(('jobconf', 'mapred.job.name=%s'
                           % addedopts['name'][0]))
     if addedopts['nummaptasks']:
         self.opts.append(('jobconf', 'mapred.map.tasks=%s'
                           % addedopts['nummaptasks'][0]))
     if addedopts['numreducetasks']:
         numreducetasks = int(addedopts['numreducetasks'][0])
         self.opts.append(('numReduceTasks', str(numreducetasks)))
     if addedopts['priority']:
         self.opts.append(('jobconf', 'mapred.job.priority=%s'
                           % addedopts['priority'][0]))
     if addedopts['queue']:
         self.opts.append(('jobconf', 'mapred.job.queue.name=%s'
                           % addedopts['queue'][0]))
     if addedopts['cachefile']:
         for cachefile in addedopts['cachefile']:
             self.opts.append(('cacheFile', cachefile))
     if addedopts['cachearchive']:
         for cachearchive in addedopts['cachearchive']:
             self.opts.append(('cacheArchive', cachearchive))
     if addedopts['file']:
         for file in addedopts['file']:
             if not '://' in file:
                 if not os.path.exists(file):
                     raise ValueError('file "' + file + '" does not exist')
                 file = 'file://' + os.path.abspath(file)
             self.opts.append(('file', file))
     if not addedopts['inputformat']:
         addedopts['inputformat'] = ['auto']
     inputformat_shortcuts = \
         {'code': 'org.apache.hadoop.streaming.AutoInputFormat',
          'text': 'org.apache.hadoop.mapred.TextInputFormat',
          'sequencefile': 'org.apache.hadoop.streaming.AutoInputFormat',
          'auto': 'org.apache.hadoop.streaming.AutoInputFormat'}
     inputformat_shortcuts.update(configopts('inputformats', self.prog))
     inputformat = addedopts['inputformat'][0]
      if inputformat.lower() in inputformat_shortcuts:
         inputformat = inputformat_shortcuts[inputformat.lower()]
     self.opts.append(('inputformat', inputformat))
     if not addedopts['outputformat']:
         addedopts['outputformat'] = ['sequencefile']
      if addedopts['getpath'] and addedopts['getpath'][0] != 'no':
         outputformat_shortcuts = \
             {'code': 'fm.last.feathers.output.MultipleSequenceFiles',
              'text': 'fm.last.feathers.output.MultipleTextFiles',               
              'raw': 'fm.last.feathers.output.MultipleRawFileOutputFormat',
              'sequencefile': 'fm.last.feathers.output.MultipleSequenceFiles'}
     else:
         outputformat_shortcuts = \
             {'code': 'org.apache.hadoop.mapred.SequenceFileOutputFormat',
              'text': 'org.apache.hadoop.mapred.TextOutputFormat',
              'raw': 'fm.last.feathers.output.RawFileOutputFormat',
              'sequencefile': 'org.apache.hadoop.mapred.SequenceFileOutputFormat'}
     outputformat_shortcuts.update(configopts('outputformats', self.prog))
     outputformat = addedopts['outputformat'][0]
      if outputformat.lower() in outputformat_shortcuts:
         outputformat = outputformat_shortcuts[outputformat.lower()]
     self.opts.append(('outputformat', outputformat))
     if addedopts['addpath'] and addedopts['addpath'][0] != 'no':
         self.opts.append(('cmdenv', 'dumbo_addpath=true'))
     pyenv = envdef('PYTHONPATH',
                    addedopts['libegg'],
                    'file',
                    self.opts,
                    shortcuts=dict(configopts('eggs', self.prog)),
                    quote=False,
                    trim=True,
                    extrapaths=addedopts['pypath'])
     if pyenv:
         self.opts.append(('cmdenv', pyenv))
     if addedopts['libjarstreaming'] and addedopts['libjarstreaming'][0] != 'no':
         addedopts['libjar'].append(streamingjar)
     hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'], 'libjar', 
                     self.opts, shortcuts=dict(configopts('jars', self.prog)))
     fileopt = getopt(self.opts, 'file')
     if fileopt:
         tmpfiles = []
         for file in fileopt:
             if file.startswith('file://'):
                 self.opts.append(('file', file[7:]))
             else:
                 tmpfiles.append(file)
         if tmpfiles:
             self.opts.append(('jobconf', 'tmpfiles=' + ','.join(tmpfiles)))
     libjaropt = getopt(self.opts, 'libjar')
     if libjaropt:
         tmpjars = []
         for jar in libjaropt:
             if jar.startswith('file://'):
                 self.opts.append(('file', jar[7:]))
             else:
                 tmpjars.append(jar)
         if tmpjars:
             self.opts.append(('jobconf', 'tmpjars=' + ','.join(tmpjars)))
     cmd = hadoop + '/bin/hadoop jar ' + streamingjar
     retval = execute(cmd, self.opts, hadenv)
     if addedopts['delinputs'] and addedopts['delinputs'][0] == 'yes':
         for (key, value) in self.opts:
             if key == 'input':
                 if os.path.exists(hadoop + "/bin/hdfs"):
                     hdfs = hadoop + "/bin/hdfs"
                 else:
                     hdfs = hadoop + "/bin/hadoop"
                 execute("%s dfs -rmr '%s'" % (hdfs, value))
     return retval
Example #25
 def ls(self, path, opts):
     return execute("%s -ls '%s'" % (self.hdfs, path), printcmd=False)
Example #26
File: streaming.py Project: nbyloff/dumbo
 def exists(self, path, opts):
     shellcmd = "%s dfs -stat '%s' >/dev/null 2>&1"
     return 1 - int(execute(shellcmd % (self.hdfs, path), printcmd=False) == 0)
Example #27
 def rm(self, path, opts):
     return execute("%s -rmr '%s'" % (self.hdfs, path), printcmd=False)
Example #28
File: streaming.py Project: nbyloff/dumbo
 def get(self, path1, path2, opts):
     return execute("%s dfs -get '%s' '%s'" % (self.hdfs, path1, path2), printcmd=False)
Example #29
    def run(self):
        retval = Iteration.run(self)
        if retval != 0:
            return retval
        opts = self.opts
        if os.path.exists(self.prog):
            opts.add('file', self.prog)

        keys = [
            'hadoop', 'name', 'delinputs', 'libegg', 'libjar', 'inputformat',
            'outputformat', 'nummaptasks', 'numreducetasks', 'priority',
            'queue', 'cachefile', 'cachearchive', 'file', 'codewritable',
            'addpath', 'getpath', 'python', 'streamoutput', 'pypath',
            'hadooplib'
        ]
        addedopts = opts.filter(keys)
        opts.remove(*keys)

        hadoop = findhadoop(addedopts['hadoop'][0])
        streamingjar = findjar(hadoop, 'streaming', addedopts['hadooplib'])
        if not streamingjar:
            print >> sys.stderr, 'ERROR: Streaming jar not found'
            return 1

        try:
            import typedbytes
        except ImportError:
            print >> sys.stderr, 'ERROR: "typedbytes" module not found'
            return 1
        modpath = re.sub(r'\.egg.*$', '.egg', typedbytes.__file__)
        if modpath.endswith('.egg'):
            addedopts.add('libegg', modpath)
        else:
            opts.add('file', modpath)
        opts.add('jobconf', 'stream.map.input=typedbytes')
        opts.add('jobconf', 'stream.reduce.input=typedbytes')

        if addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0':
            opts.add('jobconf', 'stream.reduce.output=typedbytes')
            if addedopts['streamoutput']:
                id_ = addedopts['streamoutput'][0]
                opts.add('jobconf', 'stream.map.output=' + id_)
            else:
                opts.add('jobconf', 'stream.map.output=typedbytes')
        else:
            opts.add('jobconf', 'stream.map.output=typedbytes')
            if addedopts['streamoutput']:
                id_ = addedopts['streamoutput'][0]
                opts.add('jobconf', 'stream.reduce.output=' + id_)
            else:
                opts.add('jobconf', 'stream.reduce.output=typedbytes')

        progname = self.prog.split('/')[-1] if not addedopts['name'] \
                                            else addedopts['name'][0]
        opts.add('jobconf', 'mapred.job.name=%s' % progname)

        nummaptasks = addedopts['nummaptasks']
        numreducetasks = addedopts['numreducetasks']
        if nummaptasks:
            opts.add('jobconf', 'mapred.map.tasks=%s' % nummaptasks[0])
        if numreducetasks:
            opts.add('numReduceTasks', numreducetasks[0])
        if addedopts['priority']:
            opts.add('jobconf',
                     'mapred.job.priority=%s' % addedopts['priority'][0])
        if addedopts['queue']:
            opts.add('jobconf',
                     'mapred.job.queue.name=%s' % addedopts['queue'][0])

        for cachefile in addedopts['cachefile']:
            opts.add('cacheFile', cachefile)

        for cachearchive in addedopts['cachearchive']:
            opts.add('cacheArchive', cachearchive)

        for _file in addedopts['file']:
            if not '://' in _file:
                if not os.path.exists(_file):
                    raise ValueError('file "%s" does not exist' % _file)
                _file = 'file://%s' % os.path.abspath(_file)
            opts.add('file', _file)

        if not addedopts['inputformat']:
            addedopts.add('inputformat', 'auto')

        inputformat_shortcuts = {
            'code': 'org.apache.hadoop.streaming.AutoInputFormat',
            'text': 'org.apache.hadoop.mapred.TextInputFormat',
            'sequencefile': 'org.apache.hadoop.streaming.AutoInputFormat',
            'auto': 'org.apache.hadoop.streaming.AutoInputFormat'
        }
        inputformat_shortcuts.update(configopts('inputformats', self.prog))

        inputformat = addedopts['inputformat'][0]
        if inputformat.lower() in inputformat_shortcuts:
            inputformat = inputformat_shortcuts[inputformat.lower()]
        opts.add('inputformat', inputformat)

        if not addedopts['outputformat']:
            addedopts.add('outputformat', 'sequencefile')

        if addedopts['getpath'] and 'no' not in addedopts['getpath']:
            outputformat_shortcuts = {
                'code': 'fm.last.feathers.output.MultipleSequenceFiles',
                'text': 'fm.last.feathers.output.MultipleTextFiles',
                'raw': 'fm.last.feathers.output.MultipleRawFileOutputFormat',
                'sequencefile': 'fm.last.feathers.output.MultipleSequenceFiles'
            }
        else:
            outputformat_shortcuts = {
                'code': 'org.apache.hadoop.mapred.SequenceFileOutputFormat',
                'text': 'org.apache.hadoop.mapred.TextOutputFormat',
                'raw': 'fm.last.feathers.output.RawFileOutputFormat',
                'sequencefile':
                'org.apache.hadoop.mapred.SequenceFileOutputFormat'
            }
        outputformat_shortcuts.update(configopts('outputformats', self.prog))

        outputformat = addedopts['outputformat'][0]
        if outputformat.lower() in outputformat_shortcuts:
            outputformat = outputformat_shortcuts[outputformat.lower()]
        opts.add('outputformat', outputformat)

        if addedopts['addpath'] and 'no' not in addedopts['addpath']:
            opts.add('cmdenv', 'dumbo_addpath=true')

        pyenv = envdef('PYTHONPATH',
                       addedopts['libegg'],
                       'file',
                       self.opts,
                       shortcuts=dict(configopts('eggs', self.prog)),
                       quote=False,
                       trim=True,
                       extrapaths=addedopts['pypath'])
        if pyenv:
            opts.add('cmdenv', pyenv)

        hadenv = envdef('HADOOP_CLASSPATH',
                        addedopts['libjar'],
                        'libjar',
                        self.opts,
                        shortcuts=dict(configopts('jars', self.prog)))

        tmpfiles = []
        for _file in opts.pop('file'):
            if _file.startswith('file://'):
                opts.add('file', _file[7:])
            else:
                tmpfiles.append(_file)
        if tmpfiles:
            opts.add('jobconf', 'tmpfiles=%s' % ','.join(tmpfiles))

        tmpjars = []
        for jar in opts.pop('libjar'):
            if jar.startswith('file://'):
                opts.add('file', jar[7:])
            else:
                tmpjars.append(jar)
        if tmpjars:
            opts.add('jobconf', 'tmpjars=%s' % ','.join(tmpjars))

        cmd = hadoop + '/bin/hadoop jar ' + streamingjar
        retval = execute(cmd, opts, hadenv)

        if 'yes' in addedopts['delinputs']:
            inputs = opts['input']
            for path in inputs:
                execute("%s/bin/hadoop fs -rmr '%s'" % (hadoop, path))
        return retval
Example #30
File: unix.py Project: jso/dumbo
 def exists(self, path, opts):
     return execute("test -e '%s'" % path, printcmd=False, executable=self.shell)
Example #31
File: streaming.py Project: nbyloff/dumbo
 def ls(self, path, opts):
     return execute("%s dfs -ls '%s'" % (self.hdfs, path), printcmd=False)
Example #32
File: unix.py Project: joskid/dumbo
    def run(self):
        retval = Iteration.run(self)
        if retval != 0:
            return retval

        opts = self.opts
        keys = [
            "input",
            "output",
            "mapper",
            "reducer",
            "libegg",
            "delinputs",
            "cmdenv",
            "pv",
            "addpath",
            "inputformat",
            "outputformat",
            "numreducetasks",
            "python",
            "pypath",
            "sorttmpdir",
            "sortbufsize",
        ]
        addedopts = opts.filter(keys)
        opts.remove(*keys)

        mapper, reducer = addedopts["mapper"][0], addedopts["reducer"][0]
        if not addedopts["input"] or not addedopts["output"]:
            print >>sys.stderr, "ERROR: input or output not specified"
            return 1

        _inputs = addedopts["input"]
        _output = addedopts["output"]

        inputs = reduce(operator.concat, (inp.split(" ") for inp in _inputs))
        output = _output[0]

        pyenv = envdef(
            "PYTHONPATH",
            addedopts["libegg"],
            shortcuts=dict(configopts("eggs", self.prog)),
            extrapaths=addedopts["pypath"],
        )
        cmdenv = " ".join("%s='%s'" % tuple(arg.split("=")) for arg in addedopts["cmdenv"])

        if "yes" in addedopts["pv"]:
            mpv = "| pv -s `du -b %s | cut -f 1` -cN map " % " ".join(inputs)
            (spv, rpv) = ("| pv -cN sort ", "| pv -cN reduce ")
        else:
            (mpv, spv, rpv) = ("", "", "")

        sorttmpdir, sortbufsize = "", ""
        if addedopts["sorttmpdir"]:
            sorttmpdir = "-T %s" % addedopts["sorttmpdir"][0]
        if addedopts["sortbufsize"]:
            sortbufsize = "-S %s" % addedopts["sortbufsize"][0]

        python = addedopts["python"][0]
        encodepipe = pyenv + " " + python + " -m dumbo.cmd encodepipe -file " + " -file ".join(inputs)

        if "code" in addedopts["inputformat"]:
            encodepipe += " -alreadycoded yes"
        if addedopts["addpath"] and "no" not in addedopts["addpath"]:
            encodepipe += " -addpath yes"
        if addedopts["numreducetasks"] and addedopts["numreducetasks"][0] == "0":
            retval = execute("%s | %s %s %s %s > '%s'" % (encodepipe, pyenv, cmdenv, mapper, mpv, output))
        else:
            retval = execute(
                "%s | %s %s %s %s| LC_ALL=C sort %s %s %s| %s %s %s %s> '%s'"
                % (
                    encodepipe,
                    pyenv,
                    cmdenv,
                    mapper,
                    mpv,
                    sorttmpdir,
                    sortbufsize,
                    spv,
                    pyenv,
                    cmdenv,
                    reducer,
                    rpv,
                    output,
                )
            )

        if "yes" in addedopts["delinputs"]:
            for _file in addedopts["input"]:
                execute("rm " + _file)
        return retval
Example #33
File: streaming.py Project: nbyloff/dumbo
 def rm(self, path, opts):
     return execute("%s dfs -rmr '%s'" % (self.hdfs, path), printcmd=False)
Example #34
File: streaming.py Project: CyaLiven/dumbo
 def put(self, path1, path2, opts):
     return execute("%s -put '%s' '%s'" % (self.hdfs, path1,
                    path2), printcmd=False)
Example #35
File: streaming.py Project: nbyloff/dumbo
 def run(self):
     retval = Iteration.run(self)
     if retval != 0:
         return retval
     if os.path.exists(self.prog):
         self.opts.append(("file", self.prog))
     addedopts = getopts(
         self.opts,
         [
             "hadoop",
             "name",
             "delinputs",
             "libegg",
             "libjar",
             "inputformat",
             "outputformat",
             "nummaptasks",
             "numreducetasks",
             "priority",
             "queue",
             "cachefile",
             "cachearchive",
             "file",
             "codewritable",
             "addpath",
             "getpath",
             "python",
             "streamoutput",
             "pypath",
         ],
     )
     hadoop = findhadoop(addedopts["hadoop"][0])
     streamingjar = findjar(hadoop, "streaming")
     if not streamingjar:
         print >> sys.stderr, "ERROR: Streaming jar not found"
         return 1
     try:
         import typedbytes
     except ImportError:
         print >> sys.stderr, 'ERROR: "typedbytes" module not found'
         return 1
      modpath = re.sub(r"\.egg.*$", ".egg", typedbytes.__file__)
     if modpath.endswith(".egg"):
         addedopts["libegg"].append(modpath)
     else:
         self.opts.append(("file", modpath))
     self.opts.append(("jobconf", "stream.map.input=typedbytes"))
     self.opts.append(("jobconf", "stream.reduce.input=typedbytes"))
     if addedopts["numreducetasks"] and addedopts["numreducetasks"][0] == "0":
         self.opts.append(("jobconf", "stream.reduce.output=typedbytes"))
         if addedopts["streamoutput"]:
             id_ = addedopts["streamoutput"][0]
             self.opts.append(("jobconf", "stream.map.output=" + id_))
         else:
             self.opts.append(("jobconf", "stream.map.output=typedbytes"))
     else:
         self.opts.append(("jobconf", "stream.map.output=typedbytes"))
         if addedopts["streamoutput"]:
             id_ = addedopts["streamoutput"][0]
             self.opts.append(("jobconf", "stream.reduce.output=" + id_))
         else:
             self.opts.append(("jobconf", "stream.reduce.output=typedbytes"))
     if not addedopts["name"]:
         self.opts.append(("jobconf", "mapred.job.name=" + self.prog.split("/")[-1]))
     else:
         self.opts.append(("jobconf", "mapred.job.name=%s" % addedopts["name"][0]))
     if addedopts["nummaptasks"]:
         self.opts.append(("jobconf", "mapred.map.tasks=%s" % addedopts["nummaptasks"][0]))
     if addedopts["numreducetasks"]:
         numreducetasks = int(addedopts["numreducetasks"][0])
         self.opts.append(("numReduceTasks", str(numreducetasks)))
     if addedopts["priority"]:
         self.opts.append(("jobconf", "mapred.job.priority=%s" % addedopts["priority"][0]))
     if addedopts["queue"]:
         self.opts.append(("jobconf", "mapred.job.queue.name=%s" % addedopts["queue"][0]))
     if addedopts["cachefile"]:
         for cachefile in addedopts["cachefile"]:
             self.opts.append(("cacheFile", cachefile))
     if addedopts["cachearchive"]:
         for cachearchive in addedopts["cachearchive"]:
             self.opts.append(("cacheArchive", cachearchive))
     if addedopts["file"]:
         for file in addedopts["file"]:
             if not "://" in file:
                 if not os.path.exists(file):
                     raise ValueError('file "' + file + '" does not exist')
                 file = "file://" + os.path.abspath(file)
             self.opts.append(("file", file))
     if not addedopts["inputformat"]:
         addedopts["inputformat"] = ["auto"]
     inputformat_shortcuts = {
         "code": "org.apache.hadoop.streaming.AutoInputFormat",
         "text": "org.apache.hadoop.mapred.TextInputFormat",
         "sequencefile": "org.apache.hadoop.streaming.AutoInputFormat",
         "auto": "org.apache.hadoop.streaming.AutoInputFormat",
     }
     inputformat_shortcuts.update(configopts("inputformats", self.prog))
     inputformat = addedopts["inputformat"][0]
      if inputformat.lower() in inputformat_shortcuts:
         inputformat = inputformat_shortcuts[inputformat.lower()]
     self.opts.append(("inputformat", inputformat))
     if not addedopts["outputformat"]:
         addedopts["outputformat"] = ["sequencefile"]
     if addedopts["getpath"] and addedopts["getpath"] != "no":
         outputformat_shortcuts = {
             "code": "fm.last.feathers.output.MultipleSequenceFiles",
             "text": "fm.last.feathers.output.MultipleTextFiles",
             "raw": "fm.last.feathers.output.MultipleRawFileOutputFormat",
             "sequencefile": "fm.last.feathers.output.MultipleSequenceFiles",
         }
     else:
         outputformat_shortcuts = {
             "code": "org.apache.hadoop.mapred.SequenceFileOutputFormat",
             "text": "org.apache.hadoop.mapred.TextOutputFormat",
             "raw": "fm.last.feathers.output.RawFileOutputFormat",
             "sequencefile": "org.apache.hadoop.mapred.SequenceFileOutputFormat",
         }
     outputformat_shortcuts.update(configopts("outputformats", self.prog))
     outputformat = addedopts["outputformat"][0]
      if outputformat.lower() in outputformat_shortcuts:
         outputformat = outputformat_shortcuts[outputformat.lower()]
     self.opts.append(("outputformat", outputformat))
     if addedopts["addpath"] and addedopts["addpath"][0] != "no":
         self.opts.append(("cmdenv", "dumbo_addpath=true"))
     pyenv = envdef(
         "PYTHONPATH",
         addedopts["libegg"],
         "file",
         self.opts,
         shortcuts=dict(configopts("eggs", self.prog)),
         quote=False,
         trim=True,
         extrapaths=addedopts["pypath"],
     )
     if pyenv:
         self.opts.append(("cmdenv", pyenv))
     hadenv = envdef(
         "HADOOP_CLASSPATH", addedopts["libjar"], "libjar", self.opts, shortcuts=dict(configopts("jars", self.prog))
     )
     fileopt = getopt(self.opts, "file")
     if fileopt:
         tmpfiles = []
         for file in fileopt:
             if file.startswith("file://"):
                 self.opts.append(("file", file[7:]))
             else:
                 tmpfiles.append(file)
         if tmpfiles:
             self.opts.append(("jobconf", "tmpfiles=" + ",".join(tmpfiles)))
     libjaropt = getopt(self.opts, "libjar")
     if libjaropt:
         tmpjars = []
         for jar in libjaropt:
             if jar.startswith("file://"):
                 self.opts.append(("file", jar[7:]))
             else:
                 tmpjars.append(jar)
         if tmpjars:
             self.opts.append(("jobconf", "tmpjars=" + ",".join(tmpjars)))
     cmd = hadoop + "/bin/hadoop jar " + streamingjar
     retval = execute(cmd, self.opts, hadenv)
     if addedopts["delinputs"] and addedopts["delinputs"][0] == "yes":
         for (key, value) in self.opts:
             if key == "input":
                 if os.path.exists(hadoop + "/bin/hdfs"):
                     hdfs = hadoop + "/bin/hdfs"
                 else:
                     hdfs = hadoop + "/bin/hadoop"
                 execute("%s dfs -rmr '%s'" % (hdfs, value))
     return retval
Example #36
 def run(self):
     retval = Iteration.run(self)
     if retval != 0:
         return retval
     if os.path.exists(self.prog):
         self.opts.append(('file', self.prog))
     addedopts = getopts(self.opts, ['hadoop',
                                     'name',
                                     'delinputs',
                                     'libegg',
                                     'libjar',
                                     'inputformat',
                                     'outputformat',
                                     'nummaptasks',
                                     'numreducetasks',
                                     'priority',
                                     'queue',
                                     'cachefile',
                                     'cachearchive',
                                     'file',
                                     'codewritable',
                                     'addpath',
                                     'getpath',
                                     'python',
                                     'streamoutput',
                                     'pypath'])
     hadoop = findhadoop(addedopts['hadoop'][0])
     streamingjar = findjar(hadoop, 'streaming')
     if not streamingjar:
         print >> sys.stderr, 'ERROR: Streaming jar not found'
         return 1
     try: 
         import typedbytes
     except ImportError:
         print >> sys.stderr, 'ERROR: "typedbytes" module not found'
         return 1
      modpath = re.sub(r'\.egg.*$', '.egg', typedbytes.__file__)
     if modpath.endswith('.egg'):            
         addedopts['libegg'].append(modpath)    
     else:
         self.opts.append(('file', modpath)) 
     self.opts.append(('jobconf', 'stream.map.input=typedbytes'))
     self.opts.append(('jobconf', 'stream.reduce.input=typedbytes'))
     if addedopts['numreducetasks'] and addedopts['numreducetasks'][0] == '0':
         self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
         if addedopts['streamoutput']:
             id_ = addedopts['streamoutput'][0]
             self.opts.append(('jobconf', 'stream.map.output=' + id_))
         else: 
             self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
     else:
         self.opts.append(('jobconf', 'stream.map.output=typedbytes'))
         if addedopts['streamoutput']:
             id_ = addedopts['streamoutput'][0]
             self.opts.append(('jobconf', 'stream.reduce.output=' + id_))
         else:
             self.opts.append(('jobconf', 'stream.reduce.output=typedbytes'))
     if not addedopts['name']:
         self.opts.append(('jobconf', 'mapred.job.name='
                           + self.prog.split('/')[-1]))
     else:
         self.opts.append(('jobconf', 'mapred.job.name=%s'
                           % addedopts['name'][0]))
     if addedopts['nummaptasks']:
         self.opts.append(('jobconf', 'mapred.map.tasks=%s'
                           % addedopts['nummaptasks'][0]))
     if addedopts['numreducetasks']:
         numreducetasks = int(addedopts['numreducetasks'][0])
         self.opts.append(('numReduceTasks', str(numreducetasks)))
     if addedopts['priority']:
         self.opts.append(('jobconf', 'mapred.job.priority=%s'
                           % addedopts['priority'][0]))
     if addedopts['queue']:
         self.opts.append(('jobconf', 'mapred.job.queue.name=%s'
                           % addedopts['queue'][0]))
     if addedopts['cachefile']:
         for cachefile in addedopts['cachefile']:
             self.opts.append(('cacheFile', cachefile))
     if addedopts['cachearchive']:
         for cachearchive in addedopts['cachearchive']:
             self.opts.append(('cacheArchive', cachearchive))
     if addedopts['file']:
         for file in addedopts['file']:
             if not '://' in file:
                 if not os.path.exists(file):
                     raise ValueError('file "' + file + '" does not exist')
                 file = 'file://' + os.path.abspath(file)
             self.opts.append(('file', file))
     if not addedopts['inputformat']:
         addedopts['inputformat'] = ['auto']
     inputformat_shortcuts = \
         {'code': 'org.apache.hadoop.streaming.AutoInputFormat',
          'text': 'org.apache.hadoop.mapred.TextInputFormat',
          'sequencefile': 'org.apache.hadoop.streaming.AutoInputFormat',
          'auto': 'org.apache.hadoop.streaming.AutoInputFormat'}
     inputformat_shortcuts.update(configopts('inputformats', self.prog))
     inputformat = addedopts['inputformat'][0]
      if inputformat.lower() in inputformat_shortcuts:
         inputformat = inputformat_shortcuts[inputformat.lower()]
     self.opts.append(('inputformat', inputformat))
     if not addedopts['outputformat']:
         addedopts['outputformat'] = ['sequencefile']
      if addedopts['getpath'] and addedopts['getpath'][0] != 'no':
         outputformat_shortcuts = \
             {'code': 'fm.last.feathers.output.MultipleSequenceFiles',
              'text': 'fm.last.feathers.output.MultipleTextFiles',               
              'raw': 'fm.last.feathers.output.MultipleRawFileOutputFormat',
              'sequencefile': 'fm.last.feathers.output.MultipleSequenceFiles'}
     else:
         outputformat_shortcuts = \
             {'code': 'org.apache.hadoop.mapred.SequenceFileOutputFormat',
              'text': 'org.apache.hadoop.mapred.TextOutputFormat',
              'raw': 'fm.last.feathers.output.RawFileOutputFormat',
              'sequencefile': 'org.apache.hadoop.mapred.SequenceFileOutputFormat'}
     outputformat_shortcuts.update(configopts('outputformats', self.prog))
     outputformat = addedopts['outputformat'][0]
      if outputformat.lower() in outputformat_shortcuts:
         outputformat = outputformat_shortcuts[outputformat.lower()]
     self.opts.append(('outputformat', outputformat))
     if addedopts['addpath'] and addedopts['addpath'][0] != 'no':
         self.opts.append(('cmdenv', 'dumbo_addpath=true'))
     pyenv = envdef('PYTHONPATH',
                    addedopts['libegg'],
                    'file',
                    self.opts,
                    shortcuts=dict(configopts('eggs', self.prog)),
                    quote=False,
                    trim=True,
                    extrapaths=addedopts['pypath'])
     if pyenv:
         self.opts.append(('cmdenv', pyenv))
     hadenv = envdef('HADOOP_CLASSPATH', addedopts['libjar'], 'libjar', 
                     self.opts, shortcuts=dict(configopts('jars', self.prog)))
     fileopt = getopt(self.opts, 'file')
     if fileopt:
         tmpfiles = []
         for file in fileopt:
             if file.startswith('file://'):
                 self.opts.append(('file', file[7:]))
             else:
                 tmpfiles.append(file)
         if tmpfiles:
             self.opts.append(('jobconf', 'tmpfiles=' + ','.join(tmpfiles)))
     libjaropt = getopt(self.opts, 'libjar')
     if libjaropt:
         tmpjars = []
         for jar in libjaropt:
             if jar.startswith('file://'):
                 self.opts.append(('file', jar[7:]))
             else:
                 tmpjars.append(jar)
         if tmpjars:
             self.opts.append(('jobconf', 'tmpjars=' + ','.join(tmpjars)))
     cmd = hadoop + '/bin/hadoop jar ' + streamingjar
     retval = execute(cmd, self.opts, hadenv)
     if addedopts['delinputs'] and addedopts['delinputs'][0] == 'yes':
         for (key, value) in self.opts:
             if key == 'input':
                 if os.path.exists(hadoop + "/bin/hdfs"):
                     hdfs = hadoop + "/bin/hdfs"
                 else:
                     hdfs = hadoop + "/bin/hadoop"
                 execute("%s dfs -rmr '%s'" % (hdfs, value))
     return retval