def call_hadoop(self, arguments, workdir, identifier, machine):
    self.is_cacheable = lambda *args, **kwargs: False
    config = self.get_hadoop_config(machine)
    argList = [config['hadoop']]
    if type(arguments) in [str, unicode]:
        argList += arguments.split(' ')
    elif type(arguments) == list:
        argList += arguments
    else:
        raise ModuleError(self, 'Invalid argument types to hadoop')

    # 1. this version returns when finished
    #return subprocess.call(argList)

    # 2. this version reads the results incrementally
    #expect = machine.remote._expect_token
    #machine.remote.push_expect(None)  # Do not wait for call to finish
    #result = machine.remote.send_command(" ".join(argList)).strip()
    #machine.remote.pop_expect()  # restore expect
    ## We could show the output in a gui
    #print "**** hadoop streaming running ****"
    #print result,
    #while not expect in result:
    #    output = machine.remote.consume_output()
    #    if output:
    #        print output,
    #    result += output

    # 3. The final version should detach the process on the server
    use_machine(machine)
    cdir = CreateDirectory("remote", workdir)
    job = Subshell("remote", command=" ".join(argList),
                   working_directory=workdir,
                   identifier=identifier,
                   dependencies=[cdir])
    job.run()
    finished = job.finished()
    if not finished:
        status = job.status()
        # The Subshell class provides the JobHandle interface, i.e.
        # finished()
        raise ModuleSuspended(self, '%s' % status, handle=job)
    self.is_cacheable = lambda *args, **kwargs: True
    return job.standard_error()
def call_hadoop(self, arguments, workdir, identifier, machine):
    self.is_cacheable = lambda *args, **kwargs: False
    config = self.get_hadoop_config(machine)
    argList = [config['hadoop']]
    if type(arguments) in [str, unicode]:
        argList += arguments.split(' ')
    elif type(arguments) == list:
        argList += arguments
    else:
        raise ModuleError(self, 'Invalid argument types to hadoop')

    # 1. this version returns when finished
    #return subprocess.call(argList)

    # 2. this version reads the results incrementally
    #expect = machine.remote._expect_token
    #machine.remote.push_expect(None)  # Do not wait for call to finish
    #result = machine.remote.send_command(" ".join(argList)).strip()
    #machine.remote.pop_expect()  # restore expect
    ## We could show the output in a gui
    #print "**** hadoop streaming running ****"
    #print result,
    #while not expect in result:
    #    output = machine.remote.consume_output()
    #    if output:
    #        print output,
    #    result += output

    # 3. The final version should detach the process on the server
    use_machine(machine)
    cdir = CreateDirectory("remote", workdir)
    job = Subshell("remote", command=" ".join(argList),
                   working_directory=workdir,
                   identifier=identifier,
                   dependencies=[cdir])
    job.run()
    finished = job.finished()
    if not finished:
        status = job.status()
        # The Subshell class provides the BaseMonitor interface, i.e.
        # finished()
        raise ModuleSuspended(self, '%s' % status, monitor=job)
    self.is_cacheable = lambda *args, **kwargs: True
    return job.standard_error()
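# For context: the ModuleSuspended raised in the two call_hadoop variants above
# only needs the handle/monitor object to expose finished(), status() and, once
# done, standard_error().  A self-contained sketch of that polling contract;
# FakeHandle and poll_until_done are hypothetical stand-ins for the remote
# Subshell and for the framework's resume logic, not part of the package.
import time

class FakeHandle(object):
    def __init__(self, polls_needed=3):
        self._left = polls_needed            # pretend the job needs a few polls
    def finished(self):
        self._left -= 1
        return self._left <= 0
    def status(self):
        return "running (%d polls left)" % self._left
    def standard_error(self):
        return "hadoop streaming log ..."

def poll_until_done(handle, interval=0.1):
    while not handle.finished():             # what a caller does after catching
        print(handle.status())               # ModuleSuspended: keep the handle
        time.sleep(interval)                 # and re-check it later
    return handle.standard_error()

print(poll_until_done(FakeHandle()))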
class RunJob(RQModule):
    """ Run an asynchronous command that can be detached and polled.
        This is preferable over RunCommand for long-running operations
    """
    _input_ports = [('machine', Machine),
                    ('command', '(edu.utah.sci.vistrails.basic:String)', True),
                    ('working_directory', '(edu.utah.sci.vistrails.basic:String)'),
                   ]
    _output_ports = [('stdout', '(edu.utah.sci.vistrails.basic:String)'),
                     ('stderr', '(edu.utah.sci.vistrails.basic:String)'),
                    ]

    job = None

    def job_read_inputs(self):
        d = {}
        if not self.has_input('command'):
            raise ModuleError(self, "No command specified")
        d['command'] = self.get_input('command').strip()
        d['working_directory'] = self.get_input('working_directory') \
                                 if self.has_input('working_directory') else '.'
        return d

    def job_start(self, params):
        work_dir = params['working_directory']
        self.machine = self.get_machine()
        use_machine(self.machine)
        self.job = Subshell("remote", params['command'], work_dir)
        self.job.run()
        ret = self.job._ret
        if ret:
            try:
                job_id = int(ret.split('\n')[0])
            except ValueError:
                end_machine()
                raise ModuleError(self, "Error submitting job: %s" % ret)
        self.set_job_machine(params, self.machine)
        return params

    def job_get_handle(self, params):
        if not self.job:
            self.job_start(params)
        return self.job

    def job_finish(self, params):
        params['stdout'] = self.job.standard_output()
        params['stderr'] = self.job.standard_error()
        if self.job.failed():
            self.job._pushw()
            code = self.job.terminal.cat("%s.failed" %
                                         self.job._identifier_filename)
            self.job._popw()
            end_machine()
            raise ModuleError(self,
                              "Command failed with exit code %s: %s" %
                              (code.strip(), params['stderr'].strip()))
        end_machine()
        return params

    def job_set_results(self, params):
        self.set_output('stdout', params['stdout'])
        self.set_output('stderr', params['stderr'])
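# The job_* methods above are designed to be driven in a fixed order by the
# surrounding framework.  A minimal sketch of that order, under the assumption
# that a caller simply invokes them in sequence; the drive() helper below is
# hypothetical and stands in for whatever scheduler actually owns the module.
def drive(module):
    params = module.job_read_inputs()        # collect and validate input ports
    params = module.job_start(params)        # submit the remote command
    handle = module.job_get_handle(params)   # poll-able job object (Subshell)
    if not handle.finished():                # real framework would suspend and retry
        raise RuntimeError("job still running; poll the handle and re-enter")
    params = module.job_finish(params)       # fetch stdout/stderr, detect failure
    module.job_set_results(params)           # publish results to output ports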
class HadoopStreaming(HadoopBaseModule):
    """
    The class for executing MapReduce using Hadoop Streaming with
    customized Python Mapper/Reducer/Combiner
    """
    _settings = ModuleSettings(namespace='hadoop')
    _input_ports = [IPort('Mapper', File),
                    IPort('Reducer', File),
                    IPort('Combiner', File),
                    IPort('Workdir', String),
                    IPort('Identifier', String),
                    IPort('Input', String),
                    IPort('Output', String),
                    IPort('CacheFile', String),
                    IPort('CacheArchive', String),
                    IPort('Environment', String),
                    IPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)')]
    _output_ports = [OPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)'),
                     OPort('Output', String)]

    def __init__(self):
        HadoopBaseModule.__init__(self)
        self.job = None
        self.job_machine = None

    def job_read_inputs(self):
        p = {}
        self.localMapper = self.force_get_input('Mapper')
        self.localReducer = self.force_get_input('Reducer')
        self.localCombiner = self.force_get_input('Combiner')
        p['workdir'] = self.force_get_input('Workdir')
        if p['workdir'] is None:
            p['workdir'] = ".vistrails-hadoop"
        p['job_identifier'] = self.force_get_input('Identifier')
        if p['job_identifier'] is None:
            raise ModuleError(self, 'Job Identifier is required')
        p['input'] = self.force_get_input('Input')
        p['output'] = self.force_get_input('Output')
        if p['input'] is None or p['output'] is None:
            raise ModuleError(self, 'Input and Output are required')
        p['files'] = self.force_get_input_list('CacheFile')
        p['cacheArchives'] = self.force_get_input_list('CacheArchive')
        p['envVars'] = self.force_get_input_list('Environment')
        return p

    def createJob(self, p):
        self.job_machine = self.get_machine()
        use_machine(self.job_machine)
        self.job = Subshell("remote", command="%s",
                            working_directory=p['workdir'],
                            identifier=p['job_identifier'])

    def job_start(self, p):
        self.createJob(p)
        if not self.job_machine.remote.isdir(p['workdir']):
            self.job_machine.remote.mkdir(p['workdir'])
        self.set_job_machine(p, self.job_machine)
        self.job.reset()

        # Now generate the command line
        config = self.get_hadoop_config(self.job_machine)
        command = 'jar %s' % config['streaming.jar']
        generics = ''
        arguments = ''
        if '://' not in p['input']:
            p['input'] = self.add_prefix(p['input'], self.job_machine)
        if '://' not in p['output']:
            p['output'] = self.add_prefix(p['output'], self.job_machine)
        arguments += ' -input %s -output %s' % (p['input'], p['output'])

        if self.localMapper is not None:
            tempfile = self.job_machine.remote.send_command('mktemp').strip()
            result = self.job_machine.sendfile(self.localMapper.name, tempfile)
            mapperFileName = os.path.split(tempfile)[1]
            p['files'].append(tempfile)
            arguments += ' -mapper %s' % mapperFileName
        else:
            arguments += ' -mapper org.apache.hadoop.mapred.lib.IdentityMapper'
        if self.localCombiner is not None:
            tempfile = self.job_machine.remote.send_command('mktemp').strip()
            result = self.job_machine.sendfile(self.localCombiner.name, tempfile)
            combinerFileName = os.path.split(tempfile)[1]
            p['files'].append(tempfile)
            arguments += ' -combiner %s' % combinerFileName
        if self.localReducer is not None:
            tempfile = self.job_machine.remote.send_command('mktemp').strip()
            result = self.job_machine.sendfile(self.localReducer.name, tempfile)
            reducerFileName = os.path.split(tempfile)[1]
            p['files'].append(tempfile)
            arguments += ' -reducer %s' % reducerFileName
        else:
            arguments += ' -numReduceTasks 0'

        for var in p['envVars']:
            arguments += ' -cmdenv ' + var
        for cacheArchive in p['cacheArchives']:
            arguments += ' -cacheArchive %s' % cacheArchive
        #from init import configuration
        #if configuration.check('uris') and configuration.uris:
        #    for uri in configuration.uris.split(';'):
        #        p['files'].append(uri)

        # files is a generic command and needs to be first
        if p['files']:
            generics += ' -files ' + ','.join(p['files'])

        arguments = command + generics + arguments
        result = self.call_hadoop(arguments, p['workdir'],
                                  p['job_identifier'], self.job_machine)
        return p

    def job_get_handle(self, p):
        if not self.job:
            self.createJob(p)
        return self.job

    def job_finish(self, p):
        r = {}
        r['output'] = p['output']
        r['workdir'] = p['workdir']
        r['job_identifier'] = p['job_identifier']
        self.annotate({'hadoop_log': self.job.standard_error()})
        if self.job.failed():
            error = self.job.standard_error()
            raise ModuleError(self, error)
        return r

    def job_set_results(self, p):
        self.set_output('Output', p['output'])
        self.set_output('Machine', self.job_machine)

    def call_hadoop(self, arguments, workdir, identifier, machine):
        config = self.get_hadoop_config(machine)
        argList = [config['hadoop']]
        if type(arguments) in [str, unicode]:
            argList += arguments.split(' ')
        elif type(arguments) == list:
            argList += arguments
        else:
            raise ModuleError(self, 'Invalid argument types to hadoop')
        self.annotate({'hadoop_command': " ".join(argList)})
        self.job.command = self.job.command % " ".join(argList)
        self.job.run()
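# To make the string assembly in job_start concrete, this standalone helper
# composes the same 'jar ... -files ... -input ... -output ...' argument
# string in isolation.  build_streaming_args and all paths below are made up
# for illustration; they are not part of the HadoopStreaming module.
def build_streaming_args(streaming_jar, input_uri, output_uri,
                         mapper=None, reducer=None, files=()):
    command = 'jar %s' % streaming_jar
    generics = ' -files ' + ','.join(files) if files else ''
    args = ' -input %s -output %s' % (input_uri, output_uri)
    if mapper:
        args += ' -mapper %s' % mapper
    else:
        args += ' -mapper org.apache.hadoop.mapred.lib.IdentityMapper'
    if reducer:
        args += ' -reducer %s' % reducer
    else:
        args += ' -numReduceTasks 0'
    return command + generics + args

# Example (illustrative paths); the mapper/reducer names are the basenames of
# the uploaded temp files, mirroring what job_start does with mktemp + sendfile.
print(build_streaming_args(
    '/usr/lib/hadoop-mapreduce/hadoop-streaming.jar',
    'hdfs:///user/alice/input', 'hdfs:///user/alice/output',
    mapper='tmp.mapper', reducer='tmp.reducer',
    files=['/tmp/tmp.mapper', '/tmp/tmp.reducer']))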
class HadoopStreaming(HadoopBaseModule):
    """
    The class for executing MapReduce using Hadoop Streaming with
    customized Python Mapper/Reducer/Combiner
    """
    _settings = ModuleSettings(namespace='hadoop')
    _input_ports = [IPort('Mapper', File),
                    IPort('Reducer', File),
                    IPort('Combiner', File),
                    IPort('Workdir', String),
                    IPort('Identifier', String),
                    IPort('Input', String),
                    IPort('Output', String),
                    IPort('CacheFile', String),
                    IPort('CacheArchive', String),
                    IPort('Environment', String),
                    IPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)')]
    _output_ports = [OPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)'),
                     OPort('Output', String)]

    def __init__(self):
        HadoopBaseModule.__init__(self)
        self.job = None
        self.job_machine = None

    def readInputs(self):
        p = {}
        self.localMapper = self.force_get_input('Mapper')
        self.localReducer = self.force_get_input('Reducer')
        self.localCombiner = self.force_get_input('Combiner')
        p['workdir'] = self.force_get_input('Workdir')
        if p['workdir'] is None:
            p['workdir'] = ".vistrails-hadoop"
        p['job_identifier'] = self.force_get_input('Identifier')
        if p['job_identifier'] is None:
            raise ModuleError(self, 'Job Identifier is required')
        p['input'] = self.force_get_input('Input')
        p['output'] = self.force_get_input('Output')
        if p['input'] is None or p['output'] is None:
            raise ModuleError(self, 'Input and Output are required')
        p['files'] = self.force_get_input_list('CacheFile')
        p['cacheArchives'] = self.force_get_input_list('CacheArchive')
        p['envVars'] = self.force_get_input_list('Environment')
        return p

    def createJob(self, p):
        self.job_machine = self.get_machine()
        use_machine(self.job_machine)
        self.job = Subshell("remote", command="%s",
                            working_directory=p['workdir'],
                            identifier=p['job_identifier'])

    def startJob(self, p):
        self.createJob(p)
        if not self.job_machine.remote.isdir(p['workdir']):
            self.job_machine.remote.mkdir(p['workdir'])
        self.set_job_machine(p, self.job_machine)
        self.job.reset()

        # Now generate the command line
        config = self.get_hadoop_config(self.job_machine)
        command = 'jar %s' % config['streaming.jar']
        generics = ''
        arguments = ''
        if '://' not in p['input']:
            p['input'] = self.add_prefix(p['input'], self.job_machine)
        if '://' not in p['output']:
            p['output'] = self.add_prefix(p['output'], self.job_machine)
        arguments += ' -input %s -output %s' % (p['input'], p['output'])

        if self.localMapper is not None:
            tempfile = self.job_machine.remote.send_command('mktemp').strip()
            result = self.job_machine.sendfile(self.localMapper.name, tempfile)
            mapperFileName = os.path.split(tempfile)[1]
            p['files'].append(tempfile)
            arguments += ' -mapper %s' % mapperFileName
        else:
            arguments += ' -mapper org.apache.hadoop.mapred.lib.IdentityMapper'
        if self.localCombiner is not None:
            tempfile = self.job_machine.remote.send_command('mktemp').strip()
            result = self.job_machine.sendfile(self.localCombiner.name, tempfile)
            combinerFileName = os.path.split(tempfile)[1]
            p['files'].append(tempfile)
            arguments += ' -combiner %s' % combinerFileName
        if self.localReducer is not None:
            tempfile = self.job_machine.remote.send_command('mktemp').strip()
            result = self.job_machine.sendfile(self.localReducer.name, tempfile)
            reducerFileName = os.path.split(tempfile)[1]
            p['files'].append(tempfile)
            arguments += ' -reducer %s' % reducerFileName
        else:
            arguments += ' -numReduceTasks 0'

        for var in p['envVars']:
            arguments += ' -cmdenv ' + var
        for cacheArchive in p['cacheArchives']:
            arguments += ' -cacheArchive %s' % cacheArchive
        #from init import configuration
        #if configuration.check('uris') and configuration.uris:
        #    for uri in configuration.uris.split(';'):
        #        p['files'].append(uri)

        # files is a generic command and needs to be first
        if p['files']:
            generics += ' -files ' + ','.join(p['files'])

        arguments = command + generics + arguments
        result = self.call_hadoop(arguments, p['workdir'],
                                  p['job_identifier'], self.job_machine)
        return p

    def getMonitor(self, p):
        if not self.job:
            self.createJob(p)
        return self.job

    def finishJob(self, p):
        r = {}
        r['output'] = p['output']
        r['workdir'] = p['workdir']
        r['job_identifier'] = p['job_identifier']
        self.annotate({'hadoop_log': self.job.standard_error()})
        if self.job.failed():
            error = self.job.standard_error()
            raise ModuleError(self, error)
        return r

    def setResults(self, p):
        self.set_output('Output', p['output'])
        self.set_output('Machine', self.job_machine)

    def call_hadoop(self, arguments, workdir, identifier, machine):
        config = self.get_hadoop_config(machine)
        argList = [config['hadoop']]
        if type(arguments) in [str, unicode]:
            argList += arguments.split(' ')
        elif type(arguments) == list:
            argList += arguments
        else:
            raise ModuleError(self, 'Invalid argument types to hadoop')
        self.annotate({'hadoop_command': " ".join(argList)})
        self.job.command = self.job.command % " ".join(argList)
        self.job.run()
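# Both HadoopStreaming variants create their Subshell with the placeholder
# command="%s"; the real invocation is substituted later in call_hadoop via
# self.job.command % " ".join(argList).  A minimal standalone sketch of that
# deferred substitution (the strings below are made up for illustration):
template = "%s"
arg_list = ['/usr/bin/hadoop', 'jar', 'hadoop-streaming.jar',
            '-input', 'in', '-output', 'out']
command = template % " ".join(arg_list)
assert command == "/usr/bin/hadoop jar hadoop-streaming.jar -input in -output out"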