def test_init():
    print 'Test init'
    t = pbs.new_attropl(1)
    print type(t)
    print str(t)
    w = pbs.new_attropl(2)
    print type(w)
def main():
    pbs_server = pbs.pbs_default()
    if not pbs_server:
        print 'No default server'
        sys.exit(1)

    if len(sys.argv) < 2:
        print "Usage: set_property.py <hostname>"
        sys.exit(1)

    hostname = sys.argv[1]

    con = pbs.pbs_connect(pbs_server)

    attrop_l = pbs.new_attropl(1)
    attrop_l[0].name = 'note'
    attrop_l[0].value = 'set_something_useful'
    attrop_l[0].op = pbs.SET

    r = pbs.pbs_manager(con, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE,
                        hostname, attrop_l, 'NULL')
    if r > 0:
        print r, ";"
        errno, text = pbs.error()
        print errno, text
def submit_get_subfamilies_job(job):
    # This is how we are passing the fasta and job id to the script
    server_name = pbs.pbs_default()
    c = pbs.pbs_connect(server_name)

    attropl = pbs.new_attropl(5)
    attropl[0].name = pbs.ATTR_N
    attropl[0].value = "FAT-CAT Get Sub-Families: %s" % job.id
    attropl[1].name = pbs.ATTR_l
    attropl[1].resource = 'nodes'
    attropl[1].value = '1:ppn=1'
    attropl[2].name = pbs.ATTR_o
    attropl[2].value = JOB_LOG_FILE
    attropl[3].name = pbs.ATTR_e
    attropl[3].value = JOB_LOG_FILE
    attropl[4].name = pbs.ATTR_v
    attropl[4].value = "job_id=%s" % (job.id)

    job.status_id = 5
    job.save()

    job_id = pbs.pbs_submit(c, attropl,
                            "/clusterfs/ohana/software/fatcat/scripts/get_best_nodes.py",
                            'web', 'NULL')
    logger.info("Submitting %s to the grid to get best nodes with id %s" % (job.id, job_id))

    if job_id:
        job.get_best_nodes_pbs_job_id = job_id
        job.save()

    pbs.pbs_disconnect(c)
    return job_id
def get_attropl(self):
    data = self.parse_pbs()

    ### First all the keys, except for --resource-list/-l
    ### (and perhaps more later)
    length = len([x for x in data.keys() if x not in ['l']])
    if data.has_key('l'):
        length += len(data['l'].split(','))

    attropl = pbs.new_attropl(length)

    index = 0
    for attr, value in data.items():
        if attr in ['l']:
            for v in value.split(','):
                parts = v.split('=')
                resource, value = parts[0], "=".join(parts[1:])
                attropl[index].name = getattr(pbs, 'ATTR_' + attr)
                attropl[index].resource = resource
                attropl[index].value = value
                index += 1
        else:
            attropl[index].name = getattr(pbs, 'ATTR_' + attr)
            attropl[index].value = value
            index += 1

    return attropl
def pbs_change_note_clear( self, nodes ):
    attributes = pbs.new_attropl(1)
    attributes[0].name = pbs.ATTR_NODE_note
    attributes[0].value = ''
    attributes[0].op = pbs.SET

    self.verbose_print( '%*s: cleared' % ( 7, 'Note') )
    self.pbs_batch( nodes, attributes )
def job_list(ID, status):
    # Select jobs whose ATTR_state equals the given status
    attribs = pbs.new_attropl(1)
    attribs[0].name = pbs.ATTR_state
    attribs[0].value = status
    attribs[0].op = pbs.EQ
    jobs = pbs.pbs_selectjob(ID, attribs, 'NULL')
    return jobs
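# Illustrative usage sketch for job_list() above (not part of the original
# snippet): the ID argument is assumed to be an open connection handle, and
# "R" the running-state code, as used elsewhere in this collection.
def list_running_jobs():
    server = pbs.pbs_default()
    con = pbs.pbs_connect(server)
    running = job_list(con, "R")
    pbs.pbs_disconnect(con)
    return running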
def test_loop():
    print 'Loop'
    w = pbs.new_attropl(2)
    w[0].name = 'bas'
    w[0].value = 'man'
    w[1].name = 'jaap'
    w[1].value = 'man'
    for i in w:
        print i.name
def pbs_change_state_down( self, nodes ):
    attributes = pbs.new_attropl(2)
    attributes[0].name = pbs.ATTR_NODE_state
    attributes[0].value = 'down'
    attributes[0].op = pbs.SET
    attributes[1].name = 'note'
    attributes[1].value = ''

    self.verbose_print( '%*s: down' % ( 7, 'State') )
    self.verbose_print( '%*s: cleared' % ( 7, 'Note' ) )
    self.pbs_batch( nodes, attributes )
def run_cluster(self, pbs_server, job_script, settings):
    import pbs
    from threading import Thread

    self.settings = copy.deepcopy(settings)

    # Launch script, wait for output to come back, return when it does

    # Create the job options struct
    attropl = pbs.new_attropl(4)

    # Set the name of the job
    attropl[0].name = pbs.ATTR_N
    attropl[0].value = "inferno_" + self.name

    # Job is Rerunable
    attropl[1].name = pbs.ATTR_r
    attropl[1].value = "y"

    # Walltime
    attropl[2].name = pbs.ATTR_l
    attropl[2].resource = "walltime"
    attropl[2].value = "400"

    # Nodes
    attropl[3].name = pbs.ATTR_l
    attropl[3].resource = "nodes"
    attropl[3].value = "1:ppn=4"

    # Run the job
    if pbs_server == None:
        pbs_server = pbs.pbs_default()
    # pbs_submit and pbs_statjob need a connection handle, not the server name
    conn = pbs.pbs_connect(pbs_server)
    job_id = pbs.pbs_submit(conn, attropl, job_script, "NULL", "NULL")

    e, e_txt = pbs.error()
    if e:
        print e, e_txt

    # Save the job ID for later so we can check on the status
    self.job_id = job_id

    # TODO: Change this
    # Now loop, checking every 5 seconds or so if the job is done by
    # polling the pbs_server about the jobid.
    running = True
    while running:
        job_info = pbs.pbs_statjob(conn, self.job_id, "NULL", "NULL")
        print job_info
        time.sleep(5)
def pbs_change_state_offline( self, nodes, note, ticket=None ):
    attributes = pbs.new_attropl(1)
    attributes[0].name = pbs.ATTR_NODE_state
    attributes[0].value = 'offline'
    attributes[0].op = pbs.SET

    note_attributes = { 'note': note, 'ticket': ticket, 'mode': 'a' }

    self.verbose_print( '%*s: offline' % ( 7, 'State') )
    self.verbose_print( '%*s: %s' % ( 7, 'Note', note ) )
    if ticket:
        self.verbose_print( '%*s: %s' % ( 7, 'Ticket', ticket ) )
    self.pbs_batch( nodes, attributes, note_attributes )
def submit_fxn_site_prediction_job(job):
    # This is how we are passing the fasta and job id to the script
    server_name = pbs.pbs_default()
    c = pbs.pbs_connect(server_name)
    print server_name
    print c

    attropl = pbs.new_attropl(7)
    attropl[0].name = pbs.ATTR_N
    attropl[0].value = "Functional Site Prediction Job: %s" % job.id
    attropl[1].name = pbs.ATTR_l
    attropl[1].resource = 'nodes'
    attropl[1].value = '1:ppn=1'
    attropl[2].name = pbs.ATTR_o
    attropl[2].value = JOB_LOG_FILE
    attropl[3].name = pbs.ATTR_e
    attropl[3].value = JOB_LOG_FILE
    attropl[4].name = pbs.ATTR_v
    attropl[4].value = "job_id=%s" % (job.id)
    attropl[5].name = pbs.ATTR_r
    attropl[5].value = 'y'
    attropl[6].name = pbs.ATTR_l
    attropl[6].resource = 'walltime'
    attropl[6].value = '1000'

    job.status_id = 2
    job.save()

    job_id = pbs.pbs_submit(
        c, attropl,
        "/home/cyrus_afrasiabi/ohana_repository/bpg/fxn_site_prediction.py",
        'web', 'NULL')
    logger.info(
        "Submitting %s to the grid to get functional site predictions with id %s"
        % (job.id, job_id))

    if job_id:
        job.pbs_job_id = job_id
        job.save()

    pbs.pbs_disconnect(c)
    return job_id
def submit_intrepid_job(job):
    # This is how we are passing the fasta and job id to the script
    server_name = pbs.pbs_default()
    c = pbs.pbs_connect(server_name)

    attropl = pbs.new_attropl(6)
    attropl[0].name = pbs.ATTR_N
    attropl[0].value = "INTREPID Job: %s" % job.id
    attropl[1].name = pbs.ATTR_l
    attropl[1].resource = 'nodes'
    attropl[1].value = '1:ppn=8'
    attropl[2].name = pbs.ATTR_o
    attropl[2].value = JOB_LOG_FILE
    attropl[3].name = pbs.ATTR_e
    attropl[3].value = JOB_LOG_FILE
    attropl[4].name = pbs.ATTR_v
    attropl[4].value = "job_id=%s" % (job.id)
    attropl[5].name = pbs.ATTR_l
    attropl[5].resource = 'walltime'
    attropl[5].value = '48:00:00'

    if job.development_job:
        job_id = pbs.pbs_submit(
            c, attropl,
            "/clusterfs/ohana/software/intrepid/scripts/intrepid_development_pipeline.py",
            'web', 'NULL')
    else:
        job_id = pbs.pbs_submit(
            c, attropl,
            "/clusterfs/ohana/software/intrepid/scripts/intrepid_pipeline.py",
            'web', 'NULL')

    logger.info("Submitting %s to the grid with id %s" % (job.id, job_id))

    if job_id:
        job.pbs_job_id = job_id
        job.status_id = JOB_SUBMITTED
        job.save()
    else:
        pass

    pbs.pbs_disconnect(c)
    return job_id
def _job2AttrOp(self, job):
    attr = pbs.new_attropl(4)
    attr[0].name = pbs.ATTR_N
    attr[0].value = job.name
    attr[1].name = pbs.ATTR_l
    attr[1].resource = 'walltime'
    attr[1].value = str(job.walltime)
    attr[2].name = pbs.ATTR_l
    attr[2].resource = 'nodes'
    attr[2].value = str(job.nodes) + ":ppn=" + str(job.ppn)
    attr[3].name = pbs.ATTR_t
    attr[3].value = str(job.range[0]) + "-" + str(job.range[1])
    return attr
def submitScript(script):
    result = {}
    try:
        pbs_connection = pbs.pbs_connect(pbs.pbs_default())
        # queues = pbs.pbs_statque(pbs_connection, "batch", "NULL", "NULL")

        attropl = pbs.new_attropl(4)
        # Set the name of the job
        attropl[0].name = pbs.ATTR_N
        attropl[0].value = str(script['jobName']) if script['jobName'] else "new_job"
        # Job is Rerunable
        attropl[1].name = pbs.ATTR_r
        attropl[1].value = 'y'
        # Walltime
        attropl[2].name = pbs.ATTR_l
        attropl[2].resource = 'walltime'
        attropl[2].value = str(script['maxTime']) if script['maxTime'] else '01:00:00'
        # Nodes
        attropl[3].name = pbs.ATTR_l
        attropl[3].resource = 'nodes'
        attropl[3].value = ('1:ppn=' + str(script['cpuNumber'])) if script['cpuNumber'] else '1'

        # A1.tsk is the job script filename
        job_id = pbs.pbs_submit(pbs_connection, attropl, str(script['scriptName']),
                                str(script['queue']), 'NULL')

        e, e_txt = pbs.error()
        if e:
            result['Result'] = 'ERROR'
            result['Message'] = str(e) + ' : ' + e_txt
        else:
            result['Result'] = 'OK'
            result['Message'] = job_id
    except Exception as exc:
        result['Result'] = 'ERROR'
        result['Message'] = str(exc)

    return result
def test_getitem():
    print 'Test getitem'
    w = pbs.new_attropl(2)
    b = w[0]
    c = w[1]

    b.name = 'bas'
    b.value = 'vlies'
    b.op = pbs.INCR
    print type(b)
    print 'b', str(b)

    c.name = 'cbassssssssssss'
    c.value = 'cvlies'
    c.op = pbs.DECR
    print type(c)
    print 'c', str(c), repr(c)
def __make_pbs_attrs(resources, attributes):
    pbs_attrs = pbs.new_attropl(len(attributes) + len(resources))

    # populate pbs_attrs
    attr_idx = 0
    for resource, val in resources.iteritems():
        pbs_attrs[attr_idx].name = pbs.ATTR_l
        pbs_attrs[attr_idx].resource = resource
        pbs_attrs[attr_idx].value = str(val)
        attr_idx += 1

    for attribute, val in attributes.iteritems():
        pbs_attrs[attr_idx].name = attribute
        pbs_attrs[attr_idx].value = str(val)
        attr_idx += 1

    return pbs_attrs
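# Hypothetical usage of __make_pbs_attrs() above (the resource and attribute
# values here are illustrative, not from the original source): resources map
# to Resource_List entries, plain attributes are passed through unchanged.
#
#   attrs = __make_pbs_attrs({'walltime': '01:00:00', 'nodes': '1:ppn=2'},
#                            {pbs.ATTR_N: 'example_job'})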
def do_clear_note(self, nodes):
    '''Clear the note on the node(s)'''
    if ARGS_VERBOSE:
        _print('class:SaraNodes func:do_clear_note input:%s' % str(nodes), file=sys.stderr)

    attributes = pbs.new_attropl(1)
    attributes[0].name = pbs.ATTR_NODE_note
    attributes[0].op = pbs.SET
    attributes[0].value = ''

    batch_list = list()

    ## again a loop, now create the attrib dict list
    for node in nodes:
        batch_list.append(tuple([node, attributes]))

    self._process(batch_list)
def do_modify(self, nodes, note):
    '''Modify the note on a node, overriding the previous note'''
    if ARGS_VERBOSE:
        _print('class:SaraNodes func:do_modify input:%s,%s' % (str(nodes), note), file=sys.stderr)

    attributes = pbs.new_attropl(1)
    attributes[0].name = pbs.ATTR_NODE_note
    attributes[0].op = pbs.SET

    batch_list = list()

    ## again a loop, now create the attrib dict list
    for node, note in self._generate_note(nodes, note, append=False).items():
        attributes[0].value = note
        batch_list.append(tuple([node, attributes]))

    self._process(batch_list)
def main():
    pbs_server = pbs.pbs_default()
    if not pbs_server:
        print 'No default server'
        sys.exit(1)

    con = pbs.pbs_connect(pbs_server)

    attrop_l = pbs.new_attropl(1)
    attrop_l[0].name = 'properties'
    attrop_l[0].value = 'set_something_useful'
    attrop_l[0].op = pbs.INCR

    r = pbs.pbs_manager(con, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE,
                        "e2", attrop_l, 'NULL')
    if r > 0:
        print r, ";"
        errno, text = pbs.error()
        print errno, text
def do_offline(self, nodes, note):
    '''Change the state of node(s) to offline with a specific note'''
    if ARGS_VERBOSE:
        _print('class:SaraNodes func:do_offline input:%s,%s' % (str(nodes), note), file=sys.stderr)

    attributes = pbs.new_attropl(2)
    attributes[0].name = pbs.ATTR_NODE_state
    attributes[0].value = 'offline'
    attributes[0].op = pbs.SET
    attributes[1].name = pbs.ATTR_NODE_note
    attributes[1].op = pbs.SET

    batch_list = list()

    ## again a loop, now create the attrib dict list
    for node, note in self._generate_note(nodes, note).items():
        attributes[1].value = note
        batch_list.append(tuple([node, attributes]))

    self._process(batch_list)
def create_job(self, username, Job_Name, queue, nodes, walltime, file):
    c = pbs.pbs_connect(str(pbs.pbs_default()))

    attrl = pbs.new_attropl(3)
    attrl[0].name = pbs.ATTR_N
    attrl[0].value = str(Job_Name)
    attrl[1].name = pbs.ATTR_l
    attrl[1].resource = 'nodes'
    attrl[1].value = str(nodes)
    attrl[2].name = pbs.ATTR_l
    attrl[2].resource = 'walltime'
    attrl[2].value = str(walltime)

    queue = str(queue)
    task_id = pbs.pbs_submit(c, attrl, str("media/" + username + "/" + file), queue, 'NULL')
    return pbs.pbs_geterrmsg(c)
def node_list_by_job(ID, status, id_job):
    machines = []

    attribs = pbs.new_attropl(1)
    if id_job == "":
        attribs[0].name = pbs.ATTR_state
        attribs[0].value = status
        attribs[0].op = pbs.EQ
    else:
        attribs[0].name = pbs.ATTR_N
        attribs[0].value = id_job
        attribs[0].op = pbs.EQ

    jobs = pbs.pbs_selectjob(ID, attribs, 'NULL')

    if status == "W":
        print "Number of held jobs:", len(jobs), "\n"
    if status == "R":
        print "Number of jobs running on the system:", len(jobs), "\n"
    if status == "Q":
        print "Number of queued jobs:", len(jobs), "\n"
    print "Please wait ..."

    log = open("logs/result.txt", "w")
    for i in jobs:
        py_mach = stat_job.statjob(ID, i)
        py_tot = i, "->", py_mach
        py_tot2 = i, "->", py_mach, "\r\n"
        log.write(str(py_tot2))
        machines.append(py_tot)
    log.close()

    return machines
def get_pbs_attr(self, job_id, config):
    """Tool configuration should have either pbs_comments or pbs;
    pbs_comments are just the comments used in a PBS script.
    """
    if 'pbs_comments' in config:
        pbs_config = self.parse_pbs_comment(config['pbs_comments'])
    else:
        assert('pbs' in config)
        pbs_config = config['pbs']

    num = len(pbs_config)
    attropl = pbs.new_attropl(num + 2)
    for i, item in enumerate(pbs_config):
        attropl[i].name = item[0].encode('ascii', 'ignore')
        if attropl[i].name == pbs.ATTR_l:
            attropl[i].resource = item[1].encode('ascii', 'ignore')
            attropl[i].value = item[2].encode('ascii', 'ignore')
        else:
            attropl[i].value = item[1].encode('ascii', 'ignore')

    attropl[num].name = pbs.ATTR_o
    attropl[num].value = main.DATA_SOURCE_DIR + "%d.out" % job_id
    attropl[num + 1].name = pbs.ATTR_e
    attropl[num + 1].value = main.DATA_SOURCE_DIR + "%d.err" % job_id
    return attropl
def _submit(self):
    """Submit the jobscript txt, set self.jobid"""
    txt = self.script
    self.log.debug("Going to submit script %s" % txt)

    # Build default pbs_attributes list
    pbs_attributes = pbs.new_attropl(3)
    pbs_attributes[0].name = pbs.ATTR_N  # Job_Name
    pbs_attributes[0].value = self.name

    output_dir = build_option('job_output_dir')
    pbs_attributes[1].name = pbs.ATTR_o
    pbs_attributes[1].value = os.path.join(output_dir, '%s.o$PBS_JOBID' % self.name)

    pbs_attributes[2].name = pbs.ATTR_e
    pbs_attributes[2].value = os.path.join(output_dir, '%s.e$PBS_JOBID' % self.name)

    # set resource requirements
    resource_attributes = pbs.new_attropl(len(self.resources))
    idx = 0
    for k, v in self.resources.items():
        resource_attributes[idx].name = pbs.ATTR_l  # Resource_List
        resource_attributes[idx].resource = k
        resource_attributes[idx].value = v
        idx += 1
    pbs_attributes.extend(resource_attributes)

    # add job dependencies to attributes
    if self.deps:
        deps_attributes = pbs.new_attropl(1)
        deps_attributes[0].name = pbs.ATTR_depend
        deps_attributes[0].value = ",".join(["afterany:%s" % dep.jobid for dep in self.deps])
        pbs_attributes.extend(deps_attributes)
        self.log.debug("Job deps attributes: %s" % deps_attributes[0].value)

    # submit job with (user) hold
    hold_attributes = pbs.new_attropl(1)
    hold_attributes[0].name = pbs.ATTR_h
    hold_attributes[0].value = pbs.USER_HOLD
    pbs_attributes.extend(hold_attributes)
    self.holds.append(pbs.USER_HOLD)
    self.log.debug("Job hold attributes: %s" % hold_attributes[0].value)

    # add a bunch of variables (added by qsub)
    # also set PBS_O_WORKDIR to os.getcwd()
    os.environ.setdefault('WORKDIR', os.getcwd())
    defvars = ['MAIL', 'HOME', 'PATH', 'SHELL', 'WORKDIR']
    pbsvars = ["PBS_O_%s=%s" % (x, os.environ.get(x, 'NOTFOUND_%s' % x)) for x in defvars]

    # extend PBS variables with specified variables
    pbsvars.extend(["%s=%s" % (name, value) for (name, value) in self.env_vars.items()])

    variable_attributes = pbs.new_attropl(1)
    variable_attributes[0].name = pbs.ATTR_v  # Variable_List
    variable_attributes[0].value = ",".join(pbsvars)
    pbs_attributes.extend(variable_attributes)
    self.log.debug("Job variable attributes: %s" % variable_attributes[0].value)

    # mail settings
    mail_attributes = pbs.new_attropl(1)
    mail_attributes[0].name = pbs.ATTR_m  # Mail_Points
    mail_attributes[0].value = 'n'  # disable all mail
    pbs_attributes.extend(mail_attributes)
    self.log.debug("Job mail attributes: %s" % mail_attributes[0].value)

    fh, scriptfn = tempfile.mkstemp()
    f = os.fdopen(fh, 'w')
    self.log.debug("Writing temporary job script to %s" % scriptfn)
    f.write(txt)
    f.close()

    self.log.debug("Going to submit to queue %s" % self.queue)

    # job submission sometimes fails without producing an error,
    # e.g. when one of the dependency jobs has already finished;
    # when that occurs, None will be returned by pbs_submit as job id
    jobid = pbs.pbs_submit(self.pbsconn, pbs_attributes, scriptfn, self.queue, NULL)
    is_error, errormsg = pbs.error()
    if is_error or jobid is None:
        raise EasyBuildError("Failed to submit job script %s (job id: %s, error %s)",
                             scriptfn, jobid, errormsg)
    else:
        self.log.debug("Succesful job submission returned jobid %s" % jobid)
        self.jobid = jobid

    os.remove(scriptfn)
        bit = random.randint(97, 122)
    else:
        bit = random.randint(65, 90)
    job_name += chr(bit)

# set up the job file
script = "#!/bin/sh\nPATH='%s'\ncd %s\n%s\n" % (os.environ['PATH'], os.getcwd(), command_line)
job_file = "%s/database/pbs/%s.sh" % (os.getcwd(), job_name)
fh = file(job_file, "w")
fh.write(script)
fh.close()

# define job attributes
ofile = "%s/database/pbs/%s.o" % (os.getcwd(), job_name)
efile = "%s/database/pbs/%s.e" % (os.getcwd(), job_name)

job_attrs = pbs.new_attropl(2)
job_attrs[0].name = pbs.ATTR_o
job_attrs[0].value = ofile
job_attrs[1].name = pbs.ATTR_e
job_attrs[1].value = efile

# get a handle
conn = pbs.pbs_connect(pbs_server)

# queue it
if os.access(job_file, os.R_OK):
    log.debug("submitting file %s with output %s and error %s" % (job_file, ofile, efile))
    log.debug("command is: %s" % command_line)
    job_id = pbs.pbs_submit(conn, job_attrs, job_file, None, None)

# monitor
def queue_job(self, job_wrapper):
    """Create PBS script for a job and submit it to the PBS queue"""
    # prepare the job
    if not self.prepare_job(
            job_wrapper,
            include_metadata=not (self.app.config.pbs_stage_path)):
        return

    job_destination = job_wrapper.job_destination

    # Determine the job's PBS destination (server/queue) and options from the job destination definition
    pbs_queue_name = None
    pbs_server_name = self.default_pbs_server
    pbs_options = []
    if '-q' in job_destination.params and 'destination' not in job_destination.params:
        job_destination.params['destination'] = job_destination.params.pop('-q')
    if 'destination' in job_destination.params:
        if '@' in job_destination.params['destination']:
            # Destination includes a server
            pbs_queue_name, pbs_server_name = job_destination.params['destination'].split('@')
            if pbs_queue_name == '':
                # e.g. `qsub -q @server`
                pbs_queue_name = None
        else:
            # Destination is just a queue
            pbs_queue_name = job_destination.params['destination']
        job_destination.params.pop('destination')

    # Parse PBS params
    pbs_options = self.parse_destination_params(job_destination.params)

    # Explicitly set the determined PBS destination in the persisted job destination for recovery
    job_destination.params['destination'] = '%s@%s' % (pbs_queue_name or '', pbs_server_name)

    c = pbs.pbs_connect(util.smart_str(pbs_server_name))
    if c <= 0:
        errno, text = pbs.error()
        job_wrapper.fail(
            "Unable to queue job for execution. Resubmitting the job may succeed.")
        log.error("Connection to PBS server for submit failed: %s: %s" % (errno, text))
        return

    # define job attributes
    ofile = "%s/%s.o" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    efile = "%s/%s.e" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    ecfile = "%s/%s.ec" % (self.app.config.cluster_files_directory, job_wrapper.job_id)

    output_fnames = job_wrapper.get_output_fnames()

    # If an application server is set, we're staging
    if self.app.config.pbs_application_server:
        pbs_ofile = self.app.config.pbs_application_server + ':' + ofile
        pbs_efile = self.app.config.pbs_application_server + ':' + efile
        output_files = [str(o) for o in output_fnames]
        output_files.append(ecfile)
        stagein = self.get_stage_in_out(job_wrapper.get_input_fnames() + output_files,
                                        symlink=True)
        stageout = self.get_stage_in_out(output_files)
        attrs = [
            dict(name=pbs.ATTR_o, value=pbs_ofile),
            dict(name=pbs.ATTR_e, value=pbs_efile),
            dict(name=pbs.ATTR_stagein, value=stagein),
            dict(name=pbs.ATTR_stageout, value=stageout),
        ]
    # If not, we're using NFS
    else:
        attrs = [
            dict(name=pbs.ATTR_o, value=ofile),
            dict(name=pbs.ATTR_e, value=efile),
        ]

    # define PBS job options
    attrs.append(
        dict(name=pbs.ATTR_N,
             value=str("%s_%s_%s" % (job_wrapper.job_id, job_wrapper.tool.id, job_wrapper.user))))
    job_attrs = pbs.new_attropl(len(attrs) + len(pbs_options))
    for i, attr in enumerate(attrs + pbs_options):
        job_attrs[i].name = attr['name']
        job_attrs[i].value = attr['value']
        if 'resource' in attr:
            job_attrs[i].resource = attr['resource']

    exec_dir = os.path.abspath(job_wrapper.working_directory)

    # write the job script
    if self.app.config.pbs_stage_path != '':
        # touch the ecfile so that it gets staged
        with file(ecfile, 'a'):
            os.utime(ecfile, None)
        stage_commands = pbs_symlink_template % (
            " ".join(job_wrapper.get_input_fnames() + output_files),
            self.app.config.pbs_stage_path,
            exec_dir,
        )
    else:
        stage_commands = ''

    env_setup_commands = [stage_commands]
    script = self.get_job_file(job_wrapper,
                               exit_code_path=ecfile,
                               env_setup_commands=env_setup_commands)
    job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    self.write_executable_script(job_file, script)

    # job was deleted while we were preparing it
    if job_wrapper.get_state() == model.Job.states.DELETED:
        log.debug("Job %s deleted by user before it entered the PBS queue" % job_wrapper.job_id)
        pbs.pbs_disconnect(c)
        if self.app.config.cleanup_job in ("always", "onsuccess"):
            self.cleanup((ofile, efile, ecfile, job_file))
        job_wrapper.cleanup()
        return

    # submit
    # The job tag includes the job and the task identifier
    # (if a TaskWrapper was passed in):
    galaxy_job_id = job_wrapper.get_id_tag()
    log.debug("(%s) submitting file %s" % (galaxy_job_id, job_file))

    tries = 0
    while tries < 5:
        job_id = pbs.pbs_submit(c, job_attrs, job_file, pbs_queue_name, None)
        tries += 1
        if job_id:
            pbs.pbs_disconnect(c)
            break
        errno, text = pbs.error()
        log.warning("(%s) pbs_submit failed (try %d/5), PBS error %d: %s"
                    % (galaxy_job_id, tries, errno, text))
        time.sleep(2)
    else:
        log.error("(%s) All attempts to submit job failed" % galaxy_job_id)
        job_wrapper.fail(
            "Unable to run this job due to a cluster error, please retry it later")
        return

    if pbs_queue_name is None:
        log.debug("(%s) queued in default queue as %s" % (galaxy_job_id, job_id))
    else:
        log.debug("(%s) queued in %s queue as %s" % (galaxy_job_id, pbs_queue_name, job_id))

    # persist destination
    job_wrapper.set_job_destination(job_destination, job_id)

    # Store PBS related state information for job
    job_state = AsynchronousJobState()
    job_state.job_wrapper = job_wrapper
    job_state.job_id = job_id
    job_state.job_file = job_file
    job_state.output_file = ofile
    job_state.error_file = efile
    job_state.exit_code_file = ecfile
    job_state.old_state = 'N'
    job_state.running = False
    job_state.job_destination = job_destination

    # Add to our 'queue' of jobs to monitor
    self.monitor_queue.put(job_state)
    w[1].name = 'jaap'
    w[1].value = 'man'
    for i in w:
        print i.name

test_init()
test_getitem()
test_loop()

print "Testing"
pbs_server = pbs.pbs_default()
con = pbs.pbs_connect(pbs_server)

z = pbs.new_attropl(2)
z[0].name = pbs.ATTR_u
z[0].value = 'a403vink,zuidema'
z[0].op = pbs.EQ
z[1].name = pbs.ATTR_N
z[1].value = 'runRollCV'
z[1].op = pbs.EQ

q = pbs.new_attropl(1)
q[0].name = pbs.ATTR_u
q[0].value = 'zuidema'
q[0].op = pbs.EQ

combine = z + q

jobs = pbs.pbs_selectjob(con, z, "NULL")
import pbs

pbs_server = pbs.pbs_default()
pbsconn = pbs.pbs_connect(pbs_server)
print pbsconn

attrl = pbs.new_attropl(1)
attrl[0].name = pbs.ATTR_N
attrl[0].value = "test"

task_id = pbs.pbs_submit(pbsconn, attrl, "A1.tsk", 'NULL', 'NULL')

e, e_txt = pbs.error()
if e:
    print e, e_txt

print task_id
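# Illustrative follow-up to the standalone example above (not part of the
# original snippet): the same connection can be reused to poll the submitted
# job and is then closed. Only calls that already appear in this collection
# are used (pbs.pbs_statjob, pbs.error, pbs.pbs_disconnect); the returned
# status list is printed as-is rather than inspected field by field.
job_info = pbs.pbs_statjob(pbsconn, task_id, 'NULL', 'NULL')
e, e_txt = pbs.error()
if e:
    print e, e_txt
print job_info
pbs.pbs_disconnect(pbsconn)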
def pbs_batch( self, nodes, attrs=None, note_attributes=None ):
    nodeserror = list()

    if not attrs and not note_attributes:
        raise sara_nodesException, 'attrs and note_attributes can not be empty together!'

    if not self.dryrun:
        if note_attributes and len( note_attributes ) == 3:
            if attrs:
                attributes = attrs + pbs.new_attropl(1)
                attributes[1].name = pbs.ATTR_NODE_note
                attributes[1].op = pbs.SET
            else:
                attributes = pbs.new_attropl(1)
                attributes[0].name = pbs.ATTR_NODE_note
                attributes[0].op = pbs.SET
        else:
            attributes = attrs

        # Some hacking here because of a limitation in the Torque 2.4 version:
        # fetch the note data first for all nodes!
        tmp_node_note = dict()
        for node in nodes:
            if note_attributes and len( note_attributes ) == 3:
                tmp_node_note[ node ] = self.note( node, note_attributes )

        pbs_server = pbs.pbs_default()
        if not pbs_server:
            raise sara_nodesException, 'Default pbs server not found!'

        pbs_connection = pbs.pbs_connect( pbs_server )
        for node in nodes:
            if note_attributes and len( note_attributes ) == 3:
                try:
                    if attrs:
                        attributes[1].value = tmp_node_note[ node ]
                    else:
                        attributes[0].value = tmp_node_note[ node ]
                except KeyError:
                    pass

            rcode = pbs.pbs_manager( pbs_connection, pbs.MGR_CMD_SET, pbs.MGR_OBJ_NODE,
                                     node, attributes, 'NULL' )
            if rcode > 0:
                errno, text = pbs.error()
                nodeserror.append( '%s: %s (%s)' % ( node, text, errno ) )
    else:
        p = PBSQuery.PBSQuery()
        pbsnodes = p.getnodes().keys()

        print '%*s:' % ( 7, 'Nodes' ),
        firstitem = True
        for node in nodes:
            if node in pbsnodes:
                if firstitem:
                    print '%s' % node
                    firstitem = False
                else:
                    print '%*s' % ( 17, node )
            else:
                nodeserror.append( '%s: does not exist' % node )

    if len( nodeserror ) > 0:
        raise sara_nodesException, nodeserror
def submit(self):
    """Submit the jobscript txt, set self.jobid"""
    txt = self.script
    self.log.debug("Going to submit script %s" % txt)

    # Build default pbs_attributes list
    pbs_attributes = pbs.new_attropl(1)
    pbs_attributes[0].name = 'Job_Name'
    pbs_attributes[0].value = self.name

    # set resource requirements
    resourse_attributes = pbs.new_attropl(len(self.resources))
    idx = 0
    for k, v in self.resources.items():
        resourse_attributes[idx].name = 'Resource_List'
        resourse_attributes[idx].resource = k
        resourse_attributes[idx].value = v
        idx += 1
    pbs_attributes.extend(resourse_attributes)

    # add job dependencies to attributes
    if self.deps:
        deps_attributes = pbs.new_attropl(1)
        deps_attributes[0].name = pbs.ATTR_depend
        deps_attributes[0].value = ",".join(["afterany:%s" % dep for dep in self.deps])
        pbs_attributes.extend(deps_attributes)
        self.log.debug("Job deps attributes: %s" % deps_attributes[0].value)

    # add a bunch of variables (added by qsub)
    # also set PBS_O_WORKDIR to os.getcwd()
    os.environ.setdefault('WORKDIR', os.getcwd())
    defvars = ['MAIL', 'HOME', 'PATH', 'SHELL', 'WORKDIR']
    pbsvars = ["PBS_O_%s=%s" % (x, os.environ.get(x, 'NOTFOUND_%s' % x)) for x in defvars]

    # extend PBS variables with specified variables
    pbsvars.extend(["%s=%s" % (name, value) for (name, value) in self.env_vars.items()])

    variable_attributes = pbs.new_attropl(1)
    variable_attributes[0].name = 'Variable_List'
    variable_attributes[0].value = ",".join(pbsvars)
    pbs_attributes.extend(variable_attributes)
    self.log.debug("Job variable attributes: %s" % variable_attributes[0].value)

    # mail settings
    mail_attributes = pbs.new_attropl(1)
    mail_attributes[0].name = 'Mail_Points'
    mail_attributes[0].value = 'n'  # disable all mail
    pbs_attributes.extend(mail_attributes)
    self.log.debug("Job mail attributes: %s" % mail_attributes[0].value)

    import tempfile
    fh, scriptfn = tempfile.mkstemp()
    f = os.fdopen(fh, 'w')
    self.log.debug("Writing temporary job script to %s" % scriptfn)
    f.write(txt)
    f.close()

    self.log.debug("Going to submit to queue %s" % self.queue)

    # the extend parameter should be 'NULL' because this is required by the python api
    extend = 'NULL'
    jobid = pbs.pbs_submit(self.pbsconn, pbs_attributes, scriptfn, self.queue, extend)

    is_error, errormsg = pbs.error()
    if is_error:
        self.log.error("Failed to submit job script %s: error %s" % (scriptfn, errormsg))
    else:
        self.log.debug("Succesful job submission returned jobid %s" % jobid)
        self.jobid = jobid

    os.remove(scriptfn)
def submit(self):
    attropl = pbs.new_attropl(self.attribute_count + 1)
    attropl_idx = 0

    attropl[attropl_idx].name = pbs.ATTR_v
    attropl[attropl_idx].value = self.generate_env()
    attropl_idx += 1

    if self.name:
        attropl[attropl_idx].name = pbs.ATTR_N
        attropl[attropl_idx].value = self.name
        attropl_idx += 1

    if self.walltime:
        attropl[attropl_idx].name = pbs.ATTR_l
        attropl[attropl_idx].resource = 'walltime'
        attropl[attropl_idx].value = self.walltime
        attropl_idx += 1

    if self.nodes:
        attropl[attropl_idx].name = pbs.ATTR_l
        attropl[attropl_idx].resource = 'nodes'
        attropl[attropl_idx].value = self.nodes
        attropl_idx += 1

    if self.stdout_path:
        attropl[attropl_idx].name = pbs.ATTR_o
        attropl[attropl_idx].value = self.stdout_path
        attropl_idx += 1

    if self.stderr_path:
        attropl[attropl_idx].name = pbs.ATTR_e
        attropl[attropl_idx].value = self.stderr_path
        attropl_idx += 1

    if self.dependency_list:
        attropl[attropl_idx].name = pbs.ATTR_depend
        attropl[attropl_idx].value = self.dependency_list
        attropl_idx += 1

    if self.mail_options:
        attropl[attropl_idx].name = pbs.ATTR_m
        attropl[attropl_idx].value = self.mail_options
        attropl_idx += 1

    if self.mem:
        attropl[attropl_idx].name = pbs.ATTR_l
        attropl[attropl_idx].resource = 'mem'
        attropl[attropl_idx].value = self.mem
        attropl_idx += 1

    if self.vmem:
        attropl[attropl_idx].name = pbs.ATTR_l
        attropl[attropl_idx].resource = 'vmem'
        attropl[attropl_idx].value = self.vmem
        attropl_idx += 1

    connection = pbs.pbs_connect(pbs.pbs_default())

    self.job_id = pbs.pbs_submit(connection, attropl, self.job_script, None, None)
    pbs.pbs_disconnect(connection)

    e, e_msg = pbs.error()

    # the batch system returned an error, throw exception
    if e:
        message = "%d: %s" % (e, e_msg)
        raise Exception(message)

    return self.job_id
def pp_predict_motifs(fastafile, outfile, analysis="small", organism="hg18", single=False,
                      background="", tools=None, job_server="", ncpus=8, logger=None,
                      max_time=None, fg_file=None, bg_file=None):
    if tools is None:
        tools = {}

    config = MotifConfig()

    if not tools:
        tools = dict([(x, 1) for x in config.get_default_params["tools"].split(",")])

    #logger = logging.getLogger('prediction.pp_predict_motifs')

    wmin = 5
    step = 1
    if analysis in ["large", "xl"]:
        step = 2
        wmin = 6

    analysis_max = {"xs": 5, "small": 8, "medium": 10, "large": 14, "xl": 20}
    wmax = analysis_max[analysis]

    if analysis == "xs":
        sys.stderr.write("Setting analysis xs to small")
        analysis = "small"

    jobs = {}

    result = PredictionResult(outfile, logger=logger, fg_file=fg_file, bg_file=bg_file)

    # Dynamically load all tools
    toolio = [x[1]() for x in inspect.getmembers(
        tool_classes,
        lambda x: inspect.isclass(x) and issubclass(x, tool_classes.MotifProgram)
    ) if x[0] != 'MotifProgram']

    # TODO:
    # Add warnings for running time: Weeder GADEM

    # Prepare PBS submission
    server = pbs.pbs_default()
    c = pbs.pbs_connect(server)
    q = PBSQuery()

    attropl = pbs.new_attropl(6)
    # Name
    attropl[0].name = pbs.ATTR_N
    # Restartable
    attropl[1].name = pbs.ATTR_r
    attropl[1].value = 'y'
    # Walltime
    attropl[2].name = pbs.ATTR_l
    attropl[2].resource = 'walltime'
    attropl[2].value = '600'
    # Node requirements
    attropl[3].name = pbs.ATTR_l
    attropl[3].resource = 'nodes'
    attropl[3].value = '1:ppn=1'
    # stdout / stderr paths (values filled in per job below)
    attropl[4].name = pbs.ATTR_o
    attropl[5].name = pbs.ATTR_e

    rundir = os.path.join(os.path.split(os.path.abspath(fastafile))[0], "torque")
    if not os.path.exists(rundir):
        os.mkdir(rundir)

    params = {
        'analysis': analysis,
        'background': background,
        "single": single,
        "organism": organism
    }

    jobs = {}
    for t in toolio:
        if tools.has_key(t.name) and tools[t.name]:
            if t.use_width:
                for i in range(wmin, wmax + 1, step):
                    logger.info("Starting %s job, width %s" % (t.name, i))
                    params['width'] = i
                    sh = write_shell_script(t.name, fastafile, rundir=rundir, params=params)
                    job_name = os.path.basename(os.path.splitext(sh)[0])
                    # submit
                    attropl[0].value = job_name
                    attropl[4].value = "{0}/{1}.stdout".format(rundir, job_name)
                    attropl[5].value = "{0}/{1}.stderr".format(rundir, job_name)
                    job_id = pbs.pbs_submit(c, attropl, sh, "batchq", 'NULL')
                    e, e_txt = pbs.error()
                    if e:
                        logger.error("Failed: {0}".format(e_txt))
                    else:
                        jobs[job_id] = job_name
            else:
                logger.debug("Starting %s job" % t.name)
                sh = write_shell_script(t.name, fastafile, rundir=rundir, params=params)
                job_name = os.path.basename(os.path.splitext(sh)[0])
                # submit
                attropl[0].value = job_name
                attropl[4].value = "{0}/{1}.stdout".format(rundir, job_name)
                attropl[5].value = "{0}/{1}.stderr".format(rundir, job_name)
                job_id = pbs.pbs_submit(c, attropl, sh, "batchq", 'NULL')
                e, e_txt = pbs.error()
                if e:
                    logger.error("Failed submission: {0}".format(e_txt))
                else:
                    jobs[job_id] = job_name
        else:
            logger.debug("Skipping %s" % t.name)

    ### Wait until all jobs are finished or the time runs out ###
    start_time = time()
    try:
        # Run until all jobs are finished
        while len(jobs) > 0 and (not max_time or time() - start_time < max_time):
            for job_id, job_name in jobs.items():
                job = q.getjob(job_id)
                if not job:  # or not job.is_running():
                    motifs = []
                    if job:
                        name = job['Job_Name']
                        # Some error checking here!
                    else:
                        pwmfile = os.path.join(rundir, "{0}.pwm".format(job_name))
                        if os.path.exists(pwmfile):
                            motifs = read_motifs(open(pwmfile), fmt="pwm")
                        else:
                            logger.error("Job {0} finished, but couldn't find {1}!".format(job_name, pwmfile))

                    stdout = open(os.path.join(rundir, "{0}.stdout".format(job_name))).read()
                    stderr = open(os.path.join(rundir, "{0}.stderr".format(job_name))).read()

                    result.add_motifs(job_id, (motifs, stdout, stderr))

                    #for fname in glob.glob("{0}*".format(job_name)):
                    #    logger.debug("Deleting {0}".format(fname))
                    #    #os.unlink(fname)

                    del jobs[job_id]
            sleep(5)

    ### Or the user gets impatient... ###
    except KeyboardInterrupt, e:
        # Destroy all running jobs
        logger.info("Caught interrupt, destroying all running jobs")
def submit_pbs(self, name, taskfile, lastid=None):
    targets = self.targets
    target = targets[name]
    subenv = target["env"].asdict()
    target["attrs"].setdefault(pbs.ATTR_N, name)

    # Just include all variables by default
    varlist = ",".join("%s=%s" % (k, v) for k, v in subenv.iteritems())
    target["attrs"].setdefault(pbs.ATTR_v, varlist)

    # Track job dependencies
    dependencies = []
    dep_type = name.partition("::")[-1] or "afterok"
    for dep in target["components"]:
        dependencies.append("%s:%s" % (dep_type, targets[dep]["torqueid"]))
    if lastid:
        dependencies.append("%s:%s" % (dep_type, lastid))
    if dependencies:
        target["attrs"][pbs.ATTR_depend] = ",".join(dependencies)

    # /bin/sh as a default shell will generally do the right thing.
    # It honors #! syntax at the beginning of the file and it
    # interprets basic commands without a #! at the beginning of
    # the file. Obscure users can opt for other shells
    # (eg: bash,csh,ksh,python,...) via the standard #! syntax
    # -- This default ensures users with non-standard shells
    # can still use pbsmake files from other users.
    target["attrs"].setdefault(pbs.ATTR_S, "/bin/sh")

    # We need to handle ATTR_l specially. Each resource needs its own
    # attropl with the name pbs.ATTR_l:
    attr_l = []
    if pbs.ATTR_l in target["attrs"]:
        attr_l = target["attrs"][pbs.ATTR_l].split(",")
        del target["attrs"][pbs.ATTR_l]

    # Attach attributes to job as the pbs module expects it
    attropl = pbs.new_attropl(len(target["attrs"]) + len(attr_l))
    i = 0
    for n in target["attrs"]:
        attropl[i].name = n
        attropl[i].value = target["env"].interp(target["attrs"][n], defer=False)
        i += 1
    for n in attr_l:
        attropl[i].name = pbs.ATTR_l
        res, val = n.split("=", 1)
        attropl[i].resource = res
        attropl[i].value = target["env"].interp(val, defer=False)
        i += 1

    try:
        destination = target["attrs"]["queue"]
    except KeyError:
        destination = ""

    # attempt to submit job
    lastid = pbs.pbs_submit(self.conn, attropl, taskfile.name, destination, "")
    if lastid:
        target["torqueid"] = lastid
    else:
        print "Error submitting job: %s\n\tAttributes:" % name
        for attr, val in target["attrs"].items():
            print "\t\t%s: %s" % (attr, val)
        raise Exception(pbs.error())

    return lastid
def submit(self, with_hold=False):
    """Submit the jobscript txt, set self.jobid"""
    txt = self.script
    self.log.debug("Going to submit script %s" % txt)

    # Build default pbs_attributes list
    pbs_attributes = pbs.new_attropl(1)
    pbs_attributes[0].name = pbs.ATTR_N  # Job_Name
    pbs_attributes[0].value = self.name

    # set resource requirements
    resourse_attributes = pbs.new_attropl(len(self.resources))
    idx = 0
    for k, v in self.resources.items():
        resourse_attributes[idx].name = pbs.ATTR_l  # Resource_List
        resourse_attributes[idx].resource = k
        resourse_attributes[idx].value = v
        idx += 1
    pbs_attributes.extend(resourse_attributes)

    # add job dependencies to attributes
    if self.deps:
        deps_attributes = pbs.new_attropl(1)
        deps_attributes[0].name = pbs.ATTR_depend
        deps_attributes[0].value = ",".join(["afterany:%s" % dep for dep in self.deps])
        pbs_attributes.extend(deps_attributes)
        self.log.debug("Job deps attributes: %s" % deps_attributes[0].value)

    # submit job with (user) hold if requested
    if with_hold:
        hold_attributes = pbs.new_attropl(1)
        hold_attributes[0].name = pbs.ATTR_h
        hold_attributes[0].value = pbs.USER_HOLD
        pbs_attributes.extend(hold_attributes)
        self.holds.append(pbs.USER_HOLD)
        self.log.debug("Job hold attributes: %s" % hold_attributes[0].value)

    # add a bunch of variables (added by qsub)
    # also set PBS_O_WORKDIR to os.getcwd()
    os.environ.setdefault('WORKDIR', os.getcwd())
    defvars = ['MAIL', 'HOME', 'PATH', 'SHELL', 'WORKDIR']
    pbsvars = ["PBS_O_%s=%s" % (x, os.environ.get(x, 'NOTFOUND_%s' % x)) for x in defvars]

    # extend PBS variables with specified variables
    pbsvars.extend(["%s=%s" % (name, value) for (name, value) in self.env_vars.items()])

    variable_attributes = pbs.new_attropl(1)
    variable_attributes[0].name = pbs.ATTR_v  # Variable_List
    variable_attributes[0].value = ",".join(pbsvars)
    pbs_attributes.extend(variable_attributes)
    self.log.debug("Job variable attributes: %s" % variable_attributes[0].value)

    # mail settings
    mail_attributes = pbs.new_attropl(1)
    mail_attributes[0].name = pbs.ATTR_m  # Mail_Points
    mail_attributes[0].value = 'n'  # disable all mail
    pbs_attributes.extend(mail_attributes)
    self.log.debug("Job mail attributes: %s" % mail_attributes[0].value)

    fh, scriptfn = tempfile.mkstemp()
    f = os.fdopen(fh, 'w')
    self.log.debug("Writing temporary job script to %s" % scriptfn)
    f.write(txt)
    f.close()

    self.log.debug("Going to submit to queue %s" % self.queue)

    # job submission sometimes fails without producing an error,
    # e.g. when one of the dependency jobs has already finished;
    # when that occurs, None will be returned by pbs_submit as job id
    jobid = pbs.pbs_submit(self.pbsconn, pbs_attributes, scriptfn, self.queue, NULL)
    is_error, errormsg = pbs.error()
    if is_error or jobid is None:
        self.log.error("Failed to submit job script %s (job id: %s, error %s)"
                       % (scriptfn, jobid, errormsg))
    else:
        self.log.debug("Succesful job submission returned jobid %s" % jobid)
        self.jobid = jobid

    os.remove(scriptfn)
def queue_job( self, job_wrapper ):
    """Create PBS script for a job and submit it to the PBS queue"""
    # prepare the job
    if not self.prepare_job( job_wrapper, include_metadata=not( self.app.config.pbs_stage_path ) ):
        return

    job_destination = job_wrapper.job_destination

    # Determine the job's PBS destination (server/queue) and options from the job destination definition
    pbs_queue_name = None
    pbs_server_name = self.default_pbs_server
    pbs_options = []
    if '-q' in job_destination.params and 'destination' not in job_destination.params:
        job_destination.params['destination'] = job_destination.params.pop('-q')
    if 'destination' in job_destination.params:
        if '@' in job_destination.params['destination']:
            # Destination includes a server
            pbs_queue_name, pbs_server_name = job_destination.params['destination'].split('@')
            if pbs_queue_name == '':
                # e.g. `qsub -q @server`
                pbs_queue_name = None
        else:
            # Destination is just a queue
            pbs_queue_name = job_destination.params['destination']
        job_destination.params.pop('destination')

    # Parse PBS params
    pbs_options = self.parse_destination_params(job_destination.params)

    # Explicitly set the determined PBS destination in the persisted job destination for recovery
    job_destination.params['destination'] = '%s@%s' % (pbs_queue_name or '', pbs_server_name)

    c = pbs.pbs_connect( util.smart_str( pbs_server_name ) )
    if c <= 0:
        errno, text = pbs.error()
        job_wrapper.fail( "Unable to queue job for execution. Resubmitting the job may succeed." )
        log.error( "Connection to PBS server for submit failed: %s: %s" % ( errno, text ) )
        return

    # define job attributes
    ofile = "%s/%s.o" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    efile = "%s/%s.e" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    ecfile = "%s/%s.ec" % (self.app.config.cluster_files_directory, job_wrapper.job_id)

    output_fnames = job_wrapper.get_output_fnames()

    # If an application server is set, we're staging
    if self.app.config.pbs_application_server:
        pbs_ofile = self.app.config.pbs_application_server + ':' + ofile
        pbs_efile = self.app.config.pbs_application_server + ':' + efile
        output_files = [ str( o ) for o in output_fnames ]
        output_files.append(ecfile)
        stagein = self.get_stage_in_out( job_wrapper.get_input_fnames() + output_files, symlink=True )
        stageout = self.get_stage_in_out( output_files )
        attrs = [
            dict( name=pbs.ATTR_o, value=pbs_ofile ),
            dict( name=pbs.ATTR_e, value=pbs_efile ),
            dict( name=pbs.ATTR_stagein, value=stagein ),
            dict( name=pbs.ATTR_stageout, value=stageout ),
        ]
    # If not, we're using NFS
    else:
        attrs = [
            dict( name=pbs.ATTR_o, value=ofile ),
            dict( name=pbs.ATTR_e, value=efile ),
        ]

    # define PBS job options
    attrs.append( dict( name=pbs.ATTR_N, value=str( "%s_%s_%s" % ( job_wrapper.job_id, job_wrapper.tool.id, job_wrapper.user ) ) ) )
    job_attrs = pbs.new_attropl( len( attrs ) + len( pbs_options ) )
    for i, attr in enumerate( attrs + pbs_options ):
        job_attrs[i].name = attr['name']
        job_attrs[i].value = attr['value']
        if 'resource' in attr:
            job_attrs[i].resource = attr['resource']

    exec_dir = os.path.abspath( job_wrapper.working_directory )

    # write the job script
    if self.app.config.pbs_stage_path != '':
        # touch the ecfile so that it gets staged
        with open(ecfile, 'a'):
            os.utime(ecfile, None)
        stage_commands = pbs_symlink_template % (
            " ".join( job_wrapper.get_input_fnames() + output_files ),
            self.app.config.pbs_stage_path,
            exec_dir,
        )
    else:
        stage_commands = ''

    env_setup_commands = [ stage_commands ]
    script = self.get_job_file(job_wrapper, exit_code_path=ecfile, env_setup_commands=env_setup_commands)
    job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    self.write_executable_script( job_file, script )

    # job was deleted while we were preparing it
    if job_wrapper.get_state() == model.Job.states.DELETED:
        log.debug( "Job %s deleted by user before it entered the PBS queue" % job_wrapper.job_id )
        pbs.pbs_disconnect(c)
        if job_wrapper.cleanup_job in ( "always", "onsuccess" ):
            self.cleanup( ( ofile, efile, ecfile, job_file ) )
        job_wrapper.cleanup()
        return

    # submit
    # The job tag includes the job and the task identifier
    # (if a TaskWrapper was passed in):
    galaxy_job_id = job_wrapper.get_id_tag()
    log.debug("(%s) submitting file %s" % ( galaxy_job_id, job_file ) )

    tries = 0
    while tries < 5:
        job_id = pbs.pbs_submit(c, job_attrs, job_file, pbs_queue_name, None)
        tries += 1
        if job_id:
            pbs.pbs_disconnect(c)
            break
        errno, text = pbs.error()
        log.warning( "(%s) pbs_submit failed (try %d/5), PBS error %d: %s" % (galaxy_job_id, tries, errno, text) )
        time.sleep(2)
    else:
        log.error( "(%s) All attempts to submit job failed" % galaxy_job_id )
        job_wrapper.fail( "Unable to run this job due to a cluster error, please retry it later" )
        return

    if pbs_queue_name is None:
        log.debug("(%s) queued in default queue as %s" % (galaxy_job_id, job_id) )
    else:
        log.debug("(%s) queued in %s queue as %s" % (galaxy_job_id, pbs_queue_name, job_id) )

    # persist destination
    job_wrapper.set_job_destination( job_destination, job_id )

    # Store PBS related state information for job
    job_state = AsynchronousJobState()
    job_state.job_wrapper = job_wrapper
    job_state.job_id = job_id
    job_state.job_file = job_file
    job_state.output_file = ofile
    job_state.error_file = efile
    job_state.exit_code_file = ecfile
    job_state.old_state = 'N'
    job_state.running = False
    job_state.job_destination = job_destination

    # Add to our 'queue' of jobs to monitor
    self.monitor_queue.put( job_state )
def submitJob(self, conn, job, task=None, requirements=''):
    """ Need to copy the inputsandbox to WN before submitting a job"""

    # Write a temporary submit script
    # NB: we assume an env var PBS_JOBCOOKIE points to the exec dir on the batch host
    ifiles = task['globalSandbox'].split(',')

    f = tempfile.NamedTemporaryFile()
    s = []
    s.append('#!/bin/sh')
    if self.workerNodeWorkDir:
        s.append('cd ' + self.workerNodeWorkDir)
    s.append('if [ ! -d $PBS_JOBCOOKIE ] ; then mkdir -p $PBS_JOBCOOKIE ; fi')
    s.append('cd $PBS_JOBCOOKIE')
    for ifile in task['globalSandbox'].split(','):
        s.append('cp ' + ifile + ' .')
    s.append(self.jobScriptDir + job['executable'] + ' ' + job['arguments'] +
             ' >' + job['standardOutput'] + ' 2>' + job['standardError'])
    if self.workerNodeWorkDir:
        s.append('cd ' + self.workerNodeWorkDir)
    # this fails if the job is aborted, which leaks disc space. Adding an epilogue to make
    # sure it's gone for good - AMM 18/07/2011
    s.append('rm -fr $PBS_JOBCOOKIE')
    f.write('\n'.join(s))
    f.flush()

    epilogue = tempfile.NamedTemporaryFile()
    s = []
    s.append('#!/bin/sh')
    if self.workerNodeWorkDir:
        s.append('cd ' + self.workerNodeWorkDir)
    s.append('rm -fr $PBS_JOBCOOKIE')
    s.append('touch $HOME/done.$1')
    epilogue.write('\n'.join(s))
    epilogue.flush()
    os.chmod(epilogue.name, 700)

    attr_dict = {
        'Job_Name': 'CRAB_PBS',
        'Variable_List': self.pbs_env,
        'Output_Path': self.jobResDir + 'wrapper_' + str(job['standardOutput']),
        'Error_Path': self.jobResDir + 'wrapper_' + str(job['standardError'])
    }

    attropl = pbs.new_attropl(len(attr_dict) + len(self.res_dict) + 1)
    i_attr = 0
    for k in attr_dict.keys():
        self.logging.debug("adding k %s" % k)
        attropl[i_attr].name = k
        attropl[i_attr].value = attr_dict[k]
        i_attr += 1

    for k in self.res_dict.keys():
        attropl[i_attr].name = 'Resource_List'
        attropl[i_attr].resource = k
        attropl[i_attr].value = self.res_dict[k]
        i_attr += 1

    attropl[i_attr].name = 'Resource_List'
    attropl[i_attr].resource = 'epilogue'
    attropl[i_attr].value = epilogue.name
    self.logging.debug("adding epilogue: %s" % epilogue.name)
    i_attr += 1

    jobid = pbs.pbs_submit(conn, attropl, f.name, self.queue, 'NULL')
    f.close()

    if not jobid:
        err, err_text = pbs.error()
        self.logging.error('Error in job submission')
        self.logging.error('PBS error code ' + str(err) + ': ' + err_text)
        self.pbs_disconn(conn)
        raise SchedulerError('PBS error', str(err) + ': ' + err_text)

    return {job['name']: jobid}, None, None
def submit(self, txt):
    """Submit the jobscript txt, set self.jobid"""
    self.log.debug("Going to submit script %s", txt)

    attropl = pbs.new_attropl(2)  # jobparams
    attropl[0].name = 'Job_Name'
    attropl[0].value = self.options.get('name', 'python_pbs_job')
    attropl[1].name = 'Rerunable'
    attropl[1].value = 'y'

    for arg in self.args.keys():
        tmp = self.args[arg]
        tmpattropl = pbs.new_attropl(len(tmp))  # jobparams
        if arg in ('resources',):
            idx = 0
            for k, v in tmp.items():
                tmpattropl[idx].name = 'Resource_List'  # resources
                tmpattropl[idx].resource = k
                tmpattropl[idx].value = v
                idx += 1
        elif arg in ('mail',):
            tmpattropl[0].name = 'Mail_Points'
            tmpattropl[0].value = tmp['send']
            if len(tmp) > 1:
                tmpattropl[1].name = "Mail_Users"
                tmpattropl[1].value = tmp['others']
        elif arg in ('queue',):
            # use destination field of pbs_submit
            pass
        elif arg in ('account',):
            tmpattropl = pbs.new_attropl(1)
            tmpattropl[0].name = pbs.ATTR_A
            tmpattropl[0].value = tmp
            #continue
        else:
            self.log.error('Unknown arg %s', arg)
            tmpattropl = pbs.new_attropl(0)

        attropl.extend(tmpattropl)

    # add a bunch of variables (added by qsub)
    # also set PBS_O_WORKDIR to os.getcwd()
    os.environ.setdefault('WORKDIR', os.getcwd())
    defvars = ['MAIL', 'HOME', 'PATH', 'SHELL', 'WORKDIR']

    tmpattropl = pbs.new_attropl(1)
    tmpattropl[0].name = 'Variable_List'
    tmpattropl[0].value = ",".join(["PBS_O_%s=%s" % (x, os.environ.get(x, 'NOTFOUND_%s' % x))
                                    for x in defvars])
    attropl.extend(tmpattropl)

    fh, scriptfn = tempfile.mkstemp()
    f = os.fdopen(fh, 'w')
    self.log.debug("Writing temp jobscript to %s" % scriptfn)
    f.write(txt)
    f.close()

    queue = self.args.get('queue', self.options.get('queue', ''))  # do not set with attropl
    if queue:
        self.log.debug("Going to submit to queue %s", queue)
    else:
        self.log.debug("No queue specified. Will submit to default destination.")

    extend = 'NULL'  # always
    jobid = pbs.pbs_submit(self.pbsconn, attropl, scriptfn, queue, extend)

    is_error, errormsg = pbs.error()
    if is_error:
        self.log.error("Failed to submit job script %s: error %s", scriptfn, errormsg)
    else:
        self.log.debug("Succesful jobsubmission returned jobid %s", jobid)
        self.jobid = jobid

    os.remove(scriptfn)
def queue_job(self, batch_job):
    """
    queue a BatchJob.

    :param batch_job: description of the job to queue
    """

    # batch job names should be unique for civet pipelines because the
    # job name is used to name log files and other output.
    # Civet generates unique names for each step, so this is just checking
    # for a programming error
    assert batch_job.name not in self._job_names

    if self.execution_log_dir:
        log_dir = self.execution_log_dir
    else:
        log_dir = self.log_dir

    # set batch_job.stderr_path and batch_job.stdout_path if they aren't already set
    if not batch_job.stdout_path:
        batch_job.stdout_path = os.path.join(log_dir, batch_job.name + ".o")
    if not batch_job.stderr_path:
        batch_job.stderr_path = os.path.join(log_dir, batch_job.name + ".e")

    # write batch script
    filename = self.write_script(batch_job)

    if self.submit:
        # build up our torque job attributes and resources
        job_attributes = {}
        job_resources = {
            'nodes': "{0}:ppn={1}".format(batch_job.nodes, batch_job.ppn),
            'walltime': batch_job.walltime,
            'epilogue': self.epilogue_filename
        }

        if batch_job.mem:
            job_resources['mem'] = batch_job.mem

        job_attributes[pbs.ATTR_v] = self.generate_env(batch_job.workdir)

        if batch_job.name:
            job_attributes[pbs.ATTR_N] = batch_job.name

        job_attributes[pbs.ATTR_o] = batch_job.stdout_path
        #XXX workaround for a TORQUE bug where local copies of stderr &
        # stdout files to /dev/null don't work correctly but remote
        # copies (to submit host) do
        if job_attributes[pbs.ATTR_o] == "/dev/null":
            job_attributes[pbs.ATTR_o] = socket.gethostname() + ":/dev/null"

        job_attributes[pbs.ATTR_e] = batch_job.stderr_path
        #XXX workaround for a TORQUE bug where local copies of stderr &
        # stdout files to /dev/null don't work correctly but remote
        # copies (to submit host) do
        if job_attributes[pbs.ATTR_e] == "/dev/null":
            job_attributes[pbs.ATTR_e] = socket.gethostname() + ":/dev/null"

        if batch_job.depends_on:
            job_attributes[pbs.ATTR_depend] = self._dependency_string(batch_job)
        elif self.submit_with_hold:
            job_attributes[pbs.ATTR_h] = 'u'

        if batch_job.mail_option:
            job_attributes[pbs.ATTR_m] = batch_job.mail_option

        if batch_job.email_list:
            job_attributes[pbs.ATTR_M] = batch_job.email_list

        if batch_job.date_time:
            job_attributes[pbs.ATTR_a] = str(int(time.mktime(batch_job.date_time.timetuple())))

        pbs_attrs = pbs.new_attropl(len(job_attributes) + len(job_resources))

        # populate pbs_attrs
        attr_idx = 0
        for resource, val in job_resources.iteritems():
            pbs_attrs[attr_idx].name = pbs.ATTR_l
            pbs_attrs[attr_idx].resource = resource
            pbs_attrs[attr_idx].value = val
            attr_idx += 1

        for attribute, val in job_attributes.iteritems():
            pbs_attrs[attr_idx].name = attribute
            pbs_attrs[attr_idx].value = val
            attr_idx += 1

        # we've initialized pbs_attrs with all the attributes we need to set
        # now we can connect to the server and submit the job
        connection = _connect_to_server(self._server)
        # connected to pbs_server

        # submit job
        retry = 0
        job_id = pbs.pbs_submit(connection, pbs_attrs, filename, self.queue, None)

        # if pbs.pbs_submit failed, try again
        while not job_id and retry < _MAX_RETRY:
            retry += 1
            print("Retrying connection...", file=sys.stderr)
            time.sleep(retry ** 2)
            job_id = pbs.pbs_submit(connection, pbs_attrs, filename, self.queue, None)

        pbs.pbs_disconnect(connection)

        # check to see if the job was submitted successfully
        if not job_id:
            e, e_msg = pbs.error()
            # the batch system returned an error, throw exception
            raise Exception("Error submitting job. "
                            "Torque error {0}: '{1}'".format(e, torque_strerror(e)))

        if self.submit_with_hold and not batch_job.depends_on:
            self.held_jobs.append(job_id)

    else:
        #self.submit is False, fake a job ID
        job_id = "{0}.civet".format(self._id_seq)
        self._id_seq += 1

    self._job_names.append(batch_job.name)

    self._id_log.write(job_id + '\t' + batch_job.name + '\t' +
                       str(self._printable_dependencies(batch_job.depends_on)) + '\n')
    self._id_log.flush()

    return job_id
def submit(self):
    """Submit the jobscript txt, set self.jobid"""
    txt = self.script
    self.log.debug("Going to submit script %s" % txt)

    # Build default pbs_attributes list
    pbs_attributes = pbs.new_attropl(1)
    pbs_attributes[0].name = 'Job_Name'
    pbs_attributes[0].value = self.name

    # set resource requirements
    resourse_attributes = pbs.new_attropl(len(self.resources))
    idx = 0
    for k, v in self.resources.items():
        resourse_attributes[idx].name = 'Resource_List'
        resourse_attributes[idx].resource = k
        resourse_attributes[idx].value = v
        idx += 1
    pbs_attributes.extend(resourse_attributes)

    # add job dependencies to attributes
    if self.deps:
        deps_attributes = pbs.new_attropl(1)
        deps_attributes[0].name = pbs.ATTR_depend
        deps_attributes[0].value = ",".join(["afterany:%s" % dep for dep in self.deps])
        pbs_attributes.extend(deps_attributes)
        self.log.debug("Job deps attributes: %s" % deps_attributes[0].value)

    # add a bunch of variables (added by qsub)
    # also set PBS_O_WORKDIR to os.getcwd()
    os.environ.setdefault('WORKDIR', os.getcwd())
    defvars = ['MAIL', 'HOME', 'PATH', 'SHELL', 'WORKDIR']
    pbsvars = ["PBS_O_%s=%s" % (x, os.environ.get(x, 'NOTFOUND_%s' % x)) for x in defvars]

    # extend PBS variables with specified variables
    pbsvars.extend(["%s=%s" % (name, value) for (name, value) in self.env_vars.items()])

    variable_attributes = pbs.new_attropl(1)
    variable_attributes[0].name = 'Variable_List'
    variable_attributes[0].value = ",".join(pbsvars)
    pbs_attributes.extend(variable_attributes)
    self.log.debug("Job variable attributes: %s" % variable_attributes[0].value)

    # mail settings
    mail_attributes = pbs.new_attropl(1)
    mail_attributes[0].name = 'Mail_Points'
    mail_attributes[0].value = 'n'  # disable all mail
    pbs_attributes.extend(mail_attributes)
    self.log.debug("Job mail attributes: %s" % mail_attributes[0].value)

    import tempfile
    fh, scriptfn = tempfile.mkstemp()
    f = os.fdopen(fh, 'w')
    self.log.debug("Writing temporary job script to %s" % scriptfn)
    f.write(txt)
    f.close()

    self.log.debug("Going to submit to queue %s" % self.queue)

    # the extend parameter should be 'NULL' because this is required by the python api
    extend = 'NULL'

    # job submission sometimes fails without producing an error,
    # e.g. when one of the dependency jobs has already finished;
    # when that occurs, None will be returned by pbs_submit as job id
    jobid = pbs.pbs_submit(self.pbsconn, pbs_attributes, scriptfn, self.queue, extend)
    is_error, errormsg = pbs.error()
    if is_error or jobid is None:
        self.log.error("Failed to submit job script %s (job id: %s, error %s)"
                       % (scriptfn, jobid, errormsg))
    else:
        self.log.debug("Succesful job submission returned jobid %s" % jobid)
        self.jobid = jobid

    os.remove(scriptfn)