def check_all_jobs(self):
    """
    Returns a list of servers that failed to be contacted and a dict
    of "job_id : status" pairs (where status is a bunchified version
    of the API's structure).
    """
    servers = []
    failures = []
    statuses = {}
    # Collect the set of PBS servers referenced by the watched jobs.
    for pbs_job_state in self.watched:
        pbs_server_name = self.__get_pbs_server(pbs_job_state.job_destination.params)
        if pbs_server_name not in servers:
            servers.append(pbs_server_name)
        pbs_job_state.check_count += 1
    # Query each server once for the state of all of its jobs.
    for pbs_server_name in servers:
        c = pbs.pbs_connect(util.smart_str(pbs_server_name))
        if c <= 0:
            log.debug("connection to PBS server %s for state check failed" % pbs_server_name)
            failures.append(pbs_server_name)
            continue
        stat_attrl = pbs.new_attrl(3)
        stat_attrl[0].name = pbs.ATTR_state
        stat_attrl[1].name = pbs.ATTR_used
        stat_attrl[2].name = pbs.ATTR_exitstat
        jobs = pbs.pbs_statjob(c, None, stat_attrl, None)
        pbs.pbs_disconnect(c)
        statuses.update(self.convert_statjob_to_bunches(jobs))
    return (failures, statuses)
def locate_attribs(id_conn, id_job):
    code = []
    attribs = []
    jobs = pbs.pbs_statjob(id_conn, id_job, "NULL", "NULL")
    for job in jobs:
        print job.name, "\n"
        # Record each attribute's name and its positional code.
        for y, attrib in enumerate(job.attribs):
            attribs.append(attrib.name)
            code.append(y)
    # temporary - define an ADT to store the attribute codes dynamically
    for x in range(len(attribs)):
        print attribs[x], "->", code[x], "<-", jobs[0].attribs[x].value
    return attribs
def update_all_jobs(batchserver_name):
    """ Update info about all jobs of the given batchserver. """
    server, created = getBatchServer(batchserver_name)
    if batchserver_name not in pbs_data_jobs:
        pbs_data_jobs[batchserver_name] = {'last_update': None, 'jobs': {}}
    # Skip the query if the cached data is newer than the configured maximum age.
    if pbs_data_jobs[batchserver_name]['last_update'] and \
            (datetime.datetime.now() - pbs_data_jobs[batchserver_name]['last_update']).total_seconds() < GlobalConfiguration.objects.get(pk=1).max_lastupdate:
        logging.debug("jobs info is new enough for server: %s" % batchserver_name)
        return pbs_data_jobs
    conn = pbs.pbs_connect(batchserver_name.encode('iso-8859-1', 'replace'))
    if conn == -1:
        logging.error("Cannot connect to %s - live data will be missing" % server.name)
        return
    statjobs = pbs.pbs_statjob(conn, "", [], "")
    pbs.pbs_disconnect(conn)
    for sj in statjobs:
        jobid = sj.name
        # Build the attribute dict, qualifying resource attributes
        # (e.g. Resource_List + walltime -> "Resource_List_walltime").
        attr_dict = {}
        for x in sj.attribs:
            if x.resource:
                attr_dict[x.name + "_" + x.resource] = x.value
            else:
                attr_dict[x.name] = x.value
        pbs_data_jobs[batchserver_name]['jobs'][jobid] = update_one_job_from_pbs_data(jobid, attr_dict)
    pbs_data_jobs[batchserver_name]['last_update'] = datetime.datetime.now()
    return pbs_data_jobs
def statjob_detailed(id_conn, id_job):
    result = []
    jobs = pbs.pbs_statjob(id_conn, id_job, "NULL", "NULL")
    for job in jobs:
        print job.name, "\n"
        for attrib in job.attribs:
            print attrib.name, attrib.resource, attrib.value
    return result
def resource_name_forjob(id_conn, id_job, id_resource):
    result = ""
    jobs = pbs.pbs_statjob(id_conn, id_job, "NULL", "NULL")
    for job in jobs:
        attrib = job.attribs
        # id_resource is a sequence of attribute indices; the last
        # index visited determines the returned resource name.
        for i in id_resource:
            result = attrib[i].resource
    return result
def get_attribs(id_conn, id_job):
    attr = {}
    job = pbs.pbs_statjob(id_conn, id_job, "NULL", "NULL")
    for attrib in job[0].attribs:
        attr[str(attrib.name)] = attrib.value
    attr['length_resources'] = len(job[0].attribs) + 1
    return attr
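# A minimal usage sketch for get_attribs, assuming a reachable PBS server;
# the server name and job id below are placeholders, not from the original.
import pbs

conn = pbs.pbs_connect("hpcnode0")  # placeholder server name
if conn > 0:
    attrs = get_attribs(conn, "12345.hpcnode0")  # placeholder job id
    print attrs.get("job_state")
    pbs.pbs_disconnect(conn)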
def get_status(self):
    if self.status == main.job.JOB_STATUS.FAIL:
        return self.status
    if not self.jobid:
        return PBS_STATUS['Q']
    stats = pbs.pbs_statjob(self.connect(), self.jobid, self.attrl, "NULL")
    status = stats[0].attribs[0].value
    self.disconnect()
    if status in PBS_STATUS:
        return PBS_STATUS[status]
    return PBS_STATUS['Q']
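# PBS_STATUS is not defined in the snippet above. A plausible sketch, assuming
# the project maps the standard single-letter PBS job_state codes onto its own
# status vocabulary (the letters are standard; the values are placeholders):
PBS_STATUS = {
    'Q': 'queued',      # queued, eligible to run
    'R': 'running',
    'E': 'exiting',     # finishing up after execution
    'C': 'completed',   # reported by TORQUE for finished jobs
    'H': 'held',
    'W': 'waiting',     # waiting for its scheduled start time
    'T': 'transiting',  # being moved to a new location
    'S': 'suspended',
}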
def statjob(id_conn, id_job):
    result = []
    jobs = pbs.pbs_statjob(id_conn, id_job, "NULL", "NULL")
    nodes = pbs.pbs_statnode(id_conn, "", "NULL", "NULL")
    # Collect the names of nodes that appear in any job attribute
    # value (e.g. in exec_host).
    for job in jobs:
        for attrib in job.attribs:
            for node in nodes:
                if node.name in attrib.value:
                    result.append(node.name)
    return result
def run_cluster(self, pbs_server, job_script, settings):
    import pbs
    self.settings = copy.deepcopy(settings)
    # Launch script, wait for output to come back, return when it does
    # Create the job options struct
    attropl = pbs.new_attropl(4)
    # Set the name of the job
    attropl[0].name = pbs.ATTR_N
    attropl[0].value = "inferno_" + self.name
    # Job is Rerunable
    attropl[1].name = pbs.ATTR_r
    attropl[1].value = "y"
    # Walltime
    attropl[2].name = pbs.ATTR_l
    attropl[2].resource = "walltime"
    attropl[2].value = "400"
    # Nodes
    attropl[3].name = pbs.ATTR_l
    attropl[3].resource = "nodes"
    attropl[3].value = "1:ppn=4"
    # Run the job. pbs_submit and pbs_statjob expect a connection
    # handle, not the server name, so connect first.
    if pbs_server is None:
        pbs_server = pbs.pbs_default()
    conn = pbs.pbs_connect(pbs_server)
    job_id = pbs.pbs_submit(conn, attropl, job_script, "NULL", "NULL")
    e, e_txt = pbs.error()
    if e:
        print e, e_txt
    # Save the job ID for later so we can check on the status
    self.job_id = job_id
    # TODO: Change this
    # Now loop, checking every 5 seconds or so if the job is done by
    # polling the pbs_server about the jobid.
    running = True
    while running:
        job_info = pbs.pbs_statjob(conn, self.job_id, "NULL", "NULL")
        print job_info
        time.sleep(5)
def _statjob(self, job_name='', attrib_list=None):
    """Get the job config from the pbs server"""
    if attrib_list:
        self._list_2_attrib(attrib_list)
    else:
        self.attribs = 'NULL'
    self._connect()
    jobs = pbs.pbs_statjob(self.con, job_name, self.attribs, 'NULL')
    self._disconnect()
    self._list_2_dict(jobs, job)
def info(self, types=None):
    """
    Return jobinfo
    """
    if not self.jobid:
        self.log.debug("no jobid, job is not submitted yet?")
        return None
    # convert single type into list
    if type(types) is str:
        types = [types]
    self.log.debug("Return info types %s" % types)
    # create attribute list to query pbs with
    if types is None:
        jobattr = NULL
    else:
        jobattr = pbs.new_attrl(len(types))
        for idx, attr in enumerate(types):
            jobattr[idx].name = attr
    # get a new connection (otherwise this seems to fail)
    if self.clean_conn:
        pbs.pbs_disconnect(self.pbsconn)
        self.pbsconn = pbs.pbs_connect(self.pbs_server)
    jobs = pbs.pbs_statjob(self.pbsconn, self.jobid, jobattr, NULL)
    if len(jobs) == 0:
        # no job found, return None info
        res = None
        self.log.debug("No job found. Wrong id %s or job finished? Returning %s" % (self.jobid, res))
        return res
    elif len(jobs) == 1:
        self.log.debug("Request for jobid %s returned one result %s" % (self.jobid, jobs))
    else:
        self.log.error("Request for jobid %s returned more than one result %s" % (self.jobid, jobs))
    # only expect to have a list with one element
    j = jobs[0]
    # convert attribs into useable dict
    job_details = dict([(attrib.name, attrib.value) for attrib in j.attribs])
    # manually set 'id' attribute
    job_details['id'] = j.name
    self.log.debug("Found jobinfo %s" % job_details)
    return job_details
def check_single_job(self, pbs_server_name, job_id):
    """
    Returns the state of a single job, used to make sure a job is
    really dead.
    """
    c = pbs.pbs_connect(util.smart_str(pbs_server_name))
    if c <= 0:
        log.debug("connection to PBS server %s for state check failed" % pbs_server_name)
        return None
    stat_attrl = pbs.new_attrl(1)
    stat_attrl[0].name = pbs.ATTR_state
    jobs = pbs.pbs_statjob(c, job_id, stat_attrl, None)
    pbs.pbs_disconnect(c)
    return jobs[0].attribs[0].value
def info(self, jobid, types=None, job_filter=None):
    """Return jobinfo"""
    # TODO restrict to current user jobs
    if type(types) is str:
        types = [types]
    elif types is None:
        types = []
    self.log.debug("Return info types %s" % types)
    # add all filter values to the types
    if job_filter is None:
        job_filter = {}
    self.log.debug("Job filter passed %s" % job_filter)
    if self.job_filter is not None:
        self.log.debug("Job filter update with %s" % self.job_filter)
        job_filter.update(self.job_filter)
    self.log.debug("Job filter used %s" % job_filter)
    for filter_name in job_filter.keys():
        if filter_name not in types:
            types.append(filter_name)
    # with no requested types and no filter, query all attributes
    if not types:
        jobattr = 'NULL'
    else:
        jobattr = pbs.new_attrl(len(types))
        for idx in range(len(types)):
            jobattr[idx].name = types[idx]
    jobs = pbs.pbs_statjob(self.pbsconn, jobid, jobattr, 'NULL')
    if len(jobs) == 0:
        res = []  # return nothing
        self.log.debug("No job found. Wrong id %s or job finished? Returning %s" % (jobid, res))
        return res
    elif len(jobs) == 1:
        self.log.debug("Request for jobid %s returned one result %s" % (jobid, jobs))
    else:
        self.log.error("Request for jobid %s returned more than one result %s" % (jobid, jobs))
    # more than one result is possible; filter and return all matches
    res = []
    for j in jobs:
        job_details = dict([(attrib.name, attrib.value) for attrib in j.attribs])
        job_details['id'] = j.name  # add id
        if self.match_filter(job_details, job_filter):
            res.append(job_details)
    self.log.debug("Found jobinfo %s" % res)
    return res
def define_attribs(id_conn, id_job):
    attribs = []
    jobs = pbs.pbs_statjob(id_conn, id_job, "NULL", "NULL")
    for job in jobs:
        for attrib in job.attribs:
            attribs.append(attrib.name)
    return attribs
def query(self, obj, service='', objType='node'):
    """
    query status and eventually other scheduler related information
    It may use single 'node' scheduler id or bulk id for association
    """
    if type(obj) != Task:
        raise SchedulerError('wrong argument type', str(type(obj)))
    conn = self.pbs_conn()
    attrl = pbs.new_attrl(2)
    attrl[0].name = 'job_state'
    attrl[1].name = 'exec_host'
    for job in obj.jobs:
        if not self.valid(job.runningJob):
            continue
        id = str(job.runningJob['schedulerId']).strip()
        jobstat = pbs.pbs_statjob(conn, id, attrl, 'Null')
        if not jobstat:
            err, err_text = pbs.error()
            if err != 15001:  # unknown job (probably finished)
                self.logging.error('Error in job query for ' + id)
                self.logging.error('PBS error code ' + str(err) + ': ' + err_text)
                self.pbs_disconn(conn)
                raise SchedulerError('PBS error', str(err) + ': ' + err_text)
        host = ''
        if len(jobstat) == 0:
            pbs_stat = 'Done'
        else:
            pbs_stat = jobstat[0].attribs[0].value
            if len(jobstat[0].attribs) > 1:
                host = jobstat[0].attribs[1].value
        job.runningJob['statusScheduler'] = pbs_stat
        job.runningJob['status'] = self.status_map[pbs_stat]
        job.runningJob['destination'] = host
    self.pbs_disconn(conn)
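# self.status_map is defined elsewhere in this scheduler class. A sketch only,
# assuming its keys are the raw job_state values pbs_statjob returns plus the
# 'Done' synthesized above for vanished jobs (the values are placeholders):
status_map = {
    'Q': 'Queued',
    'R': 'Running',
    'E': 'Running',   # exiting jobs are still occupying the node
    'H': 'Held',
    'W': 'Waiting',
    'C': 'Done',
    'Done': 'Done',
}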
def update_one_job(conn, jobid, bs):
    """ Update info about one job """
    # TODO: I am a bit afraid this will take a lot of time
    fulljobid = "%s.%s" % (jobid, bs.name)
    sj = pbs.pbs_statjob(conn, fulljobid.encode('iso-8859-1', 'replace'), [], "")
    if len(sj) == 0:
        log(LOG_ERROR, "failed to update info from pbs about job: %s.%s" % (jobid, bs.name))
        return None
    dj = attribs_to_dict(sj[0].attribs)
    j, created = Job.objects.get_or_create(jobid=jobid, server=bs)
    if created:
        log(LOG_INFO, "new job will be created: %s @ %s in queue: %s" % (jobid, bs.name, dj['queue']))
    j.job_name = dj['Job_Name']
    j.queue = Queue.objects.get(name=dj['queue'])
    j.job_state = JobState.objects.get(shortname=dj['job_state'])
    j.save()
    return j
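# attribs_to_dict is not shown above. A minimal sketch, assuming the same
# resource-qualified key convention used in update_all_jobs earlier:
def attribs_to_dict(attribs):
    d = {}
    for a in attribs:
        if a.resource:
            d[a.name + "_" + a.resource] = a.value
        else:
            d[a.name] = a.value
    return d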
def info(self, jobid, types=None, job_filter=None):
    """Return jobinfo"""
    if types is None:
        types = []
    # add all filter values to the types
    if job_filter is None:
        job_filter = {}
    self.log.debug("Job filter passed %s", job_filter)
    if self.job_filter is not None:
        self.log.debug("Job filter update with %s", self.job_filter)
        job_filter.update(self.job_filter)
    self.log.debug("Job filter used %s", job_filter)
    for filter_name in job_filter.keys():
        if filter_name not in types:
            types.append(filter_name)
    # with no requested types and no filter, query all attributes
    if not types:
        jobattr = 'NULL'
    else:
        jobattr = pbs.new_attrl(len(types))
        for idx, name in enumerate(types):
            jobattr[idx].name = name
    jobs = pbs.pbs_statjob(self.pbsconn, jobid, jobattr, 'NULL')
    if not jobs:
        self.log.debug("No job found. Wrong id %s or job finished?", jobid)
        return []
    self.log.debug("Request for jobid %s returned %d result(s) %s", jobid, len(jobs), jobs)
    res = []
    for j in jobs:
        job_details = dict([(attrib.name, attrib.value) for attrib in j.attribs])
        job_details['id'] = j.name  # add id
        if self.match_filter(job_details, job_filter):
            res.append(job_details)
    self.log.debug("Found jobinfo %s", res)
    return res
sys.path.append(os.path.abspath("./")) import pbs except: # Running from within the tests directory. sys.path.append(os.path.abspath("../")) import pbs # You need to set the hostname of the PBS Server. pbsserver = 'hpcnode0' conn = pbs.pbs_connect(pbsserver) if conn < 0: print('Error connecting to server.') sys.exit(1) # Returns a batch_status structure. b = pbs.pbs_statjob(conn, '', None, None) while b != None: print("\n------ Job: %s ------" % b.name) attribs = b.attribs while attribs != None: if attribs.resource != None: print(" %s.%s = %s" % (attribs.name, attribs.resource, attribs.value)) else: print(" %s = %s" % (attribs.name, attribs.value)) attribs = attribs.next b = b.next
def get_jobs(conn, extend=None):
    '''
    Get information on the PBS jobs.
    This function returns a list of jobs, where each job is a dictionary.

    This is the list of resources requested by the job, e.g.:
        Resource_List : mem = 120gb
        Resource_List : ncpus = 24
        Resource_List : nodect = 1
        Resource_List : place = free
        Resource_List : select = 1:ncpus=24:mem=120GB
        Resource_List : walltime = 200:00:00

    These are non-resource attributes, e.g.:
        Job_Name : AuCuZn
        Job_Owner : 999777@hpcnode0
        job_state : Q
        queue : workq
        server : hpcnode0
        etc ...
    '''
    jobs = []  # This will contain a list of dictionaries.

    # Some jobs don't yet have a particular attribute as the job hasn't started yet.
    # We have to create that key and set it to something, otherwise we get errors like:
    #   NameError("name 'resources_used_ncpus' is not defined",)
    attribute_names = ['resources_used_ncpus', 'resources_used_mem', 'resources_used_vmem',
                       'resources_used_walltime', 'exec_host', 'exec_vnode', 'stime', 'etime',
                       'resources_time_left', 'resources_used_cpupercent']

    b = pbs.pbs_statjob(conn, '', None, extend)
    while b != None:
        attributes = {}  # Init the dictionary to empty.
        # Init the values of the attributes.
        for name in attribute_names:
            attributes[name] = ''
        for name in ['resources_used_walltime', 'resources_used_cput']:
            attributes[name] = '0:0:0'

        # b.name is a string like '137550.hpcnode0'
        attributes['job_id'] = b.name.split('.')[0]
        attribs = b.attribs
        while attribs != None:
            if attribs.resource != None:
                keyname = '%s_%s' % (attribs.name, attribs.resource)
                attributes[keyname.lower()] = attribs.value
            else:
                attributes[attribs.name.lower()] = attribs.value
            attribs = attribs.next

        jobs.append(attributes)
        b = b.next
    return jobs
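# A minimal usage sketch for get_jobs; the server name is a placeholder.
import pbs

conn = pbs.pbs_connect('hpcnode0')  # placeholder server name
if conn < 0:
    raise SystemExit('Error connecting to server.')
for job in get_jobs(conn):
    print('%s: state=%s ncpus=%s' % (job['job_id'],
          job.get('job_state', '?'), job.get('resource_list_ncpus', '?')))
pbs.pbs_disconnect(conn)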
nodes = pbs.pbs_statnode(con, "", 'NULL', "NULL")
for node in nodes:
    print node.name, ':'
    for prop in node.attribs:
        print '\t', prop.name, ' = ', prop.value

queues = pbs.pbs_statque(con, "", 'NULL', "")
for queue in queues:
    print queue.name
    for attrib in queue.attribs:
        print '\t', attrib.name, ' = ', attrib.value

jobs = pbs.pbs_statjob(con, "", 'NULL', "")
for job in jobs:
    print job.name
    for attrib in job.attribs:
        print '\t', attrib.name, ' = ', attrib.value

sys.exit(0)

## OLD stuff, obsolete (unreachable after the sys.exit above) ##
while batch_info.this:
    node_attr = batch_info.attribs
    print batch_info.name, ':'
    while node_attr.this:
        print '\t', node_attr.name, node_attr.value
def _getJobStatus(self, jobid):
    status = pbs.pbs_statjob(self._connection_id, jobid, "NULL", "NULL")
    job_state = status[0].attribs
    for attr in job_state:
        if attr.name == "job_state":
            return attr.value
    # Falls through (returns None) if no job_state attribute is present.
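# status[0] above raises IndexError when the server no longer knows the job.
# A defensive sketch using the pbs.error() pattern from query() earlier
# (error 15001 means "unknown job id", typically a finished job):
def get_job_state(conn, jobid):
    stats = pbs.pbs_statjob(conn, jobid, "NULL", "NULL")
    if not stats:
        err, err_txt = pbs.error()
        if err == 15001:
            return None  # job unknown to the server, probably finished
        raise RuntimeError("PBS error %s: %s" % (err, err_txt))
    for attr in stats[0].attribs:
        if attr.name == "job_state":
            return attr.value
    return None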