def is_it_exclusive(job):
    """
    Check to see if the job requested exclusive, or if the nodes are marked
    exclusive.  This needs to be passed to ATOM.
    """
    place = str(job.Resource_List["place"])
    log_with_caller(pbs.EVENT_DEBUG4, "place is %s" % place)

    # See if the node sharing value has exclusive
    vn = pbs.server().vnode(pbs.get_local_nodename())
    sharing = vn.sharing
    log_with_caller(pbs.EVENT_DEBUG4, "The sharing value is %s type %s" %
                    (str(sharing), str(type(sharing))))

    # Uses the same logic as the scheduler (is_excl())
    if sharing == pbs.ND_FORCE_EXCL or sharing == pbs.ND_FORCE_EXCLHOST:
        return True
    if sharing == pbs.ND_IGNORE_EXCL:
        return False
    if any(s.startswith('excl') for s in place.split(':')):
        return True
    if any(s.startswith('shared') for s in place.split(':')):
        return False
    if (sharing == pbs.ND_DEFAULT_EXCL or
            sharing == pbs.ND_DEFAULT_EXCLHOST):
        return True
    if sharing == pbs.ND_DEFAULT_SHARED:
        return False
    return False
def check_pbs():
    vnodes = pbs.server().vnodes()
    free = 0
    for v in vnodes:
        if v.resources_available["vntype"] and \
           v.resources_available["vntype"] != "cray_login" and \
           ((v.state == pbs.ND_FREE) or
                (v.state == pbs.ND_JOB_EXCLUSIVE) or
                (v.state == pbs.ND_RESV_EXCLUSIVE)):
            free += 1
    return free
def _svr_vnode(name):
    # Return a vnode object obtained from the server by name.
    # Save the values in a global dictionary for future use.
    global pmi_pbsconf
    if "pmi_pbsvnodes" not in globals():
        global pmi_pbsvnodes
        pmi_pbsvnodes = dict()
        for vn in pbs.server().vnodes():
            pmi_pbsvnodes[vn.name] = vn
    return pmi_pbsvnodes[name]
def getJobs():
    job_lst = []
    jobs = {}
    s = pbs.server()
    if s.vnode(local_node).jobs:
        job_lst = s.vnode(local_node).jobs.split(',')
    for job in job_lst:
        job = re.sub(r'/\d+', '', job)
        job = job.replace(" ", "")
        jobs[job] = 1
    return jobs
def __init__(self, e):
    self.rc = -1
    self.e = e
    self.parse_cfg()
    self.nodename = pbs.get_local_nodename()
    try:
        self.node = pbs.server().vnode(self.nodename)
    except:
        pbs.logmsg(pbs.EVENT_DEBUG,
                   "Health-check hook; failed to get node info from server")
        self.e.reject()
    self.vnl = self.e.vnode_list
def get_local_node(name):
    # Get host names from /etc/hosts and return matching name for the MoM
    try:
        (hostname, aliaslist, _) = socket.gethostbyname_ex(name)
    except:
        return None
    aliaslist.append(hostname)

    # Search for possible match in server vnode list.
    pbsvnodes = dict()
    for vn in pbs.server().vnodes():
        pbsvnodes[vn.name] = vn
    for n in aliaslist:
        if n in pbsvnodes:
            return pbsvnodes[n]
    return None
If a Nastran job fails, this hook keeps only the solver deck ('bdf') file and
the 'f04' and 'f06' files for details, and removes the other temporary files
left by the solver to save time.

The default order of this hook is 1.

If you want to keep more files for failed Nastran jobs, add extra file
extensions to reserve_ext.

Follow these steps to put the hook into effect:
    qmgr -c "c h nastran_ep"
    qmgr -c "s h nastran_ep event=execjob_epilogue"
    qmgr -c "i h nastran_ep application/x-python default nastran_ep.py"
'''
import os
import sys
import pbs

j = pbs.event().job
j_query = pbs.server().job(j.id)
soft = str(j_query.Resource_List["software"])
if soft != 'Nastran':
    sys.exit(0)

reserve_ext = ['f04', 'f06', 'bdf']
j_id = j.id.split(".")[0]
reserve_ext.append("o" + j_id)
reserve_ext.append("e" + j_id)
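# The deletion step of the epilogue is not shown above.  Below is a minimal
# sketch of what that cleanup might look like; it is an illustrative
# assumption, not the original hook's code, and it presumes the job ran in a
# private sandbox so that the PBS_JOBDIR environment variable points at the
# job's working directory.
job_dir = os.environ.get("PBS_JOBDIR", ".")
for fname in os.listdir(job_dir):
    # Keep files whose names end with one of the reserved suffixes
    # (f04, f06, bdf, and the job's o<seq>/e<seq> files); remove the rest.
    if not any(fname.endswith(ext) for ext in reserve_ext):
        try:
            os.remove(os.path.join(job_dir, fname))
        except OSError:
            pbs.logmsg(pbs.LOG_DEBUG,
                       "nastran_ep: could not remove %s" % fname)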
    return None


# Main
pbs.logmsg(pbs.LOG_DEBUG, "PBS/ALP Inventory Check: START")

# Determine the minute of the time.
now = time.strftime("%M", time.gmtime())
if ADDITIONAL_DEBUG:
    pbs.logmsg(pbs.LOG_DEBUG,
               "PBS/ALP Inventory Check: ADDITIONAL DEBUG, "
               "the minute right now = %s" % (now))

# Identify the cray_login nodes which are running the pbs_mom and put them
# into a list.
cray_login = []
vnodes = pbs.server().vnodes()
for v in vnodes:
    if v.resources_available["vntype"] and \
       v.resources_available["vntype"] == "cray_login" and \
       ((v.state == pbs.ND_FREE) or
            (v.state == pbs.ND_JOB_EXCLUSIVE) or
            (v.state == pbs.ND_RESV_EXCLUSIVE)):
        cray_login.append(str(v))

if ADDITIONAL_DEBUG:
    for cl in cray_login:
        pbs.logmsg(pbs.LOG_DEBUG,
                   "PBS/ALP Inventory Check: ADDITIONAL DEBUG, "
                   "Eligible Cray Login Nodes = %s" % (cl))

# Determine the total number of cray_login nodes
cray_login_total = len(cray_login)
if cray_login_total > 0:
    pbs.logmsg(pbs.LOG_DEBUG,
               "PBS/ALP Inventory Check: Total Eligible Cray Login Nodes = %s"
               % (cray_login_total))
else:
    return False


# MAIN LOGIC BEGIN
try:
    queue_type = extract_queue_type()
    walltime = extract_walltime()
    selection = extract_selection()

    if queue_type == "private":
        if pbs.event().job.project:
            pbs.event().reject(
                "Express project codes can not be used with private queues")
        queue = pbs.server().queue(pbs.event().job.queue.name)
        if not queue:
            pbs.event().reject("Invalid queue name")
        permitted_groups = queue.resources_available['permitted_groups']
        if permitted_groups:
            permitted_groups = permitted_groups.split(',')
            if not test_group_membership(permitted_groups):
                pbs.event().reject(
                    "You are not authorised to use this private queue")
        fixup_mpiprocs_ompthreads(selection)

        # PQs are small and offline resources may prevent jobs running.
        # Prevent them becoming topjobs.
        # Exclude med-bio, they can be top jobs; this lets largemem jobs
        # compete against tiny ones.
        if queue_name() != "med-bio":
if (e.job.queue.name in ["adis_in", "power_0qa1"]):
    if (e.job.Variable_List["in_queue_credential"] != "1"):
        e.reject("Error, Make sure you use --> qsub_in <-- command "
                 "to submit to the queue!")
    power_0qa1_nodes_list = [
        'compute-0-166.power5', 'compute-0-167.power5'
    ]
    for qnode in power_0qa1_nodes_list:
        node = pbs.server().vnode(qnode)
        node_ncpu_total = node.resources_available["ncpus"]
        node_ncpu_used = node.resources_assigned["ncpus"]
        node_ncpu_free = node_ncpu_total - node_ncpu_used
        job_ncpus = e.job.Resource_List["ncpus"]
        if (job_ncpus > node_ncpu_free):
            pass
        else:
            e.accept()
            break
    adisq = pbs.server().queue("adis")
    e.job.queue = adisq
import pbs
import sys

# qmgr -c 'create hook check_and_route_adis_0q event="queuejob"'; qmgr -c 's h check_and_route_adis_0q debug=True'
# qmgr -c 'import hook check_and_route_adis_0q application/x-python default /var/spool/PBS/hooks/check_and_route_adis_0q.py'
# qmgr -c 'd h check_and_route_adis_0q'

try:
    e = pbs.event()
    if e.job.queue:
        target_qname = e.job.queue.name
        if (target_qname in ["adis_0q"]):
            adisq = pbs.server().queue("adis")
            adis_used_ncpus = adisq.resources_assigned["ncpus"]
            if (adis_used_ncpus >= 2):
                e.job.queue = adisq
    # accept the event
    e.accept()
except:
    # e.reject("Failed to route job to queue adis_0q")
    # Log the failure and accept the event.
    pbs.logmsg(pbs.LOG_DEBUG, "Failed to route job to queue adis_0q")
    e.accept()
import pbs
import sys

try:
    # Get the hook event information and parameters.
    # This will be for the 'queuejob' event type.
    e = pbs.event()

    # Get the information for the job being queued
    j = e.job

    if j.interactive:
        # Get the "interQ" queue object
        q = pbs.server().queue("live_q")

        # Reset the job's destination queue parameter for this event
        j.queue = q

    # accept the event
    e.accept()
except SystemExit:
    pass
except:
    e.reject("Failed to route job to queue live_q")
vnode = e.vnode
aoe = e.aoe
pbs.logmsg(pbs.LOG_DEBUG, "PROVISIONING: Env = %s" % repr(os.environ))
pbs.logmsg(pbs.LOG_DEBUG, "PROVISIONING: PBS Node = %s" % vnode)
pbs.logmsg(pbs.LOG_DEBUG, "PROVISIONING: AOE = %s" % aoe)

# The provision hook runs on the PBS server, but provisioning is started from
# the admin node; the two may not be the same host.
# Check for the admin node by reading it from the JSON config file.
if 'PBS_HOOK_CONFIG_FILE' in os.environ:
    import json
    config_file = os.environ["PBS_HOOK_CONFIG_FILE"]
    # pbs.logmsg(pbs.EVENT_DEBUG, "%s: Config file is %s" % (caller_name(), config_file))
    config = json.load(open(config_file, 'r'), object_hook=decode_dict)
    server = pbs.server().name
    admin = config['admin-node']
    pbs.logmsg(pbs.LOG_DEBUG, "PROVISIONING: server name = %s" % server)
    pbs.logmsg(pbs.LOG_DEBUG, "PROVISIONING: admin node = %s" % admin)
    if admin == server:
        ret = os.system(
            "/opt/clmgr/contrib/hpcm_pbspro_connector/bin/hpcm_provision.sh "
            + aoe + " " + vnode)
        if ret != 0:
            pbs.logmsg(pbs.LOG_DEBUG,
                       "PROVISIONING: Failed - retcode = %s" % str(ret))
            e.reject("Reboot provisioning failed", ret)
        else:
            e.accept(0)
sys.path += [
    '',
    '/opt/pbs/default/python/lib/python25.zip',
    '/opt/pbs/default/python/lib/python2.5',
    '/opt/pbs/default/python/lib/python2.5/plat-linux2',
    '/opt/pbs/default/python/lib/python2.5/lib-tk',
    '/opt/pbs/default/python/lib/python2.5/lib-dynload',
    '/opt/pbs/default/python/lib/python2.5/site-packages'
]

import subprocess
import re

e = pbs.event()
j = e.job
path = pbs.server().resources_available['store_path']
os.system('/usr/bin/scp ' + '/var/spool/PBS/spool/' + j.id + '.OU' + ' ' + path)
os.system('/usr/bin/scp ' + '/var/spool/PBS/spool/' + j.id + '.ER' + ' ' + path)

f = open("/tmp/hooks", "w")
# for i in dir(pbs.server()):
#     f.write("%s\n" % i)
f.write(repr(pbs.server().resources_available['store_path']))
# f.write(j.Error_Path)
# f.write(j.Output_Path)
f.write('/usr/bin/scp ' + '/var/spool/PBS/spool/' + j.id + '.OU' + ' ' + path)
f.write('/usr/bin/scp ' + '/var/spool/PBS/spool/' + j.id + '.ER' + ' ' + path)
"You can only request one chunk of the form #PBS -l select=N:ncpus=X:mem=Ygb:mpiprocs=Z:ompthreads=W" ) for chunk in chunks: nodect += int(chunk.split(":")[0]) for rs in chunk.split(":")[1:]: kw = rs.split("=")[0] if (kw not in list_of_resources): pbs.event().reject( "Select statements can only contain the resources: " + ", ".join(list_of_resources)) if (kw == "ncpus"): ncpus = int(rs.split("=")[1]) matched = False if nodect <= 18 and ncpus == 24 and walltime <= pbs.duration("2:0:0"): pbs.event().job.queue = pbs.server().queue("short") matched = True if not matched and nodect >= 2 and nodect <= 18 and ncpus == 16 and walltime <= pbs.duration( "72:0:0"): pbs.event().job.queue = pbs.server().queue("general") matched = True if not matched and nodect >= 72 and nodect <= 270 and ncpus in [ 28 ] and walltime <= pbs.duration("24:0:0"): pbs.event().job.queue = pbs.server().queue("capability") matched = True if not matched and nodect >= 18 and nodect <= 72 and ncpus in [ 24, 28 ] and walltime <= pbs.duration("48:0:0"): pbs.event().job.queue = pbs.server().queue("large") matched = True
# To instantiate this hook, specify the following:
#   qmgr -c "create hook load_balance event=exechost_periodic,freq=10"
#   qmgr -c "import hook load_balance application/x-python default load_balance.py"

import pbs
import os
import re

ideal_load = 1.5
max_load = 2.0


# get_la: returns a list of load averages within the past 1-minute, 5-minute,
# 15-minute range.
def get_la():
    line = os.popen("uptime").read()
    r = re.search(r'load average: (\S+), (\S+), (\S+)$', line).groups()
    return map(float, r)


local_node = pbs.get_local_nodename()
vnl = pbs.event().vnode_list
current_state = pbs.server().vnode(local_node).state
mla = get_la()[0]

if (mla >= max_load) and ((current_state & pbs.ND_OFFLINE) == 0):
    vnl[local_node].state = pbs.ND_OFFLINE
    vnl[local_node].comment = "offlined node as it is heavily loaded"
elif (mla < ideal_load) and ((current_state & pbs.ND_OFFLINE) != 0):
    vnl[local_node].state = pbs.ND_FREE
    vnl[local_node].comment = None
import pbs
import sys

# qmgr -c 'create hook check_fair_share_q event="queuejob"'; qmgr -c 's h check_fair_share_q debug=True'
# qmgr -c 'import hook check_fair_share_q application/x-python default /var/spool/PBS/hooks/check_fair_share_q.py'
# qmgr -c 'd h check_fair_share_q'

try:
    e = pbs.event()
    if e.job.queue:
        target_qname = e.job.queue.name
        if (target_qname in ["fair_test"]):
            pbs.server().job()
            e.accept()
            # adisq = pbs.server().queue("adis")
            # adis_used_ncpus = adisq.resources_assigned["ncpus"]
            # if (adis_used_ncpus >= 2):
            #     e.job.queue = adisq
    # accept the event
except:
    # e.reject("Failed to route job to queue adis_0q")
    e.accept()
# Main
sys.path.append(PBS_EXEC + '/python/lib/python2.7')
sys.path.append(PBS_EXEC + '/python/lib/python2.7/lib-dynload')

from subprocess import Popen, PIPE
from sets import Set

try:
    e = pbs.event()
    j = e.job

    # Get the username
    who = str(e.requestor)

    # Get queue
    if j.queue == '':
        q = pbs.server().default_queue.name
    else:
        q = j.queue.name

    # Get permitted_groups or accept the job
    permitted_groups = \
        pbs.server().queue(q).resources_available['permitted_groups']
    if permitted_groups is None:
        e.accept()
    else:
        permitted_groups = permitted_groups.split(',')

    # Build a list of users from all permitted groups
    users = Set([])
    try:
        for g in permitted_groups:
### resource.
if ('PBS_GET_IBWINS' in myjob.Variable_List):
    pbs.logmsg(pbs.LOG_WARNING, "User requested that netwins be calculated")
else:
    sys.exit()

### script Defaults area
### Reasonable internal defaults for networks, instances, and max instances
default_networks = 2
max_instances = 4
instances = 1
msg_api = ""
mpiprocs = 1

# Put this closer to actual reference for efficiency.
mysrv = pbs.server()

debug_me = False
# If resources_available.debug_hooks contains the name of this hook, then we
# turn on the debug flag.
if ("debug_hooks" in mysrv.resources_available and
        my_name in str(mysrv.resources_available['debug_hooks']).split(',')):
    debug_me = True


def dbg_svr_log(string):
    '''quick function to wrap debug logging to the server'''
    # Abort if the hook_debug value is not set
    if (debug_me):
            resources[node_i]['ncpus'] = 0
        resources[node_i]['ncpus'] += int(m.group(1))
    return resources


try:
    e = pbs.event()
    if e.type == pbs.RUNJOB:
        j = e.job
        if j.queue and re.match('^[RM]{1}[0-9]+', j.queue.name):
            e.accept()
        resources = parse_exec_vnode(j.exec_vnode)
        for nodename in resources.keys():
            node = pbs.server().vnode(nodename)
            available_ncpus = node.resources_available['ncpus']
            assigned_ncpus = node.resources_assigned['ncpus']
            try:
                requested_ncpus = resources[nodename]['ncpus']
            except:
                requested_ncpus = 1
            if assigned_ncpus + requested_ncpus > available_ncpus:
                now = time.strftime("%Y%m%d%H%M%S", time.localtime())
                jobs = []
                filename = "/tmp/pbs_overcommit_detector_%s_%s_overcommit_by_%s" \
                    % (now, nodename, j.id)
# specified in 'high_priority_queue', and also tells the server to restart
# the scheduling cycle.  This is for faster qsub -Is throughput.
#
# Prerequisite:
#   The site must define a "high" queue as follows:
#       qmgr -c "create queue high queue_type=e,Priority=150"
#       qenable high
#       qstart high
# NOTE:
#   A) 150 is the default priority for an express (high) queue.
#      This will have the interactive job preempt currently running work.
#   B) If the site does not want this, lower the priority of the high
#      priority queue.  This might not cause the job to run right away,
#      but it will try.
#
# This hook is instantiated as follows:
#   qmgr -c "create hook rapid event=queuejob"
#   qmgr -c "import hook rapid_inter application/x-python default rapid_inter.py"

import pbs

high_priority_queue = "high"

e = pbs.event()
if e.job.interactive:
    high = pbs.server().queue(high_priority_queue)
    if high != None:
        e.job.queue = high
        pbs.logmsg(pbs.LOG_DEBUG, "quick start interactive job")
        pbs.server().scheduler_restart_cycle()
        mynodename = pbs.get_local_nodename()
        pbs.logmsg(pbs.EVENT_DEBUG3, "got node: %s" % (mynodename))
        myvnode.state = pbs.ND_FREE
        pbs.logmsg(pbs.EVENT_DEBUG,
                   "Changed node state to ND_FREE: %s" % (mynodename))
        myvnode.comment = None
        pbs.logmsg(pbs.EVENT_DEBUG, "Onlined node: %s" % (mynodename))
    else:
        return True


if __name__ == "__builtin__":
    start = time.time()
    pbs.logmsg(pbs.EVENT_DEBUG3, "Starting the node health check")
    c = NodeHealthCheck()
    if pbs.event().type == pbs.EXECHOST_PERIODIC:
        vnode = pbs.server().vnode(c.host)
        if vnode.state == pbs.ND_OFFLINE and \
                vnode.comment.startswith('-attn_nhc:'):
            # Still need to flesh out CheckOfflineNode function
            c.CheckOfflineNode()
        else:
            c.CheckNodePeriodic()
    else:
        c.CheckNode()
    pbs.logmsg(pbs.EVENT_DEBUG3,
               "Finished check disk hook: %0.5lf (s)" % (time.time() - start))
e = pbs.event()
if e.job.queue:
    if (e.job.queue.name in ["adis_in", "power_0qa1"]):
        if (e.job.Variable_List["in_queue_credential"] != "1"):
            e.reject("Error, Make sure you use --> qsub_in <-- command "
                     "to submit to the queue!")
        power_0qa1_nodes_list = ['compute-0-166.power5',
                                 'compute-0-167.power5']
        for qnode in power_0qa1_nodes_list:
            node = pbs.server().vnode(qnode)
            node_ncpu_total = node.resources_available["ncpus"]
            node_ncpu_used = node.resources_assigned["ncpus"]
            node_ncpu_free = node_ncpu_total - node_ncpu_used
            job_ncpus = e.job.Resource_List["ncpus"]
            if (job_ncpus > node_ncpu_free):
                pass
            else:
                e.accept()
                break
        adisq = pbs.server().queue("adis")
        e.job.queue = adisq
    if pbs.event().job.project:
        pbs.event().reject(
            "Express project codes can not be used with reservations")
    pbs.event().accept()

# private queues
#   check group membership
#   fix ompthreads/mpiprocs
#   check that selection is valid
if queue_type == "private":
    if pbs.event().job.project:
        pbs.event().reject(
            "Express project codes can not be used with private queues")
        pbs.event().accept()
    queue = pbs.server().queue(pbs.event().job.queue.name)
    if not queue:
        pbs.event().reject("Invalid queue name")
    permitted_groups = queue.resources_available["permitted_groups"]
    if permitted_groups:
        permitted_groups = permitted_groups.split(",")
        if not test_group_membership(permitted_groups):
            pbs.event().reject(
                "You are not authorised to use this private queue")
    fixup_mpiprocs_ompthreads(selection)
    # check_pq_restriction(selection, walltime, pbs.event().job.queue.name)
    pbs.event().accept()

# Express version 0 - accept anything provided the user is in an exp-XXX group
express = False
#
# To instantiate this hook, specify the following:
#   qmgr -c "create hook load_balance event=exechost_periodic,freq=10"
#   qmgr -c "import hook load_balance application/x-python default load_balance.py"

import pbs
import os
import re

ideal_load = 1.5
max_load = 2.0


# get_la: returns a list of load averages within the past 1-minute, 5-minute,
# 15-minute range.
def get_la():
    line = os.popen("uptime").read()
    r = re.search(r'load average: (\S+), (\S+), (\S+)$', line).groups()
    return map(float, r)


local_node = pbs.get_local_nodename()
vnl = pbs.event().vnode_list
current_state = pbs.server().vnode(local_node).state
mla = get_la()[0]

if (mla >= max_load) and ((current_state & pbs.ND_OFFLINE) == 0):
    vnl[local_node].state = pbs.ND_OFFLINE
    vnl[local_node].comment = "offlined node as it is heavily loaded"
elif (mla < ideal_load) and ((current_state & pbs.ND_OFFLINE) != 0):
    vnl[local_node].state = pbs.ND_FREE
    vnl[local_node].comment = None
pbs.logmsg(pbs.EVENT_DEBUG,"Onlined node: %s"%(self.host)) mynodename = pbs.get_local_nodename() myvnode = pbs.event().vnode_list[mynodename] mynodename = pbs.get_local_nodename() pbs.logmsg(pbs.EVENT_DEBUG3,"got node: %s"%(mynodename)) myvnode.state = pbs.ND_FREE pbs.logmsg(pbs.EVENT_DEBUG,"Changed node state to ND_FREE: %s"%(mynodename)) myvnode.comment = None pbs.logmsg(pbs.EVENT_DEBUG,"Onlined node: %s"%(mynodename)) else: return True if __name__ == "__builtin__": start = time.time() pbs.logmsg(pbs.EVENT_DEBUG3,"Starting the node health check") c = NodeHealthCheck() if pbs.event().type == pbs.EXECHOST_PERIODIC: vnode = pbs.server().vnode(c.host) if vnode.state == pbs.ND_OFFLINE and vnode.comment.startswith('-attn_nhc:'): # Still need to flesh out CheckOfflineNode function c.CheckOfflineNode() else: c.CheckNodePeriodic() else: c.CheckNode() pbs.logmsg(pbs.EVENT_DEBUG3,"Finished check disk hook: %0.5lf (s)"%(time.time()-start))
# the scheduling cycle.  This is for faster qsub -Is throughput.
#
# Prerequisite:
#   The site must define a "high" queue as follows:
#       qmgr -c "create queue high queue_type=e,Priority=150"
#       qenable high
#       qstart high
# NOTE:
#   A) 150 is the default priority for an express (high) queue.
#      This will have the interactive job preempt currently running work.
#   B) If the site does not want this, lower the priority of the high
#      priority queue.  This might not cause the job to run right away,
#      but it will try.
#
# This hook is instantiated as follows:
#   qmgr -c "create hook rapid event=queuejob"
#   qmgr -c "import hook rapid_inter application/x-python default rapid_inter.py"

import pbs

high_priority_queue = "high"

e = pbs.event()
if e.job.interactive:
    high = pbs.server().queue(high_priority_queue)
    if high != None:
        e.job.queue = high
        pbs.logmsg(pbs.LOG_DEBUG, "quick start interactive job")
        pbs.server().scheduler_restart_cycle()
now_mn = int(time.strftime("%M", time.gmtime()))
msg = []

if not os.path.isfile(XTHOSTNAME):
    msg += ["No %s file found on this host." % (XTHOSTNAME)]
    __exit_hook(0, msg)

# XTHOSTNAME file found on this host.  Read it to determine our Cray hostname.
with open(XTHOSTNAME) as xthost_file:
    my_crayhost = xthost_file.readline()
    my_crayhost = my_crayhost.rstrip()

msg += ["Processing ALPS inventory for crayhost %s" % (my_crayhost)]

start = time.time()
vnodes = pbs.server().vnodes()
vnodes_query_duration = time.time() - start
if not vnodes:
    msg += ["ALPS Inventory Check: No vnodes reported by PBS"]
    __exit_hook(1, msg)

down_states = pbs.ND_DOWN | pbs.ND_STALE | pbs.ND_STATE_UNKNOWN
for v in vnodes:
    str_v = str(v)
    vntype = " "
    if (v.state & down_states) or \
       "PBScrayhost" not in v.resources_available or \
       v.resources_available["PBScrayhost"] != my_crayhost:
        continue
import pbs
import os
import re
import sys

e = pbs.event()
try:
    if e.job.in_ms_mom():
        exit_code = str(e.job.Exit_status)
        local_node = pbs.get_local_nodename()
        vnl = e.vnode_list
        current_state = pbs.server().vnode(local_node).state
        if (int(exit_code) != 0) and ((current_state == pbs.ND_OFFLINE) == 0):
            vnl[local_node].state = pbs.ND_OFFLINE
            vnl[local_node].comment = "offlined node as it is heavily loaded"
            report_file1 = str("/home/centos/pqr.txt")
            pbs.logmsg(pbs.LOG_DEBUG,
                       "report_usage file1 is %s" % report_file1)
            fd_out1 = open(report_file1, 'w+')
            print >> fd_out1, 'To: [email protected]'
            print >> fd_out1, 'From: [email protected]'
            print >> fd_out1, 'Subject: Node Taken offline'
            print >> fd_out1, 'Node Name: = ' + \
                pbs.server().vnode(local_node).name
            fd_out1.close()
            mail_cmd = "/usr/sbin/sendmail -t \"PBS OSS\" < /home/centos/pqr.txt"
            pbs.logmsg(pbs.LOG_DEBUG, "mail_command is %s" % mail_cmd)
            os.popen(mail_cmd)