def get_fs(s_cmd, debug=False):
    '''run cluster-fairshare to get current fairshare per USER/GROUP/CLASS/QoS'''

    logger = getMyLogger(os.path.basename(__file__))

    if debug:
        logger.setLevel(logging.DEBUG)

    fs = {}

    s = Shell(debug=False)
    rc, output, m = s.cmd1(s_cmd, allowed_exit=[0, 255], timeout=300)

    if rc != 0:
        logger.error('command %s returned non-zero exit code: %d' % (s_cmd, rc))
    else:
        re_user_beg  = re.compile('^USER$')
        re_group_beg = re.compile('^GROUP$')
        re_class_beg = re.compile('^CLASS$')

        idx = {'user': [], 'group': [], 'class': []}

        k = None
        output = output.split('\n')
        for i in xrange(len(output)):
            if re_user_beg.match(output[i].strip()):
                k = 'user'
                idx[k].append(i + 2)
                continue

            if re_group_beg.match(output[i].strip()):
                k = 'group'
                idx[k].append(i + 2)
                continue

            if re_class_beg.match(output[i].strip()):
                k = 'class'
                idx[k].append(i + 2)
                continue

            if k and output[i].strip() == '':
                idx[k].append(i)
                k = None
                continue

        logger.debug('line indices on %s output for USER/GROUP/CLASS' % s_cmd)
        logger.debug(' |_ %s ' % repr(idx))

        re_skip = re.compile('^DEFAULT')
        for k, v in idx.iteritems():
            fs[k] = {}
            if v:
                for line in output[v[0]:v[1]]:
                    data = line.strip().split()
                    if not re_skip.match(data[0]):
                        ## remove the '*' at the tail of userid
                        fs[k][re.sub('\*$', '', data[0])] = float(data[1])

    return fs

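
## Illustrative usage sketch (not part of the original module): dump the
## per-user fairshare table returned by get_fs().  The command name passed
## below is only an assumed example; use the site-specific fairshare command.
def _demo_print_user_fairshare(s_cmd='cluster-fairshare'):
    '''print per-user fairshare values obtained via get_fs()'''
    fs = get_fs(s_cmd, debug=True)
    for uid, share in sorted(fs.get('user', {}).iteritems()):
        print '%-12s %8.3f' % (uid, share)
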
from threading import Thread
from GangaThreadPool import GangaThreadPool
from Common import getMyLogger

logger = getMyLogger('GangaThread')


class GangaThread(Thread):

    def __init__(self, name, auto_register=True, critical=True, **kwds):

        name = 'GANGA_Update_Thread_%s' % name
        Thread.__init__(self, name=name, **kwds)
        self.setDaemon(True)

        self.__should_stop_flag = False
        self.__critical = critical

        if auto_register:
            tpool = GangaThreadPool.getInstance()
            tpool.addServiceThread(self)

    def isCritical(self):
        """Return critical flag.

        @return: Boolean critical flag.
        """
        return self.__critical

    def setCritical(self, critical):
        """Set critical flag, which can be used for example in shutdown
        algorithms.  See Ganga/Core/__init__.py for example.
        """
        self.__critical = critical

def get_complete_jobs(logdir, date, debug=False):
    '''gets all completed jobs on the given date expressed in format of %Y%m%d (e.g. 20140130)'''

    def __convert_memory__(mymem):
        '''convert a memory size string into gb; if no unit is given, bytes are assumed'''
        gb_mem = None
        scale = {'b': 1024**3, 'kb': 1024**2, 'mb': 1024, 'gb': 1}
        re_mem = re.compile('^([0-9]*)([m,k,g]{0,1}b{0,1})$')
        m = re_mem.match(mymem)
        if m:
            size = float(m.group(1))
            unit = m.group(2)
            if not unit:
                unit = 'b'
            gb_mem = size / scale[unit]
        return gb_mem

    def __readfix_xml__(myfile):
        '''read and parse the torque log (XML) file'''
        import xmltodict

        # open xml file and do some fixing
        temp = open(myfile, 'r').read()

        # fix incorrect closing tag and drop the variable list
        #temp = temp.replace('JobId', 'Job_Id')
        temp = re.sub(r'<Variable_List>.*</Variable_List>', '',
                      temp.replace('JobId', 'Job_Id'))

        # fix the fact that there is no overarching beginning and end tag.
        temp = '<data>\n' + temp + '\n</data>'

        # read xml string
        xmldoc = xmltodict.parse(temp)

        # return list containing subdict for each job
        cjobs = xmldoc['data']['Jobinfo']

        # if there is only one JobInfo block, the data is not put into a list
        # to make it consistent, we put data into a list
        if not isinstance(cjobs, list):
            cjobs = [cjobs]

        return cjobs

    ## get list of XML files corresponding to the jobs from the given date
    xmlfiles = glob.glob(os.path.join(logdir, date) + '*')

    jlist = []

    logger = getMyLogger(os.path.basename(__file__))

    if debug:
        logger.setLevel(logging.DEBUG)

    for f in xmlfiles:
        logger.debug('parsing logfile: %s' % f)

        cjobs = __readfix_xml__(f)

        for j in cjobs:
            o = Job(jid=j['Job_Id'],    # torque job id
                    jname=None,         # torque job name
                    jstat=None,         # torque job status
                    jec=None,           # job exit code
                    cstat='unknown',    # category status interpreted from jec
                    uid=None,           # job owner
                    gid=None,           # job owner's group id
                    queue=None,         # job queue
                    rmem=0,             # requested memory in byte
                    rwtime=0,           # requested wall-clock time in second
                    htypes=None,        # the Job's Hold_Types
                    jpath=None,         # the Job's Join_Path
                    cmem=None,          # consumed physical memory in byte
                    cvmem=None,         # consumed virtual memory in byte
                    cwtime=None,        # consumed wall-clock time in second
                    cctime=None,        # consumed CPU time in second
                    node=None,          # compute node host
                    t_submit=None,      # timestamp for job being submitted to Torque
                    t_queue=None,       # timestamp for job being scheduled in the queue
                    t_start=None,       # timestamp for job being started on execution node
                    t_finish=None)      # timestamp for job being completed

            ## handles the retried jobs (separate entry in log file with same job id)
            is_newjob = True
            try:
                o = jlist[jlist.index(o)]
                is_newjob = False
                logger.warning('job already presented in list: %s' % o.jid)
            except:
                pass

            ## attributes may not be available
            ## - job name
            try:
                o.jname = j['Job_Name']
            except KeyError, e:
                logger.warning('cannot find "Job_Name" for job %s' % o.jid)

            ## - resource requirement
            try:
                o.rmem = __convert_memory__(j['Resource_List']['mem'])
                o.rwtime = int(j['Resource_List']['walltime'])
            except KeyError, e:
                logger.warning('cannot find "Resource_List" for job %s' % o.jid)
            except TypeError, e:
                logger.warning('empty "Resource_List" for job %s' % o.jid)

def get_qstat_jobs(s_cmd, node_domain_suffix='dccn.nl', debug=False):
    '''run cluster-qstat to get all job statuses and convert the output into a job info dictionary'''

    def __proc_walltime__(mytime):
        '''convert walltime to sum of minutes'''
        minutes = int(mytime.split(':')[0]) * 60 + int(mytime.split(':')[1])
        return minutes

    logger = getMyLogger(os.path.basename(__file__))

    if debug:
        logger.setLevel(logging.DEBUG)

    logger.debug('qstat command: %s' % s_cmd)

    jlist = {}

    s = Shell(debug=False)
    rc, output, m = s.cmd1(s_cmd, allowed_exit=[0, 255], timeout=300)

    re_jinfo = re.compile('^([0-9\[\]]+)\.\S+\s+' +    # job id
                          '(\w+)\s+' +                 # user id
                          '(\w+)\s+' +                 # queue name
                          '(\S+)\s+' +                 # job name
                          '([-,\d]+)\s+' +             # session id
                          '([-,\d]+)\s+' +             # NDS
                          '([-,\d,\S+]+)\s+' +         # TSK
                          '(\d+)gb\s+' +               # requested physical memory
                          '(\d+:\d+:\d+)\s+' +         # requested wall-clock time
                          '([A-Z]+)\s+' +              # job status (expecting Q,R,C,H,E only)
                          '(\d+:\d+:\d+|-{2,})\s+' +   # wall-clock time consumption
                          '(.*)$')                     # compute node and session
                          #'((((dccn-c\d+)/(\d+\+?))|-{2,}){1,})$' )  # compute node and session

    if rc != 0:
        logger.error('command %s returned non-zero exit code: %d' % (s_cmd, rc))
    else:

        def __apply_domain_suffix__(node_hostname):
            host = node_hostname.split('/')[0]
            if host != '--':
                host += '.' + node_domain_suffix
            return host

        for l in output.split('\n'):
            l = l.strip()
            m = re_jinfo.match(l)
            if m:
                nodelist = []
                if m.group(12) != '--':
                    nodelist = map(lambda x: __apply_domain_suffix__(x),
                                   m.group(12).split('+'))

                j = Job(jid=m.group(1),
                        uid=m.group(2),
                        queue=m.group(3),
                        jname=m.group(4),
                        sid=m.group(5),
                        nds=m.group(6),
                        tsk=m.group(7),
                        rmem=int(m.group(8)),
                        rtime=__proc_walltime__(m.group(9)),
                        jstat=m.group(10),
                        ctime=m.group(11),
                        node=nodelist)

                if j.jstat not in jlist.keys():
                    jlist[j.jstat] = []

                jlist[j.jstat].append(j)
            else:
                logger.warning('qstat line not parsed: %s' % l)

    return jlist

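
## Illustrative usage sketch (not part of the original module): summarise the
## dictionary returned by get_qstat_jobs() as a per-state job count.  The
## command name below is only an assumed example.
def _demo_job_state_summary(s_cmd='cluster-qstat'):
    '''print the number of jobs found in each Torque state (e.g. Q, R, C, H, E)'''
    jlist = get_qstat_jobs(s_cmd)
    for stat in sorted(jlist.keys()):
        print 'state %s: %d job(s)' % (stat, len(jlist[stat]))
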
def get_cluster_node_properties(node_domain_suffix='dccn.nl', debug=False):
    '''parse pbsnodes -a to get node properties'''

    logger = getMyLogger(os.path.basename(__file__))

    if debug:
        logger.setLevel(logging.DEBUG)

    s = Shell(debug=False)

    ## get scale factor for node speed
    cmd = 'hpcutil cluster config | grep NODECFG | grep SPEED | awk "{print $1 $2}"'
    rc, output, m = s.cmd1(cmd, allowed_exit=[0, 255], timeout=300)

    speeds = {}
    re_node_speed = re.compile('^NODECFG\[(.*)\]\s+SPEED=(\S+)$')
    if rc == 0:
        for l in output.split('\n'):
            l = l.strip()
            m = re_node_speed.match(l)
            if m:
                speeds[m.group(1)] = float(m.group(2))

    ## get node information
    nodes = []
    cmd = 'ssh torque "pbsnodes -a"'
    s = Shell(debug=False)
    rc, output, m = s.cmd1(cmd, allowed_exit=[0, 255], timeout=300)

    if rc != 0:
        logger.error('command %s returned non-zero exit code: %d' % (cmd, rc))
    else:
        re_host = re.compile('^(\S+)$')
        re_jobs = re.compile('^\s+jobs\s+\=\s+(\S+)$')
        re_stat = re.compile('^\s+state\s+\=\s+(\S+)$')
        re_np   = re.compile('^\s+np\s+\=\s+(\d+)$')
        re_ngp  = re.compile('^\s+gpus\s+\=\s+(\d+)$')
        re_prop = re.compile('^\s+properties\s+\=\s+(\S+)$')
        re_mem  = re.compile('^ram(\d+)gb$')
        re_net  = re.compile('^network(\S+)$')

        output = output.split('\n')

        n = None
        for l in output:
            l = l.rstrip()

            m = re_host.match(l)
            if m:
                n = Node(host=m.group(1),     # hostname
                         stat='free',         # state
                         ncores=1,            # ncores
                         ncores_idle=1,       # ncores idling
                         ncores_inter=0,      # ncores running interactive jobs
                         ncores_matlab=0,     # ncores running batch-mode matlab jobs
                         ncores_vgl=0,        # ncores running vgl jobs
                         ncores_batch=0,      # ncores running batch jobs
                         cpu_type='',         # CPU type
                         cpu_speed=1.0,       # CPU speed scale
                         mem=1,               # memory total
                         memleft=1,           # memory left
                         memleft_c=1,         # avg. memory left per core
                         ngpus=0,             # number of GPUs
                         net='',              # network connectivity
                         interactive=False,   # node allowing interactive jobs
                         matlab=False,        # node allowing matlab batch jobs
                         vgl=False,           # node allowing VirtualGL jobs
                         batch=False,         # node allowing batch jobs
                         props=[],            # other queue properties
                         jobs={})             # jobs and allocated core ids
                continue

            m = re_stat.match(l)
            if m:
                n.stat = m.group(1)
                continue

            m = re_np.match(l)
            if m:
                n.ncores = int(m.group(1))
                n.ncores_idle = n.ncores
                continue

            m = re_prop.match(l)
            if m:
                data = m.group(1).split(',')

                ## TODO: find a better way to get CPU type, as here
                ##       the implementation assumes the first 2 properties
                ##       are always "cpu brand" and "cpu model".  For example,
                ##
                ##           properties = intel,e5-2680
                ##       or
                ##           properties = amd,epyc7351
                n.cpu_type = ' '.join(data[0:2])

                ## try to get the CPU speed factor if available
                try:
                    n.cpu_speed = speeds[n.host]
                except KeyError, e:
                    pass

                for d in data[2:]:
                    mm = re_mem.match(d)
                    if mm:
                        n.mem = int(mm.group(1))
                        n.memleft = n.mem
                        continue

                    mm = re_net.match(d)
                    if mm:
                        n.net = mm.group(1)
                        continue

                    n.props.append(d)

                ## update job type support according to node properties
                n.interactive = 'interactive' in n.props
                n.matlab = 'matlab' in n.props
                n.vgl = 'vgl' in n.props
                n.batch = 'batch' in n.props
                continue

            m = re_jobs.match(l)
            if m:
                ## jobs = 0-3/18316136[3].dccn-l029.dccn.nl,1,4-7/18316136[4].dccn-l029.dccn.nl,...
                for job_str in m.group(1).replace(node_domain_suffix + ',',
                                                  node_domain_suffix + ':').split(':'):
                    job_data = job_str.split('/')
                    job_id = job_data[1].split('.')[0]
                    if job_id not in n.jobs.keys():
                        n.jobs[job_id] = []
                    for id_data in job_data[0].split(','):
                        id_beg = int(id_data.split('-')[0])
                        id_end = int(id_data.split('-')[-1]) + 1
                        n.jobs[job_id] += range(id_beg, id_end)
                continue

            m = re_ngp.match(l)
            if m:
                n.ngpus = int(m.group(1))
                continue

            if l == '':
                if n not in nodes:
                    ## avoid duplicate node entry
                    n.memleft_c = float(n.mem) / n.ncores
                    nodes.append(n)
                continue

    return nodes

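
## Illustrative usage sketch (not part of the original module): aggregate the
## Node objects returned by get_cluster_node_properties() into a one-line
## capacity overview, using only attributes set by the parser above.
def _demo_cluster_capacity():
    '''print total cores, allocated cores and total memory over all nodes'''
    nodes = get_cluster_node_properties()
    ncores = sum([n.ncores for n in nodes])
    nalloc = sum([len(ids) for n in nodes for ids in n.jobs.values()])
    mem_gb = sum([n.mem for n in nodes])
    print 'cores: %d (allocated: %d), memory: %d gb' % (ncores, nalloc, mem_gb)
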
def get_mentat_node_properties(debug=False):
    '''get mentat node properties (memory, ncores, network, no. active VNC sessions)'''

    logger = getMyLogger(os.path.basename(__file__))

    if debug:
        logger.setLevel(logging.DEBUG)

    conv_mem_gb = {'kB': 1024**2, 'mB': 1024}

    s = Shell(debug=False)

    ## get mentat node properties
    ##  - node name
    ##  - number of cores
    ##  - total memory
    ##  - number of VNC sessions
    ##  - top 5 processes according to CPU utilization
    cmd = 'cluster-ssh -m "grep processor /proc/cpuinfo | wc -l | xargs echo \'ncores: \'; grep MemTotal /proc/meminfo; ps aux | grep Xvnc | grep -v grep | wc -l | xargs echo \'VNC sessions: \'; cat /proc/loadavg | xargs echo \'Load average: \'; ps -eo pcpu,pmem,pid,user,etime,args | grep -v \'ps -eo pcpu,pmem,pid,user,etime,args\' | sort -n -k 1 -r | grep -v \'%CPU\' | head -5"'
    rc, output, m = s.cmd1(cmd, allowed_exit=[0, 255], timeout=300)

    re_node_name = re.compile('^\-*\s+(\S+)\s+\-*$')
    re_ncores    = re.compile('^ncores:\s+(\d+)$')
    re_memory    = re.compile('^MemTotal:\s+((\d+)\s+(\S+))$')
    re_nxvnc     = re.compile('^VNC sessions:\s+(\d+)$')
    re_loadavg   = re.compile('^Load average:\s+([\d|\.]+)\s+([\d|\.]+)\s+([\d|\.]+)\s+([\d|/]+)\s+.*$')
    re_top_ps    = re.compile('^[\d|\.]+\s+[\d|\.]+\s+[\d]+.*$')

    nodes = []

    if rc not in [0, 255]:
        logger.error('command \'%s\' returned non-zero exit code: %d' % (cmd, rc))
    else:
        for l in output.split('\n'):
            logger.debug(l)
            l = l.strip()

            m = re_node_name.match(l)
            if m:
                n = Node(host=m.group(1),   ## hostname
                         ncores=0,          ## number of CPU cores
                         mem=0.,            ## total physical memory
                         nxvnc=0,           ## number of Xvnc sessions
                         load_1m=0,         ## 1 min. load average
                         load_5m=0,         ## 5 min. load average
                         load_10m=0,        ## 10 min. load average
                         total_ps=0,        ## total processes
                         top_ps=[])         ## top processes
                nodes.append(n)
                continue

            m = re_ncores.match(l)
            if m:
                nodes[-1].ncores = int(m.group(1))
                continue

            m = re_memory.match(l)
            if m:
                nodes[-1].mem = math.ceil(float(m.group(2)) / conv_mem_gb[m.group(3)])
                continue

            m = re_nxvnc.match(l)
            if m:
                nodes[-1].nxvnc = int(m.group(1))
                continue

            m = re_loadavg.match(l)
            if m:
                nodes[-1].load_1m = float(m.group(1))
                nodes[-1].load_5m = float(m.group(2))
                nodes[-1].load_10m = float(m.group(3))
                nodes[-1].total_ps = int(m.group(4).split('/')[1])
                continue

            m = re_top_ps.match(l)
            if m:
                nodes[-1].top_ps.append(l)
                continue

    return nodes

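
## Illustrative usage sketch (not part of the original module): report mentat
## nodes whose 5 min. load average exceeds their number of CPU cores.
def _demo_overloaded_mentats():
    '''print mentat nodes that look overloaded, with their Xvnc session count'''
    for n in get_mentat_node_properties():
        if n.ncores and n.load_5m > n.ncores:
            print '%s: load %.2f on %d cores (%d VNC sessions)' % \
                (n.host, n.load_5m, n.ncores, n.nxvnc)
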
def setLogLevel(self, lvl):
    self.logger = getMyLogger('MTRunner', lvl)

def setLogLevel(self, lvl):
    self._lvl = lvl
    self.logger = getMyLogger('MTRunner', lvl)
    for t in self._agents:
        t.setLogLevel(lvl)

from Common import getMyLogger
import time

logger = getMyLogger('GangaThreadPool')


class GangaThreadPool:

    _attributes = ('SHUTDOWN_TIMEOUT',)

    ## GangaThreadPool singleton instance
    _instance = None

    class SingletonHelper:

        def __call__(self, *args, **kw):

            if GangaThreadPool._instance is None:
                object = GangaThreadPool()
                GangaThreadPool._instance = object

            return GangaThreadPool._instance

    getInstance = SingletonHelper()

    def __init__(self):

        if GangaThreadPool._instance is not None:
            raise RuntimeError('Only one instance of GangaThreadPool is allowed!')

        GangaThreadPool._instance = self

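
## Illustrative usage sketch (not part of the original class): getInstance()
## always hands back the same object, so service threads registered anywhere
## in the process end up in a single pool.
def _demo_singleton_access():
    '''show that GangaThreadPool.getInstance() is a process-wide singleton'''
    tpool1 = GangaThreadPool.getInstance()
    tpool2 = GangaThreadPool.getInstance()
    assert tpool1 is tpool2
    return tpool1
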
def get_complete_jobs(logdir, date, debug=False):
    '''gets all completed jobs on the given date expressed in format of %Y%m%d (e.g. 20140130)'''

    def __convert_memory__(mymem):
        '''convert a memory size string into gb; if no unit is given, bytes are assumed'''
        gb_mem = None
        scale = {'b': 1024**3, 'kb': 1024**2, 'mb': 1024, 'gb': 1}
        re_mem = re.compile('^([0-9]*)([m,k,g]{0,1}b{0,1})$')
        m = re_mem.match(mymem)
        if m:
            size = float(m.group(1))
            unit = m.group(2)
            if not unit:
                unit = 'b'
            gb_mem = size / scale[unit]
        return gb_mem

    def __readfix_xml__(myfile):
        '''read and parse the torque log (XML) file'''
        import xmltodict

        # open xml file and do some fixing
        temp = open(myfile, 'r').read()

        # fix incorrect closing tag
        temp = temp.replace('JobId', 'Job_Id')

        # fix the fact that there is no overarching beginning and end tag.
        temp = '<data>\n' + temp + '\n</data>'

        # read xml string
        xmldoc = xmltodict.parse(temp)

        # return list containing subdict for each job
        cjobs = xmldoc['data']['Jobinfo']

        # if there is only one JobInfo block, the data is not put into a list
        # to make it consistent, we put data into a list
        if not isinstance(cjobs, list):
            cjobs = [cjobs]

        return cjobs

    ## get list of XML files corresponding to the jobs from the given date
    xmlfiles = glob.glob(os.path.join(logdir, date) + '*')

    jlist = []

    logger = getMyLogger(os.path.basename(__file__))

    if debug:
        logger.setLevel(logging.DEBUG)

    for f in xmlfiles:
        logger.debug('parsing logfile: %s' % f)

        cjobs = __readfix_xml__(f)

        for j in cjobs:
            o = Job(jid=j['Job_Id'],    # torque job id
                    jname=None,         # torque job name
                    jstat=None,         # torque job status
                    jec=None,           # job exit code
                    cstat='unknown',    # category status interpreted from jec
                    uid=None,           # job owner
                    gid=None,           # job owner's group id
                    queue=None,         # job queue
                    rmem=0,             # requested memory in byte
                    rwtime=0,           # requested wall-clock time in second
                    htypes=None,        # the Job's Hold_Types
                    jpath=None,         # the Job's Join_Path
                    cmem=None,          # consumed physical memory in byte
                    cvmem=None,         # consumed virtual memory in byte
                    cwtime=None,        # consumed wall-clock time in second
                    cctime=None,        # consumed CPU time in second
                    node=None,          # compute node host
                    t_submit=None,      # timestamp for job being submitted to Torque
                    t_queue=None,       # timestamp for job being scheduled in the queue
                    t_start=None,       # timestamp for job being started on execution node
                    t_finish=None)      # timestamp for job being completed

            ## handles the retried jobs (separate entry in log file with same job id)
            is_newjob = True
            try:
                o = jlist[jlist.index(o)]
                is_newjob = False
                logger.warning('job already presented in list: %s' % o.jid)
            except:
                pass

            ## attributes may not be available
            ## - job name
            try:
                o.jname = j['Job_Name']
            except KeyError, e:
                logger.warning('cannot find "Job_Name" for job %s' % o.jid)

            ## - resource requirement
            try:
                o.rmem = __convert_memory__(j['Resource_List']['mem'])
                o.rwtime = int(j['Resource_List']['walltime'])
            except KeyError, e:
                logger.warning('cannot find "Resource_List" for job %s' % o.jid)
            except TypeError, e:
                logger.warning('empty "Resource_List" for job %s' % o.jid)

def get_qstat_jobs(s_cmd, node_domain_suffix='dccn.nl', debug=False):
    '''run cluster-qstat to get all job statuses and convert the output into a job info dictionary'''

    def __proc_walltime__(mytime):
        '''convert walltime to sum of minutes'''
        minutes = int(mytime.split(':')[0]) * 60 + int(mytime.split(':')[1])
        return minutes

    logger = getMyLogger(os.path.basename(__file__))

    if debug:
        logger.setLevel(logging.DEBUG)

    jlist = {}

    s = Shell(debug=False)
    rc, output, m = s.cmd1(s_cmd, allowed_exit=[0, 255], timeout=300)

    re_jinfo = re.compile('^(\S+)\s+' +                # job id
                          '(\w+)\s+' +                 # user id
                          '(\w+)\s+' +                 # queue name
                          '(\S+)\s+' +                 # job name
                          '([-,\d]+)\s+' +             # session id
                          '([-,\d]+)\s+' +             # NDS
                          '([-,\d,\S+]+)\s+' +         # TSK
                          '(\d+)gb\s+' +               # requested physical memory
                          '(\d+:\d+:\d+)\s+' +         # requested wall-clock time
                          '([A-Z]+)\s+' +              # job status (expecting Q,R,C,H,E only)
                          '(\d+:\d+:\d+|-{2,})\s+' +   # wall-clock time consumption
                          '(.*)$')                     # compute node and session
                          #'((((dccn-c\d+)/(\d+\+?))|-{2,}){1,})$' )  # compute node and session

    if rc != 0:
        logger.error('command %s returned non-zero exit code: %d' % (s_cmd, rc))
    else:

        def __apply_domain_suffix__(node_hostname):
            host = node_hostname.split('/')[0]
            if host != '--':
                host += '.' + node_domain_suffix
            return host

        for l in output.split('\n'):
            l = l.strip()
            m = re_jinfo.match(l)
            if m:
                nodelist = ['']
                if m.group(12) != '--':
                    nodelist = map(lambda x: __apply_domain_suffix__(x),
                                   m.group(12).split('+'))

                j = Job(jid=m.group(1),
                        uid=m.group(2),
                        queue=m.group(3),
                        jname=m.group(4),
                        sid=m.group(5),
                        nds=m.group(6),
                        tsk=m.group(7),
                        rmem=int(m.group(8)),
                        rtime=__proc_walltime__(m.group(9)),
                        jstat=m.group(10),
                        ctime=m.group(11),
                        node=nodelist)

                if j.jstat not in jlist.keys():
                    jlist[j.jstat] = []

                jlist[j.jstat].append(j)

    return jlist

def get_cluster_node_properties(debug=False):
    '''parse pbsnodes -a to get node properties'''

    logger = getMyLogger(os.path.basename(__file__))

    if debug:
        logger.setLevel(logging.DEBUG)

    s = Shell(debug=False)

    ## get scale factor for node speed
    cmd = 'cluster-torqueconfig | grep NODECFG | grep SPEED | awk "{print $1 $2}"'
    rc, output, m = s.cmd1(cmd, allowed_exit=[0, 255], timeout=300)

    speeds = {}
    re_node_speed = re.compile('^NODECFG\[(.*)\]\s+SPEED=(\S+)$')
    if rc == 0:
        for l in output.split('\n'):
            l = l.strip()
            m = re_node_speed.match(l)
            if m:
                speeds[m.group(1)] = float(m.group(2))

    ## get node information
    nodes = []
    cmd = 'pbsnodes -a'
    s = Shell(debug=False)
    rc, output, m = s.cmd1(cmd, allowed_exit=[0, 255], timeout=300)

    if rc != 0:
        logger.error('command %s returned non-zero exit code: %d' % (cmd, rc))
    else:
        re_host = re.compile('^(\S+)$')
        re_stat = re.compile('^\s+state\s+\=\s+(\S+)$')
        re_np   = re.compile('^\s+np\s+\=\s+(\d+)$')
        re_prop = re.compile('^\s+properties\s+\=\s+(\S+)$')
        re_mem  = re.compile('^ram(\d+)gb$')
        re_net  = re.compile('^network(\S+)$')

        output = output.split('\n')

        n = None
        for l in output:
            l = l.rstrip()

            m = re_host.match(l)
            if m:
                n = Node(host=m.group(1),     # hostname
                         stat='free',         # state
                         ncores=1,            # ncores
                         ncores_idle=1,       # ncores idling
                         ncores_inter=0,      # ncores running interactive jobs
                         ncores_matlab=0,     # ncores running batch-mode matlab jobs
                         ncores_vgl=0,        # ncores running vgl jobs
                         ncores_batch=0,      # ncores running batch jobs
                         cpu_type='',         # CPU type
                         cpu_speed=1.0,       # CPU speed scale
                         mem=1,               # memory total
                         memleft=1,           # memory left
                         memleft_c=1,         # avg. memory left per core
                         net='',              # network connectivity
                         interactive=False,   # node allowing interactive jobs
                         matlab=False,        # node allowing matlab batch jobs
                         vgl=False,           # node allowing VirtualGL jobs
                         batch=False,         # node allowing batch jobs
                         props=[])            # other queue properties
                continue

            m = re_stat.match(l)
            if m:
                n.stat = m.group(1)
                continue

            m = re_np.match(l)
            if m:
                n.ncores = int(m.group(1))
                n.ncores_idle = n.ncores
                continue

            m = re_prop.match(l)
            if m:
                data = m.group(1).split(',')

                ## TODO: find a better way to get CPU type
                n.cpu_type = ' '.join(data[0:2])

                ## try to get the CPU speed factor if available
                try:
                    n.cpu_speed = speeds[n.host]
                except KeyError, e:
                    pass

                for d in data[2:]:
                    mm = re_mem.match(d)
                    if mm:
                        n.mem = int(mm.group(1))
                        n.memleft = n.mem
                        continue

                    mm = re_net.match(d)
                    if mm:
                        n.net = mm.group(1)
                        continue

                    n.props.append(d)

                ## update job type support according to node properties
                n.interactive = 'interactive' in n.props
                n.matlab = 'matlab' in n.props
                n.vgl = 'vgl' in n.props
                n.batch = 'batch' in n.props
                continue

            if l == '':
                if n not in nodes:
                    ## avoid duplicate node entry
                    n.memleft_c = float(n.mem) / n.ncores
                    nodes.append(n)
                continue

    return nodes
