Example #1
def get_fs(s_cmd, debug=False):
    '''run cluster-fairshare to get the current fairshare per USER/GROUP/CLASS/QoS'''

    logger = getMyLogger(os.path.basename(__file__))

    if debug:
        logger.setLevel(logging.DEBUG)

    fs = {}

    s = Shell(debug=False)
    rc, output, m = s.cmd1(s_cmd, allowed_exit=[0,255], timeout=300)

    if rc != 0:
        logger.error('command %s returned non-zero exit code: %d' % (s_cmd, rc))
    else:
        re_user_beg  = re.compile('^USER$')
        re_group_beg = re.compile('^GROUP$')
        re_class_beg = re.compile('^CLASS$')
 
        idx = {'user':[],'group':[],'class':[]}
        k   = None
        output = output.split('\n')
        for i in xrange(len(output)):
 
            if re_user_beg.match(output[i].strip()):
                k = 'user'
                idx[k].append(i+2)
                continue
 
            if re_group_beg.match(output[i].strip()):
                k = 'group'
                idx[k].append(i+2)
                continue
 
            if re_class_beg.match(output[i].strip()):
                k = 'class'
                idx[k].append(i+2)
                continue
 
            if k and output[i].strip() == '':
                idx[k].append(i)
                k = None 
                continue
 
        logger.debug('line indices on %s output for USER/GROUP/CLASS' % s_cmd )
        logger.debug(' |_ %s ' % repr(idx))

        re_skip = re.compile('^DEFAULT')
        for k,v in idx.iteritems():
            fs[k] = {}
            if v:
                for line in output[v[0]:v[1]]:
                    data  = line.strip().split()
                    if not re_skip.match( data[0] ):
                        ## remove the '*' at the tail of userid
                        fs[k][re.sub('\*$','',data[0])] = float(data[1])
    return fs
Example #2
def get_fs(s_cmd, debug=False):
    '''run cluster-fairshare to get the current fairshare per USER/GROUP/CLASS/QoS'''

    logger = getMyLogger(os.path.basename(__file__))

    if debug:
        logger.setLevel(logging.DEBUG)

    fs = {}

    s = Shell(debug=False)
    rc, output, m = s.cmd1(s_cmd, allowed_exit=[0, 255], timeout=300)

    if rc != 0:
        logger.error('command %s returned non-zero exit code: %d' % (s_cmd, rc))
    else:
        re_user_beg = re.compile('^USER$')
        re_group_beg = re.compile('^GROUP$')
        re_class_beg = re.compile('^CLASS$')

        idx = {'user': [], 'group': [], 'class': []}
        k = None
        output = output.split('\n')
        for i in xrange(len(output)):

            if re_user_beg.match(output[i].strip()):
                k = 'user'
                idx[k].append(i + 2)
                continue

            if re_group_beg.match(output[i].strip()):
                k = 'group'
                idx[k].append(i + 2)
                continue

            if re_class_beg.match(output[i].strip()):
                k = 'class'
                idx[k].append(i + 2)
                continue

            if k and output[i].strip() == '':
                idx[k].append(i)
                k = None
                continue

        logger.debug('line indices on %s output for USER/GROUP/CLASS' % s_cmd)
        logger.debug(' |_ %s ' % repr(idx))

        re_skip = re.compile('^DEFAULT')
        for k, v in idx.iteritems():
            fs[k] = {}
            if v:
                for line in output[v[0]:v[1]]:
                    data = line.strip().split()
                    if not re_skip.match(data[0]):
                        ## remove the '*' at the tail of userid
                        fs[k][re.sub('\*$', '', data[0])] = float(data[1])
    return fs
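
# A minimal usage sketch for get_fs(); the command string below is an
# assumption for illustration, the actual wrapper name comes from the site
# setup (see the docstring above).
fs = get_fs('cluster-fairshare', debug=True)
for uid, share in sorted(fs['user'].items()):
    print '%-12s %6.3f' % (uid, share)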
Example #3
from threading import Thread
from GangaThreadPool import GangaThreadPool
from Common import getMyLogger

logger = getMyLogger('GangaThread')


class GangaThread(Thread):
    def __init__(self, name, auto_register=True, critical=True, **kwds):

        name = 'GANGA_Update_Thread_%s' % name

        Thread.__init__(self, name=name, **kwds)
        self.setDaemon(True)
        self.__should_stop_flag = False
        self.__critical = critical

        if auto_register:
            tpool = GangaThreadPool.getInstance()
            tpool.addServiceThread(self)

    def isCritical(self):
        """Return critical flag.
        
        @return: Boolean critical flag.
        """
        return self.__critical

    def setCritical(self, critical):
        """Set critical flag, which can be used for example in shutdown
        algorithms. See Ganga/Core/__init__.py for example.

        @param critical: Boolean critical flag.
        """
        self.__critical = critical  # assumed completion of the truncated example
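
# A minimal usage sketch, assuming the imports above resolve; the subclass and
# its run() body are hypothetical.
class MonitorThread(GangaThread):
    def run(self):
        logger.info('service thread %s started' % self.getName())

t = MonitorThread('monitor')  # auto-registered in the GangaThreadPool
t.start()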
Example #4
def get_complete_jobs(logdir, date, debug=False):
    '''get all completed jobs on the given date, expressed in the format %Y%m%d (e.g. 20140130)'''
    def __convert_memory__(mymem):
        '''convert a memory size string to GB; the unit defaults to bytes when not specified'''

        gb_mem = None

        scale = {'b': 1024**3, 'kb': 1024**2, 'mb': 1024, 'gb': 1}

        re_mem = re.compile(r'^([0-9]+)([mkg]?b?)$')

        m = re_mem.match(mymem)

        if m:
            size = float(m.group(1))
            unit = m.group(2)
            if not unit:
                unit = 'b'

            gb_mem = size / scale[unit]

        return gb_mem

    def __readfix_xml__(myfile):
        '''read and parse the torque log (XML) file'''
        import xmltodict

        # open xml file and do some fixing
        temp = open(myfile, 'r').read()
        # fix the incorrect closing tag and strip the Variable_List block
        temp = re.sub(r'<Variable_List>.*</Variable_List>', '',
                      temp.replace('JobId', 'Job_Id'))
        # fix the fact that there is no overarching beginning and end tag.
        temp = '<data>\n' + temp + '\n</data>'

        # read xml string
        xmldoc = xmltodict.parse(temp)

        # return list containing subdict for each job
        cjobs = xmldoc['data']['Jobinfo']

        # if there is only one JobInfo block, the data is not put into a list
        # to make it consistent, we put data into a list
        if not isinstance(cjobs, list):
            cjobs = [cjobs]

        return cjobs

    ## get list of XML files corresponding to the jobs from the given date
    xmlfiles = glob.glob(os.path.join(logdir, date) + '*')

    jlist = []

    logger = getMyLogger(os.path.basename(__file__))

    if debug:
        logger.setLevel(logging.DEBUG)

    for f in xmlfiles:

        logger.debug('parsing logfile: %s' % f)

        cjobs = __readfix_xml__(f)

        for j in cjobs:

            o = Job(
                jid=j['Job_Id'],  # torque job id
                jname=None,  # torque job name
                jstat=None,  # torque job status
                jec=None,  # job exit code
                cstat='unknown',  # category status interpreted from jec 
                uid=None,  # job owner
                gid=None,  # job owner's group id
                queue=None,  # job queue
                rmem=0,  # requested memory in byte 
                rwtime=0,  # requested wall-clock time in second
                htypes=None,  # the Job's Hold_Types 
                jpath=None,  # the Job's Join_Path 
                cmem=None,  # consumed physical memory in byte
                cvmem=None,  # consumed virtual memory in byte
                cwtime=None,  # consumed wall-clock time in second
                cctime=None,  # consumed CPU time in second
                node=None,  # compute node host
                t_submit=None,  # timestamp for job being submitted to Torque
                t_queue=None,  # timestamp for job being scheduled in the queue 
                t_start=None,  # timestamp for job being started on execution node
                t_finish=None  # timestamp for job being completed
            )

            ## handles retried jobs (separate entry in the log file with the same job id)
            is_newjob = True
            try:
                o = jlist[jlist.index(o)]
                is_newjob = False
                logger.warning('job already present in list: %s' % o.jid)
            except ValueError:
                pass

            ## attributes may not be available
            ## - job name
            try:
                o.jname = j['Job_Name']
            except KeyError, e:
                logger.warning('cannot find "Job_Name" for job %s' % o.jid)

            ## - resource requirement
            try:
                o.rmem = __convert_memory__(j['Resource_List']['mem'])
                o.rwtime = int(j['Resource_List']['walltime'])
            except KeyError, e:
                logger.warning('cannot find "Resource_List" for job %s' %
                               o.jid)
            except TypeError, e:
                logger.warning('empty "Resource_List" for job %s' % o.jid)
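
# The example is truncated here; a plausible continuation (an assumption, not
# the original code) would append newly seen jobs and return the list:
#
#     if is_newjob:
#         jlist.append(o)
#
#     return jlist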
Example #5
def get_qstat_jobs(s_cmd, node_domain_suffix='dccn.nl', debug=False):
    '''run cluster-qstat to get the status of all jobs and convert the output into a job info dictionary'''

    def __proc_walltime__(mytime):
        '''convert walltime to sum of minutes'''
        minutes = int(mytime.split(':')[0]) * 60 + int(mytime.split(':')[1])
        return minutes

    logger = getMyLogger(os.path.basename(__file__))
    if debug:
        logger.setLevel(logging.DEBUG)

    jlist = {}

    s = Shell(debug=False)
    rc, output, m = s.cmd1(s_cmd, allowed_exit=[0, 255], timeout=300)

    re_jinfo = re.compile(
        '^([0-9\[\]]+)\.\S+\s+' +  # job id
        '(\w+)\s+' +  # user id
        '(\w+)\s+' +  # queue name
        '(\S+)\s+' +  # job name
        '([-,\d]+)\s+' +  # session id
        '([-,\d]+)\s+' +  # NDS
        '([-,\d,\S+]+)\s+' +  # TSK
        '(\d+)gb\s+' +  # requested physical memory 
        '(\d+:\d+:\d+)\s+' +  # requested wall-clock time
        '([A-Z]+)\s+' +  # job status (expecting Q,R,C,H,E only)
        '(\d+:\d+:\d+|-{2,})\s+' +  # wall-clock time consumption
        '(.*)$')  # compute node and session

    if rc != 0:
        logger.error('command %s returned non-zero exit code: %d' % (s_cmd, rc))
    else:

        def __apply_domain_suffix__(node_hostname):
            host = node_hostname.split('/')[0]
            if host != '--':
                host += '.' + node_domain_suffix
            return host

        for l in output.split('\n'):
            l = l.strip()
            m = re_jinfo.match(l)

            if m:
                nodelist = []
                if m.group(12) != '--':
                    nodelist = map(lambda x: __apply_domain_suffix__(x),
                                   m.group(12).split('+'))

                j = Job(jid=m.group(1),
                        uid=m.group(2),
                        queue=m.group(3),
                        jname=m.group(4),
                        sid=m.group(5),
                        nds=m.group(6),
                        tsk=m.group(7),
                        rmem=int(m.group(8)),
                        rtime=__proc_walltime__(m.group(9)),
                        jstat=m.group(10),
                        ctime=m.group(11),
                        node=nodelist)

                if j.jstat not in jlist.keys():
                    jlist[j.jstat] = []

                jlist[j.jstat].append(j)
            else:
                logger.warning('qstat line not parsed: %s' % l)

    return jlist
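
# A minimal usage sketch; 'cluster-qstat' as the command string is an
# assumption taken from the docstring above.
jlist = get_qstat_jobs('cluster-qstat')
for stat in sorted(jlist):
    print '%s: %d job(s)' % (stat, len(jlist[stat]))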
Example #6
def get_cluster_node_properties(node_domain_suffix='dccn.nl', debug=False):
    '''parse pbsnodes -a to get node properties'''

    logger = getMyLogger(os.path.basename(__file__))

    if debug:
        logger.setLevel(logging.DEBUG)

    s = Shell(debug=False)

    ## get scale factor for node speed
    cmd = 'hpcutil cluster config | grep NODECFG | grep SPEED | awk "{print $1 $2}"'
    rc, output, m = s.cmd1(cmd, allowed_exit=[0, 255], timeout=300)

    speeds = {}
    re_node_speed = re.compile('^NODECFG\[(.*)\]\s+SPEED=(\S+)$')
    if rc == 0:
        for l in output.split('\n'):
            l = l.strip()
            m = re_node_speed.match(l)
            if m:
                speeds[m.group(1)] = float(m.group(2))

    ## get node information
    nodes = []

    cmd = 'ssh torque "pbsnodes -a"'
    s = Shell(debug=False)
    rc, output, m = s.cmd1(cmd, allowed_exit=[0, 255], timeout=300)

    if rc != 0:
        logger.error('command %s returned non-zero exit code: %d' % (cmd, rc))
    else:
        re_host = re.compile('^(\S+)$')
        re_jobs = re.compile('^\s+jobs\s+\=\s+(\S+)$')
        re_stat = re.compile('^\s+state\s+\=\s+(\S+)$')
        re_np = re.compile('^\s+np\s+\=\s+(\d+)$')
        re_ngp = re.compile('^\s+gpus\s+\=\s+(\d+)$')
        re_prop = re.compile('^\s+properties\s+\=\s+(\S+)$')
        re_mem = re.compile('^ram(\d+)gb$')
        re_net = re.compile('^network(\S+)$')

        output = output.split('\n')

        n = None
        for l in output:

            l = l.rstrip()

            m = re_host.match(l)
            if m:
                n = Node(
                    host=m.group(1),  # hostname
                    stat='free',  # state
                    ncores=1,  # ncores
                    ncores_idle=1,  # ncores idling
                    ncores_inter=0,  # ncores running interactive jobs 
                    ncores_matlab=0,  # ncores running batch-mode matlab jobs 
                    ncores_vgl=0,  # ncores running vgl jobs
                    ncores_batch=0,  # ncores running batch jobs
                    cpu_type='',  # CPU type
                    cpu_speed=1.0,  # CPU speed scale 
                    mem=1,  # memory total
                    memleft=1,  # memory left
                    memleft_c=1,  # avg. memory left per core
                    ngpus=0,  # number of GPUs
                    net='',  # network connectivity
                    interactive=False,  # node allowing interactive jobs 
                    matlab=False,  # node allowing matlab batch jobs 
                    vgl=False,  # node allowing VirtualGL jobs 
                    batch=False,  # node allowing batch jobs 
                    props=[],  # other queue properties
                    jobs={})  # jobs and allocated core ids
                continue

            m = re_stat.match(l)
            if m:
                n.stat = m.group(1)
                continue

            m = re_np.match(l)
            if m:
                n.ncores = int(m.group(1))
                n.ncores_idle = n.ncores
                continue

            m = re_prop.match(l)
            if m:
                data = m.group(1).split(',')

                ## TODO: find a better way to get CPU type, as here
                ##       the implementation assumes the first 2 properties
                ##       are always "cpu brand" and "cpu model". For example,
                ##
                ##           properties = intel,e5-2680
                ##
                ##       or
                ##           properties = amd,epyc7351
                ##
                n.cpu_type = ' '.join(data[0:2])

                ## try to get the CPU speed factor if available
                try:
                    n.cpu_speed = speeds[n.host]
                except KeyError, e:
                    pass

                for d in data[2:]:
                    mm = re_mem.match(d)
                    if mm:
                        n.mem = int(mm.group(1))
                        n.memleft = n.mem
                        continue
                    mm = re_net.match(d)
                    if mm:
                        n.net = mm.group(1)
                        continue

                    n.props.append(d)

                ## update job type support according to node properties
                n.interactive = 'interactive' in n.props
                n.matlab = 'matlab' in n.props
                n.vgl = 'vgl' in n.props
                n.batch = 'batch' in n.props

                continue

            m = re_jobs.match(l)
            if m:
                #jobs = 0-3/18316136[3].dccn-l029.dccn.nl,1,4-7/18316136[4].dccn-l029.dccn.nl,...
                for job_str in m.group(1).replace(node_domain_suffix + ',',
                                                  node_domain_suffix +
                                                  ':').split(':'):
                    job_data = job_str.split('/')
                    job_id = job_data[1].split('.')[0]
                    if job_id not in n.jobs.keys():
                        n.jobs[job_id] = []
                    for id_data in job_data[0].split(','):
                        id_beg = int(id_data.split('-')[0])
                        id_end = int(id_data.split('-')[-1]) + 1
                        n.jobs[job_id] += range(id_beg, id_end)
                continue

            m = re_ngp.match(l)
            if m:
                n.ngpus = int(m.group(1))
                continue

            if l == '':
                if n not in nodes:  ## avoid duplicate node entry
                    n.memleft_c = float(n.mem) / n.ncores
                    nodes.append(n)
                continue

    return nodes  ## assumed completion of the truncated example
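
# A minimal usage sketch over the returned Node objects (attributes as
# initialised above).
for n in get_cluster_node_properties():
    print '%-16s %2d cores  %3d GB ram  %d gpu(s)  %s' % (n.host, n.ncores, n.mem, n.ngpus, n.stat)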
Example #7
def get_mentat_node_properties(debug=False):
    '''get mentat node properties (memory, ncores, network, no. of active VNC sessions)'''
    logger = getMyLogger(os.path.basename(__file__))

    if debug:
        logger.setLevel(logging.DEBUG)

    conv_mem_gb = {'kB': 1024**2, 'mB': 1024}

    s = Shell(debug=False)

    ## get mentat node properties
    ##  - node name
    ##  - number of cores
    ##  - total memory
    ##  - number of VNC sessions
    ##  - top 5 processes according to CPU utilization
    cmd = 'cluster-ssh -m "grep processor /proc/cpuinfo | wc -l | xargs echo \'ncores: \'; grep MemTotal /proc/meminfo; ps aux | grep Xvnc | grep -v grep | wc -l | xargs echo \'VNC sessions: \'; cat /proc/loadavg | xargs echo \'Load average: \'; ps -eo pcpu,pmem,pid,user,etime,args | grep -v \'ps -eo pcpu,pmem,pid,user,etime,args\' | sort -n -k 1 -r | grep -v \'%CPU\' | head -5"'
    rc, output, m = s.cmd1(cmd, allowed_exit=[0, 255], timeout=300)

    re_node_name = re.compile('^\-*\s+(\S+)\s+\-*$')
    re_ncores = re.compile('^ncores:\s+(\d+)$')
    re_memory = re.compile('^MemTotal:\s+((\d+)\s+(\S+))$')
    re_nxvnc = re.compile('^VNC sessions:\s+(\d+)$')
    re_loadavg = re.compile(
        '^Load average:\s+([\d|\.]+)\s+([\d|\.]+)\s+([\d|\.]+)\s+([\d|/]+)\s+.*$'
    )
    re_top_ps = re.compile('^[\d|\.]+\s+[\d|\.]+\s+[\d]+.*$')
    nodes = []

    if rc not in [0, 255]:
        logger.error('command \'%s\' returned non-zero exit code: %d' % (cmd, rc))
    else:
        for l in output.split('\n'):

            logger.debug(l)

            l = l.strip()

            m = re_node_name.match(l)
            if m:
                n = Node(
                    host=m.group(1),  ## hostname
                    ncores=0,  ## number of CPU cores
                    mem=0.,  ## total physical memory
                    nxvnc=0,  ## number of Xvnc session
                    load_1m=0,  ## 1 min. load average 
                    load_5m=0,  ## 5 min. load average 
                    load_10m=0,  ## 10 min. load average 
                    total_ps=0,  ## total processes 
                    top_ps=[])  ## top processes

                nodes.append(n)
                continue

            m = re_ncores.match(l)
            if m:
                nodes[-1].ncores = int(m.group(1))
                continue

            m = re_memory.match(l)
            if m:
                nodes[-1].mem = math.ceil(
                    float(m.group(2)) / conv_mem_gb[m.group(3)])
                continue

            m = re_nxvnc.match(l)
            if m:
                nodes[-1].nxvnc = int(m.group(1))
                continue

            m = re_loadavg.match(l)
            if m:
                nodes[-1].load_1m = float(m.group(1))
                nodes[-1].load_5m = float(m.group(2))
                nodes[-1].load_10m = float(m.group(3))
                nodes[-1].total_ps = int(m.group(4).split('/')[1])
                continue

            m = re_top_ps.match(l)
            if m:
                nodes[-1].top_ps.append(l)
                continue

    return nodes
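
# A minimal usage sketch over the returned Node objects.
for n in get_mentat_node_properties():
    print '%-16s load %5.2f  %d VNC session(s)' % (n.host, n.load_1m, n.nxvnc)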
Example #8
def setLogLevel(self, lvl):
    self.logger = getMyLogger('MTRunner', lvl)
Example #9
def setLogLevel(self, lvl):
    self._lvl = lvl
    self.logger = getMyLogger('MTRunner', lvl)
    for t in self._agents:
        t.setLogLevel(lvl)
Example #10
from Common import getMyLogger
import time

logger = getMyLogger('GangaThreadPool')


class GangaThreadPool:

    _attributes = ('SHUTDOWN_TIMEOUT',)

    ## GangaThreadPool singleton instance
    _instance = None

    class SingletonHelper:
        def __call__(self, *args, **kw):

            if GangaThreadPool._instance is None:
                instance = GangaThreadPool()
                GangaThreadPool._instance = instance

            return GangaThreadPool._instance

    getInstance = SingletonHelper()

    def __init__(self):

        if GangaThreadPool._instance is not None:
            raise RuntimeError, 'Only one instance of GangaThreadPool is allowed!'

        GangaThreadPool._instance = self
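
# A minimal usage sketch: the pool is a singleton, so repeated getInstance()
# calls hand back the same object.
pool_a = GangaThreadPool.getInstance()
pool_b = GangaThreadPool.getInstance()
assert pool_a is pool_b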
Example #11
def get_complete_jobs(logdir, date, debug=False):
    '''get all completed jobs on the given date, expressed in the format %Y%m%d (e.g. 20140130)'''

    def __convert_memory__(mymem):
        '''convert a memory size string to GB; the unit defaults to bytes when not specified'''

        gb_mem = None

        scale = {'b': 1024**3, 'kb': 1024**2, 'mb':1024, 'gb': 1 }

        re_mem = re.compile(r'^([0-9]+)([mkg]?b?)$')

        m = re_mem.match(mymem)

        if m:
            size   = float( m.group(1) )
            unit   = m.group(2)
            if not unit:
                unit = 'b'

            gb_mem = size / scale[unit]

        return gb_mem

    def __readfix_xml__(myfile):
        '''read and parse the torque log (XML) file'''
        import xmltodict
 
        # open xml file and do some fixing
        temp = open(myfile, 'r').read()
        # fix incorrect closing tag
        temp = temp.replace('JobId', 'Job_Id')
        # fix the fact that there is no overarching beginning and end tag.
        temp = '<data>\n' + temp + '\n</data>'
 
        # read xml string
        xmldoc = xmltodict.parse(temp)
 
        # return list containing subdict for each job
        cjobs = xmldoc['data']['Jobinfo']
 
        # if there is only one JobInfo block, the data is not put into a list
        # to make it consistent, we put data into a list
        if not isinstance( cjobs, list ):
            cjobs = [ cjobs ]

        return cjobs

    ## get list of XML files corresponding to the jobs from the given date 
    xmlfiles = glob.glob( os.path.join(logdir, date) + '*' )

    jlist = []

    logger = getMyLogger(os.path.basename(__file__))

    if debug:
        logger.setLevel(logging.DEBUG)

    for f in xmlfiles:

        logger.debug('parsing logfile: %s' % f)

        cjobs = __readfix_xml__(f)

        for j in cjobs:

            o = Job( jid      = j['Job_Id'],                  # torque job id
                     jname    = None,                         # torque job name
                     jstat    = None,                         # torque job status
                     jec      = None,                         # job exit code
                     cstat    = 'unknown',                    # category status interpreted from jec 
                     uid      = None,                         # job owner
                     gid      = None,                         # job owner's group id
                     queue    = None,                         # job queue
                     rmem     = 0,                            # requested memory in byte 
                     rwtime   = 0,                            # requested wall-clock time in second
                     htypes   = None,                         # the Job's Hold_Types 
                     jpath    = None,                         # the Job's Join_Path 
                     cmem     = None,                         # consumed physical memory in byte
                     cvmem    = None,                         # consumed virtual memory in byte
                     cwtime   = None,                         # consumed wall-clock time in second
                     cctime   = None,                         # consumed CPU time in second
                     node     = None,                         # compute node host
                     t_submit = None,                         # timestamp for job being submitted to Torque
                     t_queue  = None,                         # timestamp for job being scheduled in the queue 
                     t_start  = None,                         # timestamp for job being started on execution node 
                     t_finish = None                          # timestamp for job being completed 
                   )
           
            ## handles retried jobs (separate entry in the log file with the same job id)
            is_newjob = True
            try:
                o = jlist[ jlist.index(o) ]
                is_newjob = False
                logger.warning('job already present in list: %s' % o.jid)
            except ValueError:
                pass
 
            ## attributes may not be available 
            ## - job name
            try:
                o.jname  = j['Job_Name']
            except KeyError,e:
                logger.warning('cannot find "Job_Name" for job %s' % o.jid)

            ## - resource requirement
            try:
                o.rmem   = __convert_memory__( j['Resource_List']['mem'] )
                o.rwtime = int( j['Resource_List']['walltime'] )
            except KeyError,e:
                logger.warning('cannot find "Resource_List" for job %s' % o.jid)
            except TypeError,e:
                logger.warning('empty "Resource_List" for job %s' % o.jid)
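
# This example is truncated at the same point as Example #4 above; the sketched
# continuation given there (append the new job if is_newjob, then return jlist)
# applies here as well.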
Example #12
def get_qstat_jobs(s_cmd, node_domain_suffix='dccn.nl', debug=False):
    '''run cluster-qstat to get the status of all jobs and convert the output into a job info dictionary'''

    def __proc_walltime__(mytime):
        '''convert walltime to sum of minutes'''
        minutes = int(mytime.split(':')[0]) * 60 + int(mytime.split(':')[1])
        return minutes

    logger = getMyLogger(os.path.basename(__file__))
    if debug:
        logger.setLevel(logging.DEBUG)

    jlist = {}

    s = Shell(debug=False)
    rc, output, m = s.cmd1(s_cmd, allowed_exit=[0,255], timeout=300)

    re_jinfo  = re.compile ( '^(\S+)\s+'                  +    # job id
                             '(\w+)\s+'                   +    # user id
                             '(\w+)\s+'                   +    # queue name
                             '(\S+)\s+'                   +    # job name
                             '([-,\d]+)\s+'               +    # session id
                             '([-,\d]+)\s+'               +    # NDS
                             '([-,\d,\S+]+)\s+'           +    # TSK
                             '(\d+)gb\s+'                 +    # requested physical memory 
                             '(\d+:\d+:\d+)\s+'           +    # requested wall-clock time
                             '([A-Z]+)\s+'                +    # job status (expecting Q,R,C,H,E only)
                             '(\d+:\d+:\d+|-{2,})\s+'     +    # wall-clock time consumption
                             '(.*)$' )                         # compute node and session

    if rc != 0:
        logger.error('command %s returned non-zero exit code: %d' % (s_cmd, rc))
    else:

        def __apply_domain_suffix__(node_hostname):
            host = node_hostname.split('/')[0]
            if host != '--':
                host += '.' + node_domain_suffix
            return host 

        for l in output.split('\n'):
            l = l.strip()
            m = re_jinfo.match(l)

            if m:
                nodelist = []
                if m.group(12) != '--':
                    nodelist = map( lambda x:__apply_domain_suffix__(x), m.group(12).split('+'))

                j = Job( jid   = m.group(1)               ,
                         uid   = m.group(2)               ,
                         queue = m.group(3)               ,
                         jname = m.group(4)               ,
                         sid   = m.group(5)               ,
                         nds   = m.group(6)               ,
                         tsk   = m.group(7)               ,
                         rmem  = int(m.group(8))          ,
                         rtime = __proc_walltime__(m.group(9)),
                         jstat = m.group(10)              ,
                         ctime = m.group(11)              ,
                         node  = nodelist                 )

                if j.jstat not in jlist.keys():
                    jlist[j.jstat] = []

                jlist[j.jstat].append(j)

    return jlist
Example #13
def get_cluster_node_properties(debug=False):
    '''parse pbsnodes -a to get node properties'''

    logger = getMyLogger(os.path.basename(__file__))

    if debug:
        logger.setLevel(logging.DEBUG)

    s = Shell(debug=False)

    ## get scale factor for node speed
    cmd = 'cluster-torqueconfig | grep NODECFG | grep SPEED | awk "{print $1 $2}"'
    rc, output, m = s.cmd1(cmd, allowed_exit=[0,255], timeout=300)

    speeds = {}
    re_node_speed = re.compile('^NODECFG\[(.*)\]\s+SPEED=(\S+)$')
    if rc == 0:
        for l in output.split('\n'):
            l = l.strip()
            m = re_node_speed.match(l)
            if m:
                speeds[m.group(1)] = float(m.group(2))

    ## get node information 
    nodes = []

    cmd = 'pbsnodes -a'
    s = Shell(debug=False)
    rc, output, m = s.cmd1(cmd, allowed_exit=[0,255], timeout=300)

    if rc != 0:
        logger.error('command %s returned non-zero exit code: %d' % (cmd, rc))
    else:
        re_host = re.compile('^(\S+)$')
        re_stat = re.compile('^\s+state\s+\=\s+(\S+)$')
        re_np   = re.compile('^\s+np\s+\=\s+(\d+)$')
        re_prop = re.compile('^\s+properties\s+\=\s+(\S+)$')
        re_mem  = re.compile('^ram(\d+)gb$')
        re_net  = re.compile('^network(\S+)$')

        output = output.split('\n')

        n = None
        for l in output:

            l = l.rstrip()

            m = re_host.match(l)
            if m:
                n = Node(host          = m.group(1),  # hostname
                         stat          = 'free',      # state
                         ncores        = 1,           # ncores
                         ncores_idle   = 1,           # ncores idling
                         ncores_inter  = 0,           # ncores running interactive jobs 
                         ncores_matlab = 0,           # ncores running batch-mode matlab jobs 
                         ncores_vgl    = 0,           # ncores running vgl jobs
                         ncores_batch  = 0,           # ncores running batch jobs
                         cpu_type      = '',          # CPU type
                         cpu_speed     = 1.0,         # CPU speed scale 
                         mem           = 1,           # memory total
                         memleft       = 1,           # memory left
                         memleft_c     = 1,           # avg. memory left per core
                         net           = '',          # network connectivity
                         interactive   = False,       # node allowing interactive jobs 
                         matlab        = False,       # node allowing matlab batch jobs 
                         vgl           = False,       # node allowing VirtualGL jobs 
                         batch         = False,       # node allowing batch jobs 
                         props         = [])          # other queue properties
                continue
 
            m = re_stat.match(l)
            if m:
                n.stat = m.group(1)
                continue
 
            m = re_np.match(l)
            if m:
                n.ncores      = int(m.group(1))
                n.ncores_idle = n.ncores 
                continue
 
            m = re_prop.match(l)
            if m:
                data = m.group(1).split(',')

                ## TODO: find a better way to get CPU type
                n.cpu_type = ' '.join(data[0:2])

                ## try to get the CPU speed factor if available
                try:
                    n.cpu_speed = speeds[n.host]
                except KeyError, e:
                    pass

                for d in data[2:]:
                    mm = re_mem.match(d)
                    if mm:
                        n.mem     = int( mm.group(1) )
                        n.memleft = n.mem
                        continue
                    mm = re_net.match(d)
                    if mm:
                        n.net = mm.group(1)
                        continue

                    n.props.append(d)

                ## update job type support according to node properties
                n.interactive = 'interactive' in n.props
                n.matlab      = 'matlab'      in n.props
                n.vgl         = 'vgl'         in n.props
                n.batch       = 'batch'       in n.props

                continue

            if l == '':
                if n not in nodes: ## avoid duplicate node entry
                    n.memleft_c = float( n.mem ) / n.ncores
                    nodes.append( n )
                continue

    return nodes  ## assumed completion of the truncated example
Example #14
def get_mentat_node_properties(debug=False):
    '''get mentat node properties (memory, ncores, network, no. of active VNC sessions)'''
    logger = getMyLogger(os.path.basename(__file__))

    if debug:
        logger.setLevel(logging.DEBUG)

    conv_mem_gb = { 'kB': 1024**2, 'mB': 1024 }

    s = Shell(debug=False)

    ## get mentat node properties
    ##  - node name
    ##  - number of cores
    ##  - total memory
    ##  - number of VNC sessions
    ##  - top 5 processes according to CPU utilization 
    cmd = 'cluster-ssh -m "grep processor /proc/cpuinfo | wc -l | xargs echo \'ncores: \'; grep MemTotal /proc/meminfo; ps aux | grep Xvnc | grep -v grep | wc -l | xargs echo \'VNC sessions: \'; cat /proc/loadavg | xargs echo \'Load average: \'; ps -eo pcpu,pmem,pid,user,etime,args | grep -v \'ps -eo pcpu,pmem,pid,user,etime,args\' | sort -n -k 1 -r | grep -v \'%CPU\' | head -5"'
    rc, output, m = s.cmd1(cmd, allowed_exit=[0,255], timeout=300)

    re_node_name = re.compile('^\-*\s+(\S+)\s+\-*$')
    re_ncores    = re.compile('^ncores:\s+(\d+)$')
    re_memory    = re.compile('^MemTotal:\s+((\d+)\s+(\S+))$')
    re_nxvnc     = re.compile('^VNC sessions:\s+(\d+)$')
    re_loadavg   = re.compile('^Load average:\s+([\d|\.]+)\s+([\d|\.]+)\s+([\d|\.]+)\s+([\d|/]+)\s+.*$')
    re_top_ps    = re.compile('^[\d|\.]+\s+[\d|\.]+\s+[\d]+.*$')
    nodes = []

    if rc not in [0,255]:
        logger.error('command \'%s\' returned non-zero exit code: %d' % (cmd, rc))
    else:
        for l in output.split('\n'):

            logger.debug(l)

            l = l.strip()

            m = re_node_name.match(l)
            if m:
                n = Node(host     = m.group(1),   ## hostname
                         ncores   = 0,            ## number of CPU cores
                         mem      = 0.,           ## total physical memory
                         nxvnc    = 0,            ## number of Xvnc session
                         load_1m  = 0,            ## 1 min. load average 
                         load_5m  = 0,            ## 5 min. load average 
                         load_10m = 0,            ## 10 min. load average 
                         total_ps = 0,            ## total processes 
                         top_ps   = [])           ## top processes

                nodes.append( n )
                continue

            m = re_ncores.match(l)
            if m:
                nodes[-1].ncores = int( m.group(1) )
                continue

            m = re_memory.match(l)
            if m:
                nodes[-1].mem = math.ceil( float( m.group(2) ) / conv_mem_gb[ m.group(3) ] )
                continue

            m = re_nxvnc.match(l)
            if m:
                nodes[-1].nxvnc = int( m.group(1) )
                continue

            m = re_loadavg.match(l)
            if m:
                nodes[-1].load_1m  = float( m.group(1) ) 
                nodes[-1].load_5m  = float( m.group(2) ) 
                nodes[-1].load_10m = float( m.group(3) ) 
                nodes[-1].total_ps = int( m.group(4).split('/')[1] )
                continue

            m = re_top_ps.match(l)
            if m:
                nodes[-1].top_ps.append(l)
                continue

    return nodes