def connectDbFile(dbFile, mode='read'):
    """
    Open a sqlite3 connection to dbFile.

    mode 'write': refuse to connect while "<dbFile>-journal" exists, which
                  this code treats as the database being on another connection.
    mode 'read' : refuse to connect when dbFile does not exist.
    Any other mode goes straight to the connection attempt.

    Return a (result, conn) tuple. result is 'passed', 'locked' or 'failed';
    conn is the sqlite3 connection on 'passed' and an empty string otherwise.
    """
    noConnection = ''
    dbFileName = str(dbFile)

    # Write mode: a journal file next to the database blocks the connection.
    if mode == 'write' and os.path.exists(dbFileName + '-journal'):
        common.printWarning(
            '*Warning*: database file "' + dbFileName + '" is on another connection, will not connect it.')
        return ('locked', noConnection)

    # Read mode: there is nothing to read from a missing file.
    if mode == 'read' and not os.path.exists(dbFile):
        common.printError('*Error*: "' + dbFileName + '" No such database file.')
        return ('failed', noConnection)

    try:
        return ('passed', sqlite3.connect(dbFile))
    except Exception as error:
        common.printError('*Error*: Failed on connecting database file "' + dbFileName + '": ' + str(error))
        return ('failed', noConnection)
def getQueueHostInfo():
    """
    Get hosts on (specified) queues.

    Runs "bqueues -l" and scans its output for "QUEUE:" and "HOSTS:" lines.

    Return a dict that maps each queue name to its host list:
      * HOSTS mentions "all hosts used by the OpenLava system"
        -> the result of getHostList() (a warning is printed);
      * HOSTS looks like a host group ("<name>/")
        -> the result of getHostGroupMembers("<name>");
      * otherwise -> the whitespace separated words of the HOSTS line.
    A queue without any HOSTS line maps to an empty list.
    """
    queueHostDic = {}
    # Raw strings, "\s" and "\S" are not valid escapes in a plain string literal.
    queueCompile = re.compile(r'^QUEUE:\s*(\S+)\s*$')
    hostsCompile = re.compile(r'^HOSTS:\s*(.*?)\s*$')
    queue = ''

    # Read everything first, then release the pipe.
    with os.popen('bqueues -l') as pipe:
        lines = pipe.readlines()

    for line in lines:
        line = line.strip()

        # A "QUEUE:" line opens a new queue section.
        queueMatch = queueCompile.match(line)
        if queueMatch:
            queue = queueMatch.group(1)
            queueHostDic[queue] = []

        # A "HOSTS:" line belongs to the most recently seen queue.
        hostsMatch = hostsCompile.match(line)
        if hostsMatch:
            hostsString = hostsMatch.group(1)

            if 'all hosts used by the OpenLava system' in hostsString:
                common.printWarning(
                    '*Warning* (getQueueHostInfo) : queue "' + str(queue) + '" is not well configured, all of the hosts are on the same queue.'
                )
                queueHostDic[queue] = getHostList()
            elif re.match(r'.+/', hostsString):
                # Host group notation, drop the trailing "/" to get the group name.
                hostGroupName = re.sub(r'/$', '', hostsString)
                queueHostDic[queue] = getHostGroupMembers(hostGroupName)
            else:
                queueHostDic[queue] = hostsString.split()

    return queueHostDic
def sampleJobInfo(self):
    """
    Sample job info, especially the memory usage info.

    For every job reported by common.getBjobsUfInfo() one row is inserted
    into table "job_<job>" of "<self.dbPath>/job.db". The table is created
    when it is missing. A table whose last 'SECONDS' value is more than
    864000 seconds (ten days) older than self.currentSeconds is dropped and
    created again before the insert.

    Sets self.jobDbFile, self.jobDbConn and self.jobDbCurs; the cursor and
    the connection are closed before returning, also when sampling fails.
    """
    self.getDateInfo()

    self.jobDbFile = str(self.dbPath) + '/job.db'
    self.jobDbConn = sqlite3.connect(self.jobDbFile)
    self.jobDbCurs = self.jobDbConn.cursor()

    try:
        print('>>> Sampling job info into ' + str(self.jobDbFile) + ' ...')

        # Private copy, names of dropped tables are removed from it below.
        jobTableList = list(common.getSqlTableList(self.jobDbFile, self.jobDbCurs))
        bjobsDic = common.getBjobsUfInfo()

        for job in list(bjobsDic.keys()):
            jobTableName = 'job_' + str(job)

            print(' Sampling for job "' + str(job) + '" ...')

            # Insert 'sampleTime', 'DATE', 'TIME' and 'SECONDS' into key list.
            jobDic = bjobsDic[job]
            keyList = list(jobDic.keys())
            valueList = self.addValueDateInfo(list(jobDic.values()))

            # If job table (with old data) has been on the self.jobDbFile, drop it.
            if jobTableName in jobTableList:
                dataDic = common.getSqlTableData(self.jobDbFile, self.jobDbCurs, jobTableName, ['SECONDS'])

                if dataDic and len(dataDic['SECONDS']) > 0:
                    lastSeconds = int(dataDic['SECONDS'][-1])

                    if self.currentSeconds - lastSeconds > 864000:
                        common.printWarning(
                            '*Warning*: table "' + str(jobTableName) + '" already existed even ten day ago, will drop it.'
                        )
                        common.dropSqlTable(self.jobDbFile, self.jobDbCurs, jobTableName)
                        # Forget the dropped table, otherwise it is not created
                        # again and the insert below hits a missing table.
                        jobTableList.remove(jobTableName)

            # If job table is not on the self.jobDbFile, create it.
            if jobTableName not in jobTableList:
                keyList = self.addKeyDateInfo(keyList)
                keyString = common.genSqlTableKeyString(keyList)
                common.createSqlTable(self.jobDbFile, self.jobDbConn, jobTableName, keyString)

            # Insert sql table value.
            valueString = common.genSqlTableValueString(valueList)
            common.insertIntoSqlTable(self.jobDbFile, self.jobDbConn, jobTableName, valueString)
    finally:
        # Release the database even if one of the steps above raised.
        self.jobDbCurs.close()
        self.jobDbConn.close()
def __init__(self):
    """
    Record the current user, connect to "<config.dbPath>/queue.db" and mark
    both the job and the queue database as not loaded yet.

    A failed connection only prints a warning; the connect result is kept
    in self.queueDbFileConnectResult for later checks.
    """
    self.user = getpass.getuser()

    # Queue history database.
    self.queueDbFile = str(config.dbPath) + '/queue.db'
    connectInfo = sqlite3_common.connectDbFile(self.queueDbFile)
    (self.queueDbFileConnectResult, self.queueDbConn) = connectInfo

    connectFailed = (self.queueDbFileConnectResult == 'failed')

    if connectFailed:
        warningMessage = '*Warning*: Failed on connectiong queue database file "' + str(self.queueDbFile) + '".'
        common.printWarning(warningMessage)

    # First-load flags for the job / queue databases.
    self.jobFirstLoad = True
    self.queueFirstLoad = True
def getCommandDict(command):
    """
    Collect (common) openlava command info into a dict.
    It only works with the Title-Item type informations.

    The first output line of command supplies the column titles; every
    following line supplies one value per column, in order.

    Return a collections.OrderedDict that maps each title to the list of
    values found below it. A line with fewer items than titles triggers a
    warning and its missing values are stored as ''. Items beyond the
    number of titles are ignored. No output at all gives an empty dict.
    """
    myDic = collections.OrderedDict()
    keyList = []

    # Some special preprocess: for lsload commands every "*" in the output
    # is replaced with a space before the line is split.
    isLsload = 'lsload' in command

    # Read everything first, then release the pipe.
    with os.popen(command) as pipe:
        lines = pipe.readlines()

    for lineNum, rawLine in enumerate(lines):
        line = rawLine.strip()

        if isLsload:
            line = line.replace('*', ' ')

        # Title line.
        if lineNum == 0:
            keyList = line.split()

            for key in keyList:
                myDic[key] = []

            continue

        # Item line.
        commandInfo = line.split()

        if len(commandInfo) < len(keyList):
            common.printWarning(
                '*Warning* (getCommandDict) : For command "' + str(command) + '", below info line is incomplate/unexpected.')
            common.printWarning(' ' + str(line))

        for column, key in enumerate(keyList):
            # Pad short lines so that all columns keep the same length.
            value = commandInfo[column] if column < len(commandInfo) else ''
            myDic[key].append(value)

    return myDic
def sampleJobInfo(self):
    """
    Sample job info, especially the memory usage info.

    Jobs reported by openlava_common.getBjobsUfInfo() are grouped into job
    ranges by common.getJobRangeDic(); every range has its own database
    file "<self.dbPath>/job/<jobRange>.db". The work is done in two passes:

      1. Read pass  - open each range database in 'read' mode and decide,
                      per job, whether table "job_<job>" must be dropped
                      (last sample more than 3600 seconds old), created,
                      and which value string has to be inserted.
      2. Write pass - open each range database in 'write' mode and apply
                      the planned drop / create / insert, committing once
                      per database.

    A range database that cannot be opened for writing is skipped; the
    remaining ranges are still written.
    """
    self.getDateInfo()

    print('>>> Sampling job info ...')

    bjobsDic = openlava_common.getBjobsUfInfo()
    jobList = list(bjobsDic.keys())
    jobRangeDic = common.getJobRangeDic(jobList)
    # job -> planned sql actions, filled by the read pass.
    jobSqlDic = {}

    keyList = ['sampleTime', 'mem']

    # Read pass: inspect the current database content.
    for jobRange in jobRangeDic.keys():
        jobDbFile = str(self.dbPath) + '/job/' + str(jobRange) + '.db'
        (result, jobDbConn) = sqlite3_common.connectDbFile(jobDbFile, mode='read')

        try:
            if result == 'passed':
                jobTableList = sqlite3_common.getSqlTableList(jobDbFile, jobDbConn)
            else:
                # No readable database: every job table has to be created.
                jobTableList = []

            for job in jobRangeDic[jobRange]:
                jobTableName = 'job_' + str(job)

                print(' Sampling for job "' + str(job) + '" ...')

                jobSqlDic[job] = {
                    'drop': False,
                    'keyString': '',
                    'valueString': '',
                }

                # If job table (with old data) has been on the jobDbFile, drop it.
                if jobTableName in jobTableList:
                    dataDic = sqlite3_common.getSqlTableData(jobDbFile, jobDbConn, jobTableName, ['sampleTime'])

                    if dataDic and len(dataDic['sampleTime']) > 0:
                        lastSampleTime = dataDic['sampleTime'][-1]
                        lastSeconds = int(
                            time.mktime(
                                datetime.datetime.strptime(str(lastSampleTime), "%Y%m%d_%H%M%S").timetuple()))

                        if self.currentSeconds - lastSeconds > 3600:
                            common.printWarning(
                                ' *Warning*: table "' + str(jobTableName) + '" already existed even one hour ago, will drop it.'
                            )
                            jobSqlDic[job]['drop'] = True
                            # Forget the table so that it is created again below.
                            jobTableList.remove(jobTableName)

                # If job table is not on the jobDbFile, create it.
                if jobTableName not in jobTableList:
                    keyString = sqlite3_common.genSqlTableKeyString(keyList)
                    jobSqlDic[job]['keyString'] = keyString

                # Insert sql table value.
                valueList = [self.sampleTime, bjobsDic[job]['mem']]
                valueString = sqlite3_common.genSqlTableValueString(valueList)
                jobSqlDic[job]['valueString'] = valueString

            if result == 'passed':
                jobDbConn.commit()
        finally:
            # Release the read connection even if the inspection raised.
            if result == 'passed':
                jobDbConn.close()

    # Write pass: apply the planned actions.
    for jobRange in jobRangeDic.keys():
        jobDbFile = str(self.dbPath) + '/job/' + str(jobRange) + '.db'
        (result, jobDbConn) = sqlite3_common.connectDbFile(jobDbFile, mode='write')

        if result != 'passed':
            # Only this range is lost; returning here would silently throw
            # away the samples of every remaining range as well.
            continue

        try:
            for job in jobRangeDic[jobRange]:
                jobTableName = 'job_' + str(job)
                jobSql = jobSqlDic[job]

                if jobSql['drop']:
                    sqlite3_common.dropSqlTable(jobDbFile, jobDbConn, jobTableName, commit=False)

                if jobSql['keyString'] != '':
                    sqlite3_common.createSqlTable(jobDbFile, jobDbConn, jobTableName, jobSql['keyString'], commit=False)

                if jobSql['valueString'] != '':
                    sqlite3_common.insertIntoSqlTable(jobDbFile, jobDbConn, jobTableName, jobSql['valueString'], commit=False)

            # One commit per database, all helpers above ran with commit=False.
            jobDbConn.commit()
        finally:
            jobDbConn.close()

    print(' Committing the update to sqlite3 ...')
    print(' Done (' + str(len(jobList)) + ' jobs).')
def guiWarning(self, warningMessage):
    """
    Report warningMessage twice: through common.printWarning() and in a
    QMessageBox warning dialog titled 'openlavaMonitor Warning'.
    """
    dialogTitle = 'openlavaMonitor Warning'

    # Command line first, GUI window second.
    common.printWarning(warningMessage)
    QMessageBox.warning(self, dialogTitle, warningMessage)
def genHostsTabTable(self):
    """
    Fill self.hostsTabTable with one row for every host in self.hostList.

    Columns: Host, Status, Queue, Njobs, Ncpus, Ut (%), Mem (G), Maxmem (G),
    swp (G), maxswp (G). The data comes from common.getBhostsInfo(),
    common.getLshostsInfo(), common.getLsloadInfo() and
    common.getHostQueueInfo().

    Numeric cells are stored as int so that the table sorts them
    numerically. A value that cannot be interpreted is reported with a
    warning and shown as 0. Memory and swap values are converted to
    gigabytes (units M, G and T are understood, decimals are truncated).
    The Queue cell stays empty for a host that is on no queue.

    Raises ValueError when a host of self.hostList is missing from the
    bhosts / lshosts / lsload data.
    """
    def countValue(host, label, rawValue):
        # Plain non-negative integer, anything else is reset to 0.
        if re.match(r'^[0-9]+$', str(rawValue)):
            return int(rawValue)

        common.printWarning('*Warning*: host(' + str(host) + ') ' + label + ' info "' + str(rawValue) + '": invalid value, reset it to "0".')
        return 0

    def gigabyteValue(host, label, rawValue):
        # "<number><unit>" -> whole gigabytes. The number may carry decimals
        # (for example "15.6G"), so it is parsed as float, not as int.
        sizeMatch = re.match(r'^([0-9]+(?:\.[0-9]+)?)([MGT])$', str(rawValue).strip())

        if not sizeMatch:
            common.printWarning('*Warning*: host(' + str(host) + ') ' + label + ' info "' + str(rawValue) + '": unrecognized unit, reset it to "0".')
            return 0

        amount = float(sizeMatch.group(1))
        unit = sizeMatch.group(2)

        if unit == 'M':
            amount = amount / 1024
        elif unit == 'T':
            amount = amount * 1024

        return int(amount)

    def setNumberItem(row, column, number):
        # DisplayRole data of type int makes the column sort numerically.
        item = QTableWidgetItem()
        item.setData(Qt.DisplayRole, number)
        self.hostsTabTable.setItem(row, column, item)

    self.hostsTabTable.setShowGrid(True)
    # Keep sorting off while the cells are inserted: with sorting enabled
    # the rows may move during insertion and later cells of the same host
    # would land in the wrong row.
    self.hostsTabTable.setSortingEnabled(False)
    self.hostsTabTable.setRowCount(len(self.hostList))
    self.hostsTabTable.setColumnCount(10)
    self.hostsTabTable.setHorizontalHeaderLabels([
        'Host', 'Status', 'Queue', 'Njobs', 'Ncpus', 'Ut (%)', 'Mem (G)',
        'Maxmem (G)', 'swp (G)', 'maxswp (G)'
    ])

    bhostsDic = common.getBhostsInfo()
    lshostsDic = common.getLshostsInfo()
    lsloadDic = common.getLsloadInfo()
    hostQueueDic = common.getHostQueueInfo()

    for row, host in enumerate(self.hostList):
        # Position of this host in each of the command outputs.
        bhostsIndex = bhostsDic['HOST_NAME'].index(host)
        lshostsIndex = lshostsDic['HOST_NAME'].index(host)
        lsloadIndex = lsloadDic['HOST_NAME'].index(host)

        # Column 0: Host.
        self.hostsTabTable.setItem(row, 0, QTableWidgetItem(host))

        # Column 1: Status, a closed host is highlighted in bold red.
        status = bhostsDic['STATUS'][bhostsIndex]
        item = QTableWidgetItem(status)

        if str(status) == 'closed':
            item.setFont(QFont('song', 10, QFont.Bold))
            item.setForeground(QBrush(Qt.red))

        self.hostsTabTable.setItem(row, 1, item)

        # Column 2: Queue, all queues of the host separated by blanks.
        if host in hostQueueDic:
            queues = ' '.join(hostQueueDic[host])
            self.hostsTabTable.setItem(row, 2, QTableWidgetItem(queues))

        # Column 3: Njobs.
        njobs = bhostsDic['NJOBS'][bhostsIndex]
        setNumberItem(row, 3, countValue(host, 'NJOBS', njobs))

        # Column 4: Ncpus.
        ncpus = lshostsDic['ncpus'][lshostsIndex]
        setNumberItem(row, 4, countValue(host, 'ncpus', ncpus))

        # Column 5: Ut (%), the "%" sign is removed before the check.
        ut = str(lsloadDic['ut'][lsloadIndex]).replace('%', '')
        setNumberItem(row, 5, countValue(host, 'ut', ut))

        # Column 6: Mem (G).
        mem = lsloadDic['mem'][lsloadIndex]
        setNumberItem(row, 6, gigabyteValue(host, 'mem', mem))

        # Column 7: Maxmem (G).
        maxmem = lshostsDic['maxmem'][lshostsIndex]
        setNumberItem(row, 7, gigabyteValue(host, 'maxmem', maxmem))

        # Column 8: swp (G).
        swp = lsloadDic['swp'][lsloadIndex]
        setNumberItem(row, 8, gigabyteValue(host, 'swp', swp))

        # Column 9: maxswp (G).
        maxswp = lshostsDic['maxswp'][lshostsIndex]
        setNumberItem(row, 9, gigabyteValue(host, 'maxswp', maxswp))

    # All rows are complete, sorting is safe now.
    self.hostsTabTable.setSortingEnabled(True)
def drawJobMemCurve(self, job):
    """
    Draw memory usage curve for specified job.

    The samples are read from table "job_<job>" of the job range database
    "<config.dbPath>/job/<jobRange>.db". The x axis is the time since the
    first sample in whole minutes, the y axis is the 'mem' value divided by
    1024 and rounded to one decimal. The picture is saved as
    "<config.tmpPath>/<user>_<job>.png" through common.drawPlot().

    Nothing is drawn (a warning is printed instead) when the database
    cannot be connected or holds no sample for the job. The database
    connection opened here is closed again on every path.
    """
    jobRangeDic = common.getJobRangeDic([
        job,
    ])
    jobRange = list(jobRangeDic.keys())[0]

    self.jobDbFile = str(config.dbPath) + '/job/' + str(jobRange) + '.db'
    (self.jobDbFileConnectResult, self.jobDbConn) = sqlite3_common.connectDbFile(self.jobDbFile)

    if self.jobDbFileConnectResult == 'failed':
        common.printWarning(
            '*Warning*: Failed on connectiong job database file "' + str(self.jobDbFile) + '".')
        return

    try:
        if self.jobFirstLoad:
            common.printWarning(
                '*Warning*: It is the first time loading job database, it may cost a little time ...'
            )
            self.jobFirstLoad = False

        print('Getting history of job memory usage for job "' + str(job) + '".')

        tableName = 'job_' + str(job)
        dataDic = sqlite3_common.getSqlTableData(self.jobDbFile, self.jobDbConn, tableName, ['sampleTime', 'mem'])

        # No data at all and a table without any sample are both "missing";
        # the second case would otherwise fail on the first-sample lookup.
        if (not dataDic) or (len(dataDic['sampleTime']) == 0):
            common.printWarning('*Warning*: job information is missing for "' + str(job) + '".')
            return

        runTimeList = dataDic['sampleTime']
        memList = dataDic['mem']

        timeFormat = '%Y%m%d_%H%M%S'
        firstRunTime = datetime.datetime.strptime(str(runTimeList[0]), timeFormat).timestamp()

        realRunTimeList = []
        realMemList = []

        for (runTime, mem) in zip(runTimeList, memList):
            # Minutes since the first sample.
            currentRunTime = datetime.datetime.strptime(str(runTime), timeFormat).timestamp()
            realRunTimeList.append(int((currentRunTime - firstRunTime) / 60))

            # A sample without memory value counts as 0.
            if mem == '':
                mem = '0'

            realMemList.append(round(int(mem) / 1024, 1))

        memCurveFig = str(config.tmpPath) + '/' + str(self.user) + '_' + str(job) + '.png'
        jobNum = common.stringToInt(job)

        print('Save job memory curve as "' + str(memCurveFig) + '".')

        common.drawPlot(realRunTimeList,
                        realMemList,
                        'runTime (Minitu)',
                        'memory (G)',
                        yUnit='G',
                        title='job : ' + str(job),
                        saveName=memCurveFig,
                        figureNum=jobNum)
    finally:
        # Also reached on the early return above and on exceptions.
        if self.jobDbFileConnectResult == 'passed':
            self.jobDbConn.close()
def drawQueueJobNumCurve(self, queue):
    """
    Draw (PEND/RUN) job number curve for specified queue.

    The samples are read from table "queue_<queue>" of the queue database.
    They are grouped by date (the part of 'sampleTime' before the first
    "_"); for every date the average PEND and RUN job number is computed
    and truncated to int. Only the last 15 dates are drawn. The picture is
    saved as "<config.tmpPath>/<user>_<queue>_jobNum.png" through
    common.drawPlots().

    Nothing is drawn (a warning is printed instead) when the queue database
    is not connected or holds no sample for the queue.
    """
    if self.queueDbFileConnectResult == 'failed':
        common.printWarning(
            '*Warning*: Failed on connectiong queue database file "' + str(self.queueDbFile) + '".')
        return

    if self.queueFirstLoad:
        common.printWarning(
            '*Warning*: It is the first time loading queue database, it may cost a little time ...'
        )
        self.queueFirstLoad = False

    print('Getting history of queue PEND/RUN job number for queue "' + str(queue) + '".')

    tableName = 'queue_' + str(queue)
    dataDic = sqlite3_common.getSqlTableData(self.queueDbFile, self.queueDbConn, tableName, ['sampleTime', 'PEND', 'RUN'])

    if not dataDic:
        common.printWarning(
            '*Warning*: queue information is missing for "' + str(queue) + '".')
        return

    # Collect all samples of a date first and average afterwards. This
    # gives exactly one PEND and one RUN value per date, so the three lists
    # always have the same length, and no sample is left out of its average.
    dateList = []
    pendByDate = {}
    runByDate = {}

    for (sampleTime, pendNum, runNum) in zip(dataDic['sampleTime'], dataDic['PEND'], dataDic['RUN']):
        date = re.sub(r'_.*', '', str(sampleTime))

        if date not in pendByDate:
            dateList.append(date)
            pendByDate[date] = []
            runByDate[date] = []

        pendByDate[date].append(int(pendNum))
        runByDate[date].append(int(runNum))

    pendList = [int(sum(pendByDate[date]) / len(pendByDate[date])) for date in dateList]
    runList = [int(sum(runByDate[date]) / len(runByDate[date])) for date in dateList]

    # Cut dateList/pendList/runList, only save 15 days result.
    dateList = dateList[-15:]
    pendList = pendList[-15:]
    runList = runList[-15:]

    if len(dateList) == 0:
        common.printWarning(
            '*Warning*: PEND/RUN job number information is missing for queue "' + str(queue) + '".')
        return

    queueJobNumCurveFig = str(config.tmpPath) + '/' + str(self.user) + '_' + str(queue) + '_jobNum.png'
    queueNum = common.stringToInt(queue)

    print('Save queue PEND/RUN job numeber curve as "' + str(queueJobNumCurveFig) + '".')

    common.drawPlots(dateList, [pendList, runList],
                     'DATE',
                     'NUM', ['PEND', 'RUN'],
                     xIsString=True,
                     title='queue : ' + str(queue),
                     saveName=queueJobNumCurveFig,
                     figureNum=queueNum)