class Watchdog(object): ############################################################################# def __init__(self, pid, exeThread, spObject, jobCPUtime, memoryLimit=0, systemFlag="linux2.4"): """ Constructor, takes system flag as argument. """ self.log = gLogger.getSubLogger("Watchdog") self.systemFlag = systemFlag self.exeThread = exeThread self.wrapperPID = pid self.appPID = self.exeThread.getCurrentPID() self.spObject = spObject self.jobCPUtime = jobCPUtime self.memoryLimit = memoryLimit self.calibration = 0 self.initialValues = {} self.parameters = {} self.peekFailCount = 0 self.peekRetry = 5 self.processMonitor = ProcessMonitor() self.checkError = "" self.currentStats = {} self.initialized = False self.count = 0 # defaults self.testWallClock = 1 self.testDiskSpace = 1 self.testLoadAvg = 1 self.maxWallClockTime = 3 * 24 * 60 * 60 self.testCPUConsumed = 1 self.testCPULimit = 0 self.testMemoryLimit = 0 self.testTimeLeft = 1 self.pollingTime = 10 # 10 seconds self.checkingTime = 30 * 60 # 30 minute period self.minCheckingTime = 20 * 60 # 20 mins self.maxWallClockTime = 3 * 24 * 60 * 60 # e.g. 4 days self.jobPeekFlag = 1 # on / off self.minDiskSpace = 10 # MB self.loadAvgLimit = 1000 # > 1000 and jobs killed self.sampleCPUTime = 30 * 60 # e.g. up to 20mins sample self.jobCPUMargin = 20 # %age buffer before killing job self.minCPUWallClockRatio = 5 # ratio %age self.nullCPULimit = 5 # After 5 sample times return null CPU consumption kill job self.checkCount = 0 self.nullCPUCount = 0 self.grossTimeLeftLimit = 10 * self.checkingTime self.timeLeftUtil = TimeLeft() self.timeLeft = 0 self.littleTimeLeft = False ############################################################################# def initialize(self, loops=0): """ Watchdog initialization. """ if self.initialized: self.log.info("Watchdog already initialized") return S_OK() else: self.initialized = True setup = gConfig.getValue("/DIRAC/Setup", "") if not setup: return S_ERROR("Can not get the DIRAC Setup value") wms_instance = getSystemInstance("WorkloadManagement") if not wms_instance: return S_ERROR("Can not get the WorkloadManagement system instance") self.section = "/Systems/WorkloadManagement/%s/JobWrapper" % wms_instance self.maxcount = loops self.log.verbose("Watchdog initialization") self.log.info("Attempting to Initialize Watchdog for: %s" % (self.systemFlag)) # Test control flags self.testWallClock = gConfig.getValue(self.section + "/CheckWallClockFlag", 1) self.testDiskSpace = gConfig.getValue(self.section + "/CheckDiskSpaceFlag", 1) self.testLoadAvg = gConfig.getValue(self.section + "/CheckLoadAvgFlag", 1) self.testCPUConsumed = gConfig.getValue(self.section + "/CheckCPUConsumedFlag", 1) self.testCPULimit = gConfig.getValue(self.section + "/CheckCPULimitFlag", 0) self.testMemoryLimit = gConfig.getValue(self.section + "/CheckMemoryLimitFlag", 0) self.testTimeLeft = gConfig.getValue(self.section + "/CheckTimeLeftFlag", 1) # Other parameters self.pollingTime = gConfig.getValue(self.section + "/PollingTime", 10) # 10 seconds self.checkingTime = gConfig.getValue(self.section + "/CheckingTime", 30 * 60) # 30 minute period self.minCheckingTime = gConfig.getValue(self.section + "/MinCheckingTime", 20 * 60) # 20 mins self.maxWallClockTime = gConfig.getValue(self.section + "/MaxWallClockTime", 3 * 24 * 60 * 60) # e.g. 4 days self.jobPeekFlag = gConfig.getValue(self.section + "/JobPeekFlag", 1) # on / off self.minDiskSpace = gConfig.getValue(self.section + "/MinDiskSpace", 10) # MB self.loadAvgLimit = gConfig.getValue(self.section + "/LoadAverageLimit", 1000) # > 1000 and jobs killed self.sampleCPUTime = gConfig.getValue(self.section + "/CPUSampleTime", 30 * 60) # e.g. up to 20mins sample self.jobCPUMargin = gConfig.getValue(self.section + "/JobCPULimitMargin", 20) # %age buffer before killing job self.minCPUWallClockRatio = gConfig.getValue(self.section + "/MinCPUWallClockRatio", 5) # ratio %age self.nullCPULimit = gConfig.getValue( self.section + "/NullCPUCountLimit", 5 ) # After 5 sample times return null CPU consumption kill job if self.checkingTime < self.minCheckingTime: self.log.info( "Requested CheckingTime of %s setting to %s seconds (minimum)" % (self.checkingTime, self.minCheckingTime) ) self.checkingTime = self.minCheckingTime # The time left is returned in seconds @ 250 SI00 = 1 HS06, # the self.checkingTime and self.pollingTime are in seconds, # thus they need to be multiplied by a large enough factor self.fineTimeLeftLimit = gConfig.getValue(self.section + "/TimeLeftLimit", 150 * self.pollingTime) return S_OK() def run(self): """ The main watchdog execution method """ result = self.initialize() if not result["OK"]: self.log.always("Can not start watchdog for the following reason") self.log.always(result["Message"]) return result try: while True: self.log.debug("Starting watchdog loop # %d" % self.count) start_cycle_time = time.time() result = self.execute() exec_cycle_time = time.time() - start_cycle_time if not result["OK"]: self.log.error("Watchdog error during execution", result["Message"]) break elif result["Value"] == "Ended": break self.count += 1 if exec_cycle_time < self.pollingTime: time.sleep(self.pollingTime - exec_cycle_time) return S_OK() except Exception: self.log.exception() return S_ERROR("Exception") ############################################################################# def execute(self): """ The main agent execution method of the Watchdog. """ if not self.exeThread.isAlive(): self.__getUsageSummary() self.log.info("Process to monitor has completed, Watchdog will exit.") return S_OK("Ended") if self.littleTimeLeft: # if we have gone over enough iterations query again if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1: self.checkError = "Job has reached the CPU limit of the queue" self.log.error(self.checkError, self.timeLeft) self.__killRunningThread() return S_OK() else: self.littleTimeLeftCount -= 1 # Note: need to poll regularly to see if the thread is alive # but only perform checks with a certain frequency if (time.time() - self.initialValues["StartTime"]) > self.checkingTime * self.checkCount: self.checkCount += 1 result = self._performChecks() if not result["OK"]: self.log.warn("Problem during recent checks") self.log.warn(result["Message"]) return S_OK() else: # self.log.debug('Application thread is alive: checking count is %s' %(self.checkCount)) return S_OK() ############################################################################# def _performChecks(self): """The Watchdog checks are performed at a different period to the checking of the application thread and correspond to the checkingTime. """ self.log.verbose("------------------------------------") self.log.verbose("Checking loop starts for Watchdog") heartBeatDict = {} msg = "" loadAvg = self.getLoadAverage() if not loadAvg["OK"]: msg += "LoadAvg: ERROR" else: loadAvg = loadAvg["Value"] msg += "LoadAvg: %d " % loadAvg heartBeatDict["LoadAverage"] = loadAvg if not self.parameters.has_key("LoadAverage"): self.parameters["LoadAverage"] = [] self.parameters["LoadAverage"].append(loadAvg) memoryUsed = self.getMemoryUsed() if not memoryUsed["OK"]: msg += "MemUsed: ERROR " else: memoryUsed = memoryUsed["Value"] msg += "MemUsed: %.1f kb " % (memoryUsed) heartBeatDict["MemoryUsed"] = memoryUsed if not self.parameters.has_key("MemoryUsed"): self.parameters["MemoryUsed"] = [] self.parameters["MemoryUsed"].append(memoryUsed) result = self.processMonitor.getMemoryConsumed(self.wrapperPID) if result["OK"]: vsize = result["Value"]["Vsize"] / 1024.0 rss = result["Value"]["RSS"] / 1024.0 heartBeatDict["Vsize"] = vsize heartBeatDict["RSS"] = rss self.parameters.setdefault("Vsize", []) self.parameters["Vsize"].append(vsize) self.parameters.setdefault("RSS", []) self.parameters["RSS"].append(rss) msg += "Job Vsize: %.1f kb " % vsize msg += "Job RSS: %.1f kb " % rss result = self.getDiskSpace() if not result["OK"]: self.log.warn("Could not establish DiskSpace", result["Message"]) else: msg += "DiskSpace: %.1f MB " % (result["Value"]) if not self.parameters.has_key("DiskSpace"): self.parameters["DiskSpace"] = [] if result["OK"]: self.parameters["DiskSpace"].append(result["Value"]) heartBeatDict["AvailableDiskSpace"] = result["Value"] cpu = self.__getCPU() if not cpu["OK"]: msg += "CPU: ERROR " hmsCPU = 0 else: cpu = cpu["Value"] msg += "CPU: %s (h:m:s) " % (cpu) if not self.parameters.has_key("CPUConsumed"): self.parameters["CPUConsumed"] = [] self.parameters["CPUConsumed"].append(cpu) hmsCPU = cpu rawCPU = self.__convertCPUTime(hmsCPU) if rawCPU["OK"]: heartBeatDict["CPUConsumed"] = rawCPU["Value"] result = self.__getWallClockTime() if not result["OK"]: self.log.warn("Failed determining wall clock time", result["Message"]) else: msg += "WallClock: %.2f s " % (result["Value"]) self.parameters.setdefault("WallClockTime", list()).append(result["Value"]) heartBeatDict["WallClockTime"] = result["Value"] self.log.info(msg) result = self._checkProgress() if not result["OK"]: self.checkError = result["Message"] self.log.warn(self.checkError) if self.jobPeekFlag: result = self.__peek() if result["OK"]: outputList = result["Value"] size = len(outputList) self.log.info("Last %s lines of available application output:" % (size)) self.log.info("================START================") for line in outputList: self.log.info(line) self.log.info("=================END=================") self.__killRunningThread() return S_OK() recentStdOut = "None" if self.jobPeekFlag: result = self.__peek() if result["OK"]: outputList = result["Value"] size = len(outputList) recentStdOut = "Last %s lines of application output from Watchdog on %s [UTC]:" % ( size, Time.dateTime(), ) border = "=" * len(recentStdOut) cpuTotal = "Last reported CPU consumed for job is %s (h:m:s)" % (hmsCPU) if self.timeLeft: cpuTotal += ", Batch Queue Time Left %s (s @ HS06)" % self.timeLeft recentStdOut = "\n%s\n%s\n%s\n%s\n" % (border, recentStdOut, cpuTotal, border) self.log.info(recentStdOut) for line in outputList: self.log.info(line) recentStdOut += line + "\n" else: recentStdOut = ( "Watchdog is initializing and will attempt to obtain standard output from application thread" ) self.log.info(recentStdOut) self.peekFailCount += 1 if self.peekFailCount > self.peekRetry: self.jobPeekFlag = 0 self.log.warn("Turning off job peeking for remainder of execution") if not os.environ.has_key("JOBID"): self.log.info("Running without JOBID so parameters will not be reported") return S_OK() jobID = os.environ["JOBID"] staticParamDict = {"StandardOutput": recentStdOut} self.__sendSignOfLife(int(jobID), heartBeatDict, staticParamDict) return S_OK("Watchdog checking cycle complete") ############################################################################# def __getCPU(self): """Uses os.times() to get CPU time and returns HH:MM:SS after conversion. """ try: cpuTime = self.processMonitor.getCPUConsumed(self.wrapperPID) if not cpuTime["OK"]: self.log.warn("Problem while checking consumed CPU") return cpuTime cpuTime = cpuTime["Value"] if cpuTime: self.log.verbose("Raw CPU time consumed (s) = %s" % (cpuTime)) return self.__getCPUHMS(cpuTime) else: self.log.error("CPU time consumed found to be 0") return S_ERROR() except Exception, e: self.log.warn("Could not determine CPU time consumed with exception") self.log.exception(e) return S_ERROR("Could not determine CPU time consumed with exception")
class Watchdog: ############################################################################# def __init__(self, pid, exeThread, spObject, jobCPUtime, memoryLimit=0, systemFlag='linux2.4'): """ Constructor, takes system flag as argument. """ self.log = gLogger.getSubLogger("Watchdog") self.systemFlag = systemFlag self.exeThread = exeThread self.wrapperPID = pid self.appPID = self.exeThread.getCurrentPID() self.spObject = spObject self.jobCPUtime = jobCPUtime self.memoryLimit = memoryLimit self.calibration = 0 self.initialValues = {} self.parameters = {} self.peekFailCount = 0 self.peekRetry = 5 self.processMonitor = ProcessMonitor() self.checkError = '' self.currentStats = {} self.initialized = False self.count = 0 ############################################################################# def initialize(self, loops=0): """ Watchdog initialization. """ if self.initialized: self.log.info('Watchdog already initialized') return S_OK() else: self.initialized = True setup = gConfig.getValue('/DIRAC/Setup', '') if not setup: return S_ERROR('Can not get the DIRAC Setup value') wms_instance = getSystemInstance("WorkloadManagement") if not wms_instance: return S_ERROR( 'Can not get the WorkloadManagement system instance') self.section = '/Systems/WorkloadManagement/%s/JobWrapper' % wms_instance self.maxcount = loops self.log.verbose('Watchdog initialization') self.log.info('Attempting to Initialize Watchdog for: %s' % (self.systemFlag)) # Test control flags self.testWallClock = gConfig.getValue( self.section + '/CheckWallClockFlag', 1) self.testDiskSpace = gConfig.getValue( self.section + '/CheckDiskSpaceFlag', 1) self.testLoadAvg = gConfig.getValue(self.section + '/CheckLoadAvgFlag', 1) self.testCPUConsumed = gConfig.getValue( self.section + '/CheckCPUConsumedFlag', 1) self.testCPULimit = gConfig.getValue( self.section + '/CheckCPULimitFlag', 0) self.testMemoryLimit = gConfig.getValue( self.section + '/CheckMemoryLimitFlag', 0) self.testTimeLeft = gConfig.getValue( self.section + '/CheckTimeLeftFlag', 1) # Other parameters self.pollingTime = gConfig.getValue(self.section + '/PollingTime', 10) # 10 seconds self.checkingTime = gConfig.getValue(self.section + '/CheckingTime', 30 * 60) # 30 minute period self.minCheckingTime = gConfig.getValue( self.section + '/MinCheckingTime', 20 * 60) # 20 mins self.maxWallClockTime = gConfig.getValue( self.section + '/MaxWallClockTime', 3 * 24 * 60 * 60) # e.g. 4 days self.jobPeekFlag = gConfig.getValue(self.section + '/JobPeekFlag', 1) # on / off self.minDiskSpace = gConfig.getValue(self.section + '/MinDiskSpace', 10) # MB self.loadAvgLimit = gConfig.getValue( self.section + '/LoadAverageLimit', 1000) # > 1000 and jobs killed self.sampleCPUTime = gConfig.getValue(self.section + '/CPUSampleTime', 30 * 60) # e.g. up to 20mins sample self.jobCPUMargin = gConfig.getValue( self.section + '/JobCPULimitMargin', 20) # %age buffer before killing job self.minCPUWallClockRatio = gConfig.getValue( self.section + '/MinCPUWallClockRatio', 5) # ratio %age self.nullCPULimit = gConfig.getValue( self.section + '/NullCPUCountLimit', 5) # After 5 sample times return null CPU consumption kill job self.checkCount = 0 self.nullCPUCount = 0 if self.checkingTime < self.minCheckingTime: self.log.info( 'Requested CheckingTime of %s setting to %s seconds (minimum)' % (self.checkingTime, self.minCheckingTime)) self.checkingTime = self.minCheckingTime # The time left is returned in seconds @ 250 SI00 = 1 HS06, # the self.checkingTime and self.pollingTime are in seconds, # thus they need to be multiplied by a large enough factor self.grossTimeLeftLimit = 10 * self.checkingTime self.fineTimeLeftLimit = gConfig.getValue( self.section + '/TimeLeftLimit', 150 * self.pollingTime) self.timeLeftUtil = TimeLeft() self.timeLeft = 0 self.littleTimeLeft = False return S_OK() def run(self): """ The main watchdog execution method """ result = self.initialize() if not result['OK']: gLogger.always('Can not start watchdog for the following reason') gLogger.always(result['Message']) return result try: while True: gLogger.debug('Starting watchdog loop # %d' % self.count) start_cycle_time = time.time() result = self.execute() exec_cycle_time = time.time() - start_cycle_time if not result['OK']: gLogger.error("Watchdog error during execution", result['Message']) break elif result['Value'] == "Ended": break self.count += 1 if exec_cycle_time < self.pollingTime: time.sleep(self.pollingTime - exec_cycle_time) return S_OK() except Exception: gLogger.exception() return S_ERROR('Exception') ############################################################################# def execute(self): """ The main agent execution method of the Watchdog. """ if not self.exeThread.isAlive(): # print self.parameters self.__getUsageSummary() self.log.info( 'Process to monitor has completed, Watchdog will exit.') return S_OK("Ended") if self.littleTimeLeft: # if we have gone over enough iterations query again if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1: self.checkError = 'Job has reached the CPU limit of the queue' self.log.error(self.checkError, self.timeLeft) self.__killRunningThread() return S_OK() else: self.littleTimeLeftCount -= 1 # Note: need to poll regularly to see if the thread is alive # but only perform checks with a certain frequency if (time.time() - self.initialValues['StartTime'] ) > self.checkingTime * self.checkCount: self.checkCount += 1 result = self.__performChecks() if not result['OK']: self.log.warn('Problem during recent checks') self.log.warn(result['Message']) return S_OK() else: # self.log.debug('Application thread is alive: checking count is %s' %(self.checkCount)) return S_OK() ############################################################################# def __performChecks(self): """The Watchdog checks are performed at a different period to the checking of the application thread and correspond to the checkingTime. """ self.log.verbose('------------------------------------') self.log.verbose('Checking loop starts for Watchdog') heartBeatDict = {} msg = '' result = self.getLoadAverage() msg += 'LoadAvg: %s ' % (result['Value']) heartBeatDict['LoadAverage'] = result['Value'] if not self.parameters.has_key('LoadAverage'): self.parameters['LoadAverage'] = [] self.parameters['LoadAverage'].append(result['Value']) result = self.getMemoryUsed() msg += 'MemUsed: %.1f kb ' % (result['Value']) heartBeatDict['MemoryUsed'] = result['Value'] if not self.parameters.has_key('MemoryUsed'): self.parameters['MemoryUsed'] = [] self.parameters['MemoryUsed'].append(result['Value']) result = self.processMonitor.getMemoryConsumed(self.wrapperPID) if result['OK']: vsize = result['Value']['Vsize'] / 1024. rss = result['Value']['RSS'] / 1024. heartBeatDict['Vsize'] = vsize heartBeatDict['RSS'] = rss self.parameters.setdefault('Vsize', []) self.parameters['Vsize'].append(vsize) self.parameters.setdefault('RSS', []) self.parameters['RSS'].append(rss) msg += "Job Vsize: %.1f kb " % vsize msg += "Job RSS: %.1f kb " % rss result = self.getDiskSpace() msg += 'DiskSpace: %.1f MB ' % (result['Value']) if not self.parameters.has_key('DiskSpace'): self.parameters['DiskSpace'] = [] self.parameters['DiskSpace'].append(result['Value']) heartBeatDict['AvailableDiskSpace'] = result['Value'] result = self.__getCPU() msg += 'CPU: %s (h:m:s) ' % (result['Value']) if not self.parameters.has_key('CPUConsumed'): self.parameters['CPUConsumed'] = [] self.parameters['CPUConsumed'].append(result['Value']) hmsCPU = result['Value'] rawCPU = self.__convertCPUTime(hmsCPU) if rawCPU['OK']: heartBeatDict['CPUConsumed'] = rawCPU['Value'] result = self.__getWallClockTime() msg += 'WallClock: %.2f s ' % (result['Value']) self.parameters['WallClockTime'].append(result['Value']) heartBeatDict['WallClockTime'] = result['Value'] self.log.info(msg) result = self.__checkProgress() if not result['OK']: self.checkError = result['Message'] self.log.warn(self.checkError) if self.jobPeekFlag: result = self.__peek() if result['OK']: outputList = result['Value'] size = len(outputList) self.log.info( 'Last %s lines of available application output:' % (size)) self.log.info('================START================') for line in outputList: self.log.info(line) self.log.info('=================END=================') self.__killRunningThread() return S_OK() recentStdOut = 'None' if self.jobPeekFlag: result = self.__peek() if result['OK']: outputList = result['Value'] size = len(outputList) recentStdOut = 'Last %s lines of application output from Watchdog on %s [UTC]:' % ( size, Time.dateTime()) border = '=' * len(recentStdOut) cpuTotal = 'Last reported CPU consumed for job is %s (h:m:s)' % ( hmsCPU) if self.timeLeft: cpuTotal += ', Batch Queue Time Left %s (s @ HS06)' % self.timeLeft recentStdOut = '\n%s\n%s\n%s\n%s\n' % (border, recentStdOut, cpuTotal, border) self.log.info(recentStdOut) for line in outputList: self.log.info(line) recentStdOut += line + '\n' else: recentStdOut = 'Watchdog is initializing and will attempt to obtain standard output from application thread' self.log.info(recentStdOut) self.peekFailCount += 1 if self.peekFailCount > self.peekRetry: self.jobPeekFlag = 0 self.log.warn( 'Turning off job peeking for remainder of execution') if not os.environ.has_key('JOBID'): self.log.info( 'Running without JOBID so parameters will not be reported') return S_OK() jobID = os.environ['JOBID'] staticParamDict = {'StandardOutput': recentStdOut} self.__sendSignOfLife(int(jobID), heartBeatDict, staticParamDict) return S_OK('Watchdog checking cycle complete') ############################################################################# def __getCPU(self): """Uses os.times() to get CPU time and returns HH:MM:SS after conversion. """ cpuTime = '00:00:00' try: cpuTime = self.processMonitor.getCPUConsumed(self.wrapperPID) except Exception: self.log.warn( 'Could not determine CPU time consumed with exception') self.log.exception() return S_OK(cpuTime) # just return null CPU if not cpuTime['OK']: self.log.warn('Problem while checking consumed CPU') self.log.warn(cpuTime) return S_OK('00:00:00') # again return null CPU in this case cpuTime = cpuTime['Value'] self.log.verbose("Raw CPU time consumed (s) = %s" % (cpuTime)) result = self.__getCPUHMS(cpuTime) return result ############################################################################# def __getCPUHMS(self, cpuTime): mins, secs = divmod(cpuTime, 60) hours, mins = divmod(mins, 60) humanTime = '%02d:%02d:%02d' % (hours, mins, secs) self.log.verbose('Human readable CPU time is: %s' % humanTime) return S_OK(humanTime) ############################################################################# def __interpretControlSignal(self, signalDict): """This method is called whenever a signal is sent via the result of sending a sign of life. """ self.log.info('Received control signal') if type(signalDict) == type({}): if signalDict.has_key('Kill'): self.log.info( 'Received Kill signal, stopping job via control signal') self.checkError = 'Received Kill signal' self.__killRunningThread() else: self.log.info( 'The following control signal was sent but not understood by the watchdog:' ) self.log.info(signalDict) else: self.log.info( 'Expected dictionary for control signal, received:\n%s' % (signalDict)) return S_OK() ############################################################################# def __checkProgress(self): """This method calls specific tests to determine whether the job execution is proceeding normally. CS flags can easily be added to add or remove tests via central configuration. """ report = '' if self.testWallClock: result = self.__checkWallClockTime() report += 'WallClock: OK, ' if not result['OK']: self.log.warn(result['Message']) return result else: report += 'WallClock: NA,' if self.testDiskSpace: result = self.__checkDiskSpace() report += 'DiskSpace: OK, ' if not result['OK']: self.log.warn(result['Message']) return result else: report += 'DiskSpace: NA,' if self.testLoadAvg: result = self.__checkLoadAverage() report += 'LoadAverage: OK, ' if not result['OK']: self.log.warn(result['Message']) return result else: report += 'LoadAverage: NA,' if self.testCPUConsumed: result = self.__checkCPUConsumed() report += 'CPUConsumed: OK, ' if not result['OK']: return result else: report += 'CPUConsumed: NA, ' if self.testCPULimit: result = self.__checkCPULimit() report += 'CPULimit OK, ' if not result['OK']: self.log.warn(result['Message']) return result else: report += 'CPULimit: NA, ' if self.testTimeLeft: self.__timeLeft() if self.timeLeft: report += 'TimeLeft: OK' else: report += 'TimeLeft: NA' if self.testMemoryLimit: result = self.__checkMemoryLimit() report += 'MemoryLimit OK, ' if not result['OK']: self.log.warn(result['Message']) return result else: report += 'MemoryLimit: NA, ' self.log.info(report) return S_OK('All enabled checks passed') ############################################################################# def __checkCPUConsumed(self): """ Checks whether the CPU consumed by application process is reasonable. This method will report stalled jobs to be killed. """ self.log.info("Checking CPU Consumed") if 'WallClockTime' not in self.parameters: return S_ERROR('Missing WallClockTime info') if 'CPUConsumed' not in self.parameters: return S_ERROR('Missing CPUConsumed info') wallClockTime = self.parameters['WallClockTime'][-1] if wallClockTime < self.sampleCPUTime: self.log.info( "Stopping check, wallclock time (%s) is still smalled than sample time (%s)" % (wallClockTime, self.sampleCPUTime)) return S_OK() intervals = max(1, int(self.sampleCPUTime / self.checkingTime)) if len(self.parameters['CPUConsumed']) < intervals + 1: self.log.info( "Not enough snapshots to calculate, there are %s and we need %s" % (len(self.parameters['CPUConsumed']), intervals + 1)) return S_OK() wallClockTime = self.parameters['WallClockTime'][-1] - self.parameters[ 'WallClockTime'][-1 - intervals] try: cpuTime = self.__convertCPUTime( self.parameters['CPUConsumed'][-1])['Value'] # For some reason, some times the CPU consumed estimation returns 0 # if cpuTime == 0: # return S_OK() cpuTime -= self.__convertCPUTime( self.parameters['CPUConsumed'][-1 - intervals])['Value'] if cpuTime < 0: self.log.warn( 'Consumed CPU time negative, something wrong may have happened, ignore' ) return S_OK() if wallClockTime <= 0: self.log.warn( 'Wallclock time should not be negative or zero, Ignore') return S_OK() ratio = (cpuTime / wallClockTime) * 100. self.log.info("CPU/Wallclock ratio is %.2f%%" % ratio) # in case of error cpuTime might be 0, exclude this if ratio < self.minCPUWallClockRatio: if os.path.exists('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK'): self.log.info( 'N.B. job would be declared as stalled but CPU / WallClock check is disabled by payload' ) return S_OK() self.log.info("Job is stalled!") return S_ERROR('Watchdog identified this job as stalled') except Exception, e: self.log.error("Cannot convert CPU consumed from string to int", str(e)) return S_OK()
class Watchdog( object ): ############################################################################# def __init__( self, pid, exeThread, spObject, jobCPUtime, memoryLimit = 0, systemFlag = 'linux2.4' ): """ Constructor, takes system flag as argument. """ self.log = gLogger.getSubLogger( "Watchdog" ) self.systemFlag = systemFlag self.exeThread = exeThread self.wrapperPID = pid self.appPID = self.exeThread.getCurrentPID() self.spObject = spObject self.jobCPUtime = jobCPUtime self.memoryLimit = memoryLimit self.calibration = 0 self.initialValues = {} self.parameters = {} self.peekFailCount = 0 self.peekRetry = 5 self.processMonitor = ProcessMonitor() self.checkError = '' self.currentStats = {} self.initialized = False self.count = 0 #defaults self.testWallClock = 1 self.testDiskSpace = 1 self.testLoadAvg = 1 self.maxWallClockTime = 3 * 24 * 60 * 60 self.testCPUConsumed = 1 self.testCPULimit = 0 self.testMemoryLimit = 0 self.testTimeLeft = 1 self.pollingTime = 10 # 10 seconds self.checkingTime = 30 * 60 # 30 minute period self.minCheckingTime = 20 * 60 # 20 mins self.maxWallClockTime = 3 * 24 * 60 * 60 # e.g. 4 days self.jobPeekFlag = 1 # on / off self.minDiskSpace = 10 # MB self.loadAvgLimit = 1000 # > 1000 and jobs killed self.sampleCPUTime = 30 * 60 # e.g. up to 20mins sample self.jobCPUMargin = 20 # %age buffer before killing job self.minCPUWallClockRatio = 5 # ratio %age self.nullCPULimit = 5 # After 5 sample times return null CPU consumption kill job self.checkCount = 0 self.nullCPUCount = 0 self.grossTimeLeftLimit = 10 * self.checkingTime self.timeLeftUtil = TimeLeft() self.timeLeft = 0 self.littleTimeLeft = False ############################################################################# def initialize( self, loops = 0 ): """ Watchdog initialization. """ if self.initialized: self.log.info( 'Watchdog already initialized' ) return S_OK() else: self.initialized = True setup = gConfig.getValue( '/DIRAC/Setup', '' ) if not setup: return S_ERROR( 'Can not get the DIRAC Setup value' ) wms_instance = getSystemInstance( "WorkloadManagement" ) if not wms_instance: return S_ERROR( 'Can not get the WorkloadManagement system instance' ) self.section = '/Systems/WorkloadManagement/%s/JobWrapper' % wms_instance self.maxcount = loops self.log.verbose( 'Watchdog initialization' ) self.log.info( 'Attempting to Initialize Watchdog for: %s' % ( self.systemFlag ) ) # Test control flags self.testWallClock = gConfig.getValue( self.section + '/CheckWallClockFlag', 1 ) self.testDiskSpace = gConfig.getValue( self.section + '/CheckDiskSpaceFlag', 1 ) self.testLoadAvg = gConfig.getValue( self.section + '/CheckLoadAvgFlag', 1 ) self.testCPUConsumed = gConfig.getValue( self.section + '/CheckCPUConsumedFlag', 1 ) self.testCPULimit = gConfig.getValue( self.section + '/CheckCPULimitFlag', 0 ) self.testMemoryLimit = gConfig.getValue( self.section + '/CheckMemoryLimitFlag', 0 ) self.testTimeLeft = gConfig.getValue( self.section + '/CheckTimeLeftFlag', 1 ) # Other parameters self.pollingTime = gConfig.getValue( self.section + '/PollingTime', 10 ) # 10 seconds self.checkingTime = gConfig.getValue( self.section + '/CheckingTime', 30 * 60 ) # 30 minute period self.minCheckingTime = gConfig.getValue( self.section + '/MinCheckingTime', 20 * 60 ) # 20 mins self.maxWallClockTime = gConfig.getValue( self.section + '/MaxWallClockTime', 3 * 24 * 60 * 60 ) # e.g. 4 days self.jobPeekFlag = gConfig.getValue( self.section + '/JobPeekFlag', 1 ) # on / off self.minDiskSpace = gConfig.getValue( self.section + '/MinDiskSpace', 10 ) # MB self.loadAvgLimit = gConfig.getValue( self.section + '/LoadAverageLimit', 1000 ) # > 1000 and jobs killed self.sampleCPUTime = gConfig.getValue( self.section + '/CPUSampleTime', 30 * 60 ) # e.g. up to 20mins sample self.jobCPUMargin = gConfig.getValue( self.section + '/JobCPULimitMargin', 20 ) # %age buffer before killing job self.minCPUWallClockRatio = gConfig.getValue( self.section + '/MinCPUWallClockRatio', 5 ) # ratio %age self.nullCPULimit = gConfig.getValue( self.section + '/NullCPUCountLimit', 5 ) # After 5 sample times return null CPU consumption kill job if self.checkingTime < self.minCheckingTime: self.log.info( 'Requested CheckingTime of %s setting to %s seconds (minimum)' % ( self.checkingTime, self.minCheckingTime ) ) self.checkingTime = self.minCheckingTime # The time left is returned in seconds @ 250 SI00 = 1 HS06, # the self.checkingTime and self.pollingTime are in seconds, # thus they need to be multiplied by a large enough factor self.fineTimeLeftLimit = gConfig.getValue( self.section + '/TimeLeftLimit', 150 * self.pollingTime ) return S_OK() def run( self ): """ The main watchdog execution method """ result = self.initialize() if not result['OK']: self.log.always( 'Can not start watchdog for the following reason' ) self.log.always( result['Message'] ) return result try: while True: self.log.debug( 'Starting watchdog loop # %d' % self.count ) start_cycle_time = time.time() result = self.execute() exec_cycle_time = time.time() - start_cycle_time if not result[ 'OK' ]: self.log.error( "Watchdog error during execution", result[ 'Message' ] ) break elif result['Value'] == "Ended": break self.count += 1 if exec_cycle_time < self.pollingTime: time.sleep( self.pollingTime - exec_cycle_time ) return S_OK() except Exception: self.log.exception() return S_ERROR( 'Exception' ) ############################################################################# def execute( self ): """ The main agent execution method of the Watchdog. """ if not self.exeThread.isAlive(): self.__getUsageSummary() self.log.info( 'Process to monitor has completed, Watchdog will exit.' ) return S_OK( "Ended" ) if self.littleTimeLeft: # if we have gone over enough iterations query again if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1: self.checkError = 'Job has reached the CPU limit of the queue' self.log.error( self.checkError, self.timeLeft ) self.__killRunningThread() return S_OK() else: self.littleTimeLeftCount -= 1 # Note: need to poll regularly to see if the thread is alive # but only perform checks with a certain frequency if ( time.time() - self.initialValues['StartTime'] ) > self.checkingTime * self.checkCount: self.checkCount += 1 result = self._performChecks() if not result['OK']: self.log.warn( 'Problem during recent checks' ) self.log.warn( result['Message'] ) return S_OK() else: # self.log.debug('Application thread is alive: checking count is %s' %(self.checkCount)) return S_OK() ############################################################################# def _performChecks( self ): """The Watchdog checks are performed at a different period to the checking of the application thread and correspond to the checkingTime. """ self.log.verbose( '------------------------------------' ) self.log.verbose( 'Checking loop starts for Watchdog' ) heartBeatDict = {} msg = '' loadAvg = self.getLoadAverage() if not loadAvg['OK']: msg += 'LoadAvg: ERROR' else: loadAvg = loadAvg['Value'] msg += 'LoadAvg: %d ' % loadAvg heartBeatDict['LoadAverage'] = loadAvg if not self.parameters.has_key( 'LoadAverage' ): self.parameters['LoadAverage'] = [] self.parameters['LoadAverage'].append( loadAvg ) memoryUsed = self.getMemoryUsed() if not memoryUsed['OK']: msg += 'MemUsed: ERROR ' else: memoryUsed = memoryUsed['Value'] msg += 'MemUsed: %.1f kb ' % ( memoryUsed ) heartBeatDict['MemoryUsed'] = memoryUsed if not self.parameters.has_key( 'MemoryUsed' ): self.parameters['MemoryUsed'] = [] self.parameters['MemoryUsed'].append( memoryUsed ) result = self.processMonitor.getMemoryConsumed( self.wrapperPID ) if result['OK']: vsize = result['Value']['Vsize']/1024. rss = result['Value']['RSS']/1024. heartBeatDict['Vsize'] = vsize heartBeatDict['RSS'] = rss self.parameters.setdefault( 'Vsize', [] ) self.parameters['Vsize'].append( vsize ) self.parameters.setdefault( 'RSS', [] ) self.parameters['RSS'].append( rss ) msg += "Job Vsize: %.1f kb " % vsize msg += "Job RSS: %.1f kb " % rss result = self.getDiskSpace() if not result['OK']: self.log.warn( "Could not establish DiskSpace", result['Message'] ) else: msg += 'DiskSpace: %.1f MB ' % ( result['Value'] ) if not self.parameters.has_key( 'DiskSpace' ): self.parameters['DiskSpace'] = [] if result['OK']: self.parameters['DiskSpace'].append( result['Value'] ) heartBeatDict['AvailableDiskSpace'] = result['Value'] cpu = self.__getCPU() if not cpu['OK']: msg += 'CPU: ERROR ' hmsCPU = 0 else: cpu = cpu['Value'] msg += 'CPU: %s (h:m:s) ' % ( cpu ) if not self.parameters.has_key( 'CPUConsumed' ): self.parameters['CPUConsumed'] = [] self.parameters['CPUConsumed'].append( cpu ) hmsCPU = cpu rawCPU = self.__convertCPUTime( hmsCPU ) if rawCPU['OK']: heartBeatDict['CPUConsumed'] = rawCPU['Value'] result = self.__getWallClockTime() if not result['OK']: self.log.warn( "Failed determining wall clock time", result['Message'] ) else: msg += 'WallClock: %.2f s ' % ( result['Value'] ) self.parameters.setdefault( 'WallClockTime', list() ).append( result['Value'] ) heartBeatDict['WallClockTime'] = result['Value'] self.log.info( msg ) result = self._checkProgress() if not result['OK']: self.checkError = result['Message'] self.log.warn( self.checkError ) if self.jobPeekFlag: result = self.__peek() if result['OK']: outputList = result['Value'] size = len( outputList ) self.log.info( 'Last %s lines of available application output:' % ( size ) ) self.log.info( '================START================' ) for line in outputList: self.log.info( line ) self.log.info( '=================END=================' ) self.__killRunningThread() return S_OK() recentStdOut = 'None' if self.jobPeekFlag: result = self.__peek() if result['OK']: outputList = result['Value'] size = len( outputList ) recentStdOut = 'Last %s lines of application output from Watchdog on %s [UTC]:' % ( size, Time.dateTime() ) border = '=' * len( recentStdOut ) cpuTotal = 'Last reported CPU consumed for job is %s (h:m:s)' % ( hmsCPU ) if self.timeLeft: cpuTotal += ', Batch Queue Time Left %s (s @ HS06)' % self.timeLeft recentStdOut = '\n%s\n%s\n%s\n%s\n' % ( border, recentStdOut, cpuTotal, border ) self.log.info( recentStdOut ) for line in outputList: self.log.info( line ) recentStdOut += line + '\n' else: recentStdOut = 'Watchdog is initializing and will attempt to obtain standard output from application thread' self.log.info( recentStdOut ) self.peekFailCount += 1 if self.peekFailCount > self.peekRetry: self.jobPeekFlag = 0 self.log.warn( 'Turning off job peeking for remainder of execution' ) if not os.environ.has_key( 'JOBID' ): self.log.info( 'Running without JOBID so parameters will not be reported' ) return S_OK() jobID = os.environ['JOBID'] staticParamDict = {'StandardOutput':recentStdOut} self.__sendSignOfLife( int( jobID ), heartBeatDict, staticParamDict ) return S_OK( 'Watchdog checking cycle complete' ) ############################################################################# def __getCPU( self ): """Uses os.times() to get CPU time and returns HH:MM:SS after conversion. """ try: cpuTime = self.processMonitor.getCPUConsumed( self.wrapperPID ) if not cpuTime['OK']: self.log.warn( 'Problem while checking consumed CPU' ) return cpuTime cpuTime = cpuTime['Value'] if cpuTime: self.log.verbose( "Raw CPU time consumed (s) = %s" % ( cpuTime ) ) return self.__getCPUHMS( cpuTime ) else: self.log.error( "CPU time consumed found to be 0" ) return S_ERROR() except Exception as e: self.log.warn( 'Could not determine CPU time consumed with exception' ) self.log.exception( e ) return S_ERROR( "Could not determine CPU time consumed with exception" ) ############################################################################# def __getCPUHMS( self, cpuTime ): mins, secs = divmod( cpuTime, 60 ) hours, mins = divmod( mins, 60 ) humanTime = '%02d:%02d:%02d' % ( hours, mins, secs ) self.log.verbose( 'Human readable CPU time is: %s' % humanTime ) return S_OK( humanTime ) ############################################################################# def __interpretControlSignal( self, signalDict ): """This method is called whenever a signal is sent via the result of sending a sign of life. """ self.log.info( 'Received control signal' ) if type( signalDict ) == type( {} ): if signalDict.has_key( 'Kill' ): self.log.info( 'Received Kill signal, stopping job via control signal' ) self.checkError = 'Received Kill signal' self.__killRunningThread() else: self.log.info( 'The following control signal was sent but not understood by the watchdog:' ) self.log.info( signalDict ) else: self.log.info( 'Expected dictionary for control signal, received:\n%s' % ( signalDict ) ) return S_OK() ############################################################################# def _checkProgress( self ): """This method calls specific tests to determine whether the job execution is proceeding normally. CS flags can easily be added to add or remove tests via central configuration. """ report = '' if self.testWallClock: result = self.__checkWallClockTime() if not result['OK']: self.log.warn( result['Message'] ) return result report += 'WallClock: OK, ' else: report += 'WallClock: NA,' if self.testDiskSpace: result = self.__checkDiskSpace() if not result['OK']: self.log.warn( result['Message'] ) return result report += 'DiskSpace: OK, ' else: report += 'DiskSpace: NA,' if self.testLoadAvg: result = self.__checkLoadAverage() if not result['OK']: self.log.warn( "Check of load average failed, but won't fail because of that: %s" % result['Message'] ) report += 'LoadAverage: ERROR, ' return S_OK() report += 'LoadAverage: OK, ' else: report += 'LoadAverage: NA,' if self.testCPUConsumed: result = self.__checkCPUConsumed() if not result['OK']: return result report += 'CPUConsumed: OK, ' else: report += 'CPUConsumed: NA, ' if self.testCPULimit: result = self.__checkCPULimit() if not result['OK']: self.log.warn( result['Message'] ) return result report += 'CPULimit OK, ' else: report += 'CPULimit: NA, ' if self.testTimeLeft: self.__timeLeft() if self.timeLeft: report += 'TimeLeft: OK' else: report += 'TimeLeft: NA' if self.testMemoryLimit: result = self.__checkMemoryLimit() if not result['OK']: self.log.warn( result['Message'] ) return result report += 'MemoryLimit OK, ' else: report += 'MemoryLimit: NA, ' self.log.info( report ) return S_OK( 'All enabled checks passed' ) ############################################################################# def __checkCPUConsumed( self ): """ Checks whether the CPU consumed by application process is reasonable. This method will report stalled jobs to be killed. """ self.log.info( "Checking CPU Consumed" ) if 'WallClockTime' not in self.parameters: return S_ERROR( 'Missing WallClockTime info' ) if 'CPUConsumed' not in self.parameters: return S_ERROR( 'Missing CPUConsumed info' ) wallClockTime = self.parameters['WallClockTime'][-1] if wallClockTime < self.sampleCPUTime: self.log.info( "Stopping check, wallclock time (%s) is still smaller than sample time (%s)" % ( wallClockTime, self.sampleCPUTime ) ) return S_OK() intervals = max( 1, int( self.sampleCPUTime / self.checkingTime ) ) if len( self.parameters['CPUConsumed'] ) < intervals + 1: self.log.info( "Not enough snapshots to calculate, there are %s and we need %s" % ( len( self.parameters['CPUConsumed'] ), intervals + 1 ) ) return S_OK() wallClockTime = self.parameters['WallClockTime'][-1] - self.parameters['WallClockTime'][-1 - intervals ] try: cpuTime = self.__convertCPUTime( self.parameters['CPUConsumed'][-1] )['Value'] # For some reason, some times the CPU consumed estimation returns 0 # if cpuTime == 0: # return S_OK() cpuTime -= self.__convertCPUTime( self.parameters['CPUConsumed'][-1 - intervals ] )['Value'] if cpuTime < 0: self.log.warn( 'Consumed CPU time negative, something wrong may have happened, ignore' ) return S_OK() if wallClockTime <= 0: self.log.warn( 'Wallclock time should not be negative or zero, Ignore' ) return S_OK() ratio = ( cpuTime / wallClockTime ) * 100. self.log.info( "CPU/Wallclock ratio is %.2f%%" % ratio ) # in case of error cpuTime might be 0, exclude this if ratio < self.minCPUWallClockRatio: if os.path.exists( 'DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK' ): self.log.info( 'N.B. job would be declared as stalled but CPU / WallClock check is disabled by payload' ) return S_OK() self.log.info( "Job is stalled!" ) return S_ERROR( 'Watchdog identified this job as stalled' ) except Exception as e: self.log.error( "Cannot convert CPU consumed from string to int", str( e ) ) return S_OK() ############################################################################# def __convertCPUTime( self, cputime ): """ Method to convert the CPU time as returned from the Watchdog instances to the equivalent DIRAC normalized CPU time to be compared to the Job CPU requirement. """ cpuValue = 0 cpuHMS = cputime.split( ':' ) # for i in xrange( len( cpuHMS ) ): # cpuHMS[i] = cpuHMS[i].replace( '00', '0' ) try: hours = float( cpuHMS[0] ) * 60 * 60 mins = float( cpuHMS[1] ) * 60 secs = float( cpuHMS[2] ) cpuValue = float( hours + mins + secs ) except Exception as x: self.log.warn( str( x ) ) return S_ERROR( 'Could not calculate CPU time' ) # Normalization to be implemented normalizedCPUValue = cpuValue result = S_OK() result['Value'] = normalizedCPUValue self.log.debug( 'CPU value %s converted to %s' % ( cputime, normalizedCPUValue ) ) return result ############################################################################# def __checkCPULimit( self ): """ Checks that the job has consumed more than the job CPU requirement (plus a configurable margin) and kills them as necessary. """ consumedCPU = 0 if self.parameters.has_key( 'CPUConsumed' ): consumedCPU = self.parameters['CPUConsumed'][-1] consumedCPUDict = self.__convertCPUTime( consumedCPU ) if consumedCPUDict['OK']: currentCPU = consumedCPUDict['Value'] else: return S_OK( 'Not possible to determine current CPU consumed' ) if consumedCPU: limit = self.jobCPUtime + self.jobCPUtime * ( self.jobCPUMargin / 100 ) cpuConsumed = float( currentCPU ) if cpuConsumed > limit: self.log.info( 'Job has consumed more than the specified CPU limit with an additional %s%% margin' % ( self.jobCPUMargin ) ) return S_ERROR( 'Job has exceeded maximum CPU time limit' ) else: return S_OK( 'Job within CPU limit' ) elif not currentCPU: self.log.verbose( 'Both initial and current CPU consumed are null' ) return S_OK( 'CPU consumed is not measurable yet' ) else: return S_OK( 'Not possible to determine CPU consumed' ) def __checkMemoryLimit( self ): """ Checks that the job memory consumption is within a limit """ if self.parameters.has_key( 'Vsize' ): vsize = self.parameters['Vsize'][-1] if vsize and self.memoryLimit: if vsize > self.memoryLimit: vsize = vsize # Just a warning for the moment self.log.warn( "Job has consumed %f.2 KB of memory with the limit of %f.2 KB" % ( vsize, self.memoryLimit ) ) return S_OK() ############################################################################# def __checkDiskSpace( self ): """Checks whether the CS defined minimum disk space is available. """ if self.parameters.has_key( 'DiskSpace' ): availSpace = self.parameters['DiskSpace'][-1] if availSpace >= 0 and availSpace < self.minDiskSpace: self.log.info( 'Not enough local disk space for job to continue, defined in CS as %s MB' % ( self.minDiskSpace ) ) return S_ERROR( 'Job has insufficient disk space to continue' ) else: return S_OK( 'Job has enough disk space available' ) else: return S_ERROR( 'Available disk space could not be established' ) ############################################################################# def __checkWallClockTime( self ): """Checks whether the job has been running for the CS defined maximum wall clock time. """ if self.initialValues.has_key( 'StartTime' ): startTime = self.initialValues['StartTime'] if time.time() - startTime > self.maxWallClockTime: self.log.info( 'Job has exceeded maximum wall clock time of %s seconds' % ( self.maxWallClockTime ) ) return S_ERROR( 'Job has exceeded maximum wall clock time' ) else: return S_OK( 'Job within maximum wall clock time' ) else: return S_ERROR( 'Job start time could not be established' ) ############################################################################# def __checkLoadAverage( self ): """Checks whether the CS defined maximum load average is exceeded. """ if self.parameters.has_key( 'LoadAverage' ): loadAvg = self.parameters['LoadAverage'][-1] if loadAvg > float( self.loadAvgLimit ): self.log.info( 'Maximum load average exceeded, defined in CS as %s ' % ( self.loadAvgLimit ) ) return S_ERROR( 'Job exceeded maximum load average' ) else: return S_OK( 'Job running with normal load average' ) else: return S_ERROR( 'Job load average not established' ) ############################################################################# def __peek( self ): """ Uses ExecutionThread.getOutput() method to obtain standard output from running thread via subprocess callback function. """ result = self.exeThread.getOutput() if not result['OK']: self.log.warn( 'Could not obtain output from running application thread' ) self.log.warn( result['Message'] ) return result ############################################################################# def calibrate( self ): """ The calibrate method obtains the initial values for system memory and load and calculates the margin for error for the rest of the Watchdog cycle. """ self.__getWallClockTime() self.parameters['WallClockTime'] = [] cpuConsumed = self.__getCPU() if not cpuConsumed['OK']: self.log.warn( "Could not establish CPU consumed, setting to 0.0" ) cpuConsumed = 0.0 else: cpuConsumed = cpuConsumed['Value'] self.initialValues['CPUConsumed'] = cpuConsumed self.parameters['CPUConsumed'] = [] loadAvg = self.getLoadAverage() if not loadAvg['OK']: self.log.warn( "Could not establish LoadAverage, setting to 0" ) loadAvg = 0 else: loadAvg = loadAvg['Value'] self.initialValues['LoadAverage'] = loadAvg self.parameters['LoadAverage'] = [] memUsed = self.getMemoryUsed() if not memUsed['OK']: self.log.warn( "Could not establish MemoryUsed, setting to 0" ) memUsed = 0 else: memUsed = memUsed['Value'] self.initialValues['MemoryUsed'] = memUsed self.parameters['MemoryUsed'] = [] result = self.processMonitor.getMemoryConsumed( self.wrapperPID ) self.log.verbose( 'Job Memory: %s' % ( result['Value'] ) ) if not result['OK']: self.log.warn( 'Could not get job memory usage' ) self.initialValues['Vsize'] = result['Value']['Vsize']/1024. self.initialValues['RSS'] = result['Value']['RSS']/1024. self.parameters['Vsize'] = [] self.parameters['RSS'] = [] result = self.getDiskSpace() self.log.verbose( 'DiskSpace: %s' % ( result ) ) if not result['OK']: self.log.warn( "Could not establish DiskSpace" ) self.initialValues['DiskSpace'] = result['Value'] self.parameters['DiskSpace'] = [] result = self.getNodeInformation() self.log.verbose( 'NodeInfo: %s' % ( result ) ) if not result['OK']: self.log.warn( "Could not establish static system information" ) if os.environ.has_key( 'LSB_JOBID' ): result['LocalJobID'] = os.environ['LSB_JOBID'] if os.environ.has_key( 'PBS_JOBID' ): result['LocalJobID'] = os.environ['PBS_JOBID'] if os.environ.has_key( 'QSUB_REQNAME' ): result['LocalJobID'] = os.environ['QSUB_REQNAME'] if os.environ.has_key( 'JOB_ID' ): result['LocalJobID'] = os.environ['JOB_ID'] self.__reportParameters( result, 'NodeInformation', True ) self.__reportParameters( self.initialValues, 'InitialValues' ) return S_OK() def __timeLeft( self ): """ return Normalized CPU time left in the batch system 0 if not available update self.timeLeft and self.littleTimeLeft """ # Get CPU time left in the batch system result = self.timeLeftUtil.getTimeLeft( 0.0 ) if not result['OK']: # Could not get CPU time left, we might need to wait for the first loop # or the Utility is not working properly for this batch system # or we are in a batch system timeLeft = 0 else: timeLeft = result['Value'] self.timeLeft = timeLeft if not self.littleTimeLeft: if timeLeft and timeLeft < self.grossTimeLeftLimit: self.log.info( 'TimeLeft bellow %s, now checking with higher frequency' % timeLeft ) self.littleTimeLeft = True # TODO: better configurable way of doing this to be coded self.littleTimeLeftCount = 15 else: if self.timeLeft and self.timeLeft < self.fineTimeLeftLimit: timeLeft = -1 return timeLeft ############################################################################# def __getUsageSummary( self ): """ Returns average load, memory etc. over execution of job thread """ summary = {} # CPUConsumed if self.parameters.has_key( 'CPUConsumed' ): cpuList = self.parameters['CPUConsumed'] if cpuList: hmsCPU = cpuList[-1] rawCPU = self.__convertCPUTime( hmsCPU ) if rawCPU['OK']: summary['LastUpdateCPU(s)'] = rawCPU['Value'] else: summary['LastUpdateCPU(s)'] = 'Could not be estimated' # DiskSpace if self.parameters.has_key( 'DiskSpace' ): space = self.parameters['DiskSpace'] if space: value = abs( float( space[-1] ) - float( self.initialValues['DiskSpace'] ) ) if value < 0: value = 0 summary['DiskSpace(MB)'] = value else: summary['DiskSpace(MB)'] = 'Could not be estimated' # MemoryUsed if self.parameters.has_key( 'MemoryUsed' ): memory = self.parameters['MemoryUsed'] if memory: summary['MemoryUsed(kb)'] = abs( float( memory[-1] ) - float( self.initialValues['MemoryUsed'] ) ) else: summary['MemoryUsed(kb)'] = 'Could not be estimated' # LoadAverage if self.parameters.has_key( 'LoadAverage' ): laList = self.parameters['LoadAverage'] if laList: summary['LoadAverage'] = float( sum( laList ) ) / float( len( laList ) ) else: summary['LoadAverage'] = 'Could not be estimated' result = self.__getWallClockTime() if not result['OK']: self.log.warn( "Failed determining wall clock time", result['Message'] ) summary['WallClockTime(s)'] = 0 else: wallClock = result['Value'] summary['WallClockTime(s)'] = wallClock self.__reportParameters( summary, 'UsageSummary', True ) self.currentStats = summary ############################################################################# def __reportParameters( self, params, title = None, report = False ): """Will report parameters for job. """ try: parameters = [] self.log.info( '==========================================================' ) if title: self.log.info( 'Watchdog will report %s' % ( title ) ) else: self.log.info( 'Watchdog will report parameters' ) self.log.info( '==========================================================' ) vals = params if params.has_key( 'Value' ): if vals['Value']: vals = params['Value'] for k, v in vals.items(): if v: self.log.info( str( k ) + ' = ' + str( v ) ) parameters.append( ( k, v ) ) if report: self.__setJobParamList( parameters ) self.log.info( '==========================================================' ) except Exception as x: self.log.warn( 'Problem while reporting parameters' ) self.log.warn( str( x ) ) ############################################################################# def __getWallClockTime( self ): """ Establishes the Wall Clock time spent since the Watchdog initialization""" result = S_OK() if self.initialValues.has_key( 'StartTime' ): currentTime = time.time() wallClock = currentTime - self.initialValues['StartTime'] result['Value'] = wallClock else: self.initialValues['StartTime'] = time.time() result['Value'] = 0.0 return result ############################################################################# def __killRunningThread( self ): """ Will kill the running thread process and any child processes.""" self.log.info( 'Sending kill signal to application PID %s' % ( self.spObject.getChildPID() ) ) result = self.spObject.killChild() self.applicationKilled = True self.log.info( 'Subprocess.killChild() returned:%s ' % ( result ) ) return S_OK( 'Thread killed' ) ############################################################################# def __sendSignOfLife( self, jobID, heartBeatDict, staticParamDict ): """ Sends sign of life 'heartbeat' signal and triggers control signal interpretation. """ jobReport = RPCClient( 'WorkloadManagement/JobStateUpdate', timeout = 120 ) result = jobReport.sendHeartBeat( jobID, heartBeatDict, staticParamDict ) if not result['OK']: self.log.warn( 'Problem sending sign of life' ) self.log.warn( result ) if result['OK'] and result['Value']: self.__interpretControlSignal( result['Value'] ) return result ############################################################################# def __setJobParamList( self, value ): """Wraps around setJobParameters of state update client """ # job wrapper template sets the jobID variable if not os.environ.has_key( 'JOBID' ): self.log.info( 'Running without JOBID so parameters will not be reported' ) return S_OK() jobID = os.environ['JOBID'] jobReport = RPCClient( 'WorkloadManagement/JobStateUpdate', timeout = 120 ) jobParam = jobReport.setJobParameters( int( jobID ), value ) self.log.verbose( 'setJobParameters(%s,%s)' % ( jobID, value ) ) if not jobParam['OK']: self.log.warn( jobParam['Message'] ) return jobParam ############################################################################# def getNodeInformation( self ): """ Attempts to retrieve all static system information, should be overridden in a subclass""" methodName = 'getNodeInformation' self.log.warn( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' ) return S_ERROR( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' ) ############################################################################# def getLoadAverage( self ): """ Attempts to get the load average, should be overridden in a subclass""" methodName = 'getLoadAverage' self.log.warn( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' ) return S_ERROR( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' ) ############################################################################# def getMemoryUsed( self ): """ Attempts to get the memory used, should be overridden in a subclass""" methodName = 'getMemoryUsed' self.log.warn( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' ) return S_ERROR( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' ) ############################################################################# def getDiskSpace( self ): """ Attempts to get the available disk space, should be overridden in a subclass""" methodName = 'getDiskSpace' self.log.warn( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' ) return S_ERROR( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' )
class Watchdog: ############################################################################# def __init__(self, pid, exeThread, spObject, jobCPUtime, memoryLimit=0, systemFlag="linux2.4"): """ Constructor, takes system flag as argument. """ self.log = gLogger.getSubLogger("Watchdog") self.systemFlag = systemFlag self.exeThread = exeThread self.wrapperPID = pid self.appPID = self.exeThread.getCurrentPID() self.spObject = spObject self.jobCPUtime = jobCPUtime self.memoryLimit = memoryLimit self.calibration = 0 self.initialValues = {} self.parameters = {} self.peekFailCount = 0 self.peekRetry = 5 self.processMonitor = ProcessMonitor() self.checkError = "" self.currentStats = {} self.initialized = False self.count = 0 ############################################################################# def initialize(self, loops=0): """ Watchdog initialization. """ if self.initialized: self.log.info("Watchdog already initialized") return S_OK() else: self.initialized = True setup = gConfig.getValue("/DIRAC/Setup", "") if not setup: return S_ERROR("Can not get the DIRAC Setup value") wms_instance = getSystemInstance("WorkloadManagement") if not wms_instance: return S_ERROR("Can not get the WorkloadManagement system instance") self.section = "/Systems/WorkloadManagement/%s/JobWrapper" % wms_instance self.maxcount = loops self.log.verbose("Watchdog initialization") self.log.info("Attempting to Initialize Watchdog for: %s" % (self.systemFlag)) # Test control flags self.testWallClock = gConfig.getValue(self.section + "/CheckWallClockFlag", 1) self.testDiskSpace = gConfig.getValue(self.section + "/CheckDiskSpaceFlag", 1) self.testLoadAvg = gConfig.getValue(self.section + "/CheckLoadAvgFlag", 1) self.testCPUConsumed = gConfig.getValue(self.section + "/CheckCPUConsumedFlag", 1) self.testCPULimit = gConfig.getValue(self.section + "/CheckCPULimitFlag", 0) self.testMemoryLimit = gConfig.getValue(self.section + "/CheckMemoryLimitFlag", 0) self.testTimeLeft = gConfig.getValue(self.section + "/CheckTimeLeftFlag", 1) # Other parameters self.pollingTime = gConfig.getValue(self.section + "/PollingTime", 10) # 10 seconds self.checkingTime = gConfig.getValue(self.section + "/CheckingTime", 30 * 60) # 30 minute period self.minCheckingTime = gConfig.getValue(self.section + "/MinCheckingTime", 20 * 60) # 20 mins self.maxWallClockTime = gConfig.getValue(self.section + "/MaxWallClockTime", 3 * 24 * 60 * 60) # e.g. 4 days self.jobPeekFlag = gConfig.getValue(self.section + "/JobPeekFlag", 1) # on / off self.minDiskSpace = gConfig.getValue(self.section + "/MinDiskSpace", 10) # MB self.loadAvgLimit = gConfig.getValue(self.section + "/LoadAverageLimit", 1000) # > 1000 and jobs killed self.sampleCPUTime = gConfig.getValue(self.section + "/CPUSampleTime", 30 * 60) # e.g. up to 20mins sample self.jobCPUMargin = gConfig.getValue(self.section + "/JobCPULimitMargin", 20) # %age buffer before killing job self.minCPUWallClockRatio = gConfig.getValue(self.section + "/MinCPUWallClockRatio", 5) # ratio %age self.nullCPULimit = gConfig.getValue( self.section + "/NullCPUCountLimit", 5 ) # After 5 sample times return null CPU consumption kill job self.checkCount = 0 self.nullCPUCount = 0 if self.checkingTime < self.minCheckingTime: self.log.info( "Requested CheckingTime of %s setting to %s seconds (minimum)" % (self.checkingTime, self.minCheckingTime) ) self.checkingTime = self.minCheckingTime # The time left is returned in seconds @ 250 SI00 = 1 HS06, # the self.checkingTime and self.pollingTime are in seconds, # thus they need to be multiplied by a large enough factor self.grossTimeLeftLimit = 10 * self.checkingTime self.fineTimeLeftLimit = gConfig.getValue(self.section + "/TimeLeftLimit", 150 * self.pollingTime) self.timeLeftUtil = TimeLeft() self.timeLeft = 0 self.littleTimeLeft = False return S_OK() def run(self): """ The main watchdog execution method """ result = self.initialize() if not result["OK"]: gLogger.always("Can not start watchdog for the following reason") gLogger.always(result["Message"]) return result try: while True: gLogger.debug("Starting watchdog loop # %d" % self.count) start_cycle_time = time.time() result = self.execute() exec_cycle_time = time.time() - start_cycle_time if not result["OK"]: gLogger.error("Watchdog error during execution", result["Message"]) break elif result["Value"] == "Ended": break self.count += 1 if exec_cycle_time < self.pollingTime: time.sleep(self.pollingTime - exec_cycle_time) return S_OK() except Exception: gLogger.exception() return S_ERROR("Exception") ############################################################################# def execute(self): """ The main agent execution method of the Watchdog. """ if not self.exeThread.isAlive(): # print self.parameters self.__getUsageSummary() self.log.info("Process to monitor has completed, Watchdog will exit.") return S_OK("Ended") if self.littleTimeLeft: # if we have gone over enough iterations query again if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1: self.checkError = "Job has reached the CPU limit of the queue" self.log.error(self.checkError, self.timeLeft) self.__killRunningThread() return S_OK() else: self.littleTimeLeftCount -= 1 # Note: need to poll regularly to see if the thread is alive # but only perform checks with a certain frequency if (time.time() - self.initialValues["StartTime"]) > self.checkingTime * self.checkCount: self.checkCount += 1 result = self.__performChecks() if not result["OK"]: self.log.warn("Problem during recent checks") self.log.warn(result["Message"]) return S_OK() else: # self.log.debug('Application thread is alive: checking count is %s' %(self.checkCount)) return S_OK() ############################################################################# def __performChecks(self): """The Watchdog checks are performed at a different period to the checking of the application thread and correspond to the checkingTime. """ self.log.verbose("------------------------------------") self.log.verbose("Checking loop starts for Watchdog") heartBeatDict = {} msg = "" result = self.getLoadAverage() msg += "LoadAvg: %s " % (result["Value"]) heartBeatDict["LoadAverage"] = result["Value"] if not self.parameters.has_key("LoadAverage"): self.parameters["LoadAverage"] = [] self.parameters["LoadAverage"].append(result["Value"]) result = self.getMemoryUsed() msg += "MemUsed: %.1f kb " % (result["Value"]) heartBeatDict["MemoryUsed"] = result["Value"] if not self.parameters.has_key("MemoryUsed"): self.parameters["MemoryUsed"] = [] self.parameters["MemoryUsed"].append(result["Value"]) result = self.processMonitor.getMemoryConsumed(self.wrapperPID) if result["OK"]: vsize = result["Value"]["Vsize"] / 1024.0 rss = result["Value"]["RSS"] / 1024.0 heartBeatDict["Vsize"] = vsize heartBeatDict["RSS"] = rss self.parameters.setdefault("Vsize", []) self.parameters["Vsize"].append(vsize) self.parameters.setdefault("RSS", []) self.parameters["RSS"].append(rss) msg += "Job Vsize: %.1f kb " % vsize msg += "Job RSS: %.1f kb " % rss result = self.getDiskSpace() msg += "DiskSpace: %.1f MB " % (result["Value"]) if not self.parameters.has_key("DiskSpace"): self.parameters["DiskSpace"] = [] self.parameters["DiskSpace"].append(result["Value"]) heartBeatDict["AvailableDiskSpace"] = result["Value"] result = self.__getCPU() msg += "CPU: %s (h:m:s) " % (result["Value"]) if not self.parameters.has_key("CPUConsumed"): self.parameters["CPUConsumed"] = [] self.parameters["CPUConsumed"].append(result["Value"]) hmsCPU = result["Value"] rawCPU = self.__convertCPUTime(hmsCPU) if rawCPU["OK"]: heartBeatDict["CPUConsumed"] = rawCPU["Value"] result = self.__getWallClockTime() msg += "WallClock: %.2f s " % (result["Value"]) self.parameters["WallClockTime"].append(result["Value"]) heartBeatDict["WallClockTime"] = result["Value"] self.log.info(msg) result = self.__checkProgress() if not result["OK"]: self.checkError = result["Message"] self.log.warn(self.checkError) if self.jobPeekFlag: result = self.__peek() if result["OK"]: outputList = result["Value"] size = len(outputList) self.log.info("Last %s lines of available application output:" % (size)) self.log.info("================START================") for line in outputList: self.log.info(line) self.log.info("=================END=================") self.__killRunningThread() return S_OK() recentStdOut = "None" if self.jobPeekFlag: result = self.__peek() if result["OK"]: outputList = result["Value"] size = len(outputList) recentStdOut = "Last %s lines of application output from Watchdog on %s [UTC]:" % ( size, Time.dateTime(), ) border = "=" * len(recentStdOut) cpuTotal = "Last reported CPU consumed for job is %s (h:m:s)" % (hmsCPU) if self.timeLeft: cpuTotal += ", Batch Queue Time Left %s (s @ HS06)" % self.timeLeft recentStdOut = "\n%s\n%s\n%s\n%s\n" % (border, recentStdOut, cpuTotal, border) self.log.info(recentStdOut) for line in outputList: self.log.info(line) recentStdOut += line + "\n" else: recentStdOut = ( "Watchdog is initializing and will attempt to obtain standard output from application thread" ) self.log.info(recentStdOut) self.peekFailCount += 1 if self.peekFailCount > self.peekRetry: self.jobPeekFlag = 0 self.log.warn("Turning off job peeking for remainder of execution") if not os.environ.has_key("JOBID"): self.log.info("Running without JOBID so parameters will not be reported") return S_OK() jobID = os.environ["JOBID"] staticParamDict = {"StandardOutput": recentStdOut} self.__sendSignOfLife(int(jobID), heartBeatDict, staticParamDict) return S_OK("Watchdog checking cycle complete") ############################################################################# def __getCPU(self): """Uses os.times() to get CPU time and returns HH:MM:SS after conversion. """ cpuTime = "00:00:00" try: cpuTime = self.processMonitor.getCPUConsumed(self.wrapperPID) except Exception: self.log.warn("Could not determine CPU time consumed with exception") self.log.exception() return S_OK(cpuTime) # just return null CPU if not cpuTime["OK"]: self.log.warn("Problem while checking consumed CPU") self.log.warn(cpuTime) return S_OK("00:00:00") # again return null CPU in this case cpuTime = cpuTime["Value"] self.log.verbose("Raw CPU time consumed (s) = %s" % (cpuTime)) result = self.__getCPUHMS(cpuTime) return result ############################################################################# def __getCPUHMS(self, cpuTime): mins, secs = divmod(cpuTime, 60) hours, mins = divmod(mins, 60) humanTime = "%02d:%02d:%02d" % (hours, mins, secs) self.log.verbose("Human readable CPU time is: %s" % humanTime) return S_OK(humanTime) ############################################################################# def __interpretControlSignal(self, signalDict): """This method is called whenever a signal is sent via the result of sending a sign of life. """ self.log.info("Received control signal") if type(signalDict) == type({}): if signalDict.has_key("Kill"): self.log.info("Received Kill signal, stopping job via control signal") self.checkError = "Received Kill signal" self.__killRunningThread() else: self.log.info("The following control signal was sent but not understood by the watchdog:") self.log.info(signalDict) else: self.log.info("Expected dictionary for control signal, received:\n%s" % (signalDict)) return S_OK() ############################################################################# def __checkProgress(self): """This method calls specific tests to determine whether the job execution is proceeding normally. CS flags can easily be added to add or remove tests via central configuration. """ report = "" if self.testWallClock: result = self.__checkWallClockTime() report += "WallClock: OK, " if not result["OK"]: self.log.warn(result["Message"]) return result else: report += "WallClock: NA," if self.testDiskSpace: result = self.__checkDiskSpace() report += "DiskSpace: OK, " if not result["OK"]: self.log.warn(result["Message"]) return result else: report += "DiskSpace: NA," if self.testLoadAvg: result = self.__checkLoadAverage() report += "LoadAverage: OK, " if not result["OK"]: self.log.warn(result["Message"]) return result else: report += "LoadAverage: NA," if self.testCPUConsumed: result = self.__checkCPUConsumed() report += "CPUConsumed: OK, " if not result["OK"]: return result else: report += "CPUConsumed: NA, " if self.testCPULimit: result = self.__checkCPULimit() report += "CPULimit OK, " if not result["OK"]: self.log.warn(result["Message"]) return result else: report += "CPULimit: NA, " if self.testTimeLeft: self.__timeLeft() if self.timeLeft: report += "TimeLeft: OK" else: report += "TimeLeft: NA" if self.testMemoryLimit: result = self.__checkMemoryLimit() report += "MemoryLimit OK, " if not result["OK"]: self.log.warn(result["Message"]) return result else: report += "MemoryLimit: NA, " self.log.info(report) return S_OK("All enabled checks passed") ############################################################################# def __checkCPUConsumed(self): """ Checks whether the CPU consumed by application process is reasonable. This method will report stalled jobs to be killed. """ self.log.info("Checking CPU Consumed") if "WallClockTime" not in self.parameters: return S_ERROR("Missing WallClockTime info") if "CPUConsumed" not in self.parameters: return S_ERROR("Missing CPUConsumed info") wallClockTime = self.parameters["WallClockTime"][-1] if wallClockTime < self.sampleCPUTime: self.log.info( "Stopping check, wallclock time (%s) is still smalled than sample time (%s)" % (wallClockTime, self.sampleCPUTime) ) return S_OK() intervals = max(1, int(self.sampleCPUTime / self.checkingTime)) if len(self.parameters["CPUConsumed"]) < intervals + 1: self.log.info( "Not enough snapshots to calculate, there are %s and we need %s" % (len(self.parameters["CPUConsumed"]), intervals + 1) ) return S_OK() wallClockTime = self.parameters["WallClockTime"][-1] - self.parameters["WallClockTime"][-1 - intervals] try: cpuTime = self.__convertCPUTime(self.parameters["CPUConsumed"][-1])["Value"] # For some reason, some times the CPU consumed estimation returns 0 # if cpuTime == 0: # return S_OK() cpuTime -= self.__convertCPUTime(self.parameters["CPUConsumed"][-1 - intervals])["Value"] if cpuTime < 0: self.log.warn("Consumed CPU time negative, something wrong may have happened, ignore") return S_OK() if wallClockTime <= 0: self.log.warn("Wallclock time should not be negative or zero, Ignore") return S_OK() ratio = (cpuTime / wallClockTime) * 100.0 self.log.info("CPU/Wallclock ratio is %.2f%%" % ratio) # in case of error cpuTime might be 0, exclude this if ratio < self.minCPUWallClockRatio: if os.path.exists("DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK"): self.log.info( "N.B. job would be declared as stalled but CPU / WallClock check is disabled by payload" ) return S_OK() self.log.info("Job is stalled!") return S_ERROR("Watchdog identified this job as stalled") except Exception, e: self.log.error("Cannot convert CPU consumed from string to int", str(e)) return S_OK()
class Watchdog(object): ############################################################################# def __init__(self, pid, exeThread, spObject, jobCPUTime, memoryLimit=0, processors=1, systemFlag='linux', jobArgs={}): """ Constructor, takes system flag as argument. """ self.stopSigStartSeconds = int(jobArgs.get('StopSigStartSeconds', 1800)) # 30 minutes self.stopSigFinishSeconds = int( jobArgs.get('StopSigFinishSeconds', 1800)) # 30 minutes self.stopSigNumber = int(jobArgs.get('StopSigNumber', 2)) # SIGINT self.stopSigRegex = jobArgs.get('StopSigRegex', None) self.stopSigSent = False self.log = gLogger.getSubLogger("Watchdog") self.systemFlag = systemFlag self.exeThread = exeThread self.wrapperPID = pid self.appPID = self.exeThread.getCurrentPID() self.spObject = spObject self.jobCPUTime = jobCPUTime self.memoryLimit = memoryLimit self.calibration = 0 self.initialValues = {} self.parameters = {} self.peekFailCount = 0 self.peekRetry = 5 self.processMonitor = ProcessMonitor() self.checkError = '' self.currentStats = {} self.initialized = False self.count = 0 # defaults self.testWallClock = 1 self.testDiskSpace = 1 self.testLoadAvg = 1 self.maxWallClockTime = 3 * 24 * 60 * 60 self.testCPUConsumed = 1 self.testCPULimit = 0 self.testMemoryLimit = 0 self.testTimeLeft = 1 self.pollingTime = 10 # 10 seconds self.checkingTime = 30 * 60 # 30 minute period self.minCheckingTime = 20 * 60 # 20 mins self.wallClockCheckSeconds = 5 * 60 # 5 minutes self.maxWallClockTime = 3 * 24 * 60 * 60 # e.g. 4 days self.jobPeekFlag = 1 # on / off self.minDiskSpace = 10 # MB self.loadAvgLimit = 1000 # > 1000 and jobs killed self.sampleCPUTime = 30 * 60 # e.g. up to 20mins sample self.jobCPUMargin = 20 # %age buffer before killing job self.minCPUWallClockRatio = 5 # ratio %age self.nullCPULimit = 5 # After 5 sample times return null CPU consumption kill job self.checkCount = 0 self.wallClockCheckCount = 0 self.nullCPUCount = 0 self.grossTimeLeftLimit = 10 * self.checkingTime self.timeLeftUtil = TimeLeft() self.timeLeft = 0 self.littleTimeLeft = False self.scaleFactor = 1.0 self.processors = processors ############################################################################# def initialize(self, loops=0): """ Watchdog initialization. """ if self.initialized: self.log.info('Watchdog already initialized') return S_OK() else: self.initialized = True setup = gConfig.getValue('/DIRAC/Setup', '') if not setup: return S_ERROR('Can not get the DIRAC Setup value') wms_instance = getSystemInstance("WorkloadManagement") if not wms_instance: return S_ERROR( 'Can not get the WorkloadManagement system instance') self.section = '/Systems/WorkloadManagement/%s/JobWrapper' % wms_instance self.maxcount = loops self.log.verbose('Watchdog initialization') self.log.info('Attempting to Initialize Watchdog for: %s' % (self.systemFlag)) # Test control flags self.testWallClock = gConfig.getValue( self.section + '/CheckWallClockFlag', 1) self.testDiskSpace = gConfig.getValue( self.section + '/CheckDiskSpaceFlag', 1) self.testLoadAvg = gConfig.getValue(self.section + '/CheckLoadAvgFlag', 1) self.testCPUConsumed = gConfig.getValue( self.section + '/CheckCPUConsumedFlag', 1) self.testCPULimit = gConfig.getValue( self.section + '/CheckCPULimitFlag', 0) self.testMemoryLimit = gConfig.getValue( self.section + '/CheckMemoryLimitFlag', 0) self.testTimeLeft = gConfig.getValue( self.section + '/CheckTimeLeftFlag', 1) # Other parameters self.pollingTime = gConfig.getValue(self.section + '/PollingTime', 10) # 10 seconds self.checkingTime = gConfig.getValue(self.section + '/CheckingTime', 30 * 60) # 30 minute period self.minCheckingTime = gConfig.getValue( self.section + '/MinCheckingTime', 20 * 60) # 20 mins self.maxWallClockTime = gConfig.getValue( self.section + '/MaxWallClockTime', 3 * 24 * 60 * 60) # e.g. 4 days self.jobPeekFlag = gConfig.getValue(self.section + '/JobPeekFlag', 1) # on / off self.minDiskSpace = gConfig.getValue(self.section + '/MinDiskSpace', 10) # MB self.loadAvgLimit = gConfig.getValue( self.section + '/LoadAverageLimit', 1000) # > 1000 and jobs killed self.sampleCPUTime = gConfig.getValue(self.section + '/CPUSampleTime', 30 * 60) # e.g. up to 20mins sample self.jobCPUMargin = gConfig.getValue( self.section + '/JobCPULimitMargin', 20) # %age buffer before killing job self.minCPUWallClockRatio = gConfig.getValue( self.section + '/MinCPUWallClockRatio', 5) # ratio %age # After 5 sample times return null CPU consumption kill job self.nullCPULimit = gConfig.getValue( self.section + '/NullCPUCountLimit', 5) if self.checkingTime < self.minCheckingTime: self.log.info( 'Requested CheckingTime of %s setting to %s seconds (minimum)' % (self.checkingTime, self.minCheckingTime)) self.checkingTime = self.minCheckingTime # The time left is returned in seconds @ 250 SI00 = 1 HS06, # the self.checkingTime and self.pollingTime are in seconds, # thus they need to be multiplied by a large enough factor self.fineTimeLeftLimit = gConfig.getValue( self.section + '/TimeLeftLimit', 150 * self.pollingTime) self.scaleFactor = gConfig.getValue('/LocalSite/CPUScalingFactor', 1.0) return S_OK() def run(self): """ The main watchdog execution method """ result = self.initialize() if not result['OK']: self.log.always('Can not start watchdog for the following reason') self.log.always(result['Message']) return result try: while True: self.log.debug('Starting watchdog loop # %d' % self.count) start_cycle_time = time.time() result = self.execute() exec_cycle_time = time.time() - start_cycle_time if not result['OK']: self.log.error("Watchdog error during execution", result['Message']) break elif result['Value'] == "Ended": break self.count += 1 if exec_cycle_time < self.pollingTime: time.sleep(self.pollingTime - exec_cycle_time) return S_OK() except Exception: self.log.exception() return S_ERROR('Exception') ############################################################################# def execute(self): """ The main agent execution method of the Watchdog. """ if not self.exeThread.isAlive(): self.__getUsageSummary() self.log.info( 'Process to monitor has completed, Watchdog will exit.') return S_OK("Ended") # WallClock checks every self.wallClockCheckSeconds, but only if StopSigRegex is defined in JDL if not self.stopSigSent and self.stopSigRegex is not None and ( time.time() - self.initialValues['StartTime'] ) > self.wallClockCheckSeconds * self.wallClockCheckCount: self.wallClockCheckCount += 1 self._performWallClockChecks() if self.littleTimeLeft: # if we have gone over enough iterations query again if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1: self.checkError = 'Job has reached the CPU limit of the queue' self.log.error(self.checkError, self.timeLeft) self.__killRunningThread() return S_OK() else: self.littleTimeLeftCount -= 1 # Note: need to poll regularly to see if the thread is alive # but only perform checks with a certain frequency if (time.time() - self.initialValues['StartTime'] ) > self.checkingTime * self.checkCount: self.checkCount += 1 result = self._performChecks() if not result['OK']: self.log.warn('Problem during recent checks') self.log.warn(result['Message']) return S_OK() else: # self.log.debug('Application thread is alive: checking count is %s' %(self.checkCount)) return S_OK() ############################################################################# def _performWallClockChecks(self): """Watchdog performs the wall clock checks based on MJF. Signals are sent to processes if we need to stop, but function always returns S_OK() """ mjf = MJF.MJF() try: wallClockSecondsLeft = mjf.getWallClockSecondsLeft() except Exception as e: # Just stop if we can't get the wall clock seconds left return S_OK() jobstartSeconds = mjf.getIntJobFeature('jobstart_secs') if jobstartSeconds is None: # Just stop if we don't know when the job started return S_OK() if (int(time.time()) > jobstartSeconds + self.stopSigStartSeconds) and \ (wallClockSecondsLeft < self.stopSigFinishSeconds + self.wallClockCheckSeconds): # Need to send the signal! Assume it works to avoid sending the signal more than once self.log.info('Sending signal %d to JobWrapper children' % self.stopSigNumber) self.stopSigSent = True try: for childPid in getChildrenPIDs(self.wrapperPID): try: cmdline = open('/proc/%d/cmdline' % childPid, 'r').read().replace('\0', ' ').strip() except IOError: # Process gone away? Not running on Linux? Skip anyway continue if re.search(self.stopSigRegex, cmdline) is not None: self.log.info( 'Sending signal %d to process ID %d, cmdline = "%s"' % (self.stopSigNumber, childPid, cmdline)) os.kill(childPid, self.stopSigNumber) except Exception as e: self.log.error( 'Failed to send signals to JobWrapper children! (%s)' % str(e)) return S_OK() ############################################################################# def _performChecks(self): """The Watchdog checks are performed at a different period to the checking of the application thread and correspond to the checkingTime. """ self.log.verbose('------------------------------------') self.log.verbose('Checking loop starts for Watchdog') heartBeatDict = {} msg = '' loadAvg = self.getLoadAverage() if not loadAvg['OK']: msg += 'LoadAvg: ERROR' else: loadAvg = loadAvg['Value'] msg += 'LoadAvg: %d ' % loadAvg heartBeatDict['LoadAverage'] = loadAvg if 'LoadAverage' not in self.parameters: self.parameters['LoadAverage'] = [] self.parameters['LoadAverage'].append(loadAvg) memoryUsed = self.getMemoryUsed() if not memoryUsed['OK']: msg += 'MemUsed: ERROR ' else: memoryUsed = memoryUsed['Value'] msg += 'MemUsed: %.1f kb ' % (memoryUsed) heartBeatDict['MemoryUsed'] = memoryUsed if 'MemoryUsed' not in self.parameters: self.parameters['MemoryUsed'] = [] self.parameters['MemoryUsed'].append(memoryUsed) result = self.processMonitor.getMemoryConsumed(self.wrapperPID) if result['OK']: vsize = result['Value']['Vsize'] / 1024. rss = result['Value']['RSS'] / 1024. heartBeatDict['Vsize'] = vsize heartBeatDict['RSS'] = rss self.parameters.setdefault('Vsize', []) self.parameters['Vsize'].append(vsize) self.parameters.setdefault('RSS', []) self.parameters['RSS'].append(rss) msg += "Job Vsize: %.1f kb " % vsize msg += "Job RSS: %.1f kb " % rss result = self.getDiskSpace() if not result['OK']: self.log.warn("Could not establish DiskSpace", result['Message']) else: msg += 'DiskSpace: %.1f MB ' % (result['Value']) if 'DiskSpace' not in self.parameters: self.parameters['DiskSpace'] = [] if result['OK']: self.parameters['DiskSpace'].append(result['Value']) heartBeatDict['AvailableDiskSpace'] = result['Value'] cpu = self.__getCPU() if not cpu['OK']: msg += 'CPU: ERROR ' hmsCPU = 0 else: cpu = cpu['Value'] msg += 'CPU: %s (h:m:s) ' % (cpu) if 'CPUConsumed' not in self.parameters: self.parameters['CPUConsumed'] = [] self.parameters['CPUConsumed'].append(cpu) hmsCPU = cpu rawCPU = self.__convertCPUTime(hmsCPU) if rawCPU['OK']: heartBeatDict['CPUConsumed'] = rawCPU['Value'] result = self.__getWallClockTime() if not result['OK']: self.log.warn("Failed determining wall clock time", result['Message']) else: msg += 'WallClock: %.2f s ' % (result['Value']) self.parameters.setdefault('WallClockTime', list()).append(result['Value']) heartBeatDict['WallClockTime'] = result['Value'] self.log.info(msg) result = self._checkProgress() if not result['OK']: self.checkError = result['Message'] self.log.warn(self.checkError) if self.jobPeekFlag: result = self.__peek() if result['OK']: outputList = result['Value'] size = len(outputList) self.log.info( 'Last %s lines of available application output:' % (size)) self.log.info('================START================') for line in outputList: self.log.info(line) self.log.info('=================END=================') self.__killRunningThread() return S_OK() recentStdOut = 'None' if self.jobPeekFlag: result = self.__peek() if result['OK']: outputList = result['Value'] size = len(outputList) recentStdOut = 'Last %s lines of application output from Watchdog on %s [UTC]:' % ( size, Time.dateTime()) border = '=' * len(recentStdOut) cpuTotal = 'Last reported CPU consumed for job is %s (h:m:s)' % ( hmsCPU) if self.timeLeft: cpuTotal += ', Batch Queue Time Left %s (s @ HS06)' % self.timeLeft recentStdOut = '\n%s\n%s\n%s\n%s\n' % (border, recentStdOut, cpuTotal, border) self.log.info(recentStdOut) for line in outputList: self.log.info(line) recentStdOut += line + '\n' else: recentStdOut = 'Watchdog is initializing and will attempt to obtain standard output from application thread' self.log.info(recentStdOut) self.peekFailCount += 1 if self.peekFailCount > self.peekRetry: self.jobPeekFlag = 0 self.log.warn( 'Turning off job peeking for remainder of execution') if 'JOBID' not in os.environ: self.log.info( 'Running without JOBID so parameters will not be reported') return S_OK() jobID = os.environ['JOBID'] staticParamDict = {'StandardOutput': recentStdOut} self.__sendSignOfLife(int(jobID), heartBeatDict, staticParamDict) return S_OK('Watchdog checking cycle complete') ############################################################################# def __getCPU(self): """Uses os.times() to get CPU time and returns HH:MM:SS after conversion. """ try: cpuTime = self.processMonitor.getCPUConsumed(self.wrapperPID) if not cpuTime['OK']: self.log.warn('Problem while checking consumed CPU') return cpuTime cpuTime = cpuTime['Value'] if cpuTime: self.log.verbose("Raw CPU time consumed (s) = %s" % (cpuTime)) return self.__getCPUHMS(cpuTime) else: self.log.error("CPU time consumed found to be 0") return S_ERROR() except Exception as e: self.log.warn( 'Could not determine CPU time consumed with exception') self.log.exception(e) return S_ERROR( "Could not determine CPU time consumed with exception") ############################################################################# def __getCPUHMS(self, cpuTime): mins, secs = divmod(cpuTime, 60) hours, mins = divmod(mins, 60) humanTime = '%02d:%02d:%02d' % (hours, mins, secs) self.log.verbose('Human readable CPU time is: %s' % humanTime) return S_OK(humanTime) ############################################################################# def __interpretControlSignal(self, signalDict): """This method is called whenever a signal is sent via the result of sending a sign of life. """ self.log.info('Received control signal') if isinstance(signalDict, dict): if 'Kill' in signalDict: self.log.info( 'Received Kill signal, stopping job via control signal') self.checkError = 'Received Kill signal' self.__killRunningThread() else: self.log.info( 'The following control signal was sent but not understood by the watchdog:' ) self.log.info(signalDict) else: self.log.info( 'Expected dictionary for control signal, received:\n%s' % (signalDict)) return S_OK() ############################################################################# def _checkProgress(self): """This method calls specific tests to determine whether the job execution is proceeding normally. CS flags can easily be added to add or remove tests via central configuration. """ report = '' if self.testWallClock: result = self.__checkWallClockTime() if not result['OK']: self.log.warn(result['Message']) return result report += 'WallClock: OK, ' else: report += 'WallClock: NA,' if self.testDiskSpace: result = self.__checkDiskSpace() if not result['OK']: self.log.warn(result['Message']) return result report += 'DiskSpace: OK, ' else: report += 'DiskSpace: NA,' if self.testLoadAvg: result = self.__checkLoadAverage() if not result['OK']: self.log.warn( "Check of load average failed, but won't fail because of that: %s" % result['Message']) report += 'LoadAverage: ERROR, ' return S_OK() report += 'LoadAverage: OK, ' else: report += 'LoadAverage: NA,' if self.testCPUConsumed: result = self.__checkCPUConsumed() if not result['OK']: return result report += 'CPUConsumed: OK, ' else: report += 'CPUConsumed: NA, ' if self.testCPULimit: result = self.__checkCPULimit() if not result['OK']: self.log.warn(result['Message']) return result report += 'CPULimit OK, ' else: report += 'CPULimit: NA, ' if self.testTimeLeft: self.__timeLeft() if self.timeLeft: report += 'TimeLeft: OK' else: report += 'TimeLeft: NA' if self.testMemoryLimit: result = self.__checkMemoryLimit() if not result['OK']: self.log.warn(result['Message']) return result report += 'MemoryLimit OK, ' else: report += 'MemoryLimit: NA, ' self.log.info(report) return S_OK('All enabled checks passed') ############################################################################# def __checkCPUConsumed(self): """ Checks whether the CPU consumed by application process is reasonable. This method will report stalled jobs to be killed. """ self.log.info("Checking CPU Consumed") if 'WallClockTime' not in self.parameters: return S_ERROR('Missing WallClockTime info') if 'CPUConsumed' not in self.parameters: return S_ERROR('Missing CPUConsumed info') wallClockTime = self.parameters['WallClockTime'][-1] if wallClockTime < self.sampleCPUTime: self.log.info( "Stopping check, wallclock time (%s) is still smaller than sample time (%s)" % (wallClockTime, self.sampleCPUTime)) return S_OK() intervals = max(1, int(self.sampleCPUTime / self.checkingTime)) if len(self.parameters['CPUConsumed']) < intervals + 1: self.log.info( "Not enough snapshots to calculate, there are %s and we need %s" % (len(self.parameters['CPUConsumed']), intervals + 1)) return S_OK() wallClockTime = self.parameters['WallClockTime'][-1] - self.parameters[ 'WallClockTime'][-1 - intervals] try: cpuTime = self.__convertCPUTime( self.parameters['CPUConsumed'][-1])['Value'] # For some reason, some times the CPU consumed estimation returns 0 # if cpuTime == 0: # return S_OK() cpuTime -= self.__convertCPUTime( self.parameters['CPUConsumed'][-1 - intervals])['Value'] if cpuTime < 0: self.log.warn( 'Consumed CPU time negative, something wrong may have happened, ignore' ) return S_OK() if wallClockTime <= 0: self.log.warn( 'Wallclock time should not be negative or zero, Ignore') return S_OK() ratio = (cpuTime / wallClockTime) * 100. self.log.info("CPU/Wallclock ratio is %.2f%%" % ratio) # in case of error cpuTime might be 0, exclude this if ratio < self.minCPUWallClockRatio: if os.path.exists('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK'): self.log.info( 'N.B. job would be declared as stalled but CPU / WallClock check is disabled by payload' ) return S_OK() self.log.info("Job is stalled!") return S_ERROR('Watchdog identified this job as stalled') except Exception as e: self.log.error("Cannot convert CPU consumed from string to int", str(e)) return S_OK() ############################################################################# def __convertCPUTime(self, cputime): """ Method to convert the CPU time as returned from the Watchdog instances to the equivalent DIRAC normalized CPU time to be compared to the Job CPU requirement. """ cpuValue = 0 cpuHMS = cputime.split(':') # for i in xrange( len( cpuHMS ) ): # cpuHMS[i] = cpuHMS[i].replace( '00', '0' ) try: hours = float(cpuHMS[0]) * 60 * 60 mins = float(cpuHMS[1]) * 60 secs = float(cpuHMS[2]) cpuValue = float(hours + mins + secs) except Exception as x: self.log.warn(str(x)) return S_ERROR('Could not calculate CPU time') # Normalization to be implemented normalizedCPUValue = cpuValue result = S_OK() result['Value'] = normalizedCPUValue self.log.debug('CPU value %s converted to %s' % (cputime, normalizedCPUValue)) return result ############################################################################# def __checkCPULimit(self): """ Checks that the job has consumed more than the job CPU requirement (plus a configurable margin) and kills them as necessary. """ consumedCPU = 0 if 'CPUConsumed' in self.parameters: consumedCPU = self.parameters['CPUConsumed'][-1] consumedCPUDict = self.__convertCPUTime(consumedCPU) if consumedCPUDict['OK']: currentCPU = consumedCPUDict['Value'] else: return S_OK('Not possible to determine current CPU consumed') if consumedCPU: limit = self.jobCPUTime + self.jobCPUTime * (self.jobCPUMargin / 100) cpuConsumed = float(currentCPU) if cpuConsumed > limit: self.log.info( 'Job has consumed more than the specified CPU limit with an additional %s%% margin' % (self.jobCPUMargin)) return S_ERROR('Job has exceeded maximum CPU time limit') else: return S_OK('Job within CPU limit') elif not currentCPU: self.log.verbose('Both initial and current CPU consumed are null') return S_OK('CPU consumed is not measurable yet') else: return S_OK('Not possible to determine CPU consumed') def __checkMemoryLimit(self): """ Checks that the job memory consumption is within a limit """ if 'Vsize' in self.parameters: vsize = self.parameters['Vsize'][-1] if vsize and self.memoryLimit: if vsize > self.memoryLimit: vsize = vsize # Just a warning for the moment self.log.warn( "Job has consumed %f.2 KB of memory with the limit of %f.2 KB" % (vsize, self.memoryLimit)) return S_OK() ############################################################################# def __checkDiskSpace(self): """Checks whether the CS defined minimum disk space is available. """ if 'DiskSpace' in self.parameters: availSpace = self.parameters['DiskSpace'][-1] if availSpace >= 0 and availSpace < self.minDiskSpace: self.log.info( 'Not enough local disk space for job to continue, defined in CS as %s MB' % (self.minDiskSpace)) return S_ERROR('Job has insufficient disk space to continue') else: return S_OK('Job has enough disk space available') else: return S_ERROR('Available disk space could not be established') ############################################################################# def __checkWallClockTime(self): """Checks whether the job has been running for the CS defined maximum wall clock time. """ if 'StartTime' in self.initialValues: startTime = self.initialValues['StartTime'] if time.time() - startTime > self.maxWallClockTime: self.log.info( 'Job has exceeded maximum wall clock time of %s seconds' % (self.maxWallClockTime)) return S_ERROR('Job has exceeded maximum wall clock time') else: return S_OK('Job within maximum wall clock time') else: return S_ERROR('Job start time could not be established') ############################################################################# def __checkLoadAverage(self): """Checks whether the CS defined maximum load average is exceeded. """ if 'LoadAverage' in self.parameters: loadAvg = self.parameters['LoadAverage'][-1] if loadAvg > float(self.loadAvgLimit): self.log.info( 'Maximum load average exceeded, defined in CS as %s ' % (self.loadAvgLimit)) return S_ERROR('Job exceeded maximum load average') else: return S_OK('Job running with normal load average') else: return S_ERROR('Job load average not established') ############################################################################# def __peek(self): """ Uses ExecutionThread.getOutput() method to obtain standard output from running thread via subprocess callback function. """ result = self.exeThread.getOutput() if not result['OK']: self.log.warn( 'Could not obtain output from running application thread') self.log.warn(result['Message']) return result ############################################################################# def calibrate(self): """ The calibrate method obtains the initial values for system memory and load and calculates the margin for error for the rest of the Watchdog cycle. """ self.__getWallClockTime() self.parameters['WallClockTime'] = [] cpuConsumed = self.__getCPU() if not cpuConsumed['OK']: self.log.warn("Could not establish CPU consumed, setting to 0.0") cpuConsumed = 0.0 else: cpuConsumed = cpuConsumed['Value'] self.initialValues['CPUConsumed'] = cpuConsumed self.parameters['CPUConsumed'] = [] loadAvg = self.getLoadAverage() if not loadAvg['OK']: self.log.warn("Could not establish LoadAverage, setting to 0") loadAvg = 0 else: loadAvg = loadAvg['Value'] self.initialValues['LoadAverage'] = loadAvg self.parameters['LoadAverage'] = [] memUsed = self.getMemoryUsed() if not memUsed['OK']: self.log.warn("Could not establish MemoryUsed, setting to 0") memUsed = 0 else: memUsed = memUsed['Value'] self.initialValues['MemoryUsed'] = memUsed self.parameters['MemoryUsed'] = [] result = self.processMonitor.getMemoryConsumed(self.wrapperPID) self.log.verbose('Job Memory: %s' % (result['Value'])) if not result['OK']: self.log.warn('Could not get job memory usage') self.initialValues['Vsize'] = result['Value']['Vsize'] / 1024. self.initialValues['RSS'] = result['Value']['RSS'] / 1024. self.parameters['Vsize'] = [] self.parameters['RSS'] = [] result = self.getDiskSpace() self.log.verbose('DiskSpace: %s' % (result)) if not result['OK']: self.log.warn("Could not establish DiskSpace") self.initialValues['DiskSpace'] = result['Value'] self.parameters['DiskSpace'] = [] result = self.getNodeInformation() self.log.verbose('NodeInfo: %s' % (result)) if not result['OK']: self.log.warn("Could not establish static system information") if 'LSB_JOBID' in os.environ: result['LocalJobID'] = os.environ['LSB_JOBID'] if 'PBS_JOBID' in os.environ: result['LocalJobID'] = os.environ['PBS_JOBID'] if 'QSUB_REQNAME' in os.environ: result['LocalJobID'] = os.environ['QSUB_REQNAME'] if 'JOB_ID' in os.environ: result['LocalJobID'] = os.environ['JOB_ID'] self.__reportParameters(result, 'NodeInformation', True) self.__reportParameters(self.initialValues, 'InitialValues') return S_OK() def __timeLeft(self): """ return Normalized CPU time left in the batch system 0 if not available update self.timeLeft and self.littleTimeLeft """ # Get CPU time left in the batch system result = self.timeLeftUtil.getTimeLeft(0.0) if not result['OK']: # Could not get CPU time left, we might need to wait for the first loop # or the Utility is not working properly for this batch system # or we are in a batch system timeLeft = 0 else: timeLeft = result['Value'] self.timeLeft = timeLeft if not self.littleTimeLeft: if timeLeft and timeLeft < self.grossTimeLeftLimit: self.log.info( 'TimeLeft bellow %s, now checking with higher frequency' % timeLeft) self.littleTimeLeft = True # TODO: better configurable way of doing this to be coded self.littleTimeLeftCount = 15 else: if self.timeLeft and self.timeLeft < self.fineTimeLeftLimit: timeLeft = -1 return timeLeft ############################################################################# def __getUsageSummary(self): """ Returns average load, memory etc. over execution of job thread """ summary = {} # CPUConsumed if 'CPUConsumed' in self.parameters: cpuList = self.parameters['CPUConsumed'] if cpuList: hmsCPU = cpuList[-1] rawCPU = self.__convertCPUTime(hmsCPU) if rawCPU['OK']: summary['LastUpdateCPU(s)'] = rawCPU['Value'] else: summary['LastUpdateCPU(s)'] = 'Could not be estimated' # DiskSpace if 'DiskSpace' in self.parameters: space = self.parameters['DiskSpace'] if space: value = abs( float(space[-1]) - float(self.initialValues['DiskSpace'])) if value < 0: value = 0 summary['DiskSpace(MB)'] = value else: summary['DiskSpace(MB)'] = 'Could not be estimated' # MemoryUsed if 'MemoryUsed' in self.parameters: memory = self.parameters['MemoryUsed'] if memory: summary['MemoryUsed(kb)'] = abs( float(memory[-1]) - float(self.initialValues['MemoryUsed'])) else: summary['MemoryUsed(kb)'] = 'Could not be estimated' # LoadAverage if 'LoadAverage' in self.parameters: laList = self.parameters['LoadAverage'] if laList: summary['LoadAverage'] = float(sum(laList)) / float( len(laList)) else: summary['LoadAverage'] = 'Could not be estimated' result = self.__getWallClockTime() if not result['OK']: self.log.warn("Failed determining wall clock time", result['Message']) summary['WallClockTime(s)'] = 0 summary['ScaledCPUTime(s)'] = 0 else: wallClock = result['Value'] summary['WallClockTime(s)'] = wallClock summary[ 'ScaledCPUTime(s)'] = wallClock * self.scaleFactor * self.processors self.__reportParameters(summary, 'UsageSummary', True) self.currentStats = summary ############################################################################# def __reportParameters(self, params, title=None, report=False): """Will report parameters for job. """ try: parameters = [] self.log.info( '==========================================================') if title: self.log.info('Watchdog will report %s' % (title)) else: self.log.info('Watchdog will report parameters') self.log.info( '==========================================================') vals = params if 'Value' in params: if vals['Value']: vals = params['Value'] for k, v in vals.items(): if v: self.log.info(str(k) + ' = ' + str(v)) parameters.append((k, v)) if report: self.__setJobParamList(parameters) self.log.info( '==========================================================') except Exception as x: self.log.warn('Problem while reporting parameters') self.log.warn(str(x)) ############################################################################# def __getWallClockTime(self): """ Establishes the Wall Clock time spent since the Watchdog initialization""" result = S_OK() if 'StartTime' in self.initialValues: currentTime = time.time() wallClock = currentTime - self.initialValues['StartTime'] result['Value'] = wallClock else: self.initialValues['StartTime'] = time.time() result['Value'] = 0.0 return result ############################################################################# def __killRunningThread(self): """ Will kill the running thread process and any child processes.""" self.log.info('Sending kill signal to application PID %s' % (self.spObject.getChildPID())) result = self.spObject.killChild() self.applicationKilled = True self.log.info('Subprocess.killChild() returned:%s ' % (result)) return S_OK('Thread killed') ############################################################################# def __sendSignOfLife(self, jobID, heartBeatDict, staticParamDict): """ Sends sign of life 'heartbeat' signal and triggers control signal interpretation. """ jobReport = RPCClient('WorkloadManagement/JobStateUpdate', timeout=120) result = jobReport.sendHeartBeat(jobID, heartBeatDict, staticParamDict) if not result['OK']: self.log.warn('Problem sending sign of life') self.log.warn(result) if result['OK'] and result['Value']: self.__interpretControlSignal(result['Value']) return result ############################################################################# def __setJobParamList(self, value): """Wraps around setJobParameters of state update client """ # job wrapper template sets the jobID variable if 'JOBID' not in os.environ: self.log.info( 'Running without JOBID so parameters will not be reported') return S_OK() jobID = os.environ['JOBID'] jobReport = RPCClient('WorkloadManagement/JobStateUpdate', timeout=120) jobParam = jobReport.setJobParameters(int(jobID), value) self.log.verbose('setJobParameters(%s,%s)' % (jobID, value)) if not jobParam['OK']: self.log.warn(jobParam['Message']) return jobParam ############################################################################# def getNodeInformation(self): """ Attempts to retrieve all static system information, should be overridden in a subclass""" methodName = 'getNodeInformation' self.log.warn('Watchdog: ' + methodName + ' method should be implemented in a subclass') return S_ERROR('Watchdog: ' + methodName + ' method should be implemented in a subclass') ############################################################################# def getLoadAverage(self): """ Attempts to get the load average, should be overridden in a subclass""" methodName = 'getLoadAverage' self.log.warn('Watchdog: ' + methodName + ' method should be implemented in a subclass') return S_ERROR('Watchdog: ' + methodName + ' method should be implemented in a subclass') ############################################################################# def getMemoryUsed(self): """ Attempts to get the memory used, should be overridden in a subclass""" methodName = 'getMemoryUsed' self.log.warn('Watchdog: ' + methodName + ' method should be implemented in a subclass') return S_ERROR('Watchdog: ' + methodName + ' method should be implemented in a subclass') ############################################################################# def getDiskSpace(self): """ Attempts to get the available disk space, should be overridden in a subclass""" methodName = 'getDiskSpace' self.log.warn('Watchdog: ' + methodName + ' method should be implemented in a subclass') return S_ERROR('Watchdog: ' + methodName + ' method should be implemented in a subclass')
class Watchdog(object): ############################################################################# def __init__(self, pid, exeThread, spObject, jobCPUtime, memoryLimit=0, systemFlag='linux2.4'): """ Constructor, takes system flag as argument. """ self.log = gLogger.getSubLogger("Watchdog") self.systemFlag = systemFlag self.exeThread = exeThread self.wrapperPID = pid self.appPID = self.exeThread.getCurrentPID() self.spObject = spObject self.jobCPUtime = jobCPUtime self.memoryLimit = memoryLimit self.calibration = 0 self.initialValues = {} self.parameters = {} self.peekFailCount = 0 self.peekRetry = 5 self.processMonitor = ProcessMonitor() self.checkError = '' self.currentStats = {} self.initialized = False self.count = 0 #defaults self.testWallClock = 1 self.testDiskSpace = 1 self.testLoadAvg = 1 self.maxWallClockTime = 3 * 24 * 60 * 60 self.testCPUConsumed = 1 self.testCPULimit = 0 self.testMemoryLimit = 0 self.testTimeLeft = 1 self.pollingTime = 10 # 10 seconds self.checkingTime = 30 * 60 # 30 minute period self.minCheckingTime = 20 * 60 # 20 mins self.maxWallClockTime = 3 * 24 * 60 * 60 # e.g. 4 days self.jobPeekFlag = 1 # on / off self.minDiskSpace = 10 # MB self.loadAvgLimit = 1000 # > 1000 and jobs killed self.sampleCPUTime = 30 * 60 # e.g. up to 20mins sample self.jobCPUMargin = 20 # %age buffer before killing job self.minCPUWallClockRatio = 5 # ratio %age self.nullCPULimit = 5 # After 5 sample times return null CPU consumption kill job self.checkCount = 0 self.nullCPUCount = 0 self.grossTimeLeftLimit = 10 * self.checkingTime self.timeLeftUtil = TimeLeft() self.timeLeft = 0 self.littleTimeLeft = False ############################################################################# def initialize(self, loops=0): """ Watchdog initialization. """ if self.initialized: self.log.info('Watchdog already initialized') return S_OK() else: self.initialized = True setup = gConfig.getValue('/DIRAC/Setup', '') if not setup: return S_ERROR('Can not get the DIRAC Setup value') wms_instance = getSystemInstance("WorkloadManagement") if not wms_instance: return S_ERROR( 'Can not get the WorkloadManagement system instance') self.section = '/Systems/WorkloadManagement/%s/JobWrapper' % wms_instance self.maxcount = loops self.log.verbose('Watchdog initialization') self.log.info('Attempting to Initialize Watchdog for: %s' % (self.systemFlag)) # Test control flags self.testWallClock = gConfig.getValue( self.section + '/CheckWallClockFlag', 1) self.testDiskSpace = gConfig.getValue( self.section + '/CheckDiskSpaceFlag', 1) self.testLoadAvg = gConfig.getValue(self.section + '/CheckLoadAvgFlag', 1) self.testCPUConsumed = gConfig.getValue( self.section + '/CheckCPUConsumedFlag', 1) self.testCPULimit = gConfig.getValue( self.section + '/CheckCPULimitFlag', 0) self.testMemoryLimit = gConfig.getValue( self.section + '/CheckMemoryLimitFlag', 0) self.testTimeLeft = gConfig.getValue( self.section + '/CheckTimeLeftFlag', 1) # Other parameters self.pollingTime = gConfig.getValue(self.section + '/PollingTime', 10) # 10 seconds self.checkingTime = gConfig.getValue(self.section + '/CheckingTime', 30 * 60) # 30 minute period self.minCheckingTime = gConfig.getValue( self.section + '/MinCheckingTime', 20 * 60) # 20 mins self.maxWallClockTime = gConfig.getValue( self.section + '/MaxWallClockTime', 3 * 24 * 60 * 60) # e.g. 4 days self.jobPeekFlag = gConfig.getValue(self.section + '/JobPeekFlag', 1) # on / off self.minDiskSpace = gConfig.getValue(self.section + '/MinDiskSpace', 10) # MB self.loadAvgLimit = gConfig.getValue( self.section + '/LoadAverageLimit', 1000) # > 1000 and jobs killed self.sampleCPUTime = gConfig.getValue(self.section + '/CPUSampleTime', 30 * 60) # e.g. up to 20mins sample self.jobCPUMargin = gConfig.getValue( self.section + '/JobCPULimitMargin', 20) # %age buffer before killing job self.minCPUWallClockRatio = gConfig.getValue( self.section + '/MinCPUWallClockRatio', 5) # ratio %age self.nullCPULimit = gConfig.getValue( self.section + '/NullCPUCountLimit', 5) # After 5 sample times return null CPU consumption kill job if self.checkingTime < self.minCheckingTime: self.log.info( 'Requested CheckingTime of %s setting to %s seconds (minimum)' % (self.checkingTime, self.minCheckingTime)) self.checkingTime = self.minCheckingTime # The time left is returned in seconds @ 250 SI00 = 1 HS06, # the self.checkingTime and self.pollingTime are in seconds, # thus they need to be multiplied by a large enough factor self.fineTimeLeftLimit = gConfig.getValue( self.section + '/TimeLeftLimit', 150 * self.pollingTime) return S_OK() def run(self): """ The main watchdog execution method """ result = self.initialize() if not result['OK']: self.log.always('Can not start watchdog for the following reason') self.log.always(result['Message']) return result try: while True: self.log.debug('Starting watchdog loop # %d' % self.count) start_cycle_time = time.time() result = self.execute() exec_cycle_time = time.time() - start_cycle_time if not result['OK']: self.log.error("Watchdog error during execution", result['Message']) break elif result['Value'] == "Ended": break self.count += 1 if exec_cycle_time < self.pollingTime: time.sleep(self.pollingTime - exec_cycle_time) return S_OK() except Exception: self.log.exception() return S_ERROR('Exception') ############################################################################# def execute(self): """ The main agent execution method of the Watchdog. """ if not self.exeThread.isAlive(): self.__getUsageSummary() self.log.info( 'Process to monitor has completed, Watchdog will exit.') return S_OK("Ended") if self.littleTimeLeft: # if we have gone over enough iterations query again if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1: self.checkError = 'Job has reached the CPU limit of the queue' self.log.error(self.checkError, self.timeLeft) self.__killRunningThread() return S_OK() else: self.littleTimeLeftCount -= 1 # Note: need to poll regularly to see if the thread is alive # but only perform checks with a certain frequency if (time.time() - self.initialValues['StartTime'] ) > self.checkingTime * self.checkCount: self.checkCount += 1 result = self._performChecks() if not result['OK']: self.log.warn('Problem during recent checks') self.log.warn(result['Message']) return S_OK() else: # self.log.debug('Application thread is alive: checking count is %s' %(self.checkCount)) return S_OK() ############################################################################# def _performChecks(self): """The Watchdog checks are performed at a different period to the checking of the application thread and correspond to the checkingTime. """ self.log.verbose('------------------------------------') self.log.verbose('Checking loop starts for Watchdog') heartBeatDict = {} msg = '' loadAvg = self.getLoadAverage() if not loadAvg['OK']: msg += 'LoadAvg: ERROR' else: loadAvg = loadAvg['Value'] msg += 'LoadAvg: %d ' % loadAvg heartBeatDict['LoadAverage'] = loadAvg if not self.parameters.has_key('LoadAverage'): self.parameters['LoadAverage'] = [] self.parameters['LoadAverage'].append(loadAvg) memoryUsed = self.getMemoryUsed() if not memoryUsed['OK']: msg += 'MemUsed: ERROR ' else: memoryUsed = memoryUsed['Value'] msg += 'MemUsed: %.1f kb ' % (memoryUsed) heartBeatDict['MemoryUsed'] = memoryUsed if not self.parameters.has_key('MemoryUsed'): self.parameters['MemoryUsed'] = [] self.parameters['MemoryUsed'].append(memoryUsed) result = self.processMonitor.getMemoryConsumed(self.wrapperPID) if result['OK']: vsize = result['Value']['Vsize'] / 1024. rss = result['Value']['RSS'] / 1024. heartBeatDict['Vsize'] = vsize heartBeatDict['RSS'] = rss self.parameters.setdefault('Vsize', []) self.parameters['Vsize'].append(vsize) self.parameters.setdefault('RSS', []) self.parameters['RSS'].append(rss) msg += "Job Vsize: %.1f kb " % vsize msg += "Job RSS: %.1f kb " % rss result = self.getDiskSpace() if not result['OK']: self.log.warn("Could not establish DiskSpace", result['Message']) else: msg += 'DiskSpace: %.1f MB ' % (result['Value']) if not self.parameters.has_key('DiskSpace'): self.parameters['DiskSpace'] = [] if result['OK']: self.parameters['DiskSpace'].append(result['Value']) heartBeatDict['AvailableDiskSpace'] = result['Value'] cpu = self.__getCPU() if not cpu['OK']: msg += 'CPU: ERROR ' hmsCPU = 0 else: cpu = cpu['Value'] msg += 'CPU: %s (h:m:s) ' % (cpu) if not self.parameters.has_key('CPUConsumed'): self.parameters['CPUConsumed'] = [] self.parameters['CPUConsumed'].append(cpu) hmsCPU = cpu rawCPU = self.__convertCPUTime(hmsCPU) if rawCPU['OK']: heartBeatDict['CPUConsumed'] = rawCPU['Value'] result = self.__getWallClockTime() if not result['OK']: self.log.warn("Failed determining wall clock time", result['Message']) else: msg += 'WallClock: %.2f s ' % (result['Value']) self.parameters.setdefault('WallClockTime', list()).append(result['Value']) heartBeatDict['WallClockTime'] = result['Value'] self.log.info(msg) result = self._checkProgress() if not result['OK']: self.checkError = result['Message'] self.log.warn(self.checkError) if self.jobPeekFlag: result = self.__peek() if result['OK']: outputList = result['Value'] size = len(outputList) self.log.info( 'Last %s lines of available application output:' % (size)) self.log.info('================START================') for line in outputList: self.log.info(line) self.log.info('=================END=================') self.__killRunningThread() return S_OK() recentStdOut = 'None' if self.jobPeekFlag: result = self.__peek() if result['OK']: outputList = result['Value'] size = len(outputList) recentStdOut = 'Last %s lines of application output from Watchdog on %s [UTC]:' % ( size, Time.dateTime()) border = '=' * len(recentStdOut) cpuTotal = 'Last reported CPU consumed for job is %s (h:m:s)' % ( hmsCPU) if self.timeLeft: cpuTotal += ', Batch Queue Time Left %s (s @ HS06)' % self.timeLeft recentStdOut = '\n%s\n%s\n%s\n%s\n' % (border, recentStdOut, cpuTotal, border) self.log.info(recentStdOut) for line in outputList: self.log.info(line) recentStdOut += line + '\n' else: recentStdOut = 'Watchdog is initializing and will attempt to obtain standard output from application thread' self.log.info(recentStdOut) self.peekFailCount += 1 if self.peekFailCount > self.peekRetry: self.jobPeekFlag = 0 self.log.warn( 'Turning off job peeking for remainder of execution') if not os.environ.has_key('JOBID'): self.log.info( 'Running without JOBID so parameters will not be reported') return S_OK() jobID = os.environ['JOBID'] staticParamDict = {'StandardOutput': recentStdOut} self.__sendSignOfLife(int(jobID), heartBeatDict, staticParamDict) return S_OK('Watchdog checking cycle complete') ############################################################################# def __getCPU(self): """Uses os.times() to get CPU time and returns HH:MM:SS after conversion. """ try: cpuTime = self.processMonitor.getCPUConsumed(self.wrapperPID) if not cpuTime['OK']: self.log.warn('Problem while checking consumed CPU') return cpuTime cpuTime = cpuTime['Value'] if cpuTime: self.log.verbose("Raw CPU time consumed (s) = %s" % (cpuTime)) return self.__getCPUHMS(cpuTime) else: self.log.error("CPU time consumed found to be 0") return S_ERROR() except Exception, e: self.log.warn( 'Could not determine CPU time consumed with exception') self.log.exception(e) return S_ERROR( "Could not determine CPU time consumed with exception")