def testAKilledJobMonitoring(self):
    """
    _TestAKilledJobMonitoring_

    Simulate a job that is killed and verify that the data reported to
    the dashboard at each transition (job start, step start, step end,
    job killed) is correct.
    """
    # Build the fixtures: a job, its processing task and a failing report.
    testJob = self.createTestJob()
    processingTask = self.createWorkload().getTask(taskName="DataProcessing")
    failedReport = self.createReport(outcome=1)
    self.setupJobEnvironment(name='testC')

    # Dashboard reporter pointed at a local test destination.
    dashboardInfo = DashboardInfo(job=testJob, task=processingTask,
                                  dashboardUrl='127.0.0.1:8884')

    # The job-start message must flag the job as running at FNAL.
    startData = dashboardInfo.jobStart()
    for key, expected in [('MessageType', 'JobStatus'),
                          ('StatusValue', 'running'),
                          ('StatusDestination', "T1_US_FNAL"),
                          ('taskId', 'wmagent_Tier1ReReco')]:
        self.assertEqual(startData[key], expected)

    # First (and only) step exercised by this test.
    cmsRunStep = processingTask.getStep(stepName="cmsRun1")

    # Step start: both the nested jobStart record and the numbered
    # ExeStart field must carry the step name.
    stepStartData = dashboardInfo.stepStart(step=cmsRunStep.data)
    self.assertNotEqual(stepStartData['jobStart'], None)
    self.assertEqual(stepStartData['jobStart']['ExeStart'], cmsRunStep.name())
    self.assertEqual(stepStartData['jobStart']['WNHostName'], socket.gethostname())
    self.assertEqual(stepStartData['1_ExeStart'], cmsRunStep.name())

    # Step end: the failing report must surface as a non-zero exit code.
    stepEndData = dashboardInfo.stepEnd(step=cmsRunStep.data,
                                        stepReport=failedReport)
    self.assertEqual(stepEndData['1_ExeEnd'], cmsRunStep.name())
    self.assertNotEqual(stepEndData['1_ExeExitCode'], 0)
    self.assertTrue(stepEndData['1_ExeWCTime'] >= 0)

    # Kill the job and check the termination record.
    killData = dashboardInfo.jobKilled()
    self.assertEqual(killData['ExeEnd'], "cmsRun1")
    self.assertNotEqual(killData['JobExitCode'], 0)
    self.assertEqual(killData['WrapperCPUTime'], 0)
    self.assertTrue(killData['WrapperWCTime'] >= 0)
    self.assertNotEqual(killData['JobExitReason'].find('killed'), -1)
    return
def testAKilledJobMonitoring(self):
    """
    _TestAKilledJobMonitoring_

    Simulate a job that is killed and check that the data sent to the
    dashboard destination is correct at every stage of the job.
    """
    # Fixtures: a job, its task and a report whose outcome marks a failure.
    job = self.createTestJob()
    task = self.createWorkload().getTask(taskName="DataProcessing")
    report = self.createReport(outcome=1)
    self.setupJobEnvironment(name='testC')

    # Reporter wired to a local test destination via addDestination.
    dbInfo = DashboardInfo(job=job, task=task)
    dbInfo.addDestination('127.0.0.1', 8884)

    # Job start must announce a running job bound for FNAL.
    payload = dbInfo.jobStart()
    self.assertEqual(payload['MessageType'], 'JobStatus')
    self.assertEqual(payload['StatusValue'], 'running')
    self.assertEqual(payload['StatusDestination'], "T1_US_FNAL")
    self.assertEqual(payload['taskId'], 'wmagent_Tier1ReReco')

    # Exercise the single cmsRun step.
    step = task.getStep(stepName="cmsRun1")

    # Step start: ExeStart fields must carry the step name and this host.
    payload = dbInfo.stepStart(step=step.data)
    self.assertNotEqual(payload['jobStart'], None)
    self.assertEqual(payload['jobStart']['ExeStart'], step.name())
    self.assertEqual(payload['jobStart']['WNHostName'], socket.gethostname())
    self.assertEqual(payload['1_ExeStart'], step.name())

    # Step end: the failed report must produce a non-zero exit code.
    payload = dbInfo.stepEnd(step=step.data, stepReport=report)
    self.assertEqual(payload['1_ExeEnd'], step.name())
    self.assertNotEqual(payload['1_ExeExitCode'], 0)
    self.assertTrue(payload['1_ExeWCTime'] >= 0)

    # Finally kill the job and validate the termination record.
    payload = dbInfo.jobKilled()
    self.assertEqual(payload['ExeEnd'], "cmsRun1")
    self.assertNotEqual(payload['JobExitCode'], 0)
    self.assertEqual(payload['WrapperCPUTime'], 0)
    self.assertTrue(payload['WrapperWCTime'] >= 0)
    self.assertNotEqual(payload['JobExitReason'].find('killed'), -1)
    return
class DashboardMonitor(WMRuntimeMonitor):
    """
    _DashboardMonitor_

    Run in the background and pass information to
    the DashboardInterface instance.

    If the job exceeds timeouts, kill the job.

    Every dashboard call is wrapped in a broad try/except: monitoring is
    best-effort and must never take down the job it is watching.
    """

    def __init__(self):
        # Timing/step bookkeeping; populated by stepStart/stepEnd.
        self.startTime = None
        self.currentStep = None
        self.currentStepName = None
        self.currentStepSpace = None
        # Filled in by initMonitor.
        self.task = None
        self.job = None
        self.dashboardInfo = None
        WMRuntimeMonitor.__init__(self)

    def initMonitor(self, task, job, logPath, args=None):
        """
        Handles the monitor initiation.

        :param task: the WMTask being monitored
        :param job: the job being monitored
        :param logPath: path to the framework job report (unused here)
        :param args: optional dict with 'destinationHost',
                     'destinationPort' and 'cores' entries
        """
        # NOTE: default changed from a shared mutable {} to None to avoid
        # the mutable-default-argument pitfall; behavior is unchanged.
        args = args or {}
        logging.info("In DashboardMonitor.initMonitor")
        self.task = task
        self.job = job
        destHost = args.get('destinationHost', None)
        destPort = args.get('destinationPort', None)
        dashboardUrl = '%s:%s' % (destHost, str(destPort))
        cores = args.get('cores', 0)
        self.dashboardInfo = DashboardInfo(task, job,
                                           dashboardUrl=dashboardUrl,
                                           overrideCores=cores)

    def jobStart(self, task):
        """
        Job start notifier.
        """
        try:
            self.dashboardInfo.jobStart()
        except Exception as ex:
            # Best-effort: log and carry on, never kill the job.
            logging.error(str(ex))
            logging.error(str(traceback.format_exc()))
        return

    def jobEnd(self, task):
        """
        Job End notification
        """
        try:
            self.dashboardInfo.jobEnd()
        except Exception as ex:
            logging.error(str(ex))
            logging.error(str(traceback.format_exc()))
        return

    def stepStart(self, step):
        """
        Step start notification
        """
        self.currentStep = step
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None
        self.startTime = time.time()
        try:
            self.dashboardInfo.stepStart(step=step)
        except Exception as ex:
            logging.error(str(ex))
            logging.error(str(traceback.format_exc()))
        return

    def stepEnd(self, step, stepReport):
        """
        Step end notification
        """
        self.currentStep = None
        self.currentStepName = None
        self.currentStepSpace = None
        try:
            self.dashboardInfo.stepEnd(step=step, stepReport=stepReport)
        except Exception as ex:
            logging.error(str(ex))
            logging.error(str(traceback.format_exc()))
        return

    def stepKilled(self, step):
        """
        Step killed notification
        """
        self.currentStep = None
        self.currentStepName = None
        try:
            self.dashboardInfo.stepKilled(step=step)
        except Exception as ex:
            logging.error(str(ex))
            logging.error(str(traceback.format_exc()))
        return

    def jobKilled(self, task):
        """
        Killed job notification
        """
        try:
            self.dashboardInfo.jobKilled()
        except Exception as ex:
            logging.error(str(ex))
            logging.error(str(traceback.format_exc()))
        return

    def periodicUpdate(self):
        """
        Run on the defined intervals. Tell the dashboard info
        to run the periodic update
        """
        try:
            self.dashboardInfo.periodicUpdate()
        except Exception as ex:
            logging.error(str(ex))
            logging.error(str(traceback.format_exc()))
        return
class DashboardMonitor(WMRuntimeMonitor):
    """
    _DashboardMonitor_

    Run in the background and pass information to
    the DashboardInterface instance.

    If the job exceeds timeouts, kill the job: first a soft kill
    (SIGUSR2), then, after the hard timeout, SIGTERM escalating to
    SIGKILL.
    """

    def __init__(self):
        # Timing/step bookkeeping; populated by stepStart/stepEnd.
        self.startTime = None
        self.currentStep = None
        self.currentStepName = None
        self.currentStepSpace = None
        # Timeout configuration and kill state, set in initMonitor.
        self.softTimeOut = None
        self.hardTimeOut = None
        self.killFlag = False
        self.cmsswFile = None
        self.task = None
        self.job = None
        self.dashboardInfo = None
        WMRuntimeMonitor.__init__(self)

    def initMonitor(self, task, job, logPath, args=None):
        """
        Handles the monitor initiation.

        :param task: the WMTask being monitored
        :param job: the job being monitored
        :param logPath: path to the framework job report, used to record
                        a JobTimeout error if the job must be killed
        :param args: optional dict with 'softTimeOut', 'hardTimeOut',
                     'destinationHost' and 'destinationPort' entries
        """
        # NOTE: default changed from a shared mutable {} to None to avoid
        # the mutable-default-argument pitfall; behavior is unchanged.
        args = args or {}
        logging.info("In DashboardMonitor.initMonitor")
        self.task = task
        self.job = job
        self.logPath = logPath
        self.softTimeOut = args.get('softTimeOut', None)
        self.hardTimeOut = args.get('hardTimeOut', None)
        destHost = args.get('destinationHost', None)
        destPort = args.get('destinationPort', None)
        self.dashboardInfo = DashboardInfo(task=task, job=job)
        if destHost and destPort:
            logging.info("About to set destination to %s:%s" % (destHost, destPort))
            self.dashboardInfo.addDestination(host=destHost, port=destPort)

    def jobStart(self, task):
        """
        Job start notifier.
        """
        self.dashboardInfo.jobStart()
        return

    def jobEnd(self, task):
        """
        Job End notification
        """
        self.dashboardInfo.jobEnd()
        return

    def stepStart(self, step):
        """
        Step start notification
        """
        self.currentStep = step
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None
        self.startTime = time.time()
        self.dashboardInfo.stepStart(step=step)
        return

    def stepEnd(self, step, stepReport):
        """
        Step end notification
        """
        self.currentStep = None
        self.currentStepName = None
        self.currentStepSpace = None
        self.dashboardInfo.stepEnd(step=step, stepReport=stepReport)
        return

    def stepKilled(self, step):
        """
        Step killed notification
        """
        self.currentStep = None
        self.currentStepName = None
        self.dashboardInfo.stepKilled(step=step)

    def jobKilled(self, task):
        """
        Killed job notification
        """
        self.dashboardInfo.jobKilled()
        return

    def periodicUpdate(self):
        """
        Run on the defined intervals. Forward the periodic update to the
        dashboard and enforce the soft/hard kill timeouts on the current
        step's process.
        """
        if not self.currentStep:
            # We're probably between steps
            return

        self.dashboardInfo.periodicUpdate()

        # Check for events
        if self.cmsswFile:
            # BUGFIX: was searchForEvent(file) — passing the builtin name
            # 'file' instead of the monitored CMSSW file.
            run, event = searchForEvent(self.cmsswFile)
            if run and event:
                # Then we actually found something, otherwise do nothing
                # Right now I don't know what to do
                pass

        # Do timeout
        if not self.softTimeOut:
            return

        if time.time() - self.startTime > self.softTimeOut:
            # Then we have to kill the process

            # If our stepName is None, we're inbetween steps. Nothing to kill!
            if self.currentStepName is None:
                return

            # If our stepName is valid, then we may need the stepSpace
            if self.currentStepSpace is None:
                self.currentStepSpace = getStepSpace(self.currentStepName)

            # First, get the PID
            stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

            # Now kill it!
            msg = ""
            msg += "Start Time: %s\n" % self.startTime
            msg += "Time Now: %s\n" % time.time()
            msg += "Timeout: %s\n" % self.softTimeOut
            msg += "Killing Job...\n"
            msg += "Process ID is: %s\n" % stepPID

            # If possible, write a FWJR
            report = Report.Report()
            try:
                self.logPath = os.path.join(self.currentStepSpace.location,
                                            '../../../',
                                            os.path.basename(self.logPath))
                if os.path.isfile(self.logPath):
                    # We should be able to find existant job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existant error report in DashboardMonitor termination.")
                    report.load(self.logPath)
                report.addError(stepName=self.currentStepName, exitCode=99901,
                                errorType="JobTimeout", errorDetails=msg)
                report.save(self.logPath)
            except Exception as ex:
                # BUGFIX: was Python 2 'except Exception, ex:' syntax.
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            if stepPID is None or stepPID == os.getpid():
                # Then we are supposed to kill things
                # that don't exist in separate processes:
                # Self-terminate
                msg += "WARNING: No separate process. Watchdog will attempt self-termination."
                logging.error(msg)
                os.abort()

            if time.time() - self.startTime < self.hardTimeOut or not self.killFlag:
                # Soft kill: ask the step to shut itself down.
                msg += "WARNING: Soft Kill Timeout has Expired:"
                logging.error(msg)
                os.kill(stepPID, signal.SIGUSR2)
                self.killFlag = True
            elif self.killFlag:
                # Hard kill: SIGTERM, and SIGKILL if the process survives.
                msg += "WARNING: Hard Kill Timeout has Expired:"
                logging.error(msg)
                os.kill(stepPID, signal.SIGTERM)
                killedpid, stat = os.waitpid(stepPID, os.WNOHANG)
                if killedpid == 0:
                    os.kill(stepPID, signal.SIGKILL)
                    killedpid, stat = os.waitpid(stepPID, os.WNOHANG)
                    if killedpid == 0:
                        logging.error("Can't kill job. Out of options. Waiting for system reboot.")
                        # Panic! It's unkillable!

        return
class DashboardMonitor(WMRuntimeMonitor):
    """
    _DashboardMonitor_

    Run in the background and pass information to
    the DashboardInterface instance.

    If the job exceeds timeouts, kill the job.

    All dashboard calls are wrapped in a broad try/except so that a
    monitoring failure can never abort the job being monitored.
    """

    def __init__(self):
        # Timing/step bookkeeping; populated by stepStart/stepEnd.
        self.startTime = None
        self.currentStep = None
        self.currentStepName = None
        self.currentStepSpace = None
        # Filled in by initMonitor.
        self.task = None
        self.job = None
        self.dashboardInfo = None
        WMRuntimeMonitor.__init__(self)

    def initMonitor(self, task, job, logPath, args=None):
        """
        Handles the monitor initiation.

        :param task: the WMTask being monitored
        :param job: the job being monitored
        :param logPath: path to the framework job report (unused here)
        :param args: optional dict with 'destinationHost',
                     'destinationPort' and 'cores' entries
        """
        # NOTE: default changed from a shared mutable {} to None to avoid
        # the mutable-default-argument pitfall; behavior is unchanged.
        args = args or {}
        logging.info("In DashboardMonitor.initMonitor")
        self.task = task
        self.job = job
        destHost = args.get('destinationHost', None)
        destPort = args.get('destinationPort', None)
        dashboardUrl = '%s:%s' % (destHost, str(destPort))
        cores = args.get('cores', 0)
        self.dashboardInfo = DashboardInfo(task, job,
                                           dashboardUrl=dashboardUrl,
                                           overrideCores=cores)

    def jobStart(self, task):
        """
        Job start notifier.
        """
        try:
            self.dashboardInfo.jobStart()
        except Exception as ex:
            # Best-effort: log and carry on, never kill the job.
            logging.error(str(ex))
            logging.error(str(traceback.format_exc()))
        return

    def jobEnd(self, task):
        """
        Job End notification
        """
        try:
            self.dashboardInfo.jobEnd()
        except Exception as ex:
            logging.error(str(ex))
            logging.error(str(traceback.format_exc()))
        return

    def stepStart(self, step):
        """
        Step start notification
        """
        self.currentStep = step
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None
        self.startTime = time.time()
        try:
            self.dashboardInfo.stepStart(step=step)
        except Exception as ex:
            logging.error(str(ex))
            logging.error(str(traceback.format_exc()))
        return

    def stepEnd(self, step, stepReport):
        """
        Step end notification
        """
        self.currentStep = None
        self.currentStepName = None
        self.currentStepSpace = None
        try:
            self.dashboardInfo.stepEnd(step=step, stepReport=stepReport)
        except Exception as ex:
            logging.error(str(ex))
            logging.error(str(traceback.format_exc()))
        return

    def stepKilled(self, step):
        """
        Step killed notification
        """
        self.currentStep = None
        self.currentStepName = None
        try:
            self.dashboardInfo.stepKilled(step=step)
        except Exception as ex:
            logging.error(str(ex))
            logging.error(str(traceback.format_exc()))
        return

    def jobKilled(self, task):
        """
        Killed job notification
        """
        try:
            self.dashboardInfo.jobKilled()
        except Exception as ex:
            logging.error(str(ex))
            logging.error(str(traceback.format_exc()))
        return

    def periodicUpdate(self):
        """
        Run on the defined intervals. Tell the dashboard info
        to run the periodic update
        """
        try:
            self.dashboardInfo.periodicUpdate()
        except Exception as ex:
            logging.error(str(ex))
            logging.error(str(traceback.format_exc()))
        return