def stepStart(self, step):
    """
    _stepStart_

    Point the monitor at the step that is about to run.

    Caches the step name and step space and disables monitoring when the
    step type is not one of ``self.watchStepTypes``.

    :param step: WMStep configuration node for the step being started.
    """
    stepHelper = WMStepHelper(step)
    self.currentStepName = getStepName(step)
    self.currentStepSpace = getStepSpace(stepHelper.name())

    stepType = stepHelper.stepType()
    if stepType not in self.watchStepTypes:
        # Not a watched step type: make periodicUpdate() a no-op for it.
        self.disableStep = True
        logging.debug("PerformanceMonitor ignoring step of type %s", stepType)
    else:
        logging.debug("Beginning PerformanceMonitor step initialization")
        self.disableStep = False
    return
def emulateAll(self):
    """
    _emulateAll_

    Traverse all Steps and load up the default Emulator based on type.
    """
    for node in self.task.steps().nodeIterator():
        stepHelper = WMStepHelper(node)
        emulatorType = stepHelper.stepType()
        emulatorName = stepHelper.name()
        self.addEmulator(emulatorName, emulatorType)
def emulateAll(self):
    """
    _emulateAll_

    Traverse all Steps and load up the default Emulator based on type.
    """
    for currentStep in self.task.steps().nodeIterator():
        wrapped = WMStepHelper(currentStep)
        typeOfStep = wrapped.stepType()
        nameOfStep = wrapped.name()
        self.addEmulator(nameOfStep, typeOfStep)
class PerformanceMonitor(WMRuntimeMonitor):
    """
    _PerformanceMonitor_

    Monitors the performance by pinging ps and recording data regarding
    the current step.  Kills the step (after writing a PerformanceError
    report) when it exceeds the configured maxRSS or maxVSize.
    """

    def __init__(self):
        """
        Actual variable initialization in initMonitor
        """
        self.pid = None
        self.uid = os.getuid()
        # Both %i slots receive the step PID; the grep keeps only that
        # process's line from the ps output.
        self.monitorBase = "ps -p %i -o pid,ppid,rss,vsize,pcpu,pmem,cmd -ww |grep %i"
        self.monitorCommand = None
        self.currentStepSpace = None
        self.currentStepName = None
        self.rss = []
        self.vsize = []
        self.pcpu = []
        self.pmem = []
        self.maxRSS = None
        self.maxVSize = None
        self.logPath = None
        self.watchStepTypes = []
        self.disableStep = False
        WMRuntimeMonitor.__init__(self)
        return

    def initMonitor(self, task, job, logPath, args=None):
        """
        _initMonitor_

        Puts together the information needed for the monitoring
        to actually find everything.

        :param task: task object (part of the monitor API, unused here).
        :param job: job object (part of the monitor API, unused here).
        :param logPath: path of the job report updated on a kill.
        :param args: optional dict with WatchStepTypes, maxRSS, maxVSize.
        """
        # None default instead of a mutable {} so successive calls cannot
        # share and mutate the same dict object.
        args = args or {}

        # Set the steps we want to watch
        self.watchStepTypes = args.get('WatchStepTypes', ['CMSSW', 'PerfTest'])

        self.maxRSS = args.get('maxRSS', None)
        self.maxVSize = args.get('maxVSize', None)

        self.logPath = os.path.join(logPath)
        return

    def stepStart(self, step):
        """
        _stepStart_

        Assure that the monitor is pointing at the right step
        """
        self.stepHelper = WMStepHelper(step)
        self.currentStepName = getStepName(step)
        # Step space is built lazily in periodicUpdate().
        self.currentStepSpace = None

        if self.stepHelper.stepType() not in self.watchStepTypes:
            self.disableStep = True
            logging.debug("PerformanceMonitor ignoring step of type %s",
                          self.stepHelper.stepType())
        else:
            logging.debug("Beginning PerformanceMonitor step initialization")
            self.disableStep = False
        return

    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Package the information and send it off
        """
        # NOTE(review): this early-return condition looks inverted (it
        # bails when the step WAS monitored), but it is preserved here
        # unchanged because all sibling variants share it — confirm intent.
        if not self.disableStep:
            # No information to correlate
            return

        self.currentStepName = None
        self.currentStepSpace = None
        return

    def periodicUpdate(self):
        """
        Run on the defined intervals.

        Polls ps for the step process; if RSS or VSize exceeds the
        configured maximum, writes a PerformanceError report and kills
        the step process.
        """
        killProc = False

        if self.disableStep:
            # Then we aren't doing CPU monitoring on this step
            return
        if self.currentStepName is None:
            # We're between steps
            return
        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)
        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)
        output = stdout.split()
        if not len(output) > 7:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % output
            # Report the command actually executed; self.monitorCommand is
            # never assigned and was always None here.
            msg += "command = %s\n" % cmd
            logging.error(msg)
            return

        rss = float(output[2])
        vsize = float(output[3])
        logging.info("Retrieved following performance figures:")
        logging.info("RSS: %s; VSize: %s; PCPU: %s; PMEM: %s",
                     output[2], output[3], output[4], output[5])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        if self.maxRSS is not None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
        if self.maxVSize is not None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True

        if killProc:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find existant job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existant error report in PerformanceMonitor termination.")
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError",
                                exitCode=99900,
                                errorType="PerformanceKill",
                                errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                # SIGUSR2 first so the payload can shut down gracefully;
                # fall back to SIGTERM if that delivery fails.
                logging.error("Attempting to kill job using SIGUSR2")
                os.kill(stepPID, signal.SIGUSR2)
            except Exception:
                os.kill(stepPID, signal.SIGTERM)

        return
class PerformanceMonitor(WMRuntimeMonitor):
    """
    _PerformanceMonitor_

    Monitors the performance by pinging ps and recording data regarding the current step
    """

    def __init__(self):
        """
        Actual variable initialization in initMonitor
        """
        self.pid = None
        self.uid = os.getuid()
        # ps poll: both %i slots get the step PID; grep keeps that PID's line.
        self.monitorBase = "ps -p %i -o pid,ppid,rss,pcpu,pmem,cmd -ww | grep %i"
        # Sums the Pss fields of /proc/<pid>/smaps (value reported in kB).
        self.pssMemoryCommand = "awk '/^Pss/ {pss += $2} END {print pss}' /proc/%i/smaps"
        self.monitorCommand = None
        self.currentStepSpace = None
        self.currentStepName = None
        self.rss = []
        self.pcpu = []
        self.pmem = []
        self.maxPSS = None
        self.softTimeout = None
        self.hardTimeout = None
        self.logPath = None
        self.startTime = None
        self.killRetry = False  # will trigger a hard (SIGTERM) instead of soft kill
        self.watchStepTypes = []
        self.disableStep = False
        WMRuntimeMonitor.__init__(self)
        return

    def initMonitor(self, task, job, logPath, args=None):
        """
        _initMonitor_

        Puts together the information needed for the monitoring
        to actually find everything.

        :param task: task object (part of the monitor API, unused here).
        :param job: job object (part of the monitor API, unused here).
        :param logPath: path of the job report updated on a kill.
        :param args: optional dict with WatchStepTypes, maxPSS (falls back
            to maxRSS), softTimeout, hardTimeout and cores.
        """
        args = args or {}

        # Set the steps we want to watch
        self.watchStepTypes = args.get('WatchStepTypes', ['CMSSW', 'PerfTest'])

        # maxRSS kept as a fallback key for older workloads.
        self.maxPSS = args.get('maxPSS', args.get('maxRSS'))
        self.softTimeout = args.get('softTimeout', None)
        self.hardTimeout = args.get('hardTimeout', None)
        self.numOfCores = args.get('cores', None)

        self.logPath = os.path.join(logPath)
        return

    def jobStart(self, task):
        """
        _jobStart_

        Acknowledge that the job has started and initialize the time
        """
        self.startTime = time.time()
        return

    def stepStart(self, step):
        """
        _stepStart_

        Assure that the monitor is pointing at the right step
        """
        self.stepHelper = WMStepHelper(step)
        self.currentStepName = getStepName(step)
        # Step space is built lazily in periodicUpdate().
        self.currentStepSpace = None

        if not self.stepHelper.stepType() in self.watchStepTypes:
            self.disableStep = True
            logging.debug("PerformanceMonitor ignoring step of type %s",
                          self.stepHelper.stepType())
            return
        else:
            logging.debug("Beginning PeformanceMonitor step Initialization")
            self.disableStep = False
        return

    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Package the information and send it off
        """
        # NOTE(review): this early-return looks inverted (it bails when the
        # step WAS monitored); preserved as-is — confirm intent upstream.
        if not self.disableStep:
            # No information to correlate
            return

        self.currentStepName = None
        self.currentStepSpace = None
        return

    def periodicUpdate(self):
        """
        Run on the defined intervals.

        Polls ps and /proc/<pid>/smaps for the step process; kills the step
        (writing a PerformanceError report once) when it exceeds maxPSS or
        the wallclock timeouts.
        """
        killProc = False
        killHard = False
        reason = ''
        # Exit codes recorded in the PerformanceError report, keyed by the
        # reason that triggered the kill ('' maps to the unknown code).
        errorCodeLookup = {'PSS': 50660,
                           'Wallclock time': 50664,
                           '': 99999}

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the ps monitor command and collate the data
        # Gathers RSS, %CPU and %MEM statistics from ps
        ps_cmd = self.monitorBase % (stepPID, stepPID)
        stdout, _stderr, _retcode = subprocessAlgos.runCommand(ps_cmd)

        ps_output = stdout.split()
        if not len(ps_output) > 6:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % ps_output
            msg += "command = %s\n" % ps_cmd
            logging.error(msg)
            return

        # run the command to gather PSS memory statistics from /proc/<pid>/smaps
        smaps_cmd = self.pssMemoryCommand % (stepPID)
        stdout, _stderr, _retcode = subprocessAlgos.runCommand(smaps_cmd)

        smaps_output = stdout.split()
        if not len(smaps_output) == 1:
            # Then something went wrong in getting the smaps data
            msg = "Error when grabbing output from smaps\n"
            msg += "output = %s\n" % smaps_output
            msg += "command = %s\n" % smaps_cmd
            logging.error(msg)
            return

        # smaps also returns data in kiloBytes, let's make it megaBytes
        # I'm also confused with these megabytes and mebibytes...
        pss = int(smaps_output[0]) // 1000

        logging.info("PSS: %s; RSS: %s; PCPU: %s; PMEM: %s",
                     smaps_output[0], ps_output[2], ps_output[3], ps_output[4])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        msg += 'Number of Cores: %s\n' % self.numOfCores

        if self.maxPSS is not None and pss >= self.maxPSS:
            msg += "Job has exceeded maxPSS: %s MB\n" % self.maxPSS
            msg += "Job has PSS: %s MB\n" % pss
            killProc = True
            reason = 'PSS'
        elif self.hardTimeout is not None and self.softTimeout is not None:
            currentTime = time.time()
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                reason = 'Wallclock time'
                msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                # Past the hard limit: escalate straight to SIGTERM below.
                killHard = True
                msg += "Job exceeded soft timeout"

        if not killProc:
            # then job is behaving well, there is nothing to do
            return

        # make sure we persist the performance error only once
        if not self.killRetry:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find existant job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existant error report in PerformanceMonitor termination.")
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError",
                                exitCode=errorCodeLookup[reason],
                                errorType="PerformanceKill",
                                errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += "Error: %s" % str(ex)
                logging.exception(msg2)

        try:
            # First (soft) attempt uses SIGUSR2; a retry or a hard timeout
            # escalates to SIGTERM.
            if not killHard and not self.killRetry:
                logging.error("Attempting to kill step using SIGUSR2")
                os.kill(stepPID, signal.SIGUSR2)
            else:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)
        except Exception:
            logging.error("Attempting to kill step using SIGTERM")
            os.kill(stepPID, signal.SIGTERM)
        finally:
            # Any later trip through here counts as a retry → hard kill.
            self.killRetry = True

        return
class PerformanceMonitor(WMRuntimeMonitor):
    """
    _PerformanceMonitor_

    Monitors the performance by pinging ps and recording data regarding
    the current step.  Kills the step (after writing a PerformanceError
    report) when it exceeds maxRSS, maxVSize or the wallclock timeouts.
    """

    def __init__(self):
        """
        Actual variable initialization in initMonitor
        """
        self.pid = None
        self.uid = os.getuid()
        # Both %i slots receive the step PID; the grep keeps only that
        # process's line from the ps output.
        self.monitorBase = "ps -p %i -o pid,ppid,rss,vsize,pcpu,pmem,cmd -ww | grep %i"
        self.monitorCommand = None
        self.currentStepSpace = None
        self.currentStepName = None
        self.rss = []
        self.vsize = []
        self.pcpu = []
        self.pmem = []
        self.maxRSS = None
        self.maxVSize = None
        self.softTimeout = None
        self.hardTimeout = None
        self.logPath = None
        self.startTime = None
        self.watchStepTypes = []
        self.disableStep = False
        WMRuntimeMonitor.__init__(self)
        return

    def initMonitor(self, task, job, logPath, args=None):
        """
        _initMonitor_

        Puts together the information needed for the monitoring
        to actually find everything.

        :param task: task object (part of the monitor API, unused here).
        :param job: job object (part of the monitor API, unused here).
        :param logPath: path of the job report updated on a kill.
        :param args: optional dict with WatchStepTypes, maxRSS, maxVSize,
            softTimeout and hardTimeout.
        """
        # None default instead of a mutable {} so successive calls cannot
        # share and mutate the same dict object.
        args = args or {}

        # Set the steps we want to watch
        self.watchStepTypes = args.get('WatchStepTypes', ['CMSSW', 'PerfTest'])

        self.maxRSS = args.get('maxRSS', None)
        self.maxVSize = args.get('maxVSize', None)
        self.softTimeout = args.get('softTimeout', None)
        self.hardTimeout = args.get('hardTimeout', None)

        self.logPath = os.path.join(logPath)
        return

    def jobStart(self, task):
        """
        _jobStart_

        Acknowledge that the job has started and initialize the time
        """
        self.startTime = time.time()
        return

    def stepStart(self, step):
        """
        _stepStart_

        Assure that the monitor is pointing at the right step
        """
        self.stepHelper = WMStepHelper(step)
        self.currentStepName = getStepName(step)
        # Step space is built lazily in periodicUpdate().
        self.currentStepSpace = None

        if self.stepHelper.stepType() not in self.watchStepTypes:
            self.disableStep = True
            logging.debug("PerformanceMonitor ignoring step of type %s",
                          self.stepHelper.stepType())
        else:
            logging.debug("Beginning PerformanceMonitor step initialization")
            self.disableStep = False
        return

    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Package the information and send it off
        """
        # NOTE(review): this early-return condition looks inverted (it
        # bails when the step WAS monitored), but it is preserved here
        # unchanged because all sibling variants share it — confirm intent.
        if not self.disableStep:
            # No information to correlate
            return

        self.currentStepName = None
        self.currentStepSpace = None
        return

    def periodicUpdate(self):
        """
        Run on the defined intervals.

        Polls ps for the step process; if RSS, VSize or wallclock time
        exceeds its configured maximum, writes a PerformanceError report
        and kills the step process.
        """
        killProc = False
        killHard = False

        if self.disableStep:
            # Then we aren't doing CPU monitoring on this step
            return
        if self.currentStepName is None:
            # We're between steps
            return
        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)
        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)
        output = stdout.split()
        if not len(output) > 7:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % output
            # Report the command actually executed; self.monitorCommand is
            # never assigned and was always None here.
            msg += "command = %s\n" % cmd
            logging.error(msg)
            return

        rss = float(output[2])
        vsize = float(output[3])
        logging.info("Retrieved following performance figures:")
        logging.info("RSS: %s; VSize: %s; PCPU: %s; PMEM: %s",
                     output[2], output[3], output[4], output[5])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        if self.maxRSS is not None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
        if self.maxVSize is not None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True

        # Let's check the running time
        currentTime = time.time()
        if self.hardTimeout is not None and self.softTimeout is not None:
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                # Past the hard limit: skip SIGUSR2 and go straight to SIGTERM.
                killHard = True
                msg += "Job exceeded soft timeout"

        if killProc:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find existant job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existant error report in PerformanceMonitor termination.")
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError",
                                exitCode=99900,
                                errorType="PerformanceKill",
                                errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                # Soft kill with SIGUSR2 unless the hard timeout tripped;
                # SIGTERM as the fallback if delivery fails.
                if not killHard:
                    logging.error("Attempting to kill step using SIGUSR2")
                    os.kill(stepPID, signal.SIGUSR2)
                else:
                    logging.error("Attempting to kill step using SIGTERM")
                    os.kill(stepPID, signal.SIGTERM)
            except Exception:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)

        return
class PerformanceMonitor(WMRuntimeMonitor):
    """
    _PerformanceMonitor_

    Monitors the performance by pinging ps and recording data regarding the current step
    """

    def __init__(self):
        """
        Actual variable initialization in initMonitor
        """
        self.pid = None
        self.uid = os.getuid()
        # ps poll: both %i slots get the step PID; grep keeps that PID's line.
        self.monitorBase = "ps -p %i -o pid,ppid,rss,vsize,pcpu,pmem,cmd -ww | grep %i"
        self.monitorCommand = None
        self.currentStepSpace = None
        self.currentStepName = None
        self.rss = []
        self.vsize = []
        self.pcpu = []
        self.pmem = []
        self.maxRSS = None
        self.maxVSize = None
        self.softTimeout = None
        self.hardTimeout = None
        self.logPath = None
        self.startTime = None
        self.watchStepTypes = []
        self.disableStep = False
        WMRuntimeMonitor.__init__(self)
        return

    def initMonitor(self, task, job, logPath, args=None):
        """
        _initMonitor_

        Puts together the information needed for the monitoring
        to actually find everything.

        :param task: task object (part of the monitor API, unused here).
        :param job: job object (part of the monitor API, unused here).
        :param logPath: path of the job report updated on a kill.
        :param args: optional dict with WatchStepTypes, maxRSS, maxVSize,
            softTimeout, hardTimeout and cores.
        """
        args = args or {}

        # Set the steps we want to watch
        self.watchStepTypes = args.get('WatchStepTypes', ['CMSSW', 'PerfTest'])

        self.maxRSS = args.get('maxRSS', None)
        self.maxVSize = args.get('maxVSize', None)
        self.softTimeout = args.get('softTimeout', None)
        self.hardTimeout = args.get('hardTimeout', None)
        self.numOfCores = args.get('cores', None)

        self.logPath = os.path.join(logPath)
        return

    def jobStart(self, task):
        """
        _jobStart_

        Acknowledge that the job has started and initialize the time
        """
        self.startTime = time.time()
        return

    def stepStart(self, step):
        """
        _stepStart_

        Assure that the monitor is pointing at the right step
        """
        self.stepHelper = WMStepHelper(step)
        self.currentStepName = getStepName(step)
        # Step space is built lazily in periodicUpdate().
        self.currentStepSpace = None

        if not self.stepHelper.stepType() in self.watchStepTypes:
            self.disableStep = True
            logging.debug("PerformanceMonitor ignoring step of type %s",
                          self.stepHelper.stepType())
            return
        else:
            logging.debug("Beginning PeformanceMonitor step Initialization")
            self.disableStep = False
        return

    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Package the information and send it off
        """
        # NOTE(review): this early-return looks inverted (it bails when the
        # step WAS monitored); preserved as-is — confirm intent upstream.
        if not self.disableStep:
            # No information to correlate
            return

        self.currentStepName = None
        self.currentStepSpace = None
        return

    def periodicUpdate(self):
        """
        Run on the defined intervals.

        Polls ps for the step process; kills the step (writing a
        PerformanceError report) when it exceeds maxRSS, maxVSize or the
        wallclock timeouts.
        """
        killProc = False
        killHard = False
        reason = ''
        # Exit codes recorded in the PerformanceError report, keyed by the
        # reason that triggered the kill ('' maps to the unknown code).
        errorCodeLookup = {'RSS': 50660,
                           'VSZ': 50661,
                           'Wallclock time': 50664,
                           '': 99999}

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)
        output = stdout.split()
        if not len(output) > 7:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % output
            # NOTE(review): self.monitorCommand is never assigned (always
            # None); the actually-run command is `cmd` — confirm and fix.
            msg += "command = %s\n" % self.monitorCommand
            logging.error(msg)
            return

        # FIXME: making it backwards compatible. Keep only the "else" block in HG1801
        if self.maxRSS is not None and self.maxRSS >= (1024 * 1024):
            # then workload value is still in KiB (old way)
            rss = int(output[2])
            vsize = int(output[3])
        else:
            rss = int(output[2]) // 1024  # convert it to MiB
            vsize = int(output[3]) // 1024  # convert it to MiB

        logging.info("Retrieved following performance figures:")
        logging.info("RSS: %s; VSize: %s; PCPU: %s; PMEM: %s",
                     output[2], output[3], output[4], output[5])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        msg += 'Number of Cores: %s\n' % self.numOfCores

        if self.maxRSS is not None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
            reason = 'RSS'
        elif self.maxVSize is not None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True
            reason = 'VSZ'
        elif self.hardTimeout is not None and self.softTimeout is not None:
            currentTime = time.time()
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                reason = 'Wallclock time'
                msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                # Past the hard limit: escalate straight to SIGTERM below.
                killHard = True
                msg += "Job exceeded soft timeout"

        if killProc:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find existant job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existant error report in PerformanceMonitor termination.")
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError",
                                exitCode=errorCodeLookup[reason],
                                errorType="PerformanceKill",
                                errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                # Soft kill with SIGUSR2 unless the hard timeout tripped;
                # SIGTERM as the fallback if delivery fails.
                if not killHard:
                    logging.error("Attempting to kill step using SIGUSR2")
                    os.kill(stepPID, signal.SIGUSR2)
                else:
                    logging.error("Attempting to kill step using SIGTERM")
                    os.kill(stepPID, signal.SIGTERM)
            except Exception:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)

        return
def __call__(self, task, wmbsJob):
    """
    _operator(task)_

    Load and run executors for all steps in Task, if an emulator is
    available for that step, use it instead.
    """
    # currentThread must be *called*: we need the current thread object
    # (which carries the watchdogMonitor attribute), not the function.
    myThread = threading.currentThread()

    try:
        myThread.watchdogMonitor.setupMonitors(task, wmbsJob)
        myThread.watchdogMonitor.notifyJobStart(task)
    except WMException:
        self.toTaskDirectory()
        raise
    except Exception as ex:
        msg = "Encountered unhandled exception while starting monitors:\n"
        msg += str(ex) + '\n'
        msg += str(traceback.format_exc()) + '\n'
        logging.error(msg)
        self.toTaskDirectory()
        raise WMExecutionFailure(msg)

    skipToStep = None

    for step in task.steps().nodeIterator():
        try:
            helper = WMStepHelper(step)
            stepType = helper.stepType()
            stepName = helper.name()
            if skipToStep and skipToStep != stepName:
                # Then we continue until we get to the required step
                continue
            skipToStep = None  # Reset this when we get to the right step
            executor = StepFactory.getStepExecutor(stepType)
            result = self.doExecution(executor, step, wmbsJob)
            if result is not None:
                # Executor requested a jump to a named step.
                skipToStep = result
        except WMException as ex:
            msg = "Encountered error while running ExecuteMaster:\n"
            msg += str(ex) + "\n"
            logging.error(msg)
            self.toTaskDirectory()
            break
        except Exception as ex:
            msg = "Encountered error while running ExecuteMaster:\n"
            msg += str(ex) + "\n"
            msg += str(traceback.format_exc()) + "\n"
            self.toTaskDirectory()
            logging.error(msg)
            break

    try:
        myThread.watchdogMonitor.notifyJobEnd(task)
    except WMException:
        self.toTaskDirectory()
    except Exception as ex:
        msg = "Encountered unhandled exception while ending the job:\n"
        msg += str(ex) + '\n'
        msg += str(traceback.format_exc()) + '\n'
        logging.error(msg)
        self.toTaskDirectory()

    return
def __call__(self, task, wmbsJob):
    """
    _operator(task)_

    Load and run executors for all steps in Task, if an emulator is
    available for that step, use it instead.
    """
    # currentThread must be *called*: we need the current thread object
    # (which carries the watchdogMonitor attribute), not the function.
    myThread = threading.currentThread()

    try:
        myThread.watchdogMonitor.setupMonitors(task, wmbsJob)
        myThread.watchdogMonitor.notifyJobStart(task)
    except WMException:
        self.toTaskDirectory()
        raise
    except Exception as ex:
        msg = "Encountered unhandled exception while starting monitors:\n"
        msg += str(ex) + '\n'
        msg += str(traceback.format_exc()) + '\n'
        logging.error(msg)
        self.toTaskDirectory()
        raise WMExecutionFailure(msg)

    failureUpstream = False

    for step in task.steps().nodeIterator():
        try:
            helper = WMStepHelper(step)
            stepType = helper.stepType()
            stepName = helper.name()
            if failureUpstream:
                # for chained steps, don't execute further steps if a
                # failure has already happened
                helper.addOverride("previousCmsRunFailure", True)
            executor = StepFactory.getStepExecutor(stepType)
            result = self.doExecution(executor, step, wmbsJob)
            logging.info("StepName: %s, StepType: %s, with result: %r",
                         stepName, stepType, result)
            if result:
                # can be either None, or the step exit code
                failureUpstream = True
        except WMException as ex:
            msg = "Encountered error while running ExecuteMaster:\n"
            msg += str(ex) + "\n"
            logging.error(msg)
            self.toTaskDirectory()
            break
        except Exception as ex:
            msg = "Encountered error while running ExecuteMaster:\n"
            msg += str(ex) + "\n"
            msg += str(traceback.format_exc()) + "\n"
            self.toTaskDirectory()
            logging.error(msg)
            break

    try:
        myThread.watchdogMonitor.notifyJobEnd(task)
    except WMException:
        self.toTaskDirectory()
    except Exception as ex:
        msg = "Encountered unhandled exception while ending the job:\n"
        msg += str(ex) + '\n'
        msg += str(traceback.format_exc()) + '\n'
        logging.error(msg)
        self.toTaskDirectory()

    return
def __call__(self, task, wmbsJob):
    """
    _operator(task)_

    Load and run executors for all steps in Task, if an emulator is
    available for that step, use it instead.
    """
    # currentThread must be *called*: we need the current thread object
    # (which carries the watchdogMonitor attribute), not the function.
    myThread = threading.currentThread()

    try:
        myThread.watchdogMonitor.setupMonitors(task, wmbsJob)
        myThread.watchdogMonitor.notifyJobStart(task)
    except WMException:
        self.toTaskDirectory()
        raise
    except Exception as ex:
        msg = "Encountered unhandled exception while starting monitors:\n"
        msg += str(ex) + '\n'
        msg += str(traceback.format_exc()) + '\n'
        logging.error(msg)
        self.toTaskDirectory()
        raise WMExecutionFailure(msg)

    skipToStep = None

    for step in task.steps().nodeIterator():
        try:
            helper = WMStepHelper(step)
            stepType = helper.stepType()
            stepName = helper.name()
            if skipToStep and skipToStep != stepName:
                # Then we continue until we get to the required step
                continue
            skipToStep = None  # Reset this when we get to the right step
            executor = StepFactory.getStepExecutor(stepType)
            result = self.doExecution(executor, step, wmbsJob)
            # Identity comparison with None, not `not result == None`.
            if result is not None:
                skipToStep = result
        except WMException as ex:
            msg = "Encountered error while running ExecuteMaster:\n"
            msg += str(ex) + "\n"
            logging.error(msg)
            self.toTaskDirectory()
            break
        except Exception as ex:
            msg = "Encountered error while running ExecuteMaster:\n"
            msg += str(ex) + "\n"
            msg += str(traceback.format_exc()) + "\n"
            self.toTaskDirectory()
            logging.error(msg)
            break

    try:
        myThread.watchdogMonitor.notifyJobEnd(task)
    except WMException:
        self.toTaskDirectory()
    except Exception as ex:
        msg = "Encountered unhandled exception while ending the job:\n"
        msg += str(ex) + '\n'
        msg += str(traceback.format_exc()) + '\n'
        logging.error(msg)
        self.toTaskDirectory()

    return