def __call__(self, errCode, executor, **args):
    """
    Record a generic CMSSW failure in the executor's job report.

    The error message is assembled from the exit code, the optional
    ``ExceptionInstance`` keyword argument and the tails of the step's
    stderr, stdout and SCRAM logs located in the job report's directory.

    :param errCode: exit code of the failed step; written to the step status
        (only when the job report XML exists) and used as the error code
    :param executor: step executor; ``step.builder.workingDir``,
        ``step.output.jobReport``, ``stepName`` and ``report`` are read
    :param args: keyword arguments; only ``ExceptionInstance`` is consulted
    :return: None
    """
    logging.critical("%s Diagnostic Handler invoked", self.__class__.__name__)
    msg = "Error in CMSSW: %s\n" % (errCode)
    jobRepXml = os.path.join(executor.step.builder.workingDir,
                             executor.step.output.jobReport)

    excepInst = args.get('ExceptionInstance', None)

    description = "Misc. CMSSW error"
    if excepInst:
        # Prefer the exception's own detail text as the error description
        if hasattr(excepInst, 'detail'):
            description = excepInst.detail
        msg += str(excepInst)

    if os.path.exists(jobRepXml):
        # job report XML exists, load the exception information from it
        try:
            executor.report.parse(jobRepXml)
        except FwkJobReportException:
            # Job report is bad, the parse already puts a 50115 in the file
            pass
        reportStep = executor.report.retrieveStep(executor.stepName)
        reportStep.status = errCode

    # Grab stderr and stdout logs from CMSSW
    logDir = os.path.dirname(jobRepXml)
    errLog = os.path.join(logDir, '%s-stderr.log' % (executor.stepName))
    outLog = os.path.join(logDir, '%s-stdout.log' % (executor.stepName))

    if os.path.exists(errLog):
        logTail = FileTools.tail(errLog, DEFAULT_TAIL_LINES_FROM_LOG)
        msg += '\n Adding last %s lines of CMSSW stderr:\n' % DEFAULT_TAIL_LINES_FROM_LOG
        msg += logTail
    if os.path.exists(outLog):
        logTail = FileTools.tail(outLog, DEFAULT_TAIL_LINES_FROM_LOG)
        msg += '\n Adding last %s lines of CMSSW stdout:\n' % DEFAULT_TAIL_LINES_FROM_LOG
        msg += logTail

    # If it exists, grab the SCRAM log. The header is built from the same
    # number that is passed to tail() so the two can never disagree.
    scramTailLines = 25
    scramLog = os.path.join(logDir, 'scramOutput.log')
    if os.path.exists(scramLog):
        logTail = FileTools.tail(scramLog, scramTailLines)
        msg += '\n Adding last %s lines of SCRAM error log:\n' % scramTailLines
        msg += logTail

    # make sure the report has the error in it
    executor.report.addError(executor.stepName, errCode, description, msg)

    return
def __call__(self, errCode, executor, **args):
    """
    Diagnostic for a SCRAM script failure.

    Adds a 50513 "SCRAMScriptFailure" error to the executor's report,
    including the tail of ``scramOutput.log`` when that file exists in the
    job report's directory, and then flags the overall report as failed if
    its status is still 0.

    :param errCode: exit code passed in by the caller (not used here)
    :param executor: step executor providing ``step``, ``stepName`` and ``report``
    :param args: keyword arguments; only ``ExceptionInstance`` is consulted
    """
    text = "SCRAM scripts failed to run!\n"
    exception = args.get('ExceptionInstance', False)
    if exception:
        text += str(exception)

    reportPath = os.path.join(executor.step.builder.workingDir,
                              executor.step.output.jobReport)
    scramLog = os.path.join(os.path.dirname(reportPath), 'scramOutput.log')

    if os.path.exists(scramLog):
        tailText = FileTools.tail(scramLog, DEFAULT_TAIL_LINES_FROM_LOG)
        header = '\n Adding last %s lines of SCRAM error log:\n' % DEFAULT_TAIL_LINES_FROM_LOG
        text = text + header + tailText

    executor.report.addError(executor.stepName, 50513, "SCRAMScriptFailure", text)

    # SCRAM never ran, so a report that still claims success must be failed
    if executor.report.report.status == 0:
        executor.report.report.status = 1
def __call__(self, errCode, executor, **args):
    """
    Handle a cmsRun exception exit using this handler's code and description.

    Parses the job report XML when present (via ``self.parse``), sets the
    step status to ``self.code``, appends the tails of the step's stderr and
    stdout logs to the message and adds the error to the report unless an
    entry named ``self.desc`` is already there.

    :param errCode: exit code passed in by the caller (not used here;
        ``self.code`` is reported instead)
    :param executor: step executor providing ``step``, ``stepName`` and ``report``
    :param args: extra keyword arguments (ignored)
    :return: None
    """
    print("%s Diagnostic Handler invoked" % self.__class__.__name__)
    text = "Exit %s: %s Exception from cmsRun" % (self.code, self.desc)
    xmlPath = os.path.join(executor.step.builder.workingDir,
                           executor.step.output.jobReport)

    if os.path.exists(xmlPath):
        # The XML report is there: pull the exception information out of it
        try:
            self.parse(executor, xmlPath)
        except FwkJobReportException:
            # A broken report has already been tagged with 50115 by the parse
            pass
        executor.report.retrieveStep(executor.stepName).status = self.code

    logDir = os.path.dirname(xmlPath)
    logSources = (
        (os.path.join(logDir, '%s-stderr.log' % (executor.stepName)),
         '\n Adding last %s lines of CMSSW stderr:\n'),
        (os.path.join(logDir, '%s-stdout.log' % (executor.stepName)),
         '\n Adding last %s lines of CMSSW stdout:\n'),
    )
    for logPath, header in logSources:
        if not os.path.exists(logPath):
            continue
        tailText = FileTools.tail(logPath, DEFAULT_TAIL_LINES_FROM_LOG)
        text += header % DEFAULT_TAIL_LINES_FROM_LOG
        text += tailText

    # Only add the error when the report does not already carry it
    knownErrors = getattr(executor.report.report, "errors", None)
    if knownErrors is None or not hasattr(knownErrors, self.desc):
        executor.report.addError(executor.stepName, self.code, self.desc, text)

    print(executor.report.report.errors)
    return
def __call__(self, errCode, executor, **args):
    """
    Handle a cmsRun exception exit using this handler's code and description.

    Parses the job report XML when present, sets the step status to
    ``self.code``, appends the tails of the step's stderr and stdout logs to
    the message and adds the error to the report unless an entry named
    ``self.desc`` is already present in its ``errors`` section.

    :param errCode: exit code passed in by the caller (not used here;
        ``self.code`` is reported instead)
    :param executor: step executor providing ``step``, ``stepName`` and ``report``
    :param args: extra keyword arguments (ignored)
    :return: None
    """
    print("%s Diagnostic Handler invoked" % self.__class__.__name__)
    msg = "Exit %s: %s Exception from cmsRun" % (self.code, self.desc)
    jobRepXml = os.path.join(executor.step.builder.workingDir,
                             executor.step.output.jobReport)

    if os.path.exists(jobRepXml):
        # job report XML exists, load the exception information from it
        try:
            executor.report.parse(jobRepXml)
        except FwkJobReportException:
            # Job report is bad, the parse already puts a 50115 in the file
            pass
        reportStep = executor.report.retrieveStep(executor.stepName)
        reportStep.status = self.code

    logDir = os.path.dirname(jobRepXml)
    errLog = os.path.join(logDir, '%s-stderr.log' % (executor.stepName))
    outLog = os.path.join(logDir, '%s-stdout.log' % (executor.stepName))

    if os.path.exists(errLog):
        logTail = FileTools.tail(errLog, DEFAULT_TAIL_LINES_FROM_LOG)
        msg += '\n Adding last %s lines of CMSSW stderr:\n' % DEFAULT_TAIL_LINES_FROM_LOG
        msg += logTail
    if os.path.exists(outLog):
        logTail = FileTools.tail(outLog, DEFAULT_TAIL_LINES_FROM_LOG)
        msg += '\n Adding last %s lines of CMSSW stdout:\n' % DEFAULT_TAIL_LINES_FROM_LOG
        msg += logTail

    # make sure the report has the error in it
    errSection = getattr(executor.report.report, "errors", None)
    # Identity test: "== None" would dispatch to whatever __eq__ the errors
    # section object defines, which is not what is being asked here.
    if errSection is None:
        executor.report.addError(executor.stepName, self.code, self.desc, msg)
    elif not hasattr(errSection, self.desc):
        executor.report.addError(executor.stepName, self.code, self.desc, msg)

    print(executor.report.report.errors)
    return
def test_tail(self):
    """
    _tail_

    Can we tail a file?

    Writes sixteen one-letter lines to a scratch file and checks that
    FileTools.tail returns the last 10 and the last 2 lines verbatim.
    """
    a = "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"
    tmpName = 'tmpfile.tmp'
    # The context manager closes the handle even if the write fails
    with open(tmpName, 'w') as f:
        f.write(a)
    try:
        self.assertEqual(FileTools.tail(tmpName, 10),
                         "g\nh\ni\nj\nk\nl\nm\nn\no\np\n")
        self.assertEqual(FileTools.tail(tmpName, 2), "o\np\n")
    finally:
        # Clean up even when an assertion fails, so a failing run does not
        # leave the scratch file behind in the working directory
        os.remove(tmpName)
    return
def __call__(self, errCode, executor, **args):
    """
    _operator()_

    Record a CMSSW step failure, then look for the XML job report, try to
    read it and make sure the report ends up carrying an error entry.

    Errors added to ``executor.report``:
      * ``errCode`` / "CMSSWStepFailure": always, with the stdout tail (when
        the stdout log exists) followed by ``str`` of ``ExceptionInstance``
      * 50115 / "MissingJobReport": the job report XML does not exist
      * 50116 / "MissingErrorReport": the report has no ``errors`` section,
        or its status is 0
      * 99999 / "ErrorLoggingAddition": otherwise

    :param errCode: non-zero exit code of the step
    :param executor: step executor providing ``step``, ``stepName`` and ``report``
    :param args: keyword arguments; only ``ExceptionInstance`` is consulted
    :return: None
    """
    jobRepXml = os.path.join(executor.step.builder.workingDir,
                             executor.step.output.jobReport)
    logDir = os.path.dirname(jobRepXml)

    errLog = os.path.join(logDir, '%s-stderr.log' % (executor.stepName))
    outLog = os.path.join(logDir, '%s-stdout.log' % (executor.stepName))

    addOn = '\n'
    if os.path.exists(errLog):
        logTail = FileTools.tail(errLog, 10)
        addOn += '\nAdding last ten lines of CMSSW stderr:\n'
        addOn += logTail
    else:
        logging.error("No stderr from CMSSW")
        # List the directory that should hold the logs. This must be the
        # dirname of the report path: the basename is only the report's file
        # name, and listing it raises. Listing is best-effort diagnostics, so
        # a missing directory must not abort the error reporting below.
        try:
            logging.error("Contents of %s: %s", logDir, os.listdir(logDir))
        except OSError as exc:
            logging.error("Could not list directory '%s': %s", logDir, exc)

    # Start from an empty message so it is defined even without a stdout log
    msg = ''
    if os.path.exists(outLog):
        logTail = FileTools.tail(outLog, DEFAULT_TAIL_LINES_FROM_LOG)
        msg = '\n Adding last %s lines of CMSSW stdout:\n' % DEFAULT_TAIL_LINES_FROM_LOG
        msg += logTail

    # Add the error we were sent
    ex = args.get('ExceptionInstance', None)
    executor.report.addError(executor.stepName, errCode, "CMSSWStepFailure", msg + str(ex))

    if not os.path.exists(jobRepXml):
        # no report => Error
        msg = "No Job Report Found: %s" % jobRepXml
        executor.report.addError(executor.stepName, 50115, "MissingJobReport", msg)
        return

    # job report XML exists, load the exception information from it
    try:
        executor.report.parse(jobRepXml)
    except FwkJobReportException:
        # Job report is bad, the parse already puts a 50115 in the file
        # just go on
        pass

    # make sure the report has the error in it
    errSection = getattr(executor.report.report, "errors", None)
    if errSection is None or executor.report.report.status == 0:
        # Either there is no errors section at all, or the report claims
        # success even though cmsRun exited non-zero
        msg = "Job Report contains no error report, but cmsRun exited non-zero: %s" % errCode
        msg += addOn
        executor.report.addError(executor.stepName, 50116, "MissingErrorReport", msg)
    else:
        msg = "Adding extra error in order to hold error report"
        msg += addOn
        executor.report.addError(executor.stepName, 99999, "ErrorLoggingAddition", msg)
    return
def complete(self, jobs):
    """
    Do any completion work required

    In this case, look for a returned logfile: for every job whose
    ``Report.<retry_count>.pkl`` in its ``cache_dir`` is missing or empty, a
    failure report is built from the tails of the condor err/out/log files
    and saved under that name.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source; confirm the placement of the ``siteName``
    handling and of the trailing ``logOutput += '\\n\\n'`` against the
    original file.

    :param jobs: iterable of dict-like jobs; the keys ``cache_dir``,
        ``retry_count``, ``id`` and ``gridid`` are read
    :return: None
    """
    for job in jobs:
        if job.get('cache_dir', None) is None or job.get('retry_count', None) is None:
            # Then we can't do anything
            logging.error("Can't find this job's cache_dir or retry count: %s", job)
            continue

        reportName = os.path.join(job['cache_dir'], 'Report.%i.pkl' % job['retry_count'])
        if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
            # everything in order, move on
            continue
        elif os.path.isdir(reportName):
            # Then something weird has happened. Report error, do nothing
            logging.error("The job report for job with id %s and gridid %s is a directory", job['id'], job['gridid'])
            logging.error("Ignoring this, but this is very strange")
        else:
            logging.error("No job report for job with id %s and gridid %s", job['id'], job['gridid'])

            # Reaching here with an existing file means it is empty
            # (size 0); drop it so a fresh report can be saved in its place
            if os.path.isfile(reportName):
                os.remove(reportName)

            # create a report from scratch
            condorReport = Report()
            logOutput = 'Could not find jobReport\n'

            if os.path.isdir(job['cache_dir']):
                condorErr = "condor.%s.err" % job['gridid']
                condorOut = "condor.%s.out" % job['gridid']
                condorLog = "condor.%s.log" % job['gridid']
                # Defaults used unless the condor log supplies a "Reason"
                exitCode = 99303
                exitType = "NoJobReport"
                for condorFile in [condorErr, condorOut, condorLog]:
                    condorFilePath = os.path.join(job['cache_dir'], condorFile)
                    # A section header is written for each file, present or not
                    logOutput += "\n========== %s ==========\n" % condorFile
                    if os.path.isfile(condorFilePath):
                        logTail = FileTools.tail(condorFilePath, 50)
                        logOutput += 'Adding end of %s to error message:\n\n' % condorFile
                        logOutput += logTail
                        logOutput += '\n\n'

                        if condorFile == condorLog:
                            # for condor log, search for the information
                            for matchObj in getIterMatchObjectOnRegexp(condorFilePath, CONDOR_LOG_FILTER_REGEXP):
                                condorReason = matchObj.group("Reason")
                                if condorReason:
                                    logOutput += condorReason
                                    # These two reason texts select 99400;
                                    # any other non-empty reason gives 99401
                                    if "SYSTEM_PERIODIC_REMOVE" in condorReason or "via condor_rm" in condorReason:
                                        exitCode = 99400
                                        exitType = "RemovedByGLIDEIN"
                                    else:
                                        exitCode = 99401

                                # Each match overwrites the site name, so the
                                # last match in the log wins
                                siteName = matchObj.group("Site")
                                if siteName:
                                    condorReport.data.siteName = siteName
                                else:
                                    condorReport.data.siteName = "NoReportedSite"
                        else:
                            # condor err/out files: append whatever the
                            # 'WMException' and 'ERROR' groups captured
                            for matchObj in getIterMatchObjectOnRegexp(condorFilePath, WMEXCEPTION_REGEXP):
                                errMsg = matchObj.group('WMException')
                                if errMsg:
                                    logOutput += "\n\n%s\n" % errMsg

                                errMsg = matchObj.group('ERROR')
                                if errMsg:
                                    logOutput += "\n\n%s\n" % errMsg

                        logOutput += '\n\n'

                condorReport.addError(exitType, exitCode, exitType, logOutput)
            else:
                msg = "Serious Error in Completing condor job with id %s!\n" % job['id']
                msg += "Could not find jobCache directory %s\n" % job['cache_dir']
                msg += "Creating a new cache_dir for failed job report\n"
                logging.error(msg)
                # Recreate the directory so the save() below has somewhere to write
                os.makedirs(job['cache_dir'])
                condorReport.addError("NoJobReport", 99304, "NoCacheDir", logOutput)

            condorReport.save(filename=reportName)

            logging.debug("Created failed job report for job with id %s and gridid %s", job['id'], job['gridid'])

    return