def test_tail(self):
    """
    _tail_

    Can we tail a file?

    Writes a 16-line fixture file, checks that BasicAlgos.tail returns
    the expected trailing lines, and always removes the fixture.
    """
    content = "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"
    # Use a context manager so the handle is closed even if the write fails.
    with open('tmpfile.tmp', 'w') as f:
        f.write(content)
    try:
        self.assertEqual(BasicAlgos.tail('tmpfile.tmp', 10),
                         ['g\n', 'h\n', 'i\n', 'j\n', 'k\n',
                          'l\n', 'm\n', 'n\n', 'o\n', 'p\n'])
        self.assertEqual(BasicAlgos.tail('tmpfile.tmp', 2),
                         ['o\n', 'p\n'])
    finally:
        # Clean up the fixture even when an assertion above fails,
        # so the file does not leak into later tests.
        os.remove('tmpfile.tmp')
    return
def __call__(self, errCode, executor, **args):
    """
    Build a diagnostic error report for a generic CMSSW failure.

    Parses the framework job report if present, appends the tails of the
    CMSSW stderr/stdout and SCRAM logs to the error message, and records
    the error on the step report.

    :param errCode: exit/error code reported for the cmsRun step
    :param executor: step executor holding the report and step config
    :param args: optional keywords; 'ExceptionInstance' carries the
        original exception, if any
    """
    logging.critical("%s Diagnostic Handler invoked", self.__class__.__name__)
    msg = "Error in CMSSW: %s\n" % (errCode)
    jobRepXml = os.path.join(executor.step.builder.workingDir,
                             executor.step.output.jobReport)

    excepInst = args.get('ExceptionInstance', None)

    description = "Misc. CMSSW error"
    if excepInst:
        if hasattr(excepInst, 'detail'):
            description = excepInst.detail
        msg += str(excepInst)

    if os.path.exists(jobRepXml):
        # job report XML exists, load the exception information from it
        try:
            executor.report.parse(jobRepXml)
        except FwkJobReportException:
            # Job report is bad, the parse already puts a 50115 in the file
            pass
        reportStep = executor.report.retrieveStep(executor.step._internal_name)
        reportStep.status = errCode

    # Grab stderr log from CMSSW
    errLog = os.path.join(os.path.dirname(jobRepXml),
                          '%s-stderr.log' % (executor.step._internal_name))
    outLog = os.path.join(os.path.dirname(jobRepXml),
                          '%s-stdout.log' % (executor.step._internal_name))

    if os.path.exists(errLog):
        logTail = BasicAlgos.tail(errLog, DEFAULT_TAIL_LINES_FROM_LOG)
        msg += '\n Adding last %s lines of CMSSW stderr:\n' % DEFAULT_TAIL_LINES_FROM_LOG
        msg += logTail
    if os.path.exists(outLog):
        logTail = BasicAlgos.tail(outLog, DEFAULT_TAIL_LINES_FROM_LOG)
        msg += '\n Adding last %s lines of CMSSW stdout:\n' % DEFAULT_TAIL_LINES_FROM_LOG
        msg += logTail

    # If it exists, grab the SCRAM log
    errLog = os.path.join(os.path.dirname(jobRepXml), 'scramOutput.log')
    if os.path.exists(errLog):
        logTail = BasicAlgos.tail(errLog, 25)
        # BUGFIX: 25 lines are tailed above; the message used to say "ten".
        msg += '\n Adding last 25 lines of SCRAM error log:\n'
        msg += logTail

    # make sure the report has the error in it
    # (dropped the unused `dummy = getattr(...)` no-op)
    executor.report.addError(executor.step._internal_name,
                             errCode, description, msg)
    return
def __call__(self, errCode, executor, **args):
    """
    Build a diagnostic error report for a generic CMSSW failure.

    Parses the framework job report if present, appends the tails of the
    CMSSW stderr/stdout and SCRAM logs to the error message, and records
    the error on the step report.

    :param errCode: exit/error code reported for the cmsRun step
    :param executor: step executor holding the report and step config
    :param args: optional keywords; 'ExceptionInstance' carries the
        original exception, if any
    """
    logging.critical("%s Diagnostic Handler invoked", self.__class__.__name__)
    msg = "Error in CMSSW: %s\n" % (errCode)
    jobRepXml = os.path.join(executor.step.builder.workingDir,
                             executor.step.output.jobReport)

    excepInst = args.get('ExceptionInstance', None)

    description = "Misc. CMSSW error"
    if excepInst:
        if hasattr(excepInst, 'detail'):
            description = excepInst.detail
        msg += str(excepInst)

    if os.path.exists(jobRepXml):
        # job report XML exists, load the exception information from it
        try:
            executor.report.parse(jobRepXml)
        except FwkJobReportException:
            # Job report is bad, the parse already puts a 50115 in the file
            pass
        reportStep = executor.report.retrieveStep(executor.step._internal_name)
        reportStep.status = errCode

    # Grab stderr log from CMSSW
    errLog = os.path.join(os.path.dirname(jobRepXml),
                          '%s-stderr.log' % (executor.step._internal_name))
    outLog = os.path.join(os.path.dirname(jobRepXml),
                          '%s-stdout.log' % (executor.step._internal_name))

    if os.path.exists(errLog):
        logTail = BasicAlgos.tail(errLog, DEFAULT_TAIL_LINES_FROM_LOG)
        msg += '\n Adding last %s lines of CMSSW stderr:\n' % DEFAULT_TAIL_LINES_FROM_LOG
        msg += logTail
    if os.path.exists(outLog):
        logTail = BasicAlgos.tail(outLog, DEFAULT_TAIL_LINES_FROM_LOG)
        msg += '\n Adding last %s lines of CMSSW stdout:\n' % DEFAULT_TAIL_LINES_FROM_LOG
        msg += logTail

    # If it exists, grab the SCRAM log
    errLog = os.path.join(os.path.dirname(jobRepXml), 'scramOutput.log')
    if os.path.exists(errLog):
        logTail = BasicAlgos.tail(errLog, 25)
        # BUGFIX: 25 lines are tailed above; the message used to say "ten".
        msg += '\n Adding last 25 lines of SCRAM error log:\n'
        msg += logTail

    # make sure the report has the error in it
    # (dropped the unused `dummy = getattr(...)` no-op)
    executor.report.addError(executor.step._internal_name,
                             errCode, description, msg)
    return
def __call__(self, errCode, executor, **args):
    """
    Build a diagnostic error report for a generic CMSSW failure.

    Parses the framework job report if present, appends the tails of the
    CMSSW stderr/stdout and SCRAM logs to the error message, and records
    the error on the step report.

    :param errCode: exit/error code reported for the cmsRun step
    :param executor: step executor holding the report and step config
    :param args: optional keywords; 'ExceptionInstance' carries the
        original exception, if any
    """
    # Use lazy %-args so the logging module formats only when emitted.
    logging.critical("%s Diagnostic Handler invoked", self.__class__.__name__)
    msg = "Error in CMSSW: %s\n" % (errCode)
    jobRepXml = os.path.join(executor.step.builder.workingDir,
                             executor.step.output.jobReport)

    excepInst = args.get('ExceptionInstance', None)

    description = "Misc. CMSSW error"
    if excepInst:
        if hasattr(excepInst, 'detail'):
            description = excepInst.detail
        msg += str(excepInst)

    if os.path.exists(jobRepXml):
        # job report XML exists, load the exception information from it
        executor.report.parse(jobRepXml)
        reportStep = executor.report.retrieveStep(executor.step._internal_name)
        reportStep.status = errCode

    # Grab stderr log from CMSSW
    errLog = os.path.join(os.path.dirname(jobRepXml),
                          '%s-stderr.log' % (executor.step._internal_name))
    outLog = os.path.join(os.path.dirname(jobRepXml),
                          '%s-stdout.log' % (executor.step._internal_name))

    if os.path.exists(errLog):
        logTail = BasicAlgos.tail(errLog, 10)
        msg += '\n Adding last ten lines of CMSSW stderr:\n'
        msg += "".join(logTail)
    if os.path.exists(outLog):
        # BUGFIX: tail the stdout log here, not the stderr log again.
        logTail = BasicAlgos.tail(outLog, 10)
        msg += '\n Adding last ten lines of CMSSW stdout:\n'
        msg += "".join(logTail)

    # If it exists, grab the SCRAM log
    errLog = os.path.join(os.path.dirname(jobRepXml), 'scramOutput.log')
    if os.path.exists(errLog):
        logTail = BasicAlgos.tail(errLog, 25)
        # BUGFIX: 25 lines are tailed above; the message used to say "ten".
        msg += '\n Adding last 25 lines of SCRAM error log:\n'
        msg += "".join(logTail)

    # make sure the report has the error in it
    # (dropped the unused `errSection = getattr(...)` local)
    executor.report.addError(executor.step._internal_name,
                             errCode, description, msg)
    return
def __call__(self, errCode, executor, **args):
    """
    Build a diagnostic error report for a generic CMSSW failure.

    Parses the framework job report if present, appends the tails of the
    CMSSW stderr/stdout and SCRAM logs to the error message, and records
    the error on the step report.

    :param errCode: exit/error code reported for the cmsRun step
    :param executor: step executor holding the report and step config
    :param args: optional keywords; 'ExceptionInstance' carries the
        original exception, if any
    """
    # Consistency fix: sibling handlers use logging, not a bare print.
    logging.critical("%s Diagnostic Handler invoked", self.__class__.__name__)
    msg = "Error in CMSSW: %s\n" % (errCode)
    jobRepXml = os.path.join(executor.step.builder.workingDir,
                             executor.step.output.jobReport)

    excepInst = args.get('ExceptionInstance', None)

    description = "Misc. CMSSW error"
    if excepInst:
        if hasattr(excepInst, 'detail'):
            description = excepInst.detail
        msg += str(excepInst)

    if os.path.exists(jobRepXml):
        # job report XML exists, load the exception information from it
        executor.report.parse(jobRepXml)
        reportStep = executor.report.retrieveStep(executor.step._internal_name)
        reportStep.status = errCode

    # Grab stderr log from CMSSW
    errLog = os.path.join(os.path.dirname(jobRepXml),
                          '%s-stderr.log' % (executor.step._internal_name))
    outLog = os.path.join(os.path.dirname(jobRepXml),
                          '%s-stdout.log' % (executor.step._internal_name))

    if os.path.exists(errLog):
        logTail = BasicAlgos.tail(errLog, 10)
        msg += '\n Adding last ten lines of CMSSW stderr:\n'
        msg += "".join(logTail)
    if os.path.exists(outLog):
        # BUGFIX: tail the stdout log here, not the stderr log again.
        logTail = BasicAlgos.tail(outLog, 10)
        msg += '\n Adding last ten lines of CMSSW stdout:\n'
        msg += "".join(logTail)

    # If it exists, grab the SCRAM log
    errLog = os.path.join(os.path.dirname(jobRepXml), 'scramOutput.log')
    if os.path.exists(errLog):
        logTail = BasicAlgos.tail(errLog, 25)
        # BUGFIX: 25 lines are tailed above; the message used to say "ten".
        msg += '\n Adding last 25 lines of SCRAM error log:\n'
        msg += "".join(logTail)

    # make sure the report has the error in it
    # (dropped the unused `errSection = getattr(...)` local)
    executor.report.addError(executor.step._internal_name,
                             errCode, description, msg)
    return
def __call__(self, errCode, executor, **args):
    """
    Added for Steve to handle SCRAM script failure

    Must fail job (since SCRAM didn't run)

    Appends the tail of the SCRAM output log (if present) to the error
    message, records error 50513 on the step report and forces the
    report status to failed.

    :param errCode: error code passed by the dispatcher (unused here;
        50513 is always recorded for SCRAM script failures)
    :param executor: step executor holding the report and step config
    :param args: optional keywords; 'ExceptionInstance' carries the
        original exception, if any
    """
    msg = "SCRAM scripts failed to run!\n"
    if args.get('ExceptionInstance', False):
        msg += str(args.get('ExceptionInstance'))

    jobReport = os.path.join(executor.step.builder.workingDir,
                             executor.step.output.jobReport)
    errLog = os.path.join(os.path.dirname(jobReport), 'scramOutput.log')

    if os.path.exists(errLog):
        logTail = BasicAlgos.tail(errLog, 25)
        # BUGFIX: 25 lines are tailed above; the message used to say "ten".
        msg += '\n Adding last 25 lines of SCRAM error log:\n'
        msg += "".join(logTail)

    executor.report.addError(executor.step._internal_name,
                             50513, "SCRAMScriptFailure", msg)

    # Then mark the job as failed
    if executor.report.report.status == 0:
        executor.report.report.status = 1
def complete(self, jobs):
    """
    Do any completion work required

    In this case, look for a returned logfile

    For every job, check that a non-empty pickled framework job report
    exists in the job cache directory; if one is missing, build a fallback
    report embedding the tails of the condor out/err/log files and save it
    under the expected report name.

    :param jobs: iterable of job dictionaries; each is expected to carry
        'cache_dir', 'retry_count', 'id' and 'gridid' keys
    """
    for job in jobs:
        if job.get('cache_dir', None) is None or job.get('retry_count', None) is None:
            # Then we can't do anything
            logging.error("Can't find this job's cache_dir or retry count: %s", job)
            continue
        reportName = os.path.join(job['cache_dir'],
                                  'Report.%i.pkl' % job['retry_count'])
        if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
            # everything in order, move on
            continue
        elif os.path.isdir(reportName):
            # Then something weird has happened. Report error, do nothing
            logging.error("The job report for job with id %s and gridid %s is a directory",
                          job['id'], job['gridid'])
            logging.error("Ignoring this, but this is very strange")
        else:
            logging.error("No job report for job with id %s and gridid %s",
                          job['id'], job['gridid'])
            # Drop any zero-size leftover so the fallback report can be saved.
            if os.path.isfile(reportName):
                os.remove(reportName)
            # create a report from scratch
            condorReport = Report()
            logOutput = 'Could not find jobReport\n'
            if os.path.isdir(job['cache_dir']):
                condorOut = "condor.%s.out" % job['gridid']
                condorErr = "condor.%s.err" % job['gridid']
                condorLog = "condor.%s.log" % job['gridid']
                for condorFile in [condorOut, condorErr, condorLog]:
                    condorFilePath = os.path.join(job['cache_dir'], condorFile)
                    if os.path.isfile(condorFilePath):
                        # Append the last 50 lines of each condor file to the
                        # fallback error message.
                        logTail = BasicAlgos.tail(condorFilePath, 50)
                        logOutput += 'Adding end of %s to error message:\n' % condorFile
                        logOutput += '\n'.join(logTail)
                condorReport.addError("NoJobReport", 99303, "NoJobReport", logOutput)
            else:
                # Cache directory vanished: recreate it so the fallback
                # report has somewhere to live.
                msg = "Serious Error in Completing condor job with id %s!\n" % job['id']
                msg += "Could not find jobCache directory %s\n" % job['cache_dir']
                msg += "Creating a new cache_dir for failed job report\n"
                logging.error(msg)
                os.makedirs(job['cache_dir'])
                condorReport.addError("NoJobReport", 99304, "NoCacheDir", logOutput)
            condorReport.save(filename=reportName)
            logging.debug("Created failed job report for job with id %s and gridid %s",
                          job['id'], job['gridid'])
    return
def __call__(self, errCode, executor, **args):
    """
    Diagnostic handler for a specific cmsRun exception exit code.

    Parses the framework job report if present, marks the step with this
    handler's exit code (self.code), appends the tails of the CMSSW logs
    to the message, and records the error on the step report unless an
    identically-described error is already there.

    :param errCode: error code passed by the dispatcher (self.code is
        what actually gets recorded)
    :param executor: step executor holding the report and step config
    :param args: optional keywords (unused here)
    """
    # Consistency fix: sibling handlers use logging, not a bare print.
    logging.critical("%s Diagnostic Handler invoked", self.__class__.__name__)
    msg = "Exit %s: %s Exception from cmsRun" % (self.code, self.desc)
    jobRepXml = os.path.join(executor.step.builder.workingDir,
                             executor.step.output.jobReport)

    if os.path.exists(jobRepXml):
        # job report XML exists, load the exception information from it
        try:
            executor.report.parse(jobRepXml)
        except FwkJobReportException:
            # Job report is bad, the parse already puts a 50115 in the file
            pass
        reportStep = executor.report.retrieveStep(executor.step._internal_name)
        reportStep.status = self.code

    errLog = os.path.join(os.path.dirname(jobRepXml),
                          '%s-stderr.log' % (executor.step._internal_name))
    outLog = os.path.join(os.path.dirname(jobRepXml),
                          '%s-stdout.log' % (executor.step._internal_name))

    if os.path.exists(errLog):
        logTail = BasicAlgos.tail(errLog, 10)
        msg += '\n Adding last ten lines of CMSSW stderr:\n'
        msg += "".join(logTail)
    if os.path.exists(outLog):
        # BUGFIX: tail the stdout log here, not the stderr log again.
        logTail = BasicAlgos.tail(outLog, 10)
        msg += '\n Adding last ten lines of CMSSW stdout:\n'
        msg += "".join(logTail)

    # make sure the report has the error in it
    errSection = getattr(executor.report.report, "errors", None)
    if errSection is None:
        executor.report.addError(executor.step._internal_name,
                                 self.code, self.desc, msg)
    else:
        # Only add if this description has not already been recorded.
        if not hasattr(errSection, self.desc):
            executor.report.addError(executor.step._internal_name,
                                     self.code, self.desc, msg)

    # Debug aid: show what the report now holds (was a bare print).
    logging.debug(executor.report.report.errors)
    return
def __call__(self, errCode, executor, **args):
    """
    Diagnostic handler for a specific cmsRun exception exit code.

    Parses the framework job report if present, marks the step with this
    handler's exit code (self.code), appends the tails of the CMSSW logs
    to the message, and records the error on the step report unless an
    identically-described error is already there.

    :param errCode: error code passed by the dispatcher (self.code is
        what actually gets recorded)
    :param executor: step executor holding the report and step config
    :param args: optional keywords (unused here)
    """
    # Consistency fix: sibling handlers use logging, not a bare print.
    logging.critical("%s Diagnostic Handler invoked", self.__class__.__name__)
    msg = "Exit %s: %s Exception from cmsRun" % (self.code, self.desc)
    jobRepXml = os.path.join(executor.step.builder.workingDir,
                             executor.step.output.jobReport)

    if os.path.exists(jobRepXml):
        # job report XML exists, load the exception information from it
        try:
            executor.report.parse(jobRepXml)
        except FwkJobReportException:
            # Job report is bad, the parse already puts a 50115 in the file
            pass
        reportStep = executor.report.retrieveStep(executor.step._internal_name)
        reportStep.status = self.code

    errLog = os.path.join(os.path.dirname(jobRepXml),
                          '%s-stderr.log' % (executor.step._internal_name))
    outLog = os.path.join(os.path.dirname(jobRepXml),
                          '%s-stdout.log' % (executor.step._internal_name))

    if os.path.exists(errLog):
        logTail = BasicAlgos.tail(errLog, 10)
        msg += '\n Adding last ten lines of CMSSW stderr:\n'
        msg += "".join(logTail)
    if os.path.exists(outLog):
        # BUGFIX: tail the stdout log here, not the stderr log again.
        logTail = BasicAlgos.tail(outLog, 10)
        msg += '\n Adding last ten lines of CMSSW stdout:\n'
        msg += "".join(logTail)

    # make sure the report has the error in it
    errSection = getattr(executor.report.report, "errors", None)
    if errSection is None:
        executor.report.addError(executor.step._internal_name,
                                 self.code, self.desc, msg)
    else:
        # Only add if this description has not already been recorded.
        if not hasattr(errSection, self.desc):
            executor.report.addError(executor.step._internal_name,
                                     self.code, self.desc, msg)

    # Debug aid: show what the report now holds (was a bare print).
    logging.debug(executor.report.report.errors)
    return
def parseCondorLogs(logfile, extension):
    """
    Retrieve the last X lines of the log file

    Expands `logfile` as a glob pattern, picks the most recently modified
    match, and returns its last 50 lines wrapped in a short header; an
    empty string when nothing matches.

    :param logfile: glob pattern pointing at candidate condor log files
    :param extension: label used in the header line of the returned text
    :returns: formatted tail text, or '' when no log file is found
    """
    collected = ''
    candidates = glob.glob(logfile)
    # Most recently modified match wins; None when the glob is empty.
    newestLog = max(candidates, key=lambda path: os.stat(path).st_mtime) if candidates else None
    if newestLog is not None and os.path.isfile(newestLog):
        tailText = BasicAlgos.tail(newestLog, 50)
        collected += 'Adding end of condor.%s to error message:\n' % extension
        collected += tailText
        collected += '\n\n'
    return collected
def complete(self, jobs):
    """
    Do any completion work required

    In this case, look for a returned logfile

    For every job, check that a non-empty pickled framework job report
    exists; if not, save a fallback report that embeds the tail of the
    job's condor.log.

    :param jobs: iterable of job dictionaries; each is expected to carry
        'cache_dir', 'retry_count' and 'id' keys
    """
    for job in jobs:
        if job.get("cache_dir", None) is None or job.get("retry_count", None) is None:
            # Then we can't do anything
            logging.error("Can't find this job's cache_dir in CondorPlugin.complete")
            logging.error("cache_dir: %s", job.get("cache_dir", "Missing"))
            logging.error("retry_count: %s", job.get("retry_count", "Missing"))
            continue
        reportName = os.path.join(job["cache_dir"],
                                  "Report.%i.pkl" % job["retry_count"])
        if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
            # Then we have a real report.
            # Do nothing
            continue
        if os.path.isdir(reportName):
            # Then something weird has happened.
            # File error, do nothing
            logging.error("Went to check on error report for job %i. Found a directory instead.\n" % job["id"])
            logging.error("Ignoring this, but this is very strange.\n")

        # If we're still here, we must not have a real error report
        logOutput = "Could not find jobReport"
        logPath = os.path.join(job["cache_dir"], "condor.log")
        if os.path.isfile(logPath):
            # BUGFIX: the original tailed `errLog`, which is never defined
            # in this function and raised NameError; tail `logPath`.
            logTail = BasicAlgos.tail(logPath, 50)
            logOutput += "Adding end of condor.log to error message:\n"
            logOutput += logTail
        condorReport = Report()
        condorReport.addError("NoJobReport", 61303, "NoJobReport", logOutput)
        condorReport.save(filename=reportName)
        logging.debug("No returning job report for job %i", job["id"])
    return
def complete(self, jobs):
    """
    Do any completion work required

    In this case, look for a returned logfile

    For every job, check that a non-empty pickled framework job report
    exists; if not, build a fallback report from the tails of the condor
    err/out/log files, classify the failure by scanning the condor log
    (removed by GLIDEIN / condor_rm vs. other removal) and the out/err
    files for WMException/ERROR messages, and save the fallback report.

    :param jobs: iterable of job dictionaries; each is expected to carry
        'cache_dir', 'retry_count', 'id' and 'gridid' keys
    """
    for job in jobs:
        if job.get('cache_dir', None) is None or job.get(
                'retry_count', None) is None:
            # Then we can't do anything
            logging.error(
                "Can't find this job's cache_dir or retry count: %s", job)
            continue
        reportName = os.path.join(job['cache_dir'],
                                  'Report.%i.pkl' % job['retry_count'])
        if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
            # everything in order, move on
            continue
        elif os.path.isdir(reportName):
            # Then something weird has happened. Report error, do nothing
            logging.error(
                "The job report for job with id %s and gridid %s is a directory",
                job['id'], job['gridid'])
            logging.error("Ignoring this, but this is very strange")
        else:
            logging.error("No job report for job with id %s and gridid %s",
                          job['id'], job['gridid'])
            # Drop any zero-size leftover so the fallback report can be saved.
            if os.path.isfile(reportName):
                os.remove(reportName)
            # create a report from scratch
            condorReport = Report()
            logOutput = 'Could not find jobReport\n'
            if os.path.isdir(job['cache_dir']):
                condorErr = "condor.%s.err" % job['gridid']
                condorOut = "condor.%s.out" % job['gridid']
                condorLog = "condor.%s.log" % job['gridid']
                # Defaults; overwritten when the condor log names a cause.
                exitCode = 99303
                exitType = "NoJobReport"
                for condorFile in [condorErr, condorOut, condorLog]:
                    condorFilePath = os.path.join(job['cache_dir'], condorFile)
                    logOutput += "\n========== %s ==========\n" % condorFile
                    if os.path.isfile(condorFilePath):
                        logTail = BasicAlgos.tail(condorFilePath, 50)
                        logOutput += 'Adding end of %s to error message:\n\n' % condorFile
                        logOutput += logTail
                        logOutput += '\n\n'
                        if condorFile == condorLog:
                            # for condor log, search for the information
                            for matchObj in getIterMatchObjectOnRegexp(
                                    condorFilePath, CONDOR_LOG_FILTER_REGEXP):
                                condorReason = matchObj.group("Reason")
                                if condorReason:
                                    logOutput += condorReason
                                    if "SYSTEM_PERIODIC_REMOVE" in condorReason or "via condor_rm" in condorReason:
                                        exitCode = 99400
                                        exitType = "RemovedByGLIDEIN"
                                    else:
                                        exitCode = 99401
                                siteName = matchObj.group("Site")
                                if siteName:
                                    condorReport.data.siteName = siteName
                                else:
                                    condorReport.data.siteName = "NoReportedSite"
                        else:
                            # err/out files: pull out WMException / ERROR text.
                            for matchObj in getIterMatchObjectOnRegexp(
                                    condorFilePath, WMEXCEPTION_REGEXP):
                                errMsg = matchObj.group('WMException')
                                if errMsg:
                                    logOutput += "\n\n%s\n" % errMsg
                                errMsg = matchObj.group('ERROR')
                                if errMsg:
                                    logOutput += "\n\n%s\n" % errMsg
                logOutput += '\n\n'
                condorReport.addError(exitType, exitCode, exitType, logOutput)
            else:
                # Cache directory vanished: recreate it so the fallback
                # report has somewhere to live.
                msg = "Serious Error in Completing condor job with id %s!\n" % job['id']
                msg += "Could not find jobCache directory %s\n" % job['cache_dir']
                msg += "Creating a new cache_dir for failed job report\n"
                logging.error(msg)
                os.makedirs(job['cache_dir'])
                condorReport.addError("NoJobReport", 99304, "NoCacheDir",
                                      logOutput)
            condorReport.save(filename=reportName)
            logging.debug(
                "Created failed job report for job with id %s and gridid %s",
                job['id'], job['gridid'])
    return
def complete(self, jobs):
    """
    Do any completion work required

    In this case, look for a returned logfile

    For every job, check that a non-empty pickled framework job report
    exists; if not, build a fallback "NoJobReport" report containing the
    tail of the newest condor log found in the cache directory, replacing
    any empty leftover report file along the way.

    :param jobs: iterable of job dictionaries; each is expected to carry
        'cache_dir', 'retry_count' and 'id' keys
    """
    for job in jobs:
        if job.get('cache_dir', None) == None or job.get('retry_count', None) == None:
            # Then we can't do anything
            logging.error("Can't find this job's cache_dir in CondorPlugin.complete")
            logging.error("cache_dir: %s" % job.get('cache_dir', 'Missing'))
            logging.error("retry_count: %s" % job.get('retry_count', 'Missing'))
            continue
        reportName = os.path.join(job['cache_dir'],
                                  'Report.%i.pkl' % job['retry_count'])
        if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
            # Then we have a real report.
            # Do nothing
            continue
        if os.path.isdir(reportName):
            # Then something weird has happened.
            # File error, do nothing
            logging.error("Went to check on error report for job %i. Found a directory instead.\n" % job['id'])
            logging.error("Ignoring this, but this is very strange.\n")

        # If we're still here, we must not have a real error report
        logOutput = 'Could not find jobReport\n'
        #But we don't know exactly the condor id, so it will append
        #the last lines of the latest condor log in cache_dir
        genLogPath = os.path.join(job['cache_dir'], 'condor.*.*.log')
        logPaths = glob.glob(genLogPath)
        errLog = None
        if len(logPaths):
            # Newest condor log by modification time.
            errLog = max(logPaths, key = lambda path : os.stat(path).st_mtime)
        if errLog != None and os.path.isfile(errLog):
            logTail = BasicAlgos.tail(errLog, 50)
            logOutput += 'Adding end of condor.log to error message:\n'
            logOutput += '\n'.join(logTail)

        if not os.path.isdir(job['cache_dir']):
            # Cache directory vanished: recreate it and record a
            # NoCacheDir error instead of the usual NoJobReport.
            msg =  "Serious Error in Completing condor job with id %s!\n" % job.get('id', 'unknown')
            msg += "Could not find jobCache directory - directory deleted under job: %s\n" % job['cache_dir']
            msg += "Creating artificial cache_dir for failed job report\n"
            logging.error(msg)
            os.makedirs(job['cache_dir'])
            logOutput += msg
            condorReport = Report()
            condorReport.addError("NoJobReport", 99304, "NoCacheDir", logOutput)
            condorReport.save(filename = reportName)
            continue

        condorReport = Report()
        condorReport.addError("NoJobReport", 99303, "NoJobReport", logOutput)
        if os.path.isfile(reportName):
            # Then we have a file already there.  It should be zero size due
            # to the if statements above, but we should remove it.
            if os.path.getsize(reportName) > 0:
                # This should never happen.  If it does, ignore it
                msg = "Critical strange problem.  FWJR changed size while being processed."
                logging.error(msg)
            else:
                try:
                    os.remove(reportName)
                    condorReport.save(filename = reportName)
                except Exception as ex:
                    # Best effort: losing the fallback report is preferable
                    # to crashing the completion loop here.
                    logging.error("Cannot remove and replace empty report %s" % reportName)
                    logging.error("Report continuing without error!")
        else:
            condorReport.save(filename = reportName)

        # Debug message to end loop
        logging.debug("No returning job report for job %i" % job['id'])
    return
def complete(self, jobs):
    """
    Do any completion work required

    In this case, look for a returned logfile

    For every job, check that a non-empty pickled framework job report
    exists; if not, build a fallback report from the tails of the condor
    err/out/log files, classify the failure by scanning the condor log
    (removed by GLIDEIN / condor_rm vs. other removal) and the out/err
    files for WMException/ERROR messages, and save the fallback report.

    :param jobs: iterable of job dictionaries; each is expected to carry
        'cache_dir', 'retry_count', 'id' and 'gridid' keys
    """
    for job in jobs:
        if job.get('cache_dir', None) is None or job.get('retry_count', None) is None:
            # Then we can't do anything
            logging.error("Can't find this job's cache_dir or retry count: %s", job)
            continue
        reportName = os.path.join(job['cache_dir'],
                                  'Report.%i.pkl' % job['retry_count'])
        if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
            # everything in order, move on
            continue
        elif os.path.isdir(reportName):
            # Then something weird has happened. Report error, do nothing
            logging.error("The job report for job with id %s and gridid %s is a directory",
                          job['id'], job['gridid'])
            logging.error("Ignoring this, but this is very strange")
        else:
            logging.error("No job report for job with id %s and gridid %s",
                          job['id'], job['gridid'])
            # Drop any zero-size leftover so the fallback report can be saved.
            if os.path.isfile(reportName):
                os.remove(reportName)
            # create a report from scratch
            condorReport = Report()
            logOutput = 'Could not find jobReport\n'
            if os.path.isdir(job['cache_dir']):
                condorErr = "condor.%s.err" % job['gridid']
                condorOut = "condor.%s.out" % job['gridid']
                condorLog = "condor.%s.log" % job['gridid']
                # Defaults; overwritten when the condor log names a cause.
                exitCode = 99303
                exitType = "NoJobReport"
                for condorFile in [condorErr, condorOut, condorLog]:
                    condorFilePath = os.path.join(job['cache_dir'], condorFile)
                    logOutput += "\n========== %s ==========\n" % condorFile
                    if os.path.isfile(condorFilePath):
                        logTail = BasicAlgos.tail(condorFilePath, 50)
                        logOutput += 'Adding end of %s to error message:\n\n' % condorFile
                        logOutput += logTail
                        logOutput += '\n\n'
                        if condorFile == condorLog:
                            # for condor log, search for the information
                            for matchObj in getIterMatchObjectOnRegexp(condorFilePath, CONDOR_LOG_FILTER_REGEXP):
                                condorReason = matchObj.group("Reason")
                                if condorReason:
                                    logOutput += condorReason
                                    if "SYSTEM_PERIODIC_REMOVE" in condorReason or "via condor_rm" in condorReason:
                                        exitCode = 99400
                                        exitType = "RemovedByGLIDEIN"
                                    else:
                                        exitCode = 99401
                                siteName = matchObj.group("Site")
                                if siteName:
                                    condorReport.data.siteName = siteName
                                else:
                                    condorReport.data.siteName = "NoReportedSite"
                        else:
                            # err/out files: pull out WMException / ERROR text.
                            for matchObj in getIterMatchObjectOnRegexp(condorFilePath, WMEXCEPTION_REGEXP):
                                errMsg = matchObj.group('WMException')
                                if errMsg:
                                    logOutput += "\n\n%s\n" % errMsg
                                errMsg = matchObj.group('ERROR')
                                if errMsg:
                                    logOutput += "\n\n%s\n" % errMsg
                logOutput += '\n\n'
                condorReport.addError(exitType, exitCode, exitType, logOutput)
            else:
                # Cache directory vanished: recreate it so the fallback
                # report has somewhere to live.
                msg = "Serious Error in Completing condor job with id %s!\n" % job['id']
                msg += "Could not find jobCache directory %s\n" % job['cache_dir']
                msg += "Creating a new cache_dir for failed job report\n"
                logging.error(msg)
                os.makedirs(job['cache_dir'])
                condorReport.addError("NoJobReport", 99304, "NoCacheDir", logOutput)
            condorReport.save(filename=reportName)
            logging.debug("Created failed job report for job with id %s and gridid %s",
                          job['id'], job['gridid'])
    return
def __call__(self, errCode, executor, **args):
    """
    _operator()_

    Look for the XML job report, try and read it and extract the error
    information from it

    Records a CMSSWStepFailure with the tails of the CMSSW logs, then
    ensures the parsed report carries an error entry (adding a
    MissingJobReport / MissingErrorReport / ErrorLoggingAddition entry
    as appropriate).

    :param errCode: non-zero exit code returned by cmsRun
    :param executor: step executor holding the report and step config
    :param args: optional keywords; 'ExceptionInstance' carries the
        original exception, if any
    """
    jobRepXml = os.path.join(executor.step.builder.workingDir,
                             executor.step.output.jobReport)

    errLog = os.path.join(os.path.dirname(jobRepXml),
                          '%s-stderr.log' % (executor.step._internal_name))
    outLog = os.path.join(os.path.dirname(jobRepXml),
                          '%s-stdout.log' % (executor.step._internal_name))

    addOn = '\n'
    # BUGFIX: msg must exist even when the stdout log is missing,
    # otherwise the addError() call below raises NameError.
    msg = ''
    if os.path.exists(errLog):
        logTail = BasicAlgos.tail(errLog, 10)
        addOn += '\nAdding last ten lines of CMSSW stderr:\n'
        addOn += "".join(logTail)
    else:
        logging.error("No stderr from CMSSW")
        # BUGFIX: list the report's directory (dirname), not its file
        # name (basename), which is not a directory.
        logging.error(os.listdir(os.path.dirname(jobRepXml)))

    if os.path.exists(outLog):
        # BUGFIX: tail the stdout log here, not the stderr log again.
        logTail = BasicAlgos.tail(outLog, 10)
        msg = '\n Adding last ten lines of CMSSW stdout:\n'
        msg += "".join(logTail)

    # Add the error we were sent
    ex = args.get('ExceptionInstance', None)
    executor.report.addError(executor.step._internal_name,
                             errCode, "CMSSWStepFailure", msg + str(ex))

    if not os.path.exists(jobRepXml):
        # no report => Error
        msg = "No Job Report Found: %s" % jobRepXml
        executor.report.addError(executor.step._internal_name,
                                 50115, "MissingJobReport", msg)
        return

    # job report XML exists, load the exception information from it
    try:
        executor.report.parse(jobRepXml)
    except FwkJobReportException:
        # Job report is bad, the parse already puts a 50115 in the file
        # just go on
        pass

    # make sure the report has the error in it
    errSection = getattr(executor.report.report, "errors", None)
    if errSection is None:
        msg = "Job Report contains no error report, but cmsRun exited non-zero: %s" % errCode
        msg += addOn
        executor.report.addError(executor.step._internal_name,
                                 50116, "MissingErrorReport", msg)
        return
    else:
        # check exit code in report is non zero
        if executor.report.report.status == 0:
            msg = "Job Report contains no error report, but cmsRun exited non-zero: %s" % errCode
            msg += addOn
            executor.report.addError(executor.step._internal_name,
                                     50116, "MissingErrorReport", msg)
        else:
            msg = "Adding extra error in order to hold error report"
            msg += addOn
            executor.report.addError(executor.step._internal_name,
                                     99999, "ErrorLoggingAddition", msg)
    return
def complete(self, jobs):
    """
    Do any completion work required

    In this case, look for a returned logfile

    For every job, check that a non-empty pickled framework job report
    exists in the job cache directory; if one is missing, build a fallback
    report embedding the tails of the condor out/err/log files and save it
    under the expected report name.

    :param jobs: iterable of job dictionaries; each is expected to carry
        'cache_dir', 'retry_count', 'id' and 'gridid' keys
    """
    for job in jobs:
        if job.get('cache_dir', None) is None or job.get(
                'retry_count', None) is None:
            # Then we can't do anything
            logging.error(
                "Can't find this job's cache_dir or retry count: %s", job)
            continue
        reportName = os.path.join(job['cache_dir'],
                                  'Report.%i.pkl' % job['retry_count'])
        if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
            # everything in order, move on
            continue
        elif os.path.isdir(reportName):
            # Then something weird has happened. Report error, do nothing
            logging.error(
                "The job report for job with id %s and gridid %s is a directory",
                job['id'], job['gridid'])
            logging.error("Ignoring this, but this is very strange")
        else:
            logging.error("No job report for job with id %s and gridid %s",
                          job['id'], job['gridid'])
            # Drop any zero-size leftover so the fallback report can be saved.
            if os.path.isfile(reportName):
                os.remove(reportName)
            # create a report from scratch
            condorReport = Report()
            logOutput = 'Could not find jobReport\n'
            if os.path.isdir(job['cache_dir']):
                condorOut = "condor.%s.out" % job['gridid']
                condorErr = "condor.%s.err" % job['gridid']
                condorLog = "condor.%s.log" % job['gridid']
                for condorFile in [condorOut, condorErr, condorLog]:
                    condorFilePath = os.path.join(job['cache_dir'], condorFile)
                    if os.path.isfile(condorFilePath):
                        # Append the last 50 lines of each condor file to
                        # the fallback error message.
                        logTail = BasicAlgos.tail(condorFilePath, 50)
                        logOutput += 'Adding end of %s to error message:\n' % condorFile
                        logOutput += '\n'.join(logTail)
                condorReport.addError("NoJobReport", 99303, "NoJobReport",
                                      logOutput)
            else:
                # Cache directory vanished: recreate it so the fallback
                # report has somewhere to live.
                msg = "Serious Error in Completing condor job with id %s!\n" % job['id']
                msg += "Could not find jobCache directory %s\n" % job['cache_dir']
                msg += "Creating a new cache_dir for failed job report\n"
                logging.error(msg)
                os.makedirs(job['cache_dir'])
                condorReport.addError("NoJobReport", 99304, "NoCacheDir",
                                      logOutput)
            condorReport.save(filename=reportName)
            logging.debug(
                "Created failed job report for job with id %s and gridid %s",
                job['id'], job['gridid'])
    return