Пример #1
0
    def __call__(self, errCode, executor, **args):
        """
        Record a miscellaneous CMSSW failure in the executor's job report.

        The error message starts from the exit code and the optional
        'ExceptionInstance' keyword argument, and is extended with the tails
        of the step's stderr/stdout logs and of scramOutput.log, for each of
        those files that exists next to the job report XML.

        :param errCode: exit code of the step; used as the error code and,
                        when the job report XML exists, as the step status
        :param executor: executor object; its step, stepName and report
                         attributes are read
        :param args: optional keywords, only 'ExceptionInstance' is read
        """
        logging.critical("%s Diagnostic Handler invoked", self.__class__.__name__)
        msg = "Error in CMSSW: %s\n" % (errCode)
        jobRepXml = os.path.join(executor.step.builder.workingDir,
                                 executor.step.output.jobReport)
        logDir = os.path.dirname(jobRepXml)

        description = "Misc. CMSSW error"
        excepInst = args.get('ExceptionInstance', None)
        if excepInst:
            # exceptions carrying a 'detail' attribute provide the description
            description = getattr(excepInst, 'detail', description)
            msg += str(excepInst)

        if os.path.exists(jobRepXml):
            # job report XML exists, load the exception information from it
            try:
                executor.report.parse(jobRepXml)
            except FwkJobReportException:
                # Job report is bad, the parse already puts a 50115 in the file
                pass
            reportStep = executor.report.retrieveStep(executor.stepName)
            reportStep.status = errCode

        # Grab stderr and stdout logs from CMSSW
        cmsswLogs = (
            ('%s-stderr.log', '\n Adding last %s lines of CMSSW stderr:\n'),
            ('%s-stdout.log', '\n Adding last %s lines of CMSSW stdout:\n'),
        )
        for nameTemplate, header in cmsswLogs:
            logPath = os.path.join(logDir, nameTemplate % (executor.stepName))
            if os.path.exists(logPath):
                logTail = FileTools.tail(logPath, DEFAULT_TAIL_LINES_FROM_LOG)
                msg += header % DEFAULT_TAIL_LINES_FROM_LOG
                msg += logTail

        # If it exists, grab the SCRAM log
        scramTailLines = 25
        scramLog = os.path.join(logDir, 'scramOutput.log')
        if os.path.exists(scramLog):
            logTail = FileTools.tail(scramLog, scramTailLines)
            # report the number of lines actually tailed (the header used to
            # claim "ten" while 25 lines were appended)
            msg += '\n Adding last %s lines of SCRAM error log:\n' % scramTailLines
            msg += logTail

        # make sure the report has the error in it
        executor.report.addError(executor.stepName,
                                 errCode, description, msg)

        return
Пример #2
0
    def __call__(self, errCode, executor, **args):
        """
        Diagnostic for a SCRAM script that failed to run.

        The job must fail since SCRAM did not run: a 50513
        "SCRAMScriptFailure" error is added to the report, carrying the
        optional 'ExceptionInstance' text and the tail of scramOutput.log
        when that file exists, and a report status of 0 is changed to 1.
        The errCode argument is not used.
        """
        pieces = ["SCRAM scripts failed to run!\n"]

        excepInst = args.get('ExceptionInstance', False)
        if excepInst:
            pieces.append(str(excepInst))

        reportPath = os.path.join(executor.step.builder.workingDir,
                                  executor.step.output.jobReport)
        scramLog = os.path.join(os.path.dirname(reportPath), 'scramOutput.log')

        if os.path.exists(scramLog):
            tailText = FileTools.tail(scramLog, DEFAULT_TAIL_LINES_FROM_LOG)
            pieces.append('\n Adding last %s lines of SCRAM error log:\n' % DEFAULT_TAIL_LINES_FROM_LOG)
            pieces.append(tailText)

        executor.report.addError(executor.stepName,
                                 50513, "SCRAMScriptFailure", "".join(pieces))

        # A report that still claims success has to be marked as failed
        if executor.report.report.status == 0:
            executor.report.report.status = 1
Пример #3
0
    def __call__(self, errCode, executor, **args):
        """
        Record a cmsRun exception using this handler's own code/description.

        self.code and self.desc (not the errCode argument) identify the
        error. If the job report XML exists it is parsed through self.parse
        and the step status is set to self.code. Tails of the step's stderr
        and stdout logs are appended to the message, and the error is added
        to the report unless the report's errors section already has an
        attribute named self.desc. The errors section is printed at the end.
        """
        print("%s Diagnostic Handler invoked" % self.__class__.__name__)
        message = "Exit %s: %s Exception from cmsRun" % (self.code, self.desc)
        reportXml = os.path.join(executor.step.builder.workingDir,
                                 executor.step.output.jobReport)

        if os.path.exists(reportXml):
            # job report XML exists, load the exception information from it
            try:
                self.parse(executor, reportXml)
            except FwkJobReportException:
                # Job report is bad, the parse already puts a 50115 in the file
                pass

            executor.report.retrieveStep(executor.stepName).status = self.code

        logDir = os.path.dirname(reportXml)
        logSpecs = (
            ('%s-stderr.log', '\n Adding last %s lines of CMSSW stderr:\n'),
            ('%s-stdout.log', '\n Adding last %s lines of CMSSW stdout:\n'),
        )
        for nameTemplate, header in logSpecs:
            logPath = os.path.join(logDir, nameTemplate % (executor.stepName))
            if not os.path.exists(logPath):
                continue
            tailText = FileTools.tail(logPath, DEFAULT_TAIL_LINES_FROM_LOG)
            message += header % DEFAULT_TAIL_LINES_FROM_LOG
            message += tailText

        # make sure the report has the error in it
        knownErrors = getattr(executor.report.report, "errors", None)
        if knownErrors is None or not hasattr(knownErrors, self.desc):
            executor.report.addError(executor.stepName,
                                     self.code, self.desc, message)

        print(executor.report.report.errors)
        return
Пример #4
0
    def __call__(self, errCode, executor, **args):
        """
        Record a cmsRun exception using this handler's own code/description.

        self.code and self.desc identify the error; the errCode argument is
        not consulted. If the job report XML exists it is parsed and the
        step status is set to self.code. Tails of the step's stderr and
        stdout logs are appended to the message, and the error is added to
        the report unless its errors section already has an attribute named
        self.desc. The errors section is printed at the end.

        :param errCode: exit code passed by the caller (unused here)
        :param executor: executor object; its step, stepName and report
                         attributes are read
        :param args: extra keyword arguments (unused here)
        """
        print("%s Diagnostic Handler invoked" % self.__class__.__name__)
        msg = "Exit %s: %s Exception from cmsRun" % (self.code, self.desc)
        jobRepXml = os.path.join(executor.step.builder.workingDir,
                                 executor.step.output.jobReport)

        if os.path.exists(jobRepXml):
            # job report XML exists, load the exception information from it
            try:
                executor.report.parse(jobRepXml)
            except FwkJobReportException:
                # Job report is bad, the parse already puts a 50115 in the file
                pass
            reportStep = executor.report.retrieveStep(executor.stepName)
            reportStep.status = self.code

        logDir = os.path.dirname(jobRepXml)
        cmsswLogs = (
            ('%s-stderr.log', '\n Adding last %s lines of CMSSW stderr:\n'),
            ('%s-stdout.log', '\n Adding last %s lines of CMSSW stdout:\n'),
        )
        for nameTemplate, header in cmsswLogs:
            logPath = os.path.join(logDir, nameTemplate % (executor.stepName))
            if os.path.exists(logPath):
                logTail = FileTools.tail(logPath, DEFAULT_TAIL_LINES_FROM_LOG)
                msg += header % DEFAULT_TAIL_LINES_FROM_LOG
                msg += logTail

        # make sure the report has the error in it; identity comparison is
        # used so a custom __eq__ on the errors section cannot interfere
        errSection = getattr(executor.report.report, "errors", None)
        if errSection is None or not hasattr(errSection, self.desc):
            executor.report.addError(executor.stepName,
                                     self.code, self.desc, msg)

        print(executor.report.report.errors)
        return
Пример #5
0
    def test_tail(self):
        """
        _tail_

        Can we tail a file?

        Writes sixteen one-letter lines to a scratch file in the current
        directory and checks that FileTools.tail returns the last 10 and the
        last 2 of them.
        """

        content = "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"
        tmpName = 'tmpfile.tmp'

        # the context manager closes the handle even if the write fails
        with open(tmpName, 'w') as handle:
            handle.write(content)

        try:
            self.assertEqual(FileTools.tail(tmpName, 10), "g\nh\ni\nj\nk\nl\nm\nn\no\np\n")

            self.assertEqual(FileTools.tail(tmpName, 2), "o\np\n")
        finally:
            # remove the scratch file even when an assertion fails
            os.remove(tmpName)

        return
Пример #6
0
    def test_tail(self):
        """
        _tail_

        Can we tail a file?

        Writes sixteen one-letter lines to a scratch file in the current
        directory and checks that FileTools.tail returns the last 10 and the
        last 2 of them.
        """

        content = "a\nb\nc\nd\ne\nf\ng\nh\ni\nj\nk\nl\nm\nn\no\np\n"
        tmpName = 'tmpfile.tmp'

        # the context manager closes the handle even if the write fails
        with open(tmpName, 'w') as handle:
            handle.write(content)

        try:
            self.assertEqual(FileTools.tail(tmpName, 10), "g\nh\ni\nj\nk\nl\nm\nn\no\np\n")

            self.assertEqual(FileTools.tail(tmpName, 2), "o\np\n")
        finally:
            # remove the scratch file even when an assertion fails
            os.remove(tmpName)

        return
Пример #7
0
    def __call__(self, errCode, executor, **args):
        """
        _operator()_

        Look for the XML job report, try and read it and extract the error information from it

        A "CMSSWStepFailure" error carrying the stdout tail (when the stdout
        log exists) and the 'ExceptionInstance' keyword is always added.
        Afterwards one of the following is added as well:

        - 50115 "MissingJobReport" when the job report XML does not exist
        - 50116 "MissingErrorReport" when the parsed report has no errors
          section or its status is 0
        - 99999 "ErrorLoggingAddition" otherwise

        :param errCode: non-zero exit code reported for cmsRun
        :param executor: executor object; its step, stepName and report
                         attributes are read
        :param args: optional keywords, only 'ExceptionInstance' is read
        """
        jobRepXml = os.path.join(executor.step.builder.workingDir,
                                 executor.step.output.jobReport)
        logDir = os.path.dirname(jobRepXml)

        errLog = os.path.join(logDir,
                              '%s-stderr.log' % (executor.stepName))
        outLog = os.path.join(logDir,
                              '%s-stdout.log' % (executor.stepName))

        addOn = '\n'
        if os.path.exists(errLog):
            logTail = FileTools.tail(errLog, 10)
            addOn += '\nAdding last ten lines of CMSSW stderr:\n'
            addOn += logTail
        else:
            logging.error("No stderr from CMSSW")
            # List the directory that should hold the log. The listing is
            # only diagnostic output, so a failure to list must not abort
            # the handler.
            try:
                logging.error(os.listdir(logDir))
            except OSError as listErr:
                logging.error("Could not list directory %s: %s", logDir, listErr)

        # Start from an empty message so the step failure below can still be
        # recorded when there is no stdout log
        msg = ''
        if os.path.exists(outLog):
            logTail = FileTools.tail(outLog, DEFAULT_TAIL_LINES_FROM_LOG)
            msg = '\n Adding last %s lines of CMSSW stdout:\n' % DEFAULT_TAIL_LINES_FROM_LOG
            msg += logTail

        # Add the error we were sent
        ex = args.get('ExceptionInstance', None)
        executor.report.addError(executor.stepName,
                                 errCode, "CMSSWStepFailure", msg + str(ex))

        if not os.path.exists(jobRepXml):
            # no report => Error
            msg = "No Job Report Found: %s" % jobRepXml
            executor.report.addError(executor.stepName,
                                     50115, "MissingJobReport", msg)
            return

        # job report XML exists, load the exception information from it
        try:
            executor.report.parse(jobRepXml)
        except FwkJobReportException:
            # Job report is bad, the parse already puts a 50115 in the file
            # just go on
            pass

        # make sure the report has the error in it: either there is no errors
        # section at all, or the report still claims a zero exit status
        errSection = getattr(executor.report.report, "errors", None)
        if errSection is None or executor.report.report.status == 0:
            msg = "Job Report contains no error report, but cmsRun exited non-zero: %s" % errCode
            msg += addOn
            executor.report.addError(executor.stepName,
                                     50116, "MissingErrorReport", msg)
        else:
            msg = "Adding extra error in order to hold error report"
            msg += addOn
            executor.report.addError(executor.stepName,
                                     99999, "ErrorLoggingAddition", msg)
        return
Пример #8
0
    def complete(self, jobs):
        """
        Do any completion work required

        For every job, check that its pickled job report
        (Report.<retry_count>.pkl inside the job's cache_dir) exists as a
        non-empty file. When it does not, build a failure report from the
        condor err/out/log files found in the cache_dir and save it under
        that name. Jobs lacking cache_dir or retry_count, and report paths
        that turn out to be directories, are only logged.
        """
        for job in jobs:
            cacheDir = job.get('cache_dir', None)
            retryCount = job.get('retry_count', None)
            if cacheDir is None or retryCount is None:
                # Then we can't do anything
                logging.error("Can't find this job's cache_dir or retry count: %s", job)
                continue

            reportName = os.path.join(cacheDir, 'Report.%i.pkl' % retryCount)

            if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
                # everything in order, move on
                continue

            if os.path.isdir(reportName):
                # Then something weird has happened. Report error, do nothing
                logging.error("The job report for job with id %s and gridid %s is a directory", job['id'],
                              job['gridid'])
                logging.error("Ignoring this, but this is very strange")
                continue

            logging.error("No job report for job with id %s and gridid %s", job['id'], job['gridid'])

            # only an empty report file can reach this point; drop it
            if os.path.isfile(reportName):
                os.remove(reportName)

            # create a report from scratch
            condorReport = Report()
            logOutput = 'Could not find jobReport\n'

            if not os.path.isdir(cacheDir):
                msg = ("Serious Error in Completing condor job with id %s!\n" % job['id']
                       + "Could not find jobCache directory %s\n" % cacheDir
                       + "Creating a new cache_dir for failed job report\n")
                logging.error(msg)
                os.makedirs(cacheDir)
                condorReport.addError("NoJobReport", 99304, "NoCacheDir", logOutput)
            else:
                exitCode = 99303
                exitType = "NoJobReport"
                condorErr = "condor.%s.err" % job['gridid']
                condorOut = "condor.%s.out" % job['gridid']
                condorLog = "condor.%s.log" % job['gridid']

                for condorFile in (condorErr, condorOut, condorLog):
                    condorFilePath = os.path.join(cacheDir, condorFile)
                    logOutput += "\n========== %s ==========\n" % condorFile
                    if not os.path.isfile(condorFilePath):
                        continue

                    logTail = FileTools.tail(condorFilePath, 50)
                    logOutput += 'Adding end of %s to error message:\n\n' % condorFile
                    logOutput += logTail
                    logOutput += '\n\n'

                    if condorFile == condorLog:
                        # for condor log, search for the information
                        for matchObj in getIterMatchObjectOnRegexp(condorFilePath, CONDOR_LOG_FILTER_REGEXP):
                            condorReason = matchObj.group("Reason")
                            if condorReason:
                                logOutput += condorReason
                                removalMarkers = ("SYSTEM_PERIODIC_REMOVE", "via condor_rm")
                                if any(marker in condorReason for marker in removalMarkers):
                                    exitCode = 99400
                                    exitType = "RemovedByGLIDEIN"
                                else:
                                    exitCode = 99401

                            siteName = matchObj.group("Site")
                            condorReport.data.siteName = siteName if siteName else "NoReportedSite"
                    else:
                        for matchObj in getIterMatchObjectOnRegexp(condorFilePath, WMEXCEPTION_REGEXP):
                            for groupName in ('WMException', 'ERROR'):
                                errMsg = matchObj.group(groupName)
                                if errMsg:
                                    logOutput += "\n\n%s\n" % errMsg

                logOutput += '\n\n'
                condorReport.addError(exitType, exitCode, exitType, logOutput)

            condorReport.save(filename=reportName)

            logging.debug("Created failed job report for job with id %s and gridid %s", job['id'], job['gridid'])

        return
Пример #9
0
    def complete(self, jobs):
        """
        Do any completion work required

        For every job, check that its pickled job report
        (Report.<retry_count>.pkl inside the job's cache_dir) exists as a
        non-empty file. When it does not, build a failure report from the
        condor err/out/log files found in the cache_dir and save it under
        that name. Jobs lacking cache_dir or retry_count, and report paths
        that turn out to be directories, are only logged.
        """
        for job in jobs:
            cacheDir = job.get('cache_dir', None)
            retryCount = job.get('retry_count', None)
            if cacheDir is None or retryCount is None:
                # Then we can't do anything
                logging.error("Can't find this job's cache_dir or retry count: %s", job)
                continue

            reportName = os.path.join(cacheDir, 'Report.%i.pkl' % retryCount)

            if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
                # everything in order, move on
                continue

            if os.path.isdir(reportName):
                # Then something weird has happened. Report error, do nothing
                logging.error("The job report for job with id %s and gridid %s is a directory", job['id'],
                              job['gridid'])
                logging.error("Ignoring this, but this is very strange")
                continue

            logging.error("No job report for job with id %s and gridid %s", job['id'], job['gridid'])

            # only an empty report file can reach this point; drop it
            if os.path.isfile(reportName):
                os.remove(reportName)

            # create a report from scratch
            condorReport = Report()
            logOutput = 'Could not find jobReport\n'

            if not os.path.isdir(cacheDir):
                msg = ("Serious Error in Completing condor job with id %s!\n" % job['id']
                       + "Could not find jobCache directory %s\n" % cacheDir
                       + "Creating a new cache_dir for failed job report\n")
                logging.error(msg)
                os.makedirs(cacheDir)
                condorReport.addError("NoJobReport", 99304, "NoCacheDir", logOutput)
            else:
                exitCode = 99303
                exitType = "NoJobReport"
                condorErr = "condor.%s.err" % job['gridid']
                condorOut = "condor.%s.out" % job['gridid']
                condorLog = "condor.%s.log" % job['gridid']

                for condorFile in (condorErr, condorOut, condorLog):
                    condorFilePath = os.path.join(cacheDir, condorFile)
                    logOutput += "\n========== %s ==========\n" % condorFile
                    if not os.path.isfile(condorFilePath):
                        continue

                    logTail = FileTools.tail(condorFilePath, 50)
                    logOutput += 'Adding end of %s to error message:\n\n' % condorFile
                    logOutput += logTail
                    logOutput += '\n\n'

                    if condorFile == condorLog:
                        # for condor log, search for the information
                        for matchObj in getIterMatchObjectOnRegexp(condorFilePath, CONDOR_LOG_FILTER_REGEXP):
                            condorReason = matchObj.group("Reason")
                            if condorReason:
                                logOutput += condorReason
                                removalMarkers = ("SYSTEM_PERIODIC_REMOVE", "via condor_rm")
                                if any(marker in condorReason for marker in removalMarkers):
                                    exitCode = 99400
                                    exitType = "RemovedByGLIDEIN"
                                else:
                                    exitCode = 99401

                            siteName = matchObj.group("Site")
                            condorReport.data.siteName = siteName if siteName else "NoReportedSite"
                    else:
                        for matchObj in getIterMatchObjectOnRegexp(condorFilePath, WMEXCEPTION_REGEXP):
                            for groupName in ('WMException', 'ERROR'):
                                errMsg = matchObj.group(groupName)
                                if errMsg:
                                    logOutput += "\n\n%s\n" % errMsg

                logOutput += '\n\n'
                condorReport.addError(exitType, exitCode, exitType, logOutput)

            condorReport.save(filename=reportName)

            logging.debug("Created failed job report for job with id %s and gridid %s", job['id'], job['gridid'])

        return