Пример #1
0
    def getNodeStructure(self, job, site, workerNode, spaceReport=False, log=None):
        """Define the node structure expected by the server.

        Parameters:
            job: job object; workdir, jobId, result, attemptNr, hpcStatus and
                pilotErrorDiag are read here (result[0] is reported as the
                state, result[2] holds the pilot error code).
            site: site object; only sitename is read in this part.
            workerNode: worker node object; nodename is read and the object
                is handed to getJobMetrics().
            spaceReport: not used in the visible part of this method.
            log: optional log text; stored as node['pilotLog'] when the job
                state is 'failed' or 'holding', or when the text contains
                "outbound connections".

        Side effects:
            job.result[2] and job.pilotErrorDiag are overwritten when
            getHighestPriorityError() reports an error for this job.

        NOTE(review): this excerpt ends without returning ``node``; the rest
        of the method appears to be cut off -- confirm against the full file.
        """

        node = {}

        node['node'] = workerNode.nodename
        node['workdir'] = job.workdir
        node['siteName'] = site.sitename
        node['jobId'] = job.jobId
        node['state'] = job.result[0]
        node['timestamp'] = timeStamp()
        if job.attemptNr > -1:
            node['attemptNr'] = job.attemptNr
        if self.__jobSchedulerId:
            node['schedulerID'] = self.__jobSchedulerId
        if self.__pilotId:
            # report the batch system job id, if available
            batchSystemType, _id = getBatchSystemJobID()
            if batchSystemType:
                tolog("Batch system: %s" % (batchSystemType))
                tolog("Batch system job ID: %s" % (_id))
                # five-field form: pilot id | batch job id | batch system | version tag | version
                node['pilotID'] = "%s|%s|%s|%s|%s" % (self.__pilotId, _id, batchSystemType, self.__pilot_version_tag, self.__pilot_version)
                node['batchID'] = _id
                tolog("Will send batchID: %s and pilotID: %s" % (node['batchID'], node['pilotID']))
            else:
                tolog("Batch system type was not identified (will not be reported)")
                # three-field form: pilot id | version tag | version
                node['pilotID'] = "%s|%s|%s" % (self.__pilotId, self.__pilot_version_tag, self.__pilot_version)
                tolog("Will send pilotID: %s" % (node['pilotID']))
            tolog("pilotId: %s" % str(self.__pilotId))
        if log and (job.result[0] == 'failed' or job.result[0] == 'holding' or "outbound connections" in log):
            node['pilotLog'] = log

        # add the startTime if the file exists
        _filename = 'START_TIME_%s' % (job.jobId)
        _path = os.path.join(self.__pilot_initdir, _filename)
        if os.path.exists(_path):
            node['startTime'] = readStringFromFile(_path)

        # build the jobMetrics
        node['jobMetrics'] = self.getJobMetrics(job, workerNode)

        # for hpc status (always sent; empty string when there is none)
        if job.hpcStatus:
            node['jobSubStatus'] = job.hpcStatus
        else:
            node['jobSubStatus'] = ''

        # check to see if there were any high priority errors reported
        errorInfo = getHighestPriorityError(job.jobId, self.__pilot_initdir)
        if errorInfo != {}:
            try:
                pilotErrorCode = errorInfo['pilotErrorCode']
                pilotErrorDiag = errorInfo['pilotErrorDiag']
            except Exception as e:
                # "as" form is valid on Python 2.6+ and Python 3, unlike "except X, e"
                tolog("!!WARNING!!2323!! Exception caught: %s" % (e))
            else:
                # Overwrite any existing errors
                if job.result[2] != 0:
                    tolog("Encountered high priority error code %d (will overwrite error code %d)" % (pilotErrorCode, job.result[2]))
                else:
                    tolog("Encountered high priority error code %d" % (pilotErrorCode))
                job.result[2] = pilotErrorCode
                job.pilotErrorDiag = pilotErrorDiag
Пример #2
0
    def getNodeStructure(self, job, site, workerNode, spaceReport=False, log=None):
        """Define the node structure expected by the server.

        Parameters:
            job: job object; workdir, jobId, result, attemptNr, hpcStatus and
                pilotErrorDiag are read here (result[0] is reported as the
                state, result[2] holds the pilot error code).
            site: site object; only sitename is read in this part.
            workerNode: worker node object; nodename is read and the object
                is handed to getJobMetrics().
            spaceReport: not used in the visible part of this method.
            log: optional log text; stored as node['pilotLog'] when the job
                state is 'failed' or 'holding', or when the text contains
                "outbound connections".

        Side effects:
            job.result[2] and job.pilotErrorDiag are overwritten when
            getHighestPriorityError() reports an error for this job.

        NOTE(review): this excerpt ends without returning ``node``; the rest
        of the method appears to be cut off -- confirm against the full file.
        """

        node = {}

        node['node'] = workerNode.nodename
        node['workdir'] = job.workdir
        node['siteName'] = site.sitename
        node['jobId'] = job.jobId
        node['state'] = job.result[0]
        node['timestamp'] = timeStamp()
        if job.attemptNr > -1:
            node['attemptNr'] = job.attemptNr
        if self.__jobSchedulerId:
            node['schedulerID'] = self.__jobSchedulerId
        if self.__pilotId:
            # report the batch system job id, if available
            batchSystemType, _id = getBatchSystemJobID()
            if batchSystemType:
                tolog("Batch system: %s" % (batchSystemType))
                tolog("Batch system job ID: %s" % (_id))
                # five-field form: pilot id | batch job id | batch system | version tag | version
                node['pilotID'] = "%s|%s|%s|%s|%s" % (self.__pilotId, _id, batchSystemType, self.__pilot_version_tag, self.__pilot_version)
                node['batchID'] = _id
                tolog("Will send batchID: %s and pilotID: %s" % (node['batchID'], node['pilotID']))
            else:
                tolog("Batch system type was not identified (will not be reported)")
                # three-field form: pilot id | version tag | version
                node['pilotID'] = "%s|%s|%s" % (self.__pilotId, self.__pilot_version_tag, self.__pilot_version)
                tolog("Will send pilotID: %s" % (node['pilotID']))
            tolog("pilotId: %s" % str(self.__pilotId))
        if log and (job.result[0] == 'failed' or job.result[0] == 'holding' or "outbound connections" in log):
            node['pilotLog'] = log

        # add the startTime if the file exists
        _filename = 'START_TIME_%s' % (job.jobId)
        _path = os.path.join(self.__pilot_initdir, _filename)
        if os.path.exists(_path):
            node['startTime'] = readStringFromFile(_path)

        # build the jobMetrics
        node['jobMetrics'] = self.getJobMetrics(job, workerNode)

        # for hpc status (always sent; empty string when there is none)
        if job.hpcStatus:
            node['jobSubStatus'] = job.hpcStatus
        else:
            node['jobSubStatus'] = ''

        # check to see if there were any high priority errors reported
        errorInfo = getHighestPriorityError(job.jobId, self.__pilot_initdir)
        if errorInfo != {}:
            try:
                pilotErrorCode = errorInfo['pilotErrorCode']
                pilotErrorDiag = errorInfo['pilotErrorDiag']
            except Exception as e:
                # "as" form is valid on Python 2.6+ and Python 3, unlike "except X, e"
                tolog("!!WARNING!!2323!! Exception caught: %s" % (e))
            else:
                # Overwrite any existing errors
                if job.result[2] != 0:
                    tolog("Encountered high priority error code %d (will overwrite error code %d)" % (pilotErrorCode, job.result[2]))
                else:
                    tolog("Encountered high priority error code %d" % (pilotErrorCode))
                job.result[2] = pilotErrorCode
                job.pilotErrorDiag = pilotErrorDiag
Пример #3
0
    def getNodeStructure(self, job, site, workerNode, spaceReport=False, log=None):
        """Define the node structure (a dictionary) expected by the server.

        Parameters:
            job: job object; job.result is indexed as [0] state,
                [1] transformation exit code, [2] pilot error code. Also read:
                workdir, jobId, attemptNr, pilotErrorDiag, nEvents,
                exeErrorCode/exeErrorDiag, the cpu* and time* attributes and
                isPilotResubmissionRequired.
            site: site object; sitename, dq2space and dq2spmsg are read.
            workerNode: worker node object; nodename is read and the object
                is handed to getJobMetrics().
            spaceReport: when true and site.dq2space is not -1, the remaining
                space and its message level are added.
            log: optional log text; stored as node['pilotLog'] when the job
                state is 'failed' or 'holding', or when the text contains
                "outbound connections".

        Returns:
            The populated node dictionary.

        Side effects:
            job.pilotErrorDiag may be replaced (when empty/None or containing
            HTML) or truncated to 250 characters; job.result[2] is negated
            for failed jobs on 'ANALY' sites that require pilot-controlled
            resubmission.
        """

        node = {}

        node['node'] = workerNode.nodename
        node['workdir'] = job.workdir
        node['siteName'] = site.sitename
        node['jobId'] = job.jobId
        node['state'] = job.result[0]
        node['timestamp'] = timeStamp()
        if job.attemptNr > -1:
            node['attemptNr'] = job.attemptNr
        if self.__jobSchedulerId:
            node['schedulerID'] = self.__jobSchedulerId
        if self.__pilotId:
            # report the batch system job id, if available
            batchSystemType, _id = getBatchSystemJobID()
            if batchSystemType:
                tolog("Batch system: %s" % (batchSystemType))
                tolog("Batch system job ID: %s" % (_id))
                # five-field form: pilot id | batch job id | batch system | version tag | version
                node['pilotID'] = "%s|%s|%s|%s|%s" % (self.__pilotId, _id, batchSystemType, self.__pilot_version_tag, self.__pilot_version)
                node['batchID'] = _id
                tolog("Will send batchID: %s and pilotID: %s" % (node['batchID'], node['pilotID']))
            else:
                tolog("Batch system type was not identified (will not be reported)")
                # three-field form: pilot id | version tag | version
                node['pilotID'] = "%s|%s|%s" % (self.__pilotId, self.__pilot_version_tag, self.__pilot_version)
                tolog("Will send pilotID: %s" % (node['pilotID']))
            tolog("pilotId: %s" % str(self.__pilotId))
        if log and (job.result[0] == 'failed' or job.result[0] == 'holding' or "outbound connections" in log):
            node['pilotLog'] = log

        # build the jobMetrics
        node['jobMetrics'] = self.getJobMetrics(job, workerNode)

        # the state is only read (never written) from here on, so fetch it once
        state = job.result[0]

        # send pilotErrorDiag for finished, failed and holding jobs
        if state in ('finished', 'failed', 'holding'):
            # get the pilot error diag
            if job.pilotErrorDiag:
                if job.pilotErrorDiag.upper().find("<HTML>") >= 0:
                    # an HTML page is useless as a diagnostic; fall back to the
                    # standard message for the pilot error code
                    tolog("Found html in pilotErrorDiag: %s" % (job.pilotErrorDiag))
                    node['pilotErrorDiag'] = self.__error.getPilotErrorDiag(job.result[2])
                    job.pilotErrorDiag = node['pilotErrorDiag']
                    tolog("Updated pilotErrorDiag: %s" % (job.pilotErrorDiag))
                else:
                    # truncate if necessary
                    if len(job.pilotErrorDiag) > 250:
                        tolog("pilotErrorDiag will be truncated to size 250")
                        tolog("Original pilotErrorDiag message: %s" % (job.pilotErrorDiag))
                        job.pilotErrorDiag = job.pilotErrorDiag[:250]
                    # set the pilotErrorDiag, but only the last 256 characters
                    node['pilotErrorDiag'] = tailPilotErrorDiag(job.pilotErrorDiag)
            else:
                # pilotErrorDiag is None or empty (both are falsy): derive it
                # from the pilot error code, keeping only the last 256 characters
                job.pilotErrorDiag = self.__error.getPilotErrorDiag(job.result[2])
                node['pilotErrorDiag'] = tailPilotErrorDiag(job.pilotErrorDiag)
                tolog("Updated pilotErrorDiag from None: %s" % (job.pilotErrorDiag))

            # get the number of events
            if job.nEvents != 0:
                node['nEvents'] = job.nEvents
                tolog("Total number of processed events: %d (read)" % (job.nEvents))
            else:
                tolog("runJob did not report on the total number of read events")

        if state == 'finished' or state == 'failed':
            # make sure there is no mismatch between the transformation error codes (when both are reported)
            # send transformation errors depending on what is available
            if job.exeErrorDiag != "":
                node['exeErrorCode'] = job.exeErrorCode
                node['exeErrorDiag'] = job.exeErrorDiag
            else:
                node['transExitCode'] = job.result[1]
            if (state == 'failed') and (job.exeErrorCode != 0) and (job.result[1] != job.exeErrorCode):
                if log:
                    mismatch = "MISMATCH | Trf error code mismatch: exeErrorCode = %d, transExitCode = %d" %\
                               (job.exeErrorCode, job.result[1])
                    # "in" instead of dict.has_key(), which no longer exists in Python 3
                    if 'pilotLog' in node:
                        node['pilotLog'] = mismatch + node['pilotLog']
                    else:
                        tolog("!!WARNING!!1300!! Could not write mismatch error to log extracts: %s" % mismatch)

            # check if Pilot-controlled resubmission is required:
            if (state == "failed" and 'ANALY' in site.sitename):
                pilotExitCode = job.result[2]
                error = PilotErrors()
                if (error.isPilotResubmissionErrorCode(pilotExitCode) or job.isPilotResubmissionRequired):
                    # negate PilotError, ensure it's negative
                    job.result[2] = -abs(pilotExitCode)
                    tolog("(Negated error code)")
                else:
                    tolog("(No need to negate error code)")

            node['pilotErrorCode'] = job.result[2]
            tolog("Pilot error code: %d" % (node['pilotErrorCode']))

            # report CPUTime and CPUunit at the end of the job
            node['cpuConsumptionTime'] = job.cpuConsumptionTime
            try:
                node['cpuConsumptionUnit'] = job.cpuConsumptionUnit + "+" + getCPUmodel()
            except Exception:
                # best effort: report an unknown unit rather than fail the update,
                # but do not swallow KeyboardInterrupt/SystemExit
                node['cpuConsumptionUnit'] = '?'
            node['cpuConversionFactor'] = job.cpuConversionFactor

            # report specific time measures; field order is
            # getJob|stageIn|payload|stageOut|setup
            node['pilotTiming'] = "%s|%s|%s|%s|%s" % (job.timeGetJob, job.timeStageIn, job.timeExe, job.timeStageOut, job.timeSetup)
        elif state == 'holding':
            # for holding jobs the pilot error code is reported in the exe error fields
            node['exeErrorCode'] = job.result[2]
            node['exeErrorDiag'] = self.__error.getPilotErrorDiag(job.result[2])

        else:
            node['cpuConsumptionUnit'] = getCPUmodel()

        if spaceReport and site.dq2space != -1: # non-empty string and the space check function runs well
            node['remainingSpace'] = site.dq2space
            node['messageLevel'] = site.dq2spmsg

        return node