def getNodeStructure(self, job, site, workerNode, spaceReport=False, log=None):
    """ Define the node structure expected by the server.

    Fills a dictionary with identification fields taken from the job, site
    and worker node objects, the pilot/batch system ids, the optional log
    extract, the job start time (when a START_TIME_<jobId> file exists in the
    pilot init dir), the job metrics and the HPC sub status.

    Afterwards any recorded high priority error is applied to the job object:
    job.result[2] and job.pilotErrorDiag are overwritten in place.

    job: job object (reads workdir, jobId, result, attemptNr, hpcStatus).
    site: site object (reads sitename).
    workerNode: worker node object (reads nodename).
    spaceReport: not used in the visible part of this definition.
    log: optional log extract string; attached as 'pilotLog' for failed or
         holding jobs, or when it mentions "outbound connections".

    NOTE(review): this definition ends without returning `node` and a second
    definition with the same name follows it - presumably this copy is
    truncated or superseded; confirm before relying on it.
    """

    node = {}
    node['node'] = workerNode.nodename
    node['workdir'] = job.workdir
    node['siteName'] = site.sitename
    node['jobId'] = job.jobId
    node['state'] = job.result[0]
    node['timestamp'] = timeStamp()
    if job.attemptNr > -1:
        node['attemptNr'] = job.attemptNr
    if self.__jobSchedulerId:
        node['schedulerID'] = self.__jobSchedulerId
    if self.__pilotId:
        # report the batch system job id, if available
        batchSystemType, _id = getBatchSystemJobID()
        if batchSystemType:
            tolog("Batch system: %s" % (batchSystemType))
            tolog("Batch system job ID: %s" % (_id))
            node['pilotID'] = "%s|%s|%s|%s|%s" % (self.__pilotId, _id, batchSystemType, self.__pilot_version_tag, self.__pilot_version)
            node['batchID'] = _id
            tolog("Will send batchID: %s and pilotID: %s" % (node['batchID'], node['pilotID']))
        else:
            tolog("Batch system type was not identified (will not be reported)")
            node['pilotID'] = "%s|%s|%s" % (self.__pilotId, self.__pilot_version_tag, self.__pilot_version)
            tolog("Will send pilotID: %s" % (node['pilotID']))
        tolog("pilotId: %s" % str(self.__pilotId))
    if log and (job.result[0] == 'failed' or job.result[0] == 'holding' or "outbound connections" in log):
        node['pilotLog'] = log

    # add the startTime if the file exists
    _filename = 'START_TIME_%s' % (job.jobId)
    _path = os.path.join(self.__pilot_initdir, _filename)
    if os.path.exists(_path):
        startTime = readStringFromFile(_path)
        node['startTime'] = startTime

    # build the jobMetrics
    node['jobMetrics'] = self.getJobMetrics(job, workerNode)

    # for hpc status
    if job.hpcStatus:
        node['jobSubStatus'] = job.hpcStatus
    else:
        node['jobSubStatus'] = ''

    # check to see if there were any high priority errors reported
    errorInfo = getHighestPriorityError(job.jobId, self.__pilot_initdir)
    if errorInfo != {}:
        try:
            pilotErrorCode = errorInfo['pilotErrorCode']
            pilotErrorDiag = errorInfo['pilotErrorDiag']
        except Exception as e:
            # "as" form is accepted by Python 2.6+ and Python 3 alike
            tolog("!!WARNING!!2323!! Exception caught: %s" % (e))
        else:
            # Overwrite any existing errors
            if job.result[2] != 0:
                tolog("Encountered high priority error code %d (will overwrite error code %d)" % (pilotErrorCode, job.result[2]))
            else:
                tolog("Encountered high priority error code %d" % (pilotErrorCode))
            job.result[2] = pilotErrorCode
            job.pilotErrorDiag = pilotErrorDiag
def getNodeStructure(self, job, site, workerNode, spaceReport=False, log=None):
    """ Define the node structure expected by the server.

    Builds and returns a dictionary describing the current state of the job.

    job: job object; its result list is read as [state, transExitCode,
         pilotErrorCode] (indices 0, 1 and 2 as used below).
    site: site object (reads sitename, dq2space, dq2spmsg).
    workerNode: worker node object (reads nodename; passed to getJobMetrics).
    spaceReport: when True and site.dq2space != -1, the remaining space and
                 its message level are added to the node.
    log: optional log extract string; attached as 'pilotLog' for failed or
         holding jobs, or when it mentions "outbound connections".

    Returns the node dictionary.

    Side effects on the job object: job.pilotErrorDiag may be replaced or
    truncated to 250 characters, and job.result[2] is negated for failed jobs
    on ANALY sites when a pilot-controlled resubmission applies.
    """

    # the state itself is never modified here, so read it once
    state = job.result[0]

    node = {}
    node['node'] = workerNode.nodename
    node['workdir'] = job.workdir
    node['siteName'] = site.sitename
    node['jobId'] = job.jobId
    node['state'] = state
    node['timestamp'] = timeStamp()
    if job.attemptNr > -1:
        node['attemptNr'] = job.attemptNr
    if self.__jobSchedulerId:
        node['schedulerID'] = self.__jobSchedulerId
    if self.__pilotId:
        # report the batch system job id, if available
        batchSystemType, _id = getBatchSystemJobID()
        if batchSystemType:
            tolog("Batch system: %s" % (batchSystemType))
            tolog("Batch system job ID: %s" % (_id))
            node['pilotID'] = "%s|%s|%s|%s|%s" % (self.__pilotId, _id, batchSystemType, self.__pilot_version_tag, self.__pilot_version)
            node['batchID'] = _id
            tolog("Will send batchID: %s and pilotID: %s" % (node['batchID'], node['pilotID']))
        else:
            tolog("Batch system type was not identified (will not be reported)")
            node['pilotID'] = "%s|%s|%s" % (self.__pilotId, self.__pilot_version_tag, self.__pilot_version)
            tolog("Will send pilotID: %s" % (node['pilotID']))
        tolog("pilotId: %s" % str(self.__pilotId))
    if log and (state in ('failed', 'holding') or "outbound connections" in log):
        node['pilotLog'] = log

    # build the jobMetrics
    node['jobMetrics'] = self.getJobMetrics(job, workerNode)

    # send pilotErrorDiag for finished, failed and holding jobs
    if state in ('finished', 'failed', 'holding'):
        # get the pilot error diag
        if job.pilotErrorDiag:
            if "<HTML>" in job.pilotErrorDiag.upper():
                # do not forward html; use the standard text for the error code instead
                tolog("Found html in pilotErrorDiag: %s" % (job.pilotErrorDiag))
                node['pilotErrorDiag'] = self.__error.getPilotErrorDiag(job.result[2])
                job.pilotErrorDiag = node['pilotErrorDiag']
                tolog("Updated pilotErrorDiag: %s" % (job.pilotErrorDiag))
            else:
                # truncate if necessary
                if len(job.pilotErrorDiag) > 250:
                    tolog("pilotErrorDiag will be truncated to size 250")
                    tolog("Original pilotErrorDiag message: %s" % (job.pilotErrorDiag))
                    job.pilotErrorDiag = job.pilotErrorDiag[:250]
                # set the pilotErrorDiag, but only the last 256 characters
                node['pilotErrorDiag'] = tailPilotErrorDiag(job.pilotErrorDiag)
        else:
            # None or empty string: fall back to the standard text for the error code
            # (set the pilotErrorDiag, but only the last 256 characters)
            job.pilotErrorDiag = self.__error.getPilotErrorDiag(job.result[2])
            node['pilotErrorDiag'] = tailPilotErrorDiag(job.pilotErrorDiag)
            tolog("Updated pilotErrorDiag from None: %s" % (job.pilotErrorDiag))

        # get the number of events
        if job.nEvents != 0:
            node['nEvents'] = job.nEvents
            tolog("Total number of processed events: %d (read)" % (job.nEvents))
        else:
            tolog("runJob did not report on the total number of read events")

    if state in ('finished', 'failed'):
        # make sure there is no mismatch between the transformation error codes (when both are reported)
        # send transformation errors depending on what is available
        if job.exeErrorDiag != "":
            node['exeErrorCode'] = job.exeErrorCode
            node['exeErrorDiag'] = job.exeErrorDiag
        else:
            node['transExitCode'] = job.result[1]
        if state == 'failed' and job.exeErrorCode != 0 and job.result[1] != job.exeErrorCode:
            if log:
                mismatch = "MISMATCH | Trf error code mismatch: exeErrorCode = %d, transExitCode = %d" % \
                           (job.exeErrorCode, job.result[1])
                if 'pilotLog' in node:
                    node['pilotLog'] = mismatch + node['pilotLog']
                else:
                    tolog("!!WARNING!!1300!! Could not write mismatch error to log extracts: %s" % mismatch)

        # check if Pilot-controlled resubmission is required:
        if state == "failed" and 'ANALY' in site.sitename:
            pilotExitCode = job.result[2]
            error = PilotErrors()
            if error.isPilotResubmissionErrorCode(pilotExitCode) or job.isPilotResubmissionRequired:
                # negate PilotError, ensure it's negative
                job.result[2] = -abs(pilotExitCode)
                tolog("(Negated error code)")
            else:
                tolog("(No need to negate error code)")

        # must be read after the possible negation above
        node['pilotErrorCode'] = job.result[2]
        tolog("Pilot error code: %d" % (node['pilotErrorCode']))

        # report CPUTime and CPUunit at the end of the job
        node['cpuConsumptionTime'] = job.cpuConsumptionTime
        try:
            node['cpuConsumptionUnit'] = job.cpuConsumptionUnit + "+" + getCPUmodel()
        except Exception:
            # best effort: report an unknown unit rather than failing the update
            node['cpuConsumptionUnit'] = '?'
        node['cpuConversionFactor'] = job.cpuConversionFactor

        # report specific time measures, in the order: getJob|stageIn|payload|stageOut|setup
        node['pilotTiming'] = "%s|%s|%s|%s|%s" % (job.timeGetJob, job.timeStageIn, job.timeExe, job.timeStageOut, job.timeSetup)

    elif state == 'holding':
        node['exeErrorCode'] = job.result[2]
        node['exeErrorDiag'] = self.__error.getPilotErrorDiag(job.result[2])

    else:
        node['cpuConsumptionUnit'] = getCPUmodel()

    if spaceReport and site.dq2space != -1:  # non-empty string and the space check function runs well
        node['remainingSpace'] = site.dq2space
        node['messageLevel'] = site.dq2spmsg

    return node