# Overwrite any existing errors if job.result[2] != 0: tolog("Encountered high priority error code %d (will overwrite error code %d)" % (pilotErrorCode, job.result[2])) else: tolog("Encountered high priority error code %d" % (pilotErrorCode)) job.result[2] = pilotErrorCode job.pilotErrorDiag = pilotErrorDiag else: tolog("Did not find any reported high priority errors") # send pilotErrorDiag for finished, failed and holding jobs if job.result[0] == 'finished' or job.result[0] == 'failed' or job.result[0] == 'holding': # get the pilot error diag from the right source if job.pilotErrorDiag: if job.pilotErrorDiag == "": node['pilotErrorDiag'] = tailPilotErrorDiag(self.__error.getPilotErrorDiag(job.result[2])) job.pilotErrorDiag = node['pilotErrorDiag'] tolog("Empty pilotErrorDiag set to: %s" % (job.pilotErrorDiag)) elif job.pilotErrorDiag.upper().find("<HTML>") >= 0: tolog("Found html in pilotErrorDiag: %s" % (job.pilotErrorDiag)) node['pilotErrorDiag'] = self.__error.getPilotErrorDiag(job.result[2]) job.pilotErrorDiag = node['pilotErrorDiag'] tolog("Updated pilotErrorDiag: %s" % (job.pilotErrorDiag)) else: # truncate if necesary if len(job.pilotErrorDiag) > 250: tolog("pilotErrorDiag will be truncated to size 250") tolog("Original pilotErrorDiag message: %s" % (job.pilotErrorDiag)) job.pilotErrorDiag = job.pilotErrorDiag[:250] # set the pilotErrorDiag, but only the last 256 characters node['pilotErrorDiag'] = tailPilotErrorDiag(job.pilotErrorDiag)
experiment=job.experiment, cmtconfig=cmtconfig, recoveryWorkDir=thisSite.workdir, fileDestinationSE=job.fileDestinationSE, job=job, ) except Exception, e: pilotErrorDiag = "Put function can not be called for staging out: %s" % str(e) log("!!%s!!1105!! %s" % (env["errorLabel"], pilotErrorDiag)) ec = PilotErrors().ERR_PUTFUNCNOCALL _state = ReturnCode.Holding _msg = env["errorLabel"] else: if pilotErrorDiag != "": pilotErrorDiag = "Put error: " + pUtil.tailPilotErrorDiag( pilotErrorDiag, size=256 - len("pilot: Put error: ") ) ec = rc log("Put function returned code: %d" % (rc)) if rc != 0: # remove any trailing "\r" or "\n" (there can be two of them) if rs is not None: rs = rs.rstrip() log(" Error string: %s" % (rs)) # is the job recoverable? if PilotErrors().isRecoverableErrorCode(rc): _state = ReturnCode.Holding _msg = "WARNING" else:
tin_1 = os.times() job.timeStageOut = int(round(tin_1[4] - tin_0[4])) if 'format_exc' in traceback.__all__: trace = traceback.format_exc() pilotErrorDiag = "Put function can not be called for staging out: %s, %s" % (str(e), trace) else: tolog("traceback.format_exc() not available in this python version") pilotErrorDiag = "Put function can not be called for staging out: %s" % (str(e)) tolog("!!WARNING!!3000!! %s" % (pilotErrorDiag)) rc = error.ERR_PUTFUNCNOCALL job.setState(["holding", job.result[1], rc]) else: if job.pilotErrorDiag != "": job.pilotErrorDiag = "Put error: " + tailPilotErrorDiag(job.pilotErrorDiag, size=256-len("pilot: Put error: ")) tolog("Put function returned code: %d" % (rc)) if rc != 0: # remove any trailing "\r" or "\n" (there can be two of them) if rs != None: rs = rs.rstrip() tolog("Error string: %s" % (rs)) # is the job recoverable? if error.isRecoverableErrorCode(rc): _state = "holding" _msg = "WARNING" else: _state = "failed" _msg = "FAILED"
thisSite.sitename, thisSite.computingElement, analysisJob=pUtil.isAnalysisJob(job.trf.split(",")[0]), proxycheck=DorE(kwargs, 'proxycheckFlag'), pinitdir=DorE(kwargs, 'pilot_initdir'), datasetDict=datasetDict, stageoutTries=DorE(kwargs, 'stageoutretry'), cmtconfig=cmtconfig, recoveryWorkDir=thisSite.workdir, job=job) except Exception, e: pilotErrorDiag = "Put function can not be called for staging out: %s" % str(e) log("!!%s!!1105!! %s" % (env['errorLabel'], pilotErrorDiag)) ec = PilotErrors().ERR_PUTFUNCNOCALL _state = ReturnCode.Holding _msg = env['errorLabel'] else: if pilotErrorDiag != "": pilotErrorDiag = "Put error: " + pUtil.tailPilotErrorDiag(pilotErrorDiag, size=256-len("pilot: Put error: ")) ec = rc log("Put function returned code: %d" % (rc)) if rc != 0: # remove any trailing "\r" or "\n" (there can be two of them) if rs is not None: rs = rs.rstrip() log(" Error string: %s" % (rs)) # is the job recoverable? if PilotErrors().isRecoverableErrorCode(rc): _state = ReturnCode.Holding _msg = "WARNING" else: _state = ReturnCode.FailedJob
def getNodeStructure(self, job, site, workerNode, spaceReport=False, log=None):
    """
    Define the node structure expected by the server.

    Builds the dictionary describing the current job state (identity,
    pilot/batch ids, error diagnostics, error codes, CPU and timing info).

    Side effects on ``job``:
      * ``job.pilotErrorDiag`` may be replaced (when missing/empty or when
        it contains HTML) or truncated to 250 characters.
      * ``job.result[2]`` may be negated for failed jobs on sites whose name
        contains 'ANALY' when a pilot-controlled resubmission applies.

    :param job: job object; reads result, workdir, jobId, attemptNr,
        pilotErrorDiag, nEvents, exeErrorCode/exeErrorDiag, CPU and timing
        attributes.
    :param site: site object; reads sitename, dq2space and dq2spmsg.
    :param workerNode: worker node object; reads nodename and is forwarded
        to getJobMetrics().
    :param spaceReport: when true (and site.dq2space != -1) the remaining
        space and message level are added to the node.
    :param log: optional log extracts; sent as 'pilotLog' for failed and
        holding jobs, or whenever it mentions "outbound connections".
    :return: the node dictionary.
    """

    # job.result[0] is only read in this method, so it is safe to cache it
    state = job.result[0]

    node = {}
    node['node'] = workerNode.nodename
    node['workdir'] = job.workdir
    node['siteName'] = site.sitename
    node['jobId'] = job.jobId
    node['state'] = state
    node['timestamp'] = timeStamp()
    if job.attemptNr > -1:
        node['attemptNr'] = job.attemptNr

    if self.__jobSchedulerId:
        node['schedulerID'] = self.__jobSchedulerId

    if self.__pilotId:
        # report the batch system job id, if available
        batchSystemType, _id = getBatchSystemJobID()
        if batchSystemType:
            tolog("Batch system: %s" % (batchSystemType))
            tolog("Batch system job ID: %s" % (_id))
            node['pilotID'] = "%s|%s|%s|%s|%s" % (self.__pilotId, _id, batchSystemType, self.__pilot_version_tag, self.__pilot_version)
            node['batchID'] = _id
            tolog("Will send batchID: %s and pilotID: %s" % (node['batchID'], node['pilotID']))
        else:
            tolog("Batch system type was not identified (will not be reported)")
            node['pilotID'] = "%s|%s|%s" % (self.__pilotId, self.__pilot_version_tag, self.__pilot_version)
            tolog("Will send pilotID: %s" % (node['pilotID']))
        tolog("pilotId: %s" % str(self.__pilotId))

    if log and (state == 'failed' or state == 'holding' or "outbound connections" in log):
        node['pilotLog'] = log

    # build the jobMetrics
    node['jobMetrics'] = self.getJobMetrics(job, workerNode)

    # send pilotErrorDiag for finished, failed and holding jobs
    if state in ('finished', 'failed', 'holding'):
        if not job.pilotErrorDiag:
            # pilotErrorDiag is None or empty: fall back to the standard
            # message for the pilot error code (only the last characters are sent)
            job.pilotErrorDiag = self.__error.getPilotErrorDiag(job.result[2])
            node['pilotErrorDiag'] = tailPilotErrorDiag(job.pilotErrorDiag)
            tolog("Updated pilotErrorDiag from None: %s" % (job.pilotErrorDiag))
        elif "<HTML>" in job.pilotErrorDiag.upper():
            # do not forward html; use the standard message for the error code instead
            tolog("Found html in pilotErrorDiag: %s" % (job.pilotErrorDiag))
            node['pilotErrorDiag'] = self.__error.getPilotErrorDiag(job.result[2])
            job.pilotErrorDiag = node['pilotErrorDiag']
            tolog("Updated pilotErrorDiag: %s" % (job.pilotErrorDiag))
        else:
            # truncate if necessary
            if len(job.pilotErrorDiag) > 250:
                tolog("pilotErrorDiag will be truncated to size 250")
                tolog("Original pilotErrorDiag message: %s" % (job.pilotErrorDiag))
                job.pilotErrorDiag = job.pilotErrorDiag[:250]
            # set the pilotErrorDiag (passed through tailPilotErrorDiag)
            node['pilotErrorDiag'] = tailPilotErrorDiag(job.pilotErrorDiag)

        # get the number of events
        if job.nEvents != 0:
            node['nEvents'] = job.nEvents
            tolog("Total number of processed events: %d (read)" % (job.nEvents))
        else:
            tolog("runJob did not report on the total number of read events")

    if state == 'finished' or state == 'failed':
        # send transformation errors depending on what is available
        if job.exeErrorDiag != "":
            node['exeErrorCode'] = job.exeErrorCode
            node['exeErrorDiag'] = job.exeErrorDiag
        else:
            node['transExitCode'] = job.result[1]

        # flag a mismatch between the transformation error codes (when both are reported)
        if state == 'failed' and job.exeErrorCode != 0 and job.result[1] != job.exeErrorCode:
            if log:
                mismatch = "MISMATCH | Trf error code mismatch: exeErrorCode = %d, transExitCode = %d" %\
                           (job.exeErrorCode, job.result[1])
                # 'in' instead of dict.has_key(), which no longer exists in Python 3
                if 'pilotLog' in node:
                    node['pilotLog'] = mismatch + node['pilotLog']
                else:
                    tolog("!!WARNING!!1300!! Could not write mismatch error to log extracts: %s" % mismatch)

        # check if Pilot-controlled resubmission is required:
        if state == "failed" and 'ANALY' in site.sitename:
            pilotExitCode = job.result[2]
            error = PilotErrors()
            if error.isPilotResubmissionErrorCode(pilotExitCode) or job.isPilotResubmissionRequired:
                # negate PilotError, ensure it's negative
                job.result[2] = -abs(pilotExitCode)
                tolog("(Negated error code)")
            else:
                tolog("(No need to negate error code)")

        node['pilotErrorCode'] = job.result[2]
        tolog("Pilot error code: %d" % (node['pilotErrorCode']))

        # report CPUTime and CPUunit at the end of the job
        node['cpuConsumptionTime'] = job.cpuConsumptionTime
        try:
            node['cpuConsumptionUnit'] = job.cpuConsumptionUnit + "+" + getCPUmodel()
        except Exception:
            # best effort only; do not swallow SystemExit/KeyboardInterrupt
            node['cpuConsumptionUnit'] = '?'
        node['cpuConversionFactor'] = job.cpuConversionFactor

        # report specific time measures, in the order
        # getJob|stageIn|payload|stageOut|setup
        node['pilotTiming'] = "%s|%s|%s|%s|%s" % (job.timeGetJob, job.timeStageIn, job.timeExe, job.timeStageOut, job.timeSetup)

    elif state == 'holding':
        node['exeErrorCode'] = job.result[2]
        node['exeErrorDiag'] = self.__error.getPilotErrorDiag(job.result[2])

    else:
        node['cpuConsumptionUnit'] = getCPUmodel()

    if spaceReport and site.dq2space != -1:
        # only reported when the space check produced a usable value
        node['remainingSpace'] = site.dq2space
        node['messageLevel'] = site.dq2spmsg

    return node