def failJob(transExitCode, pilotExitCode, job, pilotserver, pilotport, ins=None, pilotErrorDiag=None, docleanup=True):
    """
    Set the fail code on the job, report the final state to the local
    pilot TCP server, optionally remove leftover input files, and exit.

    :param transExitCode: transformation (payload) exit code
    :param pilotExitCode: pilot error code
    :param job: job object to be flagged as failed (mutated in place)
    :param pilotserver: host of the local pilot TCP server
    :param pilotport: port of the local pilot TCP server
    :param ins: optional list of input file names to delete from job.workdir
    :param pilotErrorDiag: optional error diagnostics string stored on the job
    :param docleanup: when True (default), terminate via sysExit(job);
                      in that case this function does not return
    """
    job.setState(["failed", transExitCode, pilotExitCode])
    if pilotErrorDiag:
        job.pilotErrorDiag = pilotErrorDiag

    tolog("Will now update local pilot TCP server")
    # Return values of the server update and the file removal are not used
    # for control flow here (best-effort cleanup), so they are not kept.
    RunJobUtilities.updatePilotServer(job, pilotserver, pilotport, final=True)

    if ins:
        # If the payload left the input files behind, delete them explicitly
        pUtil.removeFiles(job.workdir, ins)

    if docleanup:
        sysExit(job)
# update the job state file job.jobState = "running" #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test") # run the job(s) ................................................................................... # Set ATLAS_CONDDB if necessary, and other env vars RunJobUtilities.setEnvVars(jobSite.sitename) # execute the payload res, job, getstatusoutput_was_interrupted, current_job_number = runJob.executePayload(thisExperiment, runCommandList, job) # if payload leaves the input files, delete them explicitly if ins: ec = pUtil.removeFiles(job.workdir, ins) # payload error handling ed = ErrorDiagnosis() if res[0] == None: job.jobState = "cancelled" job.setState(["cancelled", 0, 0]) rt = RunJobUtilities.updatePilotServer(job, runJob.getPilotServer(), runJob.getPilotPort()) else: job = ed.interpretPayload(job, res, getstatusoutput_was_interrupted, current_job_number, runCommandList, runJob.getFailureCode()) if job.result[1] != 0 or job.result[2] != 0: runJob.failJob(job.result[1], job.result[2], job, pilotErrorDiag=job.pilotErrorDiag) # stage-out ........................................................................................
# update the job state file job.jobState = "running" #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test") # run the job(s) ................................................................................... # Set ATLAS_CONDDB if necessary, and other env vars RunJobUtilities.setEnvVars(jobSite.sitename) # execute the payload res, job, getstatusoutput_was_interrupted, current_job_number = runJob.executePayload( thisExperiment, runCommandList, job) # if payload leaves the input files, delete them explicitly if ins: ec = pUtil.removeFiles(job.workdir, ins) # payload error handling ed = ErrorDiagnosis() if res[0] == None: job.jobState = "cancelled" job.setState(["cancelled", 0, 0]) rt = RunJobUtilities.updatePilotServer(job, runJob.getPilotServer(), runJob.getPilotPort()) else: job = ed.interpretPayload(job, res, getstatusoutput_was_interrupted, current_job_number, runCommandList, runJob.getFailureCode())
def finishJob(self):
    """
    Finalise an event-service job: stop the HPC manager, verify that all
    event ranges were handled, build output metadata, interpret the payload
    result, report the final state to the pilot TCP server and exit.

    Calls to self.failJob(...) are expected to terminate the pilot
    (TODO confirm failJob does not return); this method ends with
    self.sysExit(self.__job) and never returns normally.
    """
    try:
        self.__hpcManager.finishJob()
    except:
        # NOTE(review): bare except — deliberately best-effort; the exception
        # is only logged and finalisation continues
        tolog(sys.exc_info()[1])
        tolog(sys.exc_info()[2])

    # If payload leaves the input files, delete them explicitly
    # (removeFiles return code is collected but not acted upon)
    if self.__job.inFiles:
        ec = pUtil.removeFiles(self.__job.workdir, self.__job.inFiles)

    #if self.__output_es_files:
    #    ec = pUtil.removeFiles("/", self.__output_es_files)

    # default error code; early attempts are treated as recoverable
    # (the threshold 4 for attemptNr is a magic number — TODO confirm origin)
    errorCode = PilotErrors.ERR_UNKNOWN
    if self.__job.attemptNr < 4:
        errorCode = PilotErrors.ERR_ESRECOVERABLE

    #check HPC job status
    #if self.__hpcStatus:
    #    self.failJob(0, 1220, self.__job, pilotErrorDiag="HPC job failed")

    if len(self.__eventRanges) == 0:
        tolog("Cannot get event ranges")
        self.failJob(0, errorCode, self.__job, pilotErrorDiag="Cannot get event ranges")

    # check whether all event ranges are handled
    # NOTE(review): dict.values().count(...) is Python 2 only (list in py2,
    # view object in py3) — confirm the target interpreter
    tolog("Total event ranges: %s" % len(self.__eventRanges))
    not_handled_events = self.__eventRanges.values().count('new')
    tolog("Not handled events: %s" % not_handled_events)
    done_events = self.__eventRanges.values().count('Done')
    tolog("Finished events: %s" % done_events)
    stagedOut_events = self.__eventRanges.values().count('stagedOut')
    tolog("stagedOut but not updated to panda server events: %s" % stagedOut_events)
    # any progress (done or staged-out events) makes the failure recoverable
    if done_events + stagedOut_events:
        errorCode = PilotErrors.ERR_ESRECOVERABLE
    if not_handled_events + stagedOut_events:
        tolog("Not all event ranges are handled. failed job")
        self.failJob(0, errorCode, self.__job, pilotErrorDiag="Not All events are handled(total:%s, left:%s)" % (len(self.__eventRanges), not_handled_events + stagedOut_events))

    dsname, datasetDict = self.getDatasets()
    tolog("dsname = %s" % (dsname))
    tolog("datasetDict = %s" % (datasetDict))

    # Create the output file dictionary needed for generating the metadata
    ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(self.__job.outFiles, self.__job.logFile, self.__job.workdir, fullpath=True)
    if ec:
        # missing output file (only error code from prepareOutFiles)
        self.failJob(self.__job.result[1], ec, self.__job, pilotErrorDiag=pilotErrorDiag)
    tolog("outsDict: %s" % str(outsDict))

    # Create metadata for all successfully staged-out output files (include the log file as well, even if it has not been created yet)
    ec, job, outputFileInfo = self.createFileMetadata([], self.__job, outsDict, dsname, datasetDict, self.__jobSite.sitename)
    if ec:
        self.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)

    # Rename the metadata produced by the payload
    # if not pUtil.isBuildJob(outs):
    self.moveTrfMetadata(self.__job.workdir, self.__job.jobId)

    # Check the job report for any exit code that should replace the res_tuple[0]
    res0, exitAcronym, exitMsg = self.getTrfExitInfo(0, self.__job.workdir)
    # NOTE(review): exitMsg fills both the stdout and stderr slots of the
    # result tuple — presumably intentional to mimic (exitcode, out, err);
    # confirm against interpretPayload's expectations
    res = (res0, exitMsg, exitMsg)

    # Payload error handling
    ed = ErrorDiagnosis()
    job = ed.interpretPayload(self.__job, res, False, 0, self.__runCommandList, self.getFailureCode())
    if job.result[1] != 0 or job.result[2] != 0:
        self.failJob(job.result[1], job.result[2], job, pilotErrorDiag=job.pilotErrorDiag)
    self.__job = job

    job.jobState = "finished"
    job.setState([job.jobState, 0, 0])
    # NOTE(review): jobState is immediately overwritten with the result list —
    # looks inconsistent with the string assignment above; verify intent
    job.jobState = job.result
    # final update of the local pilot TCP server (rt unused)
    rt = RunJobUtilities.updatePilotServer(job, self.getPilotServer(), self.getPilotPort(), final=True)

    tolog("Done")
    self.sysExit(self.__job)
def finishJob(self):
    """
    Finalise an event-service job: stop the HPC manager, verify that all
    event ranges were handled, build output metadata, interpret the payload
    result, report the final state to the pilot TCP server and exit.

    Calls to self.failJob(...) are expected to terminate the pilot
    (TODO confirm failJob does not return); this method ends with
    self.sysExit(self.__job) and never returns normally.
    """
    try:
        self.__hpcManager.finishJob()
    except:
        # NOTE(review): bare except — deliberately best-effort; the exception
        # is only logged and finalisation continues
        tolog(sys.exc_info()[1])
        tolog(sys.exc_info()[2])

    # If payload leaves the input files, delete them explicitly
    # (removeFiles return code is collected but not acted upon)
    if self.__job.inFiles:
        ec = pUtil.removeFiles(self.__job.workdir, self.__job.inFiles)

    #if self.__output_es_files:
    #    ec = pUtil.removeFiles("/", self.__output_es_files)

    # default error code; early attempts are treated as recoverable
    # (the threshold 4 for attemptNr is a magic number — TODO confirm origin)
    errorCode = PilotErrors.ERR_UNKNOWN
    if self.__job.attemptNr < 4:
        errorCode = PilotErrors.ERR_ESRECOVERABLE

    #check HPC job status
    #if self.__hpcStatus:
    #    self.failJob(0, 1220, self.__job, pilotErrorDiag="HPC job failed")

    if len(self.__eventRanges) == 0:
        tolog("Cannot get event ranges")
        self.failJob(0, errorCode, self.__job, pilotErrorDiag="Cannot get event ranges")

    # check whether all event ranges are handled
    # NOTE(review): dict.values().count(...) is Python 2 only (list in py2,
    # view object in py3) — confirm the target interpreter
    tolog("Total event ranges: %s" % len(self.__eventRanges))
    not_handled_events = self.__eventRanges.values().count('new')
    tolog("Not handled events: %s" % not_handled_events)
    done_events = self.__eventRanges.values().count('Done')
    tolog("Finished events: %s" % done_events)
    stagedOut_events = self.__eventRanges.values().count('stagedOut')
    tolog("stagedOut but not updated to panda server events: %s" % stagedOut_events)
    # any progress (done or staged-out events) makes the failure recoverable
    if done_events + stagedOut_events:
        errorCode = PilotErrors.ERR_ESRECOVERABLE
    if not_handled_events + stagedOut_events:
        tolog("Not all event ranges are handled. failed job")
        self.failJob(0, errorCode, self.__job, pilotErrorDiag="Not All events are handled(total:%s, left:%s)" % (len(self.__eventRanges), not_handled_events + stagedOut_events))

    dsname, datasetDict = self.getDatasets()
    tolog("dsname = %s" % (dsname))
    tolog("datasetDict = %s" % (datasetDict))

    # Create the output file dictionary needed for generating the metadata
    ec, pilotErrorDiag, outs, outsDict = RunJobUtilities.prepareOutFiles(
        self.__job.outFiles, self.__job.logFile, self.__job.workdir, fullpath=True)
    if ec:
        # missing output file (only error code from prepareOutFiles)
        self.failJob(self.__job.result[1], ec, self.__job, pilotErrorDiag=pilotErrorDiag)
    tolog("outsDict: %s" % str(outsDict))

    # Create metadata for all successfully staged-out output files (include the log file as well, even if it has not been created yet)
    ec, job, outputFileInfo = self.createFileMetadata(
        [], self.__job, outsDict, dsname, datasetDict, self.__jobSite.sitename)
    if ec:
        self.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)

    # Rename the metadata produced by the payload
    # if not pUtil.isBuildJob(outs):
    self.moveTrfMetadata(self.__job.workdir, self.__job.jobId)

    # Check the job report for any exit code that should replace the res_tuple[0]
    res0, exitAcronym, exitMsg = self.getTrfExitInfo(0, self.__job.workdir)
    # NOTE(review): exitMsg fills both the stdout and stderr slots of the
    # result tuple — presumably intentional to mimic (exitcode, out, err);
    # confirm against interpretPayload's expectations
    res = (res0, exitMsg, exitMsg)

    # Payload error handling
    ed = ErrorDiagnosis()
    job = ed.interpretPayload(self.__job, res, False, 0, self.__runCommandList, self.getFailureCode())
    if job.result[1] != 0 or job.result[2] != 0:
        self.failJob(job.result[1], job.result[2], job, pilotErrorDiag=job.pilotErrorDiag)
    self.__job = job

    job.jobState = "finished"
    job.setState([job.jobState, 0, 0])
    # NOTE(review): jobState is immediately overwritten with the result list —
    # looks inconsistent with the string assignment above; verify intent
    job.jobState = job.result
    # final update of the local pilot TCP server (rt unused)
    rt = RunJobUtilities.updatePilotServer(job, self.getPilotServer(), self.getPilotPort(), final=True)

    tolog("Done")
    self.sysExit(self.__job)