def isSameType(self, trf, userflag):
    """ Tell whether the lost job is of the same type (analysis or production) as the current pilot """

    # a pilot started with userflag 'self' is handled exactly like a 'user' pilot
    if userflag == 'self':
        userflag = 'user'
    analysis_pilot = (userflag == 'user')

    # same type: analysis job on an analysis pilot, or production job on a production pilot
    if (isAnalysisJob(trf) and analysis_pilot) or \
       (not isAnalysisJob(trf) and not analysis_pilot):
        sametype = True
    else:
        sametype = False

    # one log message per (same type?, analysis pilot?) combination
    reports = {
        (True, True): "Lost job is of same type as current pilot (analysis pilot, lost analysis job trf: %s)",
        (True, False): "Lost job is of same type as current pilot (production pilot, lost production job trf: %s)",
        (False, True): "Lost job is not of same type as current pilot (analysis pilot, lost production job trf: %s)",
        (False, False): "Lost job is not of same type as current pilot (production pilot, lost analysis job trf: %s)",
    }
    tolog(reports[(sametype, analysis_pilot)] % (trf))

    return sametype
def TransferFiles(job_state, datadir, files, **kwargs):
    """
    Transfers files from list 'files'

    May change CWD with pUtil.chdir (several times)

    :param job_state: object exposing the job as `.job` and the site work directory as `.site.workdir`
    :param datadir: job data dir (made the current directory through pUtil.chdir)
    :param files: list of filenames (forwarded to setGuids)
    :param kwargs: specific arguments for other purposes; forwarded to setGuids and updateOutPFC and read
        through DorE for 'thisSite', 'proxycheckFlag', 'pilot_initdir' and 'stageoutretry'
    :return: ReturnCode.FailedJob when setGuids or updateOutPFC reports a failure;
        NOTE(review): the remaining return paths are not visible in this part of the file - confirm
    """

    job = job_state.job
    pUtil.chdir(datadir)

    # NOTE(review): XMLMetadata is not used in the visible part of this function - confirm it is needed
    XMLMetadata = pUtil.getMetadata(job_state.site.workdir, job.jobId)
    thisSite = DorE(kwargs, 'thisSite')

    # without guids the job is failed with the "lost job PFC" error code
    if not setGuids(job_state, files, **kwargs):
        job.result[2] = PilotErrors().ERR_LOSTJOBPFC
        return ReturnCode.FailedJob

    # updateOutPFC returns a false value on failure
    outPFC = updateOutPFC(job, **kwargs)
    if not outPFC:
        return ReturnCode.FailedJob

    dsname = defaultDSname(job.destinationDblock)

    datasetDict = pUtil.getDatasetDict(job.outFiles, job.destinationDblock, job.logFile, job.logDblock)
    if not datasetDict:
        log("Output files will go to default dataset: %s" % (dsname))

    # the cmtconfig is needed by at least the xrdcp site mover
    cmtconfig = pUtil.getCmtconfig(job.cmtconfig)

    tin_0 = os.times()
    rf = None
    # defaults that the exception handler below overrides
    _state = ReturnCode.OK
    _msg = ""
    ec = -1
    try:
        # Note: alt stage-out numbers are not saved in recovery mode (job object not returned from this function)
        rc, pilotErrorDiag, rf, rs, job.filesNormalStageOut, job.filesAltStageOut, os_bucket_id = Mover.mover_put_data(
            "xmlcatalog_file:%s" % outPFC, dsname, thisSite.sitename, thisSite.computingElement,
            analysisJob=pUtil.isAnalysisJob(job.trf.split(",")[0]),  # only the first trf of a comma separated list is tested
            proxycheck=DorE(kwargs, 'proxycheckFlag'),
            pinitdir=DorE(kwargs, 'pilot_initdir'),
            datasetDict=datasetDict,
            stageoutTries=DorE(kwargs, 'stageoutretry'),
            cmtconfig=cmtconfig,
            recoveryWorkDir=thisSite.workdir,
            job=job)
    except Exception, e:
        # the put function itself could not be called: log it and switch the state to Holding
        pilotErrorDiag = "Put function can not be called for staging out: %s" % str(e)
        log("!!%s!!1105!! %s" % (env['errorLabel'], pilotErrorDiag))
        ec = PilotErrors().ERR_PUTFUNCNOCALL
        _state = ReturnCode.Holding
        _msg = env['errorLabel']
def setParameters(self, *args, **kwargs):
    """ Store the optional 'job' keyword argument and the values derived from it """

    # remember the job object (None when the caller did not supply one)
    job = kwargs.get("job", None)
    self.__job = job

    # nothing more can be derived without a job object
    if not job:
        self.__warning = "setParameters found no job object"
        return

    self.__analysisJob = isAnalysisJob(job.trf)
def setParameters(self, *args, **kwargs):
    """ Store the optional 'job' keyword argument and the values derived from it """

    # remember the job object (None when the caller did not supply one)
    job = kwargs.get('job', None)
    self.__job = job

    # nothing more can be derived without a job object
    if not job:
        self.__warning = "setParameters found no job object"
        return

    self.__analysisJob = isAnalysisJob(job.trf)
def updateOutPFC(job, **kwargs):
    """
    Generate the output file catalog xml ("OutPutFileCatalog.xml") for the output files of the job

    The file is written to the work directory of the site object found under 'thisSite' (looked up with DorE).

    :param job: job object; experiment, outFiles, outFilesGuids and trf are read, result[2] is set on failure
    :param kwargs: must give access to 'thisSite' through DorE
    :return: False when pUtil.PFCxml raises an exception;
        NOTE(review): no return statement is visible for the success path - confirm against the full file
    """

    file_name = "OutPutFileCatalog.xml"
    file_path = os.path.join(DorE(kwargs, 'thisSite').workdir, file_name)

    try:
        # only the first trf of a comma separated list is used to decide whether this is an analysis job
        guids_status = pUtil.PFCxml(job.experiment, file_path, job.outFiles, fguids=job.outFilesGuids, fntag="pfn",
                                    analJob=pUtil.isAnalysisJob(job.trf.split(",")[0]), jr=True)
    except Exception, e:
        # the xml could not be generated: log it and flag the job with the "lost job xml" error code
        log("!!FAILED!!1105!! Exception caught (Could not generate xml for the remaining output files): %s" % str(e))
        job.result[2] = PilotErrors().ERR_LOSTJOBXML
        return False
def isSameType(self, trf, userflag):
    """ Tell whether the lost job is of the same type (analysis or production) as the current pilot """

    # a pilot started with userflag 'self' is handled exactly like a 'user' pilot
    if userflag == 'self':
        userflag = 'user'
    analysis_pilot = (userflag == 'user')

    # same type: analysis job on an analysis pilot, or production job on a production pilot
    if (isAnalysisJob(trf) and analysis_pilot) or \
       (not isAnalysisJob(trf) and not analysis_pilot):
        sametype = True
    else:
        sametype = False

    # one log message per (same type?, analysis pilot?) combination
    reports = {
        (True, True): "Lost job is of same type as current pilot (analysis pilot, lost analysis job trf: %s)",
        (True, False): "Lost job is of same type as current pilot (production pilot, lost production job trf: %s)",
        (False, True): "Lost job is not of same type as current pilot (analysis pilot, lost production job trf: %s)",
        (False, False): "Lost job is not of same type as current pilot (production pilot, lost analysis job trf: %s)",
    }
    tolog(reports[(sametype, analysis_pilot)] % (trf))

    return sametype
runJob.setGlobalErrorCode(error.ERR_SIGUSR1) else: runJob.setGlobalErrorCode(error.ERR_KILLSIGNAL) runJob.setFailureCode(runJob.getGlobalErrorCode) # print to stderr print >> sys.stderr, runJob.getGlobalPilotErrorDiag() raise SystemError(sig) signal.signal(signal.SIGTERM, sig2exc) signal.signal(signal.SIGQUIT, sig2exc) signal.signal(signal.SIGSEGV, sig2exc) signal.signal(signal.SIGXCPU, sig2exc) signal.signal(signal.SIGBUS, sig2exc) # see if it's an analysis job or not analysisJob = isAnalysisJob(job.trf.split(",")[0]) if analysisJob: tolog("User analysis job") else: tolog("Production job") tolog("runJob received a job with prodSourceLabel=%s" % (job.prodSourceLabel)) # setup starts here ................................................................................ # update the job state file job.jobState = "setup" #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test") # send [especially] the process group back to the pilot job.setState([job.jobState, 0, 0]) rt = RunJobUtilities.updatePilotServer(job, runJob.getPilotServer(), runJob.getPilotPort())
runJob.setGlobalErrorCode(error.ERR_SIGUSR1) else: runJob.setGlobalErrorCode(error.ERR_KILLSIGNAL) runJob.setFailureCode(runJob.getGlobalErrorCode) # print to stderr print >> sys.stderr, runJob.getGlobalPilotErrorDiag() raise SystemError(sig) signal.signal(signal.SIGTERM, sig2exc) signal.signal(signal.SIGQUIT, sig2exc) signal.signal(signal.SIGSEGV, sig2exc) signal.signal(signal.SIGXCPU, sig2exc) signal.signal(signal.SIGBUS, sig2exc) # see if it's an analysis job or not analysisJob = isAnalysisJob(job.trf.split(",")[0]) if analysisJob: tolog("User analysis job") else: tolog("Production job") tolog("runJob received a job with prodSourceLabel=%s" % (job.prodSourceLabel)) # setup starts here ................................................................................ # update the job state file job.jobState = "setup" #_retjs = JR.updateJobStateTest(job, jobSite, node, mode="test") # send [especially] the process group back to the pilot job.setState([job.jobState, 0, 0])
def getJobExecutionCommand(self, job, jobSite, pilot_initdir):
    """ Define and test the command(s) that will be used to execute the payload

    Input (method is called from RunJob*):
      job          : Job object
      jobSite      : Site object
      pilot_initdir: launch directory of pilot.py

    Return tuple:
      pilot_error_code, pilot_error_diagnostics, job_execution_command, special_setup_command, JEM, cmtconfig
    where
      pilot_error_code       : status returned by setupNordugridTrf when it is non-zero, otherwise 0
      pilot_error_diagnostics: any output from problematic command or explanatory error diagnostics
      job_execution_command  : command to execute payload ("" when the setup failed)
      special_setup_command  : any special setup command that can be inserted into job_execution_command and is
                               sent to stage-in/out methods (always "" in this method)
      JEM                    : Job Execution Monitor activation state (default value "NO", meaning JEM is not to be
                               used. See JEMstub.py)
      cmtconfig              : cmtconfig symbol from the job def or schedconfig [NOT USED IN THIS CLASS, always ""]
    """

    pilotErrorDiag = ""
    cmd = ""
    special_setup_cmd = ""
    JEM = "NO"
    cmtconfig = ""

    # Is it an analysis job or not?
    analysisJob = isAnalysisJob(job.trf)

    # Set the INDS env variable (used by runAthena)
    if analysisJob:
        self.setINDS(job.realDatasetsIn)

    # Command used to download runAthena or runGen
    wgetCommand = "wget"

    # special setup for NG
    status, pilotErrorDiag, cmd = self.setupNordugridTrf(job, analysisJob, wgetCommand, pilot_initdir)
    if status != 0:
        return status, pilotErrorDiag, "", special_setup_cmd, JEM, cmtconfig

    # add FRONTIER debugging and RUCIO env variables
    cmd = self.addEnvVars2Cmd(cmd, job.jobId, job.processingType, jobSite.sitename, analysisJob)

    if readpar("cloud") == "DE":
        # Should JEM be used?
        metaOut = {}
        try:
            from JEMstub import updateRunCommand4JEM
            # If JEM should be used, the command will get updated by the JEMstub automatically.
            cmd = updateRunCommand4JEM(cmd, job, jobSite, tolog, metaOut=metaOut)
        except Exception:
            # Best effort: on failure, cmd stays the same. Only ordinary errors are absorbed so that
            # SystemExit and KeyboardInterrupt still propagate
            tolog("Failed to update run command for JEM - will run unmonitored.")

        # Is JEM to be used?
        if "JEMactive" in metaOut:
            JEM = metaOut["JEMactive"]

        tolog("Use JEM: %s (dictionary = %s)" % (JEM, str(metaOut)))

    elif "--enable-jem" in cmd:
        tolog("!!WARNING!!1111!! JEM can currently only be used on certain sites in DE")

    # Pipe stdout/err for payload to files
    cmd += " 1>%s 2>%s" % (job.stdout, job.stderr)
    tolog("\nCommand to run the job is: \n%s" % (cmd))

    tolog("ATLAS_PYTHON_PILOT = %s" % (os.environ["ATLAS_PYTHON_PILOT"]))

    if special_setup_cmd != "":
        tolog("Special setup command: %s" % (special_setup_cmd))

    return 0, pilotErrorDiag, cmd, special_setup_cmd, JEM, cmtconfig
def getJobExecutionCommandObsolete(self, job, jobSite, pilot_initdir):
    """ Define and test the command(s) that will be used to execute the payload

    Input (method is called from RunJob*):
      job          : Job object
      jobSite      : Site object
      pilot_initdir: launch directory of pilot.py

    Return tuple:
      pilot_error_code, pilot_error_diagnostics, job_execution_command, special_setup_command, JEM, cmtconfig
    where
      pilot_error_code       : status returned by setupNordugridTrf when it is non-zero, otherwise 0
      pilot_error_diagnostics: any output from problematic command or explanatory error diagnostics
      job_execution_command  : command to execute payload ("" when the setup failed)
      special_setup_command  : any special setup command that can be inserted into job_execution_command and is
                               sent to stage-in/out methods (always "" in this method)
      JEM                    : Job Execution Monitor activation state (default value "NO", meaning JEM is not to be
                               used. See JEMstub.py)
      cmtconfig              : cmtconfig symbol from the job def or schedconfig [NOT USED IN THIS CLASS, always ""]
    """

    pilotErrorDiag = ""
    cmd = ""
    special_setup_cmd = ""
    JEM = "NO"
    cmtconfig = ""

    # Is it an analysis job or not?
    analysisJob = isAnalysisJob(job.trf)

    # Set the INDS env variable (used by runAthena)
    if analysisJob:
        self.setINDS(job.realDatasetsIn)

    # Command used to download runAthena or runGen
    wgetCommand = 'wget'

    # special setup for NG
    status, pilotErrorDiag, cmd = self.setupNordugridTrf(job, analysisJob, wgetCommand, pilot_initdir)
    if status != 0:
        return status, pilotErrorDiag, "", special_setup_cmd, JEM, cmtconfig

    # add FRONTIER debugging and RUCIO env variables
    cmd = self.addEnvVars2Cmd(cmd, job.jobId, job.taskID, job.processingType, jobSite.sitename, analysisJob)

    if readpar('cloud') == "DE":
        # Should JEM be used?
        metaOut = {}
        try:
            from JEMstub import updateRunCommand4JEM
            # If JEM should be used, the command will get updated by the JEMstub automatically.
            cmd = updateRunCommand4JEM(cmd, job, jobSite, tolog, metaOut=metaOut)
        except Exception:
            # Best effort: on failure, cmd stays the same. Only ordinary errors are absorbed so that
            # SystemExit and KeyboardInterrupt still propagate
            tolog("Failed to update run command for JEM - will run unmonitored.")

        # Is JEM to be used?
        if "JEMactive" in metaOut:
            JEM = metaOut["JEMactive"]

        tolog("Use JEM: %s (dictionary = %s)" % (JEM, str(metaOut)))

    elif '--enable-jem' in cmd:
        tolog("!!WARNING!!1111!! JEM can currently only be used on certain sites in DE")

    # Pipe stdout/err for payload to files
    cmd += " 1>%s 2>%s" % (job.stdout, job.stderr)
    tolog("\nCommand to run the job is: \n%s" % (cmd))

    tolog("ATLAS_PYTHON_PILOT = %s" % (os.environ['ATLAS_PYTHON_PILOT']))

    if special_setup_cmd != "":
        tolog("Special setup command: %s" % (special_setup_cmd))

    return 0, pilotErrorDiag, cmd, special_setup_cmd, JEM, cmtconfig
class RunJobHpcEvent(RunJob):

    # private data members
    __runjob = "RunJobHpcEvent"                 # String defining the sub class
    __instance = None                           # Holds the single instance once it has been created (see __new__)
    #__error = PilotErrors()                     # PilotErrors object

    # Required methods

    def __init__(self):
        """ Default initialization """

        # e.g. self.__errorLabel = errorLabel
        self.__output_es_files = []
        self.__eventRanges = {}
        self.__failedStageOuts = []
        self._hpcManager = None

    def __new__(cls, *args, **kwargs):
        """ Override the __new__ method to make the class a singleton """

        # NOTE(review): super(RunJob, cls) starts the lookup after RunJob, i.e. RunJob.__new__ is skipped;
        # presumably so that the instance is cached on this class only - confirm
        if not cls.__instance:
            cls.__instance = super(RunJob, cls).__new__(cls, *args, **kwargs)

        return cls.__instance

    def getRunJob(self):
        """ Return a string with the name of this RunJob sub class """

        return self.__runjob

    def getRunJobFileName(self):
        """ Return the filename of the module """

        return super(RunJobHpcEvent, self).getRunJobFileName()

    # def argumentParser(self):  <-- see example in RunJob.py

    def allowLoopingJobKiller(self):
        """ Should the pilot search for looping jobs? """

        # The pilot has the ability to monitor the payload work directory. If there are no updated files within a certain
        # time limit, the pilot will consider the job as stuck (looping) and will kill it. The looping time limits are set
        # in environment.py (see e.g. loopingLimitDefaultProd)

        return False

    def setupHPCEvent(self):
        """ Create the site, node and experiment objects used by the other methods of this class """

        self.__jobSite = Site.Site()
        self.__jobSite.setSiteInfo(self.argumentParser())
        ## For HPC job, we don't need to reassign the workdir
        # reassign workdir for this job
        self.__jobSite.workdir = self.__jobSite.wntmpdir
        if not os.path.exists(self.__jobSite.workdir):
            os.makedirs(self.__jobSite.workdir)

        tolog("runJobHPCEvent.getPilotLogFilename=%s" % self.getPilotLogFilename())
        if self.getPilotLogFilename() != "":
            pUtil.setPilotlogFilename(self.getPilotLogFilename())

        # set node info
        self.__node = Node.Node()
        self.__node.setNodeName(os.uname()[1])
        self.__node.collectWNInfo(self.__jobSite.workdir)

        # redirect stderr
        #sys.stderr = open("%s/runJobHPCEvent.stderr" % (self.__jobSite.workdir), "w")

        tolog("Current job workdir is: %s" % os.getcwd())
        tolog("Site workdir is: %s" % self.__jobSite.workdir)

        # get the experiment object
        self.__thisExperiment = getExperiment(self.getExperiment())
        tolog("runEvent will serve experiment: %s" % (self.__thisExperiment.getExperiment()))

    def getHPCEventJobFromPanda(self):
        """ Placeholder, not implemented """

        pass

    def getHPCEventJobFromEnv(self):
        """ Build the job object from the newJobDef module, report its state and prepare the run command list

        Relies on the site, node and experiment objects created by setupHPCEvent(). Failures while reading
        the job definition or during setup are reported through self.failJob().
        """

        tolog("getHPCEventJobFromEnv")

        # create the job object before the try block so that the failure handler below always has a job
        # to report on (a failing "import newJobDef" would otherwise leave 'job' unbound in the handler)
        job = Job.Job()
        try:
            # always use this filename as the new jobDef module name
            import newJobDef

            job.setJobDef(newJobDef.job)
            job.coreCount = 0
            job.workdir = self.__jobSite.workdir
            job.experiment = self.getExperiment()
            # figure out and set payload file names
            job.setPayloadName(self.__thisExperiment.getPayloadName(job))
            # reset the default job output file list which is anyway not correct
            job.outFiles = []
        except Exception as e:
            pilotErrorDiag = "Failed to process job info: %s" % str(e)
            tolog("!!WARNING!!3000!! %s" % (pilotErrorDiag))
            self.failJob(0, PilotErrors.ERR_UNKNOWN, job, pilotErrorDiag=pilotErrorDiag)

        self.__job = job

        # prepare for the output file data directory
        # (will only created for jobs that end up in a 'holding' state)
        self.__job.datadir = self.getParentWorkDir() + "/PandaJob_%s_data" % (job.jobId)

        # See if it's an analysis job or not (only the first trf of a comma separated list is tested)
        trf = self.__job.trf
        self.__analysisJob = isAnalysisJob(trf.split(",")[0])

        # Setup starts here ................................................................................

        # Update the job state file
        self.__job.jobState = "starting"
        self.__job.setHpcStatus('init')

        # Send [especially] the process group back to the pilot
        self.__job.setState([self.__job.jobState, 0, 0])
        self.__job.jobState = self.__job.result
        # use this object's own port getter, like the server getter, instead of a module level 'runJob' name
        RunJobUtilities.updatePilotServer(job, self.getPilotServer(), self.getPilotPort())

        self.__JR = JobRecovery(pshttpurl='https://pandaserver.cern.ch', pilot_initdir=self.__job.workdir)
        self.__JR.updateJobStateTest(self.__job, self.__jobSite, self.__node, mode="test")
        self.__JR.updatePandaServer(self.__job, self.__jobSite, self.__node, 25443)

        # prepare the setup and get the run command list
        ec, runCommandList, job, multi_trf = self.setup(self.__job, self.__jobSite, self.__thisExperiment)
        if ec != 0:
            tolog("!!WARNING!!2999!! runJob setup failed: %s" % (job.pilotErrorDiag))
            self.failJob(0, ec, job, pilotErrorDiag=job.pilotErrorDiag)
        tolog("Setup has finished successfully")

        self.__job = job
        self.__runCommandList = runCommandList
        self.__multi_trf = multi_trf

        # job has been updated, display it again
        self.__job.displayJob()
        tolog("RunCommandList: %s" % self.__runCommandList)
        tolog("Multi_trf: %s" % self.__multi_trf)