def _parseConfigTemplate(self, templatePath, cfg=None):
    """Load one system's ConfigTemplate.cfg and merge it into ``cfg``.

    The system name is the last component of ``templatePath``; a trailing
    "System" (compared case-insensitively) is cut off.  The template content
    is placed under a ``/<system>`` section before merging.

    :param str templatePath: path to the folder containing a ConfigTemplate.cfg file
    :param CFG cfg: cfg to merge with the systems config, a new CFG is used when None
    :returns: S_OK(CFG); the cfg is returned untouched when ``self.systems`` is
              non-empty and does not contain the system.  S_ERROR when the
              ConfigTemplate.cfg file does not exist.
    """
    if cfg is None:
        cfg = CFG()

    folderName = os.path.split(templatePath.rstrip("/"))[1]
    if folderName.lower().endswith('system'):
        system = folderName[:-len('System')]
    else:
        system = folderName

    # Systems that were not asked for are skipped without touching cfg
    if self.systems and system not in self.systems:
        return S_OK(cfg)

    templateFile = os.path.join(templatePath, 'ConfigTemplate.cfg')
    if not os.path.exists(templateFile):
        return S_ERROR("File not found: %s" % templateFile)

    templateCfg = CFG()
    templateCfg.loadFromFile(templateFile)
    wrappedCfg = CFG()
    wrappedCfg.createNewSection("/%s" % system, contents=templateCfg)
    return S_OK(cfg.mergeWith(wrappedCfg))
def _getCurrentConfig(self):
    """Collect the Agents/Services/Executors sections of the running configuration.

    The current setup is read from ``/DIRAC/Setup`` and must be one of the
    sections of ``/DIRAC/Setups``.  For every system listed in the options of
    that setup (restricted to ``self.systems`` when it is non-empty) the remote
    ``/Systems/<system>/<systemSetup>`` CFG is fetched, all of its sections
    other than Agents, Services and Executors are deleted, and the remainder
    is stored under ``/<system>``.

    :returns: S_OK(CFG) or S_ERROR if the setup information cannot be read
    """
    from DIRAC.ConfigurationSystem.Client.ConfigurationData import gConfigurationData

    gConfig.forceRefresh()
    fullCfg = CFG()
    setup = gConfig.getValue('/DIRAC/Setup', '')

    sectionsRes = gConfig.getSections('/DIRAC/Setups', [])
    if not sectionsRes['OK']:
        return S_ERROR('Could not get /DIRAC/Setups sections')
    knownSetups = sectionsRes['Value']
    if setup not in knownSetups:
        return S_ERROR('Setup %s is not in allowed list: %s' % (setup, ', '.join(knownSetups)))

    optionsRes = gConfig.getOptionsDict('/DIRAC/Setups/%s' % setup)
    if not optionsRes['OK']:
        return S_ERROR('Could not get /DIRAC/Setups/%s options' % setup)

    keptSections = ('Agents', 'Services', 'Executors')
    for system, systemSetup in optionsRes['Value'].items():
        if self.systems and system not in self.systems:
            continue
        systemCfg = gConfigurationData.remoteCFG.getAsCFG("/Systems/%s/%s" % (system, systemSetup))
        for section in systemCfg.listSections():
            if section in keptSections:
                continue
            systemCfg.deleteKey(section)
        fullCfg.createNewSection("/%s" % system, contents=systemCfg)

    return S_OK(fullCfg)
def getSystemsCFG(self):
    """Collate every ConfigTemplate found by ``self.findConfigTemplates`` into one CFG.

    Starts from a CFG holding an empty ``/Systems`` section and feeds it through
    ``self.parseConfigTemplate`` for each template location.  Locations whose
    parsing does not return OK are ignored.

    :returns: CFG object
    """
    cfg = CFG()
    cfg.createNewSection("/Systems")
    for templatePath in self.findConfigTemplates():
        parsed = self.parseConfigTemplate(templatePath, cfg)
        if not parsed["OK"]:
            # best effort: keep what has been collected so far
            continue
        cfg = parsed["Value"]
    return cfg
def _updateConfiguration(self, key, value, path="/LocalSite"):
    """Set ``<path>/<key>`` to ``value`` in the local configuration file.

    The file is ``./<self.extraOptions>`` when ``self.extraOptions`` is set,
    ``<rootPath>/etc/dirac.cfg`` otherwise.  Every section along ``path`` that
    does not exist yet is created, then the file is written back in place.
    This is the configuration used by submitted job wrappers.

    :param str key: option name
    :param value: option value
    :param str path: section path, starting with "/"
    """
    localCfg = CFG()
    localConfigFile = (
        os.path.join(".", self.extraOptions)
        if self.extraOptions
        else os.path.join(rootPath, "etc", "dirac.cfg")
    )
    localCfg.loadFromFile(localConfigFile)

    # Walk down the path, creating the missing levels one by one
    section = "/"
    for component in path.split("/")[1:]:
        section = os.path.join(section, component)
        if localCfg.isSection(section):
            continue
        localCfg.createNewSection(section)

    localCfg.setOption("%s/%s" % (section, key), value)
    localCfg.writeToFile(localConfigFile)
def getComputingElementDefaults(ceName="", ceType="", cfg=None, currentSectionPath=""):
    """
    Return cfgDefaults with defaults for the given CEs defined either in arguments or in the provided cfg

    :param str ceName: CE section to create/overwrite from the command line arguments
    :param str ceType: value stored as ``CEType`` of ``ceName``
    :param cfg: file loaded into the CFG; any exception while loading or
                descending into the install section makes the function return an empty CFG
    :param str currentSectionPath: path handed to ``__getExtraOptions`` to collect
                                   extra options for ``ceName``
    :returns: CFG with one section per CE
    """
    cesCfg = CFG()
    if cfg:
        try:
            cesCfg.loadFromFile(cfg)
            cesPath = cfgInstallPath("ComputingElements")
            if cesCfg.isSection(cesPath):
                for section in cfgPathToList(cesPath):
                    cesCfg = cesCfg[section]
        except Exception:
            return CFG()

    # Overwrite the cfg with Command line arguments
    if ceName:
        if not cesCfg.isSection(ceName):
            cesCfg.createNewSection(ceName)
        if currentSectionPath:
            # Add Options from Command Line
            for name, value in __getExtraOptions(currentSectionPath).items():
                cesCfg[ceName].setOption(name, value)  # pylint: disable=no-member
        if ceType:
            cesCfg[ceName].setOption("CEType", ceType)  # pylint: disable=no-member

    # Load the defaults per CE type from the central configuration, if defined
    ceDefaults = __gConfigDefaults(cfgPath(defaultSection("ComputingElements")))

    for sectionName in cesCfg.listSections():
        ceSection = cesCfg[sectionName]
        if "CEType" not in ceSection:
            continue
        sectionType = ceSection["CEType"]
        if sectionType not in ceDefaults:
            continue
        typeDefaults = ceDefaults[sectionType]
        # Only fill in what the CE section does not define itself
        for option in typeDefaults.listOptions():  # pylint: disable=no-member
            if option in ceSection:
                continue
            ceSection.setOption(option, typeDefaults[option])  # pylint: disable=unsubscriptable-object

    return cesCfg
def __gConfigDefaults(defaultPath):
    """
    Build a cfg from a Default Section

    One section is created per sub-section of ``defaultPath``; it receives the
    options of that sub-section when they can be read.

    :param str defaultPath: configuration path of the defaults section
    :returns: CFG, empty if the sections of ``defaultPath`` cannot be listed
    """
    from DIRAC import gConfig

    cfgDefaults = CFG()
    sectionsRes = gConfig.getSections(defaultPath)
    if not sectionsRes['OK']:
        return cfgDefaults

    for name in sectionsRes['Value']:
        typePath = cfgPath(defaultPath, name)
        cfgDefaults.createNewSection(name)
        optionsRes = gConfig.getOptionsDict(typePath)
        if not optionsRes['OK']:
            # the section stays in the result, just without options
            continue
        for option, value in optionsRes['Value'].items():
            cfgDefaults[name].setOption(option, value)

    return cfgDefaults
class JobRepository(object):
    """Job bookkeeping stored in a CFG file, one ``Jobs/<jobID>`` section per job."""

    def __init__(self, repository=None):
        """Load (or create) the repository and write it back to disk.

        :param str repository: repository file; defaults to ``.dirac.repo.rep`` in
                               ``$HOME`` or, when HOME is not set, in the current directory
        """
        self.location = repository
        if not self.location:
            if "HOME" in os.environ:
                self.location = '%s/.dirac.repo.rep' % os.environ['HOME']
            else:
                self.location = '%s/.dirac.repo.rep' % os.getcwd()
        self.repo = CFG()
        if os.path.exists(self.location):
            self.repo.loadFromFile(self.location)
            if not self.repo.existsKey('Jobs'):
                self.repo.createNewSection('Jobs')
        else:
            self.repo.createNewSection('Jobs')
        # OK reflects whether the initial write to self.location succeeded
        self.OK = bool(self._writeRepository(self.location))

    def isOK(self):
        """Return True if the repository could be written at construction time."""
        return self.OK

    def readRepository(self):
        """Return S_OK with the ``Jobs`` section as a dictionary."""
        return S_OK(self.repo.getAsDict('Jobs'))

    def writeRepository(self, alternativePath=None):
        """Write the repository to ``alternativePath`` or, by default, to its own location.

        :returns: S_OK(destination) or S_ERROR
        """
        destination = alternativePath if alternativePath else self.location
        if not self._writeRepository(destination):
            return S_ERROR("Failed to write repository")
        return S_OK(destination)

    def resetRepository(self, jobIDs=None):
        """Put jobs back to State=Submitted, Retrieved=0, OutputData=0.

        :param list jobIDs: jobs to reset; all jobs of the repository when empty or None
        :returns: S_OK()
        """
        if not jobIDs:
            jobIDs = list(self.readRepository()['Value'])
        paramDict = {'State': 'Submitted', 'Retrieved': 0, 'OutputData': 0}
        for jobID in jobIDs:
            self._writeJob(jobID, paramDict, True)
        self._writeRepository(self.location)
        return S_OK()

    def _writeRepository(self, path):
        """Write the CFG to a temporary file and move it to ``path``.

        :returns: True on success; a false value if the temporary file could not be
                  written or moved.  When the move fails the temporary file is kept
                  as a backup and its name is logged.
        """
        handle, tmpName = tempfile.mkstemp()
        # The CFG is written by name, so the descriptor is released right away
        # and cannot leak if writeToFile raises.
        os.close(handle)
        written = self.repo.writeToFile(tmpName)
        if not written:
            if os.path.exists(tmpName):
                os.remove(tmpName)
            return written
        if os.path.exists(path):
            gLogger.debug("Replacing %s" % path)
        try:
            shutil.move(tmpName, path)
            return True
        except Exception as x:
            gLogger.error("Failed to overwrite repository.", x)
            gLogger.info("If your repository is corrupted a backup can be found %s" % tmpName)
            return False

    def appendToRepository(self, repoLocation):
        """Merge this repository into the one stored at ``repoLocation`` and save the result.

        :returns: S_OK() or S_ERROR if ``repoLocation`` does not exist
        """
        if not os.path.exists(repoLocation):
            gLogger.error("Secondary repository does not exist", repoLocation)
            return S_ERROR("Secondary repository does not exist")
        self.repo = CFG().loadFromFile(repoLocation).mergeWith(self.repo)
        self._writeRepository(self.location)
        return S_OK()

    def addJob(self, jobID, state='Submitted', retrieved=0, outputData=0, update=False):
        """Record a job with the current time stamp and save the repository.

        :param bool update: allow overwriting the options of an already known job
        :returns: S_OK(jobID)
        """
        paramDict = {
            'State': state,
            'Time': self._getTime(),
            'Retrieved': int(retrieved),
            'OutputData': outputData,
        }
        self._writeJob(jobID, paramDict, update)
        self._writeRepository(self.location)
        return S_OK(jobID)

    def updateJob(self, jobID, paramDict):
        """Update the options of a known job (``Time`` is refreshed) and save; unknown jobs are ignored."""
        if self._existsJob(jobID):
            paramDict['Time'] = self._getTime()
            self._writeJob(jobID, paramDict, True)
            self._writeRepository(self.location)
        return S_OK()

    def updateJobs(self, jobDict):
        """Update several known jobs from ``{jobID: paramDict}`` and save once at the end."""
        for jobID, paramDict in jobDict.items():
            if self._existsJob(jobID):
                paramDict['Time'] = self._getTime()
                self._writeJob(jobID, paramDict, True)
        self._writeRepository(self.location)
        return S_OK()

    def _getTime(self):
        """Return ``time.ctime()`` with the spaces replaced by underscores."""
        return time.ctime().replace(" ", "_")

    def _writeJob(self, jobID, paramDict, update):
        """Set the options of ``Jobs/<jobID>`` in memory, creating the section if needed.

        :returns: S_OK() or S_ERROR when the job exists and ``update`` is false
        """
        jobID = str(jobID)
        jobExists = self._existsJob(jobID)
        if jobExists and not update:
            gLogger.warn("Job exists and not overwriting")
            return S_ERROR("Job exists and not overwriting")
        if not jobExists:
            self.repo.createNewSection('Jobs/%s' % jobID)
        for key, value in paramDict.items():
            self.repo.setOption('Jobs/%s/%s' % (jobID, key), value)
        return S_OK()

    def removeJob(self, jobID):
        """Delete a job section; the repository is saved only if something was deleted."""
        res = self.repo['Jobs'].deleteKey(str(jobID))  # pylint: disable=no-member
        if res:
            self._writeRepository(self.location)
        return S_OK()

    def existsJob(self, jobID):
        """Return S_OK(bool) telling whether the job is in the repository."""
        return S_OK(self._existsJob(jobID))

    def _existsJob(self, jobID):
        """Return whether ``Jobs/<jobID>`` is a section of the repository."""
        return self.repo.isSection('Jobs/%s' % jobID)

    def getLocation(self):
        """Return S_OK with the repository file location."""
        return S_OK(self.location)

    def getSize(self):
        """Return S_OK with the number of entries of the ``Jobs`` section."""
        return S_OK(len(self.repo.getAsDict('Jobs')))
class JobManifest(object):
    """Job description held as a CFG, loadable from JDL or CFG text, with a dirty flag."""

    def __init__(self, manifest=""):
        """Create the manifest, loading ``manifest`` when it is not empty.

        :param str manifest: JDL or CFG text
        :raises Exception: with the load error message when loading fails
        """
        self.__manifest = CFG()
        self.__dirty = False
        # Operations helper, created on first use by __getCSValue
        self.__ops = False
        if manifest:
            result = self.load(manifest)
            if not result["OK"]:
                raise Exception(result["Message"])

    def isDirty(self):
        """Return the dirty flag."""
        return self.__dirty

    def setDirty(self):
        """Raise the dirty flag."""
        self.__dirty = True

    def clearDirty(self):
        """Lower the dirty flag."""
        self.__dirty = False

    @staticmethod
    def _looksLikeJDL(dataString):
        """Return True if the (already stripped) text is enclosed in ``[ ... ]``.

        Slices are used instead of indexes so that an empty text is simply not JDL.
        """
        return dataString[:1] == "[" and dataString[-1:] == "]"

    def load(self, dataString):
        """
        Auto discover format type based on [ .. ] of JDL

        Text that is empty after stripping is handed to :meth:`loadCFG`.
        """
        dataString = dataString.strip()
        if self._looksLikeJDL(dataString):
            return self.loadJDL(dataString)
        return self.loadCFG(dataString)

    def loadJDL(self, jdlString):
        """
        Load job manifest from JDL format

        On failure the manifest is reset to an empty CFG and the error is returned.
        """
        result = loadJDLAsCFG(jdlString.strip())
        if not result["OK"]:
            self.__manifest = CFG()
            return result
        self.__manifest = result["Value"][0]
        return S_OK()

    def loadCFG(self, cfgString):
        """
        Load job manifest from CFG format
        """
        try:
            self.__manifest.loadFromBuffer(cfgString)
        except Exception as e:
            return S_ERROR("Can't load manifest from cfg: %s" % str(e))
        return S_OK()

    def dumpAsCFG(self):
        """Return the manifest as CFG text."""
        return str(self.__manifest)

    def getAsCFG(self):
        """Return a clone of the manifest CFG."""
        return self.__manifest.clone()

    def dumpAsJDL(self):
        """Return the manifest dumped with ``dumpCFGAsJDL``."""
        return dumpCFGAsJDL(self.__manifest)

    def __getCSValue(self, varName, defaultVal=None):
        """Read a value through Operations for the manifest's OwnerGroup and DIRACSetup.

        Names not starting with "/" are looked up under ``JobDescription/``.
        """
        if not self.__ops:
            self.__ops = Operations(group=self.__manifest["OwnerGroup"], setup=self.__manifest["DIRACSetup"])
        if varName[0] != "/":
            varName = "JobDescription/%s" % varName
        return self.__ops.getValue(varName, defaultVal)

    def __checkNumericalVar(self, varName, defaultVal, minVal, maxVal):
        """
        Check a numerical var

        The value (from the manifest, or the ``Default<varName>`` setting) is
        converted to int and clamped to [Min<varName>, Max<varName>]; the manifest
        is updated when the resulting value differs from the stored one.

        :returns: S_OK(value) or S_ERROR if the value is not a number
        """
        initialVal = False
        if varName not in self.__manifest:
            varValue = self.__getCSValue("Default%s" % varName, defaultVal)
        else:
            varValue = self.__manifest[varName]
            initialVal = varValue
        try:
            varValue = int(varValue)
        except (ValueError, TypeError):
            # TypeError covers values such as None that int() rejects outright
            return S_ERROR("%s must be a number" % varName)
        minVal = self.__getCSValue("Min%s" % varName, minVal)
        maxVal = self.__getCSValue("Max%s" % varName, maxVal)
        varValue = max(minVal, min(varValue, maxVal))
        if initialVal != varValue:
            self.__manifest.setOption(varName, varValue)
        return S_OK(varValue)

    def __checkChoiceVar(self, varName, defaultVal, choices):
        """
        Check a choice var

        :returns: S_OK(value) or S_ERROR if the value is not among ``Choices<varName>``
        """
        initialVal = False
        if varName not in self.__manifest:
            varValue = self.__getCSValue("Default%s" % varName, defaultVal)
        else:
            varValue = self.__manifest[varName]
            initialVal = varValue
        if varValue not in self.__getCSValue("Choices%s" % varName, choices):
            return S_ERROR("%s is not a valid value for %s" % (varValue, varName))
        if initialVal != varValue:
            self.__manifest.setOption(varName, varValue)
        return S_OK(varValue)

    def __checkMultiChoice(self, varName, choices):
        """
        Check a multi choice var

        :returns: S_OK() when the var is not in the manifest, S_OK(value) when every
                  item is among ``Choices<varName>``, S_ERROR otherwise
        """
        if varName not in self.__manifest:
            return S_OK()
        varValue = self.__manifest[varName]
        choices = self.__getCSValue("Choices%s" % varName, choices)
        for v in List.fromChar(varValue):
            if v not in choices:
                return S_ERROR("%s is not a valid value for %s" % (v, varName))
        # The value is never modified here, so there is nothing to write back
        return S_OK(varValue)

    def __checkMaxInputData(self, maxNumber):
        """
        Check Maximum Number of Input Data files allowed
        """
        varName = "InputData"
        if varName not in self.__manifest:
            return S_OK()
        nFiles = len(List.fromChar(self.__manifest[varName]))
        if nFiles > maxNumber:
            return S_ERROR(
                "Number of Input Data Files (%s) greater than current limit: %s" % (nFiles, maxNumber))
        return S_OK()

    def __contains__(self, key):
        """Check if the manifest has the required key"""
        return key in self.__manifest

    def setOptionsFromDict(self, varDict):
        """Set every entry of ``varDict`` with :meth:`setOption`, in sorted key order."""
        for k in sorted(varDict):
            self.setOption(k, varDict[k])

    def check(self):
        """
        Check that the manifest is OK

        Owner information must be present; CPUTime, Priority, the number of input
        data files and JobType are validated.

        :returns: S_OK() or the first S_ERROR found
        """
        for k in ["OwnerName", "OwnerDN", "OwnerGroup", "DIRACSetup"]:
            if k not in self.__manifest:
                return S_ERROR("Missing var %s in manifest" % k)

        # Check CPUTime
        result = self.__checkNumericalVar("CPUTime", 86400, 100, 500000)
        if not result["OK"]:
            return result

        result = self.__checkNumericalVar("Priority", 1, 0, 10)
        if not result["OK"]:
            return result

        maxInputData = Operations().getValue("JobDescription/MaxInputData", 500)
        result = self.__checkMaxInputData(maxInputData)
        if not result["OK"]:
            return result

        operation = Operations(group=self.__manifest["OwnerGroup"])
        allowedJobTypes = operation.getValue("JobDescription/AllowedJobTypes", ["User", "Test", "Hospital"])
        transformationTypes = operation.getValue("Transformations/DataProcessing", [])
        result = self.__checkMultiChoice("JobType", allowedJobTypes + transformationTypes)
        if not result["OK"]:
            return result
        return S_OK()

    def createSection(self, secName, contents=False):
        """Create a new section, optionally filled with a CFG.

        :returns: S_OK(result of createNewSection) or S_ERROR if the section exists
                  or ``contents`` is not a CFG
        """
        if secName not in self.__manifest:
            if contents and not isinstance(contents, CFG):
                return S_ERROR("Contents for section %s is not a cfg object" % secName)
            self.__dirty = True
            return S_OK(self.__manifest.createNewSection(secName, contents=contents))
        return S_ERROR("Section %s already exists" % secName)

    def getSection(self, secName):
        """Return S_OK(section); the manifest is flagged dirty even when the lookup fails."""
        self.__dirty = True
        if secName not in self.__manifest:
            return S_ERROR("%s does not exist" % secName)
        sec = self.__manifest[secName]
        if not sec:
            return S_ERROR("%s section empty" % secName)
        return S_OK(sec)

    def setSectionContents(self, secName, contents):
        """Replace the contents of a section, creating it when missing.

        :returns: S_ERROR if ``contents`` is not a CFG; nothing (None) on success
        """
        if contents and not isinstance(contents, CFG):
            return S_ERROR("Contents for section %s is not a cfg object" % secName)
        self.__dirty = True
        if secName in self.__manifest:
            self.__manifest[secName].reset()
            self.__manifest[secName].mergeWith(contents)
        else:
            self.__manifest.createNewSection(secName, contents=contents)

    def setOption(self, varName, varValue):
        """
        Set a var in job manifest

        ``varName`` may be a "/" separated path; missing sections are created.
        """
        self.__dirty = True
        levels = List.fromChar(varName, "/")
        cfg = self.__manifest
        for level in levels[:-1]:
            if level not in cfg:
                cfg.createNewSection(level)
            cfg = cfg[level]
        cfg.setOption(levels[-1], varValue)

    def remove(self, opName):
        """Delete the key at the "/" separated path ``opName``.

        :returns: S_OK() or S_ERROR if the path does not exist
        """
        levels = List.fromChar(opName, "/")
        cfg = self.__manifest
        for level in levels[:-1]:
            if level not in cfg:
                return S_ERROR("%s does not exist" % opName)
            cfg = cfg[level]
        if cfg.deleteKey(levels[-1]):
            self.__dirty = True
            return S_OK()
        return S_ERROR("%s does not exist" % opName)

    def getOption(self, varName, defaultValue=None):
        """
        Get a variable from the job manifest
        """
        return self.__manifest.getOption(varName, defaultValue)

    def getOptionList(self, section=""):
        """
        Get a list of variables in a section of the job manifest
        """
        cfg = self.__manifest.getRecursive(section)
        if not cfg or "value" not in cfg:
            return []
        return cfg["value"].listOptions()

    def isOption(self, opName):
        """
        Check if it is a valid option
        """
        return self.__manifest.isOption(opName)

    def getSectionList(self, section=""):
        """
        Get a list of sections in the job manifest
        """
        cfg = self.__manifest.getRecursive(section)
        if not cfg or "value" not in cfg:
            return []
        return cfg["value"].listSections()
localConfigFile = os.path.expandvars( "$WORKSPACE") + "/PilotInstallDIR/etc/dirac.cfg" elif os.path.isfile( os.path.expandvars("$WORKSPACE") + "/ServerInstallDIR/etc/dirac.cfg"): localConfigFile = os.path.expandvars( "$WORKSPACE") + "/ServerInstallDIR/etc/dirac.cfg" elif os.path.isfile("./etc/dirac.cfg"): localConfigFile = "./etc/dirac.cfg" else: print("Local CFG file not found") exit(2) localCfg.loadFromFile(localConfigFile) if not localCfg.isSection("/LocalSite"): localCfg.createNewSection("/LocalSite") localCfg.setOption("/LocalSite/CPUTimeLeft", 5000) localCfg.setOption("/DIRAC/Security/UseServerCertificate", False) if not sMod: if not setup: setup = gConfig.getValue("/DIRAC/Setup") if not setup: setup = "dirac-JenkinsSetup" if not localCfg.isSection("/Operations"): localCfg.createNewSection("/Operations") if not localCfg.isSection("/Operations/%s" % setup): localCfg.createNewSection("/Operations/%s" % setup) localCfg.setOption("/Operations/%s/SoftwareDistModule" % setup, "")
localConfigFile = os.path.expandvars( '$WORKSPACE') + '/PilotInstallDIR/etc/dirac.cfg' elif os.path.isfile( os.path.expandvars('$WORKSPACE') + '/ServerInstallDIR/etc/dirac.cfg'): localConfigFile = os.path.expandvars( '$WORKSPACE') + '/ServerInstallDIR/etc/dirac.cfg' elif os.path.isfile('./etc/dirac.cfg'): localConfigFile = './etc/dirac.cfg' else: print("Local CFG file not found") exit(2) localCfg.loadFromFile(localConfigFile) if not localCfg.isSection('/LocalSite'): localCfg.createNewSection('/LocalSite') localCfg.setOption('/LocalSite/CPUTimeLeft', 5000) localCfg.setOption('/DIRAC/Security/UseServerCertificate', False) if not sMod: if not setup: setup = gConfig.getValue('/DIRAC/Setup') if not setup: setup = 'dirac-JenkinsSetup' if not vo: vo = gConfig.getValue('/DIRAC/VirtualOrganization') if not vo: vo = 'dirac' if not localCfg.isSection('/DIRAC/VOPolicy'): localCfg.createNewSection('/DIRAC/VOPolicy')
def execute(self):
    """The JobAgent execution method.

    One cycle does the following, in order:

    * stop if the drain file ``/var/lib/dirac_drain`` exists or the CE reports no availability;
    * once at least one job has been picked up (``self.jobCount``), recompute the
      CPU work left (filling mode only), publish it to the CE and to the local
      configuration file;
    * ask the Matcher for a job, trying each CE description in turn until one matches;
    * validate the Matcher answer and the JDL parameters;
    * set up the proxy, check/install the software and submit the job;
    * commit the job report, falling back to a failover request when the commit fails.

    :returns: S_OK when the cycle completes or no job is matched, S_ERROR on a
              pilot version mismatch, the failed ``getDescription`` result, or
              whatever ``self.__finish`` / ``self._rescheduleFailedJob`` return
    """
    # Temporary mechanism to pass a shutdown message to the agent
    if os.path.exists('/var/lib/dirac_drain'):
        return self.__finish('Node is being drained by an operator')

    # Check if we can match jobs at all
    self.log.verbose('Job Agent execution loop')
    result = self.computingElement.available()
    if not result['OK']:
        self.log.info('Resource is not available', result['Message'])
        return self.__finish('CE Not Available')

    ceInfoDict = result['CEInfoDict']
    runningJobs = ceInfoDict.get("RunningJobs")
    availableSlots = result['Value']

    if not availableSlots:
        if runningJobs:
            # Jobs are still running: end this cycle but keep the agent alive
            self.log.info('No available slots', ': %d running jobs' % runningJobs)
            return S_OK('Job Agent cycle complete with %d running jobs' % runningJobs)
        self.log.info(
            'CE is not available (and there are no running jobs)')
        return self.__finish('CE Not Available')

    if self.jobCount:
        # Only call timeLeft utility after a job has been picked up
        self.log.info('Attempting to check CPU time left for filling mode')
        if self.fillingMode:
            self.timeLeft = self.computeCPUWorkLeft()
            self.log.info('normalized CPU units remaining in slot', self.timeLeft)
            if self.timeLeft <= self.minimumTimeLeft:
                return self.__finish('No more time left')
            # Need to update the Configuration so that the new value is published in the next matching request
            result = self.computingElement.setCPUTimeLeft(
                cpuTimeLeft=self.timeLeft)
            if not result['OK']:
                return self.__finish(result['Message'])

            # Update local configuration to be used by submitted job wrappers
            localCfg = CFG()
            if self.extraOptions:
                localConfigFile = os.path.join('.', self.extraOptions)
            else:
                localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
            localCfg.loadFromFile(localConfigFile)
            if not localCfg.isSection('/LocalSite'):
                localCfg.createNewSection('/LocalSite')
            localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
            localCfg.writeToFile(localConfigFile)

        else:
            return self.__finish('Filling Mode is Disabled')

    # if we are here we assume that a job can be matched
    result = self.computingElement.getDescription()
    if not result['OK']:
        return result

    # We can have several prioritized job retrieval strategies
    # NOTE(review): ceDictList stays unbound if the value is neither a dict nor a
    # list, and jobRequest stays unbound if the list is empty - confirm the CE
    # always returns one of the two, non-empty.
    if isinstance(result['Value'], dict):
        ceDictList = [result['Value']]
    elif isinstance(result['Value'], list):
        # This is the case for Pool ComputingElement, and parameter 'MultiProcessorStrategy'
        ceDictList = result['Value']

    for ceDict in ceDictList:

        # Add pilot information
        gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
        if gridCE != 'Unknown':
            ceDict['GridCE'] = gridCE
        if 'PilotReference' not in ceDict:
            ceDict['PilotReference'] = str(self.pilotReference)
        ceDict['PilotBenchmark'] = self.cpuFactor
        ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict('/AgentJobRequirements')
        if result['OK']:
            requirementsDict = result['Value']
            ceDict.update(requirementsDict)
            self.log.info('Requirements:', requirementsDict)

        self.log.verbose('CE dict', ceDict)

        # here finally calling the matcher
        start = time.time()
        jobRequest = MatcherClient().requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info('MatcherTime', '= %.2f (s)' % (matchTime))
        # The first description that gets a job wins; ceDict keeps that description
        # (or the last one tried) and is reused further down.
        if jobRequest['OK']:
            break

    self.stopAfterFailedMatches = self.am_getOption(
        'StopAfterFailedMatches', self.stopAfterFailedMatches)

    if not jobRequest['OK']:

        # if we don't match a job, independently from the reason,
        # we wait a bit longer before trying again
        self.am_setOption("PollingTime",
                          int(self.am_getOption("PollingTime") * 1.5))

        if re.search('No match found', jobRequest['Message']):
            self.log.notice('Job request OK, but no match found',
                            ': %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish(
                    'Nothing to do for more than %d cycles' %
                    self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find("seconds timeout") != -1:
            self.log.error('Timeout while requesting job',
                           jobRequest['Message'])
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish(
                    'Nothing to do for more than %d cycles' %
                    self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])
        elif jobRequest['Message'].find(
                "Pilot version does not match") != -1:
            # The only failure that is reported as an error and does not count
            # towards matchFailedCount
            errorMsg = 'Pilot version does not match the production version'
            self.log.error(errorMsg,
                           jobRequest['Message'].replace(errorMsg, ''))
            return S_ERROR(jobRequest['Message'])
        else:
            self.log.notice('Failed to get jobs',
                            ': %s' % (jobRequest['Message']))
            self.matchFailedCount += 1
            if self.matchFailedCount > self.stopAfterFailedMatches:
                return self.__finish(
                    'Nothing to do for more than %d cycles' %
                    self.stopAfterFailedMatches)
            return S_OK(jobRequest['Message'])

    # Reset the Counter
    self.matchFailedCount = 0

    # If we are here it is because we matched a job
    matcherInfo = jobRequest['Value']
    if not self.pilotInfoReportedFlag:
        # Check the flag after the first access to the Matcher
        self.pilotInfoReportedFlag = matcherInfo.get(
            'PilotInfoReportedFlag', False)
    jobID = matcherInfo['JobID']
    jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
    # Every one of these must be present and non-empty in the Matcher answer
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
        if param not in matcherInfo:
            jobReport.setJobStatus(status='Failed',
                                   minor='Matcher did not return %s' % (param))
            return self.__finish('Matcher Failed')
        elif not matcherInfo[param]:
            jobReport.setJobStatus(status='Failed',
                                   minor='Matcher returned null %s' % (param))
            return self.__finish('Matcher Failed')
        else:
            self.log.verbose('Matcher returned',
                             '%s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    # Everything else returned by the Matcher is handed over to the submission
    optimizerParams = {}
    for key in matcherInfo:
        if key not in matcherParams:
            optimizerParams[key] = matcherInfo[key]

    parameters = self._getJDLParameters(jobJDL)
    if not parameters['OK']:
        jobReport.setJobStatus(status='Failed',
                               minor='Could Not Extract JDL Parameters')
        self.log.warn('Could Not Extract JDL Parameters', parameters['Message'])
        return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
        msg = 'Job has not JobID defined in JDL parameters'
        jobReport.setJobStatus(status='Failed', minor=msg)
        self.log.warn(msg)
        return self.__finish('JDL Problem')
    else:
        # From here on the JobID of the JDL replaces the one given by the Matcher
        jobID = params['JobID']

    if 'JobType' not in params:
        self.log.warn('Job has no JobType defined in JDL parameters')
        jobType = 'Unknown'
    else:
        jobType = params['JobType']

    if 'CPUTime' not in params:
        self.log.warn(
            'Job has no CPU requirement defined in JDL parameters')

    # Job requirements for determining the number of processors
    # the minimum number of processors requested
    processors = int(
        params.get('NumberOfProcessors',
                   int(params.get('MinNumberOfProcessors', 1))))
    # the maximum number of processors allowed to the payload
    maxNumberOfProcessors = int(params.get('MaxNumberOfProcessors', 0))
    # need or not the whole node for the job
    wholeNode = 'WholeNode' in params
    mpTag = 'MultiProcessor' in params.get('Tags', [])

    # The extra options are appended to the arguments of dirac-jobexec payloads
    if self.extraOptions and 'dirac-jobexec' in params.get(
            'Executable', '').strip():
        params['Arguments'] = (params.get('Arguments', '') + ' ' +
                               self.extraOptions).strip()
        params['ExtraOptions'] = self.extraOptions

    self.log.verbose('Job request successful: \n', jobRequest['Value'])
    self.log.info(
        'Received', 'JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s' %
        (jobID, jobType, ownerDN, jobGroup))
    self.jobCount += 1
    try:
        jobReport.setJobParameter(par_name='MatcherServiceTime',
                                  par_value=str(matchTime),
                                  sendFlag=False)

        if 'BOINC_JOB_ID' in os.environ:
            # Report BOINC environment
            for thisp in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform',
                          'BoincHostName'):
                jobReport.setJobParameter(par_name=thisp,
                                          par_value=gConfig.getValue(
                                              '/LocalSite/%s' % thisp,
                                              'Unknown'),
                                          sendFlag=False)

        jobReport.setJobStatus(status='Matched',
                               minor='Job Received by Agent',
                               sendFlag=False)
        result_setupProxy = self._setupProxy(ownerDN, jobGroup)
        if not result_setupProxy['OK']:
            return self._rescheduleFailedJob(jobID,
                                             result_setupProxy['Message'],
                                             self.stopOnApplicationFailure)
        proxyChain = result_setupProxy.get('Value')

        # Save the job jdl for external monitoring
        self.__saveJobJDLRequest(jobID, jobJDL)

        software = self._checkInstallSoftware(jobID, params, ceDict,
                                              jobReport)
        if not software['OK']:
            self.log.error('Failed to install software for job',
                           '%s' % (jobID))
            errorMsg = software['Message']
            if not errorMsg:
                errorMsg = 'Failed software installation'
            return self._rescheduleFailedJob(jobID, errorMsg,
                                             self.stopOnApplicationFailure)

        self.log.debug('Before self._submitJob() (%sCE)' % (self.ceName))
        result_submitJob = self._submitJob(
            jobID=jobID,
            jobParams=params,
            resourceParams=ceDict,
            optimizerParams=optimizerParams,
            proxyChain=proxyChain,
            jobReport=jobReport,
            processors=processors,
            wholeNode=wholeNode,
            maxNumberOfProcessors=maxNumberOfProcessors,
            mpTag=mpTag)

        # Committing the JobReport before evaluating the result of job submission
        res = jobReport.commit()
        if not res['OK']:
            resFD = jobReport.generateForwardDISET()
            if not resFD['OK']:
                self.log.error("Error generating ForwardDISET operation",
                               resFD['Message'])
            else:
                # Here we create the Request.
                op = resFD['Value']
                request = Request()
                requestName = 'jobAgent_%s' % jobID
                request.RequestName = requestName.replace('"', '')
                request.JobID = jobID
                request.SourceComponent = "JobAgent_%s" % jobID
                request.addOperation(op)
                # This might fail, but only a message would be printed.
                self._sendFailoverRequest(request)

        if not result_submitJob['OK']:
            return self.__finish(result_submitJob['Message'])
        elif 'PayloadFailed' in result_submitJob:
            # Do not keep running and do not overwrite the Payload error
            message = 'Payload execution failed with error code %s' % result_submitJob[
                'PayloadFailed']
            if self.stopOnApplicationFailure:
                return self.__finish(message, self.stopOnApplicationFailure)
            else:
                self.log.info(message)

        self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception as subExcept:  # pylint: disable=broad-except
        # Any unexpected failure during processing ends up as a reschedule request
        self.log.exception("Exception in submission",
                           "",
                           lException=subExcept,
                           lExcInfo=True)
        return self._rescheduleFailedJob(
            jobID, 'Job processing failed with exception',
            self.stopOnApplicationFailure)

    return S_OK('Job Agent cycle complete')
def loadJDLAsCFG(jdl):
    """
    Load a JDL as CFG

    The text is scanned character by character.  ``key = value;`` pairs become
    options, ``key = [ ... ]`` blocks become sections (parsed recursively) and
    ``{ a, b }`` values become comma separated lists.  ``;``, ``[``, ``=`` and
    ``]`` are only interpreted outside of double-quoted literals.

    :param str jdl: JDL text, with or without the enclosing ``[``; an empty
                    text yields an empty CFG
    :returns: S_OK((CFG, position)) where position is the index at which the
              scan stopped (the closing ``]`` or the end of the text), or S_ERROR
    """

    def cleanValue(value):
        """Strip the double quotes of a value; several quoted strings must be comma separated."""
        value = value.strip()
        # Slice instead of index: an empty value falls through to the unquoted case
        if value[:1] != '"':
            return S_OK(value.replace('"', ''))

        entries = []
        current = ""
        # "in": inside a quoted string, "out": between two quoted strings
        state = "in"
        for char in value[1:]:
            if char != '"':
                current += char
                continue
            if state == "in":
                entries.append(current)
                state = "out"
            else:
                if current.strip() != ",":
                    return S_ERROR("value seems a list but is not separated in commas")
                state = "in"
            current = ""
        if state == "in":
            return S_ERROR('value is opened with " but is not closed')
        return S_OK(", ".join(entries))

    def assignValue(key, value, cfg):
        """Clean ``value`` and store it in ``cfg`` as option ``key``."""
        key = key.strip()
        if len(key) == 0:
            return S_ERROR("Invalid key name")
        value = value.strip()
        if not value:
            return S_ERROR("No value for key %s" % key)
        if value[0] == "{":
            if value[-1] != "}":
                return S_ERROR("Value '%s' seems a list but does not end in '}'" % (value))
            cleaned = []
            for i, item in enumerate(List.fromChar(value[1:-1])):
                result = cleanValue(item)
                if not result['OK']:
                    return S_ERROR("Var %s : %s" % (key, result['Message']))
                if result['Value'] is None:
                    return S_ERROR("List value '%s' seems invalid for item %s" % (value, i))
                cleaned.append(result['Value'])
            value = ", ".join(cleaned)
        else:
            result = cleanValue(value)
            if not result['OK']:
                return S_ERROR("Var %s : %s" % (key, result['Message']))
            nV = result['Value']
            if nV is None:
                return S_ERROR("Value '%s seems invalid" % (value))
            value = nV
        cfg.setOption(key, value)
        return S_OK()

    # Skip the opening bracket, if any; an empty text simply starts at 0
    iPos = 1 if jdl[:1] == "[" else 0
    key = ""
    value = ""
    action = "key"
    insideLiteral = False
    cfg = CFG()
    while iPos < len(jdl):
        char = jdl[iPos]
        if char == ";" and not insideLiteral:
            # End of a statement
            if key.strip():
                result = assignValue(key, value, cfg)
                if not result['OK']:
                    return result
            key = ""
            value = ""
            action = "key"
        elif char == "[" and not insideLiteral:
            # Start of a sub JDL: parse it recursively and jump over it
            key = key.strip()
            if not key:
                return S_ERROR("Invalid key in JDL")
            if value.strip():
                return S_ERROR("Key %s seems to have a value and open a sub JDL at the same time" % key)
            result = loadJDLAsCFG(jdl[iPos:])
            if not result['OK']:
                return result
            subCfg, subPos = result['Value']
            cfg.createNewSection(key, contents=subCfg)
            key = ""
            value = ""
            action = "key"
            insideLiteral = False
            iPos += subPos
        elif char == "=" and not insideLiteral:
            # Only the first "=" separates key and value
            if action == "key":
                action = "value"
                insideLiteral = False
            else:
                value += char
        elif char == "]" and not insideLiteral:
            # End of this (sub) JDL: flush the pending statement and stop
            key = key.strip()
            if len(key) > 0:
                result = assignValue(key, value, cfg)
                if not result['OK']:
                    return result
            return S_OK((cfg, iPos))
        else:
            if action == "key":
                key += char
            else:
                value += char
                if char == '"':
                    insideLiteral = not insideLiteral
        iPos += 1

    return S_OK((cfg, iPos))