def execute( self ):
    """Run one agent cycle: match TaskQueues and request pilots for them.

    Steps:
      1. Build the director description (``self.directorDict``) from the
         resource dictionary, the ``SubmitPools`` option and, when no
         ``Platform`` is set, the option names found under
         ``/Resources/Computing/OSCompatibility``; then ask the
         ``WorkloadManagement/Matcher`` service for the matching TaskQueues.
      2. Sum priorities and waiting jobs over all TaskQueues and derive
         ``self.pilotsPerPriority`` / ``self.pilotsPerJob`` from the
         ``pilotsPerIteration`` option.
      3. For every TaskQueue count the pilots already waiting (younger than
         ``maxPilotWaitingHours``) and call ``submitPilotsForTaskQueue``.

    :return: the failed Matcher result when the TaskQueues cannot be
             retrieved, ``S_ERROR`` when the priority sum is not positive,
             ``S_OK`` otherwise.
    """
    self.__checkSubmitPools()

    self.directorDict = getResourceDict()
    # Add all submit pools
    self.directorDict['SubmitPool'] = self.am_getOption( "SubmitPools" )
    # Add all DIRAC platforms if not specified otherwise
    if 'Platform' not in self.directorDict:
        result = gConfig.getOptionsDict( '/Resources/Computing/OSCompatibility' )
        if result['OK']:
            # list() keeps this a plain list on Python 3, where keys() is a
            # view object; the dictionary is sent to the Matcher below.
            self.directorDict['Platform'] = list( result['Value'] )

    rpcMatcher = RPCClient( "WorkloadManagement/Matcher" )
    result = rpcMatcher.getMatchingTaskQueues( self.directorDict )
    if not result['OK']:
        self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
        return result
    taskQueueDict = result['Value']

    self.log.info( 'Found %s TaskQueues' % len( taskQueueDict ) )

    if not taskQueueDict:
        self.log.info( 'No TaskQueue to Process' )
        return S_OK()

    prioritySum = 0
    waitingJobs = 0
    for taskQueueID, taskQueue in taskQueueDict.items():
        # Each TaskQueue description carries its own ID for later use.
        taskQueue['TaskQueueID'] = taskQueueID
        prioritySum += taskQueue['Priority']
        waitingJobs += taskQueue['Jobs']

    self.log.info( 'Sum of Priorities %s' % prioritySum )

    if waitingJobs == 0:
        self.log.info( 'No waiting Jobs' )
        return S_OK( 'No waiting Jobs' )
    if prioritySum <= 0:
        return S_ERROR( 'Wrong TaskQueue Priorities' )

    pilotsPerIteration = self.am_getOption( 'pilotsPerIteration' )
    self.pilotsPerPriority = pilotsPerIteration / prioritySum
    self.pilotsPerJob = pilotsPerIteration / waitingJobs

    # Reset the counter under the callback lock; release the lock even if the
    # assignment fails.
    self.callBackLock.acquire()
    try:
        self.submittedPilots = 0
    finally:
        self.callBackLock.release()
    self.toSubmitPilots = 0

    waitingStatusList = ['Submitted', 'Ready', 'Scheduled', 'Waiting']
    timeLimitToConsider = Time.toString( Time.dateTime() -
                                         Time.hour * self.am_getOption( "maxPilotWaitingHours" ) )

    for taskQueueID, taskQueue in taskQueueDict.items():
        self.log.verbose( 'Processing TaskQueue', taskQueueID )

        result = pilotAgentsDB.countPilots( { 'TaskQueueID': taskQueueID,
                                              'Status': waitingStatusList },
                                            None, timeLimitToConsider )
        if not result['OK']:
            # Carry on as if no pilot was waiting rather than skipping the TQ.
            self.log.error( 'Fail to get Number of Waiting pilots', result['Message'] )
            waitingPilots = 0
        else:
            waitingPilots = result['Value']
            self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % taskQueueID, waitingPilots )

        result = self.submitPilotsForTaskQueue( taskQueue, waitingPilots )
        if result['OK']:
            self.toSubmitPilots += result['Value']
        else:
            # Report the failure instead of dropping it; other TQs still run.
            self.log.error( 'Fail to submit pilots for TaskQueue %s:' % taskQueueID,
                            result['Message'] )

    self.log.info( 'Number of pilots to be Submitted %s' % self.toSubmitPilots )

    # Now wait until all Jobs in the Default ThreadPool are processed
    if 'Default' in self.pools:
        # only for those in the "Default" thread pool
        self.pools['Default'].processAllResults()

    self.log.info( 'Number of pilots Submitted %s' % self.submittedPilots )

    return S_OK()
def _submitPilots( self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit,
                   ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob ):
    """Submit up to ``pilotsToSubmit`` pilots for one TaskQueue to the DIRAC CEs.

    The logic is as follows:
      - When list matching is enabled, look up (or compute and cache) the
        queues matching the TaskQueue CPUTime; return an error if none.
      - Create a temporary working directory under ``workDir`` and chdir to it.
      - For every CE in ``self.computingElementDict`` write a pilot script
        with the common options plus that CE's own options, then submit
        pilots while the CE reports available slots.
      - Always restore the previous working directory and remove the
        temporary one.

    ``pilotsToSubmit`` is a total over all CEs: the counter is shared, so CEs
    visited after the target is reached get no submission.

    :param workDir: directory in which the temporary directory is created
    :param taskQueueDict: TaskQueue description; ``TaskQueueID`` and
                          ``CPUTime`` are read
    :param pilotOptions: list of pilot command line options; it is copied,
                         the caller's list is not modified
    :param pilotsToSubmit: maximum number of pilots to submit
    :param ceMask: not used by this implementation
    :param submitPrivatePilot: not used by this implementation
    :param privateTQ: not used by this implementation
    :param proxy: passed through to ``_writePilotScript``
    :param pilotsPerJob: not used by this implementation
    :return: ``S_OK( submittedPilots )``, or ``S_ERROR`` when no queue is
             available, the pilot script cannot be written or a submission
             fails
    """
    taskQueueID = taskQueueDict['TaskQueueID']
    submittedPilots = 0

    # do we need to care about anything else?
    pilotRequirements = [ ( 'CPUTime', taskQueueDict['CPUTime'] ) ]
    pilotRequirementsString = str( pilotRequirements )

    # Check that there are available queues for the Jobs:
    if self.enableListMatch:
        availableQueues = self.listMatchCache.get( pilotRequirementsString )
        # "== False" is kept on purpose: False marks a cache miss / failed
        # listing, while an empty list is a valid "no queue" answer.
        if availableQueues == False:
            availableQueues = self._listQueues( pilotRequirements )
            if availableQueues != False:
                self.listMatchCache.add( pilotRequirementsString, self.listMatchDelay,
                                         availableQueues )
                self.log.verbose( 'Available Queues for TaskQueue ',
                                  "%s: %s" % ( taskQueueID, str( availableQueues ) ) )

        if not availableQueues:
            # %s instead of %d so that a non-integer ID cannot raise here.
            return S_ERROR( ERROR_CE + ' TQ: %s' % taskQueueID )

    baseDir = os.getcwd()
    workingDirectory = tempfile.mkdtemp( prefix = 'TQ_%s_' % taskQueueID, dir = workDir )
    self.log.verbose( 'Using working Directory:', workingDirectory )

    # Options shared by all CEs (including the Site Name). Work on a copy so
    # that neither the caller's list nor the next CE sees per-CE options.
    commonOptions = list( pilotOptions )
    commonOptions.append( "-n '%s'" % self.siteName )

    # CE configuration keys forwarded to the pilot and their option templates.
    # NOTE: forwarding of CPUScalingFactor / CPUNormalizationFactor is
    # disabled in this version (it was commented out).
    ceOptionTemplates = ( ( 'ClientPlatform', "-p '%s'" ),
                          ( 'SharedArea', "-o '/LocalSite/SharedArea=%s'" ) )

    try:
        os.chdir( workingDirectory )

        # submit pilots for every CE available
        for ceName, ceConfigDict in self.computingElementDict.items():
            computingElement = ceConfigDict['CE']

            ceOptions = list( commonOptions )
            # add possible requirements from Site and CE
            for req, val in getResourceDict( ceName ).items():
                ceOptions.append( "-o '/AgentJobRequirements/%s=%s'" % ( req, val ) )
            for key, template in ceOptionTemplates:
                if key in ceConfigDict:
                    ceOptions.append( template % ceConfigDict[key] )

            self.log.info( "pilotOptions: ", ' '.join( ceOptions ) )

            httpProxy = ceConfigDict.get( 'HttpProxy', '' )
            # Previously unbound when the CE had no JobExecDir, which made
            # script creation fail with a NameError.
            # NOTE(review): '' is assumed to mean "no execution directory" for
            # _writePilotScript - confirm.
            pilotExecDir = ceConfigDict.get( 'JobExecDir', '' )

            try:
                pilotScript = self._writePilotScript( workingDirectory, ceOptions, proxy,
                                                      httpProxy, pilotExecDir )
            except Exception:
                self.log.exception( ERROR_SCRIPT )
                return S_ERROR( ERROR_SCRIPT )

            self.log.info( "Pilots to submit: ", pilotsToSubmit )
            while submittedPilots < pilotsToSubmit:
                # Find out how many pilots can be submitted
                ret = computingElement.available()
                if not ret['OK']:
                    self.log.error( 'Can not determine if pilot should be submitted: ',
                                    ret['Message'] )
                    break
                maxPilotsToSubmit = ret['Value']
                self.log.info( "Submit Pilots: ", maxPilotsToSubmit )
                # A negative value would give an empty range below and make
                # this loop spin forever, so treat it like "no slot".
                if not maxPilotsToSubmit or maxPilotsToSubmit < 0:
                    break
                # submit the pilots and then check again
                for _ in range( min( maxPilotsToSubmit, pilotsToSubmit - submittedPilots ) ):
                    submission = computingElement.submitJob( pilotScript, '', '' )
                    if not submission['OK']:
                        self.log.error( 'Pilot submission failed: ', submission['Message'] )
                        return S_ERROR( 'Pilot submission failed after ' + str( submittedPilots ) +
                                        ' pilots submitted successful' )
                    submittedPilots += 1
                    # let the batch system some time to digest the submitted job
                    time.sleep( 1 )
            # next CE
    finally:
        # Single best-effort cleanup for every exit path, including
        # unexpected exceptions.
        try:
            os.chdir( baseDir )
            shutil.rmtree( workingDirectory )
        except OSError as error:
            self.log.verbose( 'Could not clean up working Directory:',
                              "%s: %s" % ( workingDirectory, error ) )

    return S_OK( submittedPilots )
def execute(self):
    """Run one agent cycle: match TaskQueues and request pilots for them.

    Steps:
      1. Build the director description (``self.directorDict``) from the
         resource dictionary, the ``SubmitPools`` option and, when no
         ``Platform`` is set, the value returned by ``getDIRACPlatforms``;
         then ask the ``WorkloadManagement/Matcher`` service for the
         matching TaskQueues.
      2. Sum priorities and waiting jobs over all TaskQueues and derive
         ``self.pilotsPerPriority`` / ``self.pilotsPerJob`` from the
         ``pilotsPerIteration`` option.
      3. For every TaskQueue count the pilots already waiting (younger than
         ``maxPilotWaitingHours``) and call ``submitPilotsForTaskQueue``.

    :return: the failed Matcher result when the TaskQueues cannot be
             retrieved, ``S_ERROR`` when the priority sum is not positive,
             ``S_OK`` otherwise.
    """
    self.__checkSubmitPools()

    self.directorDict = getResourceDict()
    # Add all submit pools
    self.directorDict["SubmitPool"] = self.am_getOption("SubmitPools")
    # Add all DIRAC platforms if not specified otherwise
    if "Platform" not in self.directorDict:
        result = getDIRACPlatforms()
        if result["OK"]:
            self.directorDict["Platform"] = result["Value"]

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(self.directorDict)
    if not result["OK"]:
        self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"])
        return result
    taskQueueDict = result["Value"]

    self.log.info("Found %s TaskQueues" % len(taskQueueDict))

    if not taskQueueDict:
        self.log.info("No TaskQueue to Process")
        return S_OK()

    prioritySum = 0
    waitingJobs = 0
    for taskQueueID, taskQueue in taskQueueDict.items():
        # Each TaskQueue description carries its own ID for later use.
        taskQueue["TaskQueueID"] = taskQueueID
        prioritySum += taskQueue["Priority"]
        waitingJobs += taskQueue["Jobs"]

    self.log.info("Sum of Priorities %s" % prioritySum)

    if waitingJobs == 0:
        self.log.info("No waiting Jobs")
        return S_OK("No waiting Jobs")
    if prioritySum <= 0:
        return S_ERROR("Wrong TaskQueue Priorities")

    pilotsPerIteration = self.am_getOption("pilotsPerIteration")
    self.pilotsPerPriority = pilotsPerIteration / prioritySum
    self.pilotsPerJob = pilotsPerIteration / waitingJobs

    # Reset the counter under the callback lock; release the lock even if the
    # assignment fails.
    self.callBackLock.acquire()
    try:
        self.submittedPilots = 0
    finally:
        self.callBackLock.release()
    self.toSubmitPilots = 0

    waitingStatusList = ["Submitted", "Ready", "Scheduled", "Waiting"]
    timeLimitToConsider = Time.toString(
        Time.dateTime() - Time.hour * self.am_getOption("maxPilotWaitingHours")
    )

    for taskQueueID, taskQueue in taskQueueDict.items():
        self.log.verbose("Processing TaskQueue", taskQueueID)

        result = pilotAgentsDB.countPilots(
            {"TaskQueueID": taskQueueID, "Status": waitingStatusList}, None, timeLimitToConsider
        )
        if not result["OK"]:
            # Carry on as if no pilot was waiting rather than skipping the TQ.
            self.log.error("Fail to get Number of Waiting pilots", result["Message"])
            waitingPilots = 0
        else:
            waitingPilots = result["Value"]
            self.log.verbose("Waiting Pilots for TaskQueue %s:" % taskQueueID, waitingPilots)

        result = self.submitPilotsForTaskQueue(taskQueue, waitingPilots)
        if result["OK"]:
            self.toSubmitPilots += result["Value"]
        else:
            # Report the failure instead of dropping it; other TQs still run.
            self.log.error(
                "Fail to submit pilots for TaskQueue %s:" % taskQueueID, result["Message"]
            )

    self.log.info("Number of pilots to be Submitted %s" % self.toSubmitPilots)

    # Now wait until all Jobs in the Default ThreadPool are processed
    if "Default" in self.pools:
        # only for those in the "Default" thread pool
        self.pools["Default"].processAllResults()

    self.log.info("Number of pilots Submitted %s" % self.submittedPilots)

    return S_OK()
def _submitPilots(self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit,
                  ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob):
    """Submit up to ``pilotsToSubmit`` pilots for one TaskQueue to the DIRAC CEs.

    The logic is as follows:
      - When list matching is enabled, look up (or compute and cache) the
        queues matching the TaskQueue CPUTime; return an error if none.
      - Create a temporary working directory under ``workDir`` and chdir to it.
      - For every CE in ``self.computingElementDict`` write a pilot script
        with the common options plus that CE's own options, then submit
        pilots while the CE reports available slots.
      - Always restore the previous working directory and remove the
        temporary one.

    ``pilotsToSubmit`` is a total over all CEs: the counter is shared, so CEs
    visited after the target is reached get no submission.

    :param workDir: directory in which the temporary directory is created
    :param taskQueueDict: TaskQueue description; ``TaskQueueID`` and
                          ``CPUTime`` are read
    :param pilotOptions: list of pilot command line options; it is copied,
                         the caller's list is not modified
    :param pilotsToSubmit: maximum number of pilots to submit
    :param ceMask: not used by this implementation
    :param submitPrivatePilot: not used by this implementation
    :param privateTQ: not used by this implementation
    :param proxy: passed through to ``_writePilotScript``
    :param pilotsPerJob: not used by this implementation
    :return: ``S_OK(submittedPilots)``, or ``S_ERROR`` when no queue is
             available, the pilot script cannot be written or a submission
             fails
    """
    taskQueueID = taskQueueDict['TaskQueueID']
    submittedPilots = 0

    # do we need to care about anything else?
    pilotRequirements = [('CPUTime', taskQueueDict['CPUTime'])]
    pilotRequirementsString = str(pilotRequirements)

    # Check that there are available queues for the Jobs:
    if self.enableListMatch:
        availableQueues = self.listMatchCache.get(pilotRequirementsString)
        # "== False" is kept on purpose: False marks a cache miss / failed
        # listing, while an empty list is a valid "no queue" answer.
        if availableQueues == False:
            availableQueues = self._listQueues(pilotRequirements)
            if availableQueues != False:
                self.listMatchCache.add(pilotRequirementsString,
                                        self.listMatchDelay, availableQueues)
                self.log.verbose('Available Queues for TaskQueue ',
                                 "%s: %s" % (taskQueueID, str(availableQueues)))

        if not availableQueues:
            # %s instead of %d so that a non-integer ID cannot raise here.
            return S_ERROR(ERROR_CE + ' TQ: %s' % taskQueueID)

    baseDir = os.getcwd()
    workingDirectory = tempfile.mkdtemp(prefix='TQ_%s_' % taskQueueID, dir=workDir)
    self.log.verbose('Using working Directory:', workingDirectory)

    # Options shared by all CEs (including the Site Name). Work on a copy so
    # that neither the caller's list nor the next CE sees per-CE options.
    commonOptions = list(pilotOptions)
    commonOptions.append("-n '%s'" % self.siteName)

    # CE configuration keys forwarded to the pilot and their option templates.
    ceOptionTemplates = (
        ('ClientPlatform', "-p '%s'"),
        ('SharedArea', "-o '/LocalSite/SharedArea=%s'"),
        ('CPUScalingFactor', "-o '/LocalSite/CPUScalingFactor=%s'"),
        ('CPUNormalizationFactor', "-o '/LocalSite/CPUNormalizationFactor=%s'"),
    )

    try:
        os.chdir(workingDirectory)

        # submit pilots for every CE available
        for ceName, ceConfigDict in self.computingElementDict.items():
            computingElement = ceConfigDict['CE']

            ceOptions = list(commonOptions)
            # add possible requirements from Site and CE
            for req, val in getResourceDict(ceName).items():
                ceOptions.append("-o '/AgentJobRequirements/%s=%s'" % (req, val))
            for key, template in ceOptionTemplates:
                if key in ceConfigDict:
                    ceOptions.append(template % ceConfigDict[key])

            self.log.info("pilotOptions: ", ' '.join(ceOptions))

            httpProxy = ceConfigDict.get('HttpProxy', '')
            # The default used to be assigned to a misspelled name
            # ("pilotDir"), leaving pilotExecDir unbound when the CE had no
            # JobExecDir and making script creation fail with a NameError.
            pilotExecDir = ceConfigDict.get('JobExecDir', '')

            try:
                pilotScript = self._writePilotScript(workingDirectory, ceOptions,
                                                     proxy, httpProxy, pilotExecDir)
            except Exception:
                self.log.exception(ERROR_SCRIPT)
                return S_ERROR(ERROR_SCRIPT)

            self.log.info("Pilots to submit: ", pilotsToSubmit)
            while submittedPilots < pilotsToSubmit:
                # Find out how many pilots can be submitted
                ret = computingElement.available()
                if not ret['OK']:
                    self.log.error('Can not determine if pilot should be submitted: ',
                                   ret['Message'])
                    break
                maxPilotsToSubmit = ret['Value']
                self.log.info("Submit Pilots: ", maxPilotsToSubmit)
                # A negative value would give an empty range below and make
                # this loop spin forever, so treat it like "no slot".
                if not maxPilotsToSubmit or maxPilotsToSubmit < 0:
                    break
                # submit the pilots and then check again
                for _ in range(min(maxPilotsToSubmit, pilotsToSubmit - submittedPilots)):
                    submission = computingElement.submitJob(pilotScript, '', '')
                    if not submission['OK']:
                        self.log.error('Pilot submission failed: ', submission['Message'])
                        return S_ERROR('Pilot submission failed after ' +
                                       str(submittedPilots) +
                                       ' pilots submitted successful')
                    submittedPilots += 1
                    # let the batch system some time to digest the submitted job
                    time.sleep(1)
            # next CE
    finally:
        # Single best-effort cleanup for every exit path, including
        # unexpected exceptions.
        try:
            os.chdir(baseDir)
            shutil.rmtree(workingDirectory)
        except OSError as error:
            self.log.verbose('Could not clean up working Directory:',
                             "%s: %s" % (workingDirectory, error))

    return S_OK(submittedPilots)
def execute(self):
    """Run one agent cycle: match TaskQueues and request pilots for them.

    Steps:
      1. Build the director description (``self.directorDict``) from the
         resource dictionary and the ``SubmitPools`` option, then ask the
         ``WorkloadManagement/Matcher`` service for the matching TaskQueues.
      2. Sum priorities and waiting jobs over all TaskQueues and derive
         ``self.pilotsPerPriority`` / ``self.pilotsPerJob`` from the
         ``pilotsPerIteration`` option.
      3. For every TaskQueue count the pilots already waiting (younger than
         ``maxPilotWaitingHours``) and call ``submitPilotsForTaskQueue``.

    :return: the failed Matcher result when the TaskQueues cannot be
             retrieved, ``S_ERROR`` when the priority sum is not positive,
             ``S_OK`` otherwise.
    """
    self.__checkSubmitPools()

    self.directorDict = getResourceDict()
    # Add all submit pools
    self.directorDict['SubmitPool'] = self.am_getOption("SubmitPools")

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(self.directorDict)
    if not result['OK']:
        self.log.error('Could not retrieve TaskQueues from TaskQueueDB',
                       result['Message'])
        return result
    taskQueueDict = result['Value']

    self.log.info('Found %s TaskQueues' % len(taskQueueDict))

    if not taskQueueDict:
        self.log.info('No TaskQueue to Process')
        return S_OK()

    prioritySum = 0
    waitingJobs = 0
    for taskQueueID, taskQueue in taskQueueDict.items():
        # Each TaskQueue description carries its own ID for later use.
        taskQueue['TaskQueueID'] = taskQueueID
        prioritySum += taskQueue['Priority']
        waitingJobs += taskQueue['Jobs']

    self.log.info('Sum of Priorities %s' % prioritySum)

    if waitingJobs == 0:
        self.log.info('No waiting Jobs')
        return S_OK('No waiting Jobs')
    if prioritySum <= 0:
        return S_ERROR('Wrong TaskQueue Priorities')

    pilotsPerIteration = self.am_getOption('pilotsPerIteration')
    self.pilotsPerPriority = pilotsPerIteration / prioritySum
    self.pilotsPerJob = pilotsPerIteration / waitingJobs

    # Reset the counter under the callback lock; release the lock even if the
    # assignment fails.
    self.callBackLock.acquire()
    try:
        self.submittedPilots = 0
    finally:
        self.callBackLock.release()
    self.toSubmitPilots = 0

    waitingStatusList = ['Submitted', 'Ready', 'Scheduled', 'Waiting']
    timeLimitToConsider = Time.toString(
        Time.dateTime() - Time.hour * self.am_getOption("maxPilotWaitingHours"))

    for taskQueueID, taskQueue in taskQueueDict.items():
        self.log.verbose('Processing TaskQueue', taskQueueID)

        result = pilotAgentsDB.countPilots(
            {'TaskQueueID': taskQueueID, 'Status': waitingStatusList},
            None, timeLimitToConsider)
        if not result['OK']:
            # Carry on as if no pilot was waiting rather than skipping the TQ.
            self.log.error('Fail to get Number of Waiting pilots', result['Message'])
            waitingPilots = 0
        else:
            waitingPilots = result['Value']
            self.log.verbose('Waiting Pilots for TaskQueue %s:' % taskQueueID,
                             waitingPilots)

        result = self.submitPilotsForTaskQueue(taskQueue, waitingPilots)
        if result['OK']:
            self.toSubmitPilots += result['Value']
        else:
            # Report the failure instead of dropping it; other TQs still run.
            self.log.error('Fail to submit pilots for TaskQueue %s:' % taskQueueID,
                           result['Message'])

    self.log.info('Number of pilots to be Submitted %s' % self.toSubmitPilots)

    # Now wait until all Jobs in the Default ThreadPool are processed
    if 'Default' in self.pools:
        # only for those in the "Default" thread pool
        self.pools['Default'].processAllResults()

    self.log.info('Number of pilots Submitted %s' % self.submittedPilots)

    return S_OK()