def execute( self ):
    """Main Agent code:
      1.- Query TaskQueueDB for existing TQs
      2.- Count Pending Jobs
      3.- Submit VMs

    For every image of every configured director, count the VM instances
    already alive ('Running' + 'Submitted'), skip images at their
    MaxInstances cap or with no matching waiting jobs, and queue a
    submitInstance call on the director's thread pool for the rest.
    Returns DIRAC.S_OK(), or the error dict if the TaskQueueDB query fails.
    """
    self.__checkSubmitPools()

    imagesToSubmit = {}

    for directorName, directorDict in self.directors.items():
      self.log.verbose( 'Checking Director:', directorName )
      for imageName in directorDict['director'].images:
        imageDict = directorDict['director'].images[imageName]
        # Instances already accounted for: both running and still-submitted VMs
        instances = 0
        result = virtualMachineDB.getInstancesByStatus( 'Running' )
        if result['OK'] and imageName in result['Value']:
          instances += len( result['Value'][imageName] )
        result = virtualMachineDB.getInstancesByStatus( 'Submitted' )
        if result['OK'] and imageName in result['Value']:
          instances += len( result['Value'][imageName] )
        self.log.verbose( 'Checking Image %s:' % imageName, instances )
        maxInstances = imageDict['MaxInstances']
        if instances >= maxInstances:
          self.log.info( '%s >= %s Running instances of %s, skipping' % ( instances, maxInstances, imageName ) )
          continue

        # Aggregate matching TaskQueue statistics for this image
        imageRequirementsDict = imageDict['RequirementsDict']
        result = taskQueueDB.getMatchingTaskQueues( imageRequirementsDict )
        if not result['OK']:
          self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
          return result
        taskQueueDict = result['Value']

        jobs = 0
        priority = 0
        cpu = 0
        for tq in taskQueueDict:
          jobs += taskQueueDict[tq]['Jobs']
          priority += taskQueueDict[tq]['Priority']
          cpu += taskQueueDict[tq]['Jobs'] * taskQueueDict[tq]['CPUTime']

        if not jobs:
          self.log.info( 'No matching jobs for %s found, skipping' % imageName )
          continue

        # Do not submit more VMs while the pending CPU per running instance
        # is still below the configured CPUPerInstance threshold
        if instances and ( cpu / instances ) < imageDict['CPUPerInstance']:
          self.log.info( 'Waiting CPU per Running instance %s < %s, skipping' % ( cpu / instances, imageDict['CPUPerInstance'] ) )
          continue

        if directorName not in imagesToSubmit:
          imagesToSubmit[directorName] = {}
        if imageName not in imagesToSubmit[directorName]:
          imagesToSubmit[directorName][imageName] = {}
        imagesToSubmit[directorName][imageName] = { 'Jobs': jobs,
                                                    'TQPriority': priority,
                                                    'CPUTime': cpu,
                                                    'VMPriority': imageDict['Priority'] }

    # Second phase: hand the accumulated submission requests to the thread pools
    for directorName, imageDict in imagesToSubmit.items():
      for imageName, jobsDict in imageDict.items():
        if self.directors[directorName]['isEnabled']:
          self.log.info( 'Requesting submission of %s to %s' % ( imageName, directorName ) )

          director = self.directors[directorName]['director']
          pool = self.pools[self.directors[directorName]['pool']]

          ret = pool.generateJobAndQueueIt( director.submitInstance,
                                            args=( imageName, self.workDir ),
                                            oCallback=self.callBack,
                                            oExceptionCallback=director.exceptionCallBack,
                                            blocking=False )
          if not ret['OK']:
            # Disable submission until next iteration
            self.directors[directorName]['isEnabled'] = False
          else:
            time.sleep( self.am_getOption( 'ThreadStartDelay' ) )

    if 'Default' in self.pools:
      # only for those in "Default' thread Pool
      # for pool in self.pools:
      self.pools['Default'].processAllResults()

    return DIRAC.S_OK()
def submitJobs( self ):
    """ Go through defined computing elements and submit pilot jobs if necessary.

    For each configured queue: check the site mask, obtain a pilot proxy
    valid beyond the queue CPU time limit, query the CE for free slots,
    match the CE description against the TaskQueueDB and submit up to
    min( slots, waiting TQ jobs ) pilots.  Submitted pilots are registered
    in the PilotAgentsDB, assigned to TaskQueues with probability
    proportional to the TaskQueue priorities.

    :return: S_OK() when the queue loop completes, otherwise the first
             fatal error dict (site mask, proxy, TQ query or executable
             generation failure)
    """
    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
      return S_ERROR( 'Can not get the site mask' )
    siteMaskList = result['Value']

    for queue in self.queueDict:
      ce = self.queueDict[queue]['CE']
      ceName = self.queueDict[queue]['CEName']
      ceType = self.queueDict[queue]['CEType']
      queueName = self.queueDict[queue]['QueueName']
      siteName = self.queueDict[queue]['Site']
      siteMask = siteName in siteMaskList

      if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
        queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] )
      else:
        return S_ERROR( 'CPU time limit is not specified for queue %s' % queue )

      # Get a working proxy that outlives the queue CPU time limit by one day
      cpuTime = queueCPUTime + 86400
      result = gProxyManager.getPilotProxyFromDIRACGroup( self.genericPilotDN,
                                                          self.genericPilotGroup,
                                                          cpuTime )
      if not result['OK']:
        return result
      self.proxy = result['Value']
      # 60 second safety margin on the proxy lifetime handed to the CE
      ce.setProxy( self.proxy, cpuTime - 60 )

      result = ce.available()
      if not result['OK']:
        self.log.warn( 'Failed to check the availability of queue %s: %s' % ( queue, result['Message'] ) )
        continue
      totalSlots = result['Value']
      self.log.verbose( result['Message'] )

      ceDict = ce.getParameterDict()
      ceDict[ 'GridCE' ] = ceName
      # A site out of the mask may only run jobs explicitly targeted to it
      if not siteMask and 'Site' in ceDict:
        self.log.info( 'Site not in the mask %s' % siteName )
        self.log.info( 'Removing "Site" from matching Dict' )
        del ceDict[ 'Site' ]

      result = taskQueueDB.getMatchingTaskQueues( ceDict )
      if not result['OK']:
        self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
        return result
      taskQueueDict = result['Value']
      if not taskQueueDict:
        self.log.verbose( 'No matching TQs found' )
        continue

      totalTQJobs = 0
      for tq in taskQueueDict:
        totalTQJobs += taskQueueDict[tq]['Jobs']

      pilotsToSubmit = min( totalSlots, totalTQJobs )
      self.log.verbose( 'Available slots=%d, TQ jobs=%d, Pilots to submit=%d' % ( totalSlots, totalTQJobs, pilotsToSubmit ) )

      if pilotsToSubmit > 0:
        self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) )

        bundleProxy = self.queueDict[queue].get( 'BundleProxy', False )
        result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy )
        if not result['OK']:
          return result
        # If proxy is not bundled in, submit with the user proxy
        executable = result['Executable']
        proxy = result['Proxy']
        result = ce.submitJob( executable, proxy, pilotsToSubmit )
        if not result['OK']:
          self.log.error( 'Failed submission to queue %s:' % queue, result['Message'] )
          # BUGFIX: skip to the next queue on failed submission; without this
          # `continue` the code below reads result['Value'] from an S_ERROR
          # dict and raises KeyError
          continue

        # Add pilots to the PilotAgentsDB: assign pilots to TaskQueues
        # proportionally to the task queue priorities
        pilotList = result['Value']
        stampDict = result.get( 'PilotStampDict', {} )

        # Cumulative priority thresholds for the weighted random draw below
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
          sumPriority += taskQueueDict[tq]['Priority']
          tqPriorityList.append( ( tq, sumPriority ) )

        tqDict = {}
        for pilotID in pilotList:
          # Draw a TaskQueue with probability proportional to its priority
          rndm = random.random() * sumPriority
          for tq, prio in tqPriorityList:
            if rndm < prio:
              tqID = tq
              break
          if tqID not in tqDict:
            tqDict[tqID] = []
          tqDict[tqID].append( pilotID )

        for tqID, tqPilotList in tqDict.items():
          result = pilotAgentsDB.addPilotTQReference( tqPilotList, tqID,
                                                      self.genericPilotDN,
                                                      self.genericPilotGroup,
                                                      self.localhost,
                                                      ceType, '', stampDict )
          if not result['OK']:
            self.log.error( 'Failed add pilots to the PilotAgentsDB: %s' % result['Message'] )
            continue
          for pilot in tqPilotList:
            result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName,
                                                   'Successfuly submitted by the SiteDirector',
                                                   siteName, queueName )
            if not result['OK']:
              self.log.error( 'Failed to set pilot status: %s' % result['Message'] )
              continue

    return S_OK()
def submitJobs(self):
    """Go through defined computing elements and submit pilot jobs if necessary.

    For each configured queue: check the site mask, obtain a pilot proxy
    valid beyond the (possibly capped) queue CPU time limit, query the CE
    for free slots, match the CE description against the TaskQueueDB and
    submit up to min(slots, waiting TQ jobs) pilots.  Submitted pilots are
    registered in the PilotAgentsDB, assigned to TaskQueues with probability
    proportional to the TaskQueue priorities.
    Returns S_OK() on completion, otherwise the first fatal error dict.
    """
    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
        return S_ERROR('Can not get the site mask')
    siteMaskList = result['Value']

    for queue in self.queueDict:
        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        ceType = self.queueDict[queue]['CEType']
        queueName = self.queueDict[queue]['QueueName']
        siteName = self.queueDict[queue]['Site']
        siteMask = siteName in siteMaskList

        if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
            queueCPUTime = int(
                self.queueDict[queue]['ParametersDict']['CPUTime'])
        else:
            return S_ERROR('CPU time limit is not specified for queue %s' % queue)
        # Cap the requested CPU time at the agent-wide maximum queue length
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Get a working proxy outliving the queue CPU time limit by one day
        cpuTime = queueCPUTime + 86400
        result = gProxyManager.getPilotProxyFromDIRACGroup(
            self.genericPilotDN, self.genericPilotGroup, cpuTime)
        if not result['OK']:
            return result
        self.proxy = result['Value']
        # 60 second safety margin on the proxy lifetime handed to the CE
        ce.setProxy(self.proxy, cpuTime - 60)

        result = ce.available()
        if not result['OK']:
            self.log.warn(
                'Failed to check the availability of queue %s: %s' %
                (queue, result['Message']))
            continue
        totalSlots = result['Value']
        self.log.verbose(result['Message'])

        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        # A site out of the mask may only run jobs explicitly targeted to it
        if not siteMask and 'Site' in ceDict:
            self.log.info('Site not in the mask %s' % siteName)
            self.log.info('Removing "Site" from matching Dict')
            del ceDict['Site']

        result = taskQueueDB.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error(
                'Could not retrieve TaskQueues from TaskQueueDB',
                result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found')
            continue

        totalTQJobs = 0
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]['Jobs']

        pilotsToSubmit = min(totalSlots, totalTQJobs)
        self.log.verbose(
            'Available slots=%d, TQ jobs=%d, Pilots to submit=%d' %
            (totalSlots, totalTQJobs, pilotsToSubmit))

        if pilotsToSubmit > 0:
            self.log.info('Going to submit %d pilots to %s queue' %
                          (pilotsToSubmit, queue))

            bundleProxy = self.queueDict[queue].get('BundleProxy', False)
            result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy)
            if not result['OK']:
                return result

            executable = result['Value']
            # Proxy is bundled inside the executable here, hence '' for proxy
            result = ce.submitJob(executable, '', pilotsToSubmit)
            if not result['OK']:
                self.log.error('Failed submission to queue %s:' % queue,
                               result['Message'])
                continue

            # Add pilots to the PilotAgentsDB: assign pilots to TaskQueues
            # proportionally to the task queue priorities
            pilotList = result['Value']
            stampDict = {}
            if result.has_key('PilotStampDict'):
                stampDict = result['PilotStampDict']

            # Cumulative priority thresholds for the weighted random draw
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append((tq, sumPriority))
            rndm = random.random() * sumPriority
            tqDict = {}
            for pilotID in pilotList:
                # Draw a TaskQueue with probability proportional to priority
                rndm = random.random() * sumPriority
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                if not tqDict.has_key(tqID):
                    tqDict[tqID] = []
                tqDict[tqID].append(pilotID)

            for tqID, pilotList in tqDict.items():
                result = pilotAgentsDB.addPilotTQReference(
                    pilotList, tqID, self.genericPilotDN,
                    self.genericPilotGroup, self.localhost, ceType, '',
                    stampDict)
                if not result['OK']:
                    self.log.error(
                        'Failed add pilots to the PilotAgentsDB: %s' %
                        result['Message'])
                    continue
                for pilot in pilotList:
                    result = pilotAgentsDB.setPilotStatus(
                        pilot, 'Submitted', ceName,
                        'Successfuly submitted by the SiteDirector',
                        siteName, queueName)
                    if not result['OK']:
                        self.log.error('Failed to set pilot status: %s' %
                                       result['Message'])
                        continue

    return S_OK()
def execute( self ):
    """Main Agent code:
      1.- Query TaskQueueDB for existing TQs
      2.- Add their Priorities
      3.- Submit pilots

    Computes the per-priority and per-job pilot shares from the
    'pilotsPerIteration' option, then, for each TaskQueue, subtracts the
    pilots already waiting (within 'maxPilotWaitingHours') and queues the
    remainder for submission via submitPilotsForTaskQueue.
    Returns S_OK() / S_OK('No waiting Jobs'), or an error dict.
    """
    self.__checkSubmitPools()

    self.directorDict = getResourceDict()

    result = taskQueueDB.getMatchingTaskQueues( self.directorDict )
    if not result['OK']:
      self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
      return result
    taskQueueDict = result['Value']

    self.log.info( 'Found %s TaskQueues' % len( taskQueueDict ) )
    if not taskQueueDict:
      self.log.info( 'No TaskQueue to Process' )
      return S_OK()

    prioritySum = 0
    waitingJobs = 0
    for taskQueueID in taskQueueDict:
      taskQueueDict[taskQueueID]['TaskQueueID'] = taskQueueID
      prioritySum += taskQueueDict[taskQueueID]['Priority']
      waitingJobs += taskQueueDict[taskQueueID]['Jobs']

    self.log.info( 'Sum of Priorities %s' % prioritySum )

    if waitingJobs == 0:
      self.log.info( 'No waiting Jobs' )
      return S_OK( 'No waiting Jobs' )
    if prioritySum <= 0:
      return S_ERROR( 'Wrong TaskQueue Priorities' )

    # Pilot shares used later by submitPilotsForTaskQueue
    self.pilotsPerPriority = self.am_getOption( 'pilotsPerIteration' ) / prioritySum
    self.pilotsPerJob = self.am_getOption( 'pilotsPerIteration' ) / waitingJobs

    # submittedPilots is updated from the pool callback, hence the lock
    self.callBackLock.acquire()
    self.submittedPilots = 0
    self.callBackLock.release()
    self.toSubmitPilots = 0
    waitingStatusList = ['Submitted', 'Ready', 'Scheduled', 'Waiting']
    timeLimitToConsider = Time.toString( Time.dateTime() - Time.hour * self.am_getOption( "maxPilotWaitingHours" ) )

    for taskQueueID in taskQueueDict:
      self.log.verbose( 'Processing TaskQueue', taskQueueID )

      # Pilots already waiting for this TQ count against its share
      result = pilotAgentsDB.countPilots( { 'TaskQueueID': taskQueueID,
                                            'Status': waitingStatusList},
                                          None, timeLimitToConsider )
      if not result['OK']:
        self.log.error( 'Fail to get Number of Waiting pilots', result['Message'] )
        waitingPilots = 0
      else:
        waitingPilots = result['Value']
      self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % taskQueueID, waitingPilots )

      result = self.submitPilotsForTaskQueue( taskQueueDict[taskQueueID], waitingPilots )
      if result['OK']:
        self.toSubmitPilots += result['Value']

    self.log.info( 'Number of pilots to be Submitted %s' % self.toSubmitPilots )

    # Now wait until all Jobs in the Default ThreadPool are processed
    if 'Default' in self.pools:
      # only for those in "Default' thread Pool
      # for pool in self.pools:
      self.pools['Default'].processAllResults()

    self.log.info( 'Number of pilots Submitted %s' % self.submittedPilots )

    return S_OK()
def execute( self ):
    """Main Agent code:
      1.- Query TaskQueueDB for existing TQs
      2.- Count Pending Jobs
      3.- Submit Jobs

    For every running BigData endpoint of every director: count jobs already
    Submitted/Running against the endpoint's LimitQueueJobsEndPoint, match
    TaskQueue jobs against the endpoint's software stack (two passes: fresh
    TQ jobs, then previously unmatched jobs kept in pendingTaskQueueJobs),
    and queue matched jobs for submission through the director's thread pool.
    Returns DIRAC.S_OK(), or the first error dict from TQ/JobDB access.
    """
    self.__checkSubmitPools()

    bigDataJobsToSubmit = {}
    bigDataJobIdsToSubmit = {}

    for directorName, directorDict in self.directors.items():
      self.log.verbose( 'Checking Director:', directorName )
      self.log.verbose( 'RunningEndPoints:', directorDict['director'].runningEndPoints )
      for runningEndPointName in directorDict['director'].runningEndPoints:
        runningEndPointDict = directorDict['director'].runningEndPoints[runningEndPointName]
        NameNode = runningEndPointDict['NameNode']

        # Jobs already occupying the endpoint, in either queueable state
        jobsByEndPoint = 0
        result = BigDataDB.getBigDataJobsByStatusAndEndpoint( 'Submitted', NameNode )
        if result['OK']:
          jobsByEndPoint += len( result['Value'] )
        result = BigDataDB.getBigDataJobsByStatusAndEndpoint( 'Running', NameNode )
        if result['OK']:
          jobsByEndPoint += len( result['Value'] )
        self.log.verbose( 'Checking Jobs By EndPoint %s:' % jobsByEndPoint )

        jobLimitsEndPoint = runningEndPointDict['LimitQueueJobsEndPoint']
        bigDataJobs = 0
        if jobsByEndPoint >= jobLimitsEndPoint:
          self.log.info( '%s >= %s Running jobs reach job limits: %s, skipping' % ( jobsByEndPoint, jobLimitsEndPoint, runningEndPointName ) )
          continue
        else:
          # Remaining submission head-room on this endpoint
          bigDataJobs = jobLimitsEndPoint - jobsByEndPoint

        requirementsDict = runningEndPointDict['Requirements']
        self.log.info( 'Requirements Dict: ', requirementsDict )
        result = taskQueueDB.getMatchingTaskQueues( requirementsDict )
        if not result['OK']:
          self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
          return result
        taskQueueDict = result['Value']
        self.log.info( 'Task Queues Dict: ', taskQueueDict )

        jobs = 0
        priority = 0
        cpu = 0
        jobsID = 0
        self.log.info( 'Pending Jobs from TaskQueue, which not matching before: ', self.pendingTaskQueueJobs )
        for tq in taskQueueDict:
          jobs += taskQueueDict[tq]['Jobs']
          priority += taskQueueDict[tq]['Priority']
          cpu += taskQueueDict[tq]['Jobs'] * taskQueueDict[tq]['CPUTime']

          # Matching of Jobs with BigData software stacks.
          # Sequence: take one job from this TaskQueue, try to match its
          # JobName against the endpoint resources; if it does not match,
          # park it in pendingTaskQueueJobs for the next pass/iteration.
          # JobName pattern used for the match:
          # NameSoftware _ SoftwareVersion _ HighLanguageName _ HighLanguageVersion _ DataSetName

          # extract a job from the TaskQueue
          if tq not in self.pendingTaskQueueJobs.keys():
            self.pendingTaskQueueJobs[tq] = {}
          getJobFromTaskQueue = taskQueueDB.matchAndGetJob( taskQueueDict[tq] )
          if not getJobFromTaskQueue['OK']:
            self.log.error( 'Could not get Job and FromTaskQueue', getJobFromTaskQueue['Message'] )
            return getJobFromTaskQueue
          jobInfo = getJobFromTaskQueue['Value']
          jobID = jobInfo['jobId']

          jobAttrInfo = jobDB.getJobAttributes( jobID )
          if not jobAttrInfo['OK']:
            self.log.error( 'Could not get Job Attributes', jobAttrInfo['Message'] )
            return jobAttrInfo
          jobInfoUniq = jobAttrInfo['Value']
          jobName = jobInfoUniq['JobName']
          # Park the job first; it is removed again below on a successful match
          self.pendingTaskQueueJobs[tq][jobID] = jobName

          result = jobDB.getJobJDL( jobID, True )
          classAdJob = ClassAd( result['Value'] )
          arguments = 0
          if classAdJob.lookupAttribute( 'Arguments' ):
            arguments = classAdJob.getAttributeString( 'Arguments' )
          #if not classAdJob.lookupAttribute( 'Arguments' ):
          #  continue

          jobsToSubmit = self.matchingJobsForBDSubmission( arguments,
                                                           runningEndPointName,
                                                           runningEndPointDict['BigDataSoftware'],
                                                           runningEndPointDict['BigDataSoftwareVersion'],
                                                           runningEndPointDict['HighLevelLanguage']['HLLName'],
                                                           runningEndPointDict['HighLevelLanguage']['HLLVersion'],
                                                           jobID )
          if ( jobsToSubmit == "OK" ):
            if directorName not in bigDataJobsToSubmit:
              bigDataJobsToSubmit[directorName] = {}
            if runningEndPointName not in bigDataJobsToSubmit[directorName]:
              bigDataJobsToSubmit[directorName][runningEndPointName] = {}
            # NOTE(review): one dict per endpoint — a later match overwrites
            # an earlier one, so only the last matched job per endpoint is
            # kept for submission; appears to rely on per-iteration limits.
            bigDataJobsToSubmit[directorName][runningEndPointName] = { 'JobId': jobID,
                                                                       'JobName': jobName,
                                                                       'TQPriority': priority,
                                                                       'CPUTime': cpu,
                                                                       'BigDataEndpoint': runningEndPointName,
                                                                       'BigDataEndpointNameNode': runningEndPointDict['NameNode'],
                                                                       'BdSoftware': runningEndPointDict['BigDataSoftware'],
                                                                       'BdSoftwareVersion': runningEndPointDict['BigDataSoftwareVersion'],
                                                                       'HLLName' : runningEndPointDict['HighLevelLanguage']['HLLName'],
                                                                       'HLLVersion' : runningEndPointDict['HighLevelLanguage']['HLLVersion'],
                                                                       'NumBigDataJobsAllowedToSubmit': bigDataJobs,
                                                                       'SiteName': runningEndPointDict['SiteName'],
                                                                       'PublicIP': runningEndPointDict['PublicIP'],
                                                                       'User': runningEndPointDict['User'],
                                                                       'Port': runningEndPointDict['Port'],
                                                                       'UsePilot': runningEndPointDict['UsePilot'],
                                                                       'IsInteractive': runningEndPointDict['IsInteractive'],
                                                                       'Arguments': arguments }
            del self.pendingTaskQueueJobs[tq][jobID]
          else:
            self.log.error( jobsToSubmit )

        # Second pass: retry jobs parked in previous iterations against the
        # current endpoint (uses the loop's last runningEndPointDict values)
        self.log.info( 'Pending Jobs from TaskQueue, which not matching after: ', self.pendingTaskQueueJobs )
        for tq in self.pendingTaskQueueJobs.keys():
          for jobid in self.pendingTaskQueueJobs[tq].keys():
            result = jobDB.getJobJDL( jobid, True )
            classAdJob = ClassAd( result['Value'] )
            arguments = 0
            if classAdJob.lookupAttribute( 'Arguments' ):
              arguments = classAdJob.getAttributeString( 'Arguments' )
            #if not classAdJob.lookupAttribute( 'Arguments' ):
            #  continue
            # do the match with the runningEndPoint
            jobsToSubmit = self.matchingJobsForBDSubmission( arguments,
                                                             runningEndPointName,
                                                             runningEndPointDict['BigDataSoftware'],
                                                             runningEndPointDict['BigDataSoftwareVersion'],
                                                             runningEndPointDict['HighLevelLanguage']['HLLName'],
                                                             runningEndPointDict['HighLevelLanguage']['HLLVersion'],
                                                             jobid )
            if ( jobsToSubmit == "OK" ):
              if directorName not in bigDataJobsToSubmit:
                bigDataJobsToSubmit[directorName] = {}
              if runningEndPointName not in bigDataJobsToSubmit[directorName]:
                bigDataJobsToSubmit[directorName][runningEndPointName] = {}
              bigDataJobsToSubmit[directorName][runningEndPointName] = { 'JobId': jobid,
                                                                         'JobName': self.pendingTaskQueueJobs[tq][jobid],
                                                                         'TQPriority': priority,
                                                                         'CPUTime': cpu,
                                                                         'BigDataEndpoint': runningEndPointName,
                                                                         'BigDataEndpointNameNode': runningEndPointDict['NameNode'],
                                                                         'BdSoftware': runningEndPointDict['BigDataSoftware'],
                                                                         'BdSoftwareVersion': runningEndPointDict['BigDataSoftwareVersion'],
                                                                         'HLLName' : runningEndPointDict['HighLevelLanguage']['HLLName'],
                                                                         'HLLVersion' : runningEndPointDict['HighLevelLanguage']['HLLVersion'],
                                                                         'NumBigDataJobsAllowedToSubmit': bigDataJobs,
                                                                         'SiteName': runningEndPointDict['SiteName'],
                                                                         'PublicIP': runningEndPointDict['PublicIP'],
                                                                         'User': runningEndPointDict['User'],
                                                                         'Port': runningEndPointDict['Port'],
                                                                         'UsePilot': runningEndPointDict['UsePilot'],
                                                                         'IsInteractive': runningEndPointDict['IsInteractive'],
                                                                         'Arguments': arguments }
              del self.pendingTaskQueueJobs[tq][jobid]
            else:
              self.log.error( jobsToSubmit )

        if not jobs and not self.pendingTaskQueueJobs:
          self.log.info( 'No matching jobs for %s found, skipping' % NameNode )
          continue

    # Submission phase: hand every matched (endpoint, job) to the thread pool
    self.log.info( '___BigDataJobsTo Submit:', bigDataJobsToSubmit )

    for directorName, JobsToSubmitDict in bigDataJobsToSubmit.items():
      for runningEndPointName, jobsToSubmitDict in JobsToSubmitDict.items():
        if self.directors[directorName]['isEnabled']:
          self.log.info( 'Requesting submission to %s of %s' % ( runningEndPointName, directorName ) )

          director = self.directors[directorName]['director']
          pool = self.pools[self.directors[directorName]['pool']]

          jobIDs = JobsToSubmitDict[runningEndPointName]['JobId']
          jobName = JobsToSubmitDict[runningEndPointName]['JobName']
          endpoint = JobsToSubmitDict[runningEndPointName]['BigDataEndpoint']
          runningSiteName = JobsToSubmitDict[runningEndPointName]['SiteName']
          NameNode = JobsToSubmitDict[runningEndPointName]['BigDataEndpointNameNode']
          BigDataSoftware = JobsToSubmitDict[runningEndPointName]['BdSoftware']
          BigDataSoftwareVersion = JobsToSubmitDict[runningEndPointName]['BdSoftwareVersion']
          HLLName = JobsToSubmitDict[runningEndPointName]['HLLName']
          HLLVersion = JobsToSubmitDict[runningEndPointName]['HLLVersion']
          PublicIP = JobsToSubmitDict[runningEndPointName]['PublicIP']
          User = JobsToSubmitDict[runningEndPointName]['User']
          Port = JobsToSubmitDict[runningEndPointName]['Port']
          UsePilot = JobsToSubmitDict[runningEndPointName]['UsePilot']
          IsInteractive = JobsToSubmitDict[runningEndPointName]['IsInteractive']
          Arguments = JobsToSubmitDict[runningEndPointName]['Arguments']
          numBigDataJobsAllowed = JobsToSubmitDict[runningEndPointName]['NumBigDataJobsAllowedToSubmit']

          ret = pool.generateJobAndQueueIt( director.submitBigDataJobs,
                                            args = ( endpoint, numBigDataJobsAllowed, runningSiteName,
                                                     NameNode, BigDataSoftware, BigDataSoftwareVersion,
                                                     HLLName, HLLVersion, PublicIP, Port, jobIDs,
                                                     runningEndPointName, jobName, User, self.jobDataset,
                                                     UsePilot, IsInteractive ),
                                            oCallback = self.callBack,
                                            oExceptionCallback = director.exceptionCallBack,
                                            blocking = False )
          if not ret['OK']:
            # Disable submission until next iteration
            self.directors[directorName]['isEnabled'] = False
          else:
            time.sleep( self.am_getOption( 'ThreadStartDelay' ) )

    if 'Default' in self.pools:
      # only for those in "Default' thread Pool
      # for pool in self.pools:
      self.pools['Default'].processAllResults()

    return DIRAC.S_OK()
def execute(self):
    """Main Agent code:
      1.- Query TaskQueueDB for existing TQs
      2.- Add their Priorities
      3.- Submit pilots

    Computes per-priority and per-job pilot shares from the
    'pilotsPerIteration' option, then, per TaskQueue, discounts pilots
    already waiting (within 'maxPilotWaitingHours') and queues the rest
    via submitPilotsForTaskQueue.  Returns S_OK()/S_OK('No waiting Jobs')
    or an error dict.
    """
    self.__checkSubmitPools()

    self.directorDict = getResourceDict()

    result = taskQueueDB.getMatchingTaskQueues(self.directorDict)
    if not result['OK']:
        self.log.error('Could not retrieve TaskQueues from TaskQueueDB',
                       result['Message'])
        return result
    taskQueueDict = result['Value']

    self.log.info('Found %s TaskQueues' % len(taskQueueDict))
    if not taskQueueDict:
        self.log.info('No TaskQueue to Process')
        return S_OK()

    prioritySum = 0
    waitingJobs = 0
    for taskQueueID in taskQueueDict:
        taskQueueDict[taskQueueID]['TaskQueueID'] = taskQueueID
        prioritySum += taskQueueDict[taskQueueID]['Priority']
        waitingJobs += taskQueueDict[taskQueueID]['Jobs']

    self.log.info('Sum of Priorities %s' % prioritySum)

    if waitingJobs == 0:
        self.log.info('No waiting Jobs')
        return S_OK('No waiting Jobs')
    if prioritySum <= 0:
        return S_ERROR('Wrong TaskQueue Priorities')

    # Pilot shares used later by submitPilotsForTaskQueue
    self.pilotsPerPriority = self.am_getOption(
        'pilotsPerIteration') / prioritySum
    self.pilotsPerJob = self.am_getOption(
        'pilotsPerIteration') / waitingJobs

    # submittedPilots is updated from the pool callback, hence the lock
    self.callBackLock.acquire()
    self.submittedPilots = 0
    self.callBackLock.release()
    self.toSubmitPilots = 0
    waitingStatusList = ['Submitted', 'Ready', 'Scheduled', 'Waiting']
    timeLimitToConsider = Time.toString(
        Time.dateTime() -
        Time.hour * self.am_getOption("maxPilotWaitingHours"))

    for taskQueueID in taskQueueDict:
        self.log.verbose('Processing TaskQueue', taskQueueID)

        # Pilots already waiting for this TQ count against its share
        result = pilotAgentsDB.countPilots(
            {
                'TaskQueueID': taskQueueID,
                'Status': waitingStatusList
            }, None, timeLimitToConsider)
        if not result['OK']:
            self.log.error('Fail to get Number of Waiting pilots',
                           result['Message'])
            waitingPilots = 0
        else:
            waitingPilots = result['Value']
        self.log.verbose(
            'Waiting Pilots for TaskQueue %s:' % taskQueueID, waitingPilots)

        result = self.submitPilotsForTaskQueue(taskQueueDict[taskQueueID],
                                               waitingPilots)
        if result['OK']:
            self.toSubmitPilots += result['Value']

    self.log.info('Number of pilots to be Submitted %s' %
                  self.toSubmitPilots)

    # Now wait until all Jobs in the Default ThreadPool are processed
    if 'Default' in self.pools:
        # only for those in "Default' thread Pool
        # for pool in self.pools:
        self.pools['Default'].processAllResults()

    self.log.info('Number of pilots Submitted %s' % self.submittedPilots)

    return S_OK()
def execute( self ):
    """Main Agent code:
      1.- Query TaskQueueDB for existing TQs
      2.- Count Pending Jobs
      3.- Submit VMs

    For every RunningPod of every director: register/refresh the pod in
    the VirtualMachineDB and skip it unless 'Active'; count alive instances
    ('Running', 'Submitted', 'Wait_ssh_context', 'Contextualizing') against
    MaxInstances; then walk its cloud endpoints in random order, computing
    how many VMs to start per endpoint ('elastic' -> 1,
    'static' -> remaining capacity) and matching the pod requirements
    against the TaskQueueDB.  Accumulated requests are finally handed to
    the directors' thread pools.  Returns DIRAC.S_OK() or an error dict.

    NOTE(review): block-structure of the per-endpoint section was
    reconstructed from collapsed source; the TQ matching is assumed to be
    inside the endpoint-capacity check — confirm against VMDIRAC history.
    """
    self.__checkSubmitPools()

    imagesToSubmit = {}

    for directorName, directorDict in self.directors.items():
      self.log.verbose( 'Checking Director:', directorName )
      for runningPodName in directorDict['director'].runningPods:
        # Make sure the pod exists in the DB and its status is up to date
        result = virtualMachineDB.insertRunningPod(runningPodName)
        if not result['OK']:
          self.log.error( 'Error inserting/updating Running Pod %s in DB: %s' % ( runningPodName, result['Message'] ) )
          continue
        result = virtualMachineDB.setRunningPodStatus(runningPodName)
        if not result['OK']:
          self.log.error( 'Error in setRunningPodStatus %s: %s' % ( runningPodName, result['Message'] ) )
          continue
        result = virtualMachineDB.getRunningPodStatus(runningPodName)
        if not result['OK']:
          self.log.error( 'Error in getRunningPodStatus %s: %s' % ( runningPodName, result['Message'] ) )
          continue
        status = result[ 'Value' ]
        if status == 'Active':
          self.log.info( 'RunningPod %s is Active' % ( runningPodName ) )
        else:
          self.log.info( 'RunningPod %s is Unactive, do nothing' % ( runningPodName ) )
          continue

        runningPodDict = directorDict['director'].runningPods[runningPodName]
        imageName = runningPodDict['Image']

        # Instances already alive for this image, in any pre-running state
        instances = 0
        result = virtualMachineDB.getInstancesByStatus( 'Running' )
        if result['OK'] and imageName in result['Value']:
          instances += len( result['Value'][imageName] )
        result = virtualMachineDB.getInstancesByStatus( 'Submitted' )
        if result['OK'] and imageName in result['Value']:
          instances += len( result['Value'][imageName] )
        result = virtualMachineDB.getInstancesByStatus( 'Wait_ssh_context' )
        if result['OK'] and imageName in result['Value']:
          instances += len( result['Value'][imageName] )
        result = virtualMachineDB.getInstancesByStatus( 'Contextualizing' )
        if result['OK'] and imageName in result['Value']:
          instances += len( result['Value'][imageName] )
        self.log.verbose( 'Checking Image %s:' % imageName, instances )

        maxInstances = runningPodDict['MaxInstances']
        if instances >= maxInstances:
          self.log.info( '%s >= %s Running instances reach MaxInstances for runningPod: %s, skipping' % ( instances, maxInstances, runningPodName ) )
          continue

        cloudEndpointsStr = runningPodDict['CloudEndpoints']
        # random failover order over the configured endpoints
        cloudEndpoints = [element for element in cloudEndpointsStr.split( ',' )]
        shuffle( cloudEndpoints )
        self.log.info( 'cloudEndpoints random failover: %s' % cloudEndpoints )

        numVMs = 0
        numVMsToSubmit = {}
        for endpoint in cloudEndpoints:
          self.log.info( 'Checking to submit to: %s' % endpoint )
          strMaxEndpointInstances = gConfig.getValue( "/Resources/VirtualMachines/CloudEndpoints/%s/%s" % ( endpoint, 'maxEndpointInstances' ), "" )
          if not strMaxEndpointInstances:
            self.log.info( 'CS CloudEndpoint %s has no define maxEndpointInstances option' % endpoint )
            continue
          self.log.info( 'CS CloudEndpoint %s maxEndpointInstance: %s' % (endpoint,strMaxEndpointInstances) )
          vmPolicy = gConfig.getValue( "/Resources/VirtualMachines/CloudEndpoints/%s/%s" % ( endpoint, 'vmPolicy' ), "" )
          if not vmPolicy:
            self.log.info( 'CS CloudEndpoint %s has no define vmPolicy option' % endpoint )
            continue
          self.log.info( 'CS CloudEndpoint %s vmPolicy: %s' % (endpoint,vmPolicy) )

          # Per-endpoint instance count, same four states as above
          endpointInstances = 0
          result = virtualMachineDB.getInstancesByStatusAndEndpoint( 'Running', endpoint )
          if result['OK'] and imageName in result['Value']:
            endpointInstances += len( result['Value'][imageName] )
          result = virtualMachineDB.getInstancesByStatusAndEndpoint( 'Submitted', endpoint )
          if result['OK'] and imageName in result['Value']:
            endpointInstances += len( result['Value'][imageName] )
          result = virtualMachineDB.getInstancesByStatusAndEndpoint( 'Wait_ssh_context', endpoint )
          if result['OK'] and imageName in result['Value']:
            endpointInstances += len( result['Value'][imageName] )
          result = virtualMachineDB.getInstancesByStatusAndEndpoint( 'Contextualizing', endpoint )
          if result['OK'] and imageName in result['Value']:
            endpointInstances += len( result['Value'][imageName] )
          self.log.info( 'CS CloudEndpoint %s instances: %s, maxEndpointInstances: %s' % (endpoint,endpointInstances,strMaxEndpointInstances) )

          maxEndpointInstances = int(strMaxEndpointInstances)
          if endpointInstances < maxEndpointInstances:
            # 'elastic': grow one VM at a time; 'static': fill remaining slots
            if vmPolicy == 'elastic':
              numVMs = 1
            if vmPolicy == 'static':
              numVMs = maxEndpointInstances - endpointInstances
            numVMsToSubmit.update({str(endpoint): int(numVMs) })

            # site to match with TQ:
            siteToMatch = gConfig.getValue( "/Resources/VirtualMachines/CloudEndpoints/%s/%s" % ( endpoint, 'siteName' ), "" )
            runningPodRequirementsDict = runningPodDict['Requirements']
            runningPodRequirementsDict['Site'] = siteToMatch
            self.log.verbose( 'Requirements to match: ', runningPodRequirementsDict )

            result = taskQueueDB.getMatchingTaskQueues( runningPodRequirementsDict )
            if not result['OK']:
              self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] )
              return result
            taskQueueDict = result['Value']
            self.log.verbose( 'Task Queues Dict: ', taskQueueDict )

            jobs = 0
            priority = 0
            cpu = 0
            for tq in taskQueueDict:
              jobs += taskQueueDict[tq]['Jobs']
              priority += taskQueueDict[tq]['Priority']
              cpu += taskQueueDict[tq]['Jobs'] * taskQueueDict[tq]['CPUTime']

            if not jobs:
              self.log.info( 'No matching jobs for %s found, skipping' % imageName )
              continue

            # NOTE(review): this uses `break` (give up on all endpoints of the
            # pod) where the plain image scheduler uses `continue` — presumably
            # intentional, since the CPU backlog is per-pod; confirm.
            if instances and ( cpu / instances ) < runningPodDict['CPUPerInstance']:
              self.log.info( 'Waiting CPU per Running instance %s < %s, skipping' % ( cpu / instances, runningPodDict['CPUPerInstance'] ) )
              break

            if directorName not in imagesToSubmit:
              imagesToSubmit[directorName] = {}
            if imageName not in imagesToSubmit[directorName]:
              imagesToSubmit[directorName][imageName] = {}
            numVMs = numVMsToSubmit.get( endpoint )
            imagesToSubmit[directorName][imageName] = { 'Jobs': jobs,
                                                        'TQPriority': priority,
                                                        'CPUTime': cpu,
                                                        'CloudEndpoint': endpoint,
                                                        'NumVMsToSubmit': numVMs,
                                                        'VMPolicy': vmPolicy,
                                                        'RunningPodName': runningPodName,
                                                        'VMPriority': runningPodDict['Priority'] }

    # Submission phase: queue submitInstance calls on the directors' pools.
    # NOTE(review): the `numVMs > 0` guard reads the leftover loop variable
    # from the matching phase, before the per-image value is loaded below —
    # looks suspicious; verify before relying on it.
    for directorName, imageOfJobsToSubmitDict in imagesToSubmit.items():
      for imageName, jobsToSubmitDict in imageOfJobsToSubmitDict.items():
        if self.directors[directorName]['isEnabled'] and numVMs > 0:
          self.log.info( 'Requesting submission of %s to %s' % ( imageName, directorName ) )

          director = self.directors[directorName]['director']
          pool = self.pools[self.directors[directorName]['pool']]

          endpoint = jobsToSubmitDict['CloudEndpoint']
          runningPodName = jobsToSubmitDict['RunningPodName']
          numVMs = jobsToSubmitDict['NumVMsToSubmit']

          ret = pool.generateJobAndQueueIt( director.submitInstance,
                                            args = ( imageName, endpoint, numVMs, runningPodName ),
                                            oCallback = self.callBack,
                                            oExceptionCallback = director.exceptionCallBack,
                                            blocking = False )
          if not ret['OK']:
            # Disable submission until next iteration
            self.directors[directorName]['isEnabled'] = False
          else:
            time.sleep( self.am_getOption( 'ThreadStartDelay' ) )

    if 'Default' in self.pools:
      # only for those in "Default' thread Pool
      # for pool in self.pools:
      self.pools['Default'].processAllResults()

    return DIRAC.S_OK()
def execute(self):
    """Main Agent code:
      1.- Query TaskQueueDB for existing TQs
      2.- Count Pending Jobs
      3.- Submit VMs

    For every image of every director: count alive instances
    ('Running' + 'Submitted'), skip images at MaxInstances or without
    matching waiting jobs, and queue submitInstance calls on the
    director's thread pool for the rest.  Returns DIRAC.S_OK() or the
    error dict from a failed TaskQueueDB query.
    """
    self.__checkSubmitPools()

    imagesToSubmit = {}

    for directorName, directorDict in self.directors.items():
        self.log.verbose('Checking Director:', directorName)
        for imageName in directorDict['director'].images:
            imageDict = directorDict['director'].images[imageName]
            # Instances already accounted for: running plus still-submitted
            instances = 0
            result = virtualMachineDB.getInstancesByStatus('Running')
            if result['OK'] and imageName in result['Value']:
                instances += len(result['Value'][imageName])
            result = virtualMachineDB.getInstancesByStatus('Submitted')
            if result['OK'] and imageName in result['Value']:
                instances += len(result['Value'][imageName])
            self.log.verbose('Checking Image %s:' % imageName, instances)
            maxInstances = imageDict['MaxInstances']
            if instances >= maxInstances:
                self.log.info(
                    '%s >= %s Running instances of %s, skipping' %
                    (instances, maxInstances, imageName))
                continue

            # Aggregate matching TaskQueue statistics for this image
            imageRequirementsDict = imageDict['RequirementsDict']
            result = taskQueueDB.getMatchingTaskQueues(
                imageRequirementsDict)
            if not result['OK']:
                self.log.error(
                    'Could not retrieve TaskQueues from TaskQueueDB',
                    result['Message'])
                return result
            taskQueueDict = result['Value']

            jobs = 0
            priority = 0
            cpu = 0
            for tq in taskQueueDict:
                jobs += taskQueueDict[tq]['Jobs']
                priority += taskQueueDict[tq]['Priority']
                cpu += taskQueueDict[tq]['Jobs'] * taskQueueDict[tq][
                    'CPUTime']

            if not jobs:
                self.log.info('No matching jobs for %s found, skipping' %
                              imageName)
                continue

            # Hold off while pending CPU per running instance is below the
            # configured CPUPerInstance threshold
            if instances and (cpu / instances) < imageDict['CPUPerInstance']:
                self.log.info(
                    'Waiting CPU per Running instance %s < %s, skipping' %
                    (cpu / instances, imageDict['CPUPerInstance']))
                continue

            if directorName not in imagesToSubmit:
                imagesToSubmit[directorName] = {}
            if imageName not in imagesToSubmit[directorName]:
                imagesToSubmit[directorName][imageName] = {}
            imagesToSubmit[directorName][imageName] = {
                'Jobs': jobs,
                'TQPriority': priority,
                'CPUTime': cpu,
                'VMPriority': imageDict['Priority']
            }

    # Second phase: hand the accumulated requests to the thread pools
    for directorName, imageDict in imagesToSubmit.items():
        for imageName, jobsDict in imageDict.items():
            if self.directors[directorName]['isEnabled']:
                self.log.info('Requesting submission of %s to %s' %
                              (imageName, directorName))

                director = self.directors[directorName]['director']
                pool = self.pools[self.directors[directorName]['pool']]

                ret = pool.generateJobAndQueueIt(
                    director.submitInstance,
                    args=(imageName, self.workDir),
                    oCallback=self.callBack,
                    oExceptionCallback=director.exceptionCallBack,
                    blocking=False)
                if not ret['OK']:
                    # Disable submission until next iteration
                    self.directors[directorName]['isEnabled'] = False
                else:
                    time.sleep(self.am_getOption('ThreadStartDelay'))

    if 'Default' in self.pools:
        # only for those in "Default' thread Pool
        # for pool in self.pools:
        self.pools['Default'].processAllResults()

    return DIRAC.S_OK()