def submitJobs(self):
    """ Go through defined computing elements and submit pilots if necessary

    The Matcher is first asked whether any task queue matches the overall
    requirements of this director.  Then every configured queue is visited
    in random order: the slots reported by the CE are compared with the
    number of eligible jobs (minus the already waiting pilots when
    ``self.pilotWaitingFlag`` is set), the missing pilots are submitted in
    chunks and each submitted pilot is registered in the PilotAgentsDB
    against a task queue drawn proportionally to the task queue priorities.

    :return: S_OK() when the cycle completes; otherwise S_ERROR or the failed
             result structure of the call that aborted the cycle
    """
    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
        return S_ERROR('Can not get the site mask')
    siteMaskList = result['Value']

    # random.shuffle() needs a real list; dict.keys() is only one on Python 2
    queues = list(self.queueDict)
    random.shuffle(queues)
    for queue in queues:
        queueInfo = self.queueDict[queue]
        ce = queueInfo['CE']
        ceName = queueInfo['CEName']
        ceType = queueInfo['CEType']
        queueName = queueInfo['QueueName']
        siteName = queueInfo['Site']
        siteMask = siteName in siteMaskList

        if 'CPUTime' in queueInfo['ParametersDict']:
            queueCPUTime = int(queueInfo['ParametersDict']['CPUTime'])
        else:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Get the working proxy
        cpuTime = queueCPUTime + 86400
        self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
        if not result['OK']:
            return result
        self.proxy = result['Value']
        ce.setProxy(self.proxy, cpuTime - 60)

        # Get the number of available slots on the target site/queue
        result = ce.available()
        if not result['OK']:
            self.log.warn('Failed to check the availability of queue %s: \n%s' % (queue, result['Message']))
            continue
        ceInfoDict = result['CEInfoDict']
        self.log.info("CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" %
                      (ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                       ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs']))
        totalSlots = result['Value']

        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        if not siteMask and 'Site' in ceDict:
            self.log.info('Site not in the mask %s' % siteName)
            self.log.info('Removing "Site" from matching Dict')
            del ceDict['Site']
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.info('No matching TQs found')
            continue

        tqIDList = list(taskQueueDict)
        totalTQJobs = 0
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]['Jobs']

        # Get the number of already waiting pilots for this queue
        totalWaitingPilots = 0
        if self.pilotWaitingFlag:
            lastUpdateTime = dateTime() - self.pilotWaitingTime * second
            result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                                'Status': WAITING_PILOT_STATUS},
                                               None, lastUpdateTime)
            if not result['OK']:
                # Best effort: carry on as if no pilot was waiting
                self.log.error('Failed to get Number of Waiting pilots', result['Message'])
                totalWaitingPilots = 0
            else:
                totalWaitingPilots = result['Value']
                self.log.verbose('Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots)

        pilotsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingPilots))
        self.log.info('Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' %
                      (totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit))

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

        while pilotsToSubmit > 0:
            self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))

            bundleProxy = queueInfo.get('BundleProxy', False)
            jobExecDir = ''
            if ceType == 'CREAM':
                jobExecDir = '.'
            jobExecDir = queueInfo.get('JobExecDir', jobExecDir)
            httpProxy = queueInfo.get('HttpProxy', '')

            result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir)
            if not result['OK']:
                return result
            executable, pilotSubmissionChunk = result['Value']

            result = ce.submitJob(executable, '', pilotSubmissionChunk)
            if not result['OK']:
                self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
                # Give up on this queue for the current cycle
                break
            pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk

            # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
            # task queue priorities
            pilotList = result['Value']
            self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
            stampDict = result.get('PilotStampDict', {})

            # Cumulative priorities: a uniform draw in [0, sumPriority) picks
            # the first task queue whose cumulative value exceeds the draw
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append((tq, sumPriority))

            pilotsByTQ = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                # Default to the last task queue so that tqID is always bound,
                # even when no cumulative priority exceeds the draw (e.g. all
                # priorities are zero)
                tqID = tqPriorityList[-1][0]
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                pilotsByTQ.setdefault(tqID, []).append(pilotID)

            for tqID, tqPilots in pilotsByTQ.items():
                result = pilotAgentsDB.addPilotTQReference(tqPilots,
                                                           tqID,
                                                           self.pilotDN,
                                                           self.pilotGroup,
                                                           self.localhost,
                                                           ceType,
                                                           '',
                                                           stampDict)
                if not result['OK']:
                    # Pilots that could not be registered get no status update
                    self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
                    continue
                for pilot in tqPilots:
                    result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                          'Successfully submitted by the SiteDirector',
                                                          siteName, queueName)
                    if not result['OK']:
                        self.log.error('Failed to set pilot status: ', result['Message'])

    return S_OK()
def submitJobs(self):
    """ Go through defined computing elements and submit pilots if necessary

    The Matcher is first asked for the task queues matching the overall
    requirements (including the union of all queue tags) of this director.
    From the answer the sites with waiting jobs are collected, then every
    configured queue is visited in random order.  Queues that failed recently,
    have no expected workload or belong to a site outside the mask (unless
    listed in ``testSites``) are skipped.  For each processor tag
    ("<N>Processors" or "WholeNode") shared by the queue and the matching task
    queues, the missing pilots are submitted in chunks and registered in the
    PilotAgentsDB against a task queue drawn proportionally to the task queue
    priorities.

    :return: S_OK() when the cycle completes; otherwise S_ERROR or the failed
             result structure of the call that aborted the cycle
    """
    # A real list is required below by random.shuffle(); dict.keys() is only
    # one on Python 2
    queues = list(self.queueDict)

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites

    tags = []
    for queue in queues:
        tags += self.queueDict[queue]['ParametersDict']['Tags']
    tqDict['Tag'] = list(set(tags))

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # Collect the sites for which jobs are waiting
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
        tqInfo = result['Value'][tqID]
        if "Sites" in tqInfo:
            for site in tqInfo['Sites']:
                if site.lower() != 'any':
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in tqInfo:
            if "Sites" in tqInfo:
                for site in tqInfo['Sites']:
                    if site.lower() != 'any':
                        testSites.add(site)
        totalWaitingJobs += tqInfo['Jobs']

    tqIDList = list(result['Value'])
    self.log.info(tqIDList)
    result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                        'Status': WAITING_PILOT_STATUS},
                                       None)
    tagWaitingPilots = 0
    if result['OK']:
        tagWaitingPilots = result['Value']
    self.log.info('Total %d jobs in %d task queues with %d waiting pilots' %
                  (totalWaitingJobs, len(tqIDList), tagWaitingPilots))
    self.log.info('Queues: ', self.queueDict.keys())
    # if tagWaitingPilots >= totalWaitingJobs:
    #   self.log.info( 'No more pilots to be submitted in this cycle' )
    #   return S_OK()

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
        return S_ERROR('Can not get the site mask')
    siteMaskList = result['Value']

    random.shuffle(queues)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

        # Check if the queue failed previously
        failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
        if failedCount != 0:
            # Report the remaining cycles relative to the configured factor
            # rather than to a hard-coded 10
            self.log.warn("%s queue failed recently, skipping %d cycles" %
                          (queue, self.failedQueueCycleFactor - failedCount))
            self.failedQueues[queue] += 1
            continue

        queueInfo = self.queueDict[queue]
        ce = queueInfo['CE']
        ceName = queueInfo['CEName']
        ceType = queueInfo['CEType']
        queueName = queueInfo['QueueName']
        siteName = queueInfo['Site']
        platform = queueInfo['Platform']
        queueTags = queueInfo['ParametersDict']['Tags']
        siteMask = siteName in siteMaskList

        # Only the processor-related tags of the queue take part in the matching
        processorTags = []
        for tag in queueTags:
            if re.match(r'^[0-9]+Processors$', tag):
                processorTags.append(tag)
        if 'WholeNode' in queueTags:
            processorTags.append('WholeNode')

        if not anySite and siteName not in jobSites:
            self.log.verbose("Skipping queue %s at %s: no workload expected" % (queueName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s at site %s not in the mask" % (queueName, siteName))
            continue

        if 'CPUTime' in queueInfo['ParametersDict']:
            queueCPUTime = int(queueInfo['ParametersDict']['CPUTime'])
        else:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        # if not siteMask and 'Site' in ceDict:
        #   self.log.info( 'Site not in the mask %s' % siteName )
        #   self.log.info( 'Removing "Site" from matching Dict' )
        #   del ceDict[ 'Site' ]
        if not siteMask:
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools

        result = Resources.getCompatiblePlatforms(platform)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']
        ceDict['Tag'] = processorTags

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found for %s' % queue)
            continue

        matchedQueues += 1
        totalTQJobs = 0
        totalTQJobsByProcessors = {}
        tqIDListByProcessors = {}
        for tq in taskQueueDict:
            if 'Tags' not in taskQueueDict[tq]:
                # skip non multiprocessor tqs
                continue
            for tag in taskQueueDict[tq]['Tags']:
                if tag in processorTags:
                    tqIDListByProcessors.setdefault(tag, []).append(tq)
                    totalTQJobsByProcessors.setdefault(tag, 0)
                    totalTQJobsByProcessors[tag] += taskQueueDict[tq]['Jobs']
            totalTQJobs += taskQueueDict[tq]['Jobs']

        self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' %
                         (totalTQJobs, len(taskQueueDict), queue))

        queueSubmittedPilots = 0
        for tag in tqIDListByProcessors:

            self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" % (tag, tqIDListByProcessors[tag]))

            # Number of processors requested per pilot; -1 stands for WholeNode
            processors = 1
            m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
            if m:
                processors = int(m.group('processors'))
            if tag == 'WholeNode':
                processors = -1

            tagTQJobs = totalTQJobsByProcessors[tag]
            tagTqIDList = tqIDListByProcessors[tag]

            # Get the number of already waiting pilots for these task queues
            tagWaitingPilots = 0
            if self.pilotWaitingFlag:
                lastUpdateTime = dateTime() - self.pilotWaitingTime * second
                result = pilotAgentsDB.countPilots({'TaskQueueID': tagTqIDList,
                                                    'Status': WAITING_PILOT_STATUS},
                                                   None, lastUpdateTime)
                if not result['OK']:
                    # Best effort: carry on as if no pilot was waiting
                    self.log.error('Failed to get Number of Waiting pilots', result['Message'])
                    tagWaitingPilots = 0
                else:
                    tagWaitingPilots = result['Value']
                    self.log.verbose('Waiting Pilots for TaskQueue %s:' % tagTqIDList, tagWaitingPilots)
            if tagWaitingPilots >= tagTQJobs:
                self.log.verbose("%d waiting pilots already for all the available jobs" % tagWaitingPilots)
                continue

            self.log.verbose("%d waiting pilots for the total of %d eligible jobs for %s" %
                             (tagWaitingPilots, tagTQJobs, queue))

            # Get the working proxy
            cpuTime = queueCPUTime + 86400
            self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            totalSlots = self.getQueueSlots(queue, False)
            if totalSlots == 0:
                self.log.debug('%s: No slots available' % queue)
                continue

            # Note: comparing slots to job numbers is not accurate in multiprocessor case.
            # This could lead to over submission.
            pilotsToSubmit = max(0, min(totalSlots, tagTQJobs - tagWaitingPilots))
            self.log.info('%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' %
                          (queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit))

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit - queueSubmittedPilots, pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))

                bundleProxy = queueInfo.get('BundleProxy', False)
                jobExecDir = queueInfo['ParametersDict'].get('JobExecDir', '')
                httpProxy = queueInfo['ParametersDict'].get('HttpProxy', '')

                result = self.getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir, processors)
                if not result['OK']:
                    return result
                executable, pilotSubmissionChunk = result['Value']

                result = ce.submitJob(executable, '', pilotSubmissionChunk, processors=processors)
                # ## FIXME: The condor thing only transfers the file with some
                # ## delay, so when we unlink here the script is gone
                # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
                if ceType != 'HTCondorCE':
                    os.unlink(executable)
                if not result['OK']:
                    self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
                    self.failedQueues[queue] += 1
                    # Give up on this tag for the current cycle
                    break

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                queueSubmittedPilots += pilotSubmissionChunk

                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result['Value']
                self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
                totalSubmittedPilots += len(pilotList)
                self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
                stampDict = result.get('PilotStampDict', {})

                # Cumulative priorities: a uniform draw in [0, sumPriority)
                # picks the first task queue whose cumulative value exceeds it
                tqPriorityList = []
                sumPriority = 0.
                for tq in tagTqIDList:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))

                pilotsByTQ = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    # Default to the last task queue so that tqID is always
                    # bound, even when no cumulative priority exceeds the draw
                    # (e.g. all priorities are zero)
                    tqID = tqPriorityList[-1][0]
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    pilotsByTQ.setdefault(tqID, []).append(pilotID)

                for tqID, tqPilots in pilotsByTQ.items():
                    result = pilotAgentsDB.addPilotTQReference(tqPilots,
                                                               tqID,
                                                               self.pilotDN,
                                                               self.pilotGroup,
                                                               self.localhost,
                                                               ceType,
                                                               '',
                                                               stampDict)
                    if not result['OK']:
                        # Pilots that could not be registered get no status update
                        self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
                        continue
                    for pilot in tqPilots:
                        result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                              'Successfully submitted by the SiteDirector',
                                                              siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: ', result['Message'])

    self.log.info("%d pilots submitted in total in this cycle, %d matched queues" %
                  (totalSubmittedPilots, matchedQueues))
    return S_OK()
def _submitPilots(self, workDir, taskQueueDict, pilotOptions, pilotsToSubmit,
                  ceMask, submitPrivatePilot, privateTQ, proxy, pilotsPerJob):
    """
      This method does the actual pilot submission to the Grid RB
      The logic is as follows:
      - If there are no available RB it return error
      - If there is no VOMS extension in the proxy, return error
      - It creates a temp directory
      - Prepare a JDL
        it has some part common to gLite and LCG (the payload description)
        it has some part specific to each middleware
      - Optionally (``self.enableListMatch``) check that some CE matches
      - Submit, register the pilot references in the PilotAgentsDB and, when
        more pilots than ``pilotsPerJob`` are requested, request a new proxy
        token and recurse for the remainder

      :param workDir: directory under which the temporary working directory is created
      :param taskQueueDict: task queue description; must contain 'TaskQueueID'
      :param pilotOptions: list of pilot option strings; it is modified in place
                           when a new proxy token has to be injected
      :param pilotsToSubmit: total number of pilots still to be submitted
      :param ceMask: passed through to ``_prepareJDL``
      :param submitPrivatePilot: passed through to ``_prepareJDL``
      :param privateTQ: passed through to ``_prepareJDL``
      :param proxy: proxy object used for the submission
      :param pilotsPerJob: number of pilots requested per submitted job

      :return: S_OK( number of submitted pilots ) or an S_ERROR structure;
               token and recursion failures additionally carry the number of
               pilots submitted so far under 'Value'
    """
    taskQueueID = taskQueueDict['TaskQueueID']
    # ownerDN = taskQueueDict['OwnerDN']
    credResult = proxy.getCredentials()
    if not credResult['OK']:
        # Report the failure instead of raising KeyError on the missing 'Value'
        return credResult
    credDict = credResult['Value']
    ownerDN = credDict['identity']
    ownerGroup = credDict['group']

    if not self.resourceBrokers:
        # Since we can exclude RBs from the list, it may become empty
        return S_ERROR(ERROR_RB)

    # Need to get VOMS extension for the later interactions with WMS
    ret = gProxyManager.getVOMSAttributes(proxy)
    if not ret['OK']:
        self.log.error(ERROR_VOMS, ret['Message'])
        return S_ERROR(ERROR_VOMS)
    if not ret['Value']:
        return S_ERROR(ERROR_VOMS)
    vomsGroup = ret['Value'][0]

    workingDirectory = tempfile.mkdtemp(prefix='TQ_%s_' % taskQueueID, dir=workDir)
    self.log.verbose('Using working Directory:', workingDirectory)

    # Write JDL
    retDict = self._prepareJDL(taskQueueDict, workingDirectory, pilotOptions, pilotsPerJob,
                               ceMask, submitPrivatePilot, privateTQ)
    jdl = retDict['JDL']
    pilotRequirements = retDict['Requirements']
    rb = retDict['RB']
    if not jdl:
        # Removal of the temporary directory is best effort
        shutil.rmtree(workingDirectory, ignore_errors=True)
        return S_ERROR(ERROR_JDL)

    # Check that there are available queues for the Job:
    if self.enableListMatch:
        now = Time.dateTime()
        availableCEs = self.listMatchCache.get(pilotRequirements)
        # The comparisons with False are kept exactly as in the original;
        # presumably False marks a cache miss / a failed list-match
        if availableCEs == False:
            availableCEs = self._listMatch(proxy, jdl, taskQueueID, rb)
            if availableCEs != False:
                self.log.verbose('LastListMatch', now)
                self.log.verbose('AvailableCEs ', availableCEs)
                # listMatchDelay is given in minutes
                self.listMatchCache.add(pilotRequirements, self.listMatchDelay * 60,
                                        value=availableCEs)
        if not availableCEs:
            shutil.rmtree(workingDirectory, ignore_errors=True)
            return S_ERROR(ERROR_CE + ' TQ: %d' % taskQueueID)

    # Now we are ready for the actual submission, so
    self.log.verbose('Submitting Pilots for TaskQueue', taskQueueID)
    submitRet = self._submitPilot(proxy, pilotsPerJob, jdl, taskQueueID, rb)
    shutil.rmtree(workingDirectory, ignore_errors=True)
    if not submitRet:
        return S_ERROR('Pilot Submission Failed for TQ %d ' % taskQueueID)

    # pilotReference, resourceBroker = submitRet
    submittedPilots = 0

    # Parametric jobs are used when several pilots were requested per job but
    # the submission returned a different number of references
    parametric = pilotsPerJob != 1 and len(submitRet) != pilotsPerJob
    for pilotReference, resourceBroker in submitRet:
        if parametric:
            pilotReferences = self._getChildrenReferences(proxy, pilotReference, taskQueueID)
        else:
            pilotReferences = [pilotReference]
        submittedPilots += len(pilotReferences)
        pilotAgentsDB.addPilotTQReference(pilotReferences, taskQueueID, ownerDN,
                                          ownerGroup, resourceBroker, self.gridMiddleware,
                                          pilotRequirements)

    # add some sleep here
    time.sleep(0.1 * submittedPilots)

    if pilotsToSubmit > pilotsPerJob:
        # Additional submissions are necessary, need to get a new token and iterate.
        pilotsToSubmit -= pilotsPerJob
        result = gProxyManager.requestToken(ownerDN, ownerGroup,
                                            max(pilotsToSubmit, self.maxJobsInFillMode))
        if not result['OK']:
            self.log.error(ERROR_TOKEN, result['Message'])
            result = S_ERROR(ERROR_TOKEN)
            result['Value'] = submittedPilots
            return result
        (token, numberOfUses) = result['Value']

        # Drop every stale token option.  The list is rebuilt in place (the
        # caller's list object is still the one updated) because removing
        # items while iterating over the same list skips elements.
        tokenPrefix = '-o /Security/ProxyToken='
        pilotOptions[:] = [option for option in pilotOptions
                           if not option.startswith(tokenPrefix)]
        pilotOptions.append('-o /Security/ProxyToken=%s' % token)

        pilotsPerJob = max(1, min(pilotsPerJob, int(numberOfUses / self.maxJobsInFillMode)))
        result = self._submitPilots(workDir, taskQueueDict, pilotOptions,
                                    pilotsToSubmit, ceMask,
                                    submitPrivatePilot, privateTQ,
                                    proxy, pilotsPerJob)
        if not result['OK']:
            # Still report what has been submitted before the failure
            if 'Value' not in result:
                result['Value'] = 0
            result['Value'] += submittedPilots
            return result
        submittedPilots += result['Value']

    return S_OK(submittedPilots)
def submitJobs(self):
    """ Go through defined computing elements and submit pilots if necessary

    For every configured queue the slots reported by the CE are compared with
    the number of jobs in the matching task queues; the smaller of the two is
    submitted as pilots in a single call and each submitted pilot is
    registered in the PilotAgentsDB against a task queue drawn proportionally
    to the task queue priorities.

    :return: S_OK() when the cycle completes; otherwise S_ERROR or the failed
             result structure of the call that aborted the cycle
    """
    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
        return S_ERROR('Can not get the site mask')
    siteMaskList = result['Value']

    for queue in self.queueDict:
        queueInfo = self.queueDict[queue]
        ce = queueInfo['CE']
        ceName = queueInfo['CEName']
        ceType = queueInfo['CEType']
        queueName = queueInfo['QueueName']
        siteName = queueInfo['Site']
        siteMask = siteName in siteMaskList

        if 'CPUTime' in queueInfo['ParametersDict']:
            queueCPUTime = int(queueInfo['ParametersDict']['CPUTime'])
        else:
            return S_ERROR('CPU time limit is not specified for queue %s' % queue)
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Get the working proxy
        cpuTime = queueCPUTime + 86400
        result = gProxyManager.getPilotProxyFromDIRACGroup(self.genericPilotDN,
                                                           self.genericPilotGroup,
                                                           cpuTime)
        if not result['OK']:
            return result
        self.proxy = result['Value']
        ce.setProxy(self.proxy, cpuTime - 60)

        result = ce.available()
        if not result['OK']:
            self.log.warn('Failed to check the availability of queue %s: %s' % (queue, result['Message']))
            continue
        totalSlots = result['Value']
        # NOTE(review): relies on the successful result also carrying a
        # 'Message' entry - confirm against the CE implementations
        self.log.verbose(result['Message'])

        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        if not siteMask and 'Site' in ceDict:
            self.log.info('Site not in the mask %s' % siteName)
            self.log.info('Removing "Site" from matching Dict')
            del ceDict['Site']

        result = taskQueueDB.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found')
            continue

        totalTQJobs = 0
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]['Jobs']

        pilotsToSubmit = min(totalSlots, totalTQJobs)
        self.log.verbose('Available slots=%d, TQ jobs=%d, Pilots to submit=%d' %
                         (totalSlots, totalTQJobs, pilotsToSubmit))

        if pilotsToSubmit <= 0:
            continue

        self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))

        bundleProxy = queueInfo.get('BundleProxy', False)
        result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy)
        if not result['OK']:
            return result
        executable = result['Value']

        result = ce.submitJob(executable, '', pilotsToSubmit)
        if not result['OK']:
            self.log.error('Failed submission to queue %s:' % queue, result['Message'])
            continue

        # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
        # task queue priorities
        pilotList = result['Value']
        stampDict = result.get('PilotStampDict', {})

        # Cumulative priorities: a uniform draw in [0, sumPriority) picks the
        # first task queue whose cumulative value exceeds the draw
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
            sumPriority += taskQueueDict[tq]['Priority']
            tqPriorityList.append((tq, sumPriority))

        pilotsByTQ = {}
        for pilotID in pilotList:
            rndm = random.random() * sumPriority
            # Default to the last task queue so that tqID is always bound,
            # even when no cumulative priority exceeds the draw (e.g. all
            # priorities are zero)
            tqID = tqPriorityList[-1][0]
            for tq, prio in tqPriorityList:
                if rndm < prio:
                    tqID = tq
                    break
            pilotsByTQ.setdefault(tqID, []).append(pilotID)

        for tqID, tqPilots in pilotsByTQ.items():
            result = pilotAgentsDB.addPilotTQReference(tqPilots,
                                                       tqID,
                                                       self.genericPilotDN,
                                                       self.genericPilotGroup,
                                                       self.localhost,
                                                       ceType,
                                                       '',
                                                       stampDict)
            if not result['OK']:
                # Pilots that could not be registered get no status update
                self.log.error('Failed add pilots to the PilotAgentsDB: %s' % result['Message'])
                continue
            for pilot in tqPilots:
                result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                      'Successfuly submitted by the SiteDirector',
                                                      siteName, queueName)
                if not result['OK']:
                    self.log.error('Failed to set pilot status: %s' % result['Message'])

    return S_OK()
def createVMs(self):
    """ Go through defined VM types and create VM instances if necessary

    The Matcher is first asked for the task queues matching the overall
    requirements of this director.  From the answer the sites with waiting
    jobs are collected, then every configured VM type is visited in random
    order.  VM types with no expected workload or belonging to a site outside
    the mask (unless listed in ``testSites``) are skipped.  For the others the
    number of instances to create is derived from the free slots, the eligible
    jobs and the VMs already counted as 'New' or 'Submitted' on the endpoint.
    Created instances are recorded in the VirtualMachineDB and one pilot
    reference per processor is registered in the PilotAgentsDB against a task
    queue drawn proportionally to the task queue priorities.

    :return: S_OK() when the cycle completes; otherwise S_ERROR or the failed
             result structure of the call that aborted the cycle
    """
    # A real list is required below by random.shuffle(); dict.keys() is only
    # one on Python 2
    vmTypeList = list(self.vmTypeDict)

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999}
    if self.vo:
        tqDict['VO'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites

    tags = []
    for vmType in vmTypeList:
        if 'Tag' in self.vmTypeDict[vmType]['ParametersDict']:
            tags += self.vmTypeDict[vmType]['ParametersDict']['Tag']
    tqDict['Tag'] = list(set(tags))
    # NOTE(review): the submit pool is hard-coded here - confirm whether it
    # should come from the configuration instead
    tqDict['SubmitPool'] = "wenmrPool"

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # Collect the sites for which jobs are waiting
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
        tqInfo = result['Value'][tqID]
        if "Sites" in tqInfo:
            for site in tqInfo['Sites']:
                if site.lower() != 'any':
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in tqInfo:
            if "Sites" in tqInfo:
                for site in tqInfo['Sites']:
                    if site.lower() != 'any':
                        testSites.add(site)
        totalWaitingJobs += tqInfo['Jobs']

    nTaskQueues = len(result['Value'])

    result = virtualMachineDB.getInstanceCounters('Status', {})
    totalVMs = 0
    if result['OK']:
        for status in result['Value']:
            if status in ['New', 'Submitted', 'Running']:
                totalVMs += result['Value'][status]
    self.log.info('Total %d jobs in %d task queues with %d VMs' % (totalWaitingJobs, nTaskQueues, totalVMs))

    # Check if the site is allowed in the mask
    result = self.wmsClient.getSiteMask()
    if not result['OK']:
        return S_ERROR('Can not get the site mask')
    siteMaskList = result['Value']

    random.shuffle(vmTypeList)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for vmType in vmTypeList:
        vmTypeInfo = self.vmTypeDict[vmType]
        ce = vmTypeInfo['CE']
        ceName = vmTypeInfo['CEName']
        vmTypeName = vmTypeInfo['VMType']
        siteName = vmTypeInfo['Site']
        platform = vmTypeInfo['Platform']
        vmTypeTags = vmTypeInfo['ParametersDict'].get('Tag', [])
        siteMask = siteName in siteMaskList
        endpoint = "%s::%s" % (siteName, ceName)
        maxInstances = int(vmTypeInfo['MaxInstances'])
        # vms support WholeNode naturally
        processorTags = ['WholeNode']

        if not anySite and siteName not in jobSites:
            self.log.verbose("Skipping queue %s at %s: no workload expected" % (vmTypeName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s: site %s not in the mask" % (vmTypeName, siteName))
            continue

        # Only the presence of CPUTime is required; its value is not used in
        # this method
        if 'CPUTime' not in vmTypeInfo['ParametersDict']:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % vmType)
            continue

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()
        if not siteMask:
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['VO'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(platform)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']
        ceDict['Tag'] = list(set(processorTags + vmTypeTags))

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found for %s' % vmType)
            continue

        matchedQueues += 1
        totalTQJobs = 0
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]['Jobs']

        self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' %
                         (totalTQJobs, len(taskQueueDict), vmType))

        # Get the number of already instantiated VMs for these task queues
        totalWaitingVMs = 0
        result = virtualMachineDB.getInstanceCounters('Status', {'Endpoint': endpoint})
        if result['OK']:
            for status in result['Value']:
                if status in ['New', 'Submitted']:
                    totalWaitingVMs += result['Value'][status]
        if totalWaitingVMs >= totalTQJobs:
            # No early exit here: vmsToSubmit computed below becomes 0 in this
            # case and the VM type is skipped at that point
            self.log.verbose("%d VMs already for all the available jobs" % totalWaitingVMs)

        self.log.verbose("%d VMs for the total of %d eligible jobs for %s" %
                         (totalWaitingVMs, totalTQJobs, vmType))

        # Get proxy to be used to connect to the cloud endpoint
        authType = ce.parameters.get('Auth')
        if authType and authType.lower() in ['x509', 'voms']:
            self.log.verbose("Getting cloud proxy for %s/%s" % (siteName, ceName))
            result = getProxyFileForCE(ce)
            if not result['OK']:
                continue
            ce.setProxy(result['Value'])

        # Get the number of available slots on the target site/endpoint
        totalSlots = self.getVMInstances(endpoint, maxInstances)
        if totalSlots == 0:
            self.log.debug('%s: No slots available' % vmType)
            continue

        vmsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingVMs))
        self.log.info('%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d' %
                      (vmType, totalSlots, totalTQJobs, totalWaitingVMs, vmsToSubmit))

        # Limit the number of VM instances to create to vmsToSubmit
        vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)
        if vmsToSubmit == 0:
            continue

        self.log.info('Going to submit %d VMs to %s queue' % (vmsToSubmit, vmType))
        result = ce.createInstances(vmsToSubmit)

        if not result['OK']:
            self.log.error('Failed submission to queue %s:\n' % vmType, result['Message'])
            self.failedVMTypes.setdefault(vmType, 0)
            self.failedVMTypes[vmType] += 1
            continue

        # Add VMs to the VirtualMachineDB
        vmDict = result['Value']
        totalSubmittedPilots += len(vmDict)
        self.log.info('Submitted %d VMs to %s@%s' % (len(vmDict), vmTypeName, ceName))

        pilotList = []
        for uuID in vmDict:
            diracUUID = vmDict[uuID]['InstanceID']
            result = virtualMachineDB.insertInstance(uuID, vmTypeName, diracUUID, endpoint, self.vo)
            if not result['OK']:
                # Instances that could not be recorded get no pilot references
                self.log.error('Failed to insert VM instance %s into the VirtualMachineDB: %s' %
                               (uuID, result['Message']))
                continue
            # One pilot reference per processor of the instance
            for ncpu in range(vmDict[uuID]['NumberOfProcessors']):
                pRef = 'vm://' + ceName + '/' + diracUUID + ':' + str(ncpu).zfill(2)
                pilotList.append(pRef)

        stampDict = {}

        # Cumulative priorities: a uniform draw in [0, sumPriority) picks the
        # first task queue whose cumulative value exceeds the draw
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
            sumPriority += taskQueueDict[tq]['Priority']
            tqPriorityList.append((tq, sumPriority))

        pilotsByTQ = {}
        for pilotID in pilotList:
            rndm = random.random() * sumPriority
            # Default to the last task queue so that tqID is always bound,
            # even when no cumulative priority exceeds the draw (e.g. all
            # priorities are zero)
            tqID = tqPriorityList[-1][0]
            for tq, prio in tqPriorityList:
                if rndm < prio:
                    tqID = tq
                    break
            pilotsByTQ.setdefault(tqID, []).append(pilotID)

        for tqID, tqPilots in pilotsByTQ.items():
            # NOTE(review): stampDict is passed as the 7th positional argument -
            # confirm it matches the addPilotTQReference signature in use
            result = pilotAgentsDB.addPilotTQReference(tqPilots,
                                                       tqID,
                                                       '',
                                                       '',
                                                       self.localhost,
                                                       'Cloud',
                                                       stampDict)
            if not result['OK']:
                self.log.error('Failed to insert pilots into the PilotAgentsDB: %s' % result['Message'])

    self.log.info("%d VMs submitted in total in this cycle, %d matched queues" %
                  (totalSubmittedPilots, matchedQueues))
    return S_OK()
def createVMs(self):
    """Create VM instances for the VM types that have eligible waiting jobs.

    The Matcher is first asked whether any task queue matches the overall
    requirements of this director. If so, every configured VM type is visited
    in random order: the task queues eligible for that VM type are looked up,
    the VMs already waiting at the endpoint ("New"/"Submitted") are subtracted
    from the number of eligible jobs, and the remaining number of instances
    (capped by the free slots and by ``self.maxVMsToSubmit``) is requested
    from the VM type's CE. Created instances are recorded in the
    VirtualMachineDB and their pilot references are attached to task queues
    chosen at random, weighted by the task queue priorities.

    :return: S_OK() when the cycle completes (also when there is nothing to
             do); the failed result structure when the platform lookup or a
             Matcher query fails; S_ERROR when the site mask is unavailable
    """
    vmTypeList = list(self.vmTypeDict.keys())

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {"Setup": setup, "CPUTime": 9999999}
    if self.vo:
        tqDict["VO"] = self.vo
    if self.voGroups:
        tqDict["OwnerGroup"] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result["OK"]:
        return result
    tqDict["Platform"] = result["Value"]
    tqDict["Site"] = self.sites

    # The overall request carries the union of the tags of all VM types
    tags = []
    for vmType in vmTypeList:
        if "Tag" in self.vmTypeDict[vmType]["ParametersDict"]:
            tags += self.vmTypeDict[vmType]["ParametersDict"]["Tag"]
    tqDict["Tag"] = list(set(tags))

    self.log.verbose("Checking overall TQ availability with requirements")
    self.log.verbose(tqDict)

    matcherClient = MatcherClient()
    result = matcherClient.getMatchingTaskQueues(tqDict)
    if not result["OK"]:
        return result
    if not result["Value"]:
        self.log.verbose("No Waiting jobs suitable for the director")
        return S_OK()

    # Collect the sites requested by the waiting jobs. A task queue without
    # a "Sites" entry, or with an "any" site, can run anywhere. Sites of task
    # queues that declare "JobTypes" are additionally remembered in testSites.
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result["Value"]:
        if "Sites" in result["Value"][tqID]:
            for site in result["Value"][tqID]["Sites"]:
                if site.lower() != "any":
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in result["Value"][tqID]:
            if "Sites" in result["Value"][tqID]:
                for site in result["Value"][tqID]["Sites"]:
                    if site.lower() != "any":
                        testSites.add(site)
        totalWaitingJobs += result["Value"][tqID]["Jobs"]

    tqIDList = list(result["Value"].keys())

    # Count the VMs known to the VirtualMachineDB (reported in the log only)
    result = virtualMachineDB.getInstanceCounters("Status", {})
    totalVMs = 0
    if result["OK"]:
        for status in result["Value"]:
            if status in ["New", "Submitted", "Running"]:
                totalVMs += result["Value"][status]
    self.log.info("Total %d jobs in %d task queues with %d VMs" % (totalWaitingJobs, len(tqIDList), totalVMs))

    # Check if the site is allowed in the mask
    result = self.siteClient.getUsableSites()
    if not result["OK"]:
        return S_ERROR("Can not get the site mask")
    siteMaskList = result.get("Value", [])

    # Visit the VM types in random order
    random.shuffle(vmTypeList)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for vmType in vmTypeList:
        ce = self.vmTypeDict[vmType]["CE"]
        ceName = self.vmTypeDict[vmType]["CEName"]
        vmTypeName = self.vmTypeDict[vmType]["VMType"]
        siteName = self.vmTypeDict[vmType]["Site"]
        platform = self.vmTypeDict[vmType]["Platform"]
        vmTypeTags = self.vmTypeDict[vmType]["ParametersDict"].get("Tag", [])
        siteMask = siteName in siteMaskList
        endpoint = "%s::%s" % (siteName, ceName)
        maxInstances = int(self.vmTypeDict[vmType]["MaxInstances"])
        # vms support WholeNode naturally
        processorTags = ["WholeNode"]

        if not anySite and siteName not in jobSites:
            self.log.verbose("Skipping queue %s at %s: no workload expected" % (vmTypeName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s: site %s not in the mask" % (vmTypeName, siteName))
            continue

        # A CPU time limit must be configured for the VM type to be used;
        # the value itself is not needed in this method.
        if "CPUTime" not in self.vmTypeDict[vmType]["ParametersDict"]:
            self.log.warn("CPU time limit is not specified for queue %s, skipping..." % vmType)
            continue

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()

        if not siteMask:
            # Sites outside the mask are only matched against "Test" jobs
            ceDict["JobType"] = "Test"
        if self.vo:
            ceDict["VO"] = self.vo
        if self.voGroups:
            ceDict["OwnerGroup"] = self.voGroups

        result = Resources.getCompatiblePlatforms(platform)
        if not result["OK"]:
            continue
        ceDict["Platform"] = result["Value"]
        ceDict["Tag"] = list(set(processorTags + vmTypeTags))

        # Get the number of eligible jobs for the target site/queue
        result = matcherClient.getMatchingTaskQueues(ceDict)
        if not result["OK"]:
            self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"])
            return result
        taskQueueDict = result["Value"]
        if not taskQueueDict:
            self.log.verbose("No matching TQs found for %s" % vmType)
            continue

        matchedQueues += 1
        totalTQJobs = 0
        tqIDList = list(taskQueueDict.keys())
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]["Jobs"]

        self.log.verbose(
            "%d job(s) from %d task queue(s) are eligible for %s queue" % (totalTQJobs, len(tqIDList), vmType)
        )

        # Get the number of already instantiated VMs for these task queues
        totalWaitingVMs = 0
        result = virtualMachineDB.getInstanceCounters("Status", {"Endpoint": endpoint})
        if result["OK"]:
            for status in result["Value"]:
                if status in ["New", "Submitted"]:
                    totalWaitingVMs += result["Value"][status]
        if totalWaitingVMs >= totalTQJobs:
            self.log.verbose("%d VMs already for all the available jobs" % totalWaitingVMs)

        self.log.verbose(
            "%d VMs for the total of %d eligible jobs for %s" % (totalWaitingVMs, totalTQJobs, vmType)
        )

        # Get proxy to be used to connect to the cloud endpoint
        authType = ce.parameters.get("Auth")
        if authType and authType.lower() in ["x509", "voms"]:
            self.log.verbose("Getting cloud proxy for %s/%s" % (siteName, ceName))
            result = getProxyFileForCloud(ce)
            if not result["OK"]:
                # Report the failure instead of skipping the VM type silently
                self.log.error("Failed to get cloud proxy for %s/%s" % (siteName, ceName), result["Message"])
                continue
            ce.setProxy(result["Value"])

        # Get the number of available slots on the target site/endpoint
        totalSlots = self.getVMInstances(endpoint, maxInstances)
        if totalSlots == 0:
            self.log.debug("%s: No slots available" % vmType)
            continue

        vmsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingVMs))
        self.log.info(
            "%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d"
            % (vmType, totalSlots, totalTQJobs, totalWaitingVMs, vmsToSubmit)
        )

        # Limit the number of VM instances to create to vmsToSubmit
        vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)
        if vmsToSubmit == 0:
            continue

        self.log.info("Going to submit %d VMs to %s queue" % (vmsToSubmit, vmType))
        result = ce.createInstances(vmsToSubmit)

        if not result["OK"]:
            self.log.error("Failed submission to queue %s:\n" % vmType, result["Message"])
            self.failedVMTypes.setdefault(vmType, 0)
            self.failedVMTypes[vmType] += 1
            continue

        # Add VMs to the VirtualMachineDB
        vmDict = result["Value"]
        totalSubmittedPilots += len(vmDict)
        self.log.info("Submitted %d VMs to %s@%s" % (len(vmDict), vmTypeName, ceName))

        pilotList = []
        for uuID in vmDict:
            diracUUID = vmDict[uuID]["InstanceID"]
            result = virtualMachineDB.insertInstance(uuID, vmTypeName, diracUUID, endpoint, self.vo)
            if not result["OK"]:
                # The instance is not registered, so no pilot reference is made for it
                self.log.error("Failed to insert instance %s into the VirtualMachineDB" % uuID, result["Message"])
                continue
            pilotList.append("vm://" + ceName + "/" + diracUUID + ":00")

        # Build the cumulative priority thresholds used for the weighted choice
        stampDict = {}
        tqPriorityList = []
        sumPriority = 0.0
        for tq in taskQueueDict:
            sumPriority += taskQueueDict[tq]["Priority"]
            tqPriorityList.append((tq, sumPriority))

        pilotsByTQ = {}
        for pilotID in pilotList:
            rndm = random.random() * sumPriority
            for tq, prio in tqPriorityList:
                if rndm < prio:
                    chosenTQ = tq
                    break
            else:
                # No threshold exceeds rndm (e.g. all priorities are 0): fall
                # back to the last task queue rather than reusing a stale ID.
                # taskQueueDict is non-empty here, so the list has an entry.
                chosenTQ = tqPriorityList[-1][0]
            pilotsByTQ.setdefault(chosenTQ, []).append(pilotID)

        for chosenTQ, tqPilots in pilotsByTQ.items():
            result = pilotAgentsDB.addPilotTQReference(tqPilots, chosenTQ, "", "", self.localhost, "Cloud", stampDict)
            if not result["OK"]:
                self.log.error("Failed to insert pilots into the PilotAgentsDB: %s" % result["Message"])

    self.log.info(
        "%d VMs submitted in total in this cycle, %d matched queues" % (totalSubmittedPilots, matchedQueues)
    )
    return S_OK()
def createVMs(self):
    """Create VM instances for the images that have eligible waiting jobs.

    The Matcher is first asked whether any task queue matches the overall
    requirements of this director. If so, every configured image is visited
    in random order: the task queues eligible for that image are looked up,
    the VMs already waiting at the endpoint ('New'/'Submitted') are
    subtracted from the number of eligible jobs, and the remaining number of
    instances (capped by the free slots and by ``self.maxVMsToSubmit``) is
    requested from the image's CE. Created instances are recorded in the
    VirtualMachineDB and one pilot reference per CPU is attached to a task
    queue chosen at random, weighted by the task queue priorities.

    :return: S_OK() when the cycle completes (also when there is nothing to
             do); the failed result structure when the platform lookup or a
             Matcher query fails; S_ERROR when the site mask is unavailable
    """
    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup, 'CPUTime': 9999999}
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tqDict['Tag'] = []
    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # Collect the sites requested by the waiting jobs. A task queue without
    # a "Sites" entry, or with an 'any' site, can run anywhere. Sites of task
    # queues that declare "JobTypes" are additionally remembered in testSites.
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
        if "Sites" in result['Value'][tqID]:
            for site in result['Value'][tqID]['Sites']:
                if site.lower() != 'any':
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in result['Value'][tqID]:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        testSites.add(site)
        totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = list(result['Value'].keys())

    # Count the VMs known to the VirtualMachineDB (reported in the log only)
    result = virtualMachineDB.getInstanceCounters('Status', {})
    totalVMs = 0
    if result['OK']:
        for status in result['Value']:
            if status in ['New', 'Submitted', 'Running']:
                totalVMs += result['Value'][status]
    self.log.info('Total %d jobs in %d task queues with %d VMs' % (totalWaitingJobs, len(tqIDList), totalVMs))

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
        return S_ERROR('Can not get the site mask')
    siteMaskList = result['Value']

    # list() makes the keys shuffleable on Python 3 as well (dict views
    # cannot be shuffled in place)
    images = list(self.imageDict.keys())
    random.shuffle(images)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for image in images:
        # NOTE: the per-image failure back-off and the proxy retrieval that
        # used to live in this loop are disabled.

        # Debug output goes through the agent logger instead of stdout
        self.log.debug('Image parameters for %s' % (image,))
        for key, value in self.imageDict[image].items():
            self.log.debug('%s %s' % (key, value))

        ce = self.imageDict[image]['CE']
        ceName = self.imageDict[image]['CEName']
        imageName = self.imageDict[image]['ImageName']
        siteName = self.imageDict[image]['Site']
        platform = self.imageDict[image]['Platform']
        siteMask = siteName in siteMaskList
        endpoint = "%s::%s" % (siteName, ceName)
        maxInstances = int(self.imageDict[image]['MaxInstances'])

        if not anySite and siteName not in jobSites:
            self.log.verbose("Skipping queue %s at %s: no workload expected" % (imageName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s: site %s not in the mask" % (imageName, siteName))
            continue

        # A CPU time limit must be configured for the image to be used;
        # the value itself is not needed in this method.
        if 'CPUTime' not in self.imageDict[image]['ParametersDict']:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % image)
            continue

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()

        if not siteMask:
            # Sites outside the mask are only matched against "Test" jobs
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['VO'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(platform)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']

        # Get the number of eligible jobs for the target site/queue
        self.log.debug('getMatchingTaskQueues ceDict: %s' % (ceDict,))
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        self.log.debug('getMatchingTaskQueues result: %s' % (result,))
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found for %s' % image)
            continue

        matchedQueues += 1
        totalTQJobs = 0
        tqIDList = list(taskQueueDict.keys())
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]['Jobs']

        self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' %
                         (totalTQJobs, len(tqIDList), image))

        # Get the number of already instantiated VMs for these task queues
        totalWaitingVMs = 0
        result = virtualMachineDB.getInstanceCounters('Status', {'Endpoint': endpoint})
        if result['OK']:
            for status in result['Value']:
                if status in ['New', 'Submitted']:
                    totalWaitingVMs += result['Value'][status]
        if totalWaitingVMs >= totalTQJobs:
            self.log.verbose("%d VMs already for all the available jobs" % totalWaitingVMs)

        self.log.verbose("%d VMs for the total of %d eligible jobs for %s" %
                         (totalWaitingVMs, totalTQJobs, image))

        # Get the number of available slots on the target site/endpoint
        totalSlots = self.getVMInstances(endpoint, maxInstances)
        if totalSlots == 0:
            self.log.debug('%s: No slots available' % image)
            continue

        vmsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingVMs))
        self.log.info('%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d' %
                      (image, totalSlots, totalTQJobs, totalWaitingVMs, vmsToSubmit))

        # Limit the number of VM instances to create to self.maxVMsToSubmit
        vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)
        if vmsToSubmit == 0:
            # Nothing to create: do not bother the CE with an empty request
            continue

        self.log.info('Going to submit %d VMs to %s queue' % (vmsToSubmit, image))
        result = ce.createInstances(vmsToSubmit)
        self.log.debug('createInstances result for %s: %s' % (image, result))

        if not result['OK']:
            self.log.error('Failed submission to queue %s:\n' % image, result['Message'])
            self.failedImages.setdefault(image, 0)
            self.failedImages[image] += 1
            continue

        # Add VMs to the VirtualMachineDB
        vmDict = result['Value']
        totalSubmittedPilots += len(vmDict)
        self.log.info('Submitted %d VMs to %s@%s' % (len(vmDict), imageName, ceName))

        pilotList = []
        for uuID in vmDict:
            diracUUID = vmDict[uuID]['InstanceID']
            result = virtualMachineDB.insertInstance(uuID, imageName, diracUUID, endpoint, self.vo)
            if not result['OK']:
                # The instance is not registered, so no pilot references are made for it
                self.log.error('Failed to insert instance %s into the VirtualMachineDB' % uuID,
                               result['Message'])
                continue
            # One pilot reference per CPU of the instance, suffixed :00, :01, ...
            for ncpu in range(vmDict[uuID]['NumberOfCPUs']):
                pilotList.append('vm://' + ceName + '/' + diracUUID + ':' + str(ncpu).zfill(2))

        # Build the cumulative priority thresholds used for the weighted choice
        stampDict = {}
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
            sumPriority += taskQueueDict[tq]['Priority']
            tqPriorityList.append((tq, sumPriority))

        pilotsByTQ = {}
        for pilotID in pilotList:
            rndm = random.random() * sumPriority
            for tq, prio in tqPriorityList:
                if rndm < prio:
                    chosenTQ = tq
                    break
            else:
                # No threshold exceeds rndm (e.g. all priorities are 0): fall
                # back to the last task queue rather than reusing a stale ID.
                # taskQueueDict is non-empty here, so the list has an entry.
                chosenTQ = tqPriorityList[-1][0]
            pilotsByTQ.setdefault(chosenTQ, []).append(pilotID)

        for chosenTQ, tqPilots in pilotsByTQ.items():
            result = pilotAgentsDB.addPilotTQReference(tqPilots, chosenTQ, '', '',
                                                       self.localhost, 'Cloud', '', stampDict)
            if not result['OK']:
                self.log.error('Failed to insert pilots into the PilotAgentsDB', result['Message'])

    self.log.info("%d VMs submitted in total in this cycle, %d matched queues" %
                  (totalSubmittedPilots, matchedQueues))
    return S_OK()