def submitInstance( self, imageName, workDir ):
  """
  Register a new instance of imageName in the VirtualMachineDB, submit it
  through the concrete director back-end and mark it as submitted.

  :param imageName: name of the image, must be declared in self.images
  :param workDir: working directory passed through to _submitInstance
  :return: S_OK( imageName ) on success, the first failing S_ERROR otherwise
  """
  self.log.info( 'Submitting', imageName )
  if imageName not in self.images:
    return DIRAC.S_ERROR( 'Unknown Image: %s' % imageName )

  # Create the DB record first so the instance is tracked before submission
  result = virtualMachineDB.insertInstance( imageName, imageName )
  if not result['OK']:
    return result
  instanceID = result['Value']

  # Hand over to the concrete director implementation; it returns the
  # endpoint-side unique identifier of the new VM
  result = self._submitInstance( imageName, workDir )
  if not result['OK']:
    return result
  endpointUniqueID = result['Value']

  # Record the endpoint id and flip the instance state to Submitted,
  # aborting on the first DB failure
  for dbCall, callArgs in ( ( virtualMachineDB.setInstanceUniqueID, ( instanceID, endpointUniqueID ) ),
                            ( virtualMachineDB.declareInstanceSubmitted, ( endpointUniqueID, ) ) ):
    result = dbCall( *callArgs )
    if not result['OK']:
      return result

  return DIRAC.S_OK( imageName )
def createVMs(self):
    """ Go through defined computing elements and submit jobs if necessary

        One director cycle: query the Matcher for waiting work, check the
        site mask, and for each eligible VM type create up to MaxInstances
        VM instances, registering them in the VirtualMachineDB and as pilot
        references in the PilotAgentsDB.
    """
    vmTypeList = self.vmTypeDict.keys()

    # Check that there is some work at all before looking at individual queues
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999}
    if self.vo:
        tqDict['VO'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups
    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    # Union of all Tags declared for the configured VM types
    tags = []
    for vmType in vmTypeList:
        if 'Tag' in self.vmTypeDict[vmType]['ParametersDict']:
            tags += self.vmTypeDict[vmType]['ParametersDict']['Tag']
    tqDict['Tag'] = list(set(tags))
    # NOTE(review): SubmitPool is hard-coded to "wenmrPool" — confirm this is intended
    tqDict['SubmitPool'] = "wenmrPool"

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # Collect target sites from the matched task queues; a site literally
    # named 'any' (case-insensitive) means no site restriction
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
        if "Sites" in result['Value'][tqID]:
            for site in result['Value'][tqID]['Sites']:
                if site.lower() != 'any':
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        # Queues with explicit JobTypes may address sites outside the mask
        # (treated as test sites below)
        if "JobTypes" in result['Value'][tqID]:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        testSites.add(site)
        totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = result['Value'].keys()

    # Count VMs already alive on any endpoint, for the summary log line only
    result = virtualMachineDB.getInstanceCounters('Status', {})
    totalVMs = 0
    if result['OK']:
        for status in result['Value']:
            if status in ['New', 'Submitted', 'Running']:
                totalVMs += result['Value'][status]
    self.log.info('Total %d jobs in %d task queues with %d VMs' %
                  (totalWaitingJobs, len(tqIDList), totalVMs))

    # Check if the site is allowed in the mask
    result = self.wmsClient.getSiteMask()
    if not result['OK']:
        return S_ERROR('Can not get the site mask')
    siteMaskList = result['Value']

    # Randomize the order so no VM type is systematically favoured
    vmTypeList = self.vmTypeDict.keys()
    random.shuffle(vmTypeList)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for vmType in vmTypeList:
        ce = self.vmTypeDict[vmType]['CE']
        ceName = self.vmTypeDict[vmType]['CEName']
        vmTypeName = self.vmTypeDict[vmType]['VMType']
        siteName = self.vmTypeDict[vmType]['Site']
        platform = self.vmTypeDict[vmType]['Platform']
        vmTypeTags = self.vmTypeDict[vmType]['ParametersDict'].get('Tag', [])
        siteMask = siteName in siteMaskList
        endpoint = "%s::%s" % (siteName, ceName)
        maxInstances = int(self.vmTypeDict[vmType]['MaxInstances'])
        processorTags = []

        # vms support WholeNode naturally
        processorTags.append('WholeNode')

        if not anySite and siteName not in jobSites:
            self.log.verbose("Skipping queue %s at %s: no workload expected" %
                             (vmTypeName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s: site %s not in the mask" %
                             (vmTypeName, siteName))
            continue

        # CPUTime is mandatory in the queue definition
        if 'CPUTime' in self.vmTypeDict[vmType]['ParametersDict']:
            vmTypeCPUTime = int(self.vmTypeDict[vmType]['ParametersDict']['CPUTime'])
        else:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % vmType)
            continue

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()

        # Out-of-mask sites only receive Test jobs
        if not siteMask:
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['VO'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(platform)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']

        ceDict['Tag'] = list(set(processorTags + vmTypeTags))

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB',
                           result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found for %s' % vmType)
            continue

        matchedQueues += 1
        totalTQJobs = 0
        tqIDList = taskQueueDict.keys()
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]['Jobs']

        self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' %
                         (totalTQJobs, len(tqIDList), vmType))

        # Get the number of already instantiated VMs for these task queues
        totalWaitingVMs = 0
        result = virtualMachineDB.getInstanceCounters('Status', {'Endpoint': endpoint})
        if result['OK']:
            for status in result['Value']:
                if status in ['New', 'Submitted']:
                    totalWaitingVMs += result['Value'][status]
        # No early continue here: vmsToSubmit below becomes 0 in that case
        # and the submission is skipped by the vmsToSubmit == 0 guard
        if totalWaitingVMs >= totalTQJobs:
            self.log.verbose("%d VMs already for all the available jobs" % totalWaitingVMs)

        self.log.verbose("%d VMs for the total of %d eligible jobs for %s" %
                         (totalWaitingVMs, totalTQJobs, vmType))

        # Get proxy to be used to connect to the cloud endpoint
        authType = ce.parameters.get('Auth')
        if authType and authType.lower() in ['x509', 'voms']:
            self.log.verbose("Getting cloud proxy for %s/%s" % (siteName, ceName))
            result = getProxyFileForCE(ce)
            if not result['OK']:
                continue
            ce.setProxy(result['Value'])

        # Get the number of available slots on the target site/endpoint
        totalSlots = self.getVMInstances(endpoint, maxInstances)
        if totalSlots == 0:
            self.log.debug('%s: No slots available' % vmType)
            continue

        # Submit at most: free slots, and jobs not yet covered by waiting VMs
        vmsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingVMs))
        self.log.info('%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d' %
                      (vmType, totalSlots, totalTQJobs, totalWaitingVMs, vmsToSubmit))

        # Limit the number of VM instances to create to vmsToSubmit
        vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)
        if vmsToSubmit == 0:
            continue

        self.log.info('Going to submit %d VMs to %s queue' % (vmsToSubmit, vmType))
        result = ce.createInstances(vmsToSubmit)

        #result = S_OK()
        if not result['OK']:
            self.log.error('Failed submission to queue %s:\n' % vmType,
                           result['Message'])
            self.failedVMTypes.setdefault(vmType, 0)
            self.failedVMTypes[vmType] += 1
            continue

        # Add VMs to the VirtualMachineDB
        vmDict = result['Value']
        totalSubmittedPilots += len(vmDict)
        self.log.info('Submitted %d VMs to %s@%s' % (len(vmDict), vmTypeName, ceName))

        # One pilot reference per VM processor: vm://<ce>/<instance>:<nn>
        pilotList = []
        for uuID in vmDict:
            diracUUID = vmDict[uuID]['InstanceID']
            endpoint = '%s::%s' % (self.vmTypeDict[vmType]['Site'], ceName)
            result = virtualMachineDB.insertInstance(uuID, vmTypeName, diracUUID,
                                                     endpoint, self.vo)
            if not result['OK']:
                continue
            for ncpu in range(vmDict[uuID]['NumberOfProcessors']):
                pRef = 'vm://' + ceName + '/' + diracUUID + ':' + str(ncpu).zfill(2)
                pilotList.append(pRef)

        # Distribute the pilot references over the matched task queues,
        # sampling proportionally to the task queue priorities
        stampDict = {}
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
            sumPriority += taskQueueDict[tq]['Priority']
            tqPriorityList.append((tq, sumPriority))
        tqDict = {}
        for pilotID in pilotList:
            rndm = random.random() * sumPriority
            for tq, prio in tqPriorityList:
                if rndm < prio:
                    tqID = tq
                    break
            if tqID not in tqDict:
                tqDict[tqID] = []
            tqDict[tqID].append(pilotID)

        for tqID, pilotList in tqDict.items():
            result = pilotAgentsDB.addPilotTQReference(pilotList, tqID, '', '',
                                                       self.localhost, 'Cloud',
                                                       stampDict)
            if not result['OK']:
                self.log.error('Failed to insert pilots into the PilotAgentsDB: %s' %
                               result['Message'])

    self.log.info("%d VMs submitted in total in this cycle, %d matched queues" %
                  (totalSubmittedPilots, matchedQueues))
    return S_OK()
def submitInstance( self, imageName, endpoint, numVMsToSubmit, runningPodName ):
  """
  Submit numVMsToSubmit instances of imageName to the given cloud endpoint
  on behalf of runningPodName.

  Each instance is first registered in the VirtualMachineDB, then started
  via _submitInstance, and finally declared Submitted (or put in the
  Wait_ssh_context state when the image uses ssh contextualization).
  Returns S_OK( imageName ) or the first failing S_ERROR.
  """
  # warning: instanceID is the DIRAC instance id, while uniqueID is unique for a particular endpoint
  self.log.info( '*** Preparing to submitting VM of image: ', imageName )
  self.log.info( '******* num of VMs to sumbit: ', numVMsToSubmit )
  self.log.info( '******* of running pod: ', runningPodName )
  self.log.info( '******* destination: ', endpoint )
  if runningPodName not in self.runningPods:
    return S_ERROR( 'Unknown Running Pod: %s' % runningPodName )
  for numVM in range(1,numVMsToSubmit+1):
    self.log.info( '********** Preparing to submitting VM number %s of %s VMs' % ( numVM, numVMsToSubmit ) )
    dictVMSubmitted = {}
    dictVMDBrecord = {}
    # FIRST, insert the instance into the DB so it is tracked before starting it
    newInstance = virtualMachineDB.insertInstance( imageName, imageName, endpoint, runningPodName )
    if not newInstance[ 'OK' ]:
      return newInstance
    instanceID = newInstance[ 'Value' ]
    # CPUTime and SubmitPool are mandatory in the running pod Requirements
    runningRequirementsDict = self.runningPods[runningPodName]['Requirements']
    cpuTime = runningRequirementsDict['CPUTime']
    if not cpuTime:
      return S_ERROR( 'Unknown CPUTime in Requirements of the RunningPod %s' % runningPodName )
    submitPool = runningRequirementsDict['SubmitPool']
    if not submitPool:
      return S_ERROR( 'Unknown submitPool in Requirements of the RunningPod %s' % runningPodName )
    dictVMSubmitted = self._submitInstance( imageName, endpoint, instanceID, runningRequirementsDict )
    if not dictVMSubmitted[ 'OK' ]:
      return dictVMSubmitted
    # CloudStack2 and CloudStack3 drivers have a bug whereby a single VM
    # creation produces two VMs.  To deal with this, _submitInstance starts
    # the instance and then a second DB record is declared here for the
    # extra VM.  NOTE(review): the +1/-1 id arithmetic below assumes the
    # extra VM gets adjacent instance/unique ids — confirm against the driver.
    driver = gConfig.getValue( "/Resources/VirtualMachines/CloudEndpoints/%s/%s" % ( endpoint, "cloudDriver" ) )
    if driver == "CloudStack":
      virtualMachineDB.insertInstance( imageName, imageName, endpoint, runningPodName )
    # nova/occi drivers also return the public IP together with the unique id
    if driver == "nova-1.1" or driver =="rocci-1.1":
      ( uniqueID, publicIP ) = dictVMSubmitted['Value']
      dictVMDBrecord = virtualMachineDB.setPublicIP( instanceID, publicIP )
      if not dictVMDBrecord['OK']:
        return dictVMDBrecord
    else:
      uniqueID = dictVMSubmitted['Value']
    dictVMDBrecord = virtualMachineDB.setInstanceUniqueID( instanceID, uniqueID )
    if not dictVMDBrecord['OK']:
      return dictVMDBrecord
    # CloudStack double-VM workaround: record the unique id of the extra VM
    if driver == "CloudStack":
      virtualMachineDB.setInstanceUniqueID( str( int( instanceID ) + 1 ), str( int( uniqueID ) - 1 ) )
    # check contextMethod and update status if need ssh contextualization:
    contextMethod = gConfig.getValue( "/Resources/VirtualMachines/Images/%s/%s" % ( imageName, "contextMethod" ) )
    if contextMethod == 'ssh':
      dictVMDBrecord = virtualMachineDB.declareInstanceWait_ssh_context( uniqueID )
      if not dictVMDBrecord['OK']:
        return dictVMDBrecord
    else:
      dictVMDBrecord = virtualMachineDB.declareInstanceSubmitted( uniqueID )
      if not dictVMDBrecord['OK']:
        return dictVMDBrecord
    # CloudStack double-VM workaround: declare the extra VM submitted too
    if driver == "CloudStack":
      dictVMDBrecord = virtualMachineDB.declareInstanceSubmitted( str( int( uniqueID ) - 1 ) )
  return S_OK( imageName )
def createVMs(self):
    """ Go through defined computing elements and submit jobs if necessary

        One director cycle for the image-based endpoints: query the Matcher
        for waiting work, check the site mask, and for each eligible image
        create up to MaxInstances VM instances, registering them in the
        VirtualMachineDB and as pilot references in the PilotAgentsDB.

        :return: S_OK() when the cycle completes, S_ERROR on a fatal problem
    """
    # Check that there is some work at all before looking at individual queues
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999}
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups
    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tqDict['Tag'] = []
    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # Collect target sites from the matched task queues; a site literally
    # named 'any' (case-insensitive) means no site restriction
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
        if "Sites" in result['Value'][tqID]:
            for site in result['Value'][tqID]['Sites']:
                if site.lower() != 'any':
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        # Queues with explicit JobTypes may address sites outside the mask
        # (treated as test sites below)
        if "JobTypes" in result['Value'][tqID]:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        testSites.add(site)
        totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = result['Value'].keys()

    # Count VMs already alive on any endpoint, for the summary log line only
    result = virtualMachineDB.getInstanceCounters('Status', {})
    totalVMs = 0
    if result['OK']:
        for status in result['Value']:
            if status in ['New', 'Submitted', 'Running']:
                totalVMs += result['Value'][status]
    self.log.info('Total %d jobs in %d task queues with %d VMs' %
                  (totalWaitingJobs, len(tqIDList), totalVMs))

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
        return S_ERROR('Can not get the site mask')
    siteMaskList = result['Value']

    # Randomize the order so no image/endpoint is systematically favoured
    images = list(self.imageDict.keys())
    random.shuffle(images)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for image in images:
        self.log.debug('Image parameters for %s: %s' % (image, self.imageDict[image]))
        ce = self.imageDict[image]['CE']
        ceName = self.imageDict[image]['CEName']
        imageName = self.imageDict[image]['ImageName']
        siteName = self.imageDict[image]['Site']
        platform = self.imageDict[image]['Platform']
        siteMask = siteName in siteMaskList
        endpoint = "%s::%s" % (siteName, ceName)
        maxInstances = int(self.imageDict[image]['MaxInstances'])

        if not anySite and siteName not in jobSites:
            self.log.verbose("Skipping queue %s at %s: no workload expected" %
                             (imageName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s: site %s not in the mask" %
                             (imageName, siteName))
            continue

        # CPUTime is mandatory in the queue definition
        if 'CPUTime' in self.imageDict[image]['ParametersDict']:
            imageCPUTime = int(self.imageDict[image]['ParametersDict']['CPUTime'])
        else:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % image)
            continue

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()
        # Out-of-mask sites only receive Test jobs
        if not siteMask:
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['VO'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(platform)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']

        # Get the number of eligible jobs for the target site/queue
        self.log.debug('getMatchingTaskQueues ceDict: %s' % ceDict)
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB',
                           result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found for %s' % image)
            continue

        matchedQueues += 1
        totalTQJobs = 0
        tqIDList = taskQueueDict.keys()
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]['Jobs']
        self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' %
                         (totalTQJobs, len(tqIDList), image))

        # Get the number of already instantiated VMs for these task queues
        totalWaitingVMs = 0
        result = virtualMachineDB.getInstanceCounters('Status', {'Endpoint': endpoint})
        if result['OK']:
            for status in result['Value']:
                if status in ['New', 'Submitted']:
                    totalWaitingVMs += result['Value'][status]
        if totalWaitingVMs >= totalTQJobs:
            self.log.verbose("%d VMs already for all the available jobs" % totalWaitingVMs)
        self.log.verbose("%d VMs for the total of %d eligible jobs for %s" %
                         (totalWaitingVMs, totalTQJobs, image))

        # NOTE(review): per-endpoint proxy acquisition is currently disabled;
        # imageCPUTime above only validates the queue definition.

        # Get the number of available slots on the target site/endpoint
        totalSlots = self.getVMInstances(endpoint, maxInstances)
        if totalSlots == 0:
            self.log.debug('%s: No slots available' % image)
            continue

        # Submit at most: free slots, and jobs not yet covered by waiting VMs
        vmsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingVMs))
        self.log.info('%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d' %
                      (image, totalSlots, totalTQJobs, totalWaitingVMs, vmsToSubmit))
        # Limit the number of VM instances to submit in one cycle
        vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)
        # Fix: skip when nothing is to be submitted — previously
        # ce.createInstances(0) was still called and could be mis-counted
        # as a submission failure
        if vmsToSubmit == 0:
            continue

        self.log.info('Going to submit %d VMs to %s queue' % (vmsToSubmit, image))
        result = ce.createInstances(vmsToSubmit)
        self.log.debug('createInstances result for %s: %s' % (image, result))
        if not result['OK']:
            self.log.error('Failed submission to queue %s:\n' % image,
                           result['Message'])
            self.failedImages.setdefault(image, 0)
            self.failedImages[image] += 1
            continue

        # Add VMs to the VirtualMachineDB
        vmDict = result['Value']
        totalSubmittedPilots += len(vmDict)
        self.log.info('Submitted %d VMs to %s@%s' % (len(vmDict), imageName, ceName))

        # One pilot reference per VM CPU: vm://<ce>/<instance>:<nn>
        pilotList = []
        for uuID in vmDict:
            diracUUID = vmDict[uuID]['InstanceID']
            endpoint = '%s::%s' % (self.imageDict[image]['Site'], ceName)
            result = virtualMachineDB.insertInstance(uuID, imageName, diracUUID,
                                                     endpoint, self.vo)
            if not result['OK']:
                continue
            for ncpu in range(vmDict[uuID]['NumberOfCPUs']):
                pRef = 'vm://' + ceName + '/' + diracUUID + ':' + str(ncpu).zfill(2)
                pilotList.append(pRef)

        # Distribute the pilot references over the matched task queues,
        # sampling proportionally to the task queue priorities
        stampDict = {}
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
            sumPriority += taskQueueDict[tq]['Priority']
            tqPriorityList.append((tq, sumPriority))
        tqDict = {}
        for pilotID in pilotList:
            rndm = random.random() * sumPriority
            for tq, prio in tqPriorityList:
                if rndm < prio:
                    tqID = tq
                    break
            tqDict.setdefault(tqID, []).append(pilotID)

        for tqID, tqPilotList in tqDict.items():
            result = pilotAgentsDB.addPilotTQReference(tqPilotList, tqID, '', '',
                                                       self.localhost, 'Cloud', '',
                                                       stampDict)
            if not result['OK']:
                self.log.error('Failed to insert pilots into the PilotAgentsDB')

    self.log.info("%d VMs submitted in total in this cycle, %d matched queues" %
                  (totalSubmittedPilots, matchedQueues))
    return S_OK()