def export_getSitesResources(self, siteNames):
    """Collect the computing and storage resources attached to each site.

    :param siteNames: a single site name, a list of site names, or None.
                      When None, the list is taken from ``Resources.getSites()``.
    :returns: S_OK with ``{ siteName: { 'ces': ..., 'ses': [...] } }``, or the
              failing result of ``Resources.getSites()`` /
              ``CSHelpers.getStorageElementsHosts()``.
    """
    resourcesHelper = Resources.Resources()

    if siteNames is None:
        allSites = Resources.getSites()
        if not allSites['OK']:
            return allSites
        siteNames = allSites['Value']

    # A lone site name is treated as a one-element list
    if isinstance(siteNames, str):
        siteNames = [siteNames]

    resourcesPerSite = {}
    for site in siteNames:
        # NOTE(review): both lookups are used exactly as returned, without an
        # 'OK' check - confirm this is what the callee / the client expects.
        computing = resourcesHelper.getEligibleResources('Computing', {'Site': site})
        storage = resourcesHelper.getEligibleStorageElements({'Site': site})

        hosts = CSHelpers.getStorageElementsHosts(storage)
        if not hosts['OK']:
            return hosts

        # Duplicated host names are collapsed
        resourcesPerSite[site] = {
            'ces': computing,
            'ses': list(set(hosts['Value'])),
        }

    return S_OK(resourcesPerSite)
def doCommand(self):
    """Build one 'NumberOfJobs' WMSHistory plot per site.

    Reads from ``self.args``:

    * ``hours`` (mandatory): length of the reporting window, ending now (UTC).
    * ``sites`` (optional): sites to keep; when missing or None the list is
      taken from ``Resources.getSites()``.

    :returns: S_OK with ``{ site: { 'data': { site: value }, 'granularity': ... } }``
              for every site of the report that is in the selection, or
              S_ERROR / the failing result of the underlying call.
    """
    if 'hours' not in self.args:
        return S_ERROR('Number of hours not specified')
    hours = self.args['hours']

    sites = self.args['sites'] if 'sites' in self.args else None
    if sites is None:
        # FIXME: pointing to the CSHelper instead (this used to be a
        # self.rsClient.getSite( meta = {'columns': 'SiteName'} ) query)
        siteQuery = Resources.getSites()
        if not siteQuery['OK']:
            return siteQuery
        sites = siteQuery['Value']

    if not sites:
        return S_ERROR('Sites is empty')

    fromD = datetime.utcnow() - timedelta(hours=hours)
    toD = datetime.utcnow()

    report = self.rClient.getReport('WMSHistory', 'NumberOfJobs', fromD, toD, {}, 'Site')
    if not report['OK']:
        return report
    report = report['Value']

    if 'data' not in report:
        return S_ERROR('Missing data key')
    if 'granularity' not in report:
        return S_ERROR('Missing granularity key')

    # The report covers every site: keep only the requested ones
    singlePlots = {
        site: {'data': {site: value}, 'granularity': report['granularity']}
        for site, value in report['data'].items()
        if site in sites
    }
    return S_OK(singlePlots)

################################################################################
#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF
def doCommand(self):
    """Build one 'NumberOfPilots' plot per site for pilots in 'Aborted' grid status.

    Reads from ``self.args``:

    * ``hours`` (mandatory): length of the reporting window, ending now (UTC).
    * ``sites`` (optional): sites to query; when missing or None the list is
      taken from ``Resources.getSites()``.

    :returns: S_OK with ``{ site: { 'data': { site: value }, 'granularity': ... } }``
              for every site of the report that is in the selection, or
              S_ERROR / the failing result of the underlying call.
    """
    if 'hours' not in self.args:
        return S_ERROR('Number of hours not specified')
    hours = self.args['hours']

    sites = self.args['sites'] if 'sites' in self.args else None
    if sites is None:
        # FIXME: pointing to the CSHelper instead (this used to be a
        # self.rsClient.getSite( meta = {'columns': 'SiteName'} ) query)
        siteQuery = Resources.getSites()
        if not siteQuery['OK']:
            return siteQuery
        sites = siteQuery['Value']

    if not sites:
        return S_ERROR('Sites is empty')

    fromD = datetime.utcnow() - timedelta(hours=hours)
    toD = datetime.utcnow()

    # The selection is already restricted to the requested sites here
    selection = {'GridStatus': ['Aborted'], 'Site': sites}
    report = self.rClient.getReport('Pilot', 'NumberOfPilots', fromD, toD, selection, 'Site')
    if not report['OK']:
        return report
    report = report['Value']

    if 'data' not in report:
        return S_ERROR('Missing data key')
    if 'granularity' not in report:
        return S_ERROR('Missing granularity key')

    singlePlots = {
        site: {'data': {site: value}, 'granularity': report['granularity']}
        for site, value in report['data'].items()
        if site in sites
    }
    return S_OK(singlePlots)
def getGOCSites(diracSites=None):
    """Translate DIRAC site names into a duplicate-free list of GOC site names.

    :param diracSites: iterable of DIRAC site names; when None the list is taken
                       from ``Resources.getSites()``.
    :returns: S_OK with the list of GOC names, or the failing result of
              ``Resources.getSites()``. Sites for which ``getGOCSiteName``
              fails are silently left out.
    """
    # FIXME: THIS SHOULD GO INTO Resources HELPER
    if diracSites is None:
        siteQuery = Resources.getSites()
        if not siteQuery["OK"]:
            return siteQuery
        diracSites = siteQuery["Value"]

    lookups = (getGOCSiteName(siteName) for siteName in diracSites)
    gocNames = [lookup["Value"] for lookup in lookups if lookup["OK"]]

    return S_OK(list(set(gocNames)))
def setPlatform(self, platform):
    """Developer function: sets the target platform, e.g. Linux_x86_64_glibc-2.5.

    The platform has the form returned by the dirac-platform script
    (or dirac-architecture if your extension provides it).

    :param platform: platform name; 'any' (case-insensitive) is accepted
                     without adding a 'Platform' parameter to the workflow.
    :returns: S_OK() on success, otherwise the result of ``self._reportError``.
    """
    errorContext = {'platform': platform}

    if not isinstance(platform, basestring):
        return self._reportError("Expected string for platform", **errorContext)

    # 'any' means no platform requirement: nothing to validate, nothing to add
    if platform.lower() == 'any':
        return S_OK()

    knownPlatforms = Resources.getDIRACPlatforms()
    if not knownPlatforms['OK']:
        return self._reportError("Can't check for platform", **errorContext)

    if platform not in knownPlatforms['Value']:
        return self._reportError("Invalid platform", **errorContext)

    self._addParameter(self.workflow, 'Platform', 'JDL', platform, 'Platform ( Operating System )')
    return S_OK()
def setPlatform(self, platform):
    """Developer function: sets the target platform, e.g. Linux_x86_64_glibc-2.5.

    The platform has the form returned by the dirac-platform script
    (or dirac-architecture if your extension provides it).

    :param platform: platform name; "any" (case-insensitive) is accepted
                     without adding a "Platform" parameter to the workflow.
    :returns: S_OK() on success, otherwise the result of ``self._reportError``.
    """
    kwargs = {"platform": platform}

    # isinstance() rather than an exact type comparison, so that str
    # subclasses are accepted as well
    if not isinstance(platform, str):
        return self._reportError("Expected string for platform", **kwargs)

    if platform.lower() != "any":
        availablePlatforms = Resources.getDIRACPlatforms()
        if not availablePlatforms["OK"]:
            return self._reportError("Can't check for platform", **kwargs)
        if platform in availablePlatforms["Value"]:
            self._addParameter(self.workflow, "Platform", "JDL", platform, "Platform ( Operating System )")
        else:
            return self._reportError("Invalid platform", **kwargs)

    return S_OK()
def doMaster(self):
    '''Master method.

    - It gets all Sites.
    - It gets all StorageElements.

    Every element name is then passed to ``self.doNew`` once per
    ( metric, direction ) combination, with a window of 2 hours. Failing
    results are appended to ``self.metrics[ 'failed' ]``.

    :returns: S_OK( self.metrics ), or the failing result of the site /
              storage element query.
    '''
    siteQuery = Resources.getSites()
    if not siteQuery['OK']:
        return siteQuery

    seQuery = self.resources.getEligibleStorageElements()
    if not seQuery['OK']:
        return seQuery

    elementNames = siteQuery['Value'] + seQuery['Value']

    # There used to be a comparison against what is already stored in the
    # TransferCache ( self.rmClient.selectTransferCache ) in order to query only
    # the missing sources; it is disabled, so all elements are processed.

    gLogger.info('Processing %s' % ', '.join(elementNames))

    combinations = [
        (direction, metric)
        for metric in ['Quality', 'FailedTransfers']
        for direction in ['Source', 'Destination']
    ]
    for direction, metric in combinations:
        # 2 hours of window
        outcome = self.doNew((2, elementNames, direction, metric))
        if not outcome['OK']:
            self.metrics['failed'].append(outcome)

    return S_OK(self.metrics)

################################################################################
#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF
from DIRAC.FrameworkSystem.Client.BundleDeliveryClient import BundleDeliveryClient bdc = BundleDeliveryClient() result = bdc.syncCAs() if result['OK']: result = bdc.syncCRLs() except: DIRAC.gLogger.exception('Could not import BundleDeliveryClient') pass if not skipCAChecks: Script.localCfg.deleteOption('/DIRAC/Security/SkipCAChecks') if ceName or siteName: # This is used in the pilot context, we should have a proxy and access to CS Script.enableCS() resources = Resources.Resources(vo=vo) if not siteName: if ceName: result = resources.getSiteForResource('Computing', ceName) if result['OK']: site = result['Value'] result = resources.getSiteFullName(site) if result['OK']: siteName = result['Value'] if siteName: DIRAC.gLogger.notice('Setting /LocalSite/Site = %s' % siteName) Script.localCfg.addDefaultEntry('/LocalSite/Site', siteName) DIRAC.__siteName = False if ceName:
def createVMs(self):
    """Go through defined computing elements and submit jobs if necessary

    The method works in two stages:

    1. A global query to the Matcher, built from the director-wide
       requirements (setup, VO, groups, platforms, sites, tags), tells whether
       there are waiting jobs at all and for which sites.
    2. Every VM type of ``self.vmTypeDict`` is then visited in random order:
       the eligible task queues are matched, the number of VMs to create is
       computed, the instances are created through the endpoint object,
       registered in the VirtualMachineDB and associated to task queues in
       the PilotAgentsDB.

    :returns: S_OK() at the end of the cycle (also when no work is found),
              S_ERROR if the site mask can not be obtained, or the failing
              result of the platform / Matcher queries.
    """
    vmTypeList = list(self.vmTypeDict.keys())

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {"Setup": setup, "CPUTime": 9999999}
    if self.vo:
        tqDict["VO"] = self.vo
    if self.voGroups:
        tqDict["OwnerGroup"] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result["OK"]:
        return result
    tqDict["Platform"] = result["Value"]
    tqDict["Site"] = self.sites
    # Union of the tags of all the VM types served by this director
    tags = []
    for vmType in vmTypeList:
        if "Tag" in self.vmTypeDict[vmType]["ParametersDict"]:
            tags += self.vmTypeDict[vmType]["ParametersDict"]["Tag"]
    tqDict["Tag"] = list(set(tags))

    self.log.verbose("Checking overall TQ availability with requirements")
    self.log.verbose(tqDict)

    matcherClient = MatcherClient()
    result = matcherClient.getMatchingTaskQueues(tqDict)
    if not result["OK"]:
        return result
    if not result["Value"]:
        self.log.verbose("No Waiting jobs suitable for the director")
        return S_OK()

    # jobSites: sites explicitly requested by some task queue
    # anySite: at least one task queue has no site requirement (or asks for "any")
    # testSites: sites requested by task queues that carry a "JobTypes" key
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result["Value"]:
        if "Sites" in result["Value"][tqID]:
            for site in result["Value"][tqID]["Sites"]:
                if site.lower() != "any":
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in result["Value"][tqID]:
            if "Sites" in result["Value"][tqID]:
                for site in result["Value"][tqID]["Sites"]:
                    if site.lower() != "any":
                        testSites.add(site)
        totalWaitingJobs += result["Value"][tqID]["Jobs"]

    tqIDList = list(result["Value"].keys())

    # Count the VMs known to the VirtualMachineDB; a failure of this query is
    # not fatal, the total simply stays at 0 (it is only used for logging)
    result = virtualMachineDB.getInstanceCounters("Status", {})
    totalVMs = 0
    if result["OK"]:
        for status in result["Value"]:
            if status in ["New", "Submitted", "Running"]:
                totalVMs += result["Value"][status]
    self.log.info("Total %d jobs in %d task queues with %d VMs" % (totalWaitingJobs, len(tqIDList), totalVMs))

    # Check if the site is allowed in the mask
    result = self.siteClient.getUsableSites()
    if not result["OK"]:
        return S_ERROR("Can not get the site mask")
    siteMaskList = result.get("Value", [])

    # Visit the VM types in random order
    vmTypeList = list(self.vmTypeDict.keys())
    random.shuffle(vmTypeList)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for vmType in vmTypeList:
        ce = self.vmTypeDict[vmType]["CE"]
        ceName = self.vmTypeDict[vmType]["CEName"]
        vmTypeName = self.vmTypeDict[vmType]["VMType"]
        siteName = self.vmTypeDict[vmType]["Site"]
        platform = self.vmTypeDict[vmType]["Platform"]
        vmTypeTags = self.vmTypeDict[vmType]["ParametersDict"].get("Tag", [])
        siteMask = siteName in siteMaskList
        endpoint = "%s::%s" % (siteName, ceName)
        maxInstances = int(self.vmTypeDict[vmType]["MaxInstances"])
        processorTags = []

        # vms support WholeNode naturally
        processorTags.append("WholeNode")

        if not anySite and siteName not in jobSites:
            self.log.verbose("Skipping queue %s at %s: no workload expected" % (vmTypeName, siteName))
            continue
        # A site outside the mask is still served when it is in testSites
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s: site %s not in the mask" % (vmTypeName, siteName))
            continue

        # NOTE(review): vmTypeCPUTime is assigned but never read in this
        # method; only the presence of "CPUTime" matters here.
        if "CPUTime" in self.vmTypeDict[vmType]["ParametersDict"]:
            vmTypeCPUTime = int(self.vmTypeDict[vmType]["ParametersDict"]["CPUTime"])
        else:
            self.log.warn("CPU time limit is not specified for queue %s, skipping..." % vmType)
            continue

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()

        if not siteMask:
            ceDict["JobType"] = "Test"
        if self.vo:
            ceDict["VO"] = self.vo
        if self.voGroups:
            ceDict["OwnerGroup"] = self.voGroups

        result = Resources.getCompatiblePlatforms(platform)
        if not result["OK"]:
            continue
        ceDict["Platform"] = result["Value"]

        ceDict["Tag"] = list(set(processorTags + vmTypeTags))

        # Get the number of eligible jobs for the target site/queue
        result = matcherClient.getMatchingTaskQueues(ceDict)
        if not result["OK"]:
            self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"])
            return result
        taskQueueDict = result["Value"]
        if not taskQueueDict:
            self.log.verbose("No matching TQs found for %s" % vmType)
            continue

        matchedQueues += 1
        totalTQJobs = 0
        tqIDList = list(taskQueueDict.keys())
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]["Jobs"]

        self.log.verbose(
            "%d job(s) from %d task queue(s) are eligible for %s queue" % (totalTQJobs, len(tqIDList), vmType)
        )

        # Get the number of already instantiated VMs for these task queues
        totalWaitingVMs = 0
        result = virtualMachineDB.getInstanceCounters("Status", {"Endpoint": endpoint})
        if result["OK"]:
            for status in result["Value"]:
                if status in ["New", "Submitted"]:
                    totalWaitingVMs += result["Value"][status]
        # This branch only logs; the actual skip happens further down, when
        # vmsToSubmit evaluates to 0
        if totalWaitingVMs >= totalTQJobs:
            self.log.verbose("%d VMs already for all the available jobs" % totalWaitingVMs)

        self.log.verbose(
            "%d VMs for the total of %d eligible jobs for %s" % (totalWaitingVMs, totalTQJobs, vmType)
        )

        # Get proxy to be used to connect to the cloud endpoint
        authType = ce.parameters.get("Auth")
        if authType and authType.lower() in ["x509", "voms"]:
            self.log.verbose("Getting cloud proxy for %s/%s" % (siteName, ceName))
            result = getProxyFileForCloud(ce)
            if not result["OK"]:
                continue
            ce.setProxy(result["Value"])

        # Get the number of available slots on the target site/endpoint
        totalSlots = self.getVMInstances(endpoint, maxInstances)
        if totalSlots == 0:
            self.log.debug("%s: No slots available" % vmType)
            continue

        # Never more than the free slots, never more than the jobs that do
        # not have a waiting VM yet, never negative
        vmsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingVMs))
        self.log.info(
            "%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d"
            % (vmType, totalSlots, totalTQJobs, totalWaitingVMs, vmsToSubmit)
        )

        # Limit the number of VM instances to create to vmsToSubmit
        vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)
        if vmsToSubmit == 0:
            continue

        self.log.info("Going to submit %d VMs to %s queue" % (vmsToSubmit, vmType))
        result = ce.createInstances(vmsToSubmit)

        # result = S_OK()
        if not result["OK"]:
            self.log.error("Failed submission to queue %s:\n" % vmType, result["Message"])
            # Keep a per VM type count of the failed submissions
            self.failedVMTypes.setdefault(vmType, 0)
            self.failedVMTypes[vmType] += 1
            continue

        # Add VMs to the VirtualMachineDB
        vmDict = result["Value"]
        totalSubmittedPilots += len(vmDict)
        self.log.info("Submitted %d VMs to %s@%s" % (len(vmDict), vmTypeName, ceName))

        pilotList = []
        for uuID in vmDict:
            diracUUID = vmDict[uuID]["InstanceID"]
            endpoint = "%s::%s" % (self.vmTypeDict[vmType]["Site"], ceName)
            result = virtualMachineDB.insertInstance(uuID, vmTypeName, diracUUID, endpoint, self.vo)
            # Instances that could not be registered get no pilot reference
            if not result["OK"]:
                continue
            pRef = "vm://" + ceName + "/" + diracUUID + ":00"
            pilotList.append(pRef)

        # Distribute the new pilot references over the matched task queues,
        # each task queue being drawn with a weight equal to its priority:
        # tqPriorityList holds the cumulative priorities
        stampDict = {}
        tqPriorityList = []
        sumPriority = 0.0
        for tq in taskQueueDict:
            sumPriority += taskQueueDict[tq]["Priority"]
            tqPriorityList.append((tq, sumPriority))
        tqDict = {}
        for pilotID in pilotList:
            rndm = random.random() * sumPriority
            # NOTE(review): when no cumulative priority exceeds rndm (e.g.
            # all priorities are 0) tqID is not reassigned here and keeps
            # the value left by a previous loop - confirm this is intended.
            for tq, prio in tqPriorityList:
                if rndm < prio:
                    tqID = tq
                    break
            if tqID not in tqDict:
                tqDict[tqID] = []
            tqDict[tqID].append(pilotID)

        for tqID, pilotList in tqDict.items():
            result = pilotAgentsDB.addPilotTQReference(pilotList, tqID, "", "", self.localhost, "Cloud", stampDict)
            # A failure here is only logged, the cycle goes on
            if not result["OK"]:
                self.log.error("Failed to insert pilots into the PilotAgentsDB: %s" % result["Message"])

    self.log.info(
        "%d VMs submitted in total in this cycle, %d matched queues" % (totalSubmittedPilots, matchedQueues)
    )
    return S_OK()
def getEndpoints(self, resourceDict):
    """Get the list of relevant CEs and their descriptions

    Rebuilds ``self.vmTypeDict`` with one entry per ( CE, VM type ) pair found
    in resourceDict, named ``<ce>_<vmType>``. As a side effect
    ``self.platforms``, ``self.sites`` and ``self.vmTypeCECache`` are updated,
    and the "VMTypes" key is removed from every CE description of resourceDict.

    :param resourceDict: ``{ site: { ce: ceDescription } }`` where every CE
                         description holds a "VMTypes" dictionary
    :returns: S_OK(), or the failing result of the bootstrap parameters query
              or of the endpoint object creation
    """
    self.vmTypeDict = {}
    ceFactory = EndpointFactory()

    bootstrap = getPilotBootstrapParameters(vo=self.vo, runningPod=self.runningPod)
    if not bootstrap["OK"]:
        return bootstrap
    opParameters = bootstrap["Value"]

    for site in resourceDict:
        for ce in resourceDict[site]:
            ceDict = resourceDict[site][ce]

            ceTags = ceDict.get("Tag", [])
            if isinstance(ceTags, str):
                ceTags = fromChar(ceTags)
            ceMaxRAM = ceDict.get("MaxRAM", None)
            vmTypes = ceDict.pop("VMTypes")

            for vmType in vmTypes:
                vmTypeName = "%s_%s" % (ce, vmType)

                # entry and params are the very objects stored in
                # self.vmTypeDict: updating them updates the dictionary
                params = vmTypes[vmType]
                entry = {}
                self.vmTypeDict[vmTypeName] = entry
                entry["ParametersDict"] = params

                params["VMType"] = vmType
                params["Site"] = site
                params["Setup"] = gConfig.getValue("/DIRAC/Setup", "unknown")
                params["CPUTime"] = 99999999

                # Tags: a string is converted with fromChar, then the CE
                # level tags are merged in
                vmTypeTags = params.get("Tag")
                if vmTypeTags and isinstance(vmTypeTags, str):
                    vmTypeTags = fromChar(vmTypeTags)
                    params["Tag"] = vmTypeTags
                if ceTags:
                    if vmTypeTags:
                        params["Tag"] = list(set(ceTags + vmTypeTags))
                    else:
                        params["Tag"] = ceTags

                # MaxRAM: the VM type value wins, the CE value is the fallback
                maxRAM = params.get("MaxRAM") or ceMaxRAM
                if maxRAM:
                    params["MaxRAM"] = maxRAM

                # WholeNode: VM type value, else CE value, else "true"
                wholeNode = params.get("WholeNode", ceDict.get("WholeNode", "true"))
                if wholeNode.lower() in ("yes", "true"):
                    params.setdefault("Tag", [])
                    params["Tag"].append("WholeNode")

                # Platform: VM type value, else CE value, else empty
                if "Platform" in params:
                    platform = params["Platform"]
                elif "Platform" in ceDict:
                    platform = ceDict["Platform"]
                else:
                    platform = ""
                if platform and platform not in self.platforms:
                    self.platforms.append(platform)

                if platform and "Platform" not in params:
                    result = Resources.getDIRACPlatform(platform)
                    if result["OK"]:
                        params["Platform"] = result["Value"][0]

                # Full description handed to the endpoint factory: the CE
                # description, overridden by the VM type parameters
                ceVMTypeDict = dict(ceDict)
                ceVMTypeDict["CEName"] = ce
                ceVMTypeDict["VO"] = self.vo
                ceVMTypeDict["VMType"] = vmType
                ceVMTypeDict["RunningPod"] = self.runningPod
                ceVMTypeDict["CSServers"] = gConfig.getValue("/DIRAC/Configuration/Servers", [])
                ceVMTypeDict.update(params)

                # Allow a resource-specific CAPath to be set (as some clouds have their own CAs)
                # Otherwise fall back to the system-wide default(s)
                if "CAPath" not in ceVMTypeDict:
                    ceVMTypeDict["CAPath"] = gConfig.getValue(
                        "/DIRAC/Security/CAPath", "/opt/dirac/etc/grid-security/certificates/cas.pem"
                    )

                # Generate the CE object for the vmType or pick the already existing one
                # if the vmType definition did not change
                vmTypeHash = self.__generateVMTypeHash(ceVMTypeDict)
                if vmTypeName in self.vmTypeCECache and self.vmTypeCECache[vmTypeName]["Hash"] == vmTypeHash:
                    vmTypeCE = self.vmTypeCECache[vmTypeName]["CE"]
                else:
                    result = ceFactory.getCEObject(parameters=ceVMTypeDict)
                    if not result["OK"]:
                        return result
                    cacheEntry = self.vmTypeCECache.setdefault(vmTypeName, {})
                    cacheEntry["Hash"] = vmTypeHash
                    cacheEntry["CE"] = result["Value"]
                    vmTypeCE = cacheEntry["CE"]
                    vmTypeCE.setBootstrapParameters(opParameters)

                entry["CE"] = vmTypeCE
                entry["CEName"] = ce
                entry["CEType"] = ceDict["CEType"]
                entry["Site"] = site
                entry["VMType"] = vmType
                entry["Platform"] = platform
                entry["MaxInstances"] = ceDict["MaxInstances"]

                # An invalid endpoint stays in self.vmTypeDict, but its site
                # is not registered in self.sites
                if not vmTypeCE.isValid():
                    self.log.error("Failed to instantiate CloudEndpoint for %s" % vmTypeName)
                    continue

                if site not in self.sites:
                    self.sites.append(site)

    return S_OK()
def doCommand(self):
    """Return the WMSHistory 'NumberOfJobs' report, split into one plot per site.

    ``self.args`` must hold ``hours``, the length of the window (ending now,
    UTC) to report on. It may hold ``sites``, the sites of interest; without
    it (or with None) all the sites given by ``Resources.getSites()`` are used.

    :returns: S_OK with a dictionary ``site -> { 'data': { site: value },
              'granularity': ... }`` restricted to the sites of interest, or
              S_ERROR / the failing result of the underlying call.
    """
    if 'hours' not in self.args:
        return S_ERROR('Number of hours not specified')
    hours = self.args['hours']

    sites = self.args['sites'] if 'sites' in self.args else None

    if sites is None:
        # FIXME: pointing to the CSHelper instead (this used to be a
        # self.rsClient.getSite( meta = {'columns': 'SiteName'} ) query)
        knownSites = Resources.getSites()
        if not knownSites['OK']:
            return knownSites
        sites = knownSites['Value']

    if not sites:
        return S_ERROR('Sites is empty')

    fromD = datetime.utcnow() - timedelta(hours=hours)
    toD = datetime.utcnow()

    reportResult = self.rClient.getReport('WMSHistory', 'NumberOfJobs', fromD, toD, {}, 'Site')
    if not reportResult['OK']:
        return reportResult
    reportData = reportResult['Value']

    # Both keys are needed to build the per-site plots
    if 'data' not in reportData:
        return S_ERROR('Missing data key')
    if 'granularity' not in reportData:
        return S_ERROR('Missing granularity key')

    singlePlots = {}
    for site, value in reportData['data'].items():
        if site not in sites:
            continue
        singlePlots[site] = {
            'data': {site: value},
            'granularity': reportData['granularity'],
        }

    return S_OK(singlePlots)

################################################################################
#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF
def beginExecution(self):
    """Read the agent options and build the dictionary of queues to serve.

    Sets, among others, the community ( ``self.vo`` ), the eligible groups
    ( ``self.voGroups`` ), the pilot credentials and the submission limits,
    then asks ``Resources.getQueues`` for the "Direct" mode queues matching
    the "Site", "CETypes" and "CEs" options and hands them to
    ``self.getQueues``.

    :returns: S_OK(), or the failing result of the first call that fails
    """

    def _selection(optionName):
        # "Any" (default, case-insensitive) means no restriction: None.
        # Otherwise the option is read again, as a list.
        if self.am_getOption(optionName, "Any").lower() == "any":
            return None
        return self.am_getOption(optionName, [])

    self.gridEnv = self.am_getOption("GridEnv", getGridEnv())

    # The SiteDirector is for a particular user community
    self.vo = self.am_getOption("Community", "")
    if not self.vo:
        self.vo = CSGlobals.getVO()

    # The SiteDirector is for a particular user group
    self.group = self.am_getOption("Group", "")

    # self.voGroups contain all the eligible user groups for pilots submitted by this SiteDirector
    self.voGroups = []

    # Choose the group for which pilots will be submitted. This is a hack until
    # we will be able to match pilots to VOs.
    if self.group:
        self.voGroups = [self.group]
    elif self.vo:
        result = Registry.getGroupsForVO(self.vo)
        if not result["OK"]:
            return result
        self.voGroups.extend(
            group for group in result["Value"] if "NormalUser" in Registry.getPropertiesForGroup(group)
        )

    result = findGenericPilotCredentials(vo=self.vo)
    if not result["OK"]:
        return result
    self.pilotDN, self.pilotGroup = result["Value"]
    # The credentials found above are only defaults for the agent options
    self.pilotDN = self.am_getOption("PilotDN", self.pilotDN)
    self.pilotGroup = self.am_getOption("PilotGroup", self.pilotGroup)

    self.platforms = []
    self.sites = []

    # Submit pools: group option first, VO option otherwise
    self.defaultSubmitPools = ""
    if self.group:
        self.defaultSubmitPools = Registry.getGroupOption(self.group, "SubmitPools", "")
    elif self.vo:
        self.defaultSubmitPools = Registry.getVOOption(self.vo, "SubmitPools", "")

    self.pilot = self.am_getOption("PilotScript", DIRAC_PILOT)
    self.install = DIRAC_INSTALL
    self.workingDirectory = self.am_getOption("WorkDirectory")
    self.maxQueueLength = self.am_getOption("MaxQueueLength", 86400 * 3)
    self.pilotLogLevel = self.am_getOption("PilotLogLevel", "INFO")
    self.maxJobsInFillMode = self.am_getOption("MaxJobsInFillMode", self.maxJobsInFillMode)
    self.maxPilotsToSubmit = self.am_getOption("MaxPilotsToSubmit", self.maxPilotsToSubmit)
    self.pilotWaitingFlag = self.am_getOption("PilotWaitingFlag", True)
    self.pilotWaitingTime = self.am_getOption("MaxPilotWaitingTime", 7200)

    # Flags
    self.updateStatus = self.am_getOption("UpdatePilotStatus", True)
    self.getOutput = self.am_getOption("GetPilotOutput", True)
    self.sendAccounting = self.am_getOption("SendPilotAccounting", True)

    # Get the site description dictionary
    siteNames = _selection("Site")
    ceTypes = _selection("CETypes")
    ces = _selection("CEs")

    result = Resources.getQueues(
        community=self.vo, siteList=siteNames, ceList=ces, ceTypeList=ceTypes, mode="Direct"
    )
    if not result["OK"]:
        return result
    resourceDict = result["Value"]

    result = self.getQueues(resourceDict)
    if not result["OK"]:
        return result

    # Disabled fallback, kept for reference:
    # if not siteNames:
    #   siteName = gConfig.getValue( '/DIRAC/Site', 'Unknown' )
    #   if siteName == 'Unknown':
    #     return S_OK( 'No site specified for the SiteDirector' )
    #   else:
    #     siteNames = [siteName]
    # self.siteNames = siteNames

    if self.updateStatus:
        self.log.always("Pilot status update requested")
    if self.getOutput:
        self.log.always("Pilot output retrieval requested")
    if self.sendAccounting:
        self.log.always("Pilot accounting sending requested")

    summary = (
        ("Sites:", siteNames),
        ("CETypes:", ceTypes),
        ("CEs:", ces),
        ("PilotDN:", self.pilotDN),
        ("PilotGroup:", self.pilotGroup),
        ("MaxPilotsToSubmit:", self.maxPilotsToSubmit),
        ("MaxJobsInFillMode:", self.maxJobsInFillMode),
    )
    for label, value in summary:
        self.log.always(label, value)

    self.localhost = socket.getfqdn()
    self.proxy = ""

    if self.queueDict:
        self.log.always("Agent will serve queues:")
        for queue in self.queueDict:
            queueInfo = self.queueDict[queue]
            self.log.always("Site: %s, CE: %s, Queue: %s" % (queueInfo["Site"], queueInfo["CEName"], queue))

    return S_OK()
def main():
    """Check which computing queues match the job described in a JDL file.

    The single command line argument is the path of the JDL file. The
    "-F/--full-match" and "-S/--site=" switches are registered with the
    setFullMatch and setSites callbacks. For every queue available to the VO
    of the current proxy a table row is printed with the site, the CE, the
    queue, its status, whether the job matches and, if not, the reason.

    The script exits with -1 when the proxy, the queue descriptions, the
    site mask or the match data can not be obtained.
    """
    global fullMatch
    global sites
    Script.registerSwitch("F", "full-match", "Check all the matching criteria", setFullMatch)
    Script.registerSwitch("S:", "site=", "Check matching for these sites (comma separated list)", setSites)
    Script.registerArgument("job_JDL: file with job JDL description")
    _, args = Script.parseCommandLine(ignoreErrors=True)

    # Imported only after the command line has been parsed
    from DIRAC.Core.Security.ProxyInfo import getVOfromProxyGroup
    from DIRAC.ConfigurationSystem.Client.Helpers import Resources
    from DIRAC.Core.Utilities.PrettyPrint import printTable
    from DIRAC.ResourceStatusSystem.Client.ResourceStatus import ResourceStatus
    from DIRAC.ResourceStatusSystem.Client.SiteStatus import SiteStatus
    from DIRAC.WorkloadManagementSystem.Utilities.QueueUtilities import getQueuesResolved, matchQueue

    with open(args[0]) as f:
        jdl = f.read()

    # Get the current VO
    result = getVOfromProxyGroup()
    if not result["OK"]:
        gLogger.error("No proxy found, please login")
        DIRACExit(-1)
    voName = result["Value"]

    resultQueues = Resources.getQueues(siteList=sites, community=voName)
    if not resultQueues["OK"]:
        gLogger.error("Failed to get CE information")
        DIRACExit(-1)
    siteDict = resultQueues["Value"]

    result = getQueuesResolved(siteDict, {}, checkPlatform=True)
    # Check the result of getQueuesResolved itself: testing resultQueues
    # again here would let a failed resolution through
    if not result["OK"]:
        gLogger.error("Failed to get CE information")
        DIRACExit(-1)
    queueDict = result["Value"]

    # get list of usable sites within this cycle
    resultMask = SiteStatus().getUsableSites()
    if not resultMask["OK"]:
        gLogger.error("Failed to get Site mask information")
        DIRACExit(-1)
    siteMaskList = resultMask.get("Value", [])

    rssClient = ResourceStatus()

    fields = ("Site", "CE", "Queue", "Status", "Match", "Reason")
    records = []

    for queue, queueInfo in queueDict.items():
        site = queueInfo["Site"]
        ce = queueInfo["CEName"]

        # The CE status defaults to the site status; it is refined with the
        # status reported by the ResourceStatus client when its rssFlag is set
        siteStatus = "Active" if site in siteMaskList else "InActive"
        ceStatus = siteStatus
        if rssClient.rssFlag:
            result = rssClient.getElementStatus(ce, "ComputingElement")
            if result["OK"]:
                ceStatus = result["Value"][ce]["all"]

        result = matchQueue(jdl, queueInfo["ParametersDict"], fullMatch=fullMatch)
        if not result["OK"]:
            gLogger.error("Failed in getting match data", result["Message"])
            DIRACExit(-1)

        # A queue is reported as active only if both its site and its CE are
        status = "Active" if siteStatus == "Active" and ceStatus == "Active" else "Inactive"
        if result["Value"]["Match"]:
            records.append((site, ce, queueInfo["QueueName"], status, "Yes", ""))
        else:
            records.append((site, ce, queueInfo["QueueName"], status, "No", result["Value"]["Reason"]))

    gLogger.notice(printTable(fields, records, sortField="Site", columnSeparator="  ", printOut=False))
def getQueues(self, resourceDict):
    """Get the list of relevant CEs and their descriptions

    Rebuilds ``self.queueDict`` with one entry per ( CE, queue ) pair found in
    resourceDict, named ``<ce>_<queue>``. Sites whose full name can not be
    resolved are skipped. As a side effect ``self.platforms`` and
    ``self.sites`` are updated, the per-queue working directories are created
    and the 'Queues' key is removed from every CE description of resourceDict.

    :param resourceDict: ``{ site: { ce: ceDescription } }`` where every CE
                         description holds a 'Queues' dictionary
    :returns: S_OK(), or the failing result of the CE creation / validation
    """
    self.queueDict = {}
    ceFactory = ComputingElementFactory()

    for site in resourceDict:
        nameResult = self._resources.getSiteFullName(site)
        if not nameResult['OK']:
            continue
        siteFullName = nameResult['Value']

        for ce in resourceDict[site]:
            ceDict = resourceDict[site][ce]
            queues = ceDict.pop('Queues')

            for queue in queues:
                queueName = '%s_%s' % (ce, queue)

                # entry and params are the very objects stored in
                # self.queueDict: updating them updates the dictionary
                params = queues[queue]
                entry = {}
                self.queueDict[queueName] = entry
                entry['ParametersDict'] = params

                params['Queue'] = queue
                params['Site'] = siteFullName
                params['GridEnv'] = self.gridEnv
                params['Setup'] = gConfig.getValue('/DIRAC/Setup', 'unknown')

                # Evaluate the CPU limit of the queue according to the Glue convention
                # To Do: should be a utility
                if "maxCPUTime" in params and "SI00" in params:
                    # For some sites there are crazy values in the CS:
                    # the value is clamped to [ 0, 12.5 days ]
                    maxCPUTime = min(max(float(params['maxCPUTime']), 0), 86400 * 12.5)
                    si00 = float(params['SI00'])
                    queueCPUTime = 60. / 250. * maxCPUTime * si00
                    params['CPUTime'] = int(queueCPUTime)

                # One working directory per queue, created on demand
                qwDir = os.path.join(self.workingDirectory, queue)
                if not os.path.exists(qwDir):
                    os.makedirs(qwDir)
                params['WorkingDirectory'] = qwDir

                # Platform: queue value, else CE value, else built from the
                # CE architecture ( default 'x86_64' ) and OS, else empty
                if "Platform" in params:
                    platform = params['Platform']
                elif "Platform" in ceDict:
                    platform = ceDict['Platform']
                elif "OS" in ceDict:
                    architecture = ceDict.get('architecture', 'x86_64')
                    platform = '_'.join([architecture, ceDict['OS']])
                else:
                    platform = ''

                if platform and platform not in self.platforms:
                    self.platforms.append(platform)

                if platform and "Platform" not in params:
                    result = Resources.getDIRACPlatform(platform)
                    if result['OK']:
                        params['Platform'] = result['Value']

                # CE description, overridden by the queue parameters
                ceQueueDict = dict(ceDict)
                ceQueueDict.update(params)

                result = ceFactory.getCE(ceName=ce,
                                         ceType=ceDict['CEType'],
                                         ceParametersDict=ceQueueDict)
                if not result['OK']:
                    return result

                entry['CE'] = result['Value']
                entry['CEName'] = ce
                entry['CEType'] = ceDict['CEType']
                entry['Site'] = siteFullName
                entry['QueueName'] = queue
                entry['Platform'] = platform

                result = entry['CE'].isValid()
                if not result['OK']:
                    self.log.fatal(result['Message'])
                    return result

                # The mere presence of the option, at queue or CE level, is enough
                if 'BundleProxy' in params or 'BundleProxy' in ceDict:
                    entry['BundleProxy'] = True

                if siteFullName not in self.sites:
                    self.sites.append(siteFullName)

    return S_OK()
from DIRAC.ResourceStatusSystem.Client.ResourceStatus import ResourceStatus from DIRAC.ResourceStatusSystem.Client.SiteStatus import SiteStatus from DIRAC.WorkloadManagementSystem.Utilities.QueueUtilities import getQueuesResolved, matchQueue if __name__ == '__main__': with open(args[0]) as f: jdl = f.read() # Get the current VO result = getVOfromProxyGroup() if not result['OK']: gLogger.error('No proxy found, please login') DIRACExit(-1) voName = result['Value'] resultQueues = Resources.getQueues(siteList=sites, community=voName) if not resultQueues['OK']: gLogger.error('Failed to get CE information') DIRACExit(-1) siteDict = resultQueues['Value'] result = getQueuesResolved(siteDict) if not resultQueues['OK']: gLogger.error('Failed to get CE information') DIRACExit(-1) queueDict = result['Value'] # get list of usable sites within this cycle resultMask = SiteStatus().getUsableSites() if not resultMask['OK']: gLogger.error('Failed to get Site mask information') DIRACExit(-1)
def getQueues(self, resourceDict):
    """ Build self.queueDict from the site -> CE -> queue description

    For every queue a '<ce>_<queue>' entry is created holding the merged
    parameters, the ComputingElement object (re-used from self.queueCECache
    while the queue definition hash is unchanged) and a few summary keys.

    :param dict resourceDict: {site: {ce: ceDict}}, each ceDict carrying a
                              'Queues' sub-dictionary (popped from ceDict here)
    :return: S_OK(), or the failing result of the CE factory / CE validation
    """
    self.queueDict = {}
    ceFactory = ComputingElementFactory()

    for site in resourceDict:
        for ce in resourceDict[site]:
            ceDict = resourceDict[site][ce]
            ceTags = ceDict.get('Tag', [])
            pilotRunDirectory = ceDict.get('PilotRunDirectory', '')
            if isinstance(ceTags, basestring):
                ceTags = fromChar(ceTags)
            ceMaxRAM = ceDict.get('MaxRAM', None)
            qDict = ceDict.pop('Queues')

            for queue in qDict:
                queueName = '%s_%s' % (ce, queue)
                # params is the very same dict object as qDict[queue]
                params = qDict[queue]
                entry = {'ParametersDict': params}
                self.queueDict[queueName] = entry

                params['Queue'] = queue
                params['Site'] = site
                params['GridEnv'] = self.gridEnv
                params['Setup'] = gConfig.getValue('/DIRAC/Setup', 'unknown')

                # Evaluate the CPU limit of the queue according to the Glue convention
                # To Do: should be a utility
                if "maxCPUTime" in params and "SI00" in params:
                    # For some sites there are crazy values in the CS
                    maxCPUTime = min(max(float(params['maxCPUTime']), 0), 86400 * 12.5)
                    si00 = float(params['SI00'])
                    queueCPUTime = 60. / 250. * maxCPUTime * si00
                    params['CPUTime'] = int(queueCPUTime)

                # Tags: union of CE level and queue level tags
                queueTags = params.get('Tag')
                if queueTags and isinstance(queueTags, basestring):
                    queueTags = fromChar(queueTags)
                    params['Tag'] = queueTags
                if ceTags:
                    params['Tag'] = list(set(ceTags + queueTags)) if queueTags else ceTags

                # Queue level MaxRAM wins over the CE level one
                maxRAM = params.get('MaxRAM') or ceMaxRAM
                if maxRAM:
                    params['MaxRAM'] = maxRAM
                if pilotRunDirectory:
                    params['JobExecDir'] = pilotRunDirectory

                qwDir = os.path.join(self.workingDirectory, queue)
                mkDir(qwDir)
                params['WorkingDirectory'] = qwDir

                # Platform: queue setting, then CE setting, then <architecture>_<OS>
                platformInQueue = "Platform" in params
                platform = ''
                if platformInQueue:
                    platform = params['Platform']
                elif "Platform" in ceDict:
                    platform = ceDict['Platform']
                elif "OS" in ceDict:
                    architecture = ceDict.get('architecture', 'x86_64')
                    OS = ceDict['OS']
                    platform = '_'.join([architecture, OS])

                if platform and platform not in self.platforms:
                    self.platforms.append(platform)

                if platform and not platformInQueue:
                    result = Resources.getDIRACPlatform(platform)
                    if result['OK']:
                        params['Platform'] = result['Value'][0]

                ceQueueDict = dict(ceDict)
                ceQueueDict.update(params)

                # Generate the CE object for the queue or pick the already existing one
                # if the queue definition did not change
                queueHash = self.__generateQueueHash(ceQueueDict)
                if queueName in self.queueCECache and self.queueCECache[queueName]['Hash'] == queueHash:
                    queueCE = self.queueCECache[queueName]['CE']
                else:
                    result = ceFactory.getCE(ceName=ce,
                                             ceType=ceDict['CEType'],
                                             ceParametersDict=ceQueueDict)
                    if not result['OK']:
                        return result
                    cacheEntry = self.queueCECache.setdefault(queueName, {})
                    cacheEntry['Hash'] = queueHash
                    cacheEntry['CE'] = result['Value']
                    queueCE = cacheEntry['CE']

                entry['CE'] = queueCE
                entry['CEName'] = ce
                entry['CEType'] = ceDict['CEType']
                entry['Site'] = site
                entry['QueueName'] = queue
                entry['Platform'] = platform

                result = queueCE.isValid()
                if not result['OK']:
                    self.log.fatal(result['Message'])
                    return result

                # The queue level BundleProxy option shadows the CE level one
                bundleSource = params if 'BundleProxy' in params else ceDict
                if 'BundleProxy' in bundleSource:
                    if bundleSource['BundleProxy'].lower() in ['true','yes','1']:
                        entry['BundleProxy'] = True

                if site not in self.sites:
                    self.sites.append(site)

    return S_OK()
def beginExecution(self):
    """ Read the agent options, resolve the pilot credentials and build the
    queue dictionary for the sites / CEs this agent is configured for

    :return: S_OK(), or the first failing S_ERROR structure encountered
    """
    self.gridEnv = self.am_getOption("GridEnv", getGridEnv())

    # The SiteDirector is for a particular user community
    self.vo = (self.am_getOption("VO", '') or
               self.am_getOption("Community", '') or
               CSGlobals.getVO())

    # The SiteDirector is for a particular user group
    self.group = self.am_getOption("Group", '')

    # Choose the group for which pilots will be submitted. This is a hack until
    # we will be able to match pilots to VOs.
    if self.group:
        self.voGroups = [self.group]
    elif self.vo:
        result = Registry.getGroupsForVO(self.vo)
        if not result['OK']:
            return result
        self.voGroups = []
        for voGroup in result['Value']:
            if 'NormalUser' in Registry.getPropertiesForGroup(voGroup):
                self.voGroups.append(voGroup)

    result = findGenericPilotCredentials(vo=self.vo)
    if not result['OK']:
        return result
    genericDN, genericGroup = result['Value']
    self.pilotDN, self.pilotGroup = genericDN, genericGroup
    # Explicit agent options take precedence over the generic credentials
    self.pilotDN = self.am_getOption("PilotDN", self.pilotDN)
    self.pilotGroup = self.am_getOption("PilotGroup", self.pilotGroup)

    self.defaultSubmitPools = getSubmitPools(self.group, self.vo)

    self.pilot = self.am_getOption('PilotScript', DIRAC_PILOT)
    self.install = DIRAC_INSTALL
    self.extraModules = self.am_getOption('ExtraPilotModules', []) + DIRAC_MODULES
    self.workingDirectory = self.am_getOption('WorkDirectory')
    self.maxQueueLength = self.am_getOption('MaxQueueLength', 86400 * 3)
    self.pilotLogLevel = self.am_getOption('PilotLogLevel', 'INFO')
    self.maxJobsInFillMode = self.am_getOption('MaxJobsInFillMode', self.maxJobsInFillMode)
    self.maxPilotsToSubmit = self.am_getOption('MaxPilotsToSubmit', self.maxPilotsToSubmit)
    self.pilotWaitingFlag = self.am_getOption('PilotWaitingFlag', True)
    self.pilotWaitingTime = self.am_getOption('MaxPilotWaitingTime', 3600)
    self.failedQueueCycleFactor = self.am_getOption('FailedQueueCycleFactor', 10)
    self.pilotStatusUpdateCycleFactor = self.am_getOption('PilotStatusUpdateCycleFactor', 10)
    self.addPilotsToEmptySites = self.am_getOption('AddPilotsToEmptySites', False)

    # Flags
    self.updateStatus = self.am_getOption('UpdatePilotStatus', True)
    self.getOutput = self.am_getOption('GetPilotOutput', False)
    self.sendAccounting = self.am_getOption('SendPilotAccounting', True)

    # Get the site description dictionary; None stands for "no restriction"
    siteNames = None
    if self.am_getOption('Site', 'Any').lower() != "any":
        siteNames = self.am_getOption('Site', []) or None

    ceTypes = None
    if self.am_getOption('CETypes', 'Any').lower() != "any":
        ceTypes = self.am_getOption('CETypes', [])

    ces = None
    if self.am_getOption('CEs', 'Any').lower() != "any":
        ces = self.am_getOption('CEs', []) or None

    result = Resources.getQueues(community=self.vo,
                                 siteList=siteNames,
                                 ceList=ces,
                                 ceTypeList=ceTypes,
                                 mode='Direct')
    if not result['OK']:
        return result
    resourceDict = result['Value']

    result = self.getQueues(resourceDict)
    if not result['OK']:
        return result

    # Report the effective configuration
    if self.updateStatus:
        self.log.always('Pilot status update requested')
    if self.getOutput:
        self.log.always('Pilot output retrieval requested')
    if self.sendAccounting:
        self.log.always('Pilot accounting sending requested')

    self.log.always('Sites:', siteNames)
    self.log.always('CETypes:', ceTypes)
    self.log.always('CEs:', ces)
    self.log.always('PilotDN:', self.pilotDN)
    self.log.always('PilotGroup:', self.pilotGroup)
    self.log.always('MaxPilotsToSubmit:', self.maxPilotsToSubmit)
    self.log.always('MaxJobsInFillMode:', self.maxJobsInFillMode)

    self.localhost = socket.getfqdn()
    self.proxy = ''

    # The list of served queues is only printed on the first pass
    if self.firstPass and self.queueDict:
        self.log.always("Agent will serve queues:")
        for queue in self.queueDict:
            queueInfo = self.queueDict[queue]
            self.log.always("Site: %s, CE: %s, Queue: %s" % (queueInfo['Site'],
                                                             queueInfo['CEName'],
                                                             queue))
    self.firstPass = False
    return S_OK()
def submitJobs(self):
    """ Go through defined computing elements and submit jobs if necessary

    First the Matcher is asked whether any task queue matches the overall
    capabilities of the director. Then, for every queue in random order, the
    eligible task queues and the already waiting pilots are counted, the
    needed number of pilots is submitted and the new pilots are registered in
    the PilotAgentsDB, distributed over the task queues proportionally to
    their priorities.

    :return: S_OK(), or the S_ERROR structure of the first fatal failure
    """
    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # jobSites: sites explicitly requested by the waiting task queues
    # testSites: sites requested by task queues that declare "JobTypes"
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
        tqDescription = result['Value'][tqID]
        if "Sites" in tqDescription:
            for site in tqDescription['Sites']:
                if site.lower() != 'any':
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in tqDescription:
            if "Sites" in tqDescription:
                for site in tqDescription['Sites']:
                    if site.lower() != 'any':
                        testSites.add(site)
        totalWaitingJobs += tqDescription['Jobs']

    tqIDList = list(result['Value'])
    result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                        'Status': WAITING_PILOT_STATUS},
                                       None)
    totalWaitingPilots = 0
    if result['OK']:
        totalWaitingPilots = result['Value']
    self.log.info('Total %d jobs in %d task queues with %d waiting pilots' %
                  (totalWaitingJobs, len(tqIDList), totalWaitingPilots))

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
        return S_ERROR('Can not get the site mask')
    siteMaskList = result['Value']

    # list() keeps random.shuffle() working when keys() is a view object
    queues = list(self.queueDict)
    random.shuffle(queues)
    totalSubmittedPilots = 0
    for queue in queues:

        # Check if the queue failed previously
        failedCount = self.failedQueues.setdefault(queue, 0) % self.failedQueueCycleFactor
        if failedCount != 0:
            # Report the remaining cycles with the configured factor, the
            # same one that is used for the modulus above
            self.log.warn("%s queue failed recently, skipping %d cycles" %
                          (queue, self.failedQueueCycleFactor - failedCount))
            self.failedQueues[queue] += 1
            continue

        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        ceType = self.queueDict[queue]['CEType']
        queueName = self.queueDict[queue]['QueueName']
        siteName = self.queueDict[queue]['Site']
        platform = self.queueDict[queue]['Platform']
        siteMask = siteName in siteMaskList

        if not anySite and siteName not in jobSites:
            self.log.verbose("Skipping queue %s at site %s since no workload expected" %
                             (queueName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s at site %s not in the mask" %
                             (queueName, siteName))
            continue

        if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
            queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
        else:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        if not siteMask:
            # Sites outside the mask are only matched against test jobs
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools

        result = Resources.getCompatiblePlatforms(platform)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found for %s' % queue)
            continue

        totalTQJobs = 0
        tqIDList = list(taskQueueDict)
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]['Jobs']

        self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' %
                         (totalTQJobs, len(tqIDList), queue))

        # Get the number of already waiting pilots for these task queues
        totalWaitingPilots = 0
        if self.pilotWaitingFlag:
            lastUpdateTime = dateTime() - self.pilotWaitingTime * second
            result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                                'Status': WAITING_PILOT_STATUS},
                                               None, lastUpdateTime)
            if not result['OK']:
                self.log.error('Failed to get Number of Waiting pilots', result['Message'])
                totalWaitingPilots = 0
            else:
                totalWaitingPilots = result['Value']
                self.log.verbose('Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots)
        if totalWaitingPilots >= totalTQJobs:
            self.log.verbose("%d waiting pilots already for all the available jobs" %
                             totalWaitingPilots)
            continue

        self.log.verbose("%d waiting pilots for the total of %d eligible jobs for %s" %
                         (totalWaitingPilots, totalTQJobs, queue))

        # Get the working proxy
        cpuTime = queueCPUTime + 86400
        self.log.verbose("Getting pilot proxy for %s/%s %d long" %
                         (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
        if not result['OK']:
            return result
        self.proxy = result['Value']
        ce.setProxy(self.proxy, cpuTime - 60)

        # Get the number of available slots on the target site/queue
        totalSlots = self.__getQueueSlots(queue)
        if totalSlots == 0:
            continue

        pilotsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingPilots))
        self.log.info('%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' %
                      (queue, totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit))

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

        while pilotsToSubmit > 0:
            self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))

            bundleProxy = self.queueDict[queue].get('BundleProxy', False)
            jobExecDir = ''
            if ceType == 'CREAM':
                jobExecDir = '.'
            jobExecDir = self.queueDict[queue].get('JobExecDir', jobExecDir)
            httpProxy = self.queueDict[queue].get('HttpProxy', '')

            result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir)
            if not result['OK']:
                return result

            executable, pilotSubmissionChunk = result['Value']
            result = ce.submitJob(executable, '', pilotSubmissionChunk)
            os.unlink(executable)
            if not result['OK']:
                self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
                # Give up on this queue for the current cycle
                pilotsToSubmit = 0
                self.failedQueues[queue] += 1
                continue

            pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk

            # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
            # task queue priorities
            pilotList = result['Value']
            self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
            totalSubmittedPilots += len(pilotList)
            self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
            stampDict = result.get('PilotStampDict', {})

            # Cumulative priorities: each task queue owns the interval between
            # the previous cumulative value and its own
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append((tq, sumPriority))

            pilotsPerTQ = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                # Fall back to the last task queue when the draw matches no
                # interval (e.g. all priorities are 0); previously tqID was
                # then unbound or left over from an earlier iteration
                tqID = tqPriorityList[-1][0]
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                pilotsPerTQ.setdefault(tqID, []).append(pilotID)

            for tqID, tqPilotList in pilotsPerTQ.items():
                result = pilotAgentsDB.addPilotTQReference(tqPilotList,
                                                           tqID,
                                                           self.pilotDN,
                                                           self.pilotGroup,
                                                           self.localhost,
                                                           ceType,
                                                           '',
                                                           stampDict)
                if not result['OK']:
                    self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
                    continue
                for pilot in tqPilotList:
                    result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                          'Successfully submitted by the SiteDirector',
                                                          siteName, queueName)
                    if not result['OK']:
                        self.log.error('Failed to set pilot status: ', result['Message'])

    self.log.info("%d pilots submitted in total in this cycle" % totalSubmittedPilots)
    return S_OK()
def getQueues(self, resourceDict):
    """Get the list of relevant CEs and their descriptions

    Fills self.queueDict with one '<ce>_<queue>' entry per queue, holding the
    merged queue parameters, the ComputingElement object and summary keys.

    :param dict resourceDict: {site: {ce: ceDict}}, each ceDict carrying a
                              "Queues" sub-dictionary (popped from ceDict here)
    :return: S_OK(), or the failing result of the CE factory / CE validation
    """
    self.queueDict = {}
    ceFactory = ComputingElementFactory()

    for site in resourceDict:
        for ce in resourceDict[site]:
            ceDict = resourceDict[site][ce]
            qDict = ceDict.pop("Queues")
            for queue in qDict:
                queueName = "%s_%s" % (ce, queue)
                # params is the very same dict object as qDict[queue]
                params = qDict[queue]
                self.queueDict[queueName] = {"ParametersDict": params}
                params["Queue"] = queue
                params["Site"] = site
                params["GridEnv"] = self.gridEnv
                params["Setup"] = gConfig.getValue("/DIRAC/Setup", "unknown")

                # Evaluate the CPU limit of the queue according to the Glue convention
                # To Do: should be a utility
                if "maxCPUTime" in params and "SI00" in params:
                    maxCPUTime = float(params["maxCPUTime"])
                    # For some sites there are crazy values in the CS
                    maxCPUTime = max(maxCPUTime, 0)
                    maxCPUTime = min(maxCPUTime, 86400 * 12.5)
                    si00 = float(params["SI00"])
                    queueCPUTime = 60.0 / 250.0 * maxCPUTime * si00
                    params["CPUTime"] = int(queueCPUTime)

                qwDir = os.path.join(self.workingDirectory, queue)
                if not os.path.exists(qwDir):
                    os.makedirs(qwDir)
                params["WorkingDirectory"] = qwDir

                # Platform: queue setting, then CE setting, then <architecture>_<OS>
                platform = ""
                if "Platform" in params:
                    platform = params["Platform"]
                elif "Platform" in ceDict:
                    platform = ceDict["Platform"]
                elif "OS" in ceDict:
                    architecture = ceDict.get("architecture", "x86_64")
                    OS = ceDict["OS"]
                    platform = "_".join([architecture, OS])

                if platform and platform not in self.platforms:
                    self.platforms.append(platform)

                if "Platform" not in params and platform:
                    result = Resources.getDIRACPlatform(platform)
                    if result["OK"]:
                        params["Platform"] = result["Value"]

                ceQueueDict = dict(ceDict)
                ceQueueDict.update(params)
                result = ceFactory.getCE(ceName=ce, ceType=ceDict["CEType"], ceParametersDict=ceQueueDict)
                if not result["OK"]:
                    return result
                self.queueDict[queueName]["CE"] = result["Value"]
                self.queueDict[queueName]["CEName"] = ce
                self.queueDict[queueName]["CEType"] = ceDict["CEType"]
                self.queueDict[queueName]["Site"] = site
                self.queueDict[queueName]["QueueName"] = queue

                result = self.queueDict[queueName]["CE"].isValid()
                if not result["OK"]:
                    self.log.fatal(result["Message"])
                    return result

                # Honour the configured value: the mere presence of the option
                # used to switch bundling on, even for "BundleProxy = False".
                # The queue level option shadows the CE level one.
                bundleSource = params if "BundleProxy" in params else ceDict
                if "BundleProxy" in bundleSource:
                    if str(bundleSource["BundleProxy"]).lower() in ("true", "yes", "1"):
                        self.queueDict[queueName]["BundleProxy"] = True

                if site not in self.sites:
                    self.sites.append(site)

    return S_OK()
def export_getSites(self):
    '''
    Service entry point: log the request and hand back, unchanged,
    whatever Resources.getSites() returns.
    '''
    gLogger.info('getSites')
    sitesResult = Resources.getSites()
    return sitesResult
global ceName ceName = args def setSite( args ): global Site Site = args def setQueue( args ): global Queue Queue = args Script.registerSwitch( "N:", "Name=", "Computing Element Name (Mandatory)", setCEName ) Script.registerSwitch( "S:", "Site=", "Site Name (Mandatory)", setSite ) Script.registerSwitch( "Q:", "Queue=", "Queue Name (Mandatory)", setQueue ) Script.parseCommandLine( ignoreErrors = True ) args = Script.getExtraCLICFGFiles() if len( args ) > 1: Script.showHelp() exit( -1 ) result = Resources.getQueue( Site, ceName, Queue ) if not result['OK']: gLogger.error( "Could not retrieve resource parameters", ": " + result['Message'] ) DIRACExit( 1 ) gLogger.notice( json.dumps( result['Value'] ) )
def getImages(self, resourceDict):
    """ Build self.imageDict from the site -> CE -> image description

    For every image an '<ce>_<image>' entry is created holding the merged
    parameters, the endpoint object (re-used from self.imageCECache while the
    image definition hash is unchanged) and a few summary keys.

    :param dict resourceDict: {site: {ce: ceDict}}, each ceDict carrying an
                              'Images' sub-dictionary (popped from ceDict here)
    :return: S_OK(), or the failing result of the bootstrap parameters lookup
             or of the endpoint factory
    """
    self.imageDict = {}
    ceFactory = EndpointFactory()

    result = getPilotBootstrapParameters(vo=self.vo, runningPod=self.runningPod)
    if not result['OK']:
        return result
    opParameters = result['Value']

    for site in resourceDict:
        for ce in resourceDict[site]:
            ceDict = resourceDict[site][ce]
            ceTags = ceDict.get('Tag', [])
            if isinstance(ceTags, basestring):
                ceTags = fromChar(ceTags)
            ceMaxRAM = ceDict.get('MaxRAM', None)
            qDict = ceDict.pop('Images')

            for image in qDict:
                imageName = '%s_%s' % (ce, image)
                # params is the very same dict object as qDict[image]
                params = qDict[image]
                entry = {'ParametersDict': params}
                self.imageDict[imageName] = entry

                params['Image'] = image
                params['Site'] = site
                params['Setup'] = gConfig.getValue('/DIRAC/Setup', 'unknown')
                params['CPUTime'] = 99999999

                # Tags: union of CE level and image level tags
                imageTags = params.get('Tag')
                if imageTags and isinstance(imageTags, basestring):
                    imageTags = fromChar(imageTags)
                    params['Tag'] = imageTags
                if ceTags:
                    params['Tag'] = list(set(ceTags + imageTags)) if imageTags else ceTags

                # Image level MaxRAM wins over the CE level one
                maxRAM = params.get('MaxRAM') or ceMaxRAM
                if maxRAM:
                    params['MaxRAM'] = maxRAM

                # Platform: image setting first, then CE setting
                platformInImage = "Platform" in params
                platform = ''
                if platformInImage:
                    platform = params['Platform']
                elif "Platform" in ceDict:
                    platform = ceDict['Platform']

                if platform and platform not in self.platforms:
                    self.platforms.append(platform)

                if platform and not platformInImage:
                    result = Resources.getDIRACPlatform(platform)
                    if result['OK']:
                        params['Platform'] = result['Value'][0]

                # Later updates override earlier keys: CE description first,
                # then the image parameters, then the bootstrap parameters
                ceImageDict = dict(ceDict)
                ceImageDict['CEName'] = ce
                ceImageDict['VO'] = self.vo
                ceImageDict['Image'] = image
                ceImageDict['RunningPod'] = self.runningPod
                ceImageDict['CSServers'] = gConfig.getValue("/DIRAC/Configuration/Servers", [])
                ceImageDict.update(params)
                ceImageDict.update(opParameters)

                # Generate the CE object for the image or pick the already existing one
                # if the image definition did not change
                imageHash = self.__generateImageHash(ceImageDict)
                if imageName in self.imageCECache and self.imageCECache[imageName]['Hash'] == imageHash:
                    imageCE = self.imageCECache[imageName]['CE']
                else:
                    result = ceFactory.getCEObject(parameters=ceImageDict)
                    if not result['OK']:
                        return result
                    cacheEntry = self.imageCECache.setdefault(imageName, {})
                    cacheEntry['Hash'] = imageHash
                    cacheEntry['CE'] = result['Value']
                    imageCE = cacheEntry['CE']

                entry['CE'] = imageCE
                entry['CEName'] = ce
                entry['CEType'] = ceDict['CEType']
                entry['Site'] = site
                entry['ImageName'] = image
                entry['Platform'] = platform
                entry['MaxInstances'] = ceDict['MaxInstances']

                if not imageCE.isValid():
                    self.log.error('Failed to instantiate CloudEndpoint for %s' % imageName)
                    continue

                if site not in self.sites:
                    self.sites.append(site)

    return S_OK()
def submitJobs(self):
    """ Go through defined computing elements and submit jobs if necessary

    First the Matcher is asked whether any task queue matches the overall
    capabilities of the director. Then, for every queue in random order, the
    eligible task queues and the already waiting pilots are counted, the
    needed number of pilots is submitted and the new pilots are registered in
    the PilotAgentsDB, distributed over the task queues proportionally to
    their priorities.

    :return: S_OK(), or the S_ERROR structure of the first fatal failure
    """
    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tqDict['Tag'] = []

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # jobSites: sites explicitly requested by the waiting task queues
    # testSites: sites requested by task queues that declare "JobTypes"
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
        tqDescription = result['Value'][tqID]
        if "Sites" in tqDescription:
            for site in tqDescription['Sites']:
                if site.lower() != 'any':
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in tqDescription:
            if "Sites" in tqDescription:
                for site in tqDescription['Sites']:
                    if site.lower() != 'any':
                        testSites.add(site)
        totalWaitingJobs += tqDescription['Jobs']

    tqIDList = list(result['Value'])
    result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                        'Status': WAITING_PILOT_STATUS},
                                       None)
    totalWaitingPilots = 0
    if result['OK']:
        totalWaitingPilots = result['Value']
    self.log.info('Total %d jobs in %d task queues with %d waiting pilots' %
                  (totalWaitingJobs, len(tqIDList), totalWaitingPilots))

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
        return S_ERROR('Can not get the site mask')
    siteMaskList = result['Value']

    # list() keeps random.shuffle() working when keys() is a view object
    queues = list(self.queueDict)
    random.shuffle(queues)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

        # Check if the queue failed previously
        failedCount = self.failedQueues.setdefault(queue, 0) % self.failedQueueCycleFactor
        if failedCount != 0:
            # Report the remaining cycles with the configured factor, the
            # same one that is used for the modulus above
            self.log.warn("%s queue failed recently, skipping %d cycles" %
                          (queue, self.failedQueueCycleFactor - failedCount))
            self.failedQueues[queue] += 1
            continue

        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        ceType = self.queueDict[queue]['CEType']
        queueName = self.queueDict[queue]['QueueName']
        siteName = self.queueDict[queue]['Site']
        platform = self.queueDict[queue]['Platform']
        siteMask = siteName in siteMaskList

        if not anySite and siteName not in jobSites:
            self.log.verbose("Skipping queue %s at %s: no workload expected" %
                             (queueName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s at site %s not in the mask" %
                             (queueName, siteName))
            continue

        if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
            queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
        else:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        if not siteMask:
            # Sites outside the mask are only matched against test jobs
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools

        result = Resources.getCompatiblePlatforms(platform)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found for %s' % queue)
            continue

        matchedQueues += 1
        totalTQJobs = 0
        tqIDList = list(taskQueueDict)
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]['Jobs']

        self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' %
                         (totalTQJobs, len(tqIDList), queue))

        # Get the number of already waiting pilots for these task queues
        totalWaitingPilots = 0
        if self.pilotWaitingFlag:
            lastUpdateTime = dateTime() - self.pilotWaitingTime * second
            result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                                'Status': WAITING_PILOT_STATUS},
                                               None, lastUpdateTime)
            if not result['OK']:
                self.log.error('Failed to get Number of Waiting pilots', result['Message'])
                totalWaitingPilots = 0
            else:
                totalWaitingPilots = result['Value']
                self.log.verbose('Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots)
        if totalWaitingPilots >= totalTQJobs:
            self.log.verbose("%d waiting pilots already for all the available jobs" %
                             totalWaitingPilots)
            continue

        self.log.verbose("%d waiting pilots for the total of %d eligible jobs for %s" %
                         (totalWaitingPilots, totalTQJobs, queue))

        # Get the working proxy
        cpuTime = queueCPUTime + 86400
        self.log.verbose("Getting pilot proxy for %s/%s %d long" %
                         (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
        if not result['OK']:
            return result
        self.proxy = result['Value']
        ce.setProxy(self.proxy, cpuTime - 60)

        # Get the number of available slots on the target site/queue
        totalSlots = self.__getQueueSlots(queue)
        if totalSlots == 0:
            self.log.debug('%s: No slots available' % queue)
            continue

        pilotsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingPilots))
        self.log.info('%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' %
                      (queue, totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit))

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

        while pilotsToSubmit > 0:
            self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))

            bundleProxy = self.queueDict[queue].get('BundleProxy', False)
            jobExecDir = ''
            if ceType == 'CREAM':
                jobExecDir = '.'
            jobExecDir = self.queueDict[queue].get('JobExecDir', jobExecDir)
            httpProxy = self.queueDict[queue].get('HttpProxy', '')

            result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir)
            if not result['OK']:
                return result

            executable, pilotSubmissionChunk = result['Value']
            result = ce.submitJob(executable, '', pilotSubmissionChunk)
            os.unlink(executable)
            if not result['OK']:
                self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
                # Give up on this queue for the current cycle
                pilotsToSubmit = 0
                self.failedQueues[queue] += 1
                continue

            pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk

            # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
            # task queue priorities
            pilotList = result['Value']
            self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
            totalSubmittedPilots += len(pilotList)
            self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
            stampDict = result.get('PilotStampDict', {})

            # Cumulative priorities: each task queue owns the interval between
            # the previous cumulative value and its own
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append((tq, sumPriority))

            pilotsPerTQ = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                # Fall back to the last task queue when the draw matches no
                # interval (e.g. all priorities are 0); previously tqID was
                # then unbound or left over from an earlier iteration
                tqID = tqPriorityList[-1][0]
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                pilotsPerTQ.setdefault(tqID, []).append(pilotID)

            for tqID, tqPilotList in pilotsPerTQ.items():
                result = pilotAgentsDB.addPilotTQReference(tqPilotList,
                                                           tqID,
                                                           self.pilotDN,
                                                           self.pilotGroup,
                                                           self.localhost,
                                                           ceType,
                                                           '',
                                                           stampDict)
                if not result['OK']:
                    self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
                    continue
                for pilot in tqPilotList:
                    result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                          'Successfully submitted by the SiteDirector',
                                                          siteName, queueName)
                    if not result['OK']:
                        self.log.error('Failed to set pilot status: ', result['Message'])

    self.log.info("%d pilots submitted in total in this cycle, %d matched queues" %
                  (totalSubmittedPilots, matchedQueues))
    return S_OK()
def submitJobs(self):
    """ Go through defined computing elements and submit jobs if necessary

        First asks the Matcher whether any task queue is compatible with the
        director as a whole; if so, every configured queue is visited in random
        order and pilots are submitted for the task queues it can serve. Each
        submitted pilot is registered in the PilotAgentsDB under a task queue
        drawn at random with a probability proportional to its priority.

        :return: S_OK() when the cycle completes, otherwise the failing result
                 structure (platform resolution, Matcher call, proxy retrieval
                 or pilot executable generation)
    """
    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # Materialise the keys: random.shuffle() needs a mutable sequence,
    # which a dictionary view is not
    queues = list(self.queueDict)
    random.shuffle(queues)
    for queue in queues:
        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        ceType = self.queueDict[queue]['CEType']
        queueName = self.queueDict[queue]['QueueName']
        siteName = self.queueDict[queue]['Site']
        siteMask = self.siteStatus.isUsableSite(siteName, 'ComputingAccess')
        platform = self.queueDict[queue]['Platform']

        if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
            queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
        else:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Get the working proxy
        cpuTime = queueCPUTime + 86400
        self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
        if not result['OK']:
            return result
        self.proxy = result['Value']
        ce.setProxy(self.proxy, cpuTime - 60)

        # Get the number of available slots on the target site/queue
        result = ce.available()
        if not result['OK']:
            self.log.warn('Failed to check the availability of queue %s: \n%s' % (queue, result['Message']))
            continue
        ceInfoDict = result['CEInfoDict']
        self.log.info("CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" %
                      (ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                       ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs']))
        totalSlots = result['Value']

        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        if not siteMask and 'Site' in ceDict:
            self.log.info('Site not in the mask %s' % siteName)
            self.log.info('Removing "Site" from matching Dict')
            del ceDict['Site']
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools

        result = Resources.getCompatiblePlatforms(platform)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.info('No matching TQs found')
            continue

        totalTQJobs = 0
        tqIDList = list(taskQueueDict)
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]['Jobs']

        # Get the number of already waiting pilots for this queue
        totalWaitingPilots = 0
        if self.pilotWaitingFlag:
            lastUpdateTime = dateTime() - self.pilotWaitingTime * second
            result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                                'Status': WAITING_PILOT_STATUS},
                                               None, lastUpdateTime)
            if not result['OK']:
                self.log.error('Failed to get Number of Waiting pilots', result['Message'])
                totalWaitingPilots = 0
            else:
                totalWaitingPilots = result['Value']
                self.log.verbose('Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots)

        pilotsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingPilots))
        self.log.info('Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' %
                      (totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit))

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

        while pilotsToSubmit > 0:
            self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))

            bundleProxy = self.queueDict[queue].get('BundleProxy', False)
            jobExecDir = ''
            if ceType == 'CREAM':
                jobExecDir = '.'
            jobExecDir = self.queueDict[queue].get('JobExecDir', jobExecDir)
            httpProxy = self.queueDict[queue].get('HttpProxy', '')

            result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir)
            if not result['OK']:
                return result
            executable, pilotSubmissionChunk = result['Value']

            result = ce.submitJob(executable, '', pilotSubmissionChunk)
            os.unlink(executable)
            if not result['OK']:
                self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
                # Give up on this queue for the current cycle
                pilotsToSubmit = 0
                continue
            pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk

            # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
            # task queue priorities
            pilotList = result['Value']
            self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
            stampDict = {}
            if 'PilotStampDict' in result:
                stampDict = result['PilotStampDict']

            # Cumulative priorities: a task queue owns the interval between the
            # previous cumulative value and its own
            tqPriorityList = []
            sumPriority = 0.
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]['Priority']
                tqPriorityList.append((tq, sumPriority))

            pilotsByTQ = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                # Default to the last task queue so that the ID is always defined,
                # e.g. when all the priorities are zero and no interval matches
                tqID = tqPriorityList[-1][0]
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                pilotsByTQ.setdefault(tqID, []).append(pilotID)

            for tqID, tqPilots in pilotsByTQ.items():
                result = pilotAgentsDB.addPilotTQReference(tqPilots, tqID, self.pilotDN, self.pilotGroup,
                                                           self.localhost, ceType, '', stampDict)
                if not result['OK']:
                    self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
                    continue
                for pilot in tqPilots:
                    result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                          'Successfully submitted by the SiteDirector',
                                                          siteName, queueName)
                    if not result['OK']:
                        self.log.error('Failed to set pilot status: ', result['Message'])

    return S_OK()
def submitJobs(self):
    """ Go through defined computing elements and submit jobs if necessary

        The overall workload is checked first with the Matcher using the union
        of the tags of all the queues. Then every queue is visited in random
        order: queues that failed recently, queues at unusable sites or on
        unusable computing elements (when RSS is enabled) and queues without
        expected workload are skipped. For each processor tag shared by the
        queue and its matching task queues, pilots are submitted and registered
        in the PilotAgentsDB under a task queue drawn at random with a
        probability proportional to its priority.

        :return: S_OK() when the cycle completes, otherwise the failing result
                 structure (platform resolution, Matcher call, usable sites
                 lookup or proxy retrieval)
    """
    # Materialise the keys once: the list is iterated now and shuffled later,
    # and random.shuffle() needs a mutable sequence
    queues = list(self.queueDict)

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites

    tags = []
    for queue in queues:
        # A queue without tags contributes nothing instead of raising KeyError
        tags += self.queueDict[queue]['ParametersDict'].get('Tag', [])
    tqDict['Tag'] = list(set(tags))

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for matchedTQ in result['Value']:
        if "Sites" in result['Value'][matchedTQ]:
            for site in result['Value'][matchedTQ]['Sites']:
                if site.lower() != 'any':
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in result['Value'][matchedTQ]:
            if "Sites" in result['Value'][matchedTQ]:
                for site in result['Value'][matchedTQ]['Sites']:
                    if site.lower() != 'any':
                        testSites.add(site)
        totalWaitingJobs += result['Value'][matchedTQ]['Jobs']

    tqIDList = list(result['Value'])
    self.log.info(tqIDList)
    result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                        'Status': WAITING_PILOT_STATUS},
                                       None)
    totalWaitingPilots = 0
    if result['OK']:
        totalWaitingPilots = result['Value']
    self.log.info('Total %d jobs in %d task queues with %d waiting pilots' %
                  (totalWaitingJobs, len(tqIDList), totalWaitingPilots))
    self.log.info('Queues: ', queues)
    # if tagWaitingPilots >= totalWaitingJobs:
    #   self.log.info( 'No more pilots to be submitted in this cycle' )
    #   return S_OK()

    result = self.siteClient.getUsableSites()
    if not result['OK']:
        return result
    siteMaskList = result['Value']

    random.shuffle(queues)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

        # Check if the queue failed previously
        failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
        if failedCount != 0:
            # Report the remaining cycles using the configured factor
            self.log.warn("%s queue failed recently, skipping %d cycles" %
                          (queue, self.failedQueueCycleFactor - failedCount))
            self.failedQueues[queue] += 1
            continue

        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        ceType = self.queueDict[queue]['CEType']
        queueName = self.queueDict[queue]['QueueName']
        siteName = self.queueDict[queue]['Site']
        platform = self.queueDict[queue]['Platform']
        queueTags = self.queueDict[queue]['ParametersDict'].get('Tag', [])
        siteMask = siteName in siteMaskList
        processorTags = []

        # Check the status of the Site
        result = self.siteClient.getUsableSites(siteName)
        if not result['OK']:
            self.log.error("Can not get the status of site %s: %s" % (siteName, result['Message']))
            continue
        if siteName not in result.get('Value', []):
            self.log.info("site %s is not active" % siteName)
            continue

        if self.rssFlag:
            # Check the status of the ComputingElement
            result = self.rssClient.getElementStatus(ceName, "ComputingElement")
            if not result['OK']:
                self.log.error("Can not get the status of computing element",
                               " %s: %s" % (siteName, result['Message']))
                continue
            # An empty status report leaves the status undefined,
            # in which case the computing element is skipped
            ceStatus = None
            if result['Value']:
                # get the value of the status
                ceStatus = result['Value'][ceName]['all']
            if ceStatus not in ('Active', 'Degraded'):
                self.log.verbose("Skipping computing element %s at %s: resource not usable" % (ceName, siteName))
                continue

        for tag in queueTags:
            if re.match(r'^[0-9]+Processors$', tag):
                processorTags.append(tag)
        if 'WholeNode' in queueTags:
            processorTags.append('WholeNode')

        if not anySite and siteName not in jobSites:
            self.log.verbose("Skipping queue %s at %s: no workload expected" % (queueName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s at site %s not in the mask" % (queueName, siteName))
            continue

        if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
            queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
        else:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        # if not siteMask and 'Site' in ceDict:
        #   self.log.info( 'Site not in the mask %s' % siteName )
        #   self.log.info( 'Removing "Site" from matching Dict' )
        #   del ceDict[ 'Site' ]
        if not siteMask:
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools

        result = Resources.getCompatiblePlatforms(platform)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']
        ceDict['Tag'] = queueTags

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found for %s' % queue)
            continue

        matchedQueues += 1
        totalTQJobs = 0
        totalTQJobsByProcessors = {}
        tqIDListByProcessors = {}
        for tq in taskQueueDict:
            if 'Tags' not in taskQueueDict[tq]:
                # skip non multiprocessor tqs
                continue
            for tag in taskQueueDict[tq]['Tags']:
                if tag in processorTags:
                    tqIDListByProcessors.setdefault(tag, [])
                    tqIDListByProcessors[tag].append(tq)
                    totalTQJobsByProcessors.setdefault(tag, 0)
                    totalTQJobsByProcessors[tag] += taskQueueDict[tq]['Jobs']
            totalTQJobs += taskQueueDict[tq]['Jobs']

        self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' %
                         (totalTQJobs, len(taskQueueDict), queue))

        queueSubmittedPilots = 0
        for tag in tqIDListByProcessors:

            self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" % (tag, tqIDListByProcessors[tag]))

            # Number of processors requested by the tag; -1 stands for a whole node
            processors = 1
            m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
            if m:
                processors = int(m.group('processors'))
            if tag == 'WholeNode':
                processors = -1

            tagTQJobs = totalTQJobsByProcessors[tag]
            tagTqIDList = tqIDListByProcessors[tag]

            # Get the number of already waiting pilots for these task queues
            tagWaitingPilots = 0
            if self.pilotWaitingFlag:
                lastUpdateTime = dateTime() - self.pilotWaitingTime * second
                result = pilotAgentsDB.countPilots({'TaskQueueID': tagTqIDList,
                                                    'Status': WAITING_PILOT_STATUS},
                                                   None, lastUpdateTime)
                if not result['OK']:
                    self.log.error('Failed to get Number of Waiting pilots', result['Message'])
                    tagWaitingPilots = 0
                else:
                    tagWaitingPilots = result['Value']
                    self.log.verbose('Waiting Pilots for TaskQueue %s:' % tagTqIDList, tagWaitingPilots)

            if tagWaitingPilots >= tagTQJobs:
                self.log.verbose("%d waiting pilots already for all the available jobs" % tagWaitingPilots)
                continue

            self.log.verbose("%d waiting pilots for the total of %d eligible jobs for %s" %
                             (tagWaitingPilots, tagTQJobs, queue))

            # Get the working proxy
            cpuTime = queueCPUTime + 86400
            self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            totalSlots = self.getQueueSlots(queue, False)
            if totalSlots == 0:
                self.log.debug('%s: No slots available' % queue)
                continue

            # Note: comparing slots to job numbers is not accurate in multiprocessor case.
            # This could lead to over submission.
            pilotsToSubmit = max(0, min(totalSlots, tagTQJobs - tagWaitingPilots))
            self.log.info('%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' %
                          (queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit))

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit - queueSubmittedPilots, pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get('BundleProxy', False)
                jobExecDir = self.queueDict[queue]['ParametersDict'].get('JobExecDir', '')

                executable, pilotSubmissionChunk = self.getExecutable(queue, pilotsToSubmit,
                                                                      bundleProxy=bundleProxy,
                                                                      jobExecDir=jobExecDir,
                                                                      processors=processors)
                result = ce.submitJob(executable, '', pilotSubmissionChunk, processors=processors)
                # ## FIXME: The condor thing only transfers the file with some
                # ## delay, so when we unlink here the script is gone
                # ## FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
                if ceType != 'HTCondorCE':
                    os.unlink(executable)
                if not result['OK']:
                    self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
                    # Give up on this tag for the current cycle
                    pilotsToSubmit = 0
                    self.failedQueues[queue] += 1
                    continue

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                queueSubmittedPilots += pilotSubmissionChunk

                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result['Value']
                self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
                totalSubmittedPilots += len(pilotList)
                self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
                stampDict = {}
                if 'PilotStampDict' in result:
                    stampDict = result['PilotStampDict']

                # Cumulative priorities: a task queue owns the interval between
                # the previous cumulative value and its own
                tqPriorityList = []
                sumPriority = 0.
                for tq in tagTqIDList:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))

                pilotsByTQ = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    # Default to the last eligible task queue so that a pilot is never
                    # filed under an unrelated ID when no interval matches
                    # (e.g. all the priorities are zero)
                    tqID = tqPriorityList[-1][0]
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    pilotsByTQ.setdefault(tqID, []).append(pilotID)

                for tqID, tqPilots in pilotsByTQ.items():
                    result = pilotAgentsDB.addPilotTQReference(tqPilots, tqID, self.pilotDN, self.pilotGroup,
                                                               self.localhost, ceType, stampDict)
                    if not result['OK']:
                        self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
                        continue
                    for pilot in tqPilots:
                        result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                              'Successfully submitted by the SiteDirector',
                                                              siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: ', result['Message'])

    self.log.info("%d pilots submitted in total in this cycle, %d matched queues" %
                  (totalSubmittedPilots, matchedQueues))
    return S_OK()
def main():
    """ Report, for every queue available to the current VO, whether a job description matches it

        The single positional argument is the path of a file holding the job
        description. The ``full-match`` and ``site`` switches are stored by
        their callbacks in the module-level ``fullMatch`` and ``sites``
        variables. A table with one row per queue (site, CE, queue, status,
        match verdict and, for mismatches, the reason) is sent to the logger.
        The script exits through DIRACExit(-1) on any failed lookup.
    """
    global fullMatch
    global sites

    Script.registerSwitch("F", "full-match", "Check all the matching criteria", setFullMatch)
    Script.registerSwitch("S:", "site=", "Check matching for these sites (comma separated list)", setSites)
    Script.parseCommandLine(ignoreErrors=True)
    args = Script.getPositionalArgs()

    if not args:
        gLogger.error("Error: No job description provided")
        Script.showHelp(exitCode=1)

    # DIRAC modules are imported only after the command line has been parsed
    from DIRAC.Core.Security.ProxyInfo import getVOfromProxyGroup
    from DIRAC.ConfigurationSystem.Client.Helpers import Resources
    from DIRAC.Core.Utilities.PrettyPrint import printTable
    from DIRAC.ResourceStatusSystem.Client.ResourceStatus import ResourceStatus
    from DIRAC.ResourceStatusSystem.Client.SiteStatus import SiteStatus
    from DIRAC.WorkloadManagementSystem.Utilities.QueueUtilities import getQueuesResolved, matchQueue

    with open(args[0]) as f:
        jdl = f.read()

    # Get the current VO
    result = getVOfromProxyGroup()
    if not result['OK']:
        gLogger.error('No proxy found, please login')
        DIRACExit(-1)
    voName = result['Value']

    resultQueues = Resources.getQueues(siteList=sites, community=voName)
    if not resultQueues['OK']:
        gLogger.error('Failed to get CE information')
        DIRACExit(-1)
    siteDict = resultQueues['Value']

    result = getQueuesResolved(siteDict)
    # Check the outcome of the call just made, not the earlier getQueues() one
    if not result['OK']:
        gLogger.error('Failed to get CE information')
        DIRACExit(-1)
    queueDict = result['Value']

    # get list of usable sites within this cycle
    resultMask = SiteStatus().getUsableSites()
    if not resultMask['OK']:
        gLogger.error('Failed to get Site mask information')
        DIRACExit(-1)
    siteMaskList = resultMask.get('Value', [])

    rssClient = ResourceStatus()

    fields = ('Site', 'CE', 'Queue', 'Status', 'Match', 'Reason')
    records = []

    for queueInfo in queueDict.values():
        site = queueInfo['Site']
        ce = queueInfo['CEName']
        siteStatus = "Active" if site in siteMaskList else "InActive"
        # Without RSS, or when the RSS lookup fails, the CE inherits the site status
        ceStatus = siteStatus
        if rssClient.rssFlag:
            result = rssClient.getElementStatus(ce, "ComputingElement")
            if result['OK']:
                ceStatus = result['Value'][ce]['all']

        result = matchQueue(jdl, queueInfo, fullMatch=fullMatch)
        if not result['OK']:
            gLogger.error('Failed in getting match data', result['Message'])
            DIRACExit(-1)

        # The queue is reported as active only if both its site and its CE are
        status = "Active" if siteStatus == "Active" and ceStatus == "Active" else "Inactive"
        if result['Value']['Match']:
            records.append((site, ce, queueInfo['Queue'], status, 'Yes', ''))
        else:
            records.append((site, ce, queueInfo['Queue'], status, 'No', result['Value']['Reason']))

    gLogger.notice(printTable(fields, records, sortField='Site', columnSeparator=' ', printOut=False))
def submitJobs(self):
    """Go through defined computing elements and submit jobs if necessary

    First asks the Matcher whether any task queue is compatible with the
    director as a whole; if so, the site mask is fetched and every configured
    queue is visited in random order. Pilots are submitted for the task queues
    a queue can serve, and each submitted pilot is registered in the
    PilotAgentsDB under a task queue drawn at random with a probability
    proportional to its priority.

    :return: S_OK() when the cycle completes, S_ERROR if the site mask can
             not be obtained, otherwise the failing result structure
             (platform resolution, Matcher call, proxy retrieval or pilot
             executable generation)
    """
    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {"Setup": setup, "CPUTime": 9999999, "SubmitPool": self.defaultSubmitPools}
    if self.vo:
        tqDict["Community"] = self.vo
    if self.voGroups:
        tqDict["OwnerGroup"] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result["OK"]:
        return result
    tqDict["Platform"] = result["Value"]
    tqDict["Site"] = self.sites

    self.log.verbose("Checking overall TQ availability with requirements")
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result["OK"]:
        return result
    if not result["Value"]:
        self.log.verbose("No Waiting jobs suitable for the director")
        return S_OK()

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result["OK"]:
        return S_ERROR("Can not get the site mask")
    siteMaskList = result["Value"]

    # Materialise the keys: random.shuffle() needs a mutable sequence,
    # which a dictionary view is not
    queues = list(self.queueDict)
    random.shuffle(queues)
    for queue in queues:
        ce = self.queueDict[queue]["CE"]
        ceName = self.queueDict[queue]["CEName"]
        ceType = self.queueDict[queue]["CEType"]
        queueName = self.queueDict[queue]["QueueName"]
        siteName = self.queueDict[queue]["Site"]
        siteMask = siteName in siteMaskList

        if "CPUTime" in self.queueDict[queue]["ParametersDict"]:
            queueCPUTime = int(self.queueDict[queue]["ParametersDict"]["CPUTime"])
        else:
            self.log.warn("CPU time limit is not specified for queue %s, skipping..." % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Get the working proxy
        cpuTime = queueCPUTime + 86400
        self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
        result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
        if not result["OK"]:
            return result
        self.proxy = result["Value"]
        ce.setProxy(self.proxy, cpuTime - 60)

        # Get the number of available slots on the target site/queue
        result = ce.available()
        if not result["OK"]:
            self.log.warn("Failed to check the availability of queue %s: \n%s" % (queue, result["Message"]))
            continue
        ceInfoDict = result["CEInfoDict"]
        self.log.info(
            "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d"
            % (
                ceName,
                queueName,
                ceInfoDict["WaitingJobs"],
                ceInfoDict["RunningJobs"],
                ceInfoDict["SubmittedJobs"],
                ceInfoDict["MaxTotalJobs"],
            )
        )
        totalSlots = result["Value"]

        ceDict = ce.getParameterDict()
        ceDict["GridCE"] = ceName
        if not siteMask and "Site" in ceDict:
            self.log.info("Site not in the mask %s" % siteName)
            self.log.info('Removing "Site" from matching Dict')
            del ceDict["Site"]
        if self.vo:
            ceDict["Community"] = self.vo
        if self.voGroups:
            ceDict["OwnerGroup"] = self.voGroups

        # This is a hack to get rid of !
        ceDict["SubmitPool"] = self.defaultSubmitPools

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result["OK"]:
            continue
        ceDict["Platform"] = result["Value"]

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result["OK"]:
            self.log.error("Could not retrieve TaskQueues from TaskQueueDB", result["Message"])
            return result
        taskQueueDict = result["Value"]
        if not taskQueueDict:
            self.log.info("No matching TQs found")
            continue

        totalTQJobs = 0
        tqIDList = list(taskQueueDict)
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]["Jobs"]

        # Get the number of already waiting pilots for this queue
        totalWaitingPilots = 0
        if self.pilotWaitingFlag:
            lastUpdateTime = dateTime() - self.pilotWaitingTime * second
            result = pilotAgentsDB.countPilots(
                {"TaskQueueID": tqIDList, "Status": WAITING_PILOT_STATUS}, None, lastUpdateTime
            )
            if not result["OK"]:
                self.log.error("Failed to get Number of Waiting pilots", result["Message"])
                totalWaitingPilots = 0
            else:
                totalWaitingPilots = result["Value"]
                self.log.verbose("Waiting Pilots for TaskQueue %s:" % tqIDList, totalWaitingPilots)

        pilotsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingPilots))
        self.log.info(
            "Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d"
            % (totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit)
        )

        # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
        pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

        while pilotsToSubmit > 0:
            self.log.info("Going to submit %d pilots to %s queue" % (pilotsToSubmit, queue))

            bundleProxy = self.queueDict[queue].get("BundleProxy", False)
            jobExecDir = ""
            if ceType == "CREAM":
                jobExecDir = "."
            jobExecDir = self.queueDict[queue].get("JobExecDir", jobExecDir)
            httpProxy = self.queueDict[queue].get("HttpProxy", "")

            result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir)
            if not result["OK"]:
                return result
            executable, pilotSubmissionChunk = result["Value"]

            result = ce.submitJob(executable, "", pilotSubmissionChunk)
            os.unlink(executable)
            if not result["OK"]:
                self.log.error("Failed submission to queue %s:\n" % queue, result["Message"])
                # Give up on this queue for the current cycle
                pilotsToSubmit = 0
                continue
            pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk

            # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
            # task queue priorities
            pilotList = result["Value"]
            self.log.info("Submitted %d pilots to %s@%s" % (len(pilotList), queueName, ceName))
            stampDict = {}
            if "PilotStampDict" in result:
                stampDict = result["PilotStampDict"]

            # Cumulative priorities: a task queue owns the interval between the
            # previous cumulative value and its own
            tqPriorityList = []
            sumPriority = 0.0
            for tq in taskQueueDict:
                sumPriority += taskQueueDict[tq]["Priority"]
                tqPriorityList.append((tq, sumPriority))

            pilotsByTQ = {}
            for pilotID in pilotList:
                rndm = random.random() * sumPriority
                # Default to the last task queue so that the ID is always defined,
                # e.g. when all the priorities are zero and no interval matches
                tqID = tqPriorityList[-1][0]
                for tq, prio in tqPriorityList:
                    if rndm < prio:
                        tqID = tq
                        break
                pilotsByTQ.setdefault(tqID, []).append(pilotID)

            for tqID, tqPilots in pilotsByTQ.items():
                result = pilotAgentsDB.addPilotTQReference(
                    tqPilots, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, "", stampDict
                )
                if not result["OK"]:
                    self.log.error("Failed add pilots to the PilotAgentsDB: ", result["Message"])
                    continue
                for pilot in tqPilots:
                    result = pilotAgentsDB.setPilotStatus(
                        pilot,
                        "Submitted",
                        ceName,
                        "Successfully submitted by the SiteDirector",
                        siteName,
                        queueName,
                    )
                    if not result["OK"]:
                        self.log.error("Failed to set pilot status: ", result["Message"])

    return S_OK()
def setSite( args ):
  """ Switch callback: store the value of the Site switch in the global Site """
  global Site
  Site = args

def setQueue( args ):
  """ Switch callback: store the value of the Queue switch in the global Queue """
  global Queue
  Queue = args

Script.registerSwitch( "N:", "Name=", "Computing Element Name (Mandatory)", setCEName )
Script.registerSwitch( "S:", "Site=", "Site Name (Mandatory)", setSite )
Script.registerSwitch( "Q:", "Queue=", "Queue Name (Mandatory)", setQueue )

Script.parseCommandLine( ignoreErrors = True )
args = Script.getExtraCLICFGFiles()

if len( args ) > 1:
  Script.showHelp()
  # Leave through DIRACExit like the rest of the script: the bare exit() helper
  # is injected by the site module and is not meant to be used in programs
  DIRACExit( -1 )

# Look up the parameters of the requested queue and print them as JSON
result = Resources.getQueue( Site, ceName, Queue )

if not result['OK']:
  gLogger.error( "Could not retrieve resource parameters", ": " + result['Message'] )
  DIRACExit( 1 )
gLogger.notice( json.dumps( result['Value'] ) )
def createVMs(self):
    """ Go through defined computing elements and submit jobs if necessary

        The overall workload is checked first with the Matcher using the union
        of the tags of all the images. Then every image is visited in random
        order; for those with eligible task queues and free slots at their
        endpoint, VM instances are created, recorded in the VirtualMachineDB
        and one pilot reference per VM CPU is registered in the PilotAgentsDB
        under a task queue drawn at random with a probability proportional to
        its priority.

        :return: S_OK() when the cycle completes, S_ERROR if the site mask can
                 not be obtained, otherwise the failing result structure
                 (platform resolution, Matcher call or proxy retrieval)
    """
    # Materialise the keys once: the list is iterated now and shuffled later,
    # and random.shuffle() needs a mutable sequence
    images = list(self.imageDict)

    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999}
    if self.vo:
        tqDict['VO'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites

    tags = []
    for image in images:
        tags += self.imageDict[image]['ParametersDict'].get('Tags', [])
    tqDict['Tag'] = list(set(tags))
    tqDict['SubmitPool'] = "mpdPool"

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for matchedTQ in result['Value']:
        if "Sites" in result['Value'][matchedTQ]:
            for site in result['Value'][matchedTQ]['Sites']:
                if site.lower() != 'any':
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in result['Value'][matchedTQ]:
            if "Sites" in result['Value'][matchedTQ]:
                for site in result['Value'][matchedTQ]['Sites']:
                    if site.lower() != 'any':
                        testSites.add(site)
        totalWaitingJobs += result['Value'][matchedTQ]['Jobs']

    numTaskQueues = len(result['Value'])

    result = virtualMachineDB.getInstanceCounters('Status', {})
    totalVMs = 0
    if result['OK']:
        for status in result['Value']:
            if status in ['New', 'Submitted', 'Running']:
                totalVMs += result['Value'][status]
    self.log.info('Total %d jobs in %d task queues with %d VMs' % (totalWaitingJobs, numTaskQueues, totalVMs))

    # Check if the site is allowed in the mask
    result = jobDB.getSiteMask()
    if not result['OK']:
        return S_ERROR('Can not get the site mask')
    siteMaskList = result['Value']

    random.shuffle(images)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for image in images:

        # Check if the image failed previously
        # failedCount = self.failedImages[ image ] % self.failedImageCycleFactor
        # if failedCount != 0:
        #   self.log.warn( "%s queue failed recently, skipping %d cycles" % ( image, 10-failedCount ) )
        #   self.failedImages[image] += 1
        #   continue

        ce = self.imageDict[image]['CE']
        ceName = self.imageDict[image]['CEName']
        imageName = self.imageDict[image]['ImageName']
        siteName = self.imageDict[image]['Site']
        platform = self.imageDict[image]['Platform']
        imageTags = self.imageDict[image]['ParametersDict'].get('Tags', [])
        siteMask = siteName in siteMaskList
        endpoint = "%s::%s" % (siteName, ceName)
        maxInstances = int(self.imageDict[image]['MaxInstances'])

        processorTags = []
        for tag in imageTags:
            if re.match(r'^[0-9]+Processors$', tag):
                processorTags.append(tag)
        # vms support WholeNode naturally
        processorTags.append('WholeNode')

        if not anySite and siteName not in jobSites:
            self.log.verbose("Skipping queue %s at %s: no workload expected" % (imageName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s: site %s not in the mask" % (imageName, siteName))
            continue

        if 'CPUTime' in self.imageDict[image]['ParametersDict']:
            # The value is not used further on; the conversion is kept so that
            # a malformed CPUTime is still detected at this point
            imageCPUTime = int(self.imageDict[image]['ParametersDict']['CPUTime'])
        else:
            self.log.warn('CPU time limit is not specified for queue %s, skipping...' % image)
            continue

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()
        if not siteMask:
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['VO'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(platform)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']
        ceDict['Tag'] = processorTags

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found for %s' % image)
            continue

        matchedQueues += 1
        totalTQJobs = 0
        for tq in taskQueueDict:
            totalTQJobs += taskQueueDict[tq]['Jobs']

        self.log.verbose('%d job(s) from %d task queue(s) are eligible for %s queue' %
                         (totalTQJobs, len(taskQueueDict), image))

        # Get the number of already instantiated VMs for these task queues
        totalWaitingVMs = 0
        result = virtualMachineDB.getInstanceCounters('Status', {'Endpoint': endpoint})
        if result['OK']:
            for status in result['Value']:
                if status in ['New', 'Submitted']:
                    totalWaitingVMs += result['Value'][status]
        if totalWaitingVMs >= totalTQJobs:
            self.log.verbose("%d VMs already for all the available jobs" % totalWaitingVMs)

        self.log.verbose("%d VMs for the total of %d eligible jobs for %s" % (totalWaitingVMs, totalTQJobs, image))

        # Get the working proxy
        self.log.verbose("Getting cloud proxy for %s/%s" % (self.cloudDN, self.cloudGroup))
        result = gProxyManager.getPilotProxyFromDIRACGroup(self.cloudDN, self.cloudGroup, 3600)
        if not result['OK']:
            return result
        self.proxy = result['Value']
        # ce.setProxy( self.proxy, cpuTime - 60 )

        # Get the number of available slots on the target site/endpoint
        totalSlots = self.getVMInstances(endpoint, maxInstances)
        if totalSlots == 0:
            self.log.debug('%s: No slots available' % image)
            continue

        vmsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingVMs))
        self.log.info('%s: Slots=%d, TQ jobs=%d, VMs: %d, to submit=%d' %
                      (image, totalSlots, totalTQJobs, totalWaitingVMs, vmsToSubmit))

        # Limit the number of VM instances to create to vmsToSubmit
        vmsToSubmit = min(self.maxVMsToSubmit, vmsToSubmit)
        if vmsToSubmit <= 0:
            # Nothing to create: do not bother the endpoint with an empty request
            continue

        self.log.info('Going to submit %d VMs to %s queue' % (vmsToSubmit, image))
        result = ce.createInstances(vmsToSubmit)
        if not result['OK']:
            self.log.error('Failed submission to queue %s:\n' % image, result['Message'])
            self.failedImages.setdefault(image, 0)
            self.failedImages[image] += 1
            continue

        # Add VMs to the VirtualMachineDB
        vmDict = result['Value']
        totalSubmittedPilots += len(vmDict)
        self.log.info('Submitted %d VMs to %s@%s' % (len(vmDict), imageName, ceName))

        pilotList = []
        for uuID in vmDict:
            diracUUID = vmDict[uuID]['InstanceID']
            result = virtualMachineDB.insertInstance(uuID, imageName, diracUUID, endpoint, self.vo)
            if not result['OK']:
                # Report the failure instead of dropping the instance silently
                self.log.error('Failed to insert instance into the VirtualMachineDB', result['Message'])
                continue
            # One pilot reference per CPU of the virtual machine
            for ncpu in range(vmDict[uuID]['NumberOfCPUs']):
                pRef = 'vm://' + ceName + '/' + diracUUID + ':' + str(ncpu).zfill(2)
                pilotList.append(pRef)

        stampDict = {}

        # Cumulative priorities: a task queue owns the interval between the
        # previous cumulative value and its own
        tqPriorityList = []
        sumPriority = 0.
        for tq in taskQueueDict:
            sumPriority += taskQueueDict[tq]['Priority']
            tqPriorityList.append((tq, sumPriority))

        pilotsByTQ = {}
        for pilotID in pilotList:
            rndm = random.random() * sumPriority
            # Default to the last eligible task queue so that a pilot is never
            # filed under an unrelated ID when no interval matches
            # (e.g. all the priorities are zero)
            tqID = tqPriorityList[-1][0]
            for tq, prio in tqPriorityList:
                if rndm < prio:
                    tqID = tq
                    break
            pilotsByTQ.setdefault(tqID, []).append(pilotID)

        for tqID, tqPilots in pilotsByTQ.items():
            result = pilotAgentsDB.addPilotTQReference(tqPilots, tqID, '', '', self.localhost, 'Cloud', stampDict)
            if not result['OK']:
                self.log.error('Failed to insert pilots into the PilotAgentsDB', result['Message'])

    self.log.info("%d VMs submitted in total in this cycle, %d matched queues" %
                  (totalSubmittedPilots, matchedQueues))
    return S_OK()
def getQueues(self, resourceDict):
    """ Get the list of relevant CEs and their descriptions

    Rebuilds ``self.queueDict`` from scratch, creating one entry per
    ``<ce>_<queue>`` found in ``resourceDict``. Also extends
    ``self.platforms`` and ``self.sites`` and refreshes
    ``self.queueCECache``.

    :param dict resourceDict: mapping site -> CE name -> CE description.
        Each CE description must contain a 'Queues' dictionary; that key is
        popped from the description, so the argument is modified in place.
    :return: S_OK() on success, otherwise the failing result returned by
        the CE factory or by the ``isValid()`` call of the CE object.
    """
    self.queueDict = {}
    ceFactory = ComputingElementFactory()

    for site in resourceDict:
        for ce in resourceDict[site]:
            ceDict = resourceDict[site][ce]
            ceTags = ceDict.get('Tag')
            if isinstance(ceTags, basestring):
                ceTags = fromChar(ceTags)
            qDict = ceDict.pop('Queues')

            for queue in qDict:
                queueName = '%s_%s' % (ce, queue)
                # params is the very same object as the queue description in
                # qDict: it is completed in place, as the original code did.
                params = qDict[queue]
                self.queueDict[queueName] = {'ParametersDict': params}
                params['Queue'] = queue
                params['Site'] = site
                params['GridEnv'] = self.gridEnv
                params['Setup'] = gConfig.getValue('/DIRAC/Setup', 'unknown')

                # Evaluate the CPU limit of the queue according to the Glue convention
                # To Do: should be a utility
                if "maxCPUTime" in params and "SI00" in params:
                    maxCPUTime = float(params['maxCPUTime'])
                    # For some sites there are crazy values in the CS
                    maxCPUTime = max(maxCPUTime, 0)
                    maxCPUTime = min(maxCPUTime, 86400 * 12.5)
                    si00 = float(params['SI00'])
                    queueCPUTime = 60. / 250. * maxCPUTime * si00
                    params['CPUTime'] = int(queueCPUTime)

                queueTags = params.get('Tag')
                if queueTags and isinstance(queueTags, basestring):
                    queueTags = fromChar(queueTags)

                # Always build a NEW list for the queue tags. Binding the
                # shared ceTags list here and extending it below with the
                # memory tags would leak the tags of one queue into all the
                # following queues of the same CE.
                if ceTags and queueTags:
                    tags = list(set(ceTags + queueTags))
                else:
                    tags = list(queueTags or ceTags or [])

                maxMemory = params.get('MaxRAM', None)
                if maxMemory:
                    # MaxRAM value is supposed to be in MB
                    # Floor division keeps the range() bound an integer.
                    maxMemoryList = range(1, int(maxMemory) // 1000 + 1)
                    tags += ['%dGB' % mem for mem in maxMemoryList]

                # 'Tag' is always defined (possibly empty) so that consumers
                # of the queue description can rely on the key being there.
                params['Tag'] = tags

                qwDir = os.path.join(self.workingDirectory, queue)
                if not os.path.exists(qwDir):
                    os.makedirs(qwDir)
                params['WorkingDirectory'] = qwDir

                # Platform: queue setting first, then the CE one, finally a
                # value composed from the CE architecture and OS.
                platform = ''
                if "Platform" in params:
                    platform = params['Platform']
                elif "Platform" in ceDict:
                    platform = ceDict['Platform']
                elif "OS" in ceDict:
                    architecture = ceDict.get('architecture', 'x86_64')
                    OS = ceDict['OS']
                    platform = '_'.join([architecture, OS])
                if platform and platform not in self.platforms:
                    self.platforms.append(platform)

                if "Platform" not in params and platform:
                    result = Resources.getDIRACPlatform(platform)
                    if result['OK']:
                        params['Platform'] = result['Value'][0]

                # Queue parameters override the CE level ones
                ceQueueDict = dict(ceDict)
                ceQueueDict.update(params)

                # Generate the CE object for the queue or pick the already existing one
                # if the queue definition did not change
                queueHash = self.__generateQueueHash(ceQueueDict)
                cached = self.queueCECache.get(queueName)
                if cached is not None and cached['Hash'] == queueHash:
                    queueCE = cached['CE']
                else:
                    result = ceFactory.getCE(ceName=ce,
                                             ceType=ceDict['CEType'],
                                             ceParametersDict=ceQueueDict)
                    if not result['OK']:
                        return result
                    self.queueCECache.setdefault(queueName, {})
                    self.queueCECache[queueName]['Hash'] = queueHash
                    self.queueCECache[queueName]['CE'] = result['Value']
                    queueCE = self.queueCECache[queueName]['CE']

                self.queueDict[queueName]['CE'] = queueCE
                self.queueDict[queueName]['CEName'] = ce
                self.queueDict[queueName]['CEType'] = ceDict['CEType']
                self.queueDict[queueName]['Site'] = site
                self.queueDict[queueName]['QueueName'] = queue
                self.queueDict[queueName]['Platform'] = platform

                result = self.queueDict[queueName]['CE'].isValid()
                if not result['OK']:
                    self.log.fatal(result['Message'])
                    return result

                # The queue level BundleProxy option, when present, hides the
                # CE level one even if it does not evaluate to true.
                if 'BundleProxy' in params:
                    if params['BundleProxy'].lower() in ['true', 'yes', '1']:
                        self.queueDict[queueName]['BundleProxy'] = True
                elif 'BundleProxy' in ceDict:
                    if ceDict['BundleProxy'].lower() in ['true', 'yes', '1']:
                        self.queueDict[queueName]['BundleProxy'] = True

                if site not in self.sites:
                    self.sites.append(site)

    return S_OK()
def submitJobs(self):
    """ Go through defined computing elements and submit jobs if necessary

    First asks the Matcher whether there is any waiting workload compatible
    with the queues in ``self.queueDict``. Then, for each queue taken in
    random order and for each processor tag of the queue matched by some
    task queue, submits pilots and registers them in the PilotAgentsDB,
    distributing them over the task queues according to their priorities.

    :return: S_OK() when the cycle is completed (also when there is nothing
        to do), otherwise the failing result of the platform lookup, the
        Matcher, the site status client, the proxy manager or
        ``getExecutable``.
    """
    # Check that there is some work at all
    setup = CSGlobals.getSetup()
    tqDict = {'Setup': setup,
              'CPUTime': 9999999,
              'SubmitPool': self.defaultSubmitPools}
    if self.vo:
        tqDict['Community'] = self.vo
    if self.voGroups:
        tqDict['OwnerGroup'] = self.voGroups

    result = Resources.getCompatiblePlatforms(self.platforms)
    if not result['OK']:
        return result
    tqDict['Platform'] = result['Value']
    tqDict['Site'] = self.sites
    tags = []
    for queue in self.queueDict:
        # A queue description may have no 'Tag' entry at all
        tags += self.queueDict[queue]['ParametersDict'].get('Tag', [])
    tqDict['Tag'] = list(set(tags))

    self.log.verbose('Checking overall TQ availability with requirements')
    self.log.verbose(tqDict)

    rpcMatcher = RPCClient("WorkloadManagement/Matcher")
    result = rpcMatcher.getMatchingTaskQueues(tqDict)
    if not result['OK']:
        return result
    if not result['Value']:
        self.log.verbose('No Waiting jobs suitable for the director')
        return S_OK()

    # Collect the sites where the waiting workload can run
    jobSites = set()
    anySite = False
    testSites = set()
    totalWaitingJobs = 0
    for tqID in result['Value']:
        if "Sites" in result['Value'][tqID]:
            for site in result['Value'][tqID]['Sites']:
                if site.lower() != 'any':
                    jobSites.add(site)
                else:
                    anySite = True
        else:
            anySite = True
        if "JobTypes" in result['Value'][tqID]:
            if "Sites" in result['Value'][tqID]:
                for site in result['Value'][tqID]['Sites']:
                    if site.lower() != 'any':
                        testSites.add(site)
        totalWaitingJobs += result['Value'][tqID]['Jobs']

    tqIDList = list(result['Value'])
    self.log.info(tqIDList)
    result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                        'Status': WAITING_PILOT_STATUS},
                                       None)
    tagWaitingPilots = 0
    if result['OK']:
        tagWaitingPilots = result['Value']
    self.log.info('Total %d jobs in %d task queues with %d waiting pilots' %
                  (totalWaitingJobs, len(tqIDList), tagWaitingPilots))
    self.log.info('Queues: ', self.queueDict.keys())

    result = self.siteClient.getUsableSites()
    if not result['OK']:
        return result
    siteMaskList = result['Value']

    # Explicit list: random.shuffle() needs a mutable sequence
    queues = list(self.queueDict)
    random.shuffle(queues)
    totalSubmittedPilots = 0
    matchedQueues = 0
    for queue in queues:

        # Check if the queue failed previously
        failedCount = self.failedQueues[queue] % self.failedQueueCycleFactor
        if failedCount != 0:
            # The number of cycles left follows the configured factor
            self.log.warn("%s queue failed recently, skipping %d cycles" %
                          (queue, self.failedQueueCycleFactor - failedCount))
            self.failedQueues[queue] += 1
            continue

        ce = self.queueDict[queue]['CE']
        ceName = self.queueDict[queue]['CEName']
        ceType = self.queueDict[queue]['CEType']
        queueName = self.queueDict[queue]['QueueName']
        siteName = self.queueDict[queue]['Site']
        platform = self.queueDict[queue]['Platform']
        queueTags = self.queueDict[queue]['ParametersDict'].get('Tag', [])
        siteMask = siteName in siteMaskList
        processorTags = []

        # Check the status of the Site
        result = self.siteClient.getUsableSites(siteName)
        if not result['OK']:
            self.log.error("Can not get the status of site %s: %s" %
                           (siteName, result['Message']))
            continue
        if siteName not in result.get('Value', []):
            self.log.info("site %s is not active" % siteName)
            continue

        if self.rssFlag:
            # Check the status of the ComputingElement
            result = self.rssClient.getElementStatus(ceName, "ComputingElement")
            if not result['OK']:
                self.log.error("Can not get the status of computing element",
                               " %s: %s" % (siteName, result['Message']))
                continue
            if result['Value']:
                # get the value of the status
                result = result['Value'][ceName]['all']
            if result not in ('Active', 'Degraded'):
                self.log.verbose(
                    "Skipping computing element %s at %s: resource not usable" %
                    (ceName, siteName))
                continue

        for tag in queueTags:
            if re.match(r'^[0-9]+Processors$', tag):
                processorTags.append(tag)
        if 'WholeNode' in queueTags:
            processorTags.append('WholeNode')

        if not anySite and siteName not in jobSites:
            self.log.verbose("Skipping queue %s at %s: no workload expected" %
                             (queueName, siteName))
            continue
        if not siteMask and siteName not in testSites:
            self.log.verbose("Skipping queue %s at site %s not in the mask" %
                             (queueName, siteName))
            continue

        if 'CPUTime' in self.queueDict[queue]['ParametersDict']:
            queueCPUTime = int(self.queueDict[queue]['ParametersDict']['CPUTime'])
        else:
            self.log.warn(
                'CPU time limit is not specified for queue %s, skipping...' % queue)
            continue
        if queueCPUTime > self.maxQueueLength:
            queueCPUTime = self.maxQueueLength

        # Prepare the queue description to look for eligible jobs
        ceDict = ce.getParameterDict()
        ceDict['GridCE'] = ceName
        if not siteMask:
            ceDict['JobType'] = "Test"
        if self.vo:
            ceDict['Community'] = self.vo
        if self.voGroups:
            ceDict['OwnerGroup'] = self.voGroups

        # This is a hack to get rid of !
        ceDict['SubmitPool'] = self.defaultSubmitPools

        result = Resources.getCompatiblePlatforms(platform)
        if not result['OK']:
            continue
        ceDict['Platform'] = result['Value']
        ceDict['Tag'] = queueTags

        # Get the number of eligible jobs for the target site/queue
        result = rpcMatcher.getMatchingTaskQueues(ceDict)
        if not result['OK']:
            self.log.error('Could not retrieve TaskQueues from TaskQueueDB',
                           result['Message'])
            return result
        taskQueueDict = result['Value']
        if not taskQueueDict:
            self.log.verbose('No matching TQs found for %s' % queue)
            continue

        matchedQueues += 1
        totalTQJobs = 0
        totalTQJobsByProcessors = {}
        tqIDList = list(taskQueueDict)
        tqIDListByProcessors = {}
        for tq in taskQueueDict:
            if 'Tags' not in taskQueueDict[tq]:
                # skip non multiprocessor tqs
                continue
            for tag in taskQueueDict[tq]['Tags']:
                if tag in processorTags:
                    tqIDListByProcessors.setdefault(tag, [])
                    tqIDListByProcessors[tag].append(tq)
                    totalTQJobsByProcessors.setdefault(tag, 0)
                    totalTQJobsByProcessors[tag] += taskQueueDict[tq]['Jobs']
            totalTQJobs += taskQueueDict[tq]['Jobs']

        self.log.verbose(
            '%d job(s) from %d task queue(s) are eligible for %s queue' %
            (totalTQJobs, len(tqIDList), queue))

        queueSubmittedPilots = 0
        for tag in tqIDListByProcessors:

            self.log.verbose("Try to submit pilots for Tag=%s (TQs=%s)" %
                             (tag, tqIDListByProcessors[tag]))

            # Number of processors requested by the tag, -1 for a whole node
            processors = 1
            m = re.match(r'^(?P<processors>[0-9]+)Processors$', tag)
            if m:
                processors = int(m.group('processors'))
            if tag == 'WholeNode':
                processors = -1

            tagTQJobs = totalTQJobsByProcessors[tag]
            tagTqIDList = tqIDListByProcessors[tag]

            # Get the number of already waiting pilots for these task queues
            tagWaitingPilots = 0
            if self.pilotWaitingFlag:
                lastUpdateTime = dateTime() - self.pilotWaitingTime * second
                result = pilotAgentsDB.countPilots({'TaskQueueID': tagTqIDList,
                                                    'Status': WAITING_PILOT_STATUS},
                                                   None, lastUpdateTime)
                if not result['OK']:
                    self.log.error('Failed to get Number of Waiting pilots',
                                   result['Message'])
                    tagWaitingPilots = 0
                else:
                    tagWaitingPilots = result['Value']
                    self.log.verbose(
                        'Waiting Pilots for TaskQueue %s:' % tagTqIDList,
                        tagWaitingPilots)
            if tagWaitingPilots >= tagTQJobs:
                self.log.verbose(
                    "%d waiting pilots already for all the available jobs" %
                    tagWaitingPilots)
                continue

            self.log.verbose(
                "%d waiting pilots for the total of %d eligible jobs for %s" %
                (tagWaitingPilots, tagTQJobs, queue))

            # Get the working proxy
            cpuTime = queueCPUTime + 86400
            self.log.verbose("Getting pilot proxy for %s/%s %d long" %
                             (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.pilotDN, self.pilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            totalSlots = self.getQueueSlots(queue, False)
            if totalSlots == 0:
                self.log.debug('%s: No slots available' % queue)
                continue

            # Note: comparing slots to job numbers is not accurate in multiprocessor case.
            # This could lead to over submission.
            pilotsToSubmit = max(0, min(totalSlots, tagTQJobs - tagWaitingPilots))
            self.log.info(
                '%s: Slots=%d, TQ jobs=%d, Pilots: waiting %d, to submit=%d' %
                (queue, totalSlots, tagTQJobs, tagWaitingPilots, pilotsToSubmit))

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit - queueSubmittedPilots,
                                 pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' %
                              (pilotsToSubmit, queue))

                bundleProxy = self.queueDict[queue].get('BundleProxy', False)
                jobExecDir = self.queueDict[queue]['ParametersDict'].get(
                    'JobExecDir', '')
                httpProxy = self.queueDict[queue]['ParametersDict'].get(
                    'HttpProxy', '')

                result = self.getExecutable(queue, pilotsToSubmit, bundleProxy,
                                            httpProxy, jobExecDir)
                if not result['OK']:
                    return result

                executable, pilotSubmissionChunk = result['Value']
                result = ce.submitJob(executable, '', pilotSubmissionChunk,
                                      processors=processors)
                # FIXME: The condor thing only transfers the file with some
                # delay, so when we unlink here the script is gone
                # FIXME 2: but at some time we need to clean up the pilot wrapper scripts...
                if ceType != 'HTCondorCE':
                    os.unlink(executable)
                if not result['OK']:
                    self.log.error('Failed submission to queue %s:\n' % queue,
                                   result['Message'])
                    # Leaves the while loop and penalizes the queue
                    pilotsToSubmit = 0
                    self.failedQueues[queue] += 1
                    continue

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                queueSubmittedPilots += pilotSubmissionChunk

                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result['Value']
                self.queueSlots[queue]['AvailableSlots'] -= len(pilotList)
                totalSubmittedPilots += len(pilotList)
                self.log.info('Submitted %d pilots to %s@%s' %
                              (len(pilotList), queueName, ceName))
                stampDict = {}
                if 'PilotStampDict' in result:
                    stampDict = result['PilotStampDict']

                # Cumulative priorities used for the weighted random choice
                tqPriorityList = []
                sumPriority = 0.
                for tq in tagTqIDList:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))

                pilotsByTQ = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    # Fall back to the last task queue: if no cumulative
                    # priority exceeds the draw (e.g. all priorities are 0)
                    # tqID must not keep a value left over from a previous
                    # loop.
                    tqID = tqPriorityList[-1][0]
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    pilotsByTQ.setdefault(tqID, []).append(pilotID)

                for tqID, tqPilots in pilotsByTQ.items():
                    result = pilotAgentsDB.addPilotTQReference(
                        tqPilots, tqID, self.pilotDN, self.pilotGroup,
                        self.localhost, ceType, '', stampDict)
                    if not result['OK']:
                        self.log.error('Failed add pilots to the PilotAgentsDB: ',
                                       result['Message'])
                        continue
                    for pilot in tqPilots:
                        result = pilotAgentsDB.setPilotStatus(
                            pilot, 'Submitted', ceName,
                            'Successfully submitted by the SiteDirector',
                            siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: ',
                                           result['Message'])
                            continue

    self.log.info(
        "%d pilots submitted in total in this cycle, %d matched queues" %
        (totalSubmittedPilots, matchedQueues))
    return S_OK()
def beginExecution(self):
    """ Read the agent options and rebuild the description of the served queues

    Sets the community, the eligible groups, the pilot credentials and the
    submission parameters from the agent options, then asks the
    configuration for the matching queues and passes them to
    ``self.getQueues``.

    :return: S_OK() on success, otherwise the failing result of the group
        lookup, of the pilot credentials lookup, of ``Resources.getQueues``
        or of ``self.getQueues``.
    """
    self.gridEnv = self.am_getOption("GridEnv", getGridEnv())

    # The SiteDirector is for a particular user community: "VO" option
    # first, then "Community", finally the VO of the installation
    self.vo = (self.am_getOption("VO", '') or
               self.am_getOption("Community", '') or
               CSGlobals.getVO())

    # The SiteDirector is for a particular user group
    self.group = self.am_getOption("Group", '')

    # self.voGroups contains all the eligible user groups for pilots
    # submitted by this SiteDirector.
    # Choose the group for which pilots will be submitted. This is a hack until
    # we will be able to match pilots to VOs.
    self.voGroups = []
    if self.group:
        self.voGroups = [self.group]
    elif self.vo:
        groupsResult = Registry.getGroupsForVO(self.vo)
        if not groupsResult['OK']:
            return groupsResult
        self.voGroups = [voGroup for voGroup in groupsResult['Value']
                         if 'NormalUser' in Registry.getPropertiesForGroup(voGroup)]

    # Generic pilot credentials, possibly overridden by the agent options
    credResult = findGenericPilotCredentials(vo=self.vo)
    if not credResult['OK']:
        return credResult
    genericDN, genericGroup = credResult['Value']
    self.pilotDN = self.am_getOption("PilotDN", genericDN)
    self.pilotGroup = self.am_getOption("PilotGroup", genericGroup)

    self.platforms = []
    self.sites = []

    # Submit pools come from the group when it is given, else from the VO
    if self.group:
        self.defaultSubmitPools = Registry.getGroupOption(self.group, 'SubmitPools', '')
    elif self.vo:
        self.defaultSubmitPools = Registry.getVOOption(self.vo, 'SubmitPools', '')
    else:
        self.defaultSubmitPools = ''

    self.pilot = self.am_getOption('PilotScript', DIRAC_PILOT)
    self.install = DIRAC_INSTALL
    self.extraModules = self.am_getOption('ExtraPilotModules', []) + DIRAC_MODULES
    self.workingDirectory = self.am_getOption('WorkDirectory')
    self.maxQueueLength = self.am_getOption('MaxQueueLength', 86400 * 3)
    self.pilotLogLevel = self.am_getOption('PilotLogLevel', 'INFO')
    self.maxJobsInFillMode = self.am_getOption('MaxJobsInFillMode', self.maxJobsInFillMode)
    self.maxPilotsToSubmit = self.am_getOption('MaxPilotsToSubmit', self.maxPilotsToSubmit)
    self.pilotWaitingFlag = self.am_getOption('PilotWaitingFlag', True)
    self.pilotWaitingTime = self.am_getOption('MaxPilotWaitingTime', 3600)
    self.failedQueueCycleFactor = self.am_getOption('FailedQueueCycleFactor', 10)
    self.pilotStatusUpdateCycleFactor = self.am_getOption('PilotStatusUpdateCycleFactor', 10)

    # Flags
    self.updateStatus = self.am_getOption('UpdatePilotStatus', True)
    self.getOutput = self.am_getOption('GetPilotOutput', True)
    self.sendAccounting = self.am_getOption('SendPilotAccounting', True)

    # Get the site description dictionary; "any" means no restriction
    siteNames = None
    if self.am_getOption('Site', 'Any').lower() != "any":
        siteNames = self.am_getOption('Site', []) or None
    ceTypes = None
    if self.am_getOption('CETypes', 'Any').lower() != "any":
        ceTypes = self.am_getOption('CETypes', [])
    ces = None
    if self.am_getOption('CEs', 'Any').lower() != "any":
        ces = self.am_getOption('CEs', []) or None

    queuesResult = Resources.getQueues(community=self.vo,
                                       siteList=siteNames,
                                       ceList=ces,
                                       ceTypeList=ceTypes,
                                       mode='Direct')
    if not queuesResult['OK']:
        return queuesResult
    buildResult = self.getQueues(queuesResult['Value'])
    if not buildResult['OK']:
        return buildResult

    # Report the requested optional activities
    for requested, message in ((self.updateStatus, 'Pilot status update requested'),
                               (self.getOutput, 'Pilot output retrieval requested'),
                               (self.sendAccounting, 'Pilot accounting sending requested')):
        if requested:
            self.log.always(message)

    # Report the effective configuration
    for label, value in (('Sites:', siteNames),
                         ('CETypes:', ceTypes),
                         ('CEs:', ces),
                         ('PilotDN:', self.pilotDN),
                         ('PilotGroup:', self.pilotGroup),
                         ('MaxPilotsToSubmit:', self.maxPilotsToSubmit),
                         ('MaxJobsInFillMode:', self.maxJobsInFillMode)):
        self.log.always(label, value)

    self.localhost = socket.getfqdn()
    self.proxy = ''

    # The list of served queues is only printed in the first cycle
    if self.firstPass and self.queueDict:
        self.log.always("Agent will serve queues:")
        for queue, description in self.queueDict.items():
            self.log.always("Site: %s, CE: %s, Queue: %s" %
                            (description['Site'], description['CEName'], queue))
    self.firstPass = False
    return S_OK()
def doCommand( self ): """ # Returns simple pilots efficiency # # :attr:`args`: # - args[0]: string - should be a ValidElement # # - args[1]: string - should be the name of the ValidElement # # returns: # { # 'Result': 'Good'|'Fair'|'Poor'|'Idle'|'Bad' # } """ if not 'element' in self.args: return self.returnERROR( S_ERROR( 'element is missing' ) ) element = self.args[ 'element' ] if not 'siteName' in self.args: return self.returnERROR( S_ERROR( 'siteName is missing' ) ) siteName = self.args[ 'siteName' ] # If siteName is None, we take all sites if siteName is None: siteName = Resources.getSites() if not siteName[ 'OK' ]: return self.returnERROR( siteName ) siteName = siteName[ 'Value' ] if element == 'Site': results = self.wmsAdmin.getPilotSummaryWeb( { 'GridSite' : siteName }, [], 0, 300 ) elif element == 'Resource': results = self.wmsAdmin.getPilotSummaryWeb( { 'ExpandSite' : siteName }, [], 0, 300 ) else: return self.returnERROR( S_ERROR( '%s is a wrong element' % element ) ) if not results[ 'OK' ]: return self.returnERROR( results ) results = results[ 'Value' ] if not 'ParameterNames' in results: return self.returnERROR( S_ERROR( 'Malformed result dictionary' ) ) params = results[ 'ParameterNames' ] if not 'Records' in results: return self.returnERROR( S_ERROR( 'Malformed result dictionary' ) ) records = results[ 'Records' ] pilotResults = [] for record in records: pilotDict = dict( zip( params , record )) try: pilotDict[ 'PilotsPerJob' ] = float( pilotDict[ 'PilotsPerJob' ] ) pilotDict[ 'PilotsJobEff' ] = float( pilotDict[ 'PilotsJobEff' ] ) except KeyError, e: return self.returnERROR( S_ERROR( e ) ) except ValueError, e: return self.returnERROR( S_ERROR( e ) )