def _updateSiteMask(self, sitesData):
    """Stamp every site record in ``sitesData`` with its site-mask status.

    Each site's record receives a ``'siteMaskStatus'`` entry set to
    ``'Allowed'`` when ``SiteStatus().isUsableSite`` reports the site usable
    for ``'ComputingAccess'`` and to ``'Banned'`` otherwise.

    :param sitesData: mapping of site name to a per-site dictionary; the
        per-site dictionaries are updated in place
    :return: S_OK( sitesData )
    """
    usability = SiteStatus()
    # Work on a shallow copy so the loop walks a snapshot of the site names.
    snapshot = dict(sitesData)

    for siteName, record in snapshot.items():
        #
        # FIXME: we are only taking into account ComputingAccess
        #
        record['siteMaskStatus'] = (
            'Allowed' if usability.isUsableSite(siteName, 'ComputingAccess') else 'Banned'
        )
        sitesData[siteName]['siteMaskStatus'] = record['siteMaskStatus']

    return S_OK(sitesData)
def _updateSiteMask(self, sitesData):
    """Annotate the given sites with their current mask status.

    For every site found in ``sitesData`` the ``'siteMaskStatus'`` key of its
    record is written: ``'Allowed'`` if the site is usable for
    ``'ComputingAccess'`` according to ``SiteStatus``, ``'Banned'`` if not.

    :param sitesData: dictionary keyed by site name whose values are
        dictionaries (modified in place)
    :return: S_OK carrying the same ``sitesData`` object
    """
    siteStatus = SiteStatus()
    maskBySite = dict(sitesData)  # shallow copy, used as the iteration snapshot

    for site in maskBySite:
        #
        # FIXME: we are only taking into account ComputingAccess
        #
        verdict = 'Banned'
        if siteStatus.isUsableSite(site, 'ComputingAccess'):
            verdict = 'Allowed'
        maskBySite[site]['siteMaskStatus'] = verdict
        sitesData[site]['siteMaskStatus'] = maskBySite[site]['siteMaskStatus']

    return S_OK(sitesData)
class SiteDirector(AgentModule):
    """Agent that submits pilots to the configured CE queues and follows them up.

    The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
        for the agent restart
    """

    def initialize(self):
        """Set default agent options and create the per-instance state.

        :return: S_OK()
        """
        self.am_setOption("PollingTime", 60.0)
        self.am_setOption("maxPilotWaitingHours", 6)
        self.queueDict = {}
        self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE
        self.maxPilotsToSubmit = MAX_PILOTS_TO_SUBMIT
        self.siteStatus = SiteStatus()
        return S_OK()

    def beginExecution(self):
        """Read the agent options and build the dictionary of queues to serve.

        Resolves the community / group, the generic pilot credentials, the
        submission limits and flags, and then calls getQueues() with the
        eligible queues returned by the Resources helper.

        :return: S_OK() on success, otherwise the failing S_ERROR structure
        """
        self.gridEnv = self.am_getOption("GridEnv", getGridEnv())
        # The SiteDirector is for a particular user community
        self.vo = self.am_getOption("Community", '')
        if not self.vo:
            self.vo = CSGlobals.getVO()
        # The SiteDirector is for a particular user group
        self.group = self.am_getOption("Group", '')
        # self.voGroups contain all the eligible user groups for pilots submitted by this SiteDirector
        self.voGroups = []

        # Choose the group for which pilots will be submitted. This is a hack until
        # we will be able to match pilots to VOs.
        if not self.group:
            if self.vo:
                result = Registry.getGroupsForVO(self.vo)
                if not result['OK']:
                    return result
                for group in result['Value']:
                    if 'NormalUser' in Registry.getPropertiesForGroup(group):
                        self.voGroups.append(group)
        else:
            self.voGroups = [self.group]

        result = findGenericPilotCredentials(vo=self.vo)
        if not result['OK']:
            return result
        self.pilotDN, self.pilotGroup = result['Value']
        # Explicit agent options override the generic credentials
        self.pilotDN = self.am_getOption("PilotDN", self.pilotDN)
        self.pilotGroup = self.am_getOption("PilotGroup", self.pilotGroup)

        self.platforms = []
        self.sites = []
        self.defaultSubmitPools = ''
        if self.group:
            self.defaultSubmitPools = Registry.getGroupOption(self.group, 'SubmitPools', '')
        elif self.vo:
            self.defaultSubmitPools = Registry.getVOOption(self.vo, 'SubmitPools', '')

        self.pilot = self.am_getOption('PilotScript', DIRAC_PILOT)
        self.install = DIRAC_INSTALL
        self.workingDirectory = self.am_getOption('WorkDirectory')
        self.maxQueueLength = self.am_getOption('MaxQueueLength', 86400 * 3)
        self.pilotLogLevel = self.am_getOption('PilotLogLevel', 'INFO')
        self.maxJobsInFillMode = self.am_getOption('MaxJobsInFillMode', self.maxJobsInFillMode)
        self.maxPilotsToSubmit = self.am_getOption('MaxPilotsToSubmit', self.maxPilotsToSubmit)
        self.pilotWaitingFlag = self.am_getOption('PilotWaitingFlag', True)
        self.pilotWaitingTime = self.am_getOption('MaxPilotWaitingTime', 7200)

        # Flags
        self.updateStatus = self.am_getOption('UpdatePilotStatus', True)
        self.getOutput = self.am_getOption('GetPilotOutput', True)
        self.sendAccounting = self.am_getOption('SendPilotAccounting', True)

        # Get the site description dictionary; "Any" (case-insensitive) means no filter
        siteNames = None
        if not self.am_getOption('Site', 'Any').lower() == "any":
            siteNames = self.am_getOption('Site', [])
        ceTypes = None
        if not self.am_getOption('CETypes', 'Any').lower() == "any":
            ceTypes = self.am_getOption('CETypes', [])
        ces = None
        if not self.am_getOption('CEs', 'Any').lower() == "any":
            ces = self.am_getOption('CEs', [])

        self._resources = Resources.Resources(vo=self.vo)
        result = self._resources.getEligibleQueuesInfo(siteList=siteNames,
                                                       ceList=ces,
                                                       ceTypeList=ceTypes,
                                                       mode='Direct')
        if not result['OK']:
            return result
        resourceDict = result['Value']
        result = self.getQueues(resourceDict)
        if not result['OK']:
            return result

        #if not siteNames:
        #  siteName = gConfig.getValue( '/DIRAC/Site', 'Unknown' )
        #  if siteName == 'Unknown':
        #    return S_OK( 'No site specified for the SiteDirector' )
        #  else:
        #    siteNames = [siteName]
        #self.siteNames = siteNames

        if self.updateStatus:
            self.log.always('Pilot status update requested')
        if self.getOutput:
            self.log.always('Pilot output retrieval requested')
        if self.sendAccounting:
            self.log.always('Pilot accounting sending requested')

        self.log.always('Sites:', siteNames)
        self.log.always('CETypes:', ceTypes)
        self.log.always('CEs:', ces)
        self.log.always('PilotDN:', self.pilotDN)
        self.log.always('PilotGroup:', self.pilotGroup)
        self.log.always('MaxPilotsToSubmit:', self.maxPilotsToSubmit)
        self.log.always('MaxJobsInFillMode:', self.maxJobsInFillMode)

        self.localhost = socket.getfqdn()
        self.proxy = ''

        if self.queueDict:
            self.log.always("Agent will serve queues:")
            for queue in self.queueDict:
                self.log.always("Site: %s, CE: %s, Queue: %s" % (self.queueDict[queue]['Site'],
                                                                 self.queueDict[queue]['CEName'],
                                                                 queue))

        return S_OK()

    def getQueues(self, resourceDict):
        """Get the list of relevant CEs and their descriptions.

        Rebuilds self.queueDict with one entry per "<ce>_<queue>" holding the
        queue parameters, the CE object created by the ComputingElementFactory
        and bookkeeping values (CEName, CEType, Site, QueueName, Platform).
        Also extends self.platforms and self.sites.

        :param resourceDict: { site : { ce : { ..., 'Queues' : { queue : params } } } };
            the 'Queues' entry is popped from each CE dictionary
        :return: S_OK() or the S_ERROR returned by the CE factory / CE validation
        """
        self.queueDict = {}
        ceFactory = ComputingElementFactory()

        for site in resourceDict:
            result = self._resources.getSiteFullName(site)
            if not result['OK']:
                # Sites whose full name can not be resolved are silently skipped
                continue
            siteFullName = result['Value']
            for ce in resourceDict[site]:
                ceDict = resourceDict[site][ce]
                qDict = ceDict.pop('Queues')
                for queue in qDict:
                    queueName = '%s_%s' % (ce, queue)
                    params = qDict[queue]
                    queueEntry = {'ParametersDict': params}
                    self.queueDict[queueName] = queueEntry
                    params['Queue'] = queue
                    params['Site'] = siteFullName
                    params['GridEnv'] = self.gridEnv
                    params['Setup'] = gConfig.getValue('/DIRAC/Setup', 'unknown')

                    # Evaluate the CPU limit of the queue according to the Glue convention
                    # To Do: should be a utility
                    if "maxCPUTime" in params and "SI00" in params:
                        maxCPUTime = float(params['maxCPUTime'])
                        # For some sites there are crazy values in the CS
                        maxCPUTime = max(maxCPUTime, 0)
                        maxCPUTime = min(maxCPUTime, 86400 * 12.5)
                        si00 = float(params['SI00'])
                        queueCPUTime = 60. / 250. * maxCPUTime * si00
                        params['CPUTime'] = int(queueCPUTime)

                    qwDir = os.path.join(self.workingDirectory, queue)
                    if not os.path.exists(qwDir):
                        os.makedirs(qwDir)
                    params['WorkingDirectory'] = qwDir

                    # Platform precedence: queue setting, CE setting, then architecture_OS
                    platform = ''
                    if "Platform" in params:
                        platform = params['Platform']
                    elif "Platform" in ceDict:
                        platform = ceDict['Platform']
                    elif "OS" in ceDict:
                        architecture = ceDict.get('architecture', 'x86_64')
                        OS = ceDict['OS']
                        platform = '_'.join([architecture, OS])
                    if platform and platform not in self.platforms:
                        self.platforms.append(platform)

                    if "Platform" not in params and platform:
                        result = Resources.getDIRACPlatform(platform)
                        if result['OK']:
                            params['Platform'] = result['Value']

                    # Queue parameters take precedence over the CE level ones
                    ceQueueDict = dict(ceDict)
                    ceQueueDict.update(params)
                    result = ceFactory.getCE(ceName=ce,
                                             ceType=ceDict['CEType'],
                                             ceParametersDict=ceQueueDict)
                    if not result['OK']:
                        return result
                    queueEntry['CE'] = result['Value']
                    queueEntry['CEName'] = ce
                    queueEntry['CEType'] = ceDict['CEType']
                    queueEntry['Site'] = siteFullName
                    queueEntry['QueueName'] = queue
                    queueEntry['Platform'] = platform
                    result = queueEntry['CE'].isValid()
                    if not result['OK']:
                        self.log.fatal(result['Message'])
                        return result

                    if 'BundleProxy' in params:
                        queueEntry['BundleProxy'] = True
                    elif 'BundleProxy' in ceDict:
                        queueEntry['BundleProxy'] = True

                    if siteFullName not in self.sites:
                        self.sites.append(siteFullName)

        return S_OK()

    def execute(self):
        """Main execution method.

        Submits pilots and, if requested, updates the pilot statuses. Failures
        of either step are logged only; the cycle itself always reports S_OK.

        :return: S_OK()
        """
        if not self.queueDict:
            self.log.warn('No site defined, exiting the cycle')
            return S_OK()

        result = self.submitJobs()
        if not result['OK']:
            self.log.error('Errors in the job submission: ', result['Message'])

        if self.updateStatus:
            result = self.updatePilotStatus()
            if not result['OK']:
                self.log.error('Errors in updating pilot status: ', result['Message'])

        return S_OK()

    def submitJobs(self):
        """Go through defined computing elements and submit jobs if necessary.

        For every queue (visited in random order) the number of pilots to
        submit is the minimum of the free slots reported by the CE, the number
        of matching task queue jobs minus the already waiting pilots, and
        self.maxPilotsToSubmit. Submitted pilots are registered in the
        PilotAgentsDB and spread over the matching task queues with a
        probability proportional to the task queue priorities.

        :return: S_OK() or the S_ERROR structure of the failing call
        """
        # Check that there is some work at all
        setup = CSGlobals.getSetup()
        tqDict = {'Setup': setup,
                  'CPUTime': 9999999,
                  'SubmitPool': self.defaultSubmitPools}
        if self.vo:
            tqDict['Community'] = self.vo
        if self.voGroups:
            tqDict['OwnerGroup'] = self.voGroups

        result = Resources.getCompatiblePlatforms(self.platforms)
        if not result['OK']:
            return result
        tqDict['Platform'] = result['Value']
        tqDict['Site'] = self.sites

        self.log.verbose('Checking overall TQ availability with requirements')
        self.log.verbose(tqDict)

        rpcMatcher = RPCClient("WorkloadManagement/Matcher")
        result = rpcMatcher.getMatchingTaskQueues(tqDict)
        if not result['OK']:
            return result
        if not result['Value']:
            self.log.verbose('No Waiting jobs suitable for the director')
            return S_OK()

        # list() so that shuffle gets a real, mutable sequence
        queues = list(self.queueDict.keys())
        random.shuffle(queues)
        for queue in queues:
            queueInfo = self.queueDict[queue]
            ce = queueInfo['CE']
            ceName = queueInfo['CEName']
            ceType = queueInfo['CEType']
            queueName = queueInfo['QueueName']
            siteName = queueInfo['Site']
            siteMask = self.siteStatus.isUsableSite(siteName, 'ComputingAccess')
            platform = queueInfo['Platform']

            if 'CPUTime' in queueInfo['ParametersDict']:
                queueCPUTime = int(queueInfo['ParametersDict']['CPUTime'])
            else:
                self.log.warn('CPU time limit is not specified for queue %s, skipping...' % queue)
                continue
            if queueCPUTime > self.maxQueueLength:
                queueCPUTime = self.maxQueueLength

            # Get the working proxy
            cpuTime = queueCPUTime + 86400

            self.log.verbose("Getting pilot proxy for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
            if not result['OK']:
                return result
            self.proxy = result['Value']
            ce.setProxy(self.proxy, cpuTime - 60)

            # Get the number of available slots on the target site/queue
            result = ce.available()
            if not result['OK']:
                self.log.warn('Failed to check the availability of queue %s: \n%s' % (queue, result['Message']))
                continue
            ceInfoDict = result['CEInfoDict']
            self.log.info("CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" %
                          (ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'],
                           ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs']))

            totalSlots = result['Value']

            ceDict = ce.getParameterDict()
            ceDict['GridCE'] = ceName
            if not siteMask and 'Site' in ceDict:
                self.log.info('Site not in the mask %s' % siteName)
                self.log.info('Removing "Site" from matching Dict')
                del ceDict['Site']
            if self.vo:
                ceDict['Community'] = self.vo
            if self.voGroups:
                ceDict['OwnerGroup'] = self.voGroups

            # This is a hack to get rid of !
            ceDict['SubmitPool'] = self.defaultSubmitPools

            result = Resources.getCompatiblePlatforms(platform)
            if not result['OK']:
                continue
            ceDict['Platform'] = result['Value']

            # Get the number of eligible jobs for the target site/queue
            result = rpcMatcher.getMatchingTaskQueues(ceDict)
            if not result['OK']:
                self.log.error('Could not retrieve TaskQueues from TaskQueueDB', result['Message'])
                return result
            taskQueueDict = result['Value']
            if not taskQueueDict:
                self.log.info('No matching TQs found')
                continue

            totalTQJobs = 0
            tqIDList = list(taskQueueDict.keys())
            for tq in taskQueueDict:
                totalTQJobs += taskQueueDict[tq]['Jobs']

            # Get the number of already waiting pilots for this queue
            totalWaitingPilots = 0
            if self.pilotWaitingFlag:
                lastUpdateTime = dateTime() - self.pilotWaitingTime * second
                result = pilotAgentsDB.countPilots({'TaskQueueID': tqIDList,
                                                    'Status': WAITING_PILOT_STATUS},
                                                   None, lastUpdateTime)
                if not result['OK']:
                    self.log.error('Failed to get Number of Waiting pilots', result['Message'])
                    totalWaitingPilots = 0
                else:
                    totalWaitingPilots = result['Value']
                    self.log.verbose('Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots)

            pilotsToSubmit = max(0, min(totalSlots, totalTQJobs - totalWaitingPilots))
            self.log.info('Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' %
                          (totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit))

            # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT
            pilotsToSubmit = min(self.maxPilotsToSubmit, pilotsToSubmit)

            while pilotsToSubmit > 0:
                self.log.info('Going to submit %d pilots to %s queue' % (pilotsToSubmit, queue))

                bundleProxy = queueInfo.get('BundleProxy', False)
                jobExecDir = ''
                if ceType == 'CREAM':
                    jobExecDir = '.'
                jobExecDir = queueInfo.get('JobExecDir', jobExecDir)
                httpProxy = queueInfo.get('HttpProxy', '')

                result = self.__getExecutable(queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir)
                if not result['OK']:
                    return result

                # The token request may have changed the number of pilots of this chunk
                executable, pilotSubmissionChunk = result['Value']
                result = ce.submitJob(executable, '', pilotSubmissionChunk)
                os.unlink(executable)
                if not result['OK']:
                    self.log.error('Failed submission to queue %s:\n' % queue, result['Message'])
                    pilotsToSubmit = 0
                    continue

                pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk
                # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the
                # task queue priorities
                pilotList = result['Value']
                self.log.info('Submitted %d pilots to %s@%s' % (len(pilotList), queueName, ceName))
                stampDict = result.get('PilotStampDict', {})

                # Cumulative priorities: (tqID, running sum)
                tqPriorityList = []
                sumPriority = 0.
                for tq in taskQueueDict:
                    sumPriority += taskQueueDict[tq]['Priority']
                    tqPriorityList.append((tq, sumPriority))

                pilotsPerTQ = {}
                for pilotID in pilotList:
                    rndm = random.random() * sumPriority
                    # Fallback for the case where no cumulative priority exceeds rndm
                    # (e.g. all priorities are 0); taskQueueDict is not empty here.
                    tqID = tqPriorityList[-1][0]
                    for tq, prio in tqPriorityList:
                        if rndm < prio:
                            tqID = tq
                            break
                    if tqID not in pilotsPerTQ:
                        pilotsPerTQ[tqID] = []
                    pilotsPerTQ[tqID].append(pilotID)

                for tqID, tqPilots in pilotsPerTQ.items():
                    result = pilotAgentsDB.addPilotTQReference(tqPilots,
                                                               tqID,
                                                               self.pilotDN,
                                                               self.pilotGroup,
                                                               self.localhost,
                                                               ceType,
                                                               '',
                                                               stampDict)
                    if not result['OK']:
                        self.log.error('Failed add pilots to the PilotAgentsDB: ', result['Message'])
                        continue
                    for pilot in tqPilots:
                        result = pilotAgentsDB.setPilotStatus(pilot, 'Submitted', ceName,
                                                              'Successfully submitted by the SiteDirector',
                                                              siteName, queueName)
                        if not result['OK']:
                            self.log.error('Failed to set pilot status: ', result['Message'])
                            continue

        return S_OK()

    #####################################################################################
    def __getExecutable(self, queue, pilotsToSubmit, bundleProxy=True, httpProxy='', jobExecDir=''):
        """Prepare the full executable for queue.

        :param queue: key of the queue in self.queueDict
        :param pilotsToSubmit: requested number of pilots
        :param bundleProxy: if true, self.proxy is embedded in the wrapper
        :param httpProxy: value passed on to the pilot wrapper
        :param jobExecDir: value passed on to the pilot wrapper
        :return: S_OK( [ executablePath, pilotsToSubmit ] ) where pilotsToSubmit
            may differ from the requested number, or S_ERROR
        """
        proxy = None
        if bundleProxy:
            proxy = self.proxy
        pilotOptions, pilotsToSubmit = self.__getPilotOptions(queue, pilotsToSubmit)
        if pilotOptions is None:
            return S_ERROR('Errors in compiling pilot options')
        executable = self.__writePilotScript(self.workingDirectory, pilotOptions, proxy, httpProxy, jobExecDir)
        if isinstance(executable, dict):
            # __writePilotScript reports failures as an S_ERROR structure
            # instead of a path; do not hand that on as a file name.
            return executable
        return S_OK([executable, pilotsToSubmit])

    #####################################################################################
    def __getPilotOptions(self, queue, pilotsToSubmit):
        """Prepare pilot options.

        :param queue: key of the queue in self.queueDict
        :param pilotsToSubmit: requested number of pilots
        :return: [ pilotOptions, pilotsToSubmit ], or [ None, None ] if the
            options can not be compiled (the reason is logged)
        """
        queueDict = self.queueDict[queue]['ParametersDict']
        pilotOptions = []

        setup = gConfig.getValue("/DIRAC/Setup", "unknown")
        if setup == 'unknown':
            self.log.error('Setup is not defined in the configuration')
            return [None, None]
        pilotOptions.append('-S %s' % setup)
        opsHelper = Operations.Operations(group=self.pilotGroup, setup=setup)

        #Installation defined?
        installationName = opsHelper.getValue("Pilot/Installation", "")
        if installationName:
            pilotOptions.append('-V %s' % installationName)

        #Project defined?
        projectName = opsHelper.getValue("Pilot/Project", "")
        if projectName:
            pilotOptions.append('-l %s' % projectName)
        else:
            self.log.info('DIRAC project will be installed by pilots')

        #Request a release
        diracVersion = opsHelper.getValue("Pilot/Version", [])
        if not diracVersion:
            self.log.error('Pilot/Version is not defined in the configuration')
            return [None, None]
        #diracVersion is a list of accepted releases. Just take the first one
        pilotOptions.append('-r %s' % diracVersion[0])

        ownerDN = self.pilotDN
        ownerGroup = self.pilotGroup
        # Request token for maximum pilot efficiency
        result = gProxyManager.requestToken(ownerDN, ownerGroup, pilotsToSubmit * self.maxJobsInFillMode)
        if not result['OK']:
            self.log.error('Invalid proxy token request', result['Message'])
            return [None, None]
        (token, numberOfUses) = result['Value']

        jobsPerPilot = min(numberOfUses, self.maxJobsInFillMode)
        if jobsPerPilot <= 0:
            # Nothing can be run with such a token; also avoids a division by zero below
            self.log.error('Proxy token does not allow running any job',
                           'uses: %s, MaxJobsInFillMode: %s' % (numberOfUses, self.maxJobsInFillMode))
            return [None, None]

        pilotOptions.append('-o /Security/ProxyToken=%s' % token)
        # Use Filling mode
        pilotOptions.append('-M %s' % jobsPerPilot)

        # Since each pilot will execute min( numberOfUses, self.maxJobsInFillMode )
        # with numberOfUses tokens we can submit at most:
        #    numberOfUses / min( numberOfUses, self.maxJobsInFillMode )
        # pilots
        newPilotsToSubmit = numberOfUses // jobsPerPilot
        if newPilotsToSubmit != pilotsToSubmit:
            self.log.info('Number of pilots to submit is changed to %d after getting the proxy token' %
                          newPilotsToSubmit)
            pilotsToSubmit = newPilotsToSubmit

        # Debug
        if self.pilotLogLevel.lower() == 'debug':
            pilotOptions.append('-d')

        # CS Servers
        csServers = gConfig.getValue("/DIRAC/Configuration/Servers", [])
        pilotOptions.append('-C %s' % ",".join(csServers))

        # DIRAC Extensions to be used in pilots
        pilotExtensionsList = opsHelper.getValue("Pilot/Extensions", [])
        extensionsList = []
        if pilotExtensionsList:
            if pilotExtensionsList[0] != 'None':
                extensionsList = pilotExtensionsList
        else:
            extensionsList = CSGlobals.getCSExtensions()
        if extensionsList:
            pilotOptions.append('-e %s' % ",".join(extensionsList))

        # Requested CPU time
        pilotOptions.append('-T %s' % queueDict['CPUTime'])
        # CEName
        pilotOptions.append('-N %s' % self.queueDict[queue]['CEName'])
        # SiteName
        pilotOptions.append('-n %s' % queueDict['Site'])
        if 'ClientPlatform' in queueDict:
            pilotOptions.append("-p '%s'" % queueDict['ClientPlatform'])

        if 'SharedArea' in queueDict:
            pilotOptions.append("-o '/LocalSite/SharedArea=%s'" % queueDict['SharedArea'])

        if 'SI00' in queueDict:
            factor = float(queueDict['SI00']) / 250.
            pilotOptions.append("-o '/LocalSite/CPUScalingFactor=%s'" % factor)
            pilotOptions.append("-o '/LocalSite/CPUNormalizationFactor=%s'" % factor)
        else:
            if 'CPUScalingFactor' in queueDict:
                pilotOptions.append("-o '/LocalSite/CPUScalingFactor=%s'" % queueDict['CPUScalingFactor'])
            if 'CPUNormalizationFactor' in queueDict:
                pilotOptions.append("-o '/LocalSite/CPUNormalizationFactor=%s'" %
                                    queueDict['CPUNormalizationFactor'])

        # Hack
        if self.defaultSubmitPools:
            pilotOptions.append('-o /Resources/Computing/CEDefaults/SubmitPool=%s' % self.defaultSubmitPools)

        if self.group:
            pilotOptions.append('-G %s' % self.group)

        self.log.verbose("pilotOptions: ", ' '.join(pilotOptions))

        return [pilotOptions, pilotsToSubmit]

    #####################################################################################
    def __writePilotScript(self, workingDirectory, pilotOptions, proxy=None, httpProxy='', pilotExecDir=''):
        """Bundle together and write out the pilot executable script, admix the proxy if given.

        The generated bash script feeds a Python program to the interpreter
        that unpacks the (bz2 compressed, base64 encoded) proxy, pilot and
        install scripts into a temporary directory and runs the pilot there.

        :param workingDirectory: directory in which the wrapper file is created
        :param pilotOptions: list of option strings appended to the pilot command
        :param proxy: object providing dumpAllToString(), or None for no proxy
        :param httpProxy: if not empty, exported as HTTP_PROXY by the wrapper
        :param pilotExecDir: directory for the pilot temporary directory, '' for the default
        :return: path of the written wrapper, or an S_ERROR structure if the
            payload files can not be read / compressed
        """
        try:
            compressedAndEncodedProxy = ''
            proxyFlag = 'False'
            if proxy is not None:
                compressedAndEncodedProxy = base64.encodestring(bz2.compress(proxy.dumpAllToString()['Value']))
                proxyFlag = 'True'
            # "with" guarantees the script files are closed again
            with open(self.pilot, "rb") as pilotFile:
                compressedAndEncodedPilot = base64.encodestring(bz2.compress(pilotFile.read(), 9))
            with open(self.install, "rb") as installFile:
                compressedAndEncodedInstall = base64.encodestring(bz2.compress(installFile.read(), 9))
        except Exception:
            self.log.exception('Exception during file compression of proxy, dirac-pilot or dirac-install')
            return S_ERROR('Exception during file compression of proxy, dirac-pilot or dirac-install')

        localPilot = """#!/bin/bash
/usr/bin/env python << EOF
#
import os, tempfile, sys, shutil, base64, bz2
try:
  pilotExecDir = '%(pilotExecDir)s'
  if not pilotExecDir:
    pilotExecDir = None
  pilotWorkingDirectory = tempfile.mkdtemp( suffix = 'pilot', prefix = 'DIRAC_', dir = pilotExecDir )
  pilotWorkingDirectory = os.path.realpath( pilotWorkingDirectory )
  os.chdir( pilotWorkingDirectory )
  if %(proxyFlag)s:
    open( 'proxy', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedProxy)s\"\"\" ) ) )
    os.chmod("proxy",0600)
    os.environ["X509_USER_PROXY"]=os.path.join(pilotWorkingDirectory, 'proxy')
  open( '%(pilotScript)s', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedPilot)s\"\"\" ) ) )
  open( '%(installScript)s', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedInstall)s\"\"\" ) ) )
  os.chmod("%(pilotScript)s",0700)
  os.chmod("%(installScript)s",0700)
  if "LD_LIBRARY_PATH" not in os.environ:
    os.environ["LD_LIBRARY_PATH"]=""
  if "%(httpProxy)s":
    os.environ["HTTP_PROXY"]="%(httpProxy)s"
  os.environ["X509_CERT_DIR"]=os.path.join(pilotWorkingDirectory, 'etc/grid-security/certificates')
  # TODO: structure the output
  print '==========================================================='
  print 'Environment of execution host'
  for key in os.environ.keys():
    print key + '=' + os.environ[key]
  print '==========================================================='
except Exception, x:
  print >> sys.stderr, x
  sys.exit(-1)
cmd = "python %(pilotScript)s %(pilotOptions)s"
print 'Executing: ', cmd
sys.stdout.flush()
os.system( cmd )
shutil.rmtree( pilotWorkingDirectory )
EOF
""" % {'compressedAndEncodedProxy': compressedAndEncodedProxy,
       'compressedAndEncodedPilot': compressedAndEncodedPilot,
       'compressedAndEncodedInstall': compressedAndEncodedInstall,
       'httpProxy': httpProxy,
       'pilotExecDir': pilotExecDir,
       'pilotScript': os.path.basename(self.pilot),
       'installScript': os.path.basename(self.install),
       'pilotOptions': ' '.join(pilotOptions),
       'proxyFlag': proxyFlag}

        fd, name = tempfile.mkstemp(suffix='_pilotwrapper.py', prefix='DIRAC_', dir=workingDirectory)
        # The file object owns the descriptor; it is closed even if the write fails
        with os.fdopen(fd, 'w') as pilotWrapper:
            pilotWrapper.write(localPilot)
        return name

    def updatePilotStatus(self):
        """Update status of pilots in transient states.

        First pass: for every queue, pilots in TRANSIENT_PILOT_STATUS owned by
        the pilot DN / group are compared with the status reported by the CE
        and updated; the output of pilots reaching a final status is retrieved
        if requested. Second pass: output retrieval and accounting for pilots
        already in FINAL_PILOT_STATUS.

        :return: S_OK(), or S_ERROR if a pilot proxy can not be obtained
        """
        for queue in self.queueDict:
            ce = self.queueDict[queue]['CE']
            ceName = self.queueDict[queue]['CEName']
            queueName = self.queueDict[queue]['QueueName']
            ceType = self.queueDict[queue]['CEType']
            siteName = self.queueDict[queue]['Site']

            result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                                 'Queue': queueName,
                                                 'GridType': ceType,
                                                 'GridSite': siteName,
                                                 'Status': TRANSIENT_PILOT_STATUS,
                                                 'OwnerDN': self.pilotDN,
                                                 'OwnerGroup': self.pilotGroup})
            if not result['OK']:
                self.log.error('Failed to select pilots: %s' % result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                continue

            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB', result['Message'])
                continue
            pilotDict = result['Value']

            # Use stamped references only if every pilot has a stamp
            stampedPilotRefs = []
            for pRef in pilotDict:
                if pilotDict[pRef]['PilotStamp']:
                    stampedPilotRefs.append(pRef + ":::" + pilotDict[pRef]['PilotStamp'])
                else:
                    stampedPilotRefs = list(pilotRefs)
                    break

            result = ce.isProxyValid()
            if not result['OK']:
                result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 600)
                if not result['OK']:
                    return result
                self.proxy = result['Value']
                ce.setProxy(self.proxy, 500)

            result = ce.getJobStatus(stampedPilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots status from CE', '%s: %s' % (ceName, result['Message']))
                continue
            pilotCEDict = result['Value']

            for pRef in pilotRefs:
                newStatus = ''
                oldStatus = pilotDict[pRef]['Status']
                ceStatus = pilotCEDict[pRef]
                if oldStatus == ceStatus:
                    # Status did not change, continue
                    continue
                elif ceStatus == "Unknown" and oldStatus not in FINAL_PILOT_STATUS:
                    # Pilot finished without reporting, consider it Aborted
                    newStatus = 'Aborted'
                elif ceStatus != 'Unknown':
                    # Update the pilot status to the new value
                    newStatus = ceStatus

                if not newStatus:
                    continue

                self.log.info('Updating status to %s for pilot %s' % (newStatus, pRef))
                result = pilotAgentsDB.setPilotStatus(pRef, newStatus, '', 'Updated by SiteDirector')
                # Retrieve the pilot output now
                if newStatus in FINAL_PILOT_STATUS:
                    if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput:
                        self.log.info('Retrieving output for pilot %s' % pRef)
                        pilotStamp = pilotDict[pRef]['PilotStamp']
                        pRefStamp = pRef
                        if pilotStamp:
                            pRefStamp = pRef + ':::' + pilotStamp
                        result = ce.getJobOutput(pRefStamp)
                        if not result['OK']:
                            self.log.error('Failed to get pilot output',
                                           '%s: %s' % (ceName, result['Message']))
                        else:
                            output, error = result['Value']
                            if output:
                                result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                                if not result['OK']:
                                    self.log.error('Failed to store pilot output', result['Message'])
                            else:
                                self.log.warn('Empty pilot output not stored to PilotDB')

        # The pilot can be in Done state set by the job agent check if the output is retrieved
        for queue in self.queueDict:
            ce = self.queueDict[queue]['CE']

            # isProxyValid returns a result structure, so its 'OK' flag must be
            # inspected; the renewed proxy has to be stored before it is used.
            result = ce.isProxyValid(120)
            if not result['OK']:
                result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, 1000)
                if not result['OK']:
                    return result
                self.proxy = result['Value']
                ce.setProxy(self.proxy, 940)

            ceName = self.queueDict[queue]['CEName']
            queueName = self.queueDict[queue]['QueueName']
            ceType = self.queueDict[queue]['CEType']
            siteName = self.queueDict[queue]['Site']
            result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                                 'Queue': queueName,
                                                 'GridType': ceType,
                                                 'GridSite': siteName,
                                                 'OutputReady': 'False',
                                                 'Status': FINAL_PILOT_STATUS})
            if not result['OK']:
                self.log.error('Failed to select pilots', result['Message'])
                continue
            pilotRefs = result['Value']
            if not pilotRefs:
                # NOTE(review): this also skips the accounting step below for
                # this queue - confirm that this is intended.
                continue

            result = pilotAgentsDB.getPilotInfo(pilotRefs)
            if not result['OK']:
                self.log.error('Failed to get pilots info from DB', result['Message'])
                continue
            pilotDict = result['Value']

            if self.getOutput:
                for pRef in pilotRefs:
                    self.log.info('Retrieving output for pilot %s' % pRef)
                    pilotStamp = pilotDict[pRef]['PilotStamp']
                    pRefStamp = pRef
                    if pilotStamp:
                        pRefStamp = pRef + ':::' + pilotStamp
                    result = ce.getJobOutput(pRefStamp)
                    if not result['OK']:
                        self.log.error('Failed to get pilot output', '%s: %s' % (ceName, result['Message']))
                    else:
                        output, error = result['Value']
                        result = pilotAgentsDB.storePilotOutput(pRef, output, error)
                        if not result['OK']:
                            self.log.error('Failed to store pilot output', result['Message'])

            # Check if the accounting is to be sent
            if self.sendAccounting:
                result = pilotAgentsDB.selectPilots({'DestinationSite': ceName,
                                                     'Queue': queueName,
                                                     'GridType': ceType,
                                                     'GridSite': siteName,
                                                     'AccountingSent': 'False',
                                                     'Status': FINAL_PILOT_STATUS})
                if not result['OK']:
                    self.log.error('Failed to select pilots', result['Message'])
                    continue
                pilotRefs = result['Value']
                if not pilotRefs:
                    continue
                result = pilotAgentsDB.getPilotInfo(pilotRefs)
                if not result['OK']:
                    self.log.error('Failed to get pilots info from DB', result['Message'])
                    continue
                pilotDict = result['Value']
                result = self.sendPilotAccounting(pilotDict)
                if not result['OK']:
                    self.log.error('Failed to send pilot agent accounting')

        return S_OK()

    def sendPilotAccounting(self, pilotDict):
        """Send pilot accounting record.

        One PilotAccounting record is registered per pilot, the records are
        committed through gDataStoreClient and the AccountingSent flag of the
        pilots is set in the PilotAgentsDB.

        :param pilotDict: { pilotReference : pilot info dictionary } as
            returned by pilotAgentsDB.getPilotInfo
        :return: S_OK(), or the S_ERROR of the failed commit
        """
        for pRef in pilotDict:
            self.log.verbose('Preparing accounting record for pilot %s' % pRef)
            pA = PilotAccounting()
            pA.setEndTime(pilotDict[pRef]['LastUpdateTime'])
            pA.setStartTime(pilotDict[pRef]['SubmissionTime'])
            retVal = CS.getUsernameForDN(pilotDict[pRef]['OwnerDN'])
            if not retVal['OK']:
                userName = '******'
                self.log.error("Can't determine username for dn:", pilotDict[pRef]['OwnerDN'])
            else:
                userName = retVal['Value']
            pA.setValueByKey('User', userName)
            pA.setValueByKey('UserGroup', pilotDict[pRef]['OwnerGroup'])
            result = getSiteForCE(pilotDict[pRef]['DestinationSite'])
            if result['OK'] and result['Value'].strip():
                pA.setValueByKey('Site', result['Value'].strip())
            else:
                pA.setValueByKey('Site', 'Unknown')
            pA.setValueByKey('GridCE', pilotDict[pRef]['DestinationSite'])
            pA.setValueByKey('GridMiddleware', pilotDict[pRef]['GridType'])
            pA.setValueByKey('GridResourceBroker', pilotDict[pRef]['Broker'])
            pA.setValueByKey('GridStatus', pilotDict[pRef]['Status'])
            if 'Jobs' not in pilotDict[pRef]:
                pA.setValueByKey('Jobs', 0)
            else:
                pA.setValueByKey('Jobs', len(pilotDict[pRef]['Jobs']))
            self.log.info("Adding accounting record for pilot %s" % pilotDict[pRef]['PilotID'])
            retVal = gDataStoreClient.addRegister(pA)
            if not retVal['OK']:
                self.log.error('Failed to send accounting info for pilot ', pRef)
            else:
                # Set up AccountingSent flag
                result = pilotAgentsDB.setAccountingFlag(pRef)
                if not result['OK']:
                    self.log.error('Failed to set accounting flag for pilot ', pRef)

        self.log.info('Committing accounting records for %d pilots' % len(pilotDict))
        result = gDataStoreClient.commit()
        if result['OK']:
            for pRef in pilotDict:
                self.log.verbose('Setting AccountingSent flag for pilot %s' % pRef)
                result = pilotAgentsDB.setAccountingFlag(pRef)
                if not result['OK']:
                    self.log.error('Failed to set accounting flag for pilot ', pRef)
        else:
            return result

        return S_OK()
class ResourceStatus( ElementStatus ):
  """
  ResourceStatus helper that connects to CS if RSS flag is not Active. It keeps
  the connection to the db / server as an object member, to avoid creating a new
  one massively.

  On top of the per-resource statuses kept in the RSS caches, the bulk getters
  ( getComputingStatuses, getStorageStatuses ) apply a site-level mask: a
  resource whose site is not usable for the relevant access type is reported
  as 'Banned' for every one of its status types.
  """

  __metaclass__ = DIRACSingleton

  def __init__( self ):
    """
    Constructor, initializes the logger, rssClient and caches.

    examples
      >>> resourceStatus = ResourceStatus()
    """

    super( ResourceStatus, self ).__init__()

    # Helper used to check the status of the site hosting a given resource
    self.siteStatus = SiteStatus()

    # We can set CacheLifetime and CacheHistory from CS, so that we can tune them.
    cacheLifeTime = int( RssConfiguration().getConfigCache() )

    # RSSCaches, one per elementType ( StorageElement, ComputingElement )
    # Should be generated on the fly, instead of being hardcoded ?
    self.seCache = RSSCache( 'Storage', cacheLifeTime, self._updateSECache )
    self.ceCache = RSSCache( 'Computing', cacheLifeTime, self._updateCECache )

  #.............................................................................
  # ComputingElement methods

  def getComputingStatuses( self, ceNames, statusTypes = None ):
    """
    Method that queries the RSSCache for ComputingElement-Status-related information.
    If any of the inputs is None, it is interpreted as * ( all ).

    If match is positive, the output looks like:
    {
     computingElementA : { statusType1 : status1, statusType2 : status2 },
     computingElementB : { statusType1 : status1, statusType2 : status2 },
    }

    There are ALWAYS the same keys inside the site dictionaries.

    Computing elements whose site is not usable for 'ComputingAccess' ( or
    whose site cannot be resolved ) get all their statuses replaced by 'Banned'.

    examples:
      >>> resourceStatus.getComputingStatuses( 'ce207.cern.ch', None )
          S_OK( { 'ce207.cern.ch' : { 'all' : 'Active' } } )
      >>> resourceStatus.getComputingStatuses( 'RubbishCE', None )
          S_ERROR( ... )
      >>> resourceStatus.getComputingStatuses( 'ce207.cern.ch', 'all' )
          S_OK( { 'ce207.cern.ch' : { 'all' : 'Active' } } )
      >>> resourceStatus.getComputingStatuses( [ 'ce206.cern.ch', 'ce207.cern.ch' ], 'all' )
          S_OK( { 'ce206.cern.ch' : { 'all' : 'Active' },
                  'ce207.cern.ch' : { 'all' : 'Active' } } )
      >>> resourceStatus.getComputingStatuses( None, 'all' )
          S_OK( { 'ce206.cern.ch' : { 'all' : 'Active' },
                  'ce207.cern.ch' : { 'all' : 'Active' },
                  ... } )

    :Parameters:
      **ceNames** - [ None, `string`, `list` ]
        name(s) of the computing elements to be matched
      **statusTypes** - [ None, `string`, `list` ]
        name(s) of the statusTypes to be matched

    :return: S_OK() || S_ERROR()
    """

    cacheMatch = self.ceCache.match( ceNames, statusTypes )
    if not cacheMatch[ 'OK' ]:
      return cacheMatch

    return S_OK( self.__applySiteMask( cacheMatch[ 'Value' ], 'ComputingAccess' ) )

  def getComputingStatus( self, ceName, statusType ):
    """
    Given a ce and a statusType, it returns its status from the cache.

    examples:
      >>> resourceStatus.getComputingStatus( 'ce207.cern.ch', 'all' )
          S_OK( 'Active' )
      >>> resourceStatus.getComputingStatus( 'ce207.cern.ch', None )
          S_ERROR( ... )

    :Parameters:
      **ceName** - `string`
        name of the computing element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    return self.getElementStatus( 'Computing', ceName, statusType )

  def isUsableComputing( self, ceName, statusType ):
    """
    Similar method to getComputingStatus. The difference is the output.
    Given a ce name, returns a bool if the ce is usable:
      status is Active or Degraded outputs True
      anything else outputs False

    The decision itself is delegated to isUsableElement.

    examples:
      >>> resourceStatus.isUsableComputing( 'ce207.cern.ch', 'all' )
          True
      >>> resourceStatus.isUsableComputing( 'ce207.cern.ch', 'all' )
          False # May be banned
      >>> resourceStatus.isUsableComputing( 'ce207.cern.ch', None )
          False
      >>> resourceStatus.isUsableComputing( 'RubbishCE', 'all' )
          False
      >>> resourceStatus.isUsableComputing( 'ce207.cern.ch', 'RubbishAccess' )
          False

    :Parameters:
      **ceName** - `string`
        name of the computing element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: the value returned by isUsableElement ( a bool, per the examples )
    """

    return self.isUsableElement( 'Computing', ceName, statusType )

  def getUsableComputings( self, statusType ):
    """
    For a given statusType, returns all computing elements that are usable: their
    status for that particular statusType is either Active or Degraded; in a list.

    examples:
      >>> resourceStatus.getUsableComputings( 'all' )
          S_OK( [ 'ce206.cern.ch', 'ce207.cern.ch',... ] )
      >>> resourceStatus.getUsableComputings( None )
          S_ERROR( ... )
      >>> resourceStatus.getUsableComputings( 'RubbishAccess' )
          S_ERROR( ... )

    :Parameters:
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    return self.getUsableElements( 'Computing', statusType )

  #.............................................................................
  # StorageElement methods

  def getStorageStatuses( self, seNames, statusTypes = None ):
    """
    Method that queries the RSSCache for StorageElement-Status-related information.
    If any of the inputs is None, it is interpreted as * ( all ).

    If match is positive, the output looks like:
    {
     storageElementA : { statusType1 : status1, statusType2 : status2 },
     storageElementB : { statusType1 : status1, statusType2 : status2 },
    }

    There are ALWAYS the same keys inside the site dictionaries.

    Storage elements whose site is not usable for 'StorageAccess' ( or whose
    site cannot be resolved ) get all their statuses replaced by 'Banned'.

    examples:
      >>> resourceStatus.getStorageStatuses( 'CERN-USER', None )
          S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active', 'WriteAccess' : 'Degraded',... } } )
      >>> resourceStatus.getStorageStatuses( 'RubbishCE', None )
          S_ERROR( ... )
      >>> resourceStatus.getStorageStatuses( 'CERN-USER', 'ReadAccess' )
          S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active' } } )
      >>> resourceStatus.getStorageStatuses( [ 'CERN-USER', 'PIC-USER' ], 'ReadAccess' )
          S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active' },
                  'PIC-USER'  : { 'ReadAccess' : 'Active' } } )
      >>> resourceStatus.getStorageStatuses( None, 'ReadAccess' )
          S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active' },
                  'PIC-USER'  : { 'ReadAccess' : 'Active' },
                  ... } )

    :Parameters:
      **seNames** - [ None, `string`, `list` ]
        name(s) of the storage elements to be matched
      **statusTypes** - [ None, `string`, `list` ]
        name(s) of the statusTypes to be matched

    :return: S_OK() || S_ERROR()
    """

    cacheMatch = self.seCache.match( seNames, statusTypes )
    if not cacheMatch[ 'OK' ]:
      return cacheMatch

    return S_OK( self.__applySiteMask( cacheMatch[ 'Value' ], 'StorageAccess' ) )

  def getStorageStatus( self, seName, statusType ):
    """
    Given a se and a statusType, it returns its status from the cache.

    examples:
      >>> resourceStatus.getStorageStatus( 'CERN-USER', 'ReadAccess' )
          S_OK( 'Active' )
      >>> resourceStatus.getStorageStatus( 'CERN-USER', None )
          S_ERROR( ... )

    :Parameters:
      **seName** - `string`
        name of the storage element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    return self.getElementStatus( 'Storage', seName, statusType )

  def isUsableStorage( self, seName, statusType ):
    """
    Similar method to getStorageStatus. The difference is the output.
    Given a se name, returns a bool if the se is usable:
      status is Active or Degraded outputs True
      anything else outputs False

    The decision itself is delegated to isUsableElement.

    examples:
      >>> resourceStatus.isUsableStorage( 'CERN-USER', 'ReadAccess' )
          True
      >>> resourceStatus.isUsableStorage( 'CERN-ARCHIVE', 'ReadAccess' )
          False # May be banned
      >>> resourceStatus.isUsableStorage( 'CERN-USER', None )
          False
      >>> resourceStatus.isUsableStorage( 'RubbishCE', 'ReadAccess' )
          False
      >>> resourceStatus.isUsableStorage( 'CERN-USER', 'RubbishAccess' )
          False

    :Parameters:
      **seName** - `string`
        name of the storage element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: the value returned by isUsableElement ( a bool, per the examples )
    """

    return self.isUsableElement( 'Storage', seName, statusType )

  def getUsableStorages( self, statusType ):
    """
    For a given statusType, returns all storage elements that are usable: their
    status for that particular statusType is either Active or Degraded; in a list.

    examples:
      >>> resourceStatus.getUsableStorages( 'ReadAccess' )
          S_OK( [ 'CERN-USER', 'PIC-USER',... ] )
      >>> resourceStatus.getUsableStorages( None )
          S_ERROR( ... )
      >>> resourceStatus.getUsableStorages( 'RubbishAccess' )
          S_ERROR( ... )

    :Parameters:
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    return self.getUsableElements( 'Storage', statusType )

  #.............................................................................
  # update Cache methods

  def _updateCECache( self ):
    """
    Method used to update the ComputingElementCache.
    """
    return self.__updateCache( 'Computing' )

  def _updateSECache( self ):
    """
    Method used to update the StorageElementCache.
    """
    return self.__updateCache( 'Storage' )

  #.............................................................................
  # Private methods

  def __updateCache( self, elementType ):
    """
    Queries the RSS client for the ( Name, StatusType, Status ) records of all
    resources of type <elementType> and converts them into the cache dictionary.

    NOTE(review): self.rssClient is not set in this class; it is presumably
    provided by ElementStatus.__init__ - confirm.

    :Parameters:
      **elementType** - `string`
        element type to be refreshed ( 'Computing' or 'Storage' here )

    :return: S_OK( cacheDict ) || S_ERROR()
    """

    meta = { 'columns' : [ 'Name', 'StatusType', 'Status' ] }
    rawCache = self.rssClient.selectStatusElement( 'Resource', 'Status',
                                                   elementType = elementType,
                                                   meta = meta )

    if not rawCache[ 'OK' ]:
      return rawCache
    return S_OK( self.getCacheDictFromRawData( rawCache[ 'Value' ] ) )

  def __applySiteMask( self, cacheMatch, siteAccess ):
    """
    Overrides with 'Banned' every status of the elements in <cacheMatch> whose
    site does not grant <siteAccess>. The dictionary is updated in place and
    also returned for convenience.

    :Parameters:
      **cacheMatch** - `dict`
        { elementName : { statusType : status } } as returned by RSSCache.match
      **siteAccess** - `string`
        site access ( StorageAccess, ComputingAccess .. )

    :return: `dict` the masked <cacheMatch>
    """

    # Iterate over a snapshot, entries are re-assigned while looping
    for elementName, statusDict in list( cacheMatch.items() ):
      if not self.__getSiteAccess( elementName, siteAccess )[ 'OK' ]:
        cacheMatch[ elementName ] = dict.fromkeys( statusDict, 'Banned' )

    return cacheMatch

  def __getSiteAccess( self, elementName, siteAccess ):
    """
    Method that given an elementName, finds the site name that owns it. Once
    that is done, the site access <siteAccess> is checked and returned.

    :Parameters:
      **elementName** - `string`
        name of the resource
      **siteAccess** - `string`
        site access ( StorageAccess, ComputingAccess .. )

    :return: S_OK() if the site is usable for <siteAccess>, S_ERROR() if it is
             not or if the site of the resource cannot be found
    """

    siteName = Resources.getSiteForResource( elementName )
    if not siteName[ 'OK' ]:
      return siteName
    siteName = siteName[ 'Value' ]

    if not self.siteStatus.isUsableSite( siteName, siteAccess ):
      # Report the access type actually checked ( it used to always say Computing )
      return S_ERROR( 'Site %s is not usable for %s' % ( siteName, siteAccess ) )

    return S_OK()

################################################################################
#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF
class SiteDirector( AgentModule ): """ The specific agents must provide the following methods: - initialize() for initial settings - beginExecution() - execute() - the main method called in the agent cycle - endExecution() - finalize() - the graceful exit of the method, this one is usually used for the agent restart """ def initialize( self ): """ Standard constructor """ self.am_setOption( "PollingTime", 60.0 ) self.am_setOption( "maxPilotWaitingHours", 6 ) self.queueDict = {} self.maxJobsInFillMode = MAX_JOBS_IN_FILLMODE self.maxPilotsToSubmit = MAX_PILOTS_TO_SUBMIT self.siteStatus = SiteStatus() return S_OK() def beginExecution( self ): self.gridEnv = self.am_getOption( "GridEnv", getGridEnv() ) # The SiteDirector is for a particular user community self.vo = self.am_getOption( "Community", '' ) if not self.vo: self.vo = CSGlobals.getVO() # The SiteDirector is for a particular user group self.group = self.am_getOption( "Group", '' ) # self.voGroups contain all the eligible user groups for pilots submutted by this SiteDirector self.voGroups = [] # Choose the group for which pilots will be submitted. This is a hack until # we will be able to match pilots to VOs. 
if not self.group: if self.vo: result = Registry.getGroupsForVO( self.vo ) if not result['OK']: return result for group in result['Value']: if 'NormalUser' in Registry.getPropertiesForGroup( group ): self.voGroups.append( group ) else: self.voGroups = [ self.group ] result = findGenericPilotCredentials( vo = self.vo ) if not result[ 'OK' ]: return result self.pilotDN, self.pilotGroup = result[ 'Value' ] self.pilotDN = self.am_getOption( "PilotDN", self.pilotDN ) self.pilotGroup = self.am_getOption( "PilotGroup", self.pilotGroup ) self.platforms = [] self.sites = [] self.defaultSubmitPools = '' if self.group: self.defaultSubmitPools = Registry.getGroupOption( self.group, 'SubmitPools', '' ) elif self.vo: self.defaultSubmitPools = Registry.getVOOption( self.vo, 'SubmitPools', '' ) self.pilot = self.am_getOption( 'PilotScript', DIRAC_PILOT ) self.install = DIRAC_INSTALL self.workingDirectory = self.am_getOption( 'WorkDirectory' ) self.maxQueueLength = self.am_getOption( 'MaxQueueLength', 86400 * 3 ) self.pilotLogLevel = self.am_getOption( 'PilotLogLevel', 'INFO' ) self.maxJobsInFillMode = self.am_getOption( 'MaxJobsInFillMode', self.maxJobsInFillMode ) self.maxPilotsToSubmit = self.am_getOption( 'MaxPilotsToSubmit', self.maxPilotsToSubmit ) self.pilotWaitingFlag = self.am_getOption( 'PilotWaitingFlag', True ) self.pilotWaitingTime = self.am_getOption( 'MaxPilotWaitingTime', 7200 ) # Flags self.updateStatus = self.am_getOption( 'UpdatePilotStatus', True ) self.getOutput = self.am_getOption( 'GetPilotOutput', True ) self.sendAccounting = self.am_getOption( 'SendPilotAccounting', True ) # Get the site description dictionary siteNames = None if not self.am_getOption( 'Site', 'Any' ).lower() == "any": siteNames = self.am_getOption( 'Site', [] ) ceTypes = None if not self.am_getOption( 'CETypes', 'Any' ).lower() == "any": ceTypes = self.am_getOption( 'CETypes', [] ) ces = None if not self.am_getOption( 'CEs', 'Any' ).lower() == "any": ces = self.am_getOption( 'CEs', [] ) 
self._resources = Resources.Resources( vo = self.vo ) result = self._resources.getEligibleQueuesInfo( siteList = siteNames, ceList = ces, ceTypeList = ceTypes, mode = 'Direct' ) if not result['OK']: return result resourceDict = result['Value'] result = self.getQueues( resourceDict ) if not result['OK']: return result #if not siteNames: # siteName = gConfig.getValue( '/DIRAC/Site', 'Unknown' ) # if siteName == 'Unknown': # return S_OK( 'No site specified for the SiteDirector' ) # else: # siteNames = [siteName] #self.siteNames = siteNames if self.updateStatus: self.log.always( 'Pilot status update requested' ) if self.getOutput: self.log.always( 'Pilot output retrieval requested' ) if self.sendAccounting: self.log.always( 'Pilot accounting sending requested' ) self.log.always( 'Sites:', siteNames ) self.log.always( 'CETypes:', ceTypes ) self.log.always( 'CEs:', ces ) self.log.always( 'PilotDN:', self.pilotDN ) self.log.always( 'PilotGroup:', self.pilotGroup ) self.log.always( 'MaxPilotsToSubmit:', self.maxPilotsToSubmit ) self.log.always( 'MaxJobsInFillMode:', self.maxJobsInFillMode ) self.localhost = socket.getfqdn() self.proxy = '' if self.queueDict: self.log.always( "Agent will serve queues:" ) for queue in self.queueDict: self.log.always( "Site: %s, CE: %s, Queue: %s" % ( self.queueDict[queue]['Site'], self.queueDict[queue]['CEName'], queue ) ) return S_OK() def getQueues( self, resourceDict ): """ Get the list of relevant CEs and their descriptions """ self.queueDict = {} ceFactory = ComputingElementFactory() for site in resourceDict: result = self._resources.getSiteFullName( site ) if not result['OK']: continue siteFullName = result['Value'] for ce in resourceDict[site]: ceDict = resourceDict[site][ce] qDict = ceDict.pop( 'Queues' ) for queue in qDict: queueName = '%s_%s' % ( ce, queue ) self.queueDict[queueName] = {} self.queueDict[queueName]['ParametersDict'] = qDict[queue] self.queueDict[queueName]['ParametersDict']['Queue'] = queue 
self.queueDict[queueName]['ParametersDict']['Site'] = siteFullName self.queueDict[queueName]['ParametersDict']['GridEnv'] = self.gridEnv self.queueDict[queueName]['ParametersDict']['Setup'] = gConfig.getValue( '/DIRAC/Setup', 'unknown' ) # Evaluate the CPU limit of the queue according to the Glue convention # To Do: should be a utility if "maxCPUTime" in self.queueDict[queueName]['ParametersDict'] and \ "SI00" in self.queueDict[queueName]['ParametersDict']: maxCPUTime = float( self.queueDict[queueName]['ParametersDict']['maxCPUTime'] ) # For some sites there are crazy values in the CS maxCPUTime = max( maxCPUTime, 0 ) maxCPUTime = min( maxCPUTime, 86400 * 12.5 ) si00 = float( self.queueDict[queueName]['ParametersDict']['SI00'] ) queueCPUTime = 60. / 250. * maxCPUTime * si00 self.queueDict[queueName]['ParametersDict']['CPUTime'] = int( queueCPUTime ) qwDir = os.path.join( self.workingDirectory, queue ) if not os.path.exists( qwDir ): os.makedirs( qwDir ) self.queueDict[queueName]['ParametersDict']['WorkingDirectory'] = qwDir platform = '' if "Platform" in self.queueDict[queueName]['ParametersDict']: platform = self.queueDict[queueName]['ParametersDict']['Platform'] elif "Platform" in ceDict: platform = ceDict['Platform'] elif "OS" in ceDict: architecture = ceDict.get( 'architecture', 'x86_64' ) OS = ceDict['OS'] platform = '_'.join( [architecture, OS] ) if platform and not platform in self.platforms: self.platforms.append( platform ) if not "Platform" in self.queueDict[queueName]['ParametersDict'] and platform: result = Resources.getDIRACPlatform( platform ) if result['OK']: self.queueDict[queueName]['ParametersDict']['Platform'] = result['Value'] ceQueueDict = dict( ceDict ) ceQueueDict.update( self.queueDict[queueName]['ParametersDict'] ) result = ceFactory.getCE( ceName = ce, ceType = ceDict['CEType'], ceParametersDict = ceQueueDict ) if not result['OK']: return result self.queueDict[queueName]['CE'] = result['Value'] self.queueDict[queueName]['CEName'] = ce 
self.queueDict[queueName]['CEType'] = ceDict['CEType'] self.queueDict[queueName]['Site'] = siteFullName self.queueDict[queueName]['QueueName'] = queue self.queueDict[queueName]['Platform'] = platform result = self.queueDict[queueName]['CE'].isValid() if not result['OK']: self.log.fatal( result['Message'] ) return result if 'BundleProxy' in self.queueDict[queueName]['ParametersDict']: self.queueDict[queueName]['BundleProxy'] = True elif 'BundleProxy' in ceDict: self.queueDict[queueName]['BundleProxy'] = True if siteFullName not in self.sites: self.sites.append( siteFullName ) return S_OK() def execute( self ): """ Main execution method """ if not self.queueDict: self.log.warn( 'No site defined, exiting the cycle' ) return S_OK() result = self.submitJobs() if not result['OK']: self.log.error( 'Errors in the job submission: ', result['Message'] ) if self.updateStatus: result = self.updatePilotStatus() if not result['OK']: self.log.error( 'Errors in updating pilot status: ', result['Message'] ) return S_OK() def submitJobs( self ): """ Go through defined computing elements and submit jobs if necessary """ # Check that there is some work at all setup = CSGlobals.getSetup() tqDict = { 'Setup':setup, 'CPUTime': 9999999, 'SubmitPool' : self.defaultSubmitPools } if self.vo: tqDict['Community'] = self.vo if self.voGroups: tqDict['OwnerGroup'] = self.voGroups result = Resources.getCompatiblePlatforms( self.platforms ) if not result['OK']: return result tqDict['Platform'] = result['Value'] tqDict['Site'] = self.sites self.log.verbose( 'Checking overall TQ availability with requirements' ) self.log.verbose( tqDict ) rpcMatcher = RPCClient( "WorkloadManagement/Matcher" ) result = rpcMatcher.getMatchingTaskQueues( tqDict ) if not result[ 'OK' ]: return result if not result['Value']: self.log.verbose( 'No Waiting jobs suitable for the director' ) return S_OK() queues = self.queueDict.keys() random.shuffle( queues ) for queue in queues: ce = self.queueDict[queue]['CE'] ceName = 
self.queueDict[queue]['CEName'] ceType = self.queueDict[queue]['CEType'] queueName = self.queueDict[queue]['QueueName'] siteName = self.queueDict[queue]['Site'] siteMask = self.siteStatus.isUsableSite( siteName, 'ComputingAccess' ) platform = self.queueDict[queue]['Platform'] if 'CPUTime' in self.queueDict[queue]['ParametersDict'] : queueCPUTime = int( self.queueDict[queue]['ParametersDict']['CPUTime'] ) else: self.log.warn( 'CPU time limit is not specified for queue %s, skipping...' % queue ) continue if queueCPUTime > self.maxQueueLength: queueCPUTime = self.maxQueueLength # Get the working proxy cpuTime = queueCPUTime + 86400 self.log.verbose( "Getting pilot proxy for %s/%s %d long" % ( self.pilotDN, self.pilotGroup, cpuTime ) ) result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, cpuTime ) if not result['OK']: return result self.proxy = result['Value'] ce.setProxy( self.proxy, cpuTime - 60 ) # Get the number of available slots on the target site/queue result = ce.available() if not result['OK']: self.log.warn( 'Failed to check the availability of queue %s: \n%s' % ( queue, result['Message'] ) ) continue ceInfoDict = result['CEInfoDict'] self.log.info( "CE queue report(%s_%s): Wait=%d, Run=%d, Submitted=%d, Max=%d" % \ ( ceName, queueName, ceInfoDict['WaitingJobs'], ceInfoDict['RunningJobs'], ceInfoDict['SubmittedJobs'], ceInfoDict['MaxTotalJobs'] ) ) totalSlots = result['Value'] ceDict = ce.getParameterDict() ceDict[ 'GridCE' ] = ceName if not siteMask and 'Site' in ceDict: self.log.info( 'Site not in the mask %s' % siteName ) self.log.info( 'Removing "Site" from matching Dict' ) del ceDict[ 'Site' ] if self.vo: ceDict['Community'] = self.vo if self.voGroups: ceDict['OwnerGroup'] = self.voGroups # This is a hack to get rid of ! 
ceDict['SubmitPool'] = self.defaultSubmitPools result = Resources.getCompatiblePlatforms( platform ) if not result['OK']: continue ceDict['Platform'] = result['Value'] # Get the number of eligible jobs for the target site/queue result = rpcMatcher.getMatchingTaskQueues( ceDict ) if not result['OK']: self.log.error( 'Could not retrieve TaskQueues from TaskQueueDB', result['Message'] ) return result taskQueueDict = result['Value'] if not taskQueueDict: self.log.info( 'No matching TQs found' ) continue totalTQJobs = 0 tqIDList = taskQueueDict.keys() for tq in taskQueueDict: totalTQJobs += taskQueueDict[tq]['Jobs'] pilotsToSubmit = min( totalSlots, totalTQJobs ) # Get the number of already waiting pilots for this queue totalWaitingPilots = 0 if self.pilotWaitingFlag: lastUpdateTime = dateTime() - self.pilotWaitingTime * second result = pilotAgentsDB.countPilots( { 'TaskQueueID': tqIDList, 'Status': WAITING_PILOT_STATUS }, None, lastUpdateTime ) if not result['OK']: self.log.error( 'Failed to get Number of Waiting pilots', result['Message'] ) totalWaitingPilots = 0 else: totalWaitingPilots = result['Value'] self.log.verbose( 'Waiting Pilots for TaskQueue %s:' % tqIDList, totalWaitingPilots ) pilotsToSubmit = max( 0, min( totalSlots, totalTQJobs - totalWaitingPilots ) ) self.log.info( 'Available slots=%d, TQ jobs=%d, Waiting Pilots=%d, Pilots to submit=%d' % \ ( totalSlots, totalTQJobs, totalWaitingPilots, pilotsToSubmit ) ) # Limit the number of pilots to submit to MAX_PILOTS_TO_SUBMIT pilotsToSubmit = min( self.maxPilotsToSubmit, pilotsToSubmit ) while pilotsToSubmit > 0: self.log.info( 'Going to submit %d pilots to %s queue' % ( pilotsToSubmit, queue ) ) bundleProxy = self.queueDict[queue].get( 'BundleProxy', False ) jobExecDir = '' if ceType == 'CREAM': jobExecDir = '.' 
jobExecDir = self.queueDict[queue].get( 'JobExecDir', jobExecDir ) httpProxy = self.queueDict[queue].get( 'HttpProxy', '' ) result = self.__getExecutable( queue, pilotsToSubmit, bundleProxy, httpProxy, jobExecDir ) if not result['OK']: return result executable, pilotSubmissionChunk = result['Value'] result = ce.submitJob( executable, '', pilotSubmissionChunk ) os.unlink( executable ) if not result['OK']: self.log.error( 'Failed submission to queue %s:\n' % queue, result['Message'] ) pilotsToSubmit = 0 continue pilotsToSubmit = pilotsToSubmit - pilotSubmissionChunk # Add pilots to the PilotAgentsDB assign pilots to TaskQueue proportionally to the # task queue priorities pilotList = result['Value'] self.log.info( 'Submitted %d pilots to %s@%s' % ( len( pilotList ), queueName, ceName ) ) stampDict = {} if result.has_key( 'PilotStampDict' ): stampDict = result['PilotStampDict'] tqPriorityList = [] sumPriority = 0. for tq in taskQueueDict: sumPriority += taskQueueDict[tq]['Priority'] tqPriorityList.append( ( tq, sumPriority ) ) rndm = random.random()*sumPriority tqDict = {} for pilotID in pilotList: rndm = random.random()*sumPriority for tq, prio in tqPriorityList: if rndm < prio: tqID = tq break if not tqDict.has_key( tqID ): tqDict[tqID] = [] tqDict[tqID].append( pilotID ) for tqID, pilotList in tqDict.items(): result = pilotAgentsDB.addPilotTQReference( pilotList, tqID, self.pilotDN, self.pilotGroup, self.localhost, ceType, '', stampDict ) if not result['OK']: self.log.error( 'Failed add pilots to the PilotAgentsDB: ', result['Message'] ) continue for pilot in pilotList: result = pilotAgentsDB.setPilotStatus( pilot, 'Submitted', ceName, 'Successfully submitted by the SiteDirector', siteName, queueName ) if not result['OK']: self.log.error( 'Failed to set pilot status: ', result['Message'] ) continue return S_OK() ##################################################################################### def __getExecutable( self, queue, pilotsToSubmit, bundleProxy = True, 
httpProxy = '', jobExecDir = '' ): """ Prepare the full executable for queue """ proxy = None if bundleProxy: proxy = self.proxy pilotOptions, pilotsToSubmit = self.__getPilotOptions( queue, pilotsToSubmit ) if pilotOptions is None: return S_ERROR( 'Errors in compiling pilot options' ) executable = self.__writePilotScript( self.workingDirectory, pilotOptions, proxy, httpProxy, jobExecDir ) return S_OK( [ executable, pilotsToSubmit ] ) ##################################################################################### def __getPilotOptions( self, queue, pilotsToSubmit ): """ Prepare pilot options """ queueDict = self.queueDict[queue]['ParametersDict'] pilotOptions = [] setup = gConfig.getValue( "/DIRAC/Setup", "unknown" ) if setup == 'unknown': self.log.error( 'Setup is not defined in the configuration' ) return [ None, None ] pilotOptions.append( '-S %s' % setup ) opsHelper = Operations.Operations( group = self.pilotGroup, setup = setup ) #Installation defined? installationName = opsHelper.getValue( "Pilot/Installation", "" ) if installationName: pilotOptions.append( '-V %s' % installationName ) #Project defined? projectName = opsHelper.getValue( "Pilot/Project", "" ) if projectName: pilotOptions.append( '-l %s' % projectName ) else: self.log.info( 'DIRAC project will be installed by pilots' ) #Request a release diracVersion = opsHelper.getValue( "Pilot/Version", [] ) if not diracVersion: self.log.error( 'Pilot/Version is not defined in the configuration' ) return [ None, None ] #diracVersion is a list of accepted releases. 
Just take the first one pilotOptions.append( '-r %s' % diracVersion[0] ) ownerDN = self.pilotDN ownerGroup = self.pilotGroup # Request token for maximum pilot efficiency result = gProxyManager.requestToken( ownerDN, ownerGroup, pilotsToSubmit * self.maxJobsInFillMode ) if not result[ 'OK' ]: self.log.error( 'Invalid proxy token request', result['Message'] ) return [ None, None ] ( token, numberOfUses ) = result[ 'Value' ] pilotOptions.append( '-o /Security/ProxyToken=%s' % token ) # Use Filling mode pilotOptions.append( '-M %s' % min( numberOfUses, self.maxJobsInFillMode ) ) # Since each pilot will execute min( numberOfUses, self.maxJobsInFillMode ) # with numberOfUses tokens we can submit at most: # numberOfUses / min( numberOfUses, self.maxJobsInFillMode ) # pilots newPilotsToSubmit = numberOfUses / min( numberOfUses, self.maxJobsInFillMode ) if newPilotsToSubmit != pilotsToSubmit: self.log.info( 'Number of pilots to submit is changed to %d after getting the proxy token' % newPilotsToSubmit ) pilotsToSubmit = newPilotsToSubmit # Debug if self.pilotLogLevel.lower() == 'debug': pilotOptions.append( '-d' ) # CS Servers csServers = gConfig.getValue( "/DIRAC/Configuration/Servers", [] ) pilotOptions.append( '-C %s' % ",".join( csServers ) ) # DIRAC Extensions to be used in pilots pilotExtensionsList = opsHelper.getValue( "Pilot/Extensions", [] ) extensionsList = [] if pilotExtensionsList: if pilotExtensionsList[0] != 'None': extensionsList = pilotExtensionsList else: extensionsList = CSGlobals.getCSExtensions() if extensionsList: pilotOptions.append( '-e %s' % ",".join( extensionsList ) ) # Requested CPU time pilotOptions.append( '-T %s' % queueDict['CPUTime'] ) # CEName pilotOptions.append( '-N %s' % self.queueDict[queue]['CEName'] ) # SiteName pilotOptions.append( '-n %s' % queueDict['Site'] ) if 'ClientPlatform' in queueDict: pilotOptions.append( "-p '%s'" % queueDict['ClientPlatform'] ) if 'SharedArea' in queueDict: pilotOptions.append( "-o 
'/LocalSite/SharedArea=%s'" % queueDict['SharedArea'] ) if 'SI00' in queueDict: factor = float( queueDict['SI00'] ) / 250. pilotOptions.append( "-o '/LocalSite/CPUScalingFactor=%s'" % factor ) pilotOptions.append( "-o '/LocalSite/CPUNormalizationFactor=%s'" % factor ) else: if 'CPUScalingFactor' in queueDict: pilotOptions.append( "-o '/LocalSite/CPUScalingFactor=%s'" % queueDict['CPUScalingFactor'] ) if 'CPUNormalizationFactor' in queueDict: pilotOptions.append( "-o '/LocalSite/CPUNormalizationFactor=%s'" % queueDict['CPUNormalizationFactor'] ) # Hack if self.defaultSubmitPools: pilotOptions.append( '-o /Resources/Computing/CEDefaults/SubmitPool=%s' % self.defaultSubmitPools ) if self.group: pilotOptions.append( '-G %s' % self.group ) self.log.verbose( "pilotOptions: ", ' '.join( pilotOptions ) ) return [ pilotOptions, pilotsToSubmit ] ##################################################################################### def __writePilotScript( self, workingDirectory, pilotOptions, proxy = None, httpProxy = '', pilotExecDir = '' ): """ Bundle together and write out the pilot executable script, admixt the proxy if given """ try: compressedAndEncodedProxy = '' proxyFlag = 'False' if proxy is not None: compressedAndEncodedProxy = base64.encodestring( bz2.compress( proxy.dumpAllToString()['Value'] ) ) proxyFlag = 'True' compressedAndEncodedPilot = base64.encodestring( bz2.compress( open( self.pilot, "rb" ).read(), 9 ) ) compressedAndEncodedInstall = base64.encodestring( bz2.compress( open( self.install, "rb" ).read(), 9 ) ) except: self.log.exception( 'Exception during file compression of proxy, dirac-pilot or dirac-install' ) return S_ERROR( 'Exception during file compression of proxy, dirac-pilot or dirac-install' ) localPilot = """#!/bin/bash /usr/bin/env python << EOF # import os, tempfile, sys, shutil, base64, bz2 try: pilotExecDir = '%(pilotExecDir)s' if not pilotExecDir: pilotExecDir = None pilotWorkingDirectory = tempfile.mkdtemp( suffix = 'pilot', prefix = 
'DIRAC_', dir = pilotExecDir ) pilotWorkingDirectory = os.path.realpath( pilotWorkingDirectory ) os.chdir( pilotWorkingDirectory ) if %(proxyFlag)s: open( 'proxy', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedProxy)s\"\"\" ) ) ) os.chmod("proxy",0600) os.environ["X509_USER_PROXY"]=os.path.join(pilotWorkingDirectory, 'proxy') open( '%(pilotScript)s', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedPilot)s\"\"\" ) ) ) open( '%(installScript)s', "w" ).write(bz2.decompress( base64.decodestring( \"\"\"%(compressedAndEncodedInstall)s\"\"\" ) ) ) os.chmod("%(pilotScript)s",0700) os.chmod("%(installScript)s",0700) if "LD_LIBRARY_PATH" not in os.environ: os.environ["LD_LIBRARY_PATH"]="" if "%(httpProxy)s": os.environ["HTTP_PROXY"]="%(httpProxy)s" os.environ["X509_CERT_DIR"]=os.path.join(pilotWorkingDirectory, 'etc/grid-security/certificates') # TODO: structure the output print '===========================================================' print 'Environment of execution host' for key in os.environ.keys(): print key + '=' + os.environ[key] print '===========================================================' except Exception, x: print >> sys.stderr, x sys.exit(-1) cmd = "python %(pilotScript)s %(pilotOptions)s" print 'Executing: ', cmd sys.stdout.flush() os.system( cmd ) shutil.rmtree( pilotWorkingDirectory ) EOF """ % { 'compressedAndEncodedProxy': compressedAndEncodedProxy, 'compressedAndEncodedPilot': compressedAndEncodedPilot, 'compressedAndEncodedInstall': compressedAndEncodedInstall, 'httpProxy': httpProxy, 'pilotExecDir': pilotExecDir, 'pilotScript': os.path.basename( self.pilot ), 'installScript': os.path.basename( self.install ), 'pilotOptions': ' '.join( pilotOptions ), 'proxyFlag': proxyFlag } fd, name = tempfile.mkstemp( suffix = '_pilotwrapper.py', prefix = 'DIRAC_', dir = workingDirectory ) pilotWrapper = os.fdopen( fd, 'w' ) pilotWrapper.write( localPilot ) pilotWrapper.close() return name def 
updatePilotStatus( self ): """ Update status of pilots in transient states """ for queue in self.queueDict: ce = self.queueDict[queue]['CE'] ceName = self.queueDict[queue]['CEName'] queueName = self.queueDict[queue]['QueueName'] ceType = self.queueDict[queue]['CEType'] siteName = self.queueDict[queue]['Site'] result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName, 'Queue':queueName, 'GridType':ceType, 'GridSite':siteName, 'Status':TRANSIENT_PILOT_STATUS, 'OwnerDN': self.pilotDN, 'OwnerGroup': self.pilotGroup } ) if not result['OK']: self.log.error( 'Failed to select pilots: %s' % result['Message'] ) continue pilotRefs = result['Value'] if not pilotRefs: continue result = pilotAgentsDB.getPilotInfo( pilotRefs ) if not result['OK']: self.log.error( 'Failed to get pilots info from DB', result['Message'] ) continue pilotDict = result['Value'] stampedPilotRefs = [] for pRef in pilotDict: if pilotDict[pRef]['PilotStamp']: stampedPilotRefs.append( pRef + ":::" + pilotDict[pRef]['PilotStamp'] ) else: stampedPilotRefs = list( pilotRefs ) break result = ce.isProxyValid() if not result['OK']: result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 600 ) if not result['OK']: return result self.proxy = result['Value'] ce.setProxy( self.proxy, 500 ) result = ce.getJobStatus( stampedPilotRefs ) if not result['OK']: self.log.error( 'Failed to get pilots status from CE', '%s: %s' % ( ceName, result['Message'] ) ) continue pilotCEDict = result['Value'] for pRef in pilotRefs: newStatus = '' oldStatus = pilotDict[pRef]['Status'] ceStatus = pilotCEDict[pRef] if oldStatus == ceStatus: # Status did not change, continue continue elif ceStatus == "Unknown" and not oldStatus in FINAL_PILOT_STATUS: # Pilot finished without reporting, consider it Aborted newStatus = 'Aborted' elif ceStatus != 'Unknown' : # Update the pilot status to the new value newStatus = ceStatus if newStatus: self.log.info( 'Updating status to %s for pilot %s' % ( newStatus, pRef ) ) 
result = pilotAgentsDB.setPilotStatus( pRef, newStatus, '', 'Updated by SiteDirector' ) # Retrieve the pilot output now if newStatus in FINAL_PILOT_STATUS: if pilotDict[pRef]['OutputReady'].lower() == 'false' and self.getOutput: self.log.info( 'Retrieving output for pilot %s' % pRef ) pilotStamp = pilotDict[pRef]['PilotStamp'] pRefStamp = pRef if pilotStamp: pRefStamp = pRef + ':::' + pilotStamp result = ce.getJobOutput( pRefStamp ) if not result['OK']: self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) ) else: output, error = result['Value'] if output: result = pilotAgentsDB.storePilotOutput( pRef, output, error ) if not result['OK']: self.log.error( 'Failed to store pilot output', result['Message'] ) else: self.log.warn( 'Empty pilot output not stored to PilotDB' ) # The pilot can be in Done state set by the job agent check if the output is retrieved for queue in self.queueDict: ce = self.queueDict[queue]['CE'] if not ce.isProxyValid( 120 ): result = gProxyManager.getPilotProxyFromDIRACGroup( self.pilotDN, self.pilotGroup, 1000 ) if not result['OK']: return result ce.setProxy( self.proxy, 940 ) ceName = self.queueDict[queue]['CEName'] queueName = self.queueDict[queue]['QueueName'] ceType = self.queueDict[queue]['CEType'] siteName = self.queueDict[queue]['Site'] result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName, 'Queue':queueName, 'GridType':ceType, 'GridSite':siteName, 'OutputReady':'False', 'Status':FINAL_PILOT_STATUS} ) if not result['OK']: self.log.error( 'Failed to select pilots', result['Message'] ) continue pilotRefs = result['Value'] if not pilotRefs: continue result = pilotAgentsDB.getPilotInfo( pilotRefs ) if not result['OK']: self.log.error( 'Failed to get pilots info from DB', result['Message'] ) continue pilotDict = result['Value'] if self.getOutput: for pRef in pilotRefs: self.log.info( 'Retrieving output for pilot %s' % pRef ) pilotStamp = pilotDict[pRef]['PilotStamp'] pRefStamp = pRef if pilotStamp: 
pRefStamp = pRef + ':::' + pilotStamp result = ce.getJobOutput( pRefStamp ) if not result['OK']: self.log.error( 'Failed to get pilot output', '%s: %s' % ( ceName, result['Message'] ) ) else: output, error = result['Value'] result = pilotAgentsDB.storePilotOutput( pRef, output, error ) if not result['OK']: self.log.error( 'Failed to store pilot output', result['Message'] ) # Check if the accounting is to be sent if self.sendAccounting: result = pilotAgentsDB.selectPilots( {'DestinationSite':ceName, 'Queue':queueName, 'GridType':ceType, 'GridSite':siteName, 'AccountingSent':'False', 'Status':FINAL_PILOT_STATUS} ) if not result['OK']: self.log.error( 'Failed to select pilots', result['Message'] ) continue pilotRefs = result['Value'] if not pilotRefs: continue result = pilotAgentsDB.getPilotInfo( pilotRefs ) if not result['OK']: self.log.error( 'Failed to get pilots info from DB', result['Message'] ) continue pilotDict = result['Value'] result = self.sendPilotAccounting( pilotDict ) if not result['OK']: self.log.error( 'Failed to send pilot agent accounting' ) return S_OK() def sendPilotAccounting( self, pilotDict ): """ Send pilot accounting record """ for pRef in pilotDict: self.log.verbose( 'Preparing accounting record for pilot %s' % pRef ) pA = PilotAccounting() pA.setEndTime( pilotDict[pRef][ 'LastUpdateTime' ] ) pA.setStartTime( pilotDict[pRef][ 'SubmissionTime' ] ) retVal = CS.getUsernameForDN( pilotDict[pRef][ 'OwnerDN' ] ) if not retVal[ 'OK' ]: userName = '******' self.log.error( "Can't determine username for dn:", pilotDict[pRef][ 'OwnerDN' ] ) else: userName = retVal[ 'Value' ] pA.setValueByKey( 'User', userName ) pA.setValueByKey( 'UserGroup', pilotDict[pRef][ 'OwnerGroup' ] ) result = getSiteForCE( pilotDict[pRef][ 'DestinationSite' ] ) if result['OK'] and result[ 'Value' ].strip(): pA.setValueByKey( 'Site', result['Value'].strip() ) else: pA.setValueByKey( 'Site', 'Unknown' ) pA.setValueByKey( 'GridCE', pilotDict[pRef][ 'DestinationSite' ] ) 
pA.setValueByKey( 'GridMiddleware', pilotDict[pRef][ 'GridType' ] ) pA.setValueByKey( 'GridResourceBroker', pilotDict[pRef][ 'Broker' ] ) pA.setValueByKey( 'GridStatus', pilotDict[pRef][ 'Status' ] ) if not 'Jobs' in pilotDict[pRef]: pA.setValueByKey( 'Jobs', 0 ) else: pA.setValueByKey( 'Jobs', len( pilotDict[pRef]['Jobs'] ) ) self.log.info( "Adding accounting record for pilot %s" % pilotDict[pRef][ 'PilotID' ] ) retVal = gDataStoreClient.addRegister( pA ) if not retVal[ 'OK' ]: self.log.error( 'Failed to send accounting info for pilot ', pRef ) else: # Set up AccountingSent flag result = pilotAgentsDB.setAccountingFlag( pRef ) if not result['OK']: self.log.error( 'Failed to set accounting flag for pilot ', pRef ) self.log.info( 'Committing accounting records for %d pilots' % len( pilotDict ) ) result = gDataStoreClient.commit() if result['OK']: for pRef in pilotDict: self.log.verbose( 'Setting AccountingSent flag for pilot %s' % pRef ) result = pilotAgentsDB.setAccountingFlag( pRef ) if not result['OK']: self.log.error( 'Failed to set accounting flag for pilot ', pRef ) else: return result return S_OK()
class ResourceStatus(ElementStatus):
  """
  ResourceStatus helper that connects to CS if RSS flag is not Active. It keeps
  the connection to the db / server as an object member, to avoid creating a new
  one massively.

  The status of a resource is additionally overridden by the status of its
  site: when the site is not usable for the relevant access type, all the
  statuses returned for the resource are reported as 'Banned'.
  """

  __metaclass__ = DIRACSingleton

  def __init__(self):
    """
    Constructor, initializes the logger, rssClient and caches.

    examples
      >>> resourceStatus = ResourceStatus()
    """

    super(ResourceStatus, self).__init__()

    self.siteStatus = SiteStatus()

    # We can set CacheLifetime and CacheHistory from CS, so that we can tune them.
    cacheLifeTime = int(RssConfiguration().getConfigCache())

    # RSSCaches, one per elementType ( StorageElement, ComputingElement )
    # Should be generated on the fly, instead of being hardcoded ?
    self.seCache = RSSCache('StorageElement', cacheLifeTime, self._updateSECache)
    self.ceCache = RSSCache('ComputingElement', cacheLifeTime, self._updateCECache)

  #.............................................................................
  # ComputingElement methods

  def getComputingElementStatuses(self, ceNames, statusTypes=None):
    """
    Method that queries the RSSCache for ComputingElement-Status-related information.
    If any of the inputs is None, it is interpreted as * ( all ).

    If match is positive, the output looks like:
    {
     computingElementA : { statusType1 : status1, statusType2 : status2 },
     computingElementB : { statusType1 : status1, statusType2 : status2 },
    }

    There are ALWAYS the same keys inside the site dictionaries.

    Computing elements whose site is not usable for 'ComputingAccess' (or whose
    site cannot be determined) get all their statuses replaced by 'Banned'.

    examples
      >>> resourceStatus.getComputingElementStatuses( 'ce207.cern.ch', None )
          S_OK( { 'ce207.cern.ch' : { 'all' : 'Active' } } )
      >>> resourceStatus.getComputingElementStatuses( 'RubbishCE', None )
          S_ERROR( ... )
      >>> resourceStatus.getComputingElementStatuses( 'ce207.cern.ch', 'all' )
          S_OK( { 'ce207.cern.ch' : { 'all' : 'Active' } } )
      >>> resourceStatus.getComputingElementStatuses( [ 'ce206.cern.ch', 'ce207.cern.ch' ], 'all' )
          S_OK( { 'ce206.cern.ch' : { 'all' : 'Active' },
                  'ce207.cern.ch' : { 'all' : 'Active' } } )
      >>> resourceStatus.getComputingElementStatuses( None, 'all' )
          S_OK( { 'ce206.cern.ch' : { 'all' : 'Active' },
                  'ce207.cern.ch' : { 'all' : 'Active' },
                  ... } )

    :Parameters:
      **ceNames** - [ None, `string`, `list` ]
        name(s) of the computing elements to be matched
      **statusTypes** - [ None, `string`, `list` ]
        name(s) of the statusTypes to be matched

    :return: S_OK() || S_ERROR()
    """

    cacheMatch = self.ceCache.match(ceNames, statusTypes)
    if not cacheMatch['OK']:
      return cacheMatch

    return S_OK(self.__banIfSiteNotUsable('ComputingElement', 'ComputingAccess', cacheMatch['Value']))

  def getComputingElementStatus(self, ceName, statusType):
    """
    Given a ce and a statusType, it returns its status from the cache.

    examples
      >>> resourceStatus.getComputingElementStatus( 'ce207.cern.ch', 'all' )
          S_OK( 'Active' )
      >>> resourceStatus.getComputingElementStatus( 'ce207.cern.ch', None )
          S_ERROR( ... )

    :Parameters:
      **ceName** - `string`
        name of the computing element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    return self.getElementStatus('ComputingElement', ceName, statusType)

  def isUsableComputingElement(self, ceName, statusType):
    """
    Similar method to getComputingElementStatus. The difference is the output.
    Given a ce name, returns a bool if the ce is usable:
    status is Active or Degraded outputs True
    anything else outputs False

    examples
      >>> resourceStatus.isUsableComputingElement( 'ce207.cern.ch', 'all' )
          True
      >>> resourceStatus.isUsableComputingElement( 'ce207.cern.ch', 'all' )
          False # May be banned
      >>> resourceStatus.isUsableComputingElement( 'ce207.cern.ch', None )
          False
      >>> resourceStatus.isUsableComputingElement( 'RubbishCE', 'all' )
          False
      >>> resourceStatus.isUsableComputingElement( 'ce207.cern.ch', 'RubbishAccess' )
          False

    :Parameters:
      **ceName** - `string`
        name of the computing element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: bool, as described above ( the value comes from isUsableElement )
    """

    return self.isUsableElement('ComputingElement', ceName, statusType)

  def getUsableComputingElements(self, statusType):
    """
    For a given statusType, returns all computing elements that are usable: their
    status for that particular statusType is either Active or Degraded; in a list.

    examples
      >>> resourceStatus.getUsableComputingElements( 'all' )
          S_OK( [ 'ce206.cern.ch', 'ce207.cern.ch',... ] )
      >>> resourceStatus.getUsableComputingElements( None )
          S_ERROR( ... )
      >>> resourceStatus.getUsableComputingElements( 'RubbishAccess' )
          S_ERROR( ... )

    :Parameters:
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    return self.getUsableElements('ComputingElement', statusType)

  #.............................................................................
  # StorageElement methods

  def getStorageElementStatuses(self, seNames, statusTypes=None):
    """
    Method that queries the RSSCache for StorageElement-Status-related information.
    If any of the inputs is None, it is interpreted as * ( all ).

    If match is positive, the output looks like:
    {
     storageElementA : { statusType1 : status1, statusType2 : status2 },
     storageElementB : { statusType1 : status1, statusType2 : status2 },
    }

    There are ALWAYS the same keys inside the site dictionaries.

    Storage elements whose site is not usable for 'StorageAccess' (or whose
    site cannot be determined) get all their statuses replaced by 'Banned'.

    examples
      >>> resourceStatus.getStorageElementStatuses( 'CERN-USER', None )
          S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active', 'WriteAccess' : 'Degraded',... } } )
      >>> resourceStatus.getStorageElementStatuses( 'RubbishCE', None )
          S_ERROR( ... )
      >>> resourceStatus.getStorageElementStatuses( 'CERN-USER', 'ReadAccess' )
          S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active' } } )
      >>> resourceStatus.getStorageElementStatuses( [ 'CERN-USER', 'PIC-USER' ], 'ReadAccess' )
          S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active' },
                  'PIC-USER' : { 'ReadAccess' : 'Active' } } )
      >>> resourceStatus.getStorageElementStatuses( None, 'ReadAccess' )
          S_OK( { 'CERN-USER' : { 'ReadAccess' : 'Active' },
                  'PIC-USER' : { 'ReadAccess' : 'Active' },
                  ... } )

    :Parameters:
      **seNames** - [ None, `string`, `list` ]
        name(s) of the storage elements to be matched
      **statusTypes** - [ None, `string`, `list` ]
        name(s) of the statusTypes to be matched

    :return: S_OK() || S_ERROR()
    """

    cacheMatch = self.seCache.match(seNames, statusTypes)
    if not cacheMatch['OK']:
      return cacheMatch

    return S_OK(self.__banIfSiteNotUsable('StorageElement', 'StorageAccess', cacheMatch['Value']))

  def getStorageElementStatus(self, seName, statusType):
    """
    Given a se and a statusType, it returns its status from the cache.

    examples
      >>> resourceStatus.getStorageElementStatus( 'CERN-USER', 'ReadAccess' )
          S_OK( 'Active' )
      >>> resourceStatus.getStorageElementStatus( 'CERN-USER', None )
          S_ERROR( ... )

    :Parameters:
      **seName** - `string`
        name of the storage element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    return self.getElementStatus('StorageElement', seName, statusType)

  def isUsableStorageElement(self, seName, statusType):
    """
    Similar method to getStorageElementStatus. The difference is the output.
    Given a se name, returns a bool if the se is usable:
    status is Active or Degraded outputs True
    anything else outputs False

    examples
      >>> resourceStatus.isUsableStorageElement( 'CERN-USER', 'ReadAccess' )
          True
      >>> resourceStatus.isUsableStorageElement( 'CERN-ARCHIVE', 'ReadAccess' )
          False # May be banned
      >>> resourceStatus.isUsableStorageElement( 'CERN-USER', None )
          False
      >>> resourceStatus.isUsableStorageElement( 'RubbishCE', 'ReadAccess' )
          False
      >>> resourceStatus.isUsableStorageElement( 'CERN-USER', 'RubbishAccess' )
          False

    :Parameters:
      **seName** - `string`
        name of the storage element to be matched
      **statusType** - `string`
        name of the statusType to be matched

    :return: bool, as described above ( the value comes from isUsableElement )
    """

    return self.isUsableElement('StorageElement', seName, statusType)

  def getUsableStorageElements(self, statusType):
    """
    For a given statusType, returns all storage elements that are usable: their
    status for that particular statusType is either Active or Degraded; in a list.

    examples
      >>> resourceStatus.getUsableStorageElements( 'ReadAccess' )
          S_OK( [ 'CERN-USER', 'PIC-USER',... ] )
      >>> resourceStatus.getUsableStorageElements( None )
          S_ERROR( ... )
      >>> resourceStatus.getUsableStorageElements( 'RubbishAccess' )
          S_ERROR( ... )

    :Parameters:
      **statusType** - `string`
        name of the statusType to be matched

    :return: S_OK() || S_ERROR()
    """

    return self.getUsableElements('StorageElement', statusType)

  #.............................................................................
  # Private methods

  def __banIfSiteNotUsable(self, resourceType, siteAccess, cacheMatch):
    """
    Replaces by 'Banned' all the statuses of the elements in <cacheMatch> whose
    site access check ( see __getSiteAccess ) does not succeed. The dictionary is
    modified in place and returned.

    :Parameters:
      **resourceType** - `string`
        name of the resource type ( StorageElement, ComputingElement.. )
      **siteAccess** - `string`
        site access ( StorageAccess, ComputingAccess .. )
      **cacheMatch** - `dict`
        { elementName : { statusType : status, ... }, ... }

    :return: dict, same structure ( and same object ) as <cacheMatch>
    """

    for elementName, statusDict in cacheMatch.items():
      if not self.__getSiteAccess(resourceType, elementName, siteAccess)['OK']:
        # Same statusType keys, every status forced to Banned
        cacheMatch[elementName] = dict.fromkeys(statusDict, 'Banned')

    return cacheMatch

  def __getSiteAccess(self, resourceType, elementName, siteAccess):
    """
    Method that given a resourceType and an elementName, finds the site name
    that owns it. Once that is done, the site access <siteAccess> is checked
    and returned.

    :Parameters:
      **resourceType** - `string`
        name of the resource type ( StorageElement, ComputingElement.. )
      **elementName** - `string`
        name of the resource of type <resourceType>
      **siteAccess** - `string`
        site access ( StorageAccess, ComputingAccess .. )

    :return: S_OK() || S_ERROR()
    """

    siteName = Resources.getSiteForResource(resourceType, elementName)
    if not siteName['OK']:
      return siteName
    siteName = siteName['Value']

    if not self.siteStatus.isUsableSite(siteName, siteAccess):
      # Report the access type actually checked, not always "Computing"
      return S_ERROR('Site %s is not usable for %s' % (siteName, siteAccess))

    return S_OK()

  #.............................................................................
  #.............................................................................
  #.............................................................................
  #.............................................................................
  # Old code, to be deleted / refactored soon.

#  def getStorageElementStatus( self, elementName, statusType = None ):
#    """
#    Helper with dual access, tries to get information from the RSS for the given
#    StorageElement, otherwise, it gets it from the CS.
#
#    example:
#      >>> getStorageElementStatus( 'CERN-USER', 'Read' )
#          S_OK( { 'CERN-USER' : { 'Read': 'Active' } } )
#      >>> getStorageElementStatus( 'CERN-USER', 'Write' )
#          S_OK( { 'CERN-USER' : {'Read': 'Active', 'Write': 'Active', 'Check': 'Banned', 'Remove': 'Banned'}} )
#      >>> getStorageElementStatus( 'CERN-USER', 'ThisIsAWrongStatusType' )
#          S_ERROR( xyz.. )
#      >>> getStorageElementStatus( 'CERN-USER', 'ThisIsAWrongStatusType' )
#          S_OK( 'Unknown' )
#
#    """
#
#    if self.__getMode():
#      # We do not apply defaults. If is not on the cache, S_ERROR is returned.
#      return self.__getRSSStorageElementStatus( elementName, statusType )
#    else:
#      return self.__getCSStorageElementStatus( elementName, statusType )

  # FIXME: to be deleted !!! ONLY RSS ( scripts, agents and web portal ) should set statuses
#  def setStorageElementStatus( self, elementName, statusType, status, reason = None,
#                               tokenOwner = None ):
#
#    """
#    Helper with dual access, tries set information in RSS and in CS.
#
#    example:
#      >>> getStorageElementStatus( 'CERN-USER', 'Read' )
#          S_OK( { 'Read': 'Active' } )
#      >>> getStorageElementStatus( 'CERN-USER', 'Write' )
#          S_OK( {'Read': 'Active', 'Write': 'Active', 'Check': 'Banned', 'Remove': 'Banned'} )
#      >>> getStorageElementStatus( 'CERN-USER', 'ThisIsAWrongStatusType' )
#          S_ERROR( xyz.. )
#      >>> getStorageElementStatus( 'CERN-USER', 'ThisIsAWrongStatusType', 'Unknown' )
#          S_OK( 'Unknown' )
#    """
#
#    #if self.__getMode():
#    #return self.__setRSSStorageElementStatus( elementName, statusType, status, reason, tokenOwner )
#    #else:
#    #  return self.__setCSStorageElementStatus( elementName, statusType, status )

  #.............................................................................
  # update Cache methods

  def _updateCECache(self):
    """
    Method used to update the ComputingElementCache.
    """
    return self.__updateCache('ComputingElement')

  def _updateSECache(self):
    """
    Method used to update the StorageElementCache.
    """
    return self.__updateCache('StorageElement')

  def __updateCache(self, elementType):
    """
    Gets the ( Name, StatusType, Status ) columns of all the resources of type
    <elementType> and formats them with getCacheDictFromRawData.

    :Parameters:
      **elementType** - `string`
        name of the element type ( StorageElement, ComputingElement.. )

    :return: S_OK( formatted cache ) || S_ERROR()
    """

    # self.rssClient and self.getCacheDictFromRawData are not defined in this
    # class; presumably they are provided by ElementStatus.
    meta = {'columns': ['Name', 'StatusType', 'Status']}
    rawCache = self.rssClient.selectStatusElement('Resource', 'Status',
                                                  elementType=elementType,
                                                  meta=meta)

    if not rawCache['OK']:
      return rawCache
    return S_OK(self.getCacheDictFromRawData(rawCache['Value']))

  #.............................................................................
  #.............................................................................
  #.............................................................................
  #.............................................................................
  # TODO : delete all this

#  def __getRSSStorageElementStatus( self, elementName, statusType ):
#    """
#    Gets from the cache or the RSS the StorageElements status. The cache is a
#    copy of the DB table. If it is not on the cache, most likely is not going
#    to be on the DB.
#
#    There is one exception: item just added to the CS, e.g. new StorageElement.
# The period between it is added to the DB and the changes are propagated # to the cache will be inconsisten, but not dangerous. Just wait <cacheLifeTime> # minutes. # """ # # siteAccess = self.__getSiteAccess( 'StorageElement', elementName, 'StorageAccess' ) # if not siteAccess[ 'OK' ]: # self.log.error( siteAccess[ 'Message' ] ) # return siteAccess # # cacheMatch = self.seCache.match( elementName, statusType ) # # self.log.debug( '__getRSSStorageElementStatus' ) # self.log.debug( cacheMatch ) # # return cacheMatch # def __getCSStorageElementStatus( self, elementName, statusType, default = None ): # """ # Gets from the CS the StorageElements status # """ # # cs_path = "/Resources/StorageElements" # # if not isinstance( elementName, list ): # elementName = [ elementName ] # # statuses = self.rssConfig.getConfigStatusType( 'StorageElement' ) # # result = {} # for element in elementName: # # if statusType is not None: # # Added Active by default # res = gConfig.getOption( "%s/%s/%s" % ( cs_path, element, statusType ), 'Active' ) # if res[ 'OK' ] and res[ 'Value' ]: # result[ element ] = { statusType : res[ 'Value' ] } # # else: # res = gConfig.getOptionsDict( "%s/%s" % ( cs_path, element ) ) # if res[ 'OK' ] and res[ 'Value' ]: # elementStatuses = {} # for elementStatusType, value in res[ 'Value' ].items(): # if elementStatusType in statuses: # elementStatuses[ elementStatusType ] = value # # # If there is no status defined in the CS, we add by default Read and # # Write as Active. # if elementStatuses == {}: # elementStatuses = { 'ReadAccess' : 'Active', 'WriteAccess' : 'Active' } # # result[ element ] = elementStatuses # # if result: # return S_OK( result ) # # if default is not None: # # # sec check # if statusType is None: # statusType = 'none' # # defList = [ [ el, statusType, default ] for el in elementName ] # return S_OK( getDictFromList( defList ) ) # # _msg = "StorageElement '%s', with statusType '%s' is unknown for CS." 
# return S_ERROR( _msg % ( elementName, statusType ) ) # def __setRSSStorageElementStatus( self, elementName, statusType, status, reason, tokenOwner ): # """ # Sets on the RSS the StorageElements status # """ # # expiration = datetime.datetime.utcnow() + datetime.timedelta( days = 1 ) # # self.seCache.acquireLock() # try: # res = self.rssClient.modifyStatusElement( 'Resource', 'Status', name = elementName, # statusType = statusType, status = status, # reason = reason, tokenOwner = tokenOwner, # tokenExpiration = expiration ) # if res[ 'OK' ]: # self.seCache.refreshCache() # # if not res[ 'OK' ]: # _msg = 'Error updating StorageElement (%s,%s,%s)' % ( elementName, statusType, status ) # gLogger.warn( 'RSS: %s' % _msg ) # # return res # # finally: # # Release lock, no matter what. # self.seCache.releaseLock() # def __setCSStorageElementStatus( self, elementName, statusType, status ): # """ # Sets on the CS the StorageElements status # """ # # statuses = self.rssConfig.getConfigStatusType( 'StorageElement' ) # if not statusType in statuses: # gLogger.error( "%s is not a valid statusType" % statusType ) # return S_ERROR( "%s is not a valid statusType: %s" % ( statusType, statuses ) ) # # csAPI = CSAPI() # # cs_path = "/Resources/StorageElements" # # csAPI.setOption( "%s/%s/%s" % ( cs_path, elementName, statusType ), status ) # # res = csAPI.commitChanges() # if not res[ 'OK' ]: # gLogger.warn( 'CS: %s' % res[ 'Message' ] ) # # return res # def __getMode( self ): # """ # Get's flag defined ( or not ) on the RSSConfiguration. If defined as 1, # we use RSS, if not, we use CS. 
# """ # # res = self.rssConfig.getConfigState() # # if res == 'Active': # # if self.rssClient is None: # self.rssClient = ResourceStatusClient() # return True # # self.rssClient = None # return False ################################################################################ #def getDictFromList( fromList ): # ''' # Auxiliary method that given a list returns a dictionary of dictionaries: # { site1 : { statusType1 : st1, statusType2 : st2 }, ... } # ''' # # res = {} # for listElement in fromList: # site, sType, status = listElement # if not res.has_key( site ): # res[ site ] = {} # res[ site ][ sType ] = status # return res #def getCacheDictFromRawData( rawList ): # """ # Formats the raw data list, which we know it must have tuples of three elements. # ( element1, element2, element3 ) into a list of tuples with the format # ( ( element1, element2 ), element3 ). Then, it is converted to a dictionary, # which will be the new Cache. # # It happens that element1 is elementName, element2 is statusType and element3 # is status. # # :Parameters: # **rawList** - `list` # list of three element tuples [( element1, element2, element3 ),... ] # # :return: dict of the form { ( elementName, statusType ) : status, ... } # """ # # res = [ ( ( name, sType ), status ) for name, sType, status in rawList ] # return dict( res ) ################################################################################ #EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF
def getPilotSummaryWeb(self, selectDict, sortList, startItem, maxItems):
  """ Get summary of the pilot jobs status by CE/site in a standard structure

      Four counter queries are issued on the PilotAgents table: the main one
      with the given selection, plus Aborted pilots of the last hour,
      Aborted/Done pilots of the last day and Done pilots of the last day with
      CurrentJobID 0 ("empty" pilots). The last three overwrite the Aborted_Hour,
      Aborted/Done and Done_Empty columns of the rows produced by the main query.

      :param selectDict: selection conditions. The keys LastUpdateTime, GridSite,
                         Status and ExpandSite are interpreted here, the rest is passed
                         to getCounters. The dictionary is not modified.
      :param sortList: not used by this method
      :param startItem: index of the first record to return
      :param maxItems: number of records to return, 0 means all of them
      :return: S_OK( dict ) with the keys TotalRecords, ParameterNames, Records and
               Extras ( the totals over all sites ), or the failed result of a query
  """
  stateNames = ['Submitted', 'Ready', 'Scheduled', 'Waiting', 'Running', 'Done', 'Aborted']
  allStateNames = stateNames + ['Done_Empty', 'Aborted_Hour']
  paramNames = ['Site', 'CE'] + allStateNames
  totalColumns = allStateNames + ['Total']
  # Position of the 'Status' column in a record: after Total, PilotsPerJob, PilotJobEff
  statusIndex = len(paramNames) + 3

  def asList(value):
    """ Wrap a single value into a list """
    if isinstance(value, list):
      return value
    return [value]

  def pilotsPerJob(done, empty):
    """ Pilot submission efficiency: Done pilots per non-empty Done pilot """
    if done - empty > 0:
      return float(done) / float(done - empty)
    if done != 0 and empty == done:
      return 99.
    return 0.

  def jobEfficiency(total, aborted):
    """ Percentage of non-aborted pilots, 100 when there are no pilots """
    if total > 0:
      return float(total - aborted) / float(total) * 100.
    return 100.

  def quality(total, eff, minPilots):
    """ Quality label, 'Idle' unless there are more than minPilots pilots """
    if total <= minPilots:
      return 'Idle'
    if eff < 25.:
      return 'Bad'
    if eff < 60.:
      return 'Poor'
    if eff < 85.:
      return 'Fair'
    return 'Good'

  def summaryColumns(done, empty, aborted, total, minPilots):
    """ The [ PilotsPerJob, PilotJobEff, Status ] columns for the given counters """
    jobEff = jobEfficiency(total, aborted)
    return ['%.2f' % pilotsPerJob(done, empty), '%.2f' % jobEff, quality(total, jobEff, minPilots)]

  # Work on a copy: the selection is edited below and must not leak to the caller
  selectDict = dict(selectDict)

  lastUpdate = selectDict.pop('LastUpdateTime', None)
  siteSelect = asList(selectDict.pop('GridSite', []))
  statusSelect = asList(selectDict.pop('Status', []))
  expandSite = ''
  if 'ExpandSite' in selectDict:
    expandSite = selectDict.pop('ExpandSite')
    siteSelect = [expandSite]

  def queryCounters(newer):
    """ Count pilots per ( GridSite, DestinationSite, Status ) with the current selection """
    return self.getCounters('PilotAgents',
                            ['GridSite', 'DestinationSite', 'Status'],
                            selectDict, newer=newer,
                            timeStamp='LastUpdateTime')

  # Get all the data from the database with various selections
  result = queryCounters(lastUpdate)
  if not result['OK']:
    return result

  lastHour = Time.dateTime() - Time.hour
  selectDict['Status'] = 'Aborted'
  resultHour = queryCounters(lastHour)
  if not resultHour['OK']:
    return resultHour

  lastDay = Time.dateTime() - Time.day
  selectDict['Status'] = ['Aborted', 'Done']
  resultDay = queryCounters(lastDay)
  if not resultDay['OK']:
    return resultDay

  selectDict['CurrentJobID'] = 0
  selectDict['Status'] = 'Done'
  resultDayEmpty = queryCounters(lastDay)
  if not resultDayEmpty['OK']:
    return resultDayEmpty

  ceMap = {}
  resMap = getCESiteMapping()
  if resMap['OK']:
    ceMap = resMap['Value']

  def resolveSite(site, ce):
    """ Replace an 'Unknown' site by the site of the CE when it can be determined """
    if site == 'Unknown' and ce not in ('Unknown', 'Multiple') and ce in ceMap:
      return ceMap[ce]
    return site

  # Sort out different counters: { site : { ce : { state : count } } }
  resultDict = {'Unknown': {}}
  for attDict, count in result['Value']:
    ce = attDict['DestinationSite']
    site = resolveSite(attDict['GridSite'], ce)
    ceDicts = resultDict.setdefault(site, {})
    if ce not in ceDicts:
      ceDicts[ce] = dict.fromkeys(allStateNames, 0)
    ceDicts[ce][attDict['Status']] = count

  def overlay(counters, stateToColumn):
    """ Overwrite columns of the existing rows with the given counters """
    for attDict, count in counters:
      column = stateToColumn.get(attDict['Status'])
      if column is None:
        continue
      ce = attDict['DestinationSite']
      site = resolveSite(attDict['GridSite'], ce)
      ceCounters = resultDict.get(site, {}).get(ce)
      # A CE which is not in the main selection ( e.g. narrower LastUpdateTime )
      # has no row to update: skip it instead of failing with a KeyError
      if ceCounters is not None:
        ceCounters[column] = count

  overlay(resultDay['Value'], {'Done': 'Done', 'Aborted': 'Aborted'})
  overlay(resultDayEmpty['Value'], {'Done': 'Done_Empty'})
  overlay(resultHour['Value'], {'Aborted': 'Aborted_Hour'})

  records = []
  siteSumDict = {}
  for site in resultDict:
    ceDicts = resultDict[site]
    sumDict = dict.fromkeys(totalColumns, 0)

    for ce in ceDicts:
      ceCounters = ceDicts[ce]
      # The total does not include Done_Empty and Aborted_Hour, which are
      # subsets of Done and Aborted
      total = sum(ceCounters[state] for state in stateNames)
      for state in allStateNames:
        sumDict[state] += ceCounters[state]
      sumDict['Total'] += total

      itemList = [site, ce] + [ceCounters[state] for state in allStateNames]
      # Add the total number of pilots seen in the last day
      itemList.append(total)
      # Add the efficiency evaluations and the quality status of the CE
      itemList += summaryColumns(ceCounters['Done'], ceCounters['Done_Empty'],
                                 ceCounters['Aborted'], total, 10)

      if len(ceDicts) == 1 or expandSite:
        records.append(itemList)

    if len(ceDicts) > 1 and not expandSite:
      # Single summary row for a site with several CEs
      itemList = [site, 'Multiple'] + [sumDict[column] for column in totalColumns]
      itemList += summaryColumns(sumDict['Done'], sumDict['Done_Empty'],
                                 sumDict['Aborted'], sumDict['Total'], 10)
      records.append(itemList)

    for column in totalColumns:
      siteSumDict[column] = siteSumDict.get(column, 0) + sumDict[column]

  # Perform site selection
  if siteSelect:
    records = [record for record in records if record[0] in siteSelect]

  # Perform status selection
  if statusSelect:
    records = [record for record in records if record[statusIndex] in statusSelect]

  # Get the Site Mask data
  siteStatus = SiteStatus()
  for record in records:
    #
    # FIXME: using only ComputingAccess
    #
    if siteStatus.isUsableSite(record[0], 'ComputingAccess'):
      record.append('Yes')
    else:
      record.append('No')

  finalDict = {}
  finalDict['TotalRecords'] = len(records)
  finalDict['ParameterNames'] = paramNames + \
      ['Total', 'PilotsPerJob', 'PilotJobEff', 'Status', 'InMask']

  # Return all the records if maxItems == 0 or the specified number otherwise
  if maxItems:
    finalDict['Records'] = records[startItem:startItem + maxItems]
  else:
    finalDict['Records'] = records

  # Overall evaluation; the quality status needs more than 100 pilots here
  pilotsPerJobValue, pilotJobEff, status = summaryColumns(siteSumDict['Done'],
                                                          siteSumDict['Done_Empty'],
                                                          siteSumDict['Aborted'],
                                                          siteSumDict['Total'], 100)
  siteSumDict['PilotsPerJob'] = pilotsPerJobValue
  siteSumDict['PilotJobEff'] = pilotJobEff
  siteSumDict['Status'] = status
  finalDict['Extras'] = siteSumDict

  return S_OK(finalDict)